diff --git a/Grid/Benchmark_comms_host_device.cpp b/Grid/Benchmark_comms_host_device.cpp
deleted file mode 100644
index e213859..0000000
--- a/Grid/Benchmark_comms_host_device.cpp
+++ /dev/null
@@ -1,265 +0,0 @@
-/*
-Copyright © 2015 Peter Boyle
-Copyright © 2022 Antonin Portelli
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include <Grid/Grid.h>
-
-using namespace std;
-using namespace Grid;
-
-struct time_statistics
-{
-  double mean;
-  double err;
-  double min;
-  double max;
-
-  void statistics(std::vector<double> v)
-  {
-    double sum = std::accumulate(v.begin(), v.end(), 0.0);
-    mean = sum / v.size();
-
-    std::vector<double> diff(v.size());
-    std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
-    double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
-    err = std::sqrt(sq_sum / (v.size() * (v.size() - 1)));
-
-    auto result = std::minmax_element(v.begin(), v.end());
-    min = *result.first;
-    max = *result.second;
-  }
-};
-
-void header()
-{
-  std::cout << GridLogMessage << " L "
-            << "\t"
-            << " Ls "
-            << "\t" << std::setw(11) << "bytes\t\t"
-            << "MB/s uni"
-            << "\t"
-            << "MB/s bidi" << std::endl;
-};
-
-int main(int argc, char **argv)
-{
-  Grid_init(&argc, &argv);
-
-  Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
-  Coordinate mpi_layout = GridDefaultMpi();
-  int threads = GridThread::GetThreads();
-  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads"
-            << std::endl;
-
-  int Nloop = 250;
-  int nmu = 0;
-  int maxlat = 32;
-  for (int mu = 0; mu < Nd; mu++)
-    if (mpi_layout[mu] > 1)
-      nmu++;
-
-  std::cout << GridLogMessage << "Number of iterations to average: " << Nloop
-            << std::endl;
-  std::vector<double> t_time(Nloop);
-  // time_statistics timestat;
-
-  std::cout << GridLogMessage
-            << "========================================================================="
-               "==========================="
-            << std::endl;
-  std::cout << GridLogMessage
-            << "= Benchmarking sequential halo exchange from host memory " << std::endl;
-  std::cout << GridLogMessage
-            << "========================================================================="
-               "==========================="
-            << std::endl;
-  header();
-
-  for (int lat = 8; lat <= maxlat; lat += 4)
-  {
-    for (int Ls = 8; Ls <= 8; Ls *= 2)
-    {
-
-      Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
-                            lat * mpi_layout[3]});
-
-      GridCartesian Grid(latt_size, simd_layout, mpi_layout);
-      RealD Nrank = Grid._Nprocessors;
-      RealD Nnode = Grid.NodeCount();
-      RealD ppn = Nrank / Nnode;
-
-      std::vector<std::vector<HalfSpinColourVectorD>> xbuf(8);
-      std::vector<std::vector<HalfSpinColourVectorD>> rbuf(8);
-
-      for (int mu = 0; mu < 8; mu++)
-      {
-        xbuf[mu].resize(lat * lat * lat * Ls);
-        rbuf[mu].resize(lat * lat * lat * Ls);
-      }
-      uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD);
-
-      int ncomm;
-
-      for (int mu = 0; mu < 4; mu++)
-      {
-        if (mpi_layout[mu] > 1)
-        {
-          double start = usecond();
-          for (int i = 0; i < Nloop; i++)
-          {
-
-            ncomm = 0;
-
-            ncomm++;
-            int comm_proc = 1;
-            int xmit_to_rank;
-            int recv_from_rank;
-
-            {
-              std::vector<CommsRequest_t> requests;
-              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
-              Grid.SendToRecvFrom((void *)&xbuf[mu][0], xmit_to_rank,
-                                  (void *)&rbuf[mu][0], recv_from_rank, bytes);
-            }
-
-            comm_proc = mpi_layout[mu] - 1;
-            {
-              std::vector<CommsRequest_t> requests;
-              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
-              Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], xmit_to_rank,
-                                  (void *)&rbuf[mu + 4][0], recv_from_rank, bytes);
-            }
-          }
-          Grid.Barrier();
-          double stop = usecond();
-          double mean = (stop - start) / Nloop;
-          double dbytes = bytes * ppn;
-          double xbytes = dbytes * 2.0 * ncomm;
-          double rbytes = xbytes;
-          double bidibytes = xbytes + rbytes;
-
-          std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t"
-                    << std::setw(11) << bytes << std::fixed << std::setprecision(1)
-                    << std::setw(7) << " " << std::right << xbytes / mean << " "
-                    << "\t\t" << std::setw(7) << bidibytes / mean << std::endl;
-        }
-      }
-    }
-  }
-
-  std::cout << GridLogMessage
-            << "========================================================================="
-               "==========================="
-            << std::endl;
-  std::cout << GridLogMessage
-            << "= Benchmarking sequential halo exchange from GPU memory " << std::endl;
-  std::cout << GridLogMessage
-            << "========================================================================="
-               "==========================="
-            << std::endl;
-  header();
-
-  for (int lat = 8; lat <= maxlat; lat += 4)
-  {
-    for (int Ls = 8; Ls <= 8; Ls *= 2)
-    {
-
-      Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
-                            lat * mpi_layout[3]});
-
-      GridCartesian Grid(latt_size, simd_layout, mpi_layout);
-      RealD Nrank = Grid._Nprocessors;
-      RealD Nnode = Grid.NodeCount();
-      RealD ppn = Nrank / Nnode;
-
-      std::vector<HalfSpinColourVectorD *> xbuf(8);
-      std::vector<HalfSpinColourVectorD *> rbuf(8);
-
-      uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD);
-      for (int d = 0; d < 8; d++)
-      {
-        xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
-        rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
-      }
-
-      int ncomm;
-
-      for (int mu = 0; mu < 4; mu++)
-      {
-        if (mpi_layout[mu] > 1)
-        {
-          double start = usecond();
-          for (int i = 0; i < Nloop; i++)
-          {
-
-            ncomm = 0;
-
-            ncomm++;
-            int comm_proc = 1;
-            int xmit_to_rank;
-            int recv_from_rank;
-
-            {
-              std::vector<CommsRequest_t> requests;
-              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
-              Grid.SendToRecvFrom((void *)&xbuf[mu][0], xmit_to_rank,
-                                  (void *)&rbuf[mu][0], recv_from_rank, bytes);
-            }
-
-            comm_proc = mpi_layout[mu] - 1;
-            {
-              std::vector<CommsRequest_t> requests;
-              Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
-              Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], xmit_to_rank,
-                                  (void *)&rbuf[mu + 4][0], recv_from_rank, bytes);
-            }
-          }
-          Grid.Barrier();
-          double stop = usecond();
-          double mean = (stop - start) / Nloop;
-          double dbytes = bytes * ppn;
-          double xbytes = dbytes * 2.0 * ncomm;
-          double rbytes = xbytes;
-          double bidibytes = xbytes + rbytes;
-
-          std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t"
-                    << std::setw(11) << bytes << std::fixed << std::setprecision(1)
-                    << std::setw(7) << " " << std::right << xbytes / mean << " "
-                    << "\t\t" << std::setw(7) << bidibytes / mean << std::endl;
-        }
-      }
-
-      for (int d = 0; d < 8; d++)
-      {
-        acceleratorFreeDevice(xbuf[d]);
-        acceleratorFreeDevice(rbuf[d]);
-      }
-    }
-  }
-
-  std::cout << GridLogMessage
-            << "========================================================================="
"===========================" - << std::endl; - std::cout << GridLogMessage << "= All done; Bye Bye" << std::endl; - std::cout << GridLogMessage - << "=========================================================================" - "===========================" - << std::endl; - - Grid_finalize(); -} diff --git a/Grid/Benchmark_dwf_fp32.cpp b/Grid/Benchmark_dwf_fp32.cpp deleted file mode 100644 index c0fcf7c..0000000 --- a/Grid/Benchmark_dwf_fp32.cpp +++ /dev/null @@ -1,512 +0,0 @@ -/* -Copyright © 2015 Peter Boyle -Copyright © 2022 Antonin Portelli -Copyright © 2023 Simon Bürger - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -*/ - -#include "json.hpp" -#include -#ifdef GRID_CUDA -#define CUDA_PROFILE -#endif - -#ifdef CUDA_PROFILE -#include -#endif - -using namespace std; -using namespace Grid; - -template struct scal -{ - d internal; -}; - -Gamma::Algebra Gmu[] = {Gamma::Algebra::GammaX, Gamma::Algebra::GammaY, - Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT}; - -int main(int argc, char **argv) -{ - Grid_init(&argc, &argv); - - int threads = GridThread::GetThreads(); - - Coordinate latt4 = GridDefaultLatt(); - int Ls = 16; - std::string json_filename = ""; // empty indicates no json output - nlohmann::json json; - - // benchmark specific command line arguments - for (int i = 0; i < argc; i++) - { - if (std::string(argv[i]) == "-Ls") - { - std::stringstream ss(argv[i + 1]); - ss >> Ls; - } - if (std::string(argv[i]) == "--json-out") - json_filename = argv[i + 1]; - } - - GridLogLayout(); - - long unsigned int single_site_flops = 8 * Nc * (7 + 16 * Nc); - - json["single_site_flops"] = single_site_flops; - - GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid( - GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi()); - GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); - - GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid); - GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid); - - json["grid"] = FGrid->FullDimensions().toVector(); - json["local_grid"] = FGrid->LocalDimensions().toVector(); - - std::cout << GridLogMessage << "Making s innermost grids" << std::endl; - GridCartesian *sUGrid = - SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(), GridDefaultMpi()); - - GridRedBlackCartesian *sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid); - GridCartesian *sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls, UGrid); - GridRedBlackCartesian *sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls, UGrid); - - std::vector seeds4({1, 2, 3, 4}); - std::vector seeds5({5, 6, 7, 8}); - - std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; - GridParallelRNG RNG4(UGrid); - RNG4.SeedUniqueString(std::string("The 4D RNG")); - std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; - GridParallelRNG RNG5(FGrid); - RNG5.SeedUniqueString(std::string("The 5D RNG")); - std::cout << GridLogMessage << "Initialised RNGs" << std::endl; - - 
-  LatticeFermionF src(FGrid);
-  random(RNG5, src);
-#if 0
-  src = Zero();
-  {
-    Coordinate origin({0,0,0,latt4[2]-1,0});
-    SpinColourVectorF tmp;
-    tmp=Zero();
-    tmp()(0)(0)=Complex(-2.0,0.0);
-    std::cout << " source site 0 " << tmp << std::endl;
-    pokeSite(tmp,src,origin);
-  }
-#endif
-
-  LatticeFermionF result(FGrid);
-  result = Zero();
-  LatticeFermionF ref(FGrid);
-  ref = Zero();
-  LatticeFermionF tmp(FGrid);
-  LatticeFermionF err(FGrid);
-
-  LatticeGaugeFieldF Umu(UGrid);
-  SU<Nc>::HotConfiguration(RNG4, Umu);
-  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
-#if 0
-  Umu=1.0;
-  for(int mu=0;mu<Nd;mu++){
-    LatticeColourMatrixF ttmp(UGrid);
-    ttmp = PeekIndex<LorentzIndex>(Umu,mu);
-    //    if (mu !=2 ) ttmp = 0;
-    //    ttmp = ttmp* pow(10.0,mu);
-    PokeIndex<LorentzIndex>(Umu,ttmp,mu);
-  }
-  std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
-#endif
-
-  ////////////////////////////////////
-  // Naive wilson implementation
-  ////////////////////////////////////
-  // replicate across fifth dimension
-  //  LatticeGaugeFieldF Umu5d(FGrid);
-  std::vector<LatticeColourMatrixF> U(4, UGrid);
-  for (int mu = 0; mu < Nd; mu++)
-  {
-    U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
-  }
-  std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
-
-  if (1)
-  {
-    ref = Zero();
-    for (int mu = 0; mu < Nd; mu++)
-    {
-
-      tmp = Cshift(src, mu + 1, 1);
-      {
-        autoView(tmp_v, tmp, CpuWrite);
-        autoView(U_v, U[mu], CpuRead);
-        for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
-        {
-          for (int s = 0; s < Ls; s++)
-          {
-            tmp_v[Ls * ss + s] = U_v[ss] * tmp_v[Ls * ss + s];
-          }
-        }
-      }
-      ref = ref + tmp - Gamma(Gmu[mu]) * tmp;
-
-      {
-        autoView(tmp_v, tmp, CpuWrite);
-        autoView(U_v, U[mu], CpuRead);
-        autoView(src_v, src, CpuRead);
-        for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++)
-        {
-          for (int s = 0; s < Ls; s++)
-          {
-            tmp_v[Ls * ss + s] = adj(U_v[ss]) * src_v[Ls * ss + s];
-          }
-        }
-      }
-      tmp = Cshift(tmp, mu + 1, -1);
-      ref = ref + tmp + Gamma(Gmu[mu]) * tmp;
-    }
-    ref = -0.5 * ref;
-  }
-
-  RealD mass = 0.1;
-  RealD M5 = 1.8;
-
-  RealD NP = UGrid->_Nprocessors;
-  RealD NN = UGrid->NodeCount();
-
-  json["ranks"] = NP;
-  json["nodes"] = NN;
-
-  std::cout << GridLogMessage
-            << "*****************************************************************"
-            << std::endl;
-  std::cout << GridLogMessage
-            << "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm"
-            << std::endl;
-  std::cout << GridLogMessage
-            << "*****************************************************************"
-            << std::endl;
-  std::cout << GridLogMessage
-            << "*****************************************************************"
-            << std::endl;
-  std::cout << GridLogMessage
-            << "* Benchmarking DomainWallFermionR::Dhop " << std::endl;
-  std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd()
-            << std::endl;
-  std::cout << GridLogMessage << "* VComplexF size is " << sizeof(vComplexF) << " B"
-            << std::endl;
-
-  if (sizeof(RealF) == 4)
-    std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
-  if (sizeof(RealF) == 8)
-    std::cout << GridLogMessage << "* DOUBLE precision " << std::endl;
-#ifdef GRID_OMP
-  if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
-    std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
-  if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
-    std::cout << GridLogMessage << "* Using sequential comms compute" << std::endl;
-#endif
-  if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric)
-    std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
-  if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll)
-    std::cout << GridLogMessage << "* Using Nc=3 WilsonKernels" << std::endl;
-  if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm)
-    std::cout << GridLogMessage << "* Using Asm Nc=3 WilsonKernels" << std::endl;
WilsonKernels" << std::endl; - std::cout << GridLogMessage - << "*****************************************************************" - << std::endl; - - DomainWallFermionF Dw(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5); - int ncall = 300; - - if (1) - { - FGrid->Barrier(); - Dw.ZeroCounters(); - Dw.Dhop(src, result, 0); - std::cout << GridLogMessage << "Called warmup" << std::endl; - double t0 = usecond(); - for (int i = 0; i < ncall; i++) - { - __SSC_START; - Dw.Dhop(src, result, 0); - __SSC_STOP; - } - double t1 = usecond(); - FGrid->Barrier(); - - double volume = Ls; - for (int mu = 0; mu < Nd; mu++) - volume = volume * latt4[mu]; - double flops = single_site_flops * volume * ncall; - - auto nsimd = vComplex::Nsimd(); - auto simdwidth = sizeof(vComplex); - - // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors - double data_rf = volume * ((2 * Nd + 1) * Nd * Nc + 2 * Nd * Nc * Nc) * simdwidth / - nsimd * ncall / (1024. * 1024. * 1024.); - - // mem: Nd Wilson * Ls, Nd gauge, Nc colors - double data_mem = - (volume * (2 * Nd + 1) * Nd * Nc + (volume / Ls) * 2 * Nd * Nc * Nc) * simdwidth / - nsimd * ncall / (1024. * 1024. * 1024.); - - json["Dw"]["calls"] = ncall; - json["Dw"]["time"] = t1 - t0; - json["Dw"]["mflops"] = flops / (t1 - t0); - json["Dw"]["mflops_per_rank"] = flops / (t1 - t0) / NP; - json["Dw"]["mflops_per_node"] = flops / (t1 - t0) / NN; - json["Dw"]["RF"] = 1000000. * data_rf / ((t1 - t0)); - json["Dw"]["mem"] = 1000000. * data_mem / ((t1 - t0)); - - std::cout << GridLogMessage << "Called Dw " << ncall << " times in " << t1 - t0 - << " us" << std::endl; - // std::cout< 1.0e-4)) - { - /* - std::cout << "RESULT\n " << result<Barrier(); - exit(-1); - } - assert(norm2(err) < 1.0e-4); - Dw.Report(); - } - - if (1) - { // Naive wilson dag implementation - ref = Zero(); - for (int mu = 0; mu < Nd; mu++) - { - - // ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x - tmp = Cshift(src, mu + 1, 1); - { - autoView(ref_v, ref, CpuWrite); - autoView(tmp_v, tmp, CpuRead); - autoView(U_v, U[mu], CpuRead); - for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++) - { - for (int s = 0; s < Ls; s++) - { - int i = s + Ls * ss; - ref_v[i] += U_v[ss] * (tmp_v[i] + Gamma(Gmu[mu]) * tmp_v[i]); - ; - } - } - } - - { - autoView(tmp_v, tmp, CpuWrite); - autoView(U_v, U[mu], CpuRead); - autoView(src_v, src, CpuRead); - for (int ss = 0; ss < U[mu].Grid()->oSites(); ss++) - { - for (int s = 0; s < Ls; s++) - { - tmp_v[Ls * ss + s] = adj(U_v[ss]) * src_v[Ls * ss + s]; - } - } - } - // tmp =adj(U[mu])*src; - tmp = Cshift(tmp, mu + 1, -1); - { - autoView(ref_v, ref, CpuWrite); - autoView(tmp_v, tmp, CpuRead); - for (int i = 0; i < ref_v.size(); i++) - { - ref_v[i] += tmp_v[i] - Gamma(Gmu[mu]) * tmp_v[i]; - ; - } - } - } - ref = -0.5 * ref; - } - // dump=1; - Dw.Dhop(src, result, 1); - - std::cout << GridLogMessage - << "Compare to naive wilson implementation Dag to verify correctness" - << std::endl; - std::cout << GridLogMessage << "Called DwDag" << std::endl; - std::cout << GridLogMessage << "norm dag result " << norm2(result) << std::endl; - std::cout << GridLogMessage << "norm dag ref " << norm2(ref) << std::endl; - err = ref - result; - std::cout << GridLogMessage << "norm dag diff " << norm2(err) << std::endl; - if ((norm2(err) > 1.0e-4)) - { - /* - std::cout<< "DAG RESULT\n " <Barrier(); - Dw.DhopEO(src_o, r_e, DaggerNo); - double t0 = usecond(); - for (int i = 0; i < ncall; i++) - { -#ifdef CUDA_PROFILE - if (i == 10) - cudaProfilerStart(); -#endif - Dw.DhopEO(src_o, r_e, DaggerNo); 
-#ifdef CUDA_PROFILE
-      if (i == 20)
-        cudaProfilerStop();
-#endif
-    }
-    double t1 = usecond();
-    FGrid->Barrier();
-
-    double volume = Ls;
-    for (int mu = 0; mu < Nd; mu++)
-      volume = volume * latt4[mu];
-    double flops = (single_site_flops * volume * ncall) / 2.0;
-
-    json["Deo"]["calls"] = ncall;
-    json["Deo"]["time"] = t1 - t0;
-    json["Deo"]["mflops"] = flops / (t1 - t0);
-    json["Deo"]["mflops_per_rank"] = flops / (t1 - t0) / NP;
-    json["Deo"]["mflops_per_node"] = flops / (t1 - t0) / NN;
-
-    std::cout << GridLogMessage << "Deo mflop/s = " << flops / (t1 - t0) << std::endl;
-    std::cout << GridLogMessage << "Deo mflop/s per rank " << flops / (t1 - t0) / NP
-              << std::endl;
-    std::cout << GridLogMessage << "Deo mflop/s per node " << flops / (t1 - t0) / NN
-              << std::endl;
-
-    Dw.Report();
-  }
-  Dw.DhopEO(src_o, r_e, DaggerNo);
-  Dw.DhopOE(src_e, r_o, DaggerNo);
-  Dw.Dhop(src, result, DaggerNo);
-
-  std::cout << GridLogMessage << "r_e" << norm2(r_e) << std::endl;
-  std::cout << GridLogMessage << "r_o" << norm2(r_o) << std::endl;
-  std::cout << GridLogMessage << "res" << norm2(result) << std::endl;
-
-  setCheckerboard(r_eo, r_o);
-  setCheckerboard(r_eo, r_e);
-
-  err = r_eo - result;
-  std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
-  if ((norm2(err) > 1.0e-4))
-  {
-    /*
-    std::cout<< "Deo RESULT\n " <