mirror of https://github.com/paboyle/Grid.git synced 2025-06-17 23:37:06 +01:00

fixed conflicts after merging paboyle develop

Mohammad Atif
2023-07-03 11:46:37 -04:00
408 changed files with 28450 additions and 4699 deletions

View File

@@ -420,7 +420,6 @@ public:
FGrid->Broadcast(0,&ncall,sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
Dw.ZeroCounters();
time_statistics timestat;
std::vector<double> t_time(ncall);
@@ -589,7 +588,6 @@ public:
FGrid->Broadcast(0,&ncall,sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
Ds.ZeroCounters();
time_statistics timestat;
std::vector<double> t_time(ncall);

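The two hunks above remove the per-operator Dw.ZeroCounters()/Ds.ZeroCounters() bookkeeping while keeping the time_statistics sampling that follows. A minimal sketch of the counter-free pattern, assuming Grid's usecond() timer and the statistics(std::vector<double>&) entry point that time_statistics exposes elsewhere in the benchmark suite:

// Counter-free timing, as left by the hunks above (sketch).
time_statistics timestat;
std::vector<double> t_time(ncall);
for(int i=0;i<ncall;i++){
  double t0=usecond();
  Dw.Dhop(src,result,0);       // the kernel being timed
  t_time[i] = usecond()-t0;    // one sample per call
}
timestat.statistics(t_time);   // mean/err/min/max replace the old counters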
View File

@@ -217,10 +217,10 @@ int main (int argc, char ** argv)
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu][0],
xmit_to_rank,
xmit_to_rank,1,
(void *)&rbuf[mu][0],
recv_from_rank,
bytes,mu);
recv_from_rank,1,
bytes,bytes,mu);
comm_proc = mpi_layout[mu]-1;
@@ -228,10 +228,10 @@ int main (int argc, char ** argv)
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu+4][0],
xmit_to_rank,
xmit_to_rank,1,
(void *)&rbuf[mu+4][0],
recv_from_rank,
bytes,mu+4);
recv_from_rank,1,
bytes,bytes,mu+4);
}
}
@@ -309,10 +309,10 @@ int main (int argc, char ** argv)
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu][0],
xmit_to_rank,
xmit_to_rank,1,
(void *)&rbuf[mu][0],
recv_from_rank,
bytes,mu);
recv_from_rank,1,
bytes,bytes,mu);
Grid.StencilSendToRecvFromComplete(requests,mu);
requests.resize(0);
@@ -322,10 +322,10 @@ int main (int argc, char ** argv)
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu+4][0],
xmit_to_rank,
xmit_to_rank,1,
(void *)&rbuf[mu+4][0],
recv_from_rank,
bytes,mu+4);
recv_from_rank,1,
bytes,bytes,mu+4);
Grid.StencilSendToRecvFromComplete(requests,mu+4);
requests.resize(0);
@@ -411,8 +411,8 @@ int main (int argc, char ** argv)
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
}
int tid = omp_get_thread_num();
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
(void *)&rbuf[dir][0], recv_from_rank, bytes,tid);
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,1,
(void *)&rbuf[dir][0], recv_from_rank,1, bytes,tid);
thread_critical { dbytes+=tbytes; }
}

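The only change in this file is the call shape of StencilSendToRecvFrom and StencilSendToRecvFromBegin: each rank argument gains a trailing 1 and the byte count is passed twice. Read from the hunks alone, the new arguments look like send/receive enable flags plus separate transmit and receive byte counts; that reading is an assumption, not something this page confirms. A sketch of one updated call:

// Sketch of the new call shape, inferred from the hunks above.
std::vector<CommsRequest_t> requests;
double dbytes = 0;
dbytes += Grid.StencilSendToRecvFromBegin(requests,
              (void *)&xbuf[mu][0], xmit_to_rank, 1,    // assumed: do-send flag
              (void *)&rbuf[mu][0], recv_from_rank, 1,  // assumed: do-recv flag
              bytes, bytes,                             // xmit bytes, recv bytes
              mu);
Grid.StencilSendToRecvFromComplete(requests,mu);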
View File

@@ -169,7 +169,7 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(vComplex)<< " B"<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
@@ -183,19 +183,16 @@ int main (int argc, char ** argv)
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
DomainWallFermionD Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
int ncall =1000;
if (1) {
FGrid->Barrier();
Dw.ZeroCounters();
Dw.Dhop(src,result,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
Dw.Dhop(src,result,0);
__SSC_STOP;
}
double t1=usecond();
FGrid->Barrier();
@@ -239,7 +236,6 @@ int main (int argc, char ** argv)
exit(-1);
}
assert (norm2(err)< 1.0e-4 );
Dw.Report();
}
if (1)
@@ -301,7 +297,7 @@ int main (int argc, char ** argv)
// S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
@@ -314,7 +310,6 @@ int main (int argc, char ** argv)
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
{
Dw.ZeroCounters();
FGrid->Barrier();
Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
@@ -336,7 +331,6 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
Dw.Report();
}
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);

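Two changes run through this file: DomainWallFermionR is replaced by the explicitly double-precision DomainWallFermionD, and the __SSC_START/__SSC_STOP markers together with the ZeroCounters()/Report() calls disappear from the timing loop. Assembled from the hunks, the resulting loop is simply (a sketch, not the full file):

DomainWallFermionD Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
Dw.Dhop(src,result,0);            // warmup
FGrid->Barrier();
double t0=usecond();
for(int i=0;i<ncall;i++){
  Dw.Dhop(src,result,0);          // no __SSC_* markers around the call any more
}
double t1=usecond();
FGrid->Barrier();
err = ref-result;                 // verify against the Cshift reference
assert(norm2(err) < 1.0e-4);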
View File

@@ -32,18 +32,18 @@
using namespace std;
using namespace Grid;
template<class d>
struct scal {
d internal;
////////////////////////
/// Move to domains ////
////////////////////////
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
void Benchmark(int Ls, Coordinate Dirichlet);
int main (int argc, char ** argv)
{
@@ -52,39 +52,115 @@ int main (int argc, char ** argv)
int threads = GridThread::GetThreads();
Coordinate latt4 = GridDefaultLatt();
int Ls=16;
for(int i=0;i<argc;i++)
for(int i=0;i<argc;i++) {
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;
}
}
//////////////////
// With comms
//////////////////
Coordinate Dirichlet(Nd+1,0);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet);
//////////////////
// Domain decomposed
//////////////////
Coordinate latt4 = GridDefaultLatt();
Coordinate mpi = GridDefaultMpi();
Coordinate CommDim(Nd);
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi,shm);
//////////////////////
// Node level
//////////////////////
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
Benchmark(Ls,Dirichlet);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without intranode communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
Benchmark(Ls,Dirichlet);
Grid_finalize();
exit(0);
}
void Benchmark(int Ls, Coordinate Dirichlet)
{
Coordinate latt4 = GridDefaultLatt();
GridLogLayout();
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
#define SINGLE
#ifdef SINGLE
typedef vComplexF Simd;
typedef LatticeFermionF FermionField;
typedef LatticeGaugeFieldF GaugeField;
typedef LatticeColourMatrixF ColourMatrixField;
typedef DomainWallFermionF FermionAction;
#endif
#ifdef DOUBLE
typedef vComplexD Simd;
typedef LatticeFermionD FermionField;
typedef LatticeGaugeFieldD GaugeField;
typedef LatticeColourMatrixD ColourMatrixField;
typedef DomainWallFermionD FermionAction;
#endif
#ifdef DOUBLE2
typedef vComplexD2 Simd;
typedef LatticeFermionD2 FermionField;
typedef LatticeGaugeFieldD2 GaugeField;
typedef LatticeColourMatrixD2 ColourMatrixField;
typedef DomainWallFermionD2 FermionAction;
#endif
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
LatticeFermionF src (FGrid); random(RNG5,src);
FermionField src (FGrid); random(RNG5,src);
#if 0
src = Zero();
{
@@ -100,37 +176,38 @@ int main (int argc, char ** argv)
src = src*N2;
#endif
LatticeFermionF result(FGrid); result=Zero();
LatticeFermionF ref(FGrid); ref=Zero();
LatticeFermionF tmp(FGrid);
LatticeFermionF err(FGrid);
FermionField result(FGrid); result=Zero();
FermionField ref(FGrid); ref=Zero();
FermionField tmp(FGrid);
FermionField err(FGrid);
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
LatticeGaugeFieldF Umu(UGrid);
GaugeField Umu(UGrid);
GaugeField UmuCopy(UGrid);
SU<Nc>::HotConfiguration(RNG4,Umu);
UmuCopy=Umu;
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
#if 0
Umu=1.0;
for(int mu=0;mu<Nd;mu++){
LatticeColourMatrixF ttmp(UGrid);
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
// if (mu !=2 ) ttmp = 0;
// ttmp = ttmp* pow(10.0,mu);
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
}
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
#endif
////////////////////////////////////
// Apply BCs
////////////////////////////////////
Coordinate Block(4);
for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1];
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
DirichletFilter<GaugeField> Filter(Block);
Filter.applyFilter(Umu);
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
// replicate across fifth dimension
// LatticeGaugeFieldF Umu5d(FGrid);
std::vector<LatticeColourMatrixF> U(4,UGrid);
std::vector<ColourMatrixField> U(4,UGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
}
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
if (1)
@@ -177,10 +254,8 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
std::cout << GridLogMessage<< "* VComplexF size is "<<sizeof(vComplexF)<< " B"<<std::endl;
if ( sizeof(RealF)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(RealF)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
@@ -190,19 +265,21 @@ int main (int argc, char ** argv)
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
FermionAction::ImplParams p;
p.dirichlet=Dirichlet;
FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
Dw.ImportGauge(Umu);
int ncall =300;
RealD n2e;
if (1) {
FGrid->Barrier();
Dw.ZeroCounters();
Dw.Dhop(src,result,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
Dw.Dhop(src,result,0);
__SSC_STOP;
}
double t1=usecond();
FGrid->Barrier();
@@ -210,8 +287,8 @@ int main (int argc, char ** argv)
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=single_site_flops*volume*ncall;
auto nsimd = vComplex::Nsimd();
auto simdwidth = sizeof(vComplex);
auto nsimd = Simd::Nsimd();
auto simdwidth = sizeof(Simd);
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
@@ -220,29 +297,19 @@ int main (int argc, char ** argv)
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
std::cout<<GridLogMessage << "RF GiB/s (base 2) = "<< 1000000. * data_rf/((t1-t0))<<std::endl;
std::cout<<GridLogMessage << "mem GiB/s (base 2) = "<< 1000000. * data_mem/((t1-t0))<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
//exit(0);
n2e = norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
if(( norm2(err)>1.0e-4) ) {
/*
std::cout << "RESULT\n " << result<<std::endl;
std::cout << "REF \n " << ref <<std::endl;
std::cout << "ERR \n " << err <<std::endl;
*/
if(( n2e>1.0e-4) ) {
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
FGrid->Barrier();
exit(-1);
}
assert (norm2(err)< 1.0e-4 );
Dw.Report();
assert (n2e< 1.0e-4 );
}
if (1)
@@ -286,26 +353,27 @@ int main (int argc, char ** argv)
}
ref = -0.5*ref;
}
// dump=1;
Dw.Dhop(src,result,1);
Dw.Dhop(src,result,DaggerYes);
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm dag diff "<< norm2(err)<<std::endl;
if((norm2(err)>1.0e-4)){
/*
std::cout<< "DAG RESULT\n " <<ref << std::endl;
std::cout<< "DAG sRESULT\n " <<result << std::endl;
std::cout<< "DAG ERR \n " << err <<std::endl;
*/
}
LatticeFermionF src_e (FrbGrid);
LatticeFermionF src_o (FrbGrid);
LatticeFermionF r_e (FrbGrid);
LatticeFermionF r_o (FrbGrid);
LatticeFermionF r_eo (FGrid);
n2e= norm2(err);
std::cout<<GridLogMessage << "norm dag diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
assert((n2e)<1.0e-4);
FermionField src_e (FrbGrid);
FermionField src_o (FrbGrid);
FermionField r_e (FrbGrid);
FermionField r_o (FrbGrid);
FermionField r_eo (FGrid);
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
pickCheckerboard(Even,src_e,src);
@@ -317,10 +385,8 @@ int main (int argc, char ** argv)
// S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionF::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
if ( sizeof(RealF)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(RealF)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
@@ -330,18 +396,11 @@ int main (int argc, char ** argv)
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
{
Dw.ZeroCounters();
FGrid->Barrier();
Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
for(int i=0;i<ncall;i++){
#ifdef CUDA_PROFILE
if(i==10) cudaProfilerStart();
#endif
Dw.DhopEO(src_o,r_e,DaggerNo);
#ifdef CUDA_PROFILE
if(i==20) cudaProfilerStop();
#endif
}
double t1=usecond();
FGrid->Barrier();
@@ -352,7 +411,6 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
Dw.Report();
}
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);
@@ -366,14 +424,9 @@ int main (int argc, char ** argv)
setCheckerboard(r_eo,r_e);
err = r_eo-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
if((norm2(err)>1.0e-4)){
/*
std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
std::cout<< "Deo REF\n " <<result << std::endl;
std::cout<< "Deo ERR \n " << err <<std::endl;
*/
}
n2e= norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<<std::endl;
assert(n2e<1.0e-4);
pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err);
@@ -382,6 +435,4 @@ int main (int argc, char ** argv)
assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-4);
Grid_finalize();
exit(0);
}

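The three Benchmark(Ls,Dirichlet) passes above differ only in how Dirichlet is filled: all zeros for full communication, the per-node block for the no-internode pass, and the per-rank block for the no-intranode pass. A worked example, under an assumed geometry that is not taken from this commit:

// Assumed geometry (illustrative only): 32^4 global lattice,
// mpi = {2,2,2,2}, shm = {2,2,1,1}  ->  each rank owns a 16^4 block.
//
// No-internode pass:  CommDim[d] = (mpi[d]/shm[d])>1 ? 1 : 0 = {0,0,1,1}
//   Dirichlet[d+1] = CommDim[d]*latt4[d]/mpi[d]*shm[d]
//                  -> Dirichlet = {0, 0, 0, 16, 16}
//   (only the directions that cross node boundaries are cut).
//
// No-intranode pass:  CommDim[d] = mpi[d]>1 ? 1 : 0 = {1,1,1,1}
//   Dirichlet[d+1] = CommDim[d]*latt4[d]/mpi[d]
//                  -> Dirichlet = {0, 16, 16, 16, 16}
//   (every rank boundary becomes a Dirichlet wall).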
View File

@@ -0,0 +1,387 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_dwf.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#ifdef GRID_CUDA
#define CUDA_PROFILE
#endif
#ifdef CUDA_PROFILE
#include <cuda_profiler_api.h>
#endif
using namespace std;
using namespace Grid;
template<class d>
struct scal {
d internal;
};
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
int threads = GridThread::GetThreads();
Coordinate latt4 = GridDefaultLatt();
int Ls=16;
for(int i=0;i<argc;i++)
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;
}
GridLogLayout();
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
LatticeFermionF src (FGrid); random(RNG5,src);
LatticeFermionF src1 (FGrid); random(RNG5,src1);
#if 0
src = Zero();
{
Coordinate origin({0,0,0,latt4[2]-1,0});
SpinColourVectorF tmp;
tmp=Zero();
tmp()(0)(0)=Complex(-2.0,0.0);
std::cout << " source site 0 " << tmp<<std::endl;
pokeSite(tmp,src,origin);
}
#else
RealD N2 = 1.0/::sqrt(norm2(src));
src = src*N2;
#endif
LatticeFermionF result(FGrid); result=Zero();
LatticeFermionF ref(FGrid); ref=Zero();
LatticeFermionF tmp(FGrid);
LatticeFermionF err(FGrid);
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
LatticeGaugeFieldF Umu(UGrid);
SU<Nc>::HotConfiguration(RNG4,Umu);
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
#if 0
Umu=1.0;
for(int mu=0;mu<Nd;mu++){
LatticeColourMatrixF ttmp(UGrid);
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
// if (mu !=2 ) ttmp = 0;
// ttmp = ttmp* pow(10.0,mu);
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
}
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
#endif
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
// replicate across fifth dimension
// LatticeGaugeFieldF Umu5d(FGrid);
std::vector<LatticeColourMatrixF> U(4,UGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
}
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
if (1)
{
ref = Zero();
for(int mu=0;mu<Nd;mu++){
tmp = Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
RealD mass=0.1;
RealD M5 =1.8;
RealD NP = UGrid->_Nprocessors;
RealD NN = UGrid->NodeCount();
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
std::cout << GridLogMessage<< "* VComplexF size is "<<sizeof(vComplexF)<< " B"<<std::endl;
if ( sizeof(RealF)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(RealF)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
int ncall =100;
if (1) {
FGrid->Barrier();
Dw.Dhop(src,result,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.Dhop(src1,result,0);
Dw.Dhop(src,result,0);
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
assert (norm2(err)< 1.0e-4 );
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=single_site_flops*volume*ncall;
auto nsimd = vComplex::Nsimd();
auto simdwidth = sizeof(vComplex);
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
// mem: Nd Wilson * Ls, Nd gauge, Nc colors
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
std::cout<<GridLogMessage << "RF GiB/s (base 2) = "<< 1000000. * data_rf/((t1-t0))<<std::endl;
std::cout<<GridLogMessage << "mem GiB/s (base 2) = "<< 1000000. * data_mem/((t1-t0))<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
//exit(0);
if(( norm2(err)>1.0e-4) ) {
/*
std::cout << "RESULT\n " << result<<std::endl;
std::cout << "REF \n " << ref <<std::endl;
std::cout << "ERR \n " << err <<std::endl;
*/
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
FGrid->Barrier();
exit(-1);
}
assert (norm2(err)< 1.0e-4 );
}
if (1)
{ // Naive wilson dag implementation
ref = Zero();
for(int mu=0;mu<Nd;mu++){
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
tmp = Cshift(src,mu+1,1);
{
autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuRead);
autoView( U_v , U[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
int i=s+Ls*ss;
ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); ;
}
}
}
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
// tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
{
autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuRead);
for(int i=0;i<ref_v.size();i++){
ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
}
}
}
ref = -0.5*ref;
}
// dump=1;
Dw.Dhop(src,result,1);
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm dag diff "<< norm2(err)<<std::endl;
if((norm2(err)>1.0e-4)){
/*
std::cout<< "DAG RESULT\n " <<ref << std::endl;
std::cout<< "DAG sRESULT\n " <<result << std::endl;
std::cout<< "DAG ERR \n " << err <<std::endl;
*/
}
LatticeFermionF src_e (FrbGrid);
LatticeFermionF src_o (FrbGrid);
LatticeFermionF r_e (FrbGrid);
LatticeFermionF r_o (FrbGrid);
LatticeFermionF r_eo (FGrid);
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
// S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionF::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
if ( sizeof(RealF)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(RealF)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
{
FGrid->Barrier();
Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
for(int i=0;i<ncall;i++){
#ifdef CUDA_PROFILE
if(i==10) cudaProfilerStart();
#endif
Dw.DhopEO(src_o,r_e,DaggerNo);
#ifdef CUDA_PROFILE
if(i==20) cudaProfilerStop();
#endif
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(single_site_flops*volume*ncall)/2.0;
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
}
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);
Dw.Dhop (src ,result,DaggerNo);
std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
setCheckerboard(r_eo,r_o);
setCheckerboard(r_eo,r_e);
err = r_eo-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
if((norm2(err)>1.0e-4)){
/*
std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
std::cout<< "Deo REF\n " <<result << std::endl;
std::cout<< "Deo ERR \n " << err <<std::endl;
*/
}
pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err);
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-4);
Grid_finalize();
exit(0);
}

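The data_rf and data_mem expressions in this file encode a simple traffic model: a spinor carries Nd*Nc = 12 complex numbers, the stencil reads 2*Nd+1 = 9 spinors per site, and 2*Nd = 8 gauge links of Nc*Nc complex each; the memory model counts the gauge field once per 4d site (the volume/Ls term) while the register-file model replicates it across Ls. Restated as a standalone sketch (same formulas as in the file, nothing new):

double bytes_per_complex = (double)simdwidth / nsimd;   // bytes per complex number
double data_rf  = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc)
                * bytes_per_complex * ncall / (1024.*1024.*1024.);
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) * 2*Nd*Nc*Nc)
                * bytes_per_complex * ncall / (1024.*1024.*1024.);
// t1-t0 is in microseconds, hence the 1.0e6 factor to reach GiB/s:
std::cout << "RF  GiB/s (base 2) = " << 1.0e6 * data_rf /(t1-t0) << std::endl;
std::cout << "mem GiB/s (base 2) = " << 1.0e6 * data_mem/(t1-t0) << std::endl;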
View File

@@ -0,0 +1,465 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_dwf.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#ifdef GRID_CUDA
#define CUDA_PROFILE
#endif
#ifdef CUDA_PROFILE
#include <cuda_profiler_api.h>
#endif
using namespace std;
using namespace Grid;
////////////////////////
/// Move to domains ////
////////////////////////
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
void Benchmark(int Ls, Coordinate Dirichlet, int partial);
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
int threads = GridThread::GetThreads();
int Ls=8;
for(int i=0;i<argc;i++) {
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;
}
}
//////////////////
// With comms
//////////////////
Coordinate Dirichlet(Nd+1,0);
for(auto partial : {0}) {
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,partial);
}
//////////////////
// Domain decomposed
//////////////////
Coordinate latt4 = GridDefaultLatt();
Coordinate mpi = GridDefaultMpi();
Coordinate CommDim(Nd);
//Coordinate shm({2,1,1,1});
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi,shm);
std::cout <<GridLogMessage << " Shared memory MPI decomp is " <<shm<<std::endl;
//////////////////////
// Node level
//////////////////////
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
// for(int d=0;d<Nd;d++) CommDim[d]= 1;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
for(auto partial : {0,1}) {
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without internode communication partial dirichlet="<<partial <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,partial);
}
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
for(auto partial : {0,1}) {
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without intranode communication; partial dirichlet= "<<partial <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,partial);
}
Grid_finalize();
exit(0);
}
void Benchmark(int Ls, Coordinate Dirichlet, int partial)
{
Coordinate latt4 = GridDefaultLatt();
GridLogLayout();
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
#define SINGLE
#ifdef SINGLE
typedef vComplexF Simd;
typedef LatticeFermionF FermionField;
typedef LatticeGaugeFieldF GaugeField;
typedef LatticeColourMatrixF ColourMatrixField;
typedef DomainWallFermionF FermionAction;
#endif
#ifdef DOUBLE
typedef vComplexD Simd;
typedef LatticeFermionD FermionField;
typedef LatticeGaugeFieldD GaugeField;
typedef LatticeColourMatrixD ColourMatrixField;
typedef DomainWallFermionD FermionAction;
#endif
#ifdef DOUBLE2
typedef vComplexD2 Simd;
typedef LatticeFermionD2 FermionField;
typedef LatticeGaugeFieldD2 GaugeField;
typedef LatticeColourMatrixD2 ColourMatrixField;
typedef DomainWallFermionD2 FermionAction;
#endif
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
FermionField src (FGrid); random(RNG5,src);
#if 0
src = Zero();
{
Coordinate origin({0,0,0,latt4[2]-1,0});
SpinColourVectorF tmp;
tmp=Zero();
tmp()(0)(0)=Complex(-2.0,0.0);
std::cout << " source site 0 " << tmp<<std::endl;
pokeSite(tmp,src,origin);
}
#else
RealD N2 = 1.0/::sqrt(norm2(src));
src = src*N2;
#endif
FermionField result(FGrid); result=Zero();
FermionField ref(FGrid); ref=Zero();
FermionField tmp(FGrid);
FermionField err(FGrid);
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
GaugeField Umu(UGrid);
GaugeField UmuFull(UGrid);
GaugeField UmuCopy(UGrid);
SU<Nc>::HotConfiguration(RNG4,Umu);
UmuCopy=Umu;
UmuFull=Umu;
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
////////////////////////////////////
// Apply BCs
////////////////////////////////////
Coordinate Block(4);
for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1];
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
DirichletFilter<GaugeField> Filter(Block);
Filter.applyFilter(Umu);
if(!partial) Filter.applyFilter(UmuCopy);
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
std::vector<ColourMatrixField> U(4,UGrid);
std::vector<ColourMatrixField> Ucopy(4,UGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
Ucopy[mu] = PeekIndex<LorentzIndex>(UmuCopy,mu);
}
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
if (1)
{
ref = Zero();
for(int mu=0;mu<Nd;mu++){
int depth=dwf_compressor_depth;
tmp = Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
}
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
}
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
RealD mass=0.1;
RealD M5 =1.8;
RealD NP = UGrid->_Nprocessors;
RealD NN = UGrid->NodeCount();
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
std::cout << GridLogMessage <<"* BCs for Dirichlet Block4 " << Block << std::endl;
std::cout << GridLogMessage <<"* Partial Dirichlet BC = " << partial << std::endl;
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
FermionAction::ImplParams p;
p.dirichlet=Dirichlet;
p.partialDirichlet=partial;
FermionAction Dw(UmuFull,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
int ncall =1;
RealD n2e;
if (1) {
FGrid->Barrier();
Dw.Dhop(src,result,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.Dhop(src,result,0);
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=single_site_flops*volume*ncall;
auto nsimd = Simd::Nsimd();
auto simdwidth = sizeof(Simd);
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
// mem: Nd Wilson * Ls, Nd gauge, Nc colors
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
err = ref-result;
n2e = norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
if(( n2e>1.0e-4) ) {
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
FGrid->Barrier();
DumpSliceNorm("s-slice ref ",ref,1);
DumpSliceNorm("s-slice res ",result,1);
DumpSliceNorm("s-slice error ",err,1);
exit(-1);
}
assert (n2e< 1.0e-4 );
}
if (1)
{ // Naive wilson dag implementation
ref = Zero();
for(int mu=0;mu<Nd;mu++){
int depth=dwf_compressor_depth;
tmp = Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
}
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
}
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
Dw.Dhop(src,result,DaggerYes);
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
err = ref-result;
n2e= norm2(err);
std::cout<<GridLogMessage << "norm dag diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
assert((n2e)<1.0e-4);
FermionField src_e (FrbGrid);
FermionField src_o (FrbGrid);
FermionField r_e (FrbGrid);
FermionField r_o (FrbGrid);
FermionField r_eo (FGrid);
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
// S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
{
FGrid->Barrier();
Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.DhopEO(src_o,r_e,DaggerNo);
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(single_site_flops*volume*ncall)/2.0;
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
}
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);
Dw.Dhop (src ,result,DaggerNo);
std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
setCheckerboard(r_eo,r_o);
setCheckerboard(r_eo,r_e);
err = r_eo-result;
n2e= norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
assert(n2e<1.0e-4);
pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err);
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-4);
}

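The new reference implementation in this file splits every s-column at dwf_compressor_depth: Umu is always Dirichlet-filtered, while UmuCopy is filtered only when partial==0, so with partial Dirichlet the outermost depth s-slices keep the full links and only the interior slices see the cut. The core of that pattern, lifted from the hunks with added comments:

// Partial-Dirichlet link selection (from the reference loop above).
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
  for(int s=0;s<Ls;s++){
    if ( (s<depth) || (s>=Ls-depth) ){
      tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s]; // boundary s-slices: unfiltered links when partial
    } else {
      tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];     // interior s-slices: Dirichlet-cut links
    }
  }
}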
View File

@@ -168,7 +168,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
RealD M5 =1.8;
RealD NP = UGrid->_Nprocessors;
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
DomainWallFermionD Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
double t0=usecond();
Dw.Dhop(src,result,0);

View File

@@ -93,14 +93,11 @@ int main (int argc, char ** argv)
int ncall =1000;
if (1) {
FGrid->Barrier();
Dw.ZeroCounters();
Dw.Dhop(src,result,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
Dw.Dhop(src,result,0);
__SSC_STOP;
}
double t1=usecond();
FGrid->Barrier();
@@ -114,7 +111,6 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
Dw.Report();
}
@@ -136,14 +132,11 @@ int main (int argc, char ** argv)
GparityDomainWallFermionD DwD(Umu_d,*FGrid_d,*FrbGrid_d,*UGrid_d,*UrbGrid_d,mass,M5);
if (1) {
FGrid_d->Barrier();
DwD.ZeroCounters();
DwD.Dhop(src_d,result_d,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
DwD.Dhop(src_d,result_d,0);
__SSC_STOP;
}
double t1=usecond();
FGrid_d->Barrier();
@@ -157,7 +150,6 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
DwD.Report();
}
#endif
Grid_finalize();

View File

@@ -67,17 +67,17 @@ int main (int argc, char ** argv)
const int ncall=1000;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::Dhop "<<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
LatticeFermion src(FGrid); random(RNG5,src);
LatticeFermion result(FGrid);
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
DomainWallFermionD Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
double t0,t1;
typedef typename DomainWallFermionR::Coeff_t Coeff_t;
typedef typename DomainWallFermionD::Coeff_t Coeff_t;
Vector<Coeff_t> diag = Dw.bs;
Vector<Coeff_t> upper= Dw.cs;
Vector<Coeff_t> lower= Dw.cs;
@@ -103,35 +103,30 @@ int main (int argc, char ** argv)
#define BENCH_DW(A,...) \
Dw. A (__VA_ARGS__); \
FGrid->Barrier(); \
Dw.CayleyZeroCounters(); \
t0=usecond(); \
for(int i=0;i<ncall;i++){ \
Dw. A (__VA_ARGS__); \
} \
t1=usecond(); \
FGrid->Barrier(); \
Dw.CayleyReport(); \
std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
std::cout<<GridLogMessage << "******************"<<std::endl;
#define BENCH_ZDW(A,in,out) \
zDw. A (in,out); \
FGrid->Barrier(); \
zDw.CayleyZeroCounters(); \
t0=usecond(); \
for(int i=0;i<ncall;i++){ \
zDw. A (in,out); \
} \
t1=usecond(); \
FGrid->Barrier(); \
zDw.CayleyReport(); \
std::cout<<GridLogMessage << "Called ZDw " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
std::cout<<GridLogMessage << "******************"<<std::endl;
#define BENCH_DW_SSC(A,in,out) \
Dw. A (in,out); \
FGrid->Barrier(); \
Dw.CayleyZeroCounters(); \
t0=usecond(); \
for(int i=0;i<ncall;i++){ \
__SSC_START ; \
@@ -140,7 +135,6 @@ int main (int argc, char ** argv)
} \
t1=usecond(); \
FGrid->Barrier(); \
Dw.CayleyReport(); \
std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
std::cout<<GridLogMessage << "******************"<<std::endl;

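With CayleyZeroCounters()/CayleyReport() removed, the BENCH_DW macro reduces to warm up, barrier, time ncall repetitions, barrier, report. Assembled from the hunks (BENCH_ZDW is the same with zDw in place of Dw):

#define BENCH_DW(A,...)                         \
  Dw. A (__VA_ARGS__);                          \
  FGrid->Barrier();                             \
  t0=usecond();                                 \
  for(int i=0;i<ncall;i++){                     \
    Dw. A (__VA_ARGS__);                        \
  }                                             \
  t1=usecond();                                 \
  FGrid->Barrier();                             \
  std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl; \
  std::cout<<GridLogMessage << "******************"<<std::endl;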
View File

@@ -0,0 +1,189 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_prec_change.cc
Copyright (C) 2015
Author: Christopher Kelly <ckelly@bnl.gov>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
int Ls = 12;
Coordinate latt4 = GridDefaultLatt();
GridCartesian * UGridD = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGridD = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridD);
GridCartesian * FGridD = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);
GridRedBlackCartesian * FrbGridD = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);
GridCartesian * UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
GridCartesian * FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGridD); RNG4.SeedFixedIntegers(seeds4);
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGridD); RNG5.SeedFixedIntegers(seeds5);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
LatticeFermionD field_d(FGridD), tmp_d(FGridD);
random(RNG5,field_d); tmp_d = field_d;
LatticeFermionD2 field_d2(FGridF), tmp_d2(FGridF);
precisionChange(field_d2, field_d); tmp_d2 = field_d2;
LatticeFermionF field_f(FGridF), tmp_f(FGridF);
precisionChange(field_f, field_d); tmp_f = field_f;
int N = 500;
double time_ds = 0, time_sd = 0;
std::cout<<GridLogMessage << "Benchmarking single<->double original implementation (fields initially device-resident)" << std::endl;
for(int i=0;i<N;i++){
//We want to benchmark the typical scenario of both fields being device resident
//To do this, invoke an operation that will open a device view and touch all sites
//with a write operation that invalidates the CPU copy
field_d = tmp_d;
field_f = tmp_f;
double start=usecond();
precisionChangeOrig(field_d,field_f);
double stop=usecond();
time_sd += stop - start;
field_d = tmp_d;
field_f = tmp_f;
start=usecond();
precisionChangeOrig(field_f,field_d);
stop=usecond();
time_ds += stop - start;
}
std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
precisionChangeWorkspace wk_sp_to_dp(field_d.Grid(),field_f.Grid());
precisionChangeWorkspace wk_dp_to_sp(field_f.Grid(),field_d.Grid());
std::cout<<GridLogMessage << "Benchmarking single<->double with pregenerated workspace(fields initially device-resident)" << std::endl;
time_sd = time_ds = 0;
for(int i=0;i<N;i++){
field_d = tmp_d;
field_f = tmp_f;
double start=usecond();
precisionChange(field_d,field_f, wk_sp_to_dp);
double stop=usecond();
time_sd += stop - start;
field_d = tmp_d;
field_f = tmp_f;
start=usecond();
precisionChange(field_f,field_d, wk_dp_to_sp);
stop=usecond();
time_ds += stop - start;
}
std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
std::cout<<GridLogMessage << "Benchmarking single<->double with workspace generated on-the-fly (fields initially device-resident)" << std::endl;
time_sd = time_ds = 0;
for(int i=0;i<N;i++){
field_d = tmp_d;
field_f = tmp_f;
double start=usecond();
precisionChange(field_d,field_f);
double stop=usecond();
time_sd += stop - start;
field_d = tmp_d;
field_f = tmp_f;
start=usecond();
precisionChange(field_f,field_d);
stop=usecond();
time_ds += stop - start;
}
std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
std::cout<<GridLogMessage << "Benchmarking single<->double2 (fields initially device-resident)" << std::endl;
time_sd = time_ds = 0;
for(int i=0;i<N;i++){
field_d2 = tmp_d2;
field_f = tmp_f;
double start=usecond();
precisionChangeFast(field_d2,field_f);
double stop=usecond();
time_sd += stop - start;
field_d2 = tmp_d2;
field_f = tmp_f;
start=usecond();
precisionChangeFast(field_f,field_d2);
stop=usecond();
time_ds += stop - start;
}
std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
std::cout<<GridLogMessage << "Benchmarking single<->double2 through standard precisionChange call(fields initially device-resident) [NB: perf should be the same as the previous test!]" << std::endl;
time_sd = time_ds = 0;
for(int i=0;i<N;i++){
field_d2 = tmp_d2;
field_f = tmp_f;
double start=usecond();
precisionChange(field_d2,field_f);
double stop=usecond();
time_sd += stop - start;
field_d2 = tmp_d2;
field_f = tmp_f;
start=usecond();
precisionChange(field_f,field_d2);
stop=usecond();
time_ds += stop - start;
}
std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
Grid_finalize();
}

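This benchmark compares three conversion paths: precisionChangeOrig, precisionChange with a pregenerated precisionChangeWorkspace, and precisionChangeFast between layout-matched types. A short usage sketch of the workspace path, with the argument order as in the file (destination field first, so each workspace is built from the destination and source grids in that order):

// Build once, reuse for every conversion between the same pair of grids.
precisionChangeWorkspace wk_sp_to_dp(field_d.Grid(), field_f.Grid()); // single -> double
precisionChangeWorkspace wk_dp_to_sp(field_f.Grid(), field_d.Grid()); // double -> single
for(int i=0;i<N;i++){
  precisionChange(field_d, field_f, wk_sp_to_dp); // up-convert
  precisionChange(field_f, field_d, wk_dp_to_sp); // down-convert
}
// For layout-matched pairs (LatticeFermionD2 <-> LatticeFermionF) the file
// also times precisionChangeFast, which the standard precisionChange call
// is expected to match (see the final test above).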
View File

@@ -53,8 +53,8 @@ int main (int argc, char ** argv)
pRNG.SeedFixedIntegers(seeds);
// pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
typedef typename ImprovedStaggeredFermionR::FermionField FermionField;
typename ImprovedStaggeredFermionR::ImplParams params;
typedef typename ImprovedStaggeredFermionD::FermionField FermionField;
typename ImprovedStaggeredFermionD::ImplParams params;
FermionField src (&Grid); random(pRNG,src);
FermionField result(&Grid); result=Zero();
@@ -93,7 +93,7 @@ int main (int argc, char ** argv)
RealD c1=9.0/8.0;
RealD c2=-1.0/24.0;
RealD u0=1.0;
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params);
ImprovedStaggeredFermionD Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params);
std::cout<<GridLogMessage << "Calling Ds"<<std::endl;
int ncall=1000;

View File

@@ -146,16 +146,15 @@ int main (int argc, char ** argv)
ref = -0.5*ref;
RealD mass=0.1;
typename WilsonFermionR::ImplParams params;
typename WilsonFermionD::ImplParams params;
WilsonFermionR Dw(Umu,Grid,RBGrid,mass,params);
WilsonFermionD Dw(Umu,Grid,RBGrid,mass,params);
std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
int ncall=1000;
//int ncall=1;
// Counters
Dw.ZeroCounters();
Grid.Barrier();
double t0=usecond();
@@ -201,7 +200,6 @@ int main (int argc, char ** argv)
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
Dw.Report();
// guard
double err0 = norm2(err);

View File

@@ -40,21 +40,21 @@ Gamma::Algebra Gmu [] = {
void bench_wilson (
LatticeFermion & src,
LatticeFermion & result,
WilsonFermionR & Dw,
WilsonFermionD & Dw,
double const volume,
int const dag );
void bench_wilson_eo (
LatticeFermion & src,
LatticeFermion & result,
WilsonFermionR & Dw,
WilsonFermionD & Dw,
double const volume,
int const dag );
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
typename WilsonFermionR::ImplParams params;
typename WilsonFermionD::ImplParams params;
Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
@@ -66,7 +66,7 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Number of colours "<< Nc <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking WilsonFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Benchmarking WilsonFermionD::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
@@ -110,7 +110,7 @@ int main (int argc, char ** argv)
double volume = std::accumulate(latt_size.begin(),latt_size.end(),1,std::multiplies<int>());
WilsonFermionR Dw(Umu,Grid,RBGrid,mass,params);
WilsonFermionD Dw(Umu,Grid,RBGrid,mass,params);
// Full operator
bench_wilson(src,result,Dw,volume,DaggerNo);
@@ -130,7 +130,7 @@ int main (int argc, char ** argv)
void bench_wilson (
LatticeFermion & src,
LatticeFermion & result,
WilsonFermionR & Dw,
WilsonFermionD & Dw,
double const volume,
int const dag )
{
@@ -149,7 +149,7 @@ void bench_wilson (
void bench_wilson_eo (
LatticeFermion & src,
LatticeFermion & result,
WilsonFermionR & Dw,
WilsonFermionD & Dw,
double const volume,
int const dag )
{