mirror of https://github.com/paboyle/Grid.git synced 2025-12-02 20:34:44 +00:00

Compare commits

...

30 Commits

Author SHA1 Message Date
Peter Boyle 95b640cb6b 10TF/s on 32^3 x 64 on single node 2022-08-04 15:43:52 -04:00
Peter Boyle 2cb5bedc15 Copy stream HIP improvements 2022-08-04 15:24:03 -04:00
Peter Boyle 806b02bddf Simplify dead code 2022-08-04 15:23:13 -04:00
Peter Boyle de40395773 More timing. Think I should start to use nvtx and rocmtx ?? 2022-08-04 13:37:16 -04:00
Peter Boyle 7ba4788715 Fix 2022-08-04 13:36:44 -04:00
Peter Boyle 06d9ce1a02 Synch ranks on node here for GPU - GPU memcopy 2022-08-04 13:35:56 -04:00
Peter Boyle 75bb6b2b40 Move barrier into the StencilSend begin routine 2022-08-04 13:35:26 -04:00
Peter Boyle 74f10c2dc0 Move barrier into Stencil Send 2022-08-04 13:34:11 -04:00
Peter Boyle a93d5459d4 Better mpi request completion 2022-07-28 12:18:35 -04:00
Peter Boyle 9c21add0c6 High res timer replaces getttimeofday 2022-07-28 12:14:03 -04:00
Peter Boyle 639aab6563 High res timer instead of gettimeofday 2022-07-28 12:13:35 -04:00
Peter Boyle 8137cc7049 Allways concurrent comms 2022-07-28 12:01:51 -04:00
Peter Boyle 60e63dca1d Add memory logging channel 2022-07-28 11:39:15 -04:00
Peter Boyle 486409574e Expanded cach to avoid any allocs in HMC 2022-07-28 11:38:34 -04:00
Peter Boyle a913b8be12 Dslash self timing. Might want to not have this 2022-07-28 11:37:55 -04:00
Peter Boyle 2239751850 Better logging 2022-07-28 11:37:36 -04:00
Peter Boyle 9b20f1449c Better timing 2022-07-28 11:37:12 -04:00
Peter Boyle b99453083d Updated timing 2022-07-28 11:37:02 -04:00
Peter Boyle 943fbb914d Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-07-11 13:48:42 -04:00
Peter Boyle ca4603580d Verbose 2022-07-11 13:48:35 -04:00
Peter Boyle f73db8f1f3 Synch clocks 2022-07-11 13:47:39 -04:00
Peter Boyle f7217d12d2 World barrier for clock synch 2022-07-11 13:45:31 -04:00
Peter Boyle fab50c57d9 More loggin 2022-07-11 18:42:27 +01:00
Peter Boyle 3440534fbf MixedPrec support 2022-07-10 21:35:18 +01:00
Peter Boyle 177b1a7ec6 Mixed prec 2022-07-10 21:34:10 +01:00
Peter Boyle 58182fe345 Different approach to default dirichlet params 2022-07-10 21:32:58 +01:00
Peter Boyle 1f907d330d Different default params for dirichlet 2022-07-10 21:31:48 +01:00
Peter Boyle b0fe664e9d Better force log info 2022-07-10 21:31:25 +01:00
Peter Boyle c0f8482402 Remove SSC marks 2022-07-07 17:49:36 +01:00
Peter Boyle 3544965f54 Stream doesn't work 2022-07-07 17:49:20 +01:00
25 changed files with 673 additions and 134 deletions

View File

@@ -117,6 +117,7 @@ public:
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
RealD usecs = -usecond();
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
@@ -166,14 +167,16 @@ public:
// Stopping condition
if (cp <= rsq) {
usecs +=usecond();
SolverTimer.Stop();
Linop.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
GridBase *grid = src.Grid();
RealD DwfFlops = (1452. )*grid->gSites()*4*k
+ (8+4+8+4+4)*12*grid->gSites()*k; // CG linear algebra
RealD srcnorm = std::sqrt(norm2(src));
RealD resnorm = std::sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k
<< "\tComputed residual " << std::sqrt(cp / ssq)
<< "\tTrue residual " << true_residual
@@ -187,6 +190,8 @@ public:
std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
IterationsToComplete = k;
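
// A small standalone check (not Grid code) of the flop model used in the print above:
// 1452 flops per site for each Mobius operator application, a factor of four per CG
// iteration, plus (8+4+8+4+4) = 28 flops per complex number times 12 complex numbers
// per site of CG linear algebra. The volume and iteration count below are illustrative.
#include <cstdio>
int main(void) {
  const double gsites     = 32.*32.*32.*64.;   // assumed global site count
  const int    iterations = 100;               // assumed CG iteration count
  double DwfFlops = 1452.*gsites*4*iterations
                  + (8+4+8+4+4)*12*gsites*iterations;  // CG linear algebra
  std::printf("model: %.3g flops total, %.3g flops per iteration\n",
              DwfFlops, DwfFlops/iterations);
  return 0;
}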

View File

@@ -40,7 +40,7 @@ void MemoryManager::PrintBytes(void)
//////////////////////////////////////////////////////////////////////
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
int MemoryManager::Victim[MemoryManager::NallocType];
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 };
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 8, 16, 8, 16 };
uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
//////////////////////////////////////////////////////////////////////
// Actual allocation and deallocation utils
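
// A generic sketch of the kind of per-type allocation cache the Entries/Victim/Ncache
// arrays above suggest: freed blocks are parked in a small fixed-size cache and reused on
// the next request of the same size, with a round-robin victim slot evicted when the cache
// is full. This illustrates the technique only; it is not Grid's MemoryManager. Enlarging
// Ncache, as the hunk does, simply lets more blocks be retained between uses.
#include <cstdlib>
#include <cstddef>

struct CacheEntry { void *ptr; std::size_t bytes; bool valid; };

static const int  Ncache = 16;             // cf. MemoryManager::Ncache[type]
static CacheEntry Entries[Ncache];         // cf. MemoryManager::Entries[type][...]
static int        Victim = 0;              // cf. MemoryManager::Victim[type]

void *CachedAllocate(std::size_t bytes) {
  for (int e = 0; e < Ncache; e++) {       // reuse a parked block of matching size
    if (Entries[e].valid && Entries[e].bytes == bytes) {
      Entries[e].valid = false;
      return Entries[e].ptr;
    }
  }
  return std::malloc(bytes);               // otherwise perform a real allocation
}

void CachedFree(void *ptr, std::size_t bytes) {
  for (int e = 0; e < Ncache; e++) {       // park in an empty slot if one exists
    if (!Entries[e].valid) { Entries[e] = {ptr, bytes, true}; return; }
  }
  std::free(Entries[Victim].ptr);          // cache full: evict the round-robin victim
  Entries[Victim] = {ptr, bytes, true};
  Victim = (Victim + 1) % Ncache;
}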

View File

@@ -3,8 +3,14 @@
#warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid);
//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
#define dprintf(...)
#define MAXLINE 512
static char print_buffer [ MAXLINE ];
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
//#define dprintf(...) printf (__VA_ARGS__ ); fflush(stdout);
#define dprintf(...)
////////////////////////////////////////////////////////////
@@ -104,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
///////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
mprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
@@ -126,7 +132,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
///////////////////////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
mprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
if(AccCache.state==AccDirty) {
@@ -150,7 +156,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
assert(AccCache.AccPtr!=(uint64_t)NULL);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
mprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
DeviceToHostBytes+=AccCache.bytes;
DeviceToHostXfer++;
AccCache.state=Consistent;
@@ -165,7 +171,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes;
}
dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
mprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
HostToDeviceBytes+=AccCache.bytes;
HostToDeviceXfer++;
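
// A self-contained sketch of the pattern the mprintf macro above relies on: snprintf into a
// fixed buffer, then push the formatted text through a C++ stream, so printf-style call
// sites can feed an iostream-based logging channel (GridLogMemory in the hunk; std::cout
// here). The do/while wrapper is added here for statement safety and is not in the original.
#include <cstdio>
#include <cstdint>
#include <iostream>

#define MAXLINE 512
static char print_buffer[MAXLINE];
#define mprintf(...) do {                             \
    snprintf(print_buffer, MAXLINE, __VA_ARGS__);     \
    std::cout << print_buffer;                        \
  } while (0)

int main(void) {
  std::uint64_t CpuPtr = 0x1000, AccPtr = 0x2000;     // illustrative addresses
  mprintf("MemoryManager: example transfer %llx -> %llx\n",
          (unsigned long long)AccPtr, (unsigned long long)CpuPtr);
  return 0;
}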

View File

@@ -107,6 +107,7 @@ public:
////////////////////////////////////////////////////////////////////////////////
static int RankWorld(void) ;
static void BroadcastWorld(int root,void* data, int bytes);
static void BarrierWorld(void);
////////////////////////////////////////////////////////////
// Reduction

View File

@@ -396,17 +396,17 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
}
}
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
this->StencilSendToRecvFromComplete(list,dir);
list.resize(0);
}
/* if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
* this->StencilSendToRecvFromComplete(list,dir);
* list.resize(0);
* }
*/
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
// std::cout << "Copy Synchronised\n"<<std::endl;
acceleratorCopySynchronise();
StencilBarrier();// Synch shared memory on a single nodes
int nreq=list.size();
@@ -443,6 +443,10 @@ int CartesianCommunicator::RankWorld(void){
MPI_Comm_rank(communicator_world,&r);
return r;
}
void CartesianCommunicator::BarrierWorld(void){
int ierr = MPI_Barrier(communicator_world);
assert(ierr==0);
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
int ierr= MPI_Bcast(data,
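
// A minimal MPI sketch (outside Grid) of what the new BarrierWorld() is used for in this
// changeset: synchronising every rank on MPI_COMM_WORLD before (re)starting a stopwatch,
// so later per-rank timestamps share a common origin (see the Grid_init hunk further down).
#include <mpi.h>
#include <chrono>
#include <cstdio>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  MPI_Barrier(MPI_COMM_WORLD);                            // all ranks leave together
  auto t0 = std::chrono::high_resolution_clock::now();    // synchronised time origin

  // ... work ...

  auto dt = std::chrono::duration_cast<std::chrono::microseconds>(
              std::chrono::high_resolution_clock::now() - t0).count();
  std::printf("rank %d: %lld us since synchronised start\n", rank, (long long)dt);
  MPI_Finalize();
  return 0;
}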

View File

@@ -104,6 +104,7 @@ int CartesianCommunicator::RankWorld(void){return 0;}
void CartesianCommunicator::Barrier(void){}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
void CartesianCommunicator::BarrierWorld(void) { }
int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) { return 0;}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){ coor = _processor_coor; }
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)

View File

@@ -68,6 +68,7 @@ GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
GridLogger GridLogMemory (1, "Memory", GridLogColours, "NORMAL");
GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE");
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
GridLogger GridLogDslash (1, "Dslash", GridLogColours, "BLUE");
GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE");
GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE");
@@ -80,6 +81,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogIterative.Active(0);
GridLogDebug.Active(0);
GridLogPerformance.Active(0);
GridLogDslash.Active(0);
GridLogIntegrator.Active(1);
GridLogColours.Active(0);
GridLogHMC.Active(1);
@@ -91,6 +93,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
if (logstreams[i] == std::string("Dslash")) GridLogDslash.Active(1);
if (logstreams[i] == std::string("NoIntegrator")) GridLogIntegrator.Active(0);
if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0);
if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);

View File

@@ -138,7 +138,8 @@ public:
stream << std::setw(log.topWidth);
}
stream << log.topName << log.background()<< " : ";
stream << log.colour() << std::left;
// stream << log.colour() << std::left;
stream << std::left;
if (log.chanWidth > 0)
{
stream << std::setw(log.chanWidth);
@@ -153,9 +154,9 @@ public:
stream << log.evidence()
<< now << log.background() << " : " ;
}
stream << log.colour();
// stream << log.colour();
stream << std::right;
stream.flags(f);
return stream;
} else {
return devnull;
@@ -180,6 +181,7 @@ extern GridLogger GridLogWarning;
extern GridLogger GridLogMessage;
extern GridLogger GridLogDebug ;
extern GridLogger GridLogPerformance;
extern GridLogger GridLogDslash;
extern GridLogger GridLogIterative ;
extern GridLogger GridLogIntegrator ;
extern GridLogger GridLogHMC;

View File

@@ -27,10 +27,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/perfmon/PerfCount.h>
#include <Grid/perfmon/Timer.h>
#include <Grid/perfmon/PerfCount.h>
NAMESPACE_BEGIN(Grid);
GridTimePoint theProgramStart = GridClock::now();
#define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
#define RawConfig(A,B) (A<<8|B)
const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = {

View File

@@ -35,17 +35,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid)
// Dress the output; use std::chrono
// C++11 time facilities better?
inline double usecond(void) {
struct timeval tv;
tv.tv_sec = 0;
tv.tv_usec = 0;
gettimeofday(&tv,NULL);
return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
}
typedef std::chrono::system_clock GridClock;
//typedef std::chrono::system_clock GridClock;
typedef std::chrono::high_resolution_clock GridClock;
typedef std::chrono::time_point<GridClock> GridTimePoint;
typedef std::chrono::seconds GridSecs;
@@ -53,6 +44,15 @@ typedef std::chrono::milliseconds GridMillisecs;
typedef std::chrono::microseconds GridUsecs;
typedef std::chrono::microseconds GridTime;
extern GridTimePoint theProgramStart;
// Dress the output; use std::chrono
// C++11 time facilities better?
inline double usecond(void) {
auto usecs = std::chrono::duration_cast<GridUsecs>(GridClock::now()-theProgramStart);
return 1.0*usecs.count();
}
inline std::ostream& operator<< (std::ostream & stream, const GridSecs & time)
{
stream << time.count()<<" s";
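
// A self-contained sketch of the timing change above: microseconds are now taken from
// std::chrono::high_resolution_clock relative to a program-start time point rather than
// from gettimeofday(). This reproduces the shape of the new usecond(), not Grid's exact code.
#include <chrono>
#include <cstdio>

typedef std::chrono::high_resolution_clock Clock;
typedef std::chrono::time_point<Clock>     TimePoint;
typedef std::chrono::microseconds          Usecs;

static TimePoint theStart = Clock::now();                 // analogue of theProgramStart

inline double usecond_sketch(void) {
  auto usecs = std::chrono::duration_cast<Usecs>(Clock::now() - theStart);
  return 1.0 * usecs.count();
}

int main(void) {
  double t = -usecond_sketch();                           // start
  for (volatile int i = 0; i < 1000000; i++) {}           // some work
  t += usecond_sketch();                                  // stop
  std::printf("elapsed %.1f us\n", t);
  return 0;
}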

View File

@@ -42,6 +42,8 @@ public:
bool is_smeared = false;
RealD deriv_norm_sum;
RealD deriv_max_sum;
RealD Fdt_norm_sum;
RealD Fdt_max_sum;
int deriv_num;
RealD deriv_us;
RealD S_us;
@@ -51,12 +53,17 @@ public:
deriv_num=0;
deriv_norm_sum = deriv_max_sum=0.0;
}
void deriv_log(RealD nrm, RealD max) { deriv_max_sum+=max; deriv_norm_sum+=nrm; deriv_num++;}
RealD deriv_max_average(void) { return deriv_max_sum/deriv_num; };
RealD deriv_norm_average(void) { return deriv_norm_sum/deriv_num; };
void deriv_log(RealD nrm, RealD max,RealD Fdt_nrm,RealD Fdt_max) {
deriv_max_sum+=max; deriv_norm_sum+=nrm;
Fdt_max_sum+=Fdt_max; Fdt_norm_sum+=Fdt_nrm; deriv_num++;
}
RealD deriv_max_average(void) { return deriv_max_sum/deriv_num; };
RealD deriv_norm_average(void) { return deriv_norm_sum/deriv_num; };
RealD Fdt_max_average(void) { return Fdt_max_sum/deriv_num; };
RealD Fdt_norm_average(void) { return Fdt_norm_sum/deriv_num; };
RealD deriv_timer(void) { return deriv_us; };
RealD S_timer(void) { return deriv_us; };
RealD refresh_timer(void) { return deriv_us; };
RealD S_timer(void) { return S_us; };
RealD refresh_timer(void) { return refresh_us; };
void deriv_timer_start(void) { deriv_us-=usecond(); }
void deriv_timer_stop(void) { deriv_us+=usecond(); }
void refresh_timer_start(void) { refresh_us-=usecond(); }
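
// A minimal sketch of the running-average bookkeeping extended above: per-call force norms
// and maxima are accumulated and divided by the call count on readout, and the new Fdt
// entries record the same quantities scaled by the step size (the impulse logged from the
// Integrator hunk further down). Field names mirror the header, but this is an illustration
// rather than the Grid class; the step size and force values are invented.
#include <cstdio>

struct ForceLog {
  double norm_sum = 0, max_sum = 0, Fdt_norm_sum = 0, Fdt_max_sum = 0;
  int    num = 0;
  void log(double nrm, double mx, double Fdt_nrm, double Fdt_mx) {
    norm_sum += nrm;  max_sum += mx;
    Fdt_norm_sum += Fdt_nrm;  Fdt_max_sum += Fdt_mx;
    num++;
  }
  double max_average(void)      const { return max_sum / num; }
  double norm_average(void)     const { return norm_sum / num; }
  double Fdt_max_average(void)  const { return Fdt_max_sum / num; }
  double Fdt_norm_average(void) const { return Fdt_norm_sum / num; }
};

int main(void) {
  ForceLog fl;
  double ep = 0.05;                                       // assumed integrator step size
  for (int call = 0; call < 4; call++) {
    double force_norm = 10.0 + call, force_max = 2.0 + call;
    fl.log(force_norm, force_max, force_norm * ep, force_max * ep);
  }
  std::printf("force max avg %.3f, Fdt max avg %.3f over %d calls\n",
              fl.max_average(), fl.Fdt_max_average(), fl.num);
  return 0;
}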

View File

@@ -39,7 +39,7 @@ struct GparityWilsonImplParams {
Coordinate twists;
//mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
Coordinate dirichlet; // Blocksize of dirichlet BCs
GparityWilsonImplParams() : twists(Nd, 0), dirichlet(Nd, 0) {};
GparityWilsonImplParams() : twists(Nd, 0) { dirichlet.resize(0); };
};
struct WilsonImplParams {
@@ -48,13 +48,13 @@ struct WilsonImplParams {
AcceleratorVector<Real,Nd> twist_n_2pi_L;
AcceleratorVector<Complex,Nd> boundary_phases;
WilsonImplParams() {
dirichlet.resize(Nd,0);
dirichlet.resize(0);
boundary_phases.resize(Nd, 1.0);
twist_n_2pi_L.resize(Nd, 0.0);
};
WilsonImplParams(const AcceleratorVector<Complex,Nd> phi) : boundary_phases(phi), overlapCommsCompute(false) {
twist_n_2pi_L.resize(Nd, 0.0);
dirichlet.resize(Nd,0);
dirichlet.resize(0);
}
};
@@ -62,7 +62,7 @@ struct StaggeredImplParams {
Coordinate dirichlet; // Blocksize of dirichlet BCs
StaggeredImplParams()
{
dirichlet.resize(Nd,0);
dirichlet.resize(0);
};
};
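
// A tiny sketch of the default convention these hunks move to: an empty dirichlet vector
// means "no Dirichlet blocking requested", and the stencil constructor (see the Stencil.h
// hunk below) expands an empty vector to Nd zeros before use. Illustrative only.
#include <vector>
#include <cstdio>

int main(void) {
  const int Nd = 4;
  std::vector<int> dirichlet;                             // new default: empty => no blocking
  bool requested = (dirichlet.size() != 0);
  if (!requested) dirichlet.resize(Nd, 0);                // expand to "block size 0 everywhere"
  std::printf("dirichlet requested: %d, size %zu\n", (int)requested, dirichlet.size());
  return 0;
}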

View File

@@ -400,7 +400,6 @@ public:
}
this->face_table_computed=1;
assert(this->u_comm_offset==this->_unified_buffer_size);
accelerator_barrier();
}
};

View File

@@ -233,10 +233,10 @@ void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
GaugeField HUmu(_Umu.Grid());
HUmu = _Umu*(-0.5);
if ( Dirichlet ) {
std::cout << GridLogMessage << " Dirichlet BCs 5d " <<Block<<std::endl;
std::cout << GridLogDslash << " Dirichlet BCs 5d " <<Block<<std::endl;
Coordinate GaugeBlock(Nd);
for(int d=0;d<Nd;d++) GaugeBlock[d] = Block[d+1];
std::cout << GridLogMessage << " Dirichlet BCs 4d " <<GaugeBlock<<std::endl;
std::cout << GridLogDslash << " Dirichlet BCs 4d " <<GaugeBlock<<std::endl;
DirichletFilter<GaugeField> Filter(GaugeBlock);
Filter.applyFilter(HUmu);
}
@@ -382,12 +382,14 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
DhopTotalTime-=usecond();
// std::cout << GridLogDslash<<"Dhop internal"<<std::endl;
DhopTotalTime=-usecond();
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
else
DhopInternalSerialComms(st,lo,U,in,out,dag);
DhopTotalTime+=usecond();
// std::cout << GridLogDslash<<"Dhop took"<<DhopTotalTime<<std::endl;
}
@@ -404,53 +406,59 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
/////////////////////////////
// Start comms // Gather intranode and extra node differentiated??
/////////////////////////////
DhopFaceTime-=usecond();
DhopFaceTime=-usecond();
st.HaloExchangeOptGather(in,compressor);
DhopFaceTime+=usecond();
// std::cout << GridLogDslash<< " Dhop Gather end "<< DhopFaceTime<<" us " <<std::endl;
DhopCommTime -=usecond();
DhopCommTime =-usecond();
std::vector<std::vector<CommsRequest_t> > requests;
st.CommunicateBegin(requests);
/////////////////////////////
// Overlap with comms
/////////////////////////////
DhopFaceTime-=usecond();
DhopFaceTime=-usecond();
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
DhopFaceTime+=usecond();
// std::cout << GridLogDslash<< " Dhop Commsmerge end "<<DhopFaceTime<< " us "<<std::endl;
/////////////////////////////
// do the compute interior
/////////////////////////////
int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
DhopComputeTime-=usecond();
DhopComputeTime=-usecond();
if (dag == DaggerYes) {
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
} else {
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
}
DhopComputeTime+=usecond();
// std::cout << GridLogDslash<< " Dhop Compute 1 end "<< DhopComputeTime<<" us" <<std::endl;
/////////////////////////////
// Complete comms
/////////////////////////////
st.CommunicateComplete(requests);
DhopCommTime +=usecond();
// std::cout << GridLogDslash<< " Dhop Comunicate end "<< DhopCommTime << " us" <<std::endl;
/////////////////////////////
// do the compute exterior
/////////////////////////////
DhopFaceTime-=usecond();
DhopFaceTime=-usecond();
st.CommsMerge(compressor);
DhopFaceTime+=usecond();
// std::cout << GridLogDslash<< " Dhop CommsMerge2 end "<<DhopFaceTime << " us "<<std::endl;
DhopComputeTime2-=usecond();
DhopComputeTime2=-usecond();
if (dag == DaggerYes) {
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
} else {
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
}
DhopComputeTime2+=usecond();
// std::cout << GridLogDslash<< " Dhop Ext end "<<DhopComputeTime2 <<"us "<<std::endl;
}
@@ -463,12 +471,14 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
Compressor compressor(dag);
int LLs = in.Grid()->_rdimensions[0];
DhopCommTime-=usecond();
// std::cout << GridLogDslash<< " Dhop Halo exchange begine " <<std::endl;
DhopCommTime=-usecond();
st.HaloExchangeOpt(in,compressor);
DhopCommTime+=usecond();
// std::cout << GridLogDslash<< " Dhop Comms end "<<DhopCommTime<<" us"<<std::endl;
DhopComputeTime-=usecond();
DhopComputeTime=-usecond();
int Opt = WilsonKernelsStatic::Opt;
if (dag == DaggerYes) {
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
@@ -476,6 +486,7 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
}
DhopComputeTime+=usecond();
// std::cout << GridLogDslash<< " Dhop Compute end "<<DhopComputeTime<<" us" <<std::endl;
}
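
// Note the change above from "Timer -= usecond()" to "Timer = -usecond()": the former
// accumulates a running total across calls, the latter resets so the variable holds only the
// most recent call's duration. A small illustration of the two behaviours (names invented):
#include <chrono>
#include <cstdio>

static double usec(void) {
  using namespace std::chrono;
  static auto t0 = high_resolution_clock::now();
  return duration_cast<microseconds>(high_resolution_clock::now() - t0).count();
}

int main(void) {
  double accumulated = 0, last = 0;
  for (int call = 0; call < 3; call++) {
    accumulated -= usec();                                // old style: running total
    last         = -usec();                               // new style: this call only
    for (volatile int i = 0; i < 100000; i++) {}          // some work
    accumulated += usec();
    last        += usec();
  }
  std::printf("accumulated %.1f us over 3 calls, last call %.1f us\n", accumulated, last);
  return 0;
}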

View File

@@ -416,19 +416,6 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
#undef LoopBody
}
#define KERNEL_CALL_TMP(A) \
const uint64_t NN = Nsite*Ls; \
auto U_p = & U_v[0]; \
auto in_p = & in_v[0]; \
auto out_p = & out_v[0]; \
auto st_p = st_v._entries_p; \
auto st_perm = st_v._permute_type; \
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
int sF = ss; \
int sU = ss/Ls; \
WilsonKernels<Impl>::A(st_perm,st_p,U_p,buf,sF,sU,in_p,out_p); \
}); \
accelerator_barrier();
#define KERNEL_CALLNB(A) \
const uint64_t NN = Nsite*Ls; \
@@ -448,8 +435,7 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
int sF = ptr[ss]; \
int sU = ss/Ls; \
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
}); \
accelerator_barrier();
});
#define ASM_CALL(A) \
thread_for( ss, Nsite, { \
@@ -471,7 +457,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if( interior && exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
#ifdef SYCL_HACK
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_TMP(HandDhopSiteSycl); return; }
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteSycl); return; }
#else
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
#endif

View File

@@ -67,6 +67,36 @@ NAMESPACE_BEGIN(Grid);
virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}
};
template<class Impl,class ImplF>
class OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction : public GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF> {
public:
typedef OneFlavourRationalParams Params;
private:
static RationalActionParams transcribe(const Params &in){
RationalActionParams out;
out.inv_pow = 2;
out.lo = in.lo;
out.hi = in.hi;
out.MaxIter = in.MaxIter;
out.action_tolerance = out.md_tolerance = in.tolerance;
out.action_degree = out.md_degree = in.degree;
out.precision = in.precision;
out.BoundsCheckFreq = in.BoundsCheckFreq;
return out;
}
public:
OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction(FermionOperator<Impl> &_NumOp,
FermionOperator<Impl> &_DenOp,
FermionOperator<ImplF> &_NumOpF,
FermionOperator<ImplF> &_DenOpF,
const Params & p, Integer ReliableUpdateFreq
) :
GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF>(_NumOp, _DenOp,_NumOpF, _DenOpF, transcribe(p),ReliableUpdateFreq){}
virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -153,7 +153,7 @@ protected:
Real force_max = std::sqrt(maxLocalNorm2(force));
Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR;
as[level].actions.at(a)->deriv_log(force_abs,force_max);
as[level].actions.at(a)->deriv_log(force_abs,force_max,impulse_abs,impulse_max);
std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Force average: " << force_abs <<" "<<name<<std::endl;
std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Force max : " << force_max <<" "<<name<<std::endl;
@@ -285,6 +285,8 @@ public:
<<"["<<level<<"]["<< actionID<<"] : "
<<" force max " << as[level].actions.at(actionID)->deriv_max_average()
<<" norm " << as[level].actions.at(actionID)->deriv_norm_average()
<<" Fdt max " << as[level].actions.at(actionID)->Fdt_max_average()
<<" norm " << as[level].actions.at(actionID)->Fdt_norm_average()
<<" calls " << as[level].actions.at(actionID)->deriv_num
<< std::endl;
}

View File

@@ -290,6 +290,8 @@ public:
std::vector<Decompress> DecompressionsSHM;
std::vector<CopyReceiveBuffer> CopyReceiveBuffers ;
std::vector<CachedTransfer> CachedTransfers;
std::vector<CommsRequest_t> MpiReqs;
///////////////////////////////////////////////////////////
// Unified Comms buffers for all directions
///////////////////////////////////////////////////////////
@@ -357,9 +359,9 @@ public:
////////////////////////////////////////////////////////////////////////
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{
reqs.resize(Packets.size());
accelerator_barrier();
for(int i=0;i<Packets.size();i++){
_grid->StencilSendToRecvFromBegin(reqs[i],
_grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send,
Packets[i].recv_buf,
@@ -370,41 +372,19 @@ public:
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
{
for(int i=0;i<Packets.size();i++){
_grid->StencilSendToRecvFromComplete(reqs[i],i);
}
_grid->StencilSendToRecvFromComplete(MpiReqs,0);
}
////////////////////////////////////////////////////////////////////////
// Blocking send and receive. Either sequential or parallel.
////////////////////////////////////////////////////////////////////////
void Communicate(void)
{
if ( CartesianCommunicator::CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySequential ){
/////////////////////////////////////////////////////////
// several way threaded on different communicators.
// Cannot combine with Dirichlet operators
// This scheme is needed on Intel Omnipath for best performance
// Deprecate once there are very few omnipath clusters
/////////////////////////////////////////////////////////
int nthreads = CartesianCommunicator::nCommThreads;
int old = GridThread::GetThreads();
GridThread::SetThreads(nthreads);
thread_for(i,Packets.size(),{
_grid->StencilSendToRecvFrom(Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send,
Packets[i].recv_buf,
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].bytes,i);
});
GridThread::SetThreads(old);
} else {
/////////////////////////////////////////////////////////
// Concurrent and non-threaded asynch calls to MPI
/////////////////////////////////////////////////////////
std::vector<std::vector<CommsRequest_t> > reqs;
this->CommunicateBegin(reqs);
this->CommunicateComplete(reqs);
}
/////////////////////////////////////////////////////////
// Concurrent and non-threaded asynch calls to MPI
/////////////////////////////////////////////////////////
std::vector<std::vector<CommsRequest_t> > reqs;
this->CommunicateBegin(reqs);
this->CommunicateComplete(reqs);
}
template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
@@ -484,7 +464,6 @@ public:
face_table_computed=1;
assert(u_comm_offset==_unified_buffer_size);
accelerator_barrier();
}
/////////////////////////
@@ -499,6 +478,7 @@ public:
Packets.resize(0);
CopyReceiveBuffers.resize(0);
CachedTransfers.resize(0);
MpiReqs.resize(0);
}
void AddCopy(void *from,void * to, Integer bytes)
{
@@ -711,7 +691,9 @@ public:
this->_comms_recv.resize(npoints);
this->same_node.resize(npoints);
if ( p.dirichlet.size() ) DirichletBlock(p.dirichlet); // comms send/recv set up
if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
DirichletBlock(p.dirichlet); // comms send/recv set up
_unified_buffer_size=0;
surface_list.resize(0);
@@ -793,7 +775,6 @@ public:
u_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
u_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
}
PrecomputeByteOffsets();
}
@@ -1105,7 +1086,6 @@ public:
// Gather locally
////////////////////////////////////////////////////////
assert(send_buf!=NULL);
Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,comm_off,so);
}
@@ -1212,8 +1192,9 @@ public:
face_table[face_idx].size()*sizeof(face_table_host[0]));
}
if ( comms_send || comms_recv )
if ( comms_send || comms_recv ) {
Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type);
}
face_idx++;
//spointers[0] -- low
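
// A minimal MPI sketch (outside Grid) of the restructuring above: all non-blocking sends and
// receives are pushed into a single request vector at CommunicateBegin time and completed
// together at CommunicateComplete, rather than keeping one request list per packet. (In the
// hunk, a device barrier before posting also guarantees the gathers have finished.) The
// neighbour pattern, sizes and tags below are illustrative.
#include <mpi.h>
#include <vector>
#include <cstdio>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  int to = (rank + 1) % size, from = (rank - 1 + size) % size;

  const int npackets = 4, nwords = 1024;
  std::vector<double> sendbuf(npackets * nwords, rank), recvbuf(npackets * nwords, 0);
  std::vector<MPI_Request> MpiReqs;                       // one vector for all packets

  for (int p = 0; p < npackets; p++) {                    // "CommunicateBegin": post everything
    MPI_Request rq;
    MPI_Irecv(&recvbuf[p*nwords], nwords, MPI_DOUBLE, from, p, MPI_COMM_WORLD, &rq);
    MpiReqs.push_back(rq);
    MPI_Isend(&sendbuf[p*nwords], nwords, MPI_DOUBLE, to,   p, MPI_COMM_WORLD, &rq);
    MpiReqs.push_back(rq);
  }
  // ... overlapped compute would go here ...
  MPI_Waitall((int)MpiReqs.size(), MpiReqs.data(), MPI_STATUSES_IGNORE); // "CommunicateComplete"
  if (rank == 0) std::printf("all %d packets completed with one Waitall\n", npackets);
  MPI_Finalize();
  return 0;
}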

View File

@@ -1,6 +1,7 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
int world_rank; // Use to control world rank for print guarding
int acceleratorAbortOnGpuError=1;
uint32_t accelerator_threads=2;
uint32_t acceleratorThreads(void) {return accelerator_threads;};
@@ -16,7 +17,7 @@ void acceleratorThreads(uint32_t t) {accelerator_threads = t;};
#ifdef GRID_CUDA
cudaDeviceProp *gpu_props;
cudaStream_t copyStream;
cudaStream_t cpuStream;
cudaStream_t computeStream;
void acceleratorInit(void)
{
int nDevices = 1;
@@ -24,7 +25,8 @@ void acceleratorInit(void)
gpu_props = new cudaDeviceProp[nDevices];
char * localRankStr = NULL;
int rank = 0, world_rank=0;
int rank = 0;
world_rank=0;
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_SLURM )) != NULL) { world_rank = atoi(localRankStr);}
@@ -99,7 +101,7 @@ void acceleratorInit(void)
cudaSetDevice(device);
cudaStreamCreate(&copyStream);
cudaStreamCreate(&cpuStream);
cudaStreamCreate(&computeStream);
const int len=64;
char busid[len];
if( rank == world_rank ) {
@@ -114,7 +116,7 @@ void acceleratorInit(void)
#ifdef GRID_HIP
hipDeviceProp_t *gpu_props;
hipStream_t copyStream;
hipStream_t cpuStream;
hipStream_t computeStream;
void acceleratorInit(void)
{
int nDevices = 1;
@@ -122,7 +124,8 @@ void acceleratorInit(void)
gpu_props = new hipDeviceProp_t[nDevices];
char * localRankStr = NULL;
int rank = 0, world_rank=0;
int rank = 0;
world_rank=0;
// We extract the local rank initialization using an environment variable
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
{
@@ -183,7 +186,7 @@ void acceleratorInit(void)
#endif
hipSetDevice(device);
hipStreamCreate(&copyStream);
hipStreamCreate(&cpuStream);
hipStreamCreate(&computeStream);
const int len=64;
char busid[len];
if( rank == world_rank ) {
@@ -210,7 +213,8 @@ void acceleratorInit(void)
#endif
char * localRankStr = NULL;
int rank = 0, world_rank=0;
int rank = 0;
world_rank=0;
// We extract the local rank initialization using an environment variable
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)

View File

@@ -107,7 +107,7 @@ void acceleratorInit(void);
extern int acceleratorAbortOnGpuError;
extern cudaStream_t copyStream;
extern cudaStream_t cpuStream;
extern cudaStream_t computeStream;
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
@@ -135,7 +135,7 @@ inline void cuda_mem(void)
}; \
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
LambdaApply<<<cu_blocks,cu_threads,0,cpuStream>>>(num1,num2,nsimd,lambda); \
LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
}
#define accelerator_for6dNB(iter1, num1, \
@@ -154,7 +154,7 @@ inline void cuda_mem(void)
}; \
dim3 cu_blocks (num1,num2,num3); \
dim3 cu_threads(num4,num5,num6); \
Lambda6Apply<<<cu_blocks,cu_threads,0,cpuStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
Lambda6Apply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
}
template<typename lambda> __global__
@@ -190,7 +190,7 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
#define accelerator_barrier(dummy) \
{ \
cudaStreamSynchronize(cpuStream); \
cudaStreamSynchronize(computeStream); \
cudaError err = cudaGetLastError(); \
if ( cudaSuccess != err ) { \
printf("accelerator_barrier(): Cuda error %s \n", \
@@ -340,7 +340,7 @@ NAMESPACE_BEGIN(Grid);
#define accelerator_inline __host__ __device__ inline
extern hipStream_t copyStream;
extern hipStream_t cpuStream;
extern hipStream_t computeStream;
/*These routines define mapping from thread grid to loop & vector lane indexing */
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
@@ -362,16 +362,15 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \
hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads, \
0,cpuStream, \
0,computeStream, \
num1,num2,nsimd, lambda); \
} else { \
hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \
0,cpuStream, \
0,computeStream, \
num1,num2,nsimd, lambda); \
} \
}
template<typename lambda> __global__
__launch_bounds__(64,1)
void LambdaApply64(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
@@ -400,7 +399,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
#define accelerator_barrier(dummy) \
{ \
hipStreamSynchronize(cpuStream); \
hipStreamSynchronize(computeStream); \
auto err = hipGetLastError(); \
if ( err != hipSuccess ) { \
printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
@@ -443,7 +442,7 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);
hipMemcpyDtoDAsync(to,from,bytes, copyStream);
}
inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); };
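
// A minimal CUDA sketch of the two-stream arrangement these hunks adopt: kernels launch on a
// dedicated computeStream while device-to-device copies are issued asynchronously on
// copyStream, each completed by its own stream synchronise (the analogues of
// accelerator_barrier() and acceleratorCopySynchronise()). Sizes are illustrative and error
// checking is omitted.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void scale(double *x, double a, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] *= a;
}

int main(void) {
  cudaStream_t copyStream, computeStream;
  cudaStreamCreate(&copyStream);
  cudaStreamCreate(&computeStream);

  const int n = 1 << 20;
  const size_t bytes = n * sizeof(double);
  double *a, *b, *c;
  cudaMalloc((void **)&a, bytes);  cudaMemset(a, 0, bytes);
  cudaMalloc((void **)&b, bytes);  cudaMemset(b, 0, bytes);
  cudaMalloc((void **)&c, bytes);

  scale<<<(n + 255) / 256, 256, 0, computeStream>>>(a, 2.0, n);        // compute on computeStream
  cudaMemcpyAsync(c, b, bytes, cudaMemcpyDeviceToDevice, copyStream);  // independent copy

  cudaStreamSynchronize(copyStream);       // cf. acceleratorCopySynchronise()
  cudaStreamSynchronize(computeStream);    // cf. accelerator_barrier()

  std::printf("compute and copy streams synchronised\n");
  cudaFree(a); cudaFree(b); cudaFree(c);
  return 0;
}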

View File

@@ -356,6 +356,11 @@ void Grid_init(int *argc,char ***argv)
//////////////////////////////////////////////////////////
CartesianCommunicator::Init(argc,argv);
GridLogger::GlobalStopWatch.Stop();
CartesianCommunicator::BarrierWorld();
GridLogger::GlobalStopWatch.Reset();// Back to zero with synchronised clock
GridLogger::GlobalStopWatch.Start();
////////////////////////////////////
// Banner after MPI (unless GPU)
////////////////////////////////////

View File

@@ -128,8 +128,14 @@ template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, c
////////////////////////////////////////////////////////////////////////////////////
// Make a mixed precision conjugate gradient
////////////////////////////////////////////////////////////////////////////////////
MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
#if 1
RealD delta=1.e-4;
std::cout << GridLogMessage << "Calling reliable update Conjugate Gradient" <<std::endl;
ConjugateGradientReliableUpdate<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations*MaxOuterIterations,delta,SinglePrecGrid5,LinOpF,LinOpD);
#else
std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
#endif
MPCG(src,psi);
}
};
@@ -141,6 +147,10 @@ int main(int argc, char **argv) {
using namespace Grid;
Grid_init(&argc, &argv);
CartesianCommunicator::BarrierWorld();
std::cout << GridLogMessage << " Clock skew check" <<std::endl;
int threads = GridThread::GetThreads();
// Typedefs to simplify notation
@@ -161,7 +171,7 @@ int main(int argc, char **argv) {
// MD.name = std::string("Force Gradient");
typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
MD.name = std::string("MinimumNorm2");
MD.MDsteps = 4;
MD.MDsteps = 6;
MD.trajL = 1.0;
HMCparameters HMCparams;
@@ -183,7 +193,7 @@ int main(int argc, char **argv) {
CPparams.saveInterval = 1;
CPparams.format = "IEEE64BIG";
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
std::cout << "loaded NERSC checpointer"<<std::endl;
RNGModuleParameters RNGpar;
RNGpar.serial_seeds = "1 2 3 4 5";
RNGpar.parallel_seeds = "6 7 8 9 10";
@@ -204,7 +214,8 @@ int main(int argc, char **argv) {
Real light_mass = 7.8e-4;
Real strange_mass = 0.02132;
Real pv_mass = 1.0;
std::vector<Real> hasenbusch({ light_mass, 3.8e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass });
// std::vector<Real> hasenbusch({ light_mass, 3.8e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass });
std::vector<Real> hasenbusch({ light_mass, 5e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass });
// FIXME:
// Same in MC and MD
@@ -287,6 +298,7 @@ int main(int argc, char **argv) {
std::cout << GridLogMessage << " Running the HMC "<< std::endl;
TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file
TheHMC.initializeGaugeFieldAndRNGs(U);
std::cout << "loaded NERSC gauge field"<<std::endl;
// These lines are unecessary if BC are all periodic

View File

@@ -0,0 +1,474 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_hmc_EODWFRatio.cc
Copyright (C) 2015-2016
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
NAMESPACE_BEGIN(Grid);
template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class SchurOperatorF>
class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
public:
typedef typename FermionOperatorD::FermionField FieldD;
typedef typename FermionOperatorF::FermionField FieldF;
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;
Integer MaxOuterIterations;
GridBase* SinglePrecGrid4; //Grid for single-precision fields
GridBase* SinglePrecGrid5; //Grid for single-precision fields
RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
FermionOperatorF &FermOpF;
FermionOperatorD &FermOpD;;
SchurOperatorF &LinOpF;
SchurOperatorD &LinOpD;
Integer TotalInnerIterations; //Number of inner CG iterations
Integer TotalOuterIterations; //Number of restarts
Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
MixedPrecisionConjugateGradientOperatorFunction(RealD tol,
Integer maxinnerit,
Integer maxouterit,
GridBase* _sp_grid4,
GridBase* _sp_grid5,
FermionOperatorF &_FermOpF,
FermionOperatorD &_FermOpD,
SchurOperatorF &_LinOpF,
SchurOperatorD &_LinOpD):
LinOpF(_LinOpF),
LinOpD(_LinOpD),
FermOpF(_FermOpF),
FermOpD(_FermOpD),
Tolerance(tol),
InnerTolerance(tol),
MaxInnerIterations(maxinnerit),
MaxOuterIterations(maxouterit),
SinglePrecGrid4(_sp_grid4),
SinglePrecGrid5(_sp_grid5),
OuterLoopNormMult(100.)
{
/* Debugging instances of objects; references are stored
std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpF " <<std::hex<< &LinOpF<<std::dec <<std::endl;
std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpD " <<std::hex<< &LinOpD<<std::dec <<std::endl;
std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpF " <<std::hex<< &FermOpF<<std::dec <<std::endl;
std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpD " <<std::hex<< &FermOpD<<std::dec <<std::endl;
*/
};
void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
// std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpU " <<std::hex<< &(SchurOpU->_Mat)<<std::dec <<std::endl;
// std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpD " <<std::hex<< &(LinOpD._Mat) <<std::dec <<std::endl;
// Assumption made in code to extract gauge field
// We could avoid storing LinopD reference alltogether ?
assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
////////////////////////////////////////////////////////////////////////////////////
// Must snarf a single precision copy of the gauge field in Linop_d argument
////////////////////////////////////////////////////////////////////////////////////
typedef typename FermionOperatorF::GaugeField GaugeFieldF;
typedef typename FermionOperatorF::GaugeLinkField GaugeLinkFieldF;
typedef typename FermionOperatorD::GaugeField GaugeFieldD;
typedef typename FermionOperatorD::GaugeLinkField GaugeLinkFieldD;
GridBase * GridPtrF = SinglePrecGrid4;
GridBase * GridPtrD = FermOpD.Umu.Grid();
GaugeFieldF U_f (GridPtrF);
GaugeLinkFieldF Umu_f(GridPtrF);
// std::cout << " Dim gauge field "<<GridPtrF->Nd()<<std::endl; // 4d
// std::cout << " Dim gauge field "<<GridPtrD->Nd()<<std::endl; // 4d
////////////////////////////////////////////////////////////////////////////////////
// Moving this to a Clone method of fermion operator would allow to duplicate the
// physics parameters and decrease gauge field copies
////////////////////////////////////////////////////////////////////////////////////
GaugeLinkFieldD Umu_d(GridPtrD);
for(int mu=0;mu<Nd*2;mu++){
Umu_d = PeekIndex<LorentzIndex>(FermOpD.Umu, mu);
precisionChange(Umu_f,Umu_d);
PokeIndex<LorentzIndex>(FermOpF.Umu, Umu_f, mu);
}
pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
////////////////////////////////////////////////////////////////////////////////////
// Make a mixed precision conjugate gradient
////////////////////////////////////////////////////////////////////////////////////
#if 1
RealD delta=1.e-4;
std::cout << GridLogMessage << "Calling reliable update Conjugate Gradient" <<std::endl;
ConjugateGradientReliableUpdate<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations*MaxOuterIterations,delta,SinglePrecGrid5,LinOpF,LinOpD);
#else
std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
#endif
MPCG(src,psi);
}
};
NAMESPACE_END(Grid);
int main(int argc, char **argv) {
using namespace Grid;
Grid_init(&argc, &argv);
int threads = GridThread::GetThreads();
// Typedefs to simplify notation
typedef WilsonImplR FermionImplPolicy;
typedef WilsonImplF FermionImplPolicyF;
typedef MobiusFermionR FermionAction;
typedef MobiusFermionF FermionActionF;
typedef typename FermionAction::FermionField FermionField;
typedef typename FermionActionF::FermionField FermionFieldF;
typedef Grid::XmlReader Serialiser;
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
IntegratorParameters MD;
// typedef GenericHMCRunner<LeapFrog> HMCWrapper;
// MD.name = std::string("Leap Frog");
// typedef GenericHMCRunner<ForceGradient> HMCWrapper;
// MD.name = std::string("Force Gradient");
typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
MD.name = std::string("MinimumNorm2");
MD.MDsteps = 6;
MD.trajL = 1.0;
HMCparameters HMCparams;
HMCparams.StartTrajectory = 1077;
HMCparams.Trajectories = 1;
HMCparams.NoMetropolisUntil= 0;
// "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
// HMCparams.StartingType =std::string("ColdStart");
HMCparams.StartingType =std::string("CheckpointStart");
HMCparams.MD = MD;
HMCWrapper TheHMC(HMCparams);
// Grid from the command line arguments --grid and --mpi
TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
CheckpointerParameters CPparams;
CPparams.config_prefix = "ckpoint_DDHMC_lat";
CPparams.rng_prefix = "ckpoint_DDHMC_rng";
CPparams.saveInterval = 1;
CPparams.format = "IEEE64BIG";
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
RNGModuleParameters RNGpar;
RNGpar.serial_seeds = "1 2 3 4 5";
RNGpar.parallel_seeds = "6 7 8 9 10";
TheHMC.Resources.SetRNGSeeds(RNGpar);
// Construct observables
// here there is too much indirection
typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
TheHMC.Resources.AddObservable<PlaqObs>();
//////////////////////////////////////////////
const int Ls = 12;
RealD M5 = 1.8;
RealD b = 1.5;
RealD c = 0.5;
Real beta = 2.31;
// Real light_mass = 5.4e-4;
Real light_mass = 7.8e-4;
Real strange_mass = 0.02132;
Real pv_mass = 1.0;
// std::vector<Real> hasenbusch({ light_mass, 3.8e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass });
std::vector<Real> hasenbusch({ light_mass, 5e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass });
// FIXME:
// Same in MC and MD
// Need to mix precision too
OneFlavourRationalParams SFRp; // Strange
SFRp.lo = 4.0e-3;
SFRp.hi = 90.0;
SFRp.MaxIter = 60000;
SFRp.tolerance= 1.0e-8;
SFRp.mdtolerance= 1.0e-6;
SFRp.degree = 12;
SFRp.precision= 50;
SFRp.BoundsCheckFreq=0;
OneFlavourRationalParams OFRp; // Up/down
OFRp.lo = 2.0e-5;
OFRp.hi = 90.0;
OFRp.MaxIter = 60000;
OFRp.tolerance= 1.0e-8;
OFRp.mdtolerance= 1.0e-6;
// OFRp.degree = 20; converges
// OFRp.degree = 16;
OFRp.degree = 12;
OFRp.precision= 80;
OFRp.BoundsCheckFreq=0;
auto GridPtr = TheHMC.Resources.GetCartesian();
auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
typedef SchurDiagMooeeOperator<FermionActionF,FermionFieldF> LinearOperatorF;
typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusFermionD,MobiusFermionF,LinearOperatorD,LinearOperatorF> MxPCG;
////////////////////////////////////////////////////////////////
// Domain decomposed
////////////////////////////////////////////////////////////////
Coordinate latt4 = GridPtr->GlobalDimensions();
Coordinate mpi = GridPtr->ProcessorGrid();
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi,shm);
Coordinate CommDim(Nd);
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
Coordinate NonDirichlet(Nd+1,0);
Coordinate Dirichlet(Nd+1,0);
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
Coordinate Block4(Nd);
Block4[0] = Dirichlet[1];
Block4[1] = Dirichlet[2];
Block4[2] = Dirichlet[3];
Block4[3] = Dirichlet[4];
int Width=3;
TheHMC.Resources.SetMomentumFilter(new DDHMCFilter<WilsonImplR::Field>(Block4,Width));
//////////////////////////
// Fermion Grids
//////////////////////////
auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
Coordinate simdF = GridDefaultSimd(Nd,vComplexF::Nsimd());
auto GridPtrF = SpaceTimeGrid::makeFourDimGrid(latt4,simdF,mpi);
auto GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF);
auto FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF);
auto FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF);
IwasakiGaugeActionR GaugeAction(beta);
// temporarily need a gauge field
LatticeGaugeField U(GridPtr);
LatticeGaugeFieldF UF(GridPtrF);
std::cout << GridLogMessage << " Running the HMC "<< std::endl;
TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file
TheHMC.initializeGaugeFieldAndRNGs(U);
// These lines are unecessary if BC are all periodic
std::vector<Complex> boundary = {1,1,1,-1};
FermionAction::ImplParams Params(boundary);
Params.dirichlet=NonDirichlet;
FermionAction::ImplParams ParamsDir(boundary);
ParamsDir.dirichlet=Dirichlet;
// double StoppingCondition = 1e-14;
// double MDStoppingCondition = 1e-9;
double StoppingCondition = 1e-10;
double MDStoppingCondition = 1e-7;
double MDStoppingConditionLoose = 1e-6;
double MaxCGIterations = 300000;
ConjugateGradient<FermionField> CG(StoppingCondition,MaxCGIterations);
ConjugateGradient<FermionField> MDCG(MDStoppingCondition,MaxCGIterations);
////////////////////////////////////
// Collect actions
////////////////////////////////////
ActionLevel<HMCWrapper::Field> Level1(1);
ActionLevel<HMCWrapper::Field> Level2(4);
ActionLevel<HMCWrapper::Field> Level3(8);
////////////////////////////////////
// Strange action
////////////////////////////////////
FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params);
FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, Params);
FermionAction StrangeOpDir (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, ParamsDir);
FermionAction StrangePauliVillarsOpDir(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, ParamsDir);
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionBdy(StrangeOpDir,StrangeOp,SFRp);
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionLocal(StrangePauliVillarsOpDir,StrangeOpDir,SFRp);
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionPVBdy(StrangePauliVillarsOp,StrangePauliVillarsOpDir,SFRp);
Level1.push_back(&StrangePseudoFermionBdy); // ok
Level2.push_back(&StrangePseudoFermionLocal);
Level1.push_back(&StrangePseudoFermionPVBdy); //ok
////////////////////////////////////
// up down action
////////////////////////////////////
std::vector<Real> light_den;
std::vector<Real> light_num;
std::vector<int> dirichlet_den;
std::vector<int> dirichlet_num;
int n_hasenbusch = hasenbusch.size();
light_den.push_back(light_mass); dirichlet_den.push_back(0);
for(int h=0;h<n_hasenbusch;h++){
light_den.push_back(hasenbusch[h]); dirichlet_den.push_back(1);
}
for(int h=0;h<n_hasenbusch;h++){
light_num.push_back(hasenbusch[h]); dirichlet_num.push_back(1);
}
light_num.push_back(pv_mass); dirichlet_num.push_back(0);
std::vector<FermionAction *> Numerators;
std::vector<FermionActionF *> NumeratorsF;
std::vector<FermionAction *> Denominators;
std::vector<FermionActionF *> DenominatorsF;
std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
#define MIXED_PRECISION
#ifdef MIXED_PRECISION
std::vector<OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF> *> Bdys;
#else
std::vector<OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
#endif
std::vector<MxPCG *> ActionMPCG;
std::vector<MxPCG *> MPCG;
typedef SchurDiagMooeeOperator<FermionActionF,FermionFieldF> LinearOperatorF;
typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
std::vector<LinearOperatorD *> LinOpD;
std::vector<LinearOperatorF *> LinOpF;
for(int h=0;h<n_hasenbusch+1;h++){
std::cout << GridLogMessage
<< " 2f quotient Action ";
std::cout << "det D("<<light_den[h]<<")";
if ( dirichlet_den[h] ) std::cout << "^dirichlet ";
std::cout << "/ det D("<<light_num[h]<<")";
if ( dirichlet_num[h] ) std::cout << "^dirichlet ";
std::cout << std::endl;
FermionAction::ImplParams ParamsNum(boundary);
FermionAction::ImplParams ParamsDen(boundary);
FermionActionF::ImplParams ParamsNumF(boundary);
FermionActionF::ImplParams ParamsDenF(boundary);
if ( dirichlet_num[h]==1) ParamsNum.dirichlet = Dirichlet;
else ParamsNum.dirichlet = NonDirichlet;
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
else ParamsDen.dirichlet = NonDirichlet;
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
ParamsDenF.dirichlet = ParamsDen.dirichlet;
DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF));
ParamsNumF.dirichlet = ParamsNum.dirichlet;
NumeratorsF.push_back (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumF));
LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
LinOpF.push_back(new LinearOperatorF(*DenominatorsF[h]));
double conv = MDStoppingCondition;
if (h<3) conv= MDStoppingConditionLoose; // Relax on first two hasenbusch factors
const int MX_inner = 5000;
MPCG.push_back(new MxPCG(conv,
MX_inner,
MaxCGIterations,
GridPtrF,
FrbGridF,
*DenominatorsF[h],*Denominators[h],
*LinOpF[h], *LinOpD[h]) );
ActionMPCG.push_back(new MxPCG(StoppingCondition,
MX_inner,
MaxCGIterations,
GridPtrF,
FrbGridF,
*DenominatorsF[h],*Denominators[h],
*LinOpF[h], *LinOpD[h]) );
if(h!=0) {
// Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],MDCG,CG));
Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],CG));
} else {
#ifdef MIXED_PRECISION
Bdys.push_back( new OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF>(
*Numerators[h],*Denominators[h],
*NumeratorsF[h],*DenominatorsF[h],
OFRp, 500) );
Bdys.push_back( new OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF>(
*Numerators[h],*Denominators[h],
*NumeratorsF[h],*DenominatorsF[h],
OFRp, 500) );
#else
Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
#endif
}
}
int nquo=Quotients.size();
Level1.push_back(Bdys[0]);
Level1.push_back(Bdys[1]);
for(int h=0;h<nquo-1;h++){
Level2.push_back(Quotients[h]);
}
Level2.push_back(Quotients[nquo-1]);
/////////////////////////////////////////////////////////////
// Gauge action
/////////////////////////////////////////////////////////////
Level3.push_back(&GaugeAction);
TheHMC.TheAction.push_back(Level1);
TheHMC.TheAction.push_back(Level2);
TheHMC.TheAction.push_back(Level3);
std::cout << GridLogMessage << " Action complete "<< std::endl;
/////////////////////////////////////////////////////////////
TheHMC.Run(); // no smearing
Grid_finalize();
} // main
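
// Worked numbers for the Dirichlet block set-up in the test above, assuming a 32^3 x 64
// lattice on a 2.2.2.2 MPI grid with a 1.1.1.2 ranks-per-node (shm) decomposition -- all
// values illustrative. The block in each direction is the node-local extent
// CommDim[d]*latt4[d]/mpi[d]*shm[d], and is zero in directions without inter-node comms.
#include <cstdio>
int main(void) {
  const int Nd = 4;
  int latt4[Nd] = {32, 32, 32, 64};
  int mpi[Nd]   = { 2,  2,  2,  2};                       // assumed MPI decomposition
  int shm[Nd]   = { 1,  1,  1,  2};                       // assumed ranks per node
  int Dirichlet[Nd+1];  Dirichlet[0] = 0;
  for (int d = 0; d < Nd; d++) {
    int CommDim = (mpi[d]/shm[d]) > 1 ? 1 : 0;            // inter-node comms in this direction?
    Dirichlet[d+1] = CommDim * latt4[d]/mpi[d] * shm[d];  // 16,16,16 here, 0 in time
    std::printf("dir %d: CommDim %d Block %d\n", d, CommDim, Dirichlet[d+1]);
  }
  return 0;
}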

View File

@@ -98,9 +98,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
Dw.Dhop(src,result,0);
__SSC_STOP;
}
double t1=usecond();
FGrid->Barrier();
@@ -141,9 +139,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
DwD.Dhop(src_d,result_d,0);
__SSC_STOP;
}
double t1=usecond();
FGrid_d->Barrier();

View File

@@ -95,26 +95,34 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl;
MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
double t1,t2,flops;
double MdagMsiteflops = 1452; // Mobius (real coeffs)
// CG overhead: 8 inner product, 4+8 axpy_norm, 4+4 linear comb (2 of)
double CGsiteflops = (8+4+8+4+4)*Nc*Ns ;
std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops<<std::endl;
std:: cout << " CG site flops = "<< CGsiteflops <<std::endl;
int iters;
for(int i=0;i<100;i++){
for(int i=0;i<200;i++){
result_o = Zero();
t1=usecond();
mCG(src_o,result_o);
t2=usecond();
iters = mCG.TotalInnerIterations; //Number of inner CG iterations
flops = 1320.0*2*FGrid->gSites()*iters;
flops = MdagMsiteflops*4*FrbGrid->gSites()*iters;
flops+= CGsiteflops*FrbGrid->gSites()*iters;
std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
std::cout << " SinglePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
}
std::cout << GridLogMessage << "::::::::::::: Starting regular CG" << std::endl;
ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
for(int i=0;i<100;i++){
for(int i=0;i<1;i++){
result_o_2 = Zero();
t1=usecond();
CG(HermOpEO,src_o,result_o_2);
t2=usecond();
iters = CG.IterationsToComplete;
flops = 1320.0*2*FGrid->gSites()*iters;
flops = MdagMsiteflops*4*FrbGrid->gSites()*iters;
flops+= CGsiteflops*FrbGrid->gSites()*iters;
std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
std::cout << " DoublePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
}
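
// A quick standalone check of the per-site flop constants used above, with Nc = 3 colours and
// Ns = 4 spins: the CG linear-algebra term (8+4+8+4+4) = 28 flops per complex number gives
// 28*3*4 = 336 flops per site, and each iteration is also charged four MdagM applications of
// 1452 flops per site.
#include <cstdio>
int main(void) {
  const int Nc = 3, Ns = 4;
  double MdagMsiteflops = 1452;                           // Mobius, real coefficients
  double CGsiteflops    = (8+4+8+4+4) * Nc * Ns;          // = 336
  std::printf("CG site flops %.0f, per-iteration site flops %.0f\n",
              CGsiteflops, 4*MdagMsiteflops + CGsiteflops);
  return 0;
}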