mirror of https://github.com/paboyle/Grid.git synced 2025-06-23 18:22:02 +01:00

Compare commits


30 Commits

Author SHA1 Message Date
95b640cb6b 10TF/s on 32^3 x 64 on single node 2022-08-04 15:43:52 -04:00
2cb5bedc15 Copy stream HIP improvements 2022-08-04 15:24:03 -04:00
806b02bddf Simplify dead code 2022-08-04 15:23:13 -04:00
de40395773 More timing. Think I should start to use nvtx and rocmtx ?? 2022-08-04 13:37:16 -04:00
7ba4788715 Fix 2022-08-04 13:36:44 -04:00
06d9ce1a02 Synch ranks on node here for GPU - GPU memcopy 2022-08-04 13:35:56 -04:00
75bb6b2b40 Move barrier into the StencilSend begin routine 2022-08-04 13:35:26 -04:00
74f10c2dc0 Move barrier into Stencil Send 2022-08-04 13:34:11 -04:00
a93d5459d4 Better mpi request completion 2022-07-28 12:18:35 -04:00
9c21add0c6 High res timer replaces getttimeofday 2022-07-28 12:14:03 -04:00
639aab6563 High res timer instead of gettimeofday 2022-07-28 12:13:35 -04:00
8137cc7049 Allways concurrent comms 2022-07-28 12:01:51 -04:00
60e63dca1d Add memory logging channel 2022-07-28 11:39:15 -04:00
486409574e Expanded cach to avoid any allocs in HMC 2022-07-28 11:38:34 -04:00
a913b8be12 Dslash self timing. Might want to not have this 2022-07-28 11:37:55 -04:00
2239751850 Better logging 2022-07-28 11:37:36 -04:00
9b20f1449c Better timing 2022-07-28 11:37:12 -04:00
b99453083d Updated timing 2022-07-28 11:37:02 -04:00
943fbb914d Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-07-11 13:48:42 -04:00
ca4603580d Verbose 2022-07-11 13:48:35 -04:00
f73db8f1f3 Synch clocks 2022-07-11 13:47:39 -04:00
f7217d12d2 World barrier for clock synch 2022-07-11 13:45:31 -04:00
fab50c57d9 More loggin 2022-07-11 18:42:27 +01:00
3440534fbf MixedPrec support 2022-07-10 21:35:18 +01:00
177b1a7ec6 Mixed prec 2022-07-10 21:34:10 +01:00
58182fe345 Different approach to default dirichlet params 2022-07-10 21:32:58 +01:00
1f907d330d Different default params for dirichlet 2022-07-10 21:31:48 +01:00
b0fe664e9d Better force log info 2022-07-10 21:31:25 +01:00
c0f8482402 Remove SSC marks 2022-07-07 17:49:36 +01:00
3544965f54 Stream doesn't work 2022-07-07 17:49:20 +01:00
34 changed files with 729 additions and 3380 deletions

View File

@ -117,6 +117,7 @@ public:
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
RealD usecs = -usecond();
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
@ -166,14 +167,16 @@ public:
// Stopping condition
if (cp <= rsq) {
usecs +=usecond();
SolverTimer.Stop();
Linop.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
GridBase *grid = src.Grid();
RealD DwfFlops = (1452. )*grid->gSites()*4*k
+ (8+4+8+4+4)*12*grid->gSites()*k; // CG linear algebra
RealD srcnorm = std::sqrt(norm2(src));
RealD resnorm = std::sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k
<< "\tComputed residual " << std::sqrt(cp / ssq)
<< "\tTrue residual " << true_residual
@ -187,6 +190,8 @@ public:
std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
IterationsToComplete = k;
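
The hunk above adds a plain wall-clock accumulator (usecs) next to the existing SolverTimer and, at convergence, reports a Mobius flop rate from it. A minimal standalone sketch of that arithmetic, with the per-iteration flop counts copied from the hunk; the helper name and the explicit Gflop/s conversion are mine, not part of the patch.

#include <cstdint>

// Flop-rate bookkeeping as in the ConjugateGradient hunk above (sketch only).
// Per iteration: 1452*4 Dslash flops per global site plus (8+4+8+4+4)*12
// flops per site of CG linear algebra.
double mobiusGflops(int64_t gSites, int iterations, double usecs) {
  double flops = 1452.0 * gSites * 4.0 * iterations
               + (8 + 4 + 8 + 4 + 4) * 12.0 * gSites * iterations;
  return flops / usecs * 1.0e-3;   // flop/us -> Gflop/s
}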

View File

@ -146,21 +146,14 @@ public:
LinearOperatorBase<FineField> &_Linop;
RealD _coarse_relax_tol;
std::vector<FineField> &_subspace;
int _largestEvalIdxForReport; //The convergence of the LCL is based on the evals of the coarse grid operator, not those of the underlying fine grid operator
//As a result we do not know what the eval range of the fine operator is until the very end, making tuning the Cheby bounds very difficult
//To work around this issue, every restart we separately reconstruct the fine operator eval for the lowest and highest evec and print these
//out alongside the evals of the coarse operator. To do so we need to know the index of the largest eval (i.e. Nstop-1)
//NOTE: If largestEvalIdxForReport=-1 (default) then this is not performed
ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField> &Poly,
OperatorFunction<FineField> &smoother,
LinearOperatorBase<FineField> &Linop,
std::vector<FineField> &subspace,
RealD coarse_relax_tol=5.0e3,
int largestEvalIdxForReport=-1)
RealD coarse_relax_tol=5.0e3)
: _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace),
_coarse_relax_tol(coarse_relax_tol), _largestEvalIdxForReport(largestEvalIdxForReport)
_coarse_relax_tol(coarse_relax_tol)
{ };
//evalMaxApprox: approximation of largest eval of the fine Chebyshev operator (suitably wrapped by block projection)
@ -186,12 +179,6 @@ public:
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<std::endl;
if(_largestEvalIdxForReport != -1 && (j==0 || j==_largestEvalIdxForReport)){
std::cout<<GridLogIRL << "Estimating true eval of fine grid operator for eval idx " << j << std::endl;
RealD tmp_eval;
ReconstructEval(j,eresid,B,tmp_eval,1.0); //don't use evalMaxApprox of coarse operator! (cf below)
}
int conv=0;
if( (vv<eresid*eresid) ) conv = 1;
return conv;
@ -422,7 +409,7 @@ public:
//////////////////////////////////////////////////////////////////////////////////////////////////
Chebyshev<FineField> ChebySmooth(cheby_smooth); //lower order Chebyshev of fine operator on fine grid used to smooth regenerated eigenvectors
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax,Nstop-1);
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
evals_coarse.resize(Nm);
evec_coarse.resize(Nm,_CoarseGrid);

View File

@ -40,7 +40,7 @@ void MemoryManager::PrintBytes(void)
//////////////////////////////////////////////////////////////////////
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
int MemoryManager::Victim[MemoryManager::NallocType];
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 };
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 8, 16, 8, 16 };
uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
//////////////////////////////////////////////////////////////////////
// Actual allocation and deallocation utils

View File

@ -3,8 +3,14 @@
#warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid);
//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
#define dprintf(...)
#define MAXLINE 512
static char print_buffer [ MAXLINE ];
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
//#define dprintf(...) printf (__VA_ARGS__ ); fflush(stdout);
#define dprintf(...)
////////////////////////////////////////////////////////////
@ -104,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
///////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
mprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
@ -126,7 +132,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
///////////////////////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
mprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
if(AccCache.state==AccDirty) {
@ -150,7 +156,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
assert(AccCache.AccPtr!=(uint64_t)NULL);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
mprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
DeviceToHostBytes+=AccCache.bytes;
DeviceToHostXfer++;
AccCache.state=Consistent;
@ -165,7 +171,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes;
}
dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
mprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
HostToDeviceBytes+=AccCache.bytes;
HostToDeviceXfer++;
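
The mprintf macro introduced above snprintf-formats into a static buffer and streams it to the new GridLogMemory channel, replacing the compiled-out dprintf at the tagged call sites. A hedged sketch of the same pattern, wrapped in do{...}while(0) so it also behaves as a single statement after an unbraced if; the patch itself uses the bare two-statement form.

#include <cstdio>     // snprintf
#include <iostream>   // std::cout; GridLogMemory comes from Grid's logging headers

#define MAXLINE 512
static char print_buffer[MAXLINE];
#define mprintf(...)                                  \
  do {                                                \
    snprintf(print_buffer, MAXLINE, __VA_ARGS__);     \
    std::cout << GridLogMemory << print_buffer;       \
  } while (0)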

View File

@ -107,6 +107,7 @@ public:
////////////////////////////////////////////////////////////////////////////////
static int RankWorld(void) ;
static void BroadcastWorld(int root,void* data, int bytes);
static void BarrierWorld(void);
////////////////////////////////////////////////////////////
// Reduction

View File

@ -396,17 +396,17 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
}
}
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
this->StencilSendToRecvFromComplete(list,dir);
list.resize(0);
}
/* if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
* this->StencilSendToRecvFromComplete(list,dir);
* list.resize(0);
* }
*/
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
// std::cout << "Copy Synchronised\n"<<std::endl;
acceleratorCopySynchronise();
StencilBarrier();// Synch shared memory on a single nodes
int nreq=list.size();
@ -443,6 +443,10 @@ int CartesianCommunicator::RankWorld(void){
MPI_Comm_rank(communicator_world,&r);
return r;
}
void CartesianCommunicator::BarrierWorld(void){
int ierr = MPI_Barrier(communicator_world);
assert(ierr==0);
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
int ierr= MPI_Bcast(data,
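
BarrierWorld() wraps MPI_Barrier on the world communicator. Its intended use in this change set is the start-up clock synchronisation shown in the Grid_init hunk near the end of this diff; a sketch of that call sequence:

CartesianCommunicator::Init(argc, argv);
GridLogger::GlobalStopWatch.Stop();
CartesianCommunicator::BarrierWorld();   // all ranks meet on the world communicator
GridLogger::GlobalStopWatch.Reset();     // restart from zero with clocks aligned
GridLogger::GlobalStopWatch.Start();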

View File

@ -104,6 +104,7 @@ int CartesianCommunicator::RankWorld(void){return 0;}
void CartesianCommunicator::Barrier(void){}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
void CartesianCommunicator::BarrierWorld(void) { }
int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) { return 0;}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){ coor = _processor_coor; }
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)

View File

@ -68,6 +68,7 @@ GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
GridLogger GridLogMemory (1, "Memory", GridLogColours, "NORMAL");
GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE");
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
GridLogger GridLogDslash (1, "Dslash", GridLogColours, "BLUE");
GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE");
GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE");
@ -80,6 +81,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogIterative.Active(0);
GridLogDebug.Active(0);
GridLogPerformance.Active(0);
GridLogDslash.Active(0);
GridLogIntegrator.Active(1);
GridLogColours.Active(0);
GridLogHMC.Active(1);
@ -91,6 +93,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
if (logstreams[i] == std::string("Dslash")) GridLogDslash.Active(1);
if (logstreams[i] == std::string("NoIntegrator")) GridLogIntegrator.Active(0);
if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0);
if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);
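
The new Dslash channel is off by default and is enabled when "Dslash" appears in the stream list passed to GridLogConfigure. A short usage sketch; the message line is taken from the WilsonFermion5D hunk later in this diff, and Block is a member of that class rather than something defined here.

std::vector<std::string> streams = {"Dslash"};   // plus any other channels to enable
GridLogConfigure(streams);                       // switches GridLogDslash.Active(1)
std::cout << GridLogDslash << " Dirichlet BCs 5d " << Block << std::endl;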

View File

@ -138,7 +138,8 @@ public:
stream << std::setw(log.topWidth);
}
stream << log.topName << log.background()<< " : ";
stream << log.colour() << std::left;
// stream << log.colour() << std::left;
stream << std::left;
if (log.chanWidth > 0)
{
stream << std::setw(log.chanWidth);
@ -153,9 +154,9 @@ public:
stream << log.evidence()
<< now << log.background() << " : " ;
}
stream << log.colour();
// stream << log.colour();
stream << std::right;
stream.flags(f);
return stream;
} else {
return devnull;
@ -180,6 +181,7 @@ extern GridLogger GridLogWarning;
extern GridLogger GridLogMessage;
extern GridLogger GridLogDebug ;
extern GridLogger GridLogPerformance;
extern GridLogger GridLogDslash;
extern GridLogger GridLogIterative ;
extern GridLogger GridLogIntegrator ;
extern GridLogger GridLogHMC;

View File

@ -27,10 +27,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/perfmon/PerfCount.h>
#include <Grid/perfmon/Timer.h>
#include <Grid/perfmon/PerfCount.h>
NAMESPACE_BEGIN(Grid);
GridTimePoint theProgramStart = GridClock::now();
#define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
#define RawConfig(A,B) (A<<8|B)
const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = {

View File

@ -35,17 +35,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid)
// Dress the output; use std::chrono
// C++11 time facilities better?
inline double usecond(void) {
struct timeval tv;
tv.tv_sec = 0;
tv.tv_usec = 0;
gettimeofday(&tv,NULL);
return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
}
typedef std::chrono::system_clock GridClock;
//typedef std::chrono::system_clock GridClock;
typedef std::chrono::high_resolution_clock GridClock;
typedef std::chrono::time_point<GridClock> GridTimePoint;
typedef std::chrono::seconds GridSecs;
@ -53,6 +44,15 @@ typedef std::chrono::milliseconds GridMillisecs;
typedef std::chrono::microseconds GridUsecs;
typedef std::chrono::microseconds GridTime;
extern GridTimePoint theProgramStart;
// Dress the output; use std::chrono
// C++11 time facilities better?
inline double usecond(void) {
auto usecs = std::chrono::duration_cast<GridUsecs>(GridClock::now()-theProgramStart);
return 1.0*usecs.count();
}
inline std::ostream& operator<< (std::ostream & stream, const GridSecs & time)
{
stream << time.count()<<" s";
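
The hunk above replaces the gettimeofday-based usecond() with one driven by std::chrono::high_resolution_clock, measured from a program-start time point so that all timestamps share an epoch. A self-contained sketch of the same pattern outside the Grid headers:

#include <chrono>

using GridClock     = std::chrono::high_resolution_clock;
using GridTimePoint = std::chrono::time_point<GridClock>;
using GridUsecs     = std::chrono::microseconds;

static const GridTimePoint theProgramStart = GridClock::now();

inline double usecond(void) {
  auto usecs = std::chrono::duration_cast<GridUsecs>(GridClock::now() - theProgramStart);
  return 1.0 * usecs.count();   // microseconds since program start
}

Callers keep the existing idiom: seed a timer with -usecond() on entry and add +usecond() on exit.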

View File

@ -42,6 +42,8 @@ public:
bool is_smeared = false;
RealD deriv_norm_sum;
RealD deriv_max_sum;
RealD Fdt_norm_sum;
RealD Fdt_max_sum;
int deriv_num;
RealD deriv_us;
RealD S_us;
@ -51,12 +53,17 @@ public:
deriv_num=0;
deriv_norm_sum = deriv_max_sum=0.0;
}
void deriv_log(RealD nrm, RealD max) { deriv_max_sum+=max; deriv_norm_sum+=nrm; deriv_num++;}
RealD deriv_max_average(void) { return deriv_max_sum/deriv_num; };
RealD deriv_norm_average(void) { return deriv_norm_sum/deriv_num; };
void deriv_log(RealD nrm, RealD max,RealD Fdt_nrm,RealD Fdt_max) {
deriv_max_sum+=max; deriv_norm_sum+=nrm;
Fdt_max_sum+=Fdt_max; Fdt_norm_sum+=Fdt_nrm; deriv_num++;
}
RealD deriv_max_average(void) { return deriv_max_sum/deriv_num; };
RealD deriv_norm_average(void) { return deriv_norm_sum/deriv_num; };
RealD Fdt_max_average(void) { return Fdt_max_sum/deriv_num; };
RealD Fdt_norm_average(void) { return Fdt_norm_sum/deriv_num; };
RealD deriv_timer(void) { return deriv_us; };
RealD S_timer(void) { return deriv_us; };
RealD refresh_timer(void) { return deriv_us; };
RealD S_timer(void) { return S_us; };
RealD refresh_timer(void) { return refresh_us; };
void deriv_timer_start(void) { deriv_us-=usecond(); }
void deriv_timer_stop(void) { deriv_us+=usecond(); }
void refresh_timer_start(void) { refresh_us-=usecond(); }

View File

@ -39,7 +39,7 @@ struct GparityWilsonImplParams {
Coordinate twists;
//mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
Coordinate dirichlet; // Blocksize of dirichlet BCs
GparityWilsonImplParams() : twists(Nd, 0), dirichlet(Nd, 0) {};
GparityWilsonImplParams() : twists(Nd, 0) { dirichlet.resize(0); };
};
struct WilsonImplParams {
@ -48,13 +48,13 @@ struct WilsonImplParams {
AcceleratorVector<Real,Nd> twist_n_2pi_L;
AcceleratorVector<Complex,Nd> boundary_phases;
WilsonImplParams() {
dirichlet.resize(Nd,0);
dirichlet.resize(0);
boundary_phases.resize(Nd, 1.0);
twist_n_2pi_L.resize(Nd, 0.0);
};
WilsonImplParams(const AcceleratorVector<Complex,Nd> phi) : boundary_phases(phi), overlapCommsCompute(false) {
twist_n_2pi_L.resize(Nd, 0.0);
dirichlet.resize(Nd,0);
dirichlet.resize(0);
}
};
@ -62,7 +62,7 @@ struct StaggeredImplParams {
Coordinate dirichlet; // Blocksize of dirichlet BCs
StaggeredImplParams()
{
dirichlet.resize(Nd,0);
dirichlet.resize(0);
};
};
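
Across these parameter structs the default dirichlet vector changes from Nd zeros to an empty vector, so "no Dirichlet blocking" is now signalled by size zero. The stencil constructor hunk later in this diff expands it back where it is consumed:

if ( p.dirichlet.size() == 0 ) p.dirichlet.resize(grid->Nd(), 0);
DirichletBlock(p.dirichlet);   // comms send/recv set up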

View File

@ -400,7 +400,6 @@ public:
}
this->face_table_computed=1;
assert(this->u_comm_offset==this->_unified_buffer_size);
accelerator_barrier();
}
};

View File

@ -233,10 +233,10 @@ void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
GaugeField HUmu(_Umu.Grid());
HUmu = _Umu*(-0.5);
if ( Dirichlet ) {
std::cout << GridLogMessage << " Dirichlet BCs 5d " <<Block<<std::endl;
std::cout << GridLogDslash << " Dirichlet BCs 5d " <<Block<<std::endl;
Coordinate GaugeBlock(Nd);
for(int d=0;d<Nd;d++) GaugeBlock[d] = Block[d+1];
std::cout << GridLogMessage << " Dirichlet BCs 4d " <<GaugeBlock<<std::endl;
std::cout << GridLogDslash << " Dirichlet BCs 4d " <<GaugeBlock<<std::endl;
DirichletFilter<GaugeField> Filter(GaugeBlock);
Filter.applyFilter(HUmu);
}
@ -382,12 +382,14 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
DhopTotalTime-=usecond();
// std::cout << GridLogDslash<<"Dhop internal"<<std::endl;
DhopTotalTime=-usecond();
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
else
DhopInternalSerialComms(st,lo,U,in,out,dag);
DhopTotalTime+=usecond();
// std::cout << GridLogDslash<<"Dhop took"<<DhopTotalTime<<std::endl;
}
@ -404,53 +406,59 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
/////////////////////////////
// Start comms // Gather intranode and extra node differentiated??
/////////////////////////////
DhopFaceTime-=usecond();
DhopFaceTime=-usecond();
st.HaloExchangeOptGather(in,compressor);
DhopFaceTime+=usecond();
// std::cout << GridLogDslash<< " Dhop Gather end "<< DhopFaceTime<<" us " <<std::endl;
DhopCommTime -=usecond();
DhopCommTime =-usecond();
std::vector<std::vector<CommsRequest_t> > requests;
st.CommunicateBegin(requests);
/////////////////////////////
// Overlap with comms
/////////////////////////////
DhopFaceTime-=usecond();
DhopFaceTime=-usecond();
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
DhopFaceTime+=usecond();
// std::cout << GridLogDslash<< " Dhop Commsmerge end "<<DhopFaceTime<< " us "<<std::endl;
/////////////////////////////
// do the compute interior
/////////////////////////////
int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
DhopComputeTime-=usecond();
DhopComputeTime=-usecond();
if (dag == DaggerYes) {
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
} else {
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
}
DhopComputeTime+=usecond();
// std::cout << GridLogDslash<< " Dhop Compute 1 end "<< DhopComputeTime<<" us" <<std::endl;
/////////////////////////////
// Complete comms
/////////////////////////////
st.CommunicateComplete(requests);
DhopCommTime +=usecond();
// std::cout << GridLogDslash<< " Dhop Comunicate end "<< DhopCommTime << " us" <<std::endl;
/////////////////////////////
// do the compute exterior
/////////////////////////////
DhopFaceTime-=usecond();
DhopFaceTime=-usecond();
st.CommsMerge(compressor);
DhopFaceTime+=usecond();
// std::cout << GridLogDslash<< " Dhop CommsMerge2 end "<<DhopFaceTime << " us "<<std::endl;
DhopComputeTime2-=usecond();
DhopComputeTime2=-usecond();
if (dag == DaggerYes) {
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
} else {
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
}
DhopComputeTime2+=usecond();
// std::cout << GridLogDslash<< " Dhop Ext end "<<DhopComputeTime2 <<"us "<<std::endl;
}
@ -463,12 +471,14 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
Compressor compressor(dag);
int LLs = in.Grid()->_rdimensions[0];
DhopCommTime-=usecond();
// std::cout << GridLogDslash<< " Dhop Halo exchange begine " <<std::endl;
DhopCommTime=-usecond();
st.HaloExchangeOpt(in,compressor);
DhopCommTime+=usecond();
// std::cout << GridLogDslash<< " Dhop Comms end "<<DhopCommTime<<" us"<<std::endl;
DhopComputeTime-=usecond();
DhopComputeTime=-usecond();
int Opt = WilsonKernelsStatic::Opt;
if (dag == DaggerYes) {
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
@ -476,6 +486,7 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
}
DhopComputeTime+=usecond();
// std::cout << GridLogDslash<< " Dhop Compute end "<<DhopComputeTime<<" us" <<std::endl;
}
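
Note the changed sign convention on the Dhop timers in this hunk: "-=usecond()" becomes "=-usecond()", which appears to switch them from accumulating across calls to timing each call from zero. A minimal illustration of the two idioms:

// Accumulating across calls (previous form):
DhopTotalTime -= usecond();    // on entry
/* ... Dhop body ... */
DhopTotalTime += usecond();    // on exit: running total over all calls

// Per-call measurement (form used in this hunk):
DhopTotalTime  = -usecond();   // reset on entry
/* ... Dhop body ... */
DhopTotalTime += usecond();    // on exit: duration of this call only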

View File

@ -416,19 +416,6 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
#undef LoopBody
}
#define KERNEL_CALL_TMP(A) \
const uint64_t NN = Nsite*Ls; \
auto U_p = & U_v[0]; \
auto in_p = & in_v[0]; \
auto out_p = & out_v[0]; \
auto st_p = st_v._entries_p; \
auto st_perm = st_v._permute_type; \
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
int sF = ss; \
int sU = ss/Ls; \
WilsonKernels<Impl>::A(st_perm,st_p,U_p,buf,sF,sU,in_p,out_p); \
}); \
accelerator_barrier();
#define KERNEL_CALLNB(A) \
const uint64_t NN = Nsite*Ls; \
@ -448,8 +435,7 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
int sF = ptr[ss]; \
int sU = ss/Ls; \
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
}); \
accelerator_barrier();
});
#define ASM_CALL(A) \
thread_for( ss, Nsite, { \
@ -471,7 +457,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if( interior && exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
#ifdef SYCL_HACK
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_TMP(HandDhopSiteSycl); return; }
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteSycl); return; }
#else
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
#endif

View File

@ -67,6 +67,36 @@ NAMESPACE_BEGIN(Grid);
virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}
};
template<class Impl,class ImplF>
class OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction : public GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF> {
public:
typedef OneFlavourRationalParams Params;
private:
static RationalActionParams transcribe(const Params &in){
RationalActionParams out;
out.inv_pow = 2;
out.lo = in.lo;
out.hi = in.hi;
out.MaxIter = in.MaxIter;
out.action_tolerance = out.md_tolerance = in.tolerance;
out.action_degree = out.md_degree = in.degree;
out.precision = in.precision;
out.BoundsCheckFreq = in.BoundsCheckFreq;
return out;
}
public:
OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction(FermionOperator<Impl> &_NumOp,
FermionOperator<Impl> &_DenOp,
FermionOperator<ImplF> &_NumOpF,
FermionOperator<ImplF> &_DenOpF,
const Params & p, Integer ReliableUpdateFreq
) :
GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF>(_NumOp, _DenOp,_NumOpF, _DenOpF, transcribe(p),ReliableUpdateFreq){}
virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}
};
NAMESPACE_END(Grid);
#endif

View File

@ -153,7 +153,7 @@ protected:
Real force_max = std::sqrt(maxLocalNorm2(force));
Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR;
as[level].actions.at(a)->deriv_log(force_abs,force_max);
as[level].actions.at(a)->deriv_log(force_abs,force_max,impulse_abs,impulse_max);
std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Force average: " << force_abs <<" "<<name<<std::endl;
std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Force max : " << force_max <<" "<<name<<std::endl;
@ -285,6 +285,8 @@ public:
<<"["<<level<<"]["<< actionID<<"] : "
<<" force max " << as[level].actions.at(actionID)->deriv_max_average()
<<" norm " << as[level].actions.at(actionID)->deriv_norm_average()
<<" Fdt max " << as[level].actions.at(actionID)->Fdt_max_average()
<<" norm " << as[level].actions.at(actionID)->Fdt_norm_average()
<<" calls " << as[level].actions.at(actionID)->deriv_num
<< std::endl;
}
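
With the extended deriv_log above, each action now records both the raw force and the per-step impulse ("Fdt"). A sketch of the logged quantities using the names from this hunk; the definition of impulse_abs is not shown in the diff and is assumed analogous to impulse_max:

Real force_max   = std::sqrt(maxLocalNorm2(force));
Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR;   // Fdt = force x step size
Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;   // assumed, not shown in the hunk
as[level].actions.at(a)->deriv_log(force_abs, force_max, impulse_abs, impulse_max);
// deriv_max_average()/Fdt_max_average() then report per-trajectory averages, as printed above.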

View File

@ -99,7 +99,7 @@ public:
// using wilson flow by default here
WilsonFlow<PeriodicGimplR> WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval);
WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau);
Real T0 = WF.energyDensityPlaquette(Pars.Smearing.maxTau, Usmear);
Real T0 = WF.energyDensityPlaquette(Usmear);
std::cout << GridLogMessage << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
<< "T0 : [ " << traj << " ] "<< T0 << std::endl;
}
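
This module now uses the reverted WilsonFlow interface: fixed built-in measurements, with energyDensityPlaquette reading the internally tracked flow time after adaptive smearing. A sketch of the call sequence, using the names from this hunk:

WilsonFlow<PeriodicGimplR> WF(Pars.Smearing.steps, Pars.Smearing.step_size,
                              Pars.Smearing.meas_interval);
WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau);   // no longer const: updates taus/epsilon
Real T0 = WF.energyDensityPlaquette(Usmear);          // t^2 <E(t)> at the final flow time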

View File

@ -7,7 +7,6 @@ Source file: ./lib/qcd/modules/plaquette.h
Copyright (C) 2017
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Christopher Kelly <ckelly@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -34,44 +33,28 @@ NAMESPACE_BEGIN(Grid);
template <class Gimpl>
class WilsonFlow: public Smear<Gimpl>{
public:
//Store generic measurements to take during smearing process using std::function
typedef std::function<void(int, RealD, const typename Gimpl::GaugeField &)> FunctionType; //int: step, RealD: flow time, GaugeField : the gauge field
private:
unsigned int Nstep;
RealD epsilon; //for regular smearing this is the time step, for adaptive it is the initial time step
std::vector< std::pair<int, FunctionType> > functions; //The int maps to the measurement frequency
unsigned int measure_interval;
mutable RealD epsilon, taus;
mutable WilsonGaugeAction<Gimpl> SG;
//Evolve the gauge field by 1 step and update tau
void evolve_step(typename Gimpl::GaugeField &U, RealD &tau) const;
//Evolve the gauge field by 1 step and update tau and the current time step eps
void evolve_step_adaptive(typename Gimpl::GaugeField&U, RealD &tau, RealD &eps, RealD maxTau) const;
void evolve_step(typename Gimpl::GaugeField&) const;
void evolve_step_adaptive(typename Gimpl::GaugeField&, RealD);
RealD tau(unsigned int t)const {return epsilon*(t+1.0); }
public:
INHERIT_GIMPL_TYPES(Gimpl)
void resetActions(){ functions.clear(); }
void addMeasurement(int meas_interval, FunctionType meas){ functions.push_back({meas_interval, meas}); }
//Set the class to perform the default measurements:
//the plaquette energy density every step
//the plaquette topological charge every 'topq_meas_interval' steps
//and output to stdout
void setDefaultMeasurements(int topq_meas_interval = 1);
explicit WilsonFlow(unsigned int Nstep, RealD epsilon, unsigned int interval = 1):
Nstep(Nstep),
epsilon(epsilon),
measure_interval(interval),
SG(WilsonGaugeAction<Gimpl>(3.0)) {
// WilsonGaugeAction with beta 3.0
assert(epsilon > 0.0);
LogMessage();
setDefaultMeasurements(interval);
}
void LogMessage() {
@ -90,29 +73,9 @@ public:
// undefined for WilsonFlow
}
void smear_adaptive(GaugeField&, const GaugeField&, RealD maxTau) const;
//Compute t^2 <E(t)> for time t from the plaquette
static RealD energyDensityPlaquette(const RealD t, const GaugeField& U);
//Compute t^2 <E(t)> for time t from the 1x1 cloverleaf form
//t is the Wilson flow time
static RealD energyDensityCloverleaf(const RealD t, const GaugeField& U);
//Evolve the gauge field by Nstep steps of epsilon and return the energy density computed every interval steps
//The smeared field is output as V
std::vector<RealD> flowMeasureEnergyDensityPlaquette(GaugeField &V, const GaugeField& U, int measure_interval = 1);
//Version that does not return the smeared field
std::vector<RealD> flowMeasureEnergyDensityPlaquette(const GaugeField& U, int measure_interval = 1);
//Evolve the gauge field by Nstep steps of epsilon and return the Cloverleaf energy density computed every interval steps
//The smeared field is output as V
std::vector<RealD> flowMeasureEnergyDensityCloverleaf(GaugeField &V, const GaugeField& U, int measure_interval = 1);
//Version that does not return the smeared field
std::vector<RealD> flowMeasureEnergyDensityCloverleaf(const GaugeField& U, int measure_interval = 1);
void smear_adaptive(GaugeField&, const GaugeField&, RealD maxTau);
RealD energyDensityPlaquette(unsigned int step, const GaugeField& U) const;
RealD energyDensityPlaquette(const GaugeField& U) const;
};
@ -120,7 +83,7 @@ public:
// Implementations
////////////////////////////////////////////////////////////////////////////////
template <class Gimpl>
void WilsonFlow<Gimpl>::evolve_step(typename Gimpl::GaugeField &U, RealD &tau) const{
void WilsonFlow<Gimpl>::evolve_step(typename Gimpl::GaugeField &U) const{
GaugeField Z(U.Grid());
GaugeField tmp(U.Grid());
SG.deriv(U, Z);
@ -136,13 +99,12 @@ void WilsonFlow<Gimpl>::evolve_step(typename Gimpl::GaugeField &U, RealD &tau) c
SG.deriv(U, tmp); Z += tmp; // 4/3*(17/36*Z0 -8/9*Z1) +Z2
Z *= 3.0/4.0; // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2
Gimpl::update_field(Z, U, -2.0*epsilon); // V(t+e) = exp(ep*Z)*W2
tau += epsilon;
}
template <class Gimpl>
void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, RealD &tau, RealD &eps, RealD maxTau) const{
if (maxTau - tau < eps){
eps = maxTau-tau;
void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, RealD maxTau) {
if (maxTau - taus < epsilon){
epsilon = maxTau-taus;
}
//std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
GaugeField Z(U.Grid());
@ -152,151 +114,95 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
SG.deriv(U, Z);
Zprime = -Z;
Z *= 0.25; // Z0 = 1/4 * F(U)
Gimpl::update_field(Z, U, -2.0*eps); // U = W1 = exp(ep*Z0)*W0
Gimpl::update_field(Z, U, -2.0*epsilon); // U = W1 = exp(ep*Z0)*W0
Z *= -17.0/8.0;
SG.deriv(U, tmp); Z += tmp; // -17/32*Z0 +Z1
Zprime += 2.0*tmp;
Z *= 8.0/9.0; // Z = -17/36*Z0 +8/9*Z1
Gimpl::update_field(Z, U, -2.0*eps); // U_= W2 = exp(ep*Z)*W1
Gimpl::update_field(Z, U, -2.0*epsilon); // U_= W2 = exp(ep*Z)*W1
Z *= -4.0/3.0;
SG.deriv(U, tmp); Z += tmp; // 4/3*(17/36*Z0 -8/9*Z1) +Z2
Z *= 3.0/4.0; // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2
Gimpl::update_field(Z, U, -2.0*eps); // V(t+e) = exp(ep*Z)*W2
Gimpl::update_field(Z, U, -2.0*epsilon); // V(t+e) = exp(ep*Z)*W2
// Ramos
Gimpl::update_field(Zprime, Uprime, -2.0*eps); // V'(t+e) = exp(ep*Z')*W0
Gimpl::update_field(Zprime, Uprime, -2.0*epsilon); // V'(t+e) = exp(ep*Z')*W0
// Compute distance as norm^2 of the difference
GaugeField diffU = U - Uprime;
RealD diff = norm2(diffU);
// adjust integration step
tau += eps;
taus += epsilon;
//std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
eps = eps*0.95*std::pow(1e-4/diff,1./3.);
epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.);
//std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
}
template <class Gimpl>
RealD WilsonFlow<Gimpl>::energyDensityPlaquette(const RealD t, const GaugeField& U){
static WilsonGaugeAction<Gimpl> SG(3.0);
return 2.0 * t * t * SG.S(U)/U.Grid()->gSites();
}
//Compute t^2 <E(t)> for time from the 1x1 cloverleaf form
template <class Gimpl>
RealD WilsonFlow<Gimpl>::energyDensityCloverleaf(const RealD t, const GaugeField& U){
typedef typename Gimpl::GaugeLinkField GaugeMat;
typedef typename Gimpl::GaugeField GaugeLorentz;
assert(Nd == 4);
//E = 1/2 tr( F_munu F_munu )
//However as F_numu = -F_munu, only need to sum the trace of the squares of the following 6 field strengths:
//F_01 F_02 F_03 F_12 F_13 F_23
GaugeMat F(U.Grid());
LatticeComplexD R(U.Grid());
R = Zero();
for(int mu=0;mu<3;mu++){
for(int nu=mu+1;nu<4;nu++){
WilsonLoops<Gimpl>::FieldStrength(F, U, mu, nu);
R = R + trace(F*F);
}
}
ComplexD out = sum(R);
out = t*t*out / RealD(U.Grid()->gSites());
return -real(out); //minus sign necessary for +ve energy
}
template <class Gimpl>
std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityPlaquette(GaugeField &V, const GaugeField& U, int measure_interval){
std::vector<RealD> out;
resetActions();
addMeasurement(measure_interval, [&out](int step, RealD t, const typename Gimpl::GaugeField &U){
std::cout << GridLogMessage << "[WilsonFlow] Computing plaquette energy density for step " << step << std::endl;
out.push_back( energyDensityPlaquette(t,U) );
});
smear(V,U);
return out;
RealD WilsonFlow<Gimpl>::energyDensityPlaquette(unsigned int step, const GaugeField& U) const {
RealD td = tau(step);
return 2.0 * td * td * SG.S(U)/U.Grid()->gSites();
}
template <class Gimpl>
std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityPlaquette(const GaugeField& U, int measure_interval){
GaugeField V(U);
return flowMeasureEnergyDensityPlaquette(V,U, measure_interval);
RealD WilsonFlow<Gimpl>::energyDensityPlaquette(const GaugeField& U) const {
return 2.0 * taus * taus * SG.S(U)/U.Grid()->gSites();
}
template <class Gimpl>
std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityCloverleaf(GaugeField &V, const GaugeField& U, int measure_interval){
std::vector<RealD> out;
resetActions();
addMeasurement(measure_interval, [&out](int step, RealD t, const typename Gimpl::GaugeField &U){
std::cout << GridLogMessage << "[WilsonFlow] Computing Cloverleaf energy density for step " << step << std::endl;
out.push_back( energyDensityCloverleaf(t,U) );
});
smear(V,U);
return out;
}
template <class Gimpl>
std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityCloverleaf(const GaugeField& U, int measure_interval){
GaugeField V(U);
return flowMeasureEnergyDensityCloverleaf(V,U, measure_interval);
}
//#define WF_TIMING
template <class Gimpl>
void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const{
void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const {
out = in;
RealD taus = 0.;
for (unsigned int step = 1; step <= Nstep; step++) { //step indicates the number of smearing steps applied at the time of measurement
for (unsigned int step = 1; step <= Nstep; step++) {
auto start = std::chrono::high_resolution_clock::now();
evolve_step(out, taus);
evolve_step(out);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
#ifdef WF_TIMING
std::cout << "Time to evolve " << diff.count() << " s\n";
#endif
//Perform measurements
for(auto const &meas : functions)
if( step % meas.first == 0 ) meas.second(step,taus,out);
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
<< step << " " << tau(step) << " "
<< energyDensityPlaquette(step,out) << std::endl;
if( step % measure_interval == 0){
std::cout << GridLogMessage << "[WilsonFlow] Top. charge : "
<< step << " "
<< WilsonLoops<PeriodicGimplR>::TopologicalCharge(out) << std::endl;
}
}
}
template <class Gimpl>
void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, RealD maxTau) const{
void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, RealD maxTau){
out = in;
RealD taus = 0.;
RealD eps = epsilon;
taus = epsilon;
unsigned int step = 0;
do{
step++;
//std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
evolve_step_adaptive(out, taus, eps, maxTau);
//Perform measurements
for(auto const &meas : functions)
if( step % meas.first == 0 ) meas.second(step,taus,out);
evolve_step_adaptive(out, maxTau);
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
<< step << " " << taus << " "
<< energyDensityPlaquette(out) << std::endl;
if( step % measure_interval == 0){
std::cout << GridLogMessage << "[WilsonFlow] Top. charge : "
<< step << " "
<< WilsonLoops<PeriodicGimplR>::TopologicalCharge(out) << std::endl;
}
} while (taus < maxTau);
}
template <class Gimpl>
void WilsonFlow<Gimpl>::setDefaultMeasurements(int topq_meas_interval){
addMeasurement(1, [](int step, RealD t, const typename Gimpl::GaugeField &U){
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " << step << " " << t << " " << energyDensityPlaquette(t,U) << std::endl;
});
addMeasurement(topq_meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
std::cout << GridLogMessage << "[WilsonFlow] Top. charge : " << step << " " << WilsonLoops<Gimpl>::TopologicalCharge(U) << std::endl;
});
}
}
NAMESPACE_END(Grid);

View File

@ -290,6 +290,8 @@ public:
std::vector<Decompress> DecompressionsSHM;
std::vector<CopyReceiveBuffer> CopyReceiveBuffers ;
std::vector<CachedTransfer> CachedTransfers;
std::vector<CommsRequest_t> MpiReqs;
///////////////////////////////////////////////////////////
// Unified Comms buffers for all directions
///////////////////////////////////////////////////////////
@ -357,9 +359,9 @@ public:
////////////////////////////////////////////////////////////////////////
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{
reqs.resize(Packets.size());
accelerator_barrier();
for(int i=0;i<Packets.size();i++){
_grid->StencilSendToRecvFromBegin(reqs[i],
_grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send,
Packets[i].recv_buf,
@ -370,41 +372,19 @@ public:
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
{
for(int i=0;i<Packets.size();i++){
_grid->StencilSendToRecvFromComplete(reqs[i],i);
}
_grid->StencilSendToRecvFromComplete(MpiReqs,0);
}
////////////////////////////////////////////////////////////////////////
// Blocking send and receive. Either sequential or parallel.
////////////////////////////////////////////////////////////////////////
void Communicate(void)
{
if ( CartesianCommunicator::CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySequential ){
/////////////////////////////////////////////////////////
// several way threaded on different communicators.
// Cannot combine with Dirichlet operators
// This scheme is needed on Intel Omnipath for best performance
// Deprecate once there are very few omnipath clusters
/////////////////////////////////////////////////////////
int nthreads = CartesianCommunicator::nCommThreads;
int old = GridThread::GetThreads();
GridThread::SetThreads(nthreads);
thread_for(i,Packets.size(),{
_grid->StencilSendToRecvFrom(Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send,
Packets[i].recv_buf,
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].bytes,i);
});
GridThread::SetThreads(old);
} else {
/////////////////////////////////////////////////////////
// Concurrent and non-threaded asynch calls to MPI
/////////////////////////////////////////////////////////
std::vector<std::vector<CommsRequest_t> > reqs;
this->CommunicateBegin(reqs);
this->CommunicateComplete(reqs);
}
/////////////////////////////////////////////////////////
// Concurrent and non-threaded asynch calls to MPI
/////////////////////////////////////////////////////////
std::vector<std::vector<CommsRequest_t> > reqs;
this->CommunicateBegin(reqs);
this->CommunicateComplete(reqs);
}
template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
@ -484,7 +464,6 @@ public:
face_table_computed=1;
assert(u_comm_offset==_unified_buffer_size);
accelerator_barrier();
}
/////////////////////////
@ -499,6 +478,7 @@ public:
Packets.resize(0);
CopyReceiveBuffers.resize(0);
CachedTransfers.resize(0);
MpiReqs.resize(0);
}
void AddCopy(void *from,void * to, Integer bytes)
{
@ -711,7 +691,9 @@ public:
this->_comms_recv.resize(npoints);
this->same_node.resize(npoints);
if ( p.dirichlet.size() ) DirichletBlock(p.dirichlet); // comms send/recv set up
if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
DirichletBlock(p.dirichlet); // comms send/recv set up
_unified_buffer_size=0;
surface_list.resize(0);
@ -793,7 +775,6 @@ public:
u_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
u_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
}
PrecomputeByteOffsets();
}
@ -1105,7 +1086,6 @@ public:
// Gather locally
////////////////////////////////////////////////////////
assert(send_buf!=NULL);
Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,comm_off,so);
}
@ -1212,8 +1192,9 @@ public:
face_table[face_idx].size()*sizeof(face_table_host[0]));
}
if ( comms_send || comms_recv )
if ( comms_send || comms_recv ) {
Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type);
}
face_idx++;
//spointers[0] -- low

View File

@ -208,46 +208,5 @@ void merge(vobj &vec,const ExtractPointerArray<sobj> &extracted, int offset)
}
//////////////////////////////////////////////////////////////////////////////////
//Copy a single lane of a SIMD tensor type from one object to another
//Output object must be of the same tensor type but may be of a different precision (i.e. it can have a different root data type)
///////////////////////////////////////////////////////////////////////////////////
template<class vobjOut, class vobjIn>
accelerator_inline
void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __restrict__ vecIn, int lane_in)
{
static_assert( std::is_same<typename vobjOut::DoublePrecision, typename vobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
typedef typename vobjOut::vector_type ovector_type;
typedef typename vobjIn::vector_type ivector_type;
constexpr int owords=sizeof(vobjOut)/sizeof(ovector_type);
constexpr int iwords=sizeof(vobjIn)/sizeof(ivector_type);
static_assert( owords == iwords, "copyLane: Expected number of vector words in input and output objects to be equal" );
typedef typename vobjOut::scalar_type oscalar_type;
typedef typename vobjIn::scalar_type iscalar_type;
typedef typename ExtractTypeMap<oscalar_type>::extract_type oextract_type;
typedef typename ExtractTypeMap<iscalar_type>::extract_type iextract_type;
typedef oextract_type * opointer;
typedef iextract_type * ipointer;
constexpr int oNsimd=ovector_type::Nsimd();
constexpr int iNsimd=ivector_type::Nsimd();
iscalar_type itmp;
oscalar_type otmp;
opointer __restrict__ op = (opointer)&vecOut;
ipointer __restrict__ ip = (ipointer)&vecIn;
for(int w=0;w<owords;w++){
memcpy( (char*)&itmp, (char*)(ip + lane_in + iNsimd*w), sizeof(iscalar_type) );
otmp = itmp; //potential precision change
memcpy( (char*)(op + lane_out + oNsimd*w), (char*)&otmp, sizeof(oscalar_type) );
}
}
NAMESPACE_END(Grid);

View File

@ -1,6 +1,7 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
int world_rank; // Use to control world rank for print guarding
int acceleratorAbortOnGpuError=1;
uint32_t accelerator_threads=2;
uint32_t acceleratorThreads(void) {return accelerator_threads;};
@ -16,7 +17,7 @@ void acceleratorThreads(uint32_t t) {accelerator_threads = t;};
#ifdef GRID_CUDA
cudaDeviceProp *gpu_props;
cudaStream_t copyStream;
cudaStream_t cpuStream;
cudaStream_t computeStream;
void acceleratorInit(void)
{
int nDevices = 1;
@ -24,7 +25,8 @@ void acceleratorInit(void)
gpu_props = new cudaDeviceProp[nDevices];
char * localRankStr = NULL;
int rank = 0, world_rank=0;
int rank = 0;
world_rank=0;
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_SLURM )) != NULL) { world_rank = atoi(localRankStr);}
@ -99,7 +101,7 @@ void acceleratorInit(void)
cudaSetDevice(device);
cudaStreamCreate(&copyStream);
cudaStreamCreate(&cpuStream);
cudaStreamCreate(&computeStream);
const int len=64;
char busid[len];
if( rank == world_rank ) {
@ -114,7 +116,7 @@ void acceleratorInit(void)
#ifdef GRID_HIP
hipDeviceProp_t *gpu_props;
hipStream_t copyStream;
hipStream_t cpuStream;
hipStream_t computeStream;
void acceleratorInit(void)
{
int nDevices = 1;
@ -122,7 +124,8 @@ void acceleratorInit(void)
gpu_props = new hipDeviceProp_t[nDevices];
char * localRankStr = NULL;
int rank = 0, world_rank=0;
int rank = 0;
world_rank=0;
// We extract the local rank initialization using an environment variable
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
{
@ -183,7 +186,7 @@ void acceleratorInit(void)
#endif
hipSetDevice(device);
hipStreamCreate(&copyStream);
hipStreamCreate(&cpuStream);
hipStreamCreate(&computeStream);
const int len=64;
char busid[len];
if( rank == world_rank ) {
@ -210,7 +213,8 @@ void acceleratorInit(void)
#endif
char * localRankStr = NULL;
int rank = 0, world_rank=0;
int rank = 0;
world_rank=0;
// We extract the local rank initialization using an environment variable
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)

View File

@ -107,7 +107,7 @@ void acceleratorInit(void);
extern int acceleratorAbortOnGpuError;
extern cudaStream_t copyStream;
extern cudaStream_t cpuStream;
extern cudaStream_t computeStream;
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
@ -135,7 +135,7 @@ inline void cuda_mem(void)
}; \
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
LambdaApply<<<cu_blocks,cu_threads,0,cpuStream>>>(num1,num2,nsimd,lambda); \
LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
}
#define accelerator_for6dNB(iter1, num1, \
@ -154,7 +154,7 @@ inline void cuda_mem(void)
}; \
dim3 cu_blocks (num1,num2,num3); \
dim3 cu_threads(num4,num5,num6); \
Lambda6Apply<<<cu_blocks,cu_threads,0,cpuStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
Lambda6Apply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
}
template<typename lambda> __global__
@ -190,7 +190,7 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
#define accelerator_barrier(dummy) \
{ \
cudaStreamSynchronize(cpuStream); \
cudaStreamSynchronize(computeStream); \
cudaError err = cudaGetLastError(); \
if ( cudaSuccess != err ) { \
printf("accelerator_barrier(): Cuda error %s \n", \
@ -340,7 +340,7 @@ NAMESPACE_BEGIN(Grid);
#define accelerator_inline __host__ __device__ inline
extern hipStream_t copyStream;
extern hipStream_t cpuStream;
extern hipStream_t computeStream;
/*These routines define mapping from thread grid to loop & vector lane indexing */
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
@ -362,16 +362,15 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \
hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads, \
0,cpuStream, \
0,computeStream, \
num1,num2,nsimd, lambda); \
} else { \
hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \
0,cpuStream, \
0,computeStream, \
num1,num2,nsimd, lambda); \
} \
}
template<typename lambda> __global__
__launch_bounds__(64,1)
void LambdaApply64(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
@ -400,7 +399,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
#define accelerator_barrier(dummy) \
{ \
hipStreamSynchronize(cpuStream); \
hipStreamSynchronize(computeStream); \
auto err = hipGetLastError(); \
if ( err != hipSuccess ) { \
printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
@ -443,7 +442,7 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);
hipMemcpyDtoDAsync(to,from,bytes, copyStream);
}
inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); };
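
These hunks move kernel launches onto a dedicated computeStream while device-to-device copies stay on copyStream and are completed explicitly. A sketch of the resulting two-stream pattern; from, to and bytes are placeholders:

acceleratorCopyDeviceToDeviceAsynch(from, to, bytes);  // enqueued on copyStream
/* ... overlap independent work here ... */
acceleratorCopySynchronise();                          // hipStreamSynchronize(copyStream)
accelerator_barrier();                                 // synchronises kernels on computeStream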

View File

@ -356,6 +356,11 @@ void Grid_init(int *argc,char ***argv)
//////////////////////////////////////////////////////////
CartesianCommunicator::Init(argc,argv);
GridLogger::GlobalStopWatch.Stop();
CartesianCommunicator::BarrierWorld();
GridLogger::GlobalStopWatch.Reset();// Back to zero with synchronised clock
GridLogger::GlobalStopWatch.Start();
////////////////////////////////////
// Banner after MPI (unless GPU)
////////////////////////////////////

View File

@ -1,918 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./HMC/Mobius2p1fIDSDRGparityEOFA.cc
Copyright (C) 2015-2016
Author: Christopher Kelly <ckelly@bnl.gov>
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace Grid;
//Production binary for the 40ID G-parity ensemble
struct RatQuoParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters,
double, bnd_lo,
double, bnd_hi,
Integer, action_degree,
double, action_tolerance,
Integer, md_degree,
double, md_tolerance,
Integer, reliable_update_freq,
Integer, bnd_check_freq);
RatQuoParameters() {
bnd_lo = 1e-2;
bnd_hi = 30;
action_degree = 10;
action_tolerance = 1e-10;
md_degree = 10;
md_tolerance = 1e-8;
bnd_check_freq = 20;
reliable_update_freq = 50;
}
void Export(RationalActionParams &into) const{
into.lo = bnd_lo;
into.hi = bnd_hi;
into.action_degree = action_degree;
into.action_tolerance = action_tolerance;
into.md_degree = md_degree;
into.md_tolerance = md_tolerance;
into.BoundsCheckFreq = bnd_check_freq;
}
};
struct EOFAparameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(EOFAparameters,
OneFlavourRationalParams, rat_params,
double, action_tolerance,
double, action_mixcg_inner_tolerance,
double, md_tolerance,
double, md_mixcg_inner_tolerance);
EOFAparameters() {
action_mixcg_inner_tolerance = 1e-8;
action_tolerance = 1e-10;
md_tolerance = 1e-8;
md_mixcg_inner_tolerance = 1e-8;
rat_params.lo = 1.0;
rat_params.hi = 25.0;
rat_params.MaxIter = 50000;
rat_params.tolerance= 1.0e-9;
rat_params.degree = 14;
rat_params.precision= 50;
}
};
struct EvolParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters,
Integer, StartTrajectory,
Integer, Trajectories,
Integer, SaveInterval,
Integer, Steps,
RealD, TrajectoryLength,
bool, MetropolisTest,
std::string, StartingType,
std::vector<Integer>, GparityDirs,
std::vector<EOFAparameters>, eofa_l,
RatQuoParameters, rat_quo_s,
RatQuoParameters, rat_quo_DSDR);
EvolParameters() {
//For initial thermalization; afterwards user should switch Metropolis on and use StartingType=CheckpointStart
MetropolisTest = false;
StartTrajectory = 0;
Trajectories = 50;
SaveInterval = 5;
StartingType = "ColdStart";
GparityDirs.resize(3, 1); //1 for G-parity, 0 for periodic
Steps = 5;
TrajectoryLength = 1.0;
}
};
bool fileExists(const std::string &fn){
std::ifstream f(fn);
return f.good();
}
struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
double, alpha,
double, beta,
double, mu,
int, ord,
int, n_stop,
int, n_want,
int, n_use,
double, tolerance);
LanczosParameters() {
alpha = 35;
beta = 5;
mu = 0;
ord = 100;
n_stop = 10;
n_want = 10;
n_use = 15;
tolerance = 1e-6;
}
};
template<typename FermionActionD, typename FermionFieldD>
void computeEigenvalues(std::string param_file,
GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt, //expect lattice to have been initialized to something
FermionActionD &action, GridParallelRNG &rng){
LanczosParameters params;
if(fileExists(param_file)){
std::cout << GridLogMessage << " Reading " << param_file << std::endl;
Grid::XmlReader rd(param_file);
read(rd, "LanczosParameters", params);
}else if(!GlobalSharedMemory::WorldRank){
std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
Grid::XmlWriter wr(param_file + ".templ");
write(wr, "LanczosParameters", params);
}
FermionFieldD gauss_o(rbGrid);
FermionFieldD gauss(Grid);
gaussian(rng, gauss);
pickCheckerboard(Odd, gauss_o, gauss);
action.ImportGauge(latt);
SchurDiagMooeeOperator<FermionActionD, FermionFieldD> hermop(action);
PlainHermOp<FermionFieldD> hermop_wrap(hermop);
//ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
assert(params.mu == 0.0);
Chebyshev<FermionFieldD> Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1);
FunctionHermOp<FermionFieldD> Cheb_wrap(Cheb, hermop);
std::cout << "IRL: alpha=" << params.alpha << " beta=" << params.beta << " mu=" << params.mu << " ord=" << params.ord << std::endl;
ImplicitlyRestartedLanczos<FermionFieldD> IRL(Cheb_wrap, hermop_wrap, params.n_stop, params.n_want, params.n_use, params.tolerance, 50000);
std::vector<RealD> eval(params.n_use);
std::vector<FermionFieldD> evec(params.n_use, rbGrid);
int Nconv;
IRL.calc(eval, evec, gauss_o, Nconv);
std::cout << "Eigenvalues:" << std::endl;
for(int i=0;i<params.n_want;i++){
std::cout << i << " " << eval[i] << std::endl;
}
}
//Check the quality of the RHMC approx
//action_or_md toggles checking the action (0), MD (1) or both (2) setups
template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt, //expect lattice to have been initialized to something
FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
int inv_pow, const std::string &quark_descr, int action_or_md){
assert(action_or_md == 0 || action_or_md == 1 || action_or_md == 2);
FermionFieldD gauss_o(rbGrid);
FermionFieldD gauss(Grid);
gaussian(rng, gauss);
pickCheckerboard(Odd, gauss_o, gauss);
numOp.ImportGauge(latt);
denOp.ImportGauge(latt);
typedef typename FermionActionD::Impl_t FermionImplPolicyD;
SchurDifferentiableOperator<FermionImplPolicyD> MdagM(numOp);
SchurDifferentiableOperator<FermionImplPolicyD> VdagV(denOp);
PowerMethod<FermionFieldD> power_method;
RealD lambda_max;
std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " numerator" << std::endl;
lambda_max = power_method(MdagM,gauss_o);
std::cout << GridLogMessage << "Got lambda_max "<<lambda_max<<std::endl;
std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " denominator" << std::endl;
lambda_max = power_method(VdagV,gauss_o);
std::cout << GridLogMessage << "Got lambda_max "<<lambda_max<<std::endl;
if(action_or_md == 0 || action_or_md == 2){
std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
InversePowerBoundsCheck(inv_pow, 50000, 1e16, MdagM,gauss_o, rhmc.ApproxNegPowerAction); //use large tolerance to prevent exit on fail; we are trying to tune here!
std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
InversePowerBoundsCheck(2*inv_pow, 50000, 1e16, MdagM,gauss_o, rhmc.ApproxNegHalfPowerAction);
std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
InversePowerBoundsCheck(inv_pow, 50000, 1e16, VdagV,gauss_o, rhmc.ApproxNegPowerAction);
std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
InversePowerBoundsCheck(2*inv_pow, 50000, 1e16, VdagV,gauss_o, rhmc.ApproxNegHalfPowerAction);
std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
}
std::cout << "-------------------------------------------------------------------------------" << std::endl;
if(action_or_md == 1 || action_or_md == 2){
std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
InversePowerBoundsCheck(inv_pow, 50000, 1e16, MdagM,gauss_o, rhmc.ApproxNegPowerMD);
std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
InversePowerBoundsCheck(2*inv_pow, 50000, 1e16, MdagM,gauss_o, rhmc.ApproxNegHalfPowerMD);
std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
InversePowerBoundsCheck(inv_pow, 50000, 1e16, VdagV,gauss_o, rhmc.ApproxNegPowerMD);
std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
InversePowerBoundsCheck(2*inv_pow, 50000, 1e16, VdagV,gauss_o, rhmc.ApproxNegHalfPowerMD);
std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
}
}
template<typename FermionImplPolicy>
void checkEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
GridCartesian* FGrid, GridParallelRNG &rng, const LatticeGaugeFieldD &latt){
std::cout << GridLogMessage << "Starting EOFA action/bounds check" << std::endl;
typename FermionImplPolicy::FermionField eta(FGrid);
RealD scale = std::sqrt(0.5);
gaussian(rng,eta); eta = eta * scale;
//Use the inbuilt check
EOFA.refresh(latt, eta);
EOFA.S(latt);
std::cout << GridLogMessage << "Finished EOFA upper action/bounds check" << std::endl;
}
template<typename FermionImplPolicy>
class EOFAlinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA;
LatticeGaugeFieldD &U;
public:
EOFAlinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA, LatticeGaugeFieldD &U): EOFA(EOFA), U(U){}
typedef typename FermionImplPolicy::FermionField Field;
void OpDiag (const Field &in, Field &out){ assert(0); }
void OpDir (const Field &in, Field &out,int dir,int disp){ assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); }
void Op (const Field &in, Field &out){ assert(0); }
void AdjOp (const Field &in, Field &out){ assert(0); }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
void HermOp(const Field &in, Field &out){ EOFA.Meofa(U, in, out); }
};
template<typename FermionImplPolicy>
void upperBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
std::cout << GridLogMessage << "Starting EOFA upper bound compute" << std::endl;
EOFAlinop<FermionImplPolicy> linop(EOFA, latt);
typename FermionImplPolicy::FermionField eta(FGrid);
gaussian(rng,eta);
PowerMethod<typename FermionImplPolicy::FermionField> power_method;
auto lambda_max = power_method(linop,eta);
std::cout << GridLogMessage << "Upper bound of EOFA operator " << lambda_max << std::endl;
}
//Applications of M^{-1} cost the same as M for EOFA!
template<typename FermionImplPolicy>
class EOFAinvLinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA;
LatticeGaugeFieldD &U;
public:
EOFAinvLinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA, LatticeGaugeFieldD &U): EOFA(EOFA), U(U){}
typedef typename FermionImplPolicy::FermionField Field;
void OpDiag (const Field &in, Field &out){ assert(0); }
void OpDir (const Field &in, Field &out,int dir,int disp){ assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); }
void Op (const Field &in, Field &out){ assert(0); }
void AdjOp (const Field &in, Field &out){ assert(0); }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
void HermOp(const Field &in, Field &out){ EOFA.MeofaInv(U, in, out); }
};
template<typename FermionImplPolicy>
void lowerBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
std::cout << GridLogMessage << "Starting EOFA lower bound compute using power method on M^{-1}. Inverse of highest eigenvalue is the lowest eigenvalue of M" << std::endl;
EOFAinvLinop<FermionImplPolicy> linop(EOFA, latt);
typename FermionImplPolicy::FermionField eta(FGrid);
gaussian(rng,eta);
PowerMethod<typename FermionImplPolicy::FermionField> power_method;
auto lambda_max = power_method(linop,eta);
std::cout << GridLogMessage << "Lower bound of EOFA operator " << 1./lambda_max << std::endl;
}
NAMESPACE_BEGIN(Grid);
template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class SchurOperatorF>
class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
public:
typedef typename FermionOperatorD::FermionField FieldD;
typedef typename FermionOperatorF::FermionField FieldF;
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;
Integer MaxOuterIterations;
GridBase* SinglePrecGrid4; //Grid for single-precision fields
GridBase* SinglePrecGrid5; //Grid for single-precision fields
RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
FermionOperatorF &FermOpF;
FermionOperatorD &FermOpD;
SchurOperatorF &LinOpF;
SchurOperatorD &LinOpD;
Integer TotalInnerIterations; //Number of inner CG iterations
Integer TotalOuterIterations; //Number of restarts
Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
MixedPrecisionConjugateGradientOperatorFunction(RealD tol,
Integer maxinnerit,
Integer maxouterit,
GridBase* _sp_grid4,
GridBase* _sp_grid5,
FermionOperatorF &_FermOpF,
FermionOperatorD &_FermOpD,
SchurOperatorF &_LinOpF,
SchurOperatorD &_LinOpD):
LinOpF(_LinOpF),
LinOpD(_LinOpD),
FermOpF(_FermOpF),
FermOpD(_FermOpD),
Tolerance(tol),
InnerTolerance(tol),
MaxInnerIterations(maxinnerit),
MaxOuterIterations(maxouterit),
SinglePrecGrid4(_sp_grid4),
SinglePrecGrid5(_sp_grid5),
OuterLoopNormMult(100.)
{
};
void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
precisionChange(FermOpF.Umu, FermOpD.Umu);
pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
////////////////////////////////////////////////////////////////////////////////////
// Make a mixed precision conjugate gradient
////////////////////////////////////////////////////////////////////////////////////
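// Defect-correction style: single-precision inner solves are restarted against a double-precision
// residual until the outer Tolerance is met, followed by a double-precision patch-up solve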
MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
MPCG.InnerTolerance = InnerTolerance;
std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
MPCG(src,psi);
}
};
template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class SchurOperatorF>
class MixedPrecisionReliableUpdateConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
public:
typedef typename FermionOperatorD::FermionField FieldD;
typedef typename FermionOperatorF::FermionField FieldF;
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
Integer MaxIterations;
RealD Delta; //reliable update parameter
GridBase* SinglePrecGrid4; //Grid for single-precision fields
GridBase* SinglePrecGrid5; //Grid for single-precision fields
FermionOperatorF &FermOpF;
FermionOperatorD &FermOpD;
SchurOperatorF &LinOpF;
SchurOperatorD &LinOpD;
MixedPrecisionReliableUpdateConjugateGradientOperatorFunction(RealD tol,
RealD delta,
Integer maxit,
GridBase* _sp_grid4,
GridBase* _sp_grid5,
FermionOperatorF &_FermOpF,
FermionOperatorD &_FermOpD,
SchurOperatorF &_LinOpF,
SchurOperatorD &_LinOpD):
LinOpF(_LinOpF),
LinOpD(_LinOpD),
FermOpF(_FermOpF),
FermOpD(_FermOpD),
Tolerance(tol),
Delta(delta),
MaxIterations(maxit),
SinglePrecGrid4(_sp_grid4),
SinglePrecGrid5(_sp_grid5)
{
};
void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
std::cout << GridLogMessage << " Mixed precision reliable CG update wrapper operator() "<<std::endl;
SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
precisionChange(FermOpF.Umu, FermOpD.Umu);
pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
////////////////////////////////////////////////////////////////////////////////////
// Make a mixed precision conjugate gradient
////////////////////////////////////////////////////////////////////////////////////
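// Reliable updates: iterate in single precision and refresh the residual in double precision
// whenever it has fallen by a factor Delta, limiting rounding-error accumulation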
ConjugateGradientReliableUpdate<FieldD,FieldF> MPCG(Tolerance,MaxIterations,Delta,SinglePrecGrid5,LinOpF,LinOpD);
std::cout << GridLogMessage << "Calling mixed precision reliable update Conjugate Gradient" <<std::endl;
MPCG(src,psi);
}
};
NAMESPACE_END(Grid);
int main(int argc, char **argv) {
Grid_init(&argc, &argv);
int threads = GridThread::GetThreads();
// here make a routine to print all the relevant information on the run
std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
std::string param_file = "params.xml";
bool file_load_check = false;
std::string serial_seeds = "1 2 3 4 5";
std::string parallel_seeds = "6 7 8 9 10";
int i=1;
while(i < argc){
std::string sarg(argv[i]);
if(sarg == "--param_file"){
assert(i!=argc-1);
param_file = argv[i+1];
i+=2;
}else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
file_load_check = true;
i++;
}else if(sarg == "--set_seeds"){ //set the rng seeds. Expects two vector args, e.g. --set_seeds 1.2.3.4 5.6.7.8
assert(i < argc-2);
std::vector<int> tmp;
GridCmdOptionIntVector(argv[i+1],tmp);
{
std::stringstream ss;
for(int j=0;j<tmp.size()-1;j++) ss << tmp[j] << " ";
ss << tmp.back();
serial_seeds = ss.str();
}
GridCmdOptionIntVector(argv[i+2],tmp);
{
std::stringstream ss;
for(int j=0;j<tmp.size()-1;j++) ss << tmp[j] << " ";
ss << tmp.back();
parallel_seeds = ss.str();
}
i+=3;
std::cout << GridLogMessage << "Set serial seeds to " << serial_seeds << std::endl;
std::cout << GridLogMessage << "Set parallel seeds to " << parallel_seeds << std::endl;
}else{
i++;
}
}
//Read the user parameters
EvolParameters user_params;
if(fileExists(param_file)){
std::cout << GridLogMessage << " Reading " << param_file << std::endl;
Grid::XmlReader rd(param_file);
read(rd, "Params", user_params);
}else if(!GlobalSharedMemory::WorldRank){
std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
{
Grid::XmlWriter wr(param_file + ".templ");
write(wr, "Params", user_params);
}
std::cout << GridLogMessage << " Done" << std::endl;
Grid_finalize();
return 0;
}
//Check the parameters
if(user_params.GparityDirs.size() != Nd-1){
std::cerr << "Error in input parameters: expect GparityDirs to have size = " << Nd-1 << std::endl;
exit(1);
}
for(int i=0;i<Nd-1;i++)
if(user_params.GparityDirs[i] != 0 && user_params.GparityDirs[i] != 1){
std::cerr << "Error in input parameters: expect GparityDirs values to be 0 (periodic) or 1 (G-parity)" << std::endl;
exit(1);
}
typedef GparityMobiusEOFAFermionD EOFAactionD;
typedef GparityMobiusFermionD FermionActionD;
typedef typename FermionActionD::Impl_t FermionImplPolicyD;
typedef typename FermionActionD::FermionField FermionFieldD;
typedef GparityMobiusEOFAFermionF EOFAactionF;
typedef GparityMobiusFermionF FermionActionF;
typedef typename FermionActionF::Impl_t FermionImplPolicyF;
typedef typename FermionActionF::FermionField FermionFieldF;
typedef GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD,FermionImplPolicyF> MixedPrecRHMC;
typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicyD> DoublePrecRHMC;
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
IntegratorParameters MD;
typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
MD.name = std::string("MinimumNorm2");
// typedef ConjugateHMCRunnerD<ForceGradient> HMCWrapper;
// MD.name = std::string("ForceGradient");
MD.MDsteps = user_params.Steps;
MD.trajL = user_params.TrajectoryLength;
typedef HMCWrapper::ImplPolicy GaugeImplPolicy;
HMCparameters HMCparams;
HMCparams.StartTrajectory = user_params.StartTrajectory;
HMCparams.Trajectories = user_params.Trajectories;
HMCparams.NoMetropolisUntil= 0;
HMCparams.StartingType = user_params.StartingType;
HMCparams.MetropolisTest = user_params.MetropolisTest;
HMCparams.MD = MD;
HMCWrapper TheHMC(HMCparams);
// Grid from the command line arguments --grid and --mpi
TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
CheckpointerParameters CPparams;
CPparams.config_prefix = "ckpoint_lat";
CPparams.rng_prefix = "ckpoint_rng";
CPparams.saveInterval = user_params.SaveInterval;
CPparams.format = "IEEE64BIG";
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
//Note that checkpointing saves the RNG state so that this initialization is required only for the very first configuration
RNGModuleParameters RNGpar;
RNGpar.serial_seeds = serial_seeds;
RNGpar.parallel_seeds = parallel_seeds;
TheHMC.Resources.SetRNGSeeds(RNGpar);
typedef PlaquetteMod<GaugeImplPolicy> PlaqObs;
TheHMC.Resources.AddObservable<PlaqObs>();
//////////////////////////////////////////////
//aiming for ainv=1.723 GeV
// (estimate columns below: me, Bob)
//Estimated a(ml+mres) [40ID] = 0.001305 0.00131
// a(mh+mres) [40ID] = 0.035910 0.03529
//Estimate Ls=12, b+c=2 mres~0.0011
//1/24/2022 initial mres measurement gives mres=0.001, adjusted light quark mass to 0.0003 from 0.0001
const int Ls = 12;
Real beta = 1.848;
Real light_mass = 0.0003;
Real strange_mass = 0.0342;
Real pv_mass = 1.0;
RealD M5 = 1.8;
RealD mobius_scale = 2.; //b+c
RealD mob_bmc = 1.0;
RealD mob_b = (mobius_scale + mob_bmc)/2.;
RealD mob_c = (mobius_scale - mob_bmc)/2.;
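//With b+c = mobius_scale = 2 and b-c = 1 this gives mob_b = 1.5, mob_c = 0.5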
std::cout << GridLogMessage
<< "Ensemble parameters:" << std::endl
<< "Ls=" << Ls << std::endl
<< "beta=" << beta << std::endl
<< "light_mass=" << light_mass << std::endl
<< "strange_mass=" << strange_mass << std::endl
<< "mobius_scale=" << mobius_scale << std::endl;
//Setup the Grids
auto UGridD = TheHMC.Resources.GetCartesian();
auto UrbGridD = TheHMC.Resources.GetRBCartesian();
auto FGridD = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);
auto FrbGridD = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);
GridCartesian* UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
GridRedBlackCartesian* UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
auto FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
auto FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
ConjugateIwasakiGaugeActionD GaugeAction(beta);
// temporarily need a gauge field
LatticeGaugeFieldD Ud(UGridD);
LatticeGaugeFieldF Uf(UGridF);
//Setup the BCs
FermionActionD::ImplParams Params;
for(int i=0;i<Nd-1;i++) Params.twists[i] = user_params.GparityDirs[i]; //G-parity directions
Params.twists[Nd-1] = 1; //APBC in time direction
std::vector<int> dirs4(Nd);
for(int i=0;i<Nd-1;i++) dirs4[i] = user_params.GparityDirs[i];
dirs4[Nd-1] = 0; //periodic gauge BC in time
GaugeImplPolicy::setDirections(dirs4); //gauge BC
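//Net BCs: fermions see G-parity in the flagged spatial directions and are antiperiodic in time;
//gauge links are charge-conjugated across the same spatial boundaries and periodic in time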
//Run optional gauge field checksum checker and exit
if(file_load_check){
TheHMC.initializeGaugeFieldAndRNGs(Ud);
std::cout << GridLogMessage << " Done" << std::endl;
Grid_finalize();
return 0;
}
////////////////////////////////////
// Collect actions
////////////////////////////////////
ActionLevel<HMCWrapper::Field> Level1(1); //light quark + strange quark
ActionLevel<HMCWrapper::Field> Level2(4); //DSDR
ActionLevel<HMCWrapper::Field> Level3(2); //gauge
/////////////////////////////////////////////////////////////
// Light EOFA action
// have to be careful with the parameters, cf. Test_dwf_gpforce_eofa.cc
/////////////////////////////////////////////////////////////
typedef SchurDiagMooeeOperator<EOFAactionD,FermionFieldD> EOFAschuropD;
typedef SchurDiagMooeeOperator<EOFAactionF,FermionFieldF> EOFAschuropF;
typedef ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction<FermionImplPolicyD, FermionImplPolicyF> EOFAmixPrecPFaction;
typedef MixedPrecisionConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_mxCG;
typedef MixedPrecisionReliableUpdateConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_relupCG;
std::vector<RealD> eofa_light_masses = { light_mass , 0.004, 0.016, 0.064, 0.256 };
std::vector<RealD> eofa_pv_masses = { 0.004 , 0.016, 0.064, 0.256, 1.0 };
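//Hasenbusch chain: factor i represents det D(m_i)/det D(m'_i) with m'_i = m_{i+1},
//so the product telescopes to det D(light_mass)/det D(pv_mass = 1.0)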
const int n_light_hsb = 5;
assert(user_params.eofa_l.size() == n_light_hsb);
EOFAmixPrecPFaction* EOFA_pfactions[n_light_hsb];
for(int i=0;i<n_light_hsb;i++){
RealD iml = eofa_light_masses[i];
RealD ipv = eofa_pv_masses[i];
EOFAactionD* LopD = new EOFAactionD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, iml, iml, ipv, 0.0, -1, M5, mob_b, mob_c, Params);
EOFAactionF* LopF = new EOFAactionF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, iml, iml, ipv, 0.0, -1, M5, mob_b, mob_c, Params);
EOFAactionD* RopD = new EOFAactionD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, ipv, iml, ipv, -1.0, 1, M5, mob_b, mob_c, Params);
EOFAactionF* RopF = new EOFAactionF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, ipv, iml, ipv, -1.0, 1, M5, mob_b, mob_c, Params);
EOFAschuropD* linopL_D = new EOFAschuropD(*LopD);
EOFAschuropD* linopR_D = new EOFAschuropD(*RopD);
EOFAschuropF* linopL_F = new EOFAschuropF(*LopF);
EOFAschuropF* linopR_F = new EOFAschuropF(*RopF);
#if 1
//Note: for now user_params.eofa_l[i].action_mixcg_inner_tolerance / md_mixcg_inner_tolerance are reused as the reliable-update Delta
EOFA_relupCG* ActionMCG_L = new EOFA_relupCG(user_params.eofa_l[i].action_tolerance, user_params.eofa_l[i].action_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
EOFA_relupCG* ActionMCG_R = new EOFA_relupCG(user_params.eofa_l[i].action_tolerance, user_params.eofa_l[i].action_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
EOFA_relupCG* DerivMCG_L = new EOFA_relupCG(user_params.eofa_l[i].md_tolerance, user_params.eofa_l[i].md_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
EOFA_relupCG* DerivMCG_R = new EOFA_relupCG(user_params.eofa_l[i].md_tolerance, user_params.eofa_l[i].md_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
#else
EOFA_mxCG* ActionMCG_L = new EOFA_mxCG(user_params.eofa_l[i].action_tolerance, 50000, 1000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
ActionMCG_L->InnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance;
EOFA_mxCG* ActionMCG_R = new EOFA_mxCG(user_params.eofa_l[i].action_tolerance, 50000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
ActionMCG_R->InnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance;
EOFA_mxCG* DerivMCG_L = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 50000, 1000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
DerivMCG_L->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance;
EOFA_mxCG* DerivMCG_R = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 50000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
DerivMCG_R->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance;
std::cout << GridLogMessage << "Set EOFA action solver action tolerance outer=" << ActionMCG_L->Tolerance << " inner=" << ActionMCG_L->InnerTolerance << std::endl;
std::cout << GridLogMessage << "Set EOFA MD solver tolerance outer=" << DerivMCG_L->Tolerance << " inner=" << DerivMCG_L->InnerTolerance << std::endl;
#endif
EOFAmixPrecPFaction* EOFA = new EOFAmixPrecPFaction(*LopF, *RopF,
*LopD, *RopD,
*ActionMCG_L, *ActionMCG_R,
*ActionMCG_L, *ActionMCG_R,
*DerivMCG_L, *DerivMCG_R,
user_params.eofa_l[i].rat_params, true);
EOFA_pfactions[i] = EOFA;
Level1.push_back(EOFA);
}
////////////////////////////////////
// Strange action
////////////////////////////////////
FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD,strange_mass,M5,mob_b,mob_c,Params);
FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD, pv_mass,M5,mob_b,mob_c,Params);
FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF,strange_mass,M5,mob_b,mob_c,Params);
FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF, pv_mass,M5,mob_b,mob_c,Params);
RationalActionParams rat_act_params_s;
rat_act_params_s.inv_pow = 4; // (M^dag M)^{1/4}
rat_act_params_s.precision= 60;
rat_act_params_s.MaxIter = 50000;
user_params.rat_quo_s.Export(rat_act_params_s);
std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_s.BoundsCheckFreq << " trajectories (avg)" << std::endl;
//MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq);
DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s);
Level1.push_back(&Quotient_s);
///////////////////////////////////
// DSDR action
///////////////////////////////////
RealD dsdr_mass=-1.8;
//Use same DSDR twists as https://arxiv.org/pdf/1208.4412.pdf
RealD dsdr_epsilon_f = 0.02; //numerator (in determinant)
RealD dsdr_epsilon_b = 0.5;
GparityWilsonTMFermionD Numerator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_f, Params);
GparityWilsonTMFermionF Numerator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_f, Params);
GparityWilsonTMFermionD Denominator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_b, Params);
GparityWilsonTMFermionF Denominator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_b, Params);
RationalActionParams rat_act_params_DSDR;
rat_act_params_DSDR.inv_pow = 2; // (M^dag M)^{1/2}
rat_act_params_DSDR.precision= 60;
rat_act_params_DSDR.MaxIter = 50000;
user_params.rat_quo_DSDR.Export(rat_act_params_DSDR);
std::cout << GridLogMessage << "DSDR quark bounds check every " << rat_act_params_DSDR.BoundsCheckFreq << " trajectories (avg)" << std::endl;
DoublePrecRHMC Quotient_DSDR(Denominator_DSDR_D, Numerator_DSDR_D, rat_act_params_DSDR);
Level2.push_back(&Quotient_DSDR);
/////////////////////////////////////////////////////////////
// Gauge action
/////////////////////////////////////////////////////////////
Level3.push_back(&GaugeAction);
TheHMC.TheAction.push_back(Level1);
TheHMC.TheAction.push_back(Level2);
TheHMC.TheAction.push_back(Level3);
std::cout << GridLogMessage << " Action complete "<< std::endl;
//Action tuning
bool
tune_rhmc_s=false, eigenrange_s=false,
tune_rhmc_DSDR=false, eigenrange_DSDR=false,
check_eofa=false,
upper_bound_eofa=false, lower_bound_eofa=false;
std::string lanc_params_s;
std::string lanc_params_DSDR;
int tune_rhmc_s_action_or_md;
int tune_rhmc_DSDR_action_or_md;
int eofa_which_hsb;
for(int i=1;i<argc;i++){
std::string sarg(argv[i]);
if(sarg == "--tune_rhmc_s"){
assert(i < argc-1);
tune_rhmc_s=true;
tune_rhmc_s_action_or_md = std::stoi(argv[i+1]);
}
else if(sarg == "--eigenrange_s"){
assert(i < argc-1);
eigenrange_s=true;
lanc_params_s = argv[i+1];
}
else if(sarg == "--tune_rhmc_DSDR"){
assert(i < argc-1);
tune_rhmc_DSDR=true;
tune_rhmc_DSDR_action_or_md = std::stoi(argv[i+1]);
}
else if(sarg == "--eigenrange_DSDR"){
assert(i < argc-1);
eigenrange_DSDR=true;
lanc_params_DSDR = argv[i+1];
}
else if(sarg == "--check_eofa"){
assert(i < argc-1);
check_eofa = true;
eofa_which_hsb = std::stoi(argv[i+1]); //-1 indicates all hasenbusch
assert(eofa_which_hsb == -1 || (eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb) );
}
else if(sarg == "--upper_bound_eofa"){
assert(i < argc-1);
upper_bound_eofa = true;
eofa_which_hsb = std::stoi(argv[i+1]);
assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
}
else if(sarg == "--lower_bound_eofa"){
assert(i < argc-1);
lower_bound_eofa = true;
eofa_which_hsb = std::stoi(argv[i+1]);
assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
}
}
if(tune_rhmc_s || eigenrange_s || tune_rhmc_DSDR || eigenrange_DSDR || check_eofa || upper_bound_eofa || lower_bound_eofa) {
std::cout << GridLogMessage << "Running checks" << std::endl;
TheHMC.initializeGaugeFieldAndRNGs(Ud);
//std::cout << GridLogMessage << "EOFA action solver action tolerance outer=" << ActionMCG_L.Tolerance << " inner=" << ActionMCG_L.InnerTolerance << std::endl;
//std::cout << GridLogMessage << "EOFA MD solver tolerance outer=" << DerivMCG_L.Tolerance << " inner=" << DerivMCG_L.InnerTolerance << std::endl;
if(check_eofa){
if(eofa_which_hsb >= 0){
std::cout << GridLogMessage << "Starting checking EOFA Hasenbusch " << eofa_which_hsb << std::endl;
checkEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
std::cout << GridLogMessage << "Finished checking EOFA Hasenbusch " << eofa_which_hsb << std::endl;
}else{
for(int i=0;i<n_light_hsb;i++){
std::cout << GridLogMessage << "Starting checking EOFA Hasenbusch " << i << std::endl;
checkEOFA(*EOFA_pfactions[i], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
std::cout << GridLogMessage << "Finished checking EOFA Hasenbusch " << i << std::endl;
}
}
}
if(upper_bound_eofa) upperBoundEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
if(lower_bound_eofa) lowerBoundEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
if(eigenrange_s) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG());
if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_s)>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange", tune_rhmc_s_action_or_md);
if(eigenrange_DSDR) computeEigenvalues<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField>(lanc_params_DSDR, UGridD, UrbGridD, Ud, Numerator_DSDR_D, TheHMC.Resources.GetParallelRNG());
if(tune_rhmc_DSDR) checkRHMC<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField, decltype(Quotient_DSDR)>(UGridD, UrbGridD, Ud, Numerator_DSDR_D, Denominator_DSDR_D, Quotient_DSDR, TheHMC.Resources.GetParallelRNG(), 2, "DSDR", tune_rhmc_DSDR_action_or_md);
std::cout << GridLogMessage << " Done" << std::endl;
Grid_finalize();
return 0;
}
//Run the HMC
std::cout << GridLogMessage << " Running the HMC "<< std::endl;
TheHMC.Run();
std::cout << GridLogMessage << " Done" << std::endl;
Grid_finalize();
return 0;
} // main

View File

@ -1,873 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./HMC/Mobius2p1fIDSDRGparityEOFA.cc
Copyright (C) 2015-2016
Author: Christopher Kelly <ckelly@bnl.gov>
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace Grid;
//Production binary for the 40ID G-parity ensemble
struct RatQuoParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters,
double, bnd_lo,
double, bnd_hi,
Integer, action_degree,
double, action_tolerance,
Integer, md_degree,
double, md_tolerance,
Integer, reliable_update_freq,
Integer, bnd_check_freq);
RatQuoParameters() {
bnd_lo = 1e-2;
bnd_hi = 30;
action_degree = 10;
action_tolerance = 1e-10;
md_degree = 10;
md_tolerance = 1e-8;
bnd_check_freq = 20;
reliable_update_freq = 50;
}
void Export(RationalActionParams &into) const{
into.lo = bnd_lo;
into.hi = bnd_hi;
into.action_degree = action_degree;
into.action_tolerance = action_tolerance;
into.md_degree = md_degree;
into.md_tolerance = md_tolerance;
into.BoundsCheckFreq = bnd_check_freq;
}
};
struct EOFAparameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(EOFAparameters,
OneFlavourRationalParams, rat_params,
double, action_tolerance,
double, action_mixcg_inner_tolerance,
double, md_tolerance,
double, md_mixcg_inner_tolerance);
EOFAparameters() {
action_mixcg_inner_tolerance = 1e-8;
action_tolerance = 1e-10;
md_tolerance = 1e-8;
md_mixcg_inner_tolerance = 1e-8;
rat_params.lo = 1.0;
rat_params.hi = 25.0;
rat_params.MaxIter = 10000;
rat_params.tolerance= 1.0e-9;
rat_params.degree = 14;
rat_params.precision= 50;
}
};
struct EvolParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters,
Integer, StartTrajectory,
Integer, Trajectories,
Integer, SaveInterval,
Integer, Steps,
RealD, TrajectoryLength,
bool, MetropolisTest,
std::string, StartingType,
std::vector<Integer>, GparityDirs,
std::vector<EOFAparameters>, eofa_l,
RatQuoParameters, rat_quo_s,
RatQuoParameters, rat_quo_DSDR);
EvolParameters() {
//For initial thermalization; afterwards user should switch Metropolis on and use StartingType=CheckpointStart
MetropolisTest = false;
StartTrajectory = 0;
Trajectories = 50;
SaveInterval = 5;
StartingType = "ColdStart";
GparityDirs.resize(3, 1); //1 for G-parity, 0 for periodic
Steps = 5;
TrajectoryLength = 1.0;
}
};
bool fileExists(const std::string &fn){
std::ifstream f(fn);
return f.good();
}
struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
double, alpha,
double, beta,
double, mu,
int, ord,
int, n_stop,
int, n_want,
int, n_use,
double, tolerance);
LanczosParameters() {
alpha = 35;
beta = 5;
mu = 0;
ord = 100;
n_stop = 10;
n_want = 10;
n_use = 15;
tolerance = 1e-6;
}
};
template<typename FermionActionD, typename FermionFieldD>
void computeEigenvalues(std::string param_file,
GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt, //expect lattice to have been initialized to something
FermionActionD &action, GridParallelRNG &rng){
LanczosParameters params;
if(fileExists(param_file)){
std::cout << GridLogMessage << " Reading " << param_file << std::endl;
Grid::XmlReader rd(param_file);
read(rd, "LanczosParameters", params);
}else if(!GlobalSharedMemory::WorldRank){
std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
Grid::XmlWriter wr(param_file + ".templ");
write(wr, "LanczosParameters", params);
}
FermionFieldD gauss_o(rbGrid);
FermionFieldD gauss(Grid);
gaussian(rng, gauss);
pickCheckerboard(Odd, gauss_o, gauss);
action.ImportGauge(latt);
SchurDiagMooeeOperator<FermionActionD, FermionFieldD> hermop(action);
PlainHermOp<FermionFieldD> hermop_wrap(hermop);
//ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
assert(params.mu == 0.0);
Chebyshev<FermionFieldD> Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1);
FunctionHermOp<FermionFieldD> Cheb_wrap(Cheb, hermop);
std::cout << "IRL: alpha=" << params.alpha << " beta=" << params.beta << " mu=" << params.mu << " ord=" << params.ord << std::endl;
ImplicitlyRestartedLanczos<FermionFieldD> IRL(Cheb_wrap, hermop_wrap, params.n_stop, params.n_want, params.n_use, params.tolerance, 10000);
std::vector<RealD> eval(params.n_use);
std::vector<FermionFieldD> evec(params.n_use, rbGrid);
int Nconv;
IRL.calc(eval, evec, gauss_o, Nconv);
std::cout << "Eigenvalues:" << std::endl;
for(int i=0;i<params.n_want;i++){
std::cout << i << " " << eval[i] << std::endl;
}
}
//Check the quality of the RHMC approx
//action_or_md toggles checking the action (0), MD (1) or both (2) setups
template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt, //expect lattice to have been initialized to something
FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
int inv_pow, const std::string &quark_descr, int action_or_md){
assert(action_or_md == 0 || action_or_md == 1 || action_or_md == 2);
FermionFieldD gauss_o(rbGrid);
FermionFieldD gauss(Grid);
gaussian(rng, gauss);
pickCheckerboard(Odd, gauss_o, gauss);
numOp.ImportGauge(latt);
denOp.ImportGauge(latt);
typedef typename FermionActionD::Impl_t FermionImplPolicyD;
SchurDifferentiableOperator<FermionImplPolicyD> MdagM(numOp);
SchurDifferentiableOperator<FermionImplPolicyD> VdagV(denOp);
PowerMethod<FermionFieldD> power_method;
RealD lambda_max;
std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " numerator" << std::endl;
lambda_max = power_method(MdagM,gauss_o);
std::cout << GridLogMessage << "Got lambda_max "<<lambda_max<<std::endl;
std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " denominator" << std::endl;
lambda_max = power_method(VdagV,gauss_o);
std::cout << GridLogMessage << "Got lambda_max "<<lambda_max<<std::endl;
if(action_or_md == 0 || action_or_md == 2){
std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegPowerAction); //use large tolerance to prevent exit on fail; we are trying to tune here!
std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegHalfPowerAction);
std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegPowerAction);
std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegHalfPowerAction);
std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
}
std::cout << "-------------------------------------------------------------------------------" << std::endl;
if(action_or_md == 1 || action_or_md == 2){
std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegPowerMD);
std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegHalfPowerMD);
std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegPowerMD);
std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegHalfPowerMD);
std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
}
}
template<typename FermionImplPolicy>
void checkEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
GridCartesian* FGrid, GridParallelRNG &rng, const LatticeGaugeFieldD &latt){
std::cout << GridLogMessage << "Starting EOFA action/bounds check" << std::endl;
typename FermionImplPolicy::FermionField eta(FGrid);
RealD scale = std::sqrt(0.5);
gaussian(rng,eta); eta = eta * scale;
//Use the inbuilt check
EOFA.refresh(latt, eta);
EOFA.S(latt);
std::cout << GridLogMessage << "Finished EOFA upper action/bounds check" << std::endl;
}
template<typename FermionImplPolicy>
class EOFAlinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA;
LatticeGaugeFieldD &U;
public:
EOFAlinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA, LatticeGaugeFieldD &U): EOFA(EOFA), U(U){}
typedef typename FermionImplPolicy::FermionField Field;
void OpDiag (const Field &in, Field &out){ assert(0); }
void OpDir (const Field &in, Field &out,int dir,int disp){ assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); }
void Op (const Field &in, Field &out){ assert(0); }
void AdjOp (const Field &in, Field &out){ assert(0); }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
void HermOp(const Field &in, Field &out){ EOFA.Meofa(U, in, out); }
};
template<typename FermionImplPolicy>
void upperBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
std::cout << GridLogMessage << "Starting EOFA upper bound compute" << std::endl;
EOFAlinop<FermionImplPolicy> linop(EOFA, latt);
typename FermionImplPolicy::FermionField eta(FGrid);
gaussian(rng,eta);
PowerMethod<typename FermionImplPolicy::FermionField> power_method;
auto lambda_max = power_method(linop,eta);
std::cout << GridLogMessage << "Upper bound of EOFA operator " << lambda_max << std::endl;
}
//Applications of M^{-1} cost the same as M for EOFA!
template<typename FermionImplPolicy>
class EOFAinvLinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA;
LatticeGaugeFieldD &U;
public:
EOFAinvLinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA, LatticeGaugeFieldD &U): EOFA(EOFA), U(U){}
typedef typename FermionImplPolicy::FermionField Field;
void OpDiag (const Field &in, Field &out){ assert(0); }
void OpDir (const Field &in, Field &out,int dir,int disp){ assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); }
void Op (const Field &in, Field &out){ assert(0); }
void AdjOp (const Field &in, Field &out){ assert(0); }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
void HermOp(const Field &in, Field &out){ EOFA.MeofaInv(U, in, out); }
};
template<typename FermionImplPolicy>
void lowerBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
std::cout << GridLogMessage << "Starting EOFA lower bound compute using power method on M^{-1}. Inverse of highest eigenvalue is the lowest eigenvalue of M" << std::endl;
EOFAinvLinop<FermionImplPolicy> linop(EOFA, latt);
typename FermionImplPolicy::FermionField eta(FGrid);
gaussian(rng,eta);
PowerMethod<typename FermionImplPolicy::FermionField> power_method;
auto lambda_max = power_method(linop,eta);
std::cout << GridLogMessage << "Lower bound of EOFA operator " << 1./lambda_max << std::endl;
}
NAMESPACE_BEGIN(Grid);
template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class SchurOperatorF>
class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
public:
typedef typename FermionOperatorD::FermionField FieldD;
typedef typename FermionOperatorF::FermionField FieldF;
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;
Integer MaxOuterIterations;
GridBase* SinglePrecGrid4; //Grid for single-precision fields
GridBase* SinglePrecGrid5; //Grid for single-precision fields
RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
FermionOperatorF &FermOpF;
FermionOperatorD &FermOpD;
SchurOperatorF &LinOpF;
SchurOperatorD &LinOpD;
Integer TotalInnerIterations; //Number of inner CG iterations
Integer TotalOuterIterations; //Number of restarts
Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
MixedPrecisionConjugateGradientOperatorFunction(RealD tol,
Integer maxinnerit,
Integer maxouterit,
GridBase* _sp_grid4,
GridBase* _sp_grid5,
FermionOperatorF &_FermOpF,
FermionOperatorD &_FermOpD,
SchurOperatorF &_LinOpF,
SchurOperatorD &_LinOpD):
LinOpF(_LinOpF),
LinOpD(_LinOpD),
FermOpF(_FermOpF),
FermOpD(_FermOpD),
Tolerance(tol),
InnerTolerance(tol),
MaxInnerIterations(maxinnerit),
MaxOuterIterations(maxouterit),
SinglePrecGrid4(_sp_grid4),
SinglePrecGrid5(_sp_grid5),
OuterLoopNormMult(100.)
{
};
void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
precisionChange(FermOpF.Umu, FermOpD.Umu);
pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
////////////////////////////////////////////////////////////////////////////////////
// Make a mixed precision conjugate gradient
////////////////////////////////////////////////////////////////////////////////////
MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
MPCG.InnerTolerance = InnerTolerance;
std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
MPCG(src,psi);
}
};
template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class SchurOperatorF>
class MixedPrecisionReliableUpdateConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
public:
typedef typename FermionOperatorD::FermionField FieldD;
typedef typename FermionOperatorF::FermionField FieldF;
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
Integer MaxIterations;
RealD Delta; //reliable update parameter
GridBase* SinglePrecGrid4; //Grid for single-precision fields
GridBase* SinglePrecGrid5; //Grid for single-precision fields
FermionOperatorF &FermOpF;
FermionOperatorD &FermOpD;
SchurOperatorF &LinOpF;
SchurOperatorD &LinOpD;
MixedPrecisionReliableUpdateConjugateGradientOperatorFunction(RealD tol,
RealD delta,
Integer maxit,
GridBase* _sp_grid4,
GridBase* _sp_grid5,
FermionOperatorF &_FermOpF,
FermionOperatorD &_FermOpD,
SchurOperatorF &_LinOpF,
SchurOperatorD &_LinOpD):
LinOpF(_LinOpF),
LinOpD(_LinOpD),
FermOpF(_FermOpF),
FermOpD(_FermOpD),
Tolerance(tol),
Delta(delta),
MaxIterations(maxit),
SinglePrecGrid4(_sp_grid4),
SinglePrecGrid5(_sp_grid5)
{
};
void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
std::cout << GridLogMessage << " Mixed precision reliable CG update wrapper operator() "<<std::endl;
SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
precisionChange(FermOpF.Umu, FermOpD.Umu);
pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
////////////////////////////////////////////////////////////////////////////////////
// Make a mixed precision conjugate gradient
////////////////////////////////////////////////////////////////////////////////////
ConjugateGradientReliableUpdate<FieldD,FieldF> MPCG(Tolerance,MaxIterations,Delta,SinglePrecGrid5,LinOpF,LinOpD);
std::cout << GridLogMessage << "Calling mixed precision reliable update Conjugate Gradient" <<std::endl;
MPCG(src,psi);
}
};
NAMESPACE_END(Grid);
int main(int argc, char **argv) {
Grid_init(&argc, &argv);
int threads = GridThread::GetThreads();
// here make a routine to print all the relevant information on the run
std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
std::string param_file = "params.xml";
bool file_load_check = false;
for(int i=1;i<argc;i++){
std::string sarg(argv[i]);
if(sarg == "--param_file"){
assert(i!=argc-1);
param_file = argv[i+1];
}else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
file_load_check = true;
}
}
//Read the user parameters
EvolParameters user_params;
if(fileExists(param_file)){
std::cout << GridLogMessage << " Reading " << param_file << std::endl;
Grid::XmlReader rd(param_file);
read(rd, "Params", user_params);
}else if(!GlobalSharedMemory::WorldRank){
std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
{
Grid::XmlWriter wr(param_file + ".templ");
write(wr, "Params", user_params);
}
std::cout << GridLogMessage << " Done" << std::endl;
Grid_finalize();
return 0;
}
//Check the parameters
if(user_params.GparityDirs.size() != Nd-1){
std::cerr << "Error in input parameters: expect GparityDirs to have size = " << Nd-1 << std::endl;
exit(1);
}
for(int i=0;i<Nd-1;i++)
if(user_params.GparityDirs[i] != 0 && user_params.GparityDirs[i] != 1){
std::cerr << "Error in input parameters: expect GparityDirs values to be 0 (periodic) or 1 (G-parity)" << std::endl;
exit(1);
}
typedef GparityMobiusEOFAFermionD EOFAactionD;
typedef GparityMobiusFermionD FermionActionD;
typedef typename FermionActionD::Impl_t FermionImplPolicyD;
typedef typename FermionActionD::FermionField FermionFieldD;
typedef GparityMobiusEOFAFermionF EOFAactionF;
typedef GparityMobiusFermionF FermionActionF;
typedef typename FermionActionF::Impl_t FermionImplPolicyF;
typedef typename FermionActionF::FermionField FermionFieldF;
typedef GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD,FermionImplPolicyF> MixedPrecRHMC;
typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicyD> DoublePrecRHMC;
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
IntegratorParameters MD;
typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
typedef HMCWrapper::ImplPolicy GaugeImplPolicy;
MD.name = std::string("MinimumNorm2");
MD.MDsteps = user_params.Steps;
MD.trajL = user_params.TrajectoryLength;
HMCparameters HMCparams;
HMCparams.StartTrajectory = user_params.StartTrajectory;
HMCparams.Trajectories = user_params.Trajectories;
HMCparams.NoMetropolisUntil= 0;
HMCparams.StartingType = user_params.StartingType;
HMCparams.MetropolisTest = user_params.MetropolisTest;
HMCparams.MD = MD;
HMCWrapper TheHMC(HMCparams);
// Grid from the command line arguments --grid and --mpi
TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
CheckpointerParameters CPparams;
CPparams.config_prefix = "ckpoint_lat";
CPparams.rng_prefix = "ckpoint_rng";
CPparams.saveInterval = user_params.SaveInterval;
CPparams.format = "IEEE64BIG";
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
//Note that checkpointing saves the RNG state so that this initialization is required only for the very first configuration
RNGModuleParameters RNGpar;
RNGpar.serial_seeds = "1 2 3 4 5";
RNGpar.parallel_seeds = "6 7 8 9 10";
TheHMC.Resources.SetRNGSeeds(RNGpar);
typedef PlaquetteMod<GaugeImplPolicy> PlaqObs;
TheHMC.Resources.AddObservable<PlaqObs>();
//////////////////////////////////////////////
//aiming for ainv=2.068 me Bob
//Estimated a(ml+mres) [48ID] = 0.001048 0.00104
// a(mh+mres) [48ID] = 0.028847 0.02805
//Estimate Ls=12, b+c=2 mres~0.0003
const int Ls = 12;
Real beta = 1.946;
Real light_mass = 0.00074; //0.00104 - mres_approx;
Real strange_mass = 0.02775; //0.02805 - mres_approx
Real pv_mass = 1.0;
RealD M5 = 1.8;
RealD mobius_scale = 2.; //b+c
RealD mob_bmc = 1.0;
RealD mob_b = (mobius_scale + mob_bmc)/2.;
RealD mob_c = (mobius_scale - mob_bmc)/2.;
//Setup the Grids
auto UGridD = TheHMC.Resources.GetCartesian();
auto UrbGridD = TheHMC.Resources.GetRBCartesian();
auto FGridD = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);
auto FrbGridD = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);
GridCartesian* UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
GridRedBlackCartesian* UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
auto FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
auto FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
ConjugateIwasakiGaugeActionD GaugeAction(beta);
// temporarily need a gauge field
LatticeGaugeFieldD Ud(UGridD);
LatticeGaugeFieldF Uf(UGridF);
//Setup the BCs
FermionActionD::ImplParams Params;
for(int i=0;i<Nd-1;i++) Params.twists[i] = user_params.GparityDirs[i]; //G-parity directions
Params.twists[Nd-1] = 1; //APBC in time direction
std::vector<int> dirs4(Nd);
for(int i=0;i<Nd-1;i++) dirs4[i] = user_params.GparityDirs[i];
dirs4[Nd-1] = 0; //periodic gauge BC in time
GaugeImplPolicy::setDirections(dirs4); //gauge BC
//Run optional gauge field checksum checker and exit
if(file_load_check){
TheHMC.initializeGaugeFieldAndRNGs(Ud);
std::cout << GridLogMessage << " Done" << std::endl;
Grid_finalize();
return 0;
}
////////////////////////////////////
// Collect actions
////////////////////////////////////
ActionLevel<HMCWrapper::Field> Level1(1); //light quark + strange quark
ActionLevel<HMCWrapper::Field> Level2(4); //DSDR
ActionLevel<HMCWrapper::Field> Level3(2); //gauge
/////////////////////////////////////////////////////////////
// Light EOFA action
// have to be careful with the parameters, cf. Test_dwf_gpforce_eofa.cc
/////////////////////////////////////////////////////////////
typedef SchurDiagMooeeOperator<EOFAactionD,FermionFieldD> EOFAschuropD;
typedef SchurDiagMooeeOperator<EOFAactionF,FermionFieldF> EOFAschuropF;
typedef ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction<FermionImplPolicyD, FermionImplPolicyF> EOFAmixPrecPFaction;
typedef MixedPrecisionConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_mxCG;
typedef MixedPrecisionReliableUpdateConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_relupCG;
std::vector<RealD> eofa_light_masses = { light_mass , 0.004, 0.016, 0.064, 0.256 };
std::vector<RealD> eofa_pv_masses = { 0.004 , 0.016, 0.064, 0.256, 1.0 };
int n_light_hsb = 5;
assert(user_params.eofa_l.size() == n_light_hsb);
EOFAmixPrecPFaction* EOFA_pfactions[n_light_hsb];
for(int i=0;i<n_light_hsb;i++){
RealD iml = eofa_light_masses[i];
RealD ipv = eofa_pv_masses[i];
EOFAactionD* LopD = new EOFAactionD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, iml, iml, ipv, 0.0, -1, M5, mob_b, mob_c, Params);
EOFAactionF* LopF = new EOFAactionF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, iml, iml, ipv, 0.0, -1, M5, mob_b, mob_c, Params);
EOFAactionD* RopD = new EOFAactionD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, ipv, iml, ipv, -1.0, 1, M5, mob_b, mob_c, Params);
EOFAactionF* RopF = new EOFAactionF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, ipv, iml, ipv, -1.0, 1, M5, mob_b, mob_c, Params);
EOFAschuropD* linopL_D = new EOFAschuropD(*LopD);
EOFAschuropD* linopR_D = new EOFAschuropD(*RopD);
EOFAschuropF* linopL_F = new EOFAschuropF(*LopF);
EOFAschuropF* linopR_F = new EOFAschuropF(*RopF);
#if 1
//Note reusing user_params.eofa_l.action(|md)_mixcg_inner_tolerance as Delta for now
EOFA_relupCG* ActionMCG_L = new EOFA_relupCG(user_params.eofa_l[i].action_tolerance, user_params.eofa_l[i].action_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
EOFA_relupCG* ActionMCG_R = new EOFA_relupCG(user_params.eofa_l[i].action_tolerance, user_params.eofa_l[i].action_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
EOFA_relupCG* DerivMCG_L = new EOFA_relupCG(user_params.eofa_l[i].md_tolerance, user_params.eofa_l[i].md_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
EOFA_relupCG* DerivMCG_R = new EOFA_relupCG(user_params.eofa_l[i].md_tolerance, user_params.eofa_l[i].md_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
#else
EOFA_mxCG* ActionMCG_L = new EOFA_mxCG(user_params.eofa_l[i].action_tolerance, 10000, 1000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
ActionMCG_L->InnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance;
EOFA_mxCG* ActionMCG_R = new EOFA_mxCG(user_params.eofa_l[i].action_tolerance, 10000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
ActionMCG_R->InnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance;
EOFA_mxCG* DerivMCG_L = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 10000, 1000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
DerivMCG_L->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance;
EOFA_mxCG* DerivMCG_R = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 10000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
DerivMCG_R->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance;
std::cout << GridLogMessage << "Set EOFA action solver action tolerance outer=" << ActionMCG_L->Tolerance << " inner=" << ActionMCG_L->InnerTolerance << std::endl;
std::cout << GridLogMessage << "Set EOFA MD solver tolerance outer=" << DerivMCG_L->Tolerance << " inner=" << DerivMCG_L->InnerTolerance << std::endl;
#endif
EOFAmixPrecPFaction* EOFA = new EOFAmixPrecPFaction(*LopF, *RopF,
*LopD, *RopD,
*ActionMCG_L, *ActionMCG_R,
*ActionMCG_L, *ActionMCG_R,
*DerivMCG_L, *DerivMCG_R,
user_params.eofa_l[i].rat_params, true);
EOFA_pfactions[i] = EOFA;
Level1.push_back(EOFA);
}
////////////////////////////////////
// Strange action
////////////////////////////////////
FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD,strange_mass,M5,mob_b,mob_c,Params);
FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD, pv_mass,M5,mob_b,mob_c,Params);
FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF,strange_mass,M5,mob_b,mob_c,Params);
FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF, pv_mass,M5,mob_b,mob_c,Params);
RationalActionParams rat_act_params_s;
rat_act_params_s.inv_pow = 4; // (M^dag M)^{1/4}
rat_act_params_s.precision= 60;
rat_act_params_s.MaxIter = 10000;
user_params.rat_quo_s.Export(rat_act_params_s);
std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_s.BoundsCheckFreq << " trajectories (avg)" << std::endl;
//MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq);
DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s);
Level1.push_back(&Quotient_s);
///////////////////////////////////
// DSDR action
///////////////////////////////////
RealD dsdr_mass=-1.8;
//Use same DSDR twists as https://arxiv.org/pdf/1208.4412.pdf
RealD dsdr_epsilon_f = 0.02; //numerator (in determinant)
RealD dsdr_epsilon_b = 0.5;
GparityWilsonTMFermionD Numerator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_f, Params);
GparityWilsonTMFermionF Numerator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_f, Params);
GparityWilsonTMFermionD Denominator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_b, Params);
GparityWilsonTMFermionF Denominator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_b, Params);
RationalActionParams rat_act_params_DSDR;
rat_act_params_DSDR.inv_pow = 2; // (M^dag M)^{1/2}
rat_act_params_DSDR.precision= 60;
rat_act_params_DSDR.MaxIter = 10000;
user_params.rat_quo_DSDR.Export(rat_act_params_DSDR);
std::cout << GridLogMessage << "DSDR quark bounds check every " << rat_act_params_DSDR.BoundsCheckFreq << " trajectories (avg)" << std::endl;
DoublePrecRHMC Quotient_DSDR(Denominator_DSDR_D, Numerator_DSDR_D, rat_act_params_DSDR);
Level2.push_back(&Quotient_DSDR);
/////////////////////////////////////////////////////////////
// Gauge action
/////////////////////////////////////////////////////////////
Level3.push_back(&GaugeAction);
TheHMC.TheAction.push_back(Level1);
TheHMC.TheAction.push_back(Level2);
TheHMC.TheAction.push_back(Level3);
std::cout << GridLogMessage << " Action complete "<< std::endl;
//Action tuning
bool
tune_rhmc_s=false, eigenrange_s=false,
tune_rhmc_DSDR=false, eigenrange_DSDR=false,
check_eofa=false,
upper_bound_eofa=false, lower_bound_eofa(false);
std::string lanc_params_s;
std::string lanc_params_DSDR;
int tune_rhmc_s_action_or_md;
int tune_rhmc_DSDR_action_or_md;
int eofa_which_hsb;
for(int i=1;i<argc;i++){
std::string sarg(argv[i]);
if(sarg == "--tune_rhmc_s"){
assert(i < argc-1);
tune_rhmc_s=true;
tune_rhmc_s_action_or_md = std::stoi(argv[i+1]);
}
else if(sarg == "--eigenrange_s"){
assert(i < argc-1);
eigenrange_s=true;
lanc_params_s = argv[i+1];
}
else if(sarg == "--tune_rhmc_DSDR"){
assert(i < argc-1);
tune_rhmc_DSDR=true;
tune_rhmc_DSDR_action_or_md = std::stoi(argv[i+1]);
}
else if(sarg == "--eigenrange_DSDR"){
assert(i < argc-1);
eigenrange_DSDR=true;
lanc_params_DSDR = argv[i+1];
}
else if(sarg == "--check_eofa"){
assert(i < argc-1);
check_eofa = true;
eofa_which_hsb = std::stoi(argv[i+1]); //-1 indicates all hasenbusch
assert(eofa_which_hsb == -1 || (eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb) );
}
else if(sarg == "--upper_bound_eofa"){
assert(i < argc-1);
upper_bound_eofa = true;
eofa_which_hsb = std::stoi(argv[i+1]);
assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
}
else if(sarg == "--lower_bound_eofa"){
assert(i < argc-1);
lower_bound_eofa = true;
eofa_which_hsb = std::stoi(argv[i+1]);
assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
}
}
if(tune_rhmc_s || eigenrange_s || tune_rhmc_DSDR || eigenrange_DSDR ||check_eofa || upper_bound_eofa || lower_bound_eofa) {
std::cout << GridLogMessage << "Running checks" << std::endl;
TheHMC.initializeGaugeFieldAndRNGs(Ud);
//std::cout << GridLogMessage << "EOFA action solver action tolerance outer=" << ActionMCG_L.Tolerance << " inner=" << ActionMCG_L.InnerTolerance << std::endl;
//std::cout << GridLogMessage << "EOFA MD solver tolerance outer=" << DerivMCG_L.Tolerance << " inner=" << DerivMCG_L.InnerTolerance << std::endl;
if(check_eofa){
if(eofa_which_hsb >= 0){
std::cout << GridLogMessage << "Starting checking EOFA Hasenbusch " << eofa_which_hsb << std::endl;
checkEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
std::cout << GridLogMessage << "Finished checking EOFA Hasenbusch " << eofa_which_hsb << std::endl;
}else{
for(int i=0;i<n_light_hsb;i++){
std::cout << GridLogMessage << "Starting checking EOFA Hasenbusch " << i << std::endl;
checkEOFA(*EOFA_pfactions[i], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
std::cout << GridLogMessage << "Finished checking EOFA Hasenbusch " << i << std::endl;
}
}
}
if(upper_bound_eofa) upperBoundEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
if(lower_bound_eofa) lowerBoundEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
if(eigenrange_s) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG());
if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_s)>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange", tune_rhmc_s_action_or_md);
if(eigenrange_DSDR) computeEigenvalues<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField>(lanc_params_DSDR, UGridD, UrbGridD, Ud, Numerator_DSDR_D, TheHMC.Resources.GetParallelRNG());
if(tune_rhmc_DSDR) checkRHMC<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField, decltype(Quotient_DSDR)>(UGridD, UrbGridD, Ud, Numerator_DSDR_D, Denominator_DSDR_D, Quotient_DSDR, TheHMC.Resources.GetParallelRNG(), 2, "DSDR", tune_rhmc_DSDR_action_or_md);
std::cout << GridLogMessage << " Done" << std::endl;
Grid_finalize();
return 0;
}
//Run the HMC
std::cout << GridLogMessage << " Running the HMC "<< std::endl;
TheHMC.Run();
std::cout << GridLogMessage << " Done" << std::endl;
Grid_finalize();
return 0;
} // main

View File

@ -128,8 +128,14 @@ template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, c
////////////////////////////////////////////////////////////////////////////////////
// Make a mixed precision conjugate gradient
////////////////////////////////////////////////////////////////////////////////////
MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
#if 1
RealD delta=1.e-4;
std::cout << GridLogMessage << "Calling reliable update Conjugate Gradient" <<std::endl;
ConjugateGradientReliableUpdate<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations*MaxOuterIterations,delta,SinglePrecGrid5,LinOpF,LinOpD);
#else
std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
#endif
MPCG(src,psi);
}
};
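The hunk above swaps the two-level MixedPrecisionConjugateGradient for ConjugateGradientReliableUpdate with delta = 1.e-4. A reliable-update solver keeps iterating with a cheap (single precision) operator and only occasionally recomputes the true residual b - A x accurately, resetting whenever the iterated residual has dropped by delta since the last reset. The standalone sketch below shows that control flow only, on a toy SPD tridiagonal operator in plain double precision; it is not Grid's ConjugateGradientReliableUpdate and omits the single/double precision split.
#include <cstdio>
#include <cmath>
#include <vector>
typedef std::vector<double> Vec;
// Small SPD tridiagonal operator standing in for MdagM.
static Vec applyA(const Vec &x){
int n = x.size();
Vec y(n);
for(int i=0;i<n;i++){
y[i] = 4.0*x[i];
if(i>0) y[i] -= x[i-1];
if(i<n-1) y[i] -= x[i+1];
}
return y;
}
static double dot(const Vec &a, const Vec &b){
double s=0; for(size_t i=0;i<a.size();i++) s += a[i]*b[i]; return s;
}
int main(void){
const int n = 64;
const double tol = 1.0e-10, delta = 1.0e-4; // delta plays the role of the 1.e-4 above
Vec b(n,1.0), x(n,0.0);
Vec r = b; // r = b - A x with x = 0
Vec p = r;
double rr = dot(r,r), rr_reliable = rr;
const double rsq = tol*tol*dot(b,b);
for(int k=1;k<=10000;k++){
Vec Ap = applyA(p);
double alpha = rr/dot(p,Ap);
for(int i=0;i<n;i++){ x[i] += alpha*p[i]; r[i] -= alpha*Ap[i]; }
double rr_new = dot(r,r);
// Reliable update: once the iterated residual has fallen by delta since the last
// refresh, recompute the true residual b - A x to flush accumulated rounding error.
if( rr_new < delta*delta*rr_reliable ){
Vec Ax = applyA(x);
for(int i=0;i<n;i++) r[i] = b[i] - Ax[i];
rr_new = dot(r,r);
rr_reliable = rr_new;
std::printf("reliable update at iteration %d, |r|^2 = %g\n", k, rr_new);
}
if( rr_new <= rsq ){ std::printf("converged in %d iterations\n", k); break; }
double beta = rr_new/rr;
for(int i=0;i<n;i++) p[i] = r[i] + beta*p[i];
rr = rr_new;
}
return 0;
}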
@ -141,6 +147,10 @@ int main(int argc, char **argv) {
using namespace Grid;
Grid_init(&argc, &argv);
CartesianCommunicator::BarrierWorld();
std::cout << GridLogMessage << " Clock skew check" <<std::endl;
int threads = GridThread::GetThreads();
// Typedefs to simplify notation
@ -161,7 +171,7 @@ int main(int argc, char **argv) {
// MD.name = std::string("Force Gradient");
typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
MD.name = std::string("MinimumNorm2");
MD.MDsteps = 4;
MD.MDsteps = 6;
MD.trajL = 1.0;
HMCparameters HMCparams;
@ -183,7 +193,7 @@ int main(int argc, char **argv) {
CPparams.saveInterval = 1;
CPparams.format = "IEEE64BIG";
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
std::cout << "loaded NERSC checpointer"<<std::endl;
RNGModuleParameters RNGpar;
RNGpar.serial_seeds = "1 2 3 4 5";
RNGpar.parallel_seeds = "6 7 8 9 10";
@ -204,7 +214,8 @@ int main(int argc, char **argv) {
Real light_mass = 7.8e-4;
Real strange_mass = 0.02132;
Real pv_mass = 1.0;
std::vector<Real> hasenbusch({ light_mass, 3.8e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass });
// std::vector<Real> hasenbusch({ light_mass, 3.8e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass });
std::vector<Real> hasenbusch({ light_mass, 5e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass });
// FIXME:
// Same in MC and MD
@ -287,6 +298,7 @@ int main(int argc, char **argv) {
std::cout << GridLogMessage << " Running the HMC "<< std::endl;
TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file
TheHMC.initializeGaugeFieldAndRNGs(U);
std::cout << "loaded NERSC gauge field"<<std::endl;
// These lines are unnecessary if BC are all periodic

View File

@ -0,0 +1,474 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_hmc_EODWFRatio.cc
Copyright (C) 2015-2016
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
NAMESPACE_BEGIN(Grid);
template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class SchurOperatorF>
class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
public:
typedef typename FermionOperatorD::FermionField FieldD;
typedef typename FermionOperatorF::FermionField FieldF;
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;
Integer MaxOuterIterations;
GridBase* SinglePrecGrid4; //Grid for single-precision fields
GridBase* SinglePrecGrid5; //Grid for single-precision fields
RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
FermionOperatorF &FermOpF;
FermionOperatorD &FermOpD;
SchurOperatorF &LinOpF;
SchurOperatorD &LinOpD;
Integer TotalInnerIterations; //Number of inner CG iterations
Integer TotalOuterIterations; //Number of restarts
Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
MixedPrecisionConjugateGradientOperatorFunction(RealD tol,
Integer maxinnerit,
Integer maxouterit,
GridBase* _sp_grid4,
GridBase* _sp_grid5,
FermionOperatorF &_FermOpF,
FermionOperatorD &_FermOpD,
SchurOperatorF &_LinOpF,
SchurOperatorD &_LinOpD):
LinOpF(_LinOpF),
LinOpD(_LinOpD),
FermOpF(_FermOpF),
FermOpD(_FermOpD),
Tolerance(tol),
InnerTolerance(tol),
MaxInnerIterations(maxinnerit),
MaxOuterIterations(maxouterit),
SinglePrecGrid4(_sp_grid4),
SinglePrecGrid5(_sp_grid5),
OuterLoopNormMult(100.)
{
/* Debugging instances of objects; references are stored
std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpF " <<std::hex<< &LinOpF<<std::dec <<std::endl;
std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpD " <<std::hex<< &LinOpD<<std::dec <<std::endl;
std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpF " <<std::hex<< &FermOpF<<std::dec <<std::endl;
std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpD " <<std::hex<< &FermOpD<<std::dec <<std::endl;
*/
};
void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
// std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpU " <<std::hex<< &(SchurOpU->_Mat)<<std::dec <<std::endl;
// std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpD " <<std::hex<< &(LinOpD._Mat) <<std::dec <<std::endl;
// Assumption made in code to extract gauge field
// We could avoid storing the LinOpD reference altogether ?
assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
////////////////////////////////////////////////////////////////////////////////////
// Must snarf a single precision copy of the gauge field in Linop_d argument
////////////////////////////////////////////////////////////////////////////////////
typedef typename FermionOperatorF::GaugeField GaugeFieldF;
typedef typename FermionOperatorF::GaugeLinkField GaugeLinkFieldF;
typedef typename FermionOperatorD::GaugeField GaugeFieldD;
typedef typename FermionOperatorD::GaugeLinkField GaugeLinkFieldD;
GridBase * GridPtrF = SinglePrecGrid4;
GridBase * GridPtrD = FermOpD.Umu.Grid();
GaugeFieldF U_f (GridPtrF);
GaugeLinkFieldF Umu_f(GridPtrF);
// std::cout << " Dim gauge field "<<GridPtrF->Nd()<<std::endl; // 4d
// std::cout << " Dim gauge field "<<GridPtrD->Nd()<<std::endl; // 4d
////////////////////////////////////////////////////////////////////////////////////
// Moving this to a Clone method of the fermion operator would allow the physics
// parameters to be duplicated and would reduce the number of gauge field copies
////////////////////////////////////////////////////////////////////////////////////
GaugeLinkFieldD Umu_d(GridPtrD);
for(int mu=0;mu<Nd*2;mu++){
Umu_d = PeekIndex<LorentzIndex>(FermOpD.Umu, mu);
precisionChange(Umu_f,Umu_d);
PokeIndex<LorentzIndex>(FermOpF.Umu, Umu_f, mu);
}
pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
////////////////////////////////////////////////////////////////////////////////////
// Make a mixed precision conjugate gradient
////////////////////////////////////////////////////////////////////////////////////
#if 1
RealD delta=1.e-4;
std::cout << GridLogMessage << "Calling reliable update Conjugate Gradient" <<std::endl;
ConjugateGradientReliableUpdate<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations*MaxOuterIterations,delta,SinglePrecGrid5,LinOpF,LinOpD);
#else
std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
#endif
MPCG(src,psi);
}
};
NAMESPACE_END(Grid);
int main(int argc, char **argv) {
using namespace Grid;
Grid_init(&argc, &argv);
int threads = GridThread::GetThreads();
// Typedefs to simplify notation
typedef WilsonImplR FermionImplPolicy;
typedef WilsonImplF FermionImplPolicyF;
typedef MobiusFermionR FermionAction;
typedef MobiusFermionF FermionActionF;
typedef typename FermionAction::FermionField FermionField;
typedef typename FermionActionF::FermionField FermionFieldF;
typedef Grid::XmlReader Serialiser;
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
IntegratorParameters MD;
// typedef GenericHMCRunner<LeapFrog> HMCWrapper;
// MD.name = std::string("Leap Frog");
// typedef GenericHMCRunner<ForceGradient> HMCWrapper;
// MD.name = std::string("Force Gradient");
typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
MD.name = std::string("MinimumNorm2");
MD.MDsteps = 6;
MD.trajL = 1.0;
HMCparameters HMCparams;
HMCparams.StartTrajectory = 1077;
HMCparams.Trajectories = 1;
HMCparams.NoMetropolisUntil= 0;
// "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
// HMCparams.StartingType =std::string("ColdStart");
HMCparams.StartingType =std::string("CheckpointStart");
HMCparams.MD = MD;
HMCWrapper TheHMC(HMCparams);
// Grid from the command line arguments --grid and --mpi
TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
CheckpointerParameters CPparams;
CPparams.config_prefix = "ckpoint_DDHMC_lat";
CPparams.rng_prefix = "ckpoint_DDHMC_rng";
CPparams.saveInterval = 1;
CPparams.format = "IEEE64BIG";
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
RNGModuleParameters RNGpar;
RNGpar.serial_seeds = "1 2 3 4 5";
RNGpar.parallel_seeds = "6 7 8 9 10";
TheHMC.Resources.SetRNGSeeds(RNGpar);
// Construct observables
// here there is too much indirection
typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
TheHMC.Resources.AddObservable<PlaqObs>();
//////////////////////////////////////////////
const int Ls = 12;
RealD M5 = 1.8;
RealD b = 1.5;
RealD c = 0.5;
Real beta = 2.31;
// Real light_mass = 5.4e-4;
Real light_mass = 7.8e-4;
Real strange_mass = 0.02132;
Real pv_mass = 1.0;
// std::vector<Real> hasenbusch({ light_mass, 3.8e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass });
std::vector<Real> hasenbusch({ light_mass, 5e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass });
// FIXME:
// Same in MC and MD
// Need to mix precision too
OneFlavourRationalParams SFRp; // Strange
SFRp.lo = 4.0e-3;
SFRp.hi = 90.0;
SFRp.MaxIter = 60000;
SFRp.tolerance= 1.0e-8;
SFRp.mdtolerance= 1.0e-6;
SFRp.degree = 12;
SFRp.precision= 50;
SFRp.BoundsCheckFreq=0;
OneFlavourRationalParams OFRp; // Up/down
OFRp.lo = 2.0e-5;
OFRp.hi = 90.0;
OFRp.MaxIter = 60000;
OFRp.tolerance= 1.0e-8;
OFRp.mdtolerance= 1.0e-6;
// OFRp.degree = 20; converges
// OFRp.degree = 16;
OFRp.degree = 12;
OFRp.precision= 80;
OFRp.BoundsCheckFreq=0;
auto GridPtr = TheHMC.Resources.GetCartesian();
auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
typedef SchurDiagMooeeOperator<FermionActionF,FermionFieldF> LinearOperatorF;
typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusFermionD,MobiusFermionF,LinearOperatorD,LinearOperatorF> MxPCG;
////////////////////////////////////////////////////////////////
// Domain decomposed
////////////////////////////////////////////////////////////////
Coordinate latt4 = GridPtr->GlobalDimensions();
Coordinate mpi = GridPtr->ProcessorGrid();
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi,shm);
Coordinate CommDim(Nd);
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
Coordinate NonDirichlet(Nd+1,0);
Coordinate Dirichlet(Nd+1,0);
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
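// In other words: Dirichlet[mu+1] is the per-node local extent latt4[mu]/mpi[mu]*shm[mu]
// when direction mu is split across nodes (CommDim[mu] = 1), and 0 (no Dirichlet cut)
// when that direction lives entirely inside one node; Block4 below reuses these extents
// for the DDHMC momentum filter.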
Coordinate Block4(Nd);
Block4[0] = Dirichlet[1];
Block4[1] = Dirichlet[2];
Block4[2] = Dirichlet[3];
Block4[3] = Dirichlet[4];
int Width=3;
TheHMC.Resources.SetMomentumFilter(new DDHMCFilter<WilsonImplR::Field>(Block4,Width));
//////////////////////////
// Fermion Grids
//////////////////////////
auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
Coordinate simdF = GridDefaultSimd(Nd,vComplexF::Nsimd());
auto GridPtrF = SpaceTimeGrid::makeFourDimGrid(latt4,simdF,mpi);
auto GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF);
auto FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF);
auto FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF);
IwasakiGaugeActionR GaugeAction(beta);
// temporarily need a gauge field
LatticeGaugeField U(GridPtr);
LatticeGaugeFieldF UF(GridPtrF);
std::cout << GridLogMessage << " Running the HMC "<< std::endl;
TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file
TheHMC.initializeGaugeFieldAndRNGs(U);
// These lines are unnecessary if BC are all periodic
std::vector<Complex> boundary = {1,1,1,-1};
FermionAction::ImplParams Params(boundary);
Params.dirichlet=NonDirichlet;
FermionAction::ImplParams ParamsDir(boundary);
ParamsDir.dirichlet=Dirichlet;
// double StoppingCondition = 1e-14;
// double MDStoppingCondition = 1e-9;
double StoppingCondition = 1e-10;
double MDStoppingCondition = 1e-7;
double MDStoppingConditionLoose = 1e-6;
double MaxCGIterations = 300000;
ConjugateGradient<FermionField> CG(StoppingCondition,MaxCGIterations);
ConjugateGradient<FermionField> MDCG(MDStoppingCondition,MaxCGIterations);
////////////////////////////////////
// Collect actions
////////////////////////////////////
ActionLevel<HMCWrapper::Field> Level1(1);
ActionLevel<HMCWrapper::Field> Level2(4);
ActionLevel<HMCWrapper::Field> Level3(8);
////////////////////////////////////
// Strange action
////////////////////////////////////
FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params);
FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, Params);
FermionAction StrangeOpDir (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, ParamsDir);
FermionAction StrangePauliVillarsOpDir(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, ParamsDir);
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionBdy(StrangeOpDir,StrangeOp,SFRp);
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionLocal(StrangePauliVillarsOpDir,StrangeOpDir,SFRp);
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionPVBdy(StrangePauliVillarsOp,StrangePauliVillarsOpDir,SFRp);
Level1.push_back(&StrangePseudoFermionBdy); // ok
Level2.push_back(&StrangePseudoFermionLocal);
Level1.push_back(&StrangePseudoFermionPVBdy); //ok
////////////////////////////////////
// up down action
////////////////////////////////////
std::vector<Real> light_den;
std::vector<Real> light_num;
std::vector<int> dirichlet_den;
std::vector<int> dirichlet_num;
int n_hasenbusch = hasenbusch.size();
light_den.push_back(light_mass); dirichlet_den.push_back(0);
for(int h=0;h<n_hasenbusch;h++){
light_den.push_back(hasenbusch[h]); dirichlet_den.push_back(1);
}
for(int h=0;h<n_hasenbusch;h++){
light_num.push_back(hasenbusch[h]); dirichlet_num.push_back(1);
}
light_num.push_back(pv_mass); dirichlet_num.push_back(0);
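// Note how the ladder telescopes: each intermediate Hasenbusch mass appears once in
// light_num[h] and once in light_den[h+1], both Dirichlet-projected, so the product of
// the quotient determinants det D(light_den[h]) / det D(light_num[h]) over all h reduces
// to det D(light_mass) / det D(pv_mass).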
std::vector<FermionAction *> Numerators;
std::vector<FermionActionF *> NumeratorsF;
std::vector<FermionAction *> Denominators;
std::vector<FermionActionF *> DenominatorsF;
std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
#define MIXED_PRECISION
#ifdef MIXED_PRECISION
std::vector<OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF> *> Bdys;
#else
std::vector<OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
#endif
std::vector<MxPCG *> ActionMPCG;
std::vector<MxPCG *> MPCG;
typedef SchurDiagMooeeOperator<FermionActionF,FermionFieldF> LinearOperatorF;
typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
std::vector<LinearOperatorD *> LinOpD;
std::vector<LinearOperatorF *> LinOpF;
for(int h=0;h<n_hasenbusch+1;h++){
std::cout << GridLogMessage
<< " 2f quotient Action ";
std::cout << "det D("<<light_den[h]<<")";
if ( dirichlet_den[h] ) std::cout << "^dirichlet ";
std::cout << "/ det D("<<light_num[h]<<")";
if ( dirichlet_num[h] ) std::cout << "^dirichlet ";
std::cout << std::endl;
FermionAction::ImplParams ParamsNum(boundary);
FermionAction::ImplParams ParamsDen(boundary);
FermionActionF::ImplParams ParamsNumF(boundary);
FermionActionF::ImplParams ParamsDenF(boundary);
if ( dirichlet_num[h]==1) ParamsNum.dirichlet = Dirichlet;
else ParamsNum.dirichlet = NonDirichlet;
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
else ParamsDen.dirichlet = NonDirichlet;
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
ParamsDenF.dirichlet = ParamsDen.dirichlet;
DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF));
ParamsNumF.dirichlet = ParamsNum.dirichlet;
NumeratorsF.push_back (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumF));
LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
LinOpF.push_back(new LinearOperatorF(*DenominatorsF[h]));
double conv = MDStoppingCondition;
if (h<3) conv= MDStoppingConditionLoose; // Relax on the first three Hasenbusch factors
const int MX_inner = 5000;
MPCG.push_back(new MxPCG(conv,
MX_inner,
MaxCGIterations,
GridPtrF,
FrbGridF,
*DenominatorsF[h],*Denominators[h],
*LinOpF[h], *LinOpD[h]) );
ActionMPCG.push_back(new MxPCG(StoppingCondition,
MX_inner,
MaxCGIterations,
GridPtrF,
FrbGridF,
*DenominatorsF[h],*Denominators[h],
*LinOpF[h], *LinOpD[h]) );
if(h!=0) {
// Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],MDCG,CG));
Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],CG));
} else {
#ifdef MIXED_PRECISION
Bdys.push_back( new OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF>(
*Numerators[h],*Denominators[h],
*NumeratorsF[h],*DenominatorsF[h],
OFRp, 500) );
Bdys.push_back( new OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF>(
*Numerators[h],*Denominators[h],
*NumeratorsF[h],*DenominatorsF[h],
OFRp, 500) );
#else
Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
#endif
}
}
int nquo=Quotients.size();
Level1.push_back(Bdys[0]);
Level1.push_back(Bdys[1]);
for(int h=0;h<nquo-1;h++){
Level2.push_back(Quotients[h]);
}
Level2.push_back(Quotients[nquo-1]);
/////////////////////////////////////////////////////////////
// Gauge action
/////////////////////////////////////////////////////////////
Level3.push_back(&GaugeAction);
TheHMC.TheAction.push_back(Level1);
TheHMC.TheAction.push_back(Level2);
TheHMC.TheAction.push_back(Level3);
std::cout << GridLogMessage << " Action complete "<< std::endl;
/////////////////////////////////////////////////////////////
TheHMC.Run(); // no smearing
Grid_finalize();
} // main

View File

@ -98,9 +98,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
Dw.Dhop(src,result,0);
__SSC_STOP;
}
double t1=usecond();
FGrid->Barrier();
@ -141,9 +139,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
DwD.Dhop(src_d,result_d,0);
__SSC_STOP;
}
double t1=usecond();
FGrid_d->Barrier();

View File

@ -1,184 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/IO/Test_field_array_io.cc
Copyright (C) 2015
Author: Christopher Kelly <ckelly@bnl.gov>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
//This test demonstrates and checks a single-file write of an arbitrary array of fields
uint64_t writeHeader(const uint32_t size, const uint32_t checksum, const std::string &format, const std::string &file){
std::ofstream fout(file,std::ios::out|std::ios::in);
fout.seekp(0,std::ios::beg);
fout << std::setw(10) << size << std::endl;
fout << std::hex << std::setw(10) << checksum << std::endl;
fout << format << std::endl;
return fout.tellp();
}
uint64_t readHeader(uint32_t &size, uint32_t &checksum, std::string &format, const std::string &file){
std::ifstream fin(file);
std::string line;
getline(fin,line);
{
std::stringstream ss; ss <<line ; ss >> size;
}
getline(fin,line);
{
std::stringstream ss; ss <<line ; ss >> std::hex >> checksum;
}
getline(fin,format);
removeWhitespace(format);
return fin.tellg();
}
template<typename FieldType>
void writeFieldArray(const std::string &file, const std::vector<FieldType> &data){
typedef typename FieldType::vector_object vobj;
typedef typename FieldType::scalar_object sobj;
GridBase* grid = data[0].Grid(); //assume all fields have the same Grid
BinarySimpleMunger<sobj, sobj> munge; //straight copy
//We need a 2-pass header write, first to establish the size, the second pass writes the checksum
std::string format = getFormatString<typename FieldType::vector_object>();
uint64_t offset; //leave 64 bits for header
if ( grid->IsBoss() ) {
NerscIO::truncate(file);
offset = writeHeader(data.size(), 0, format, file);
}
grid->Broadcast(0,(void *)&offset,sizeof(offset)); //use as a barrier
std::cout << "Data offset write " << offset << std::endl;
std::cout << "Data size write " << data.size() << std::endl;
uint64_t field_size = uint64_t(grid->gSites()) * sizeof(sobj);
std::cout << "Field size = " << field_size << " B" << std::endl;
uint32_t checksum = 0;
for(int i=0;i<data.size();i++){
std::cout << "Data field write " << i << " offset " << offset << std::endl;
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeLatticeObject<vobj,sobj>(const_cast<FieldType &>(data[i]),file,munge,offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
offset += field_size;
checksum ^= nersc_csum + 0x9e3779b9 + (checksum<<6) + (checksum>>2);
}
std::cout << "Write checksum " << checksum << std::endl;
if ( grid->IsBoss() ) {
writeHeader(data.size(), checksum, format, file);
}
}
template<typename FieldType>
void readFieldArray(std::vector<FieldType> &data, const std::string &file){
typedef typename FieldType::vector_object vobj;
typedef typename FieldType::scalar_object sobj;
assert(data.size() > 0);
GridBase* grid = data[0].Grid(); //assume all fields have the same Grid
BinarySimpleUnmunger<sobj, sobj> munge; //straight copy
uint32_t hdr_checksum, hdr_size;
std::string format;
uint64_t offset = readHeader(hdr_size, hdr_checksum, format, file);
std::cout << "Data offset read " << offset << std::endl;
std::cout << "Data size read " << hdr_size << std::endl;
assert(data.size() == hdr_size);
uint64_t field_size = uint64_t(grid->gSites()) * sizeof(sobj);
uint32_t checksum = 0;
for(int i=0;i<data.size();i++){
std::cout << "Data field read " << i << " offset " << offset << std::endl;
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::readLatticeObject<vobj,sobj>(data[i],file,munge,offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
offset += field_size;
checksum ^= nersc_csum + 0x9e3779b9 + (checksum<<6) + (checksum>>2);
}
std::cout << "Header checksum " << hdr_checksum << std::endl;
std::cout << "Read checksum " << checksum << std::endl;
assert( hdr_checksum == checksum );
}
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
Coordinate latt = GridDefaultLatt();
Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
const int Ls=8;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt, simd_layout, mpi_layout);
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
typedef DomainWallFermionD::FermionField FermionField;
int nfield = 20;
std::vector<FermionField> data(nfield, FGrid);
for(int i=0;i<data.size();i++)
gaussian(RNG5, data[i]);
std::string file = "test_field_array_io.0";
writeFieldArray(file, data);
std::vector<FermionField> data_r(nfield, FGrid);
readFieldArray(data_r, file);
for(int i=0;i<nfield;i++){
FermionField diff = data_r[i] - data[i];
RealD norm_diff = norm2(diff);
std::cout << "Norm2 of difference between stored and loaded data index " << i << " : " << norm_diff << std::endl;
}
std::cout << "Done" << std::endl;
Grid_finalize();
}
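The deleted test above hinges on two details: a two-pass header write (sizes first, checksum patched in afterwards) and an order-dependent fold of the per-field NERSC checksums using the 0x9e3779b9 combiner. A minimal standalone sketch of the same pattern, assuming plain std::fstream and a toy per-record checksum in place of Grid's BinaryIO machinery:
#include <cstdint>
#include <fstream>
#include <iomanip>
#include <vector>
static void writeHeader(std::fstream &f, uint32_t size, uint32_t checksum){
f.seekp(0, std::ios::beg);
f << std::setw(10) << size << std::endl;
f << std::hex << std::setw(10) << checksum << std::dec << std::endl;
}
int main(void){
std::vector<std::vector<char> > records(3, std::vector<char>(1024,'x'));
std::fstream f("two_pass_demo.bin", std::ios::out|std::ios::binary|std::ios::trunc);
writeHeader(f, (uint32_t)records.size(), 0); // pass 1: size known, checksum not yet
uint32_t checksum = 0;
for(size_t i=0;i<records.size();i++){
uint32_t rec_csum = 0; // stand-in for the per-record NERSC checksum
for(char c : records[i]) rec_csum += (unsigned char)c;
f.write(records[i].data(), records[i].size());
checksum ^= rec_csum + 0x9e3779b9 + (checksum<<6) + (checksum>>2); // same order-dependent fold
}
writeHeader(f, (uint32_t)records.size(), checksum); // pass 2: patch in the real checksum
return 0;
}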

View File

@ -95,26 +95,34 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl;
MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
double t1,t2,flops;
double MdagMsiteflops = 1452; // Mobius (real coeffs)
// CG overhead: 8 inner product, 4+8 axpy_norm, 4+4 linear comb (2 of)
double CGsiteflops = (8+4+8+4+4)*Nc*Ns ;
std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops<<std::endl;
std:: cout << " CG site flops = "<< CGsiteflops <<std::endl;
int iters;
for(int i=0;i<100;i++){
for(int i=0;i<200;i++){
result_o = Zero();
t1=usecond();
mCG(src_o,result_o);
t2=usecond();
iters = mCG.TotalInnerIterations; //Number of inner CG iterations
flops = 1320.0*2*FGrid->gSites()*iters;
flops = MdagMsiteflops*4*FrbGrid->gSites()*iters;
flops+= CGsiteflops*FrbGrid->gSites()*iters;
std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
std::cout << " SinglePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
}
std::cout << GridLogMessage << "::::::::::::: Starting regular CG" << std::endl;
ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
for(int i=0;i<100;i++){
for(int i=0;i<1;i++){
result_o_2 = Zero();
t1=usecond();
CG(HermOpEO,src_o,result_o_2);
t2=usecond();
iters = CG.IterationsToComplete;
flops = 1320.0*2*FGrid->gSites()*iters;
flops = MdagMsiteflops*4*FrbGrid->gSites()*iters;
flops+= CGsiteflops*FrbGrid->gSites()*iters;
std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
std::cout << " DoublePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
}
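For reference, the flop-rate bookkeeping in the hunk above reduces to the arithmetic below. The 1452 Mobius MdagM site-flop count, the CG linear-algebra count and the factor of 4 are taken from the test; the volume, iteration count and timing are placeholder example values only.
#include <cstdio>
int main(void){
const double MdagMsiteflops = 1452.0; // Mobius MdagM, real coefficients (as in the test)
const double Nc = 3.0, Ns = 4.0;
const double CGsiteflops = (8+4+8+4+4)*Nc*Ns; // CG linear algebra per site per iteration
const double sites = 12.0*(16.0*16*16*32)/2.0; // example: Ls=12, 16^3 x 32, red-black half volume
const int iters = 500; // example iteration count
const double seconds = 10.0; // example wall-clock time
double flops = MdagMsiteflops*4.0*sites*iters // factor 4, as printed by the test
+ CGsiteflops*sites*iters;
std::printf("GF/s = %f\n", flops/seconds/1.0e9);
return 0;
}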

View File

@ -1,485 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_compressed_lanczos_gparity.cc
Copyright (C) 2017
Author: Christopher Kelly <ckelly@bnl.gov>
Author: Leans heavily on Christoph Lehner's code
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
/*
* Reimplement the badly named "multigrid" lanczos as compressed Lanczos using the features
* in Grid that were intended to be used to support blocked Aggregates, from
*/
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/iterative/LocalCoherenceLanczos.h>
using namespace std;
using namespace Grid;
//For the CPS configurations we have to manually seed the RNG and deal with an incorrect factor of 2 in the plaquette metadata
void readConfiguration(LatticeGaugeFieldD &U,
const std::string &config,
bool is_cps_cfg = false){
if(is_cps_cfg) NerscIO::exitOnReadPlaquetteMismatch() = false;
typedef GaugeStatistics<ConjugateGimplD> GaugeStats;
FieldMetaData header;
NerscIO::readConfiguration<GaugeStats>(U, header, config);
if(is_cps_cfg) NerscIO::exitOnReadPlaquetteMismatch() = true;
}
//Lanczos parameters in CPS conventions
struct CPSLanczosParams : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(CPSLanczosParams,
RealD, alpha,
RealD, beta,
int, ch_ord,
int, N_use,
int, N_get,
int, N_true_get,
RealD, stop_rsd,
int, maxits);
//Translations
ChebyParams getChebyParams() const{
ChebyParams out;
out.alpha = beta*beta; //aka lo
out.beta = alpha*alpha; //aka hi
out.Npoly = ch_ord+1;
return out;
}
int Nstop() const{ return N_true_get; }
int Nm() const{ return N_use; }
int Nk() const{ return N_get; }
};
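// Worked example of the translation above, using the fine-Lanczos defaults set later in
// this file (alpha = 2, beta = 0.1, ch_ord = 100): lo = beta*beta = 0.01,
// hi = alpha*alpha = 4.0, Npoly = ch_ord+1 = 101, i.e. the Chebyshev filter acts on the
// spectral window [0.01, 4.0].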
//Maybe this class should be in the main library?
template<class Fobj,class CComplex,int nbasis>
class LocalCoherenceLanczosScidac : public LocalCoherenceLanczos<Fobj,CComplex,nbasis>
{
public:
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj> FineField;
LocalCoherenceLanczosScidac(GridBase *FineGrid,GridBase *CoarseGrid,
LinearOperatorBase<FineField> &FineOp,
int checkerboard)
// Base constructor
: LocalCoherenceLanczos<Fobj,CComplex,nbasis>(FineGrid,CoarseGrid,FineOp,checkerboard)
{};
void checkpointFine(std::string evecs_file,std::string evals_file)
{
assert(this->subspace.size()==nbasis);
emptyUserRecord record;
Grid::ScidacWriter WR(this->_FineGrid->IsBoss());
WR.open(evecs_file);
for(int k=0;k<nbasis;k++) {
WR.writeScidacFieldRecord(this->subspace[k],record);
}
WR.close();
XmlWriter WRx(evals_file);
write(WRx,"evals",this->evals_fine);
}
void checkpointFineRestore(std::string evecs_file,std::string evals_file)
{
this->evals_fine.resize(nbasis);
this->subspace.resize(nbasis,this->_FineGrid);
std::cout << GridLogIRL<< "checkpointFineRestore: Reading evals from "<<evals_file<<std::endl;
XmlReader RDx(evals_file);
read(RDx,"evals",this->evals_fine);
if(this->evals_fine.size() < nbasis) assert(0 && "Not enough fine evals to complete basis");
if(this->evals_fine.size() > nbasis){ //allow the use of precomputed evecs with a larger #evecs
std::cout << GridLogMessage << "Truncating " << this->evals_fine.size() << " evals to basis size " << nbasis << std::endl;
this->evals_fine.resize(nbasis);
}
std::cout << GridLogIRL<< "checkpointFineRestore: Reading evecs from "<<evecs_file<<std::endl;
emptyUserRecord record;
Grid::ScidacReader RD ;
RD.open(evecs_file);
for(int k=0;k<nbasis;k++) {
this->subspace[k].Checkerboard()=this->_checkerboard;
RD.readScidacFieldRecord(this->subspace[k],record);
}
RD.close();
}
void checkpointCoarse(std::string evecs_file,std::string evals_file)
{
int n = this->evec_coarse.size();
emptyUserRecord record;
Grid::ScidacWriter WR(this->_CoarseGrid->IsBoss());
WR.open(evecs_file);
for(int k=0;k<n;k++) {
WR.writeScidacFieldRecord(this->evec_coarse[k],record);
}
WR.close();
XmlWriter WRx(evals_file);
write(WRx,"evals",this->evals_coarse);
}
void checkpointCoarseRestore(std::string evecs_file,std::string evals_file,int nvec)
{
std::cout << "resizing coarse vecs to " << nvec<< std::endl;
this->evals_coarse.resize(nvec);
this->evec_coarse.resize(nvec,this->_CoarseGrid);
std::cout << GridLogIRL<< "checkpointCoarseRestore: Reading evals from "<<evals_file<<std::endl;
XmlReader RDx(evals_file);
read(RDx,"evals",this->evals_coarse);
assert(this->evals_coarse.size()==nvec);
emptyUserRecord record;
std::cout << GridLogIRL<< "checkpointCoarseRestore: Reading evecs from "<<evecs_file<<std::endl;
Grid::ScidacReader RD ;
RD.open(evecs_file);
for(int k=0;k<nvec;k++) {
RD.readScidacFieldRecord(this->evec_coarse[k],record);
}
RD.close();
}
};
struct Options{
std::vector<int> blockSize;
std::vector<int> GparityDirs;
int Ls;
RealD mass;
RealD M5;
RealD mobius_scale;
std::string config;
bool is_cps_cfg;
double coarse_relax_tol;
int smoother_ord;
CPSLanczosParams fine;
CPSLanczosParams coarse;
bool write_fine = false;
std::string write_fine_file;
bool read_fine = false;
std::string read_fine_file;
bool write_coarse = false;
std::string write_coarse_file;
bool read_coarse = false;
std::string read_coarse_file;
Options(){
blockSize = std::vector<int> ({2,2,2,2,2});
GparityDirs = std::vector<int> ({1,1,1}); //1 for each GP direction
Ls = 12;
mass = 0.01;
M5 = 1.8;
is_cps_cfg = false;
mobius_scale = 2.0;
fine.alpha = 2;
fine.beta = 0.1;
fine.ch_ord = 100;
fine.N_use = 70;
fine.N_get = 60;
fine.N_true_get = 60;
fine.stop_rsd = 1e-8;
fine.maxits = 10000;
coarse.alpha = 2;
coarse.beta = 0.1;
coarse.ch_ord = 100;
coarse.N_use = 200;
coarse.N_get = 190;
coarse.N_true_get = 190;
coarse.stop_rsd = 1e-8;
coarse.maxits = 10000;
coarse_relax_tol = 1e5;
smoother_ord = 20;
write_fine = false;
read_fine = false;
write_coarse = false;
read_coarse = false;
}
};
template<int nbasis>
void runTest(const Options &opt){
//Fine grids
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(opt.Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(opt.Ls,UGrid);
//Setup G-parity BCs
assert(Nd == 4);
std::vector<int> dirs4(4);
for(int i=0;i<3;i++) dirs4[i] = opt.GparityDirs[i];
dirs4[3] = 0; //periodic gauge BC in time
std::cout << GridLogMessage << "Gauge BCs: " << dirs4 << std::endl;
ConjugateGimplD::setDirections(dirs4); //gauge BC
GparityWilsonImplD::ImplParams Params;
for(int i=0;i<Nd-1;i++) Params.twists[i] = opt.GparityDirs[i]; //G-parity directions
Params.twists[Nd-1] = 1; //APBC in time direction
std::cout << GridLogMessage << "Fermion BCs: " << Params.twists << std::endl;
//Read the gauge field
LatticeGaugeField Umu(UGrid);
readConfiguration(Umu, opt.config, opt.is_cps_cfg);
//Setup the coarse grids
auto fineLatt = GridDefaultLatt();
Coordinate coarseLatt(4);
for (int d=0;d<4;d++){
coarseLatt[d] = fineLatt[d]/opt.blockSize[d]; assert(coarseLatt[d]*opt.blockSize[d]==fineLatt[d]);
}
std::cout << GridLogMessage<< " 5d coarse lattice is ";
for (int i=0;i<4;i++){
std::cout << coarseLatt[i]<<"x";
}
int cLs = opt.Ls/opt.blockSize[4]; assert(cLs*opt.blockSize[4]==opt.Ls);
std::cout << cLs<<std::endl;
GridCartesian * CoarseGrid4 = SpaceTimeGrid::makeFourDimGrid(coarseLatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * CoarseGrid4rb = SpaceTimeGrid::makeFourDimRedBlackGrid(CoarseGrid4);
GridCartesian * CoarseGrid5 = SpaceTimeGrid::makeFiveDimGrid(cLs,CoarseGrid4);
//Dirac operator
double bmc = 1.;
double b = (opt.mobius_scale + bmc)/2.; // b = 1/2 [ (b+c) + (b-c) ]
double c = (opt.mobius_scale - bmc)/2.; // c = 1/2 [ (b+c) - (b-c) ]
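// e.g. the default mobius_scale = 2.0 together with b-c = 1 gives b = 1.5 and c = 0.5.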
GparityMobiusFermionD action(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, opt.mass, opt.M5, b,c,Params);
typedef GparityMobiusFermionD::FermionField FermionField;
SchurDiagTwoOperator<GparityMobiusFermionD, FermionField> SchurOp(action);
typedef GparityWilsonImplD::SiteSpinor SiteSpinor;
const CPSLanczosParams &fine = opt.fine;
const CPSLanczosParams &coarse = opt.coarse;
std::cout << GridLogMessage << "Keep " << fine.N_true_get << " fine vectors" << std::endl;
std::cout << GridLogMessage << "Keep " << coarse.N_true_get << " coarse vectors" << std::endl;
assert(coarse.N_true_get >= fine.N_true_get);
assert(nbasis<=fine.N_true_get);
LocalCoherenceLanczosScidac<SiteSpinor,vTComplex,nbasis> _LocalCoherenceLanczos(FrbGrid,CoarseGrid5,SchurOp,Odd);
std::cout << GridLogMessage << "Constructed LocalCoherenceLanczos" << std::endl;
//Compute and/or read fine evecs
if(opt.read_fine){
_LocalCoherenceLanczos.checkpointFineRestore(opt.read_fine_file + "_evecs.scidac", opt.read_fine_file + "_evals.xml");
}else{
std::cout << GridLogMessage << "Performing fine grid IRL" << std::endl;
std::cout << GridLogMessage << "Using Chebyshev alpha=" << fine.alpha << " beta=" << fine.beta << " ord=" << fine.ch_ord << std::endl;
_LocalCoherenceLanczos.calcFine(fine.getChebyParams(),
fine.Nstop(),fine.Nk(),fine.Nm(),
fine.stop_rsd,fine.maxits,0,0);
if(opt.write_fine){
std::cout << GridLogIRL<<"Checkpointing Fine evecs"<<std::endl;
_LocalCoherenceLanczos.checkpointFine(opt.write_fine_file + "_evecs.scidac", opt.write_fine_file + "_evals.xml");
}
}
//Block orthonormalise (this should be part of calcFine?)
std::cout << GridLogIRL<<"Orthogonalising"<<std::endl;
_LocalCoherenceLanczos.Orthogonalise();
std::cout << GridLogIRL<<"Orthogonaled"<<std::endl;
ChebyParams smoother = fine.getChebyParams();
smoother.Npoly = opt.smoother_ord+1;
if(opt.read_coarse){
_LocalCoherenceLanczos.checkpointCoarseRestore(opt.read_coarse_file + "_evecs.scidac", opt.read_coarse_file + "_evals.xml",coarse.Nstop());
}else{
std::cout << GridLogMessage << "Performing coarse grid IRL" << std::endl;
std::cout << GridLogMessage << "Using Chebyshev alpha=" << coarse.alpha << " beta=" << coarse.beta << " ord=" << coarse.ch_ord << std::endl;
_LocalCoherenceLanczos.calcCoarse(coarse.getChebyParams(), smoother, opt.coarse_relax_tol,
coarse.Nstop(), coarse.Nk() ,coarse.Nm(),
coarse.stop_rsd, coarse.maxits,
0,0);
if(opt.write_coarse){
std::cout << GridLogIRL<<"Checkpointing Coarse evecs"<<std::endl;
_LocalCoherenceLanczos.checkpointCoarse(opt.write_coarse_file + "_evecs.scidac", opt.write_coarse_file + "_evals.xml");
}
}
//Test the eigenvectors
//To remove high-frequency noise we apply a Chebyshev smoothing
Chebyshev<FermionField> cheb_smoother(smoother);
FermionField evec(FrbGrid);
FermionField evec_sm(FrbGrid); //smoothed
FermionField tmp(FrbGrid);
RealD eval;
for(int i=0;i<coarse.N_true_get;i++){
_LocalCoherenceLanczos.getFineEvecEval(evec, eval, i);
//Check unsmoothed evec
SchurOp.HermOp(evec, tmp);
tmp = tmp - eval*evec;
RealD norm_unsmoothed = sqrt(norm2(tmp));
//Check smoothed evec
cheb_smoother(SchurOp, evec, evec_sm);
SchurOp.HermOp(evec_sm, tmp);
tmp = tmp - eval*evec_sm;
RealD norm_smoothed = sqrt(norm2(tmp));
std::cout << GridLogMessage << "Eval " << eval << " unsmoothed resid " << norm_unsmoothed << " smoothed resid " << norm_smoothed << std::endl;
}
}
//Note: because we rely upon physical properties we must use a "real" gauge configuration
int main (int argc, char ** argv) {
Grid_init(&argc,&argv);
GridLogIRL.TimingMode(1);
Options opt;
int basis_size = 100;
if(argc < 3){
std::cout << GridLogMessage << "Usage: <exe> <config> <gparity dirs> <options>" << std::endl;
std::cout << GridLogMessage << "<gparity dirs> should have the format a.b.c where a,b,c are 0,1 depending on whether there are G-parity BCs in that direction" << std::endl;
std::cout << GridLogMessage << "Options:" << std::endl;
std::cout << GridLogMessage << "--Ls <value> : Set Ls (default 12)" << std::endl;
std::cout << GridLogMessage << "--mass <value> : Set the mass (default 0.01)" << std::endl;
std::cout << GridLogMessage << "--block <value> : Set the block size. Format should be a.b.c.d.e where a-e are the block extents (default 2.2.2.2.2)" << std::endl;
std::cout << GridLogMessage << "--is_cps_cfg : Indicate that the configuration was generated with CPS where until recently the stored plaquette was wrong by a factor of 2" << std::endl;
std::cout << GridLogMessage << "--write_irl_templ: Write a template for the parameters file of the Lanczos to \"irl_templ.xml\"" << std::endl;
std::cout << GridLogMessage << "--read_irl_fine <filename>: Real the parameters file for the fine Lanczos" << std::endl;
std::cout << GridLogMessage << "--read_irl_coarse <filename>: Real the parameters file for the coarse Lanczos" << std::endl;
std::cout << GridLogMessage << "--write_fine <filename stub>: Write fine evecs/evals to filename starting with the stub" << std::endl;
std::cout << GridLogMessage << "--read_fine <filename stub>: Read fine evecs/evals from filename starting with the stub" << std::endl;
std::cout << GridLogMessage << "--write_coarse <filename stub>: Write coarse evecs/evals to filename starting with the stub" << std::endl;
std::cout << GridLogMessage << "--read_coarse <filename stub>: Read coarse evecs/evals from filename starting with the stub" << std::endl;
std::cout << GridLogMessage << "--smoother_ord : Set the Chebyshev order of the smoother (default 20)" << std::endl;
std::cout << GridLogMessage << "--coarse_relax_tol : Set the relaxation parameter for evaluating the residual of the reconstructed eigenvectors outside of the basis (default 1e5)" << std::endl;
std::cout << GridLogMessage << "--basis_size : Select the basis size from 100,200,300,350 (default 100)" << std::endl;
Grid_finalize();
return 1;
}
opt.config = argv[1];
GridCmdOptionIntVector(argv[2], opt.GparityDirs);
assert(opt.GparityDirs.size() == 3);
for(int i=3;i<argc;i++){
std::string sarg = argv[i];
if(sarg == "--Ls"){
opt.Ls = std::stoi(argv[i+1]);
std::cout << GridLogMessage << "Set Ls to " << opt.Ls << std::endl;
}else if(sarg == "--mass"){
std::istringstream ss(argv[i+1]); ss >> opt.mass;
std::cout << GridLogMessage << "Set quark mass to " << opt.mass << std::endl;
}else if(sarg == "--block"){
GridCmdOptionIntVector(argv[i+1], opt.blockSize);
assert(opt.blockSize.size() == 5);
std::cout << GridLogMessage << "Set block size to ";
for(int q=0;q<5;q++) std::cout << opt.blockSize[q] << " ";
std::cout << std::endl;
}else if(sarg == "--is_cps_cfg"){
opt.is_cps_cfg = true;
}else if(sarg == "--write_irl_templ"){
XmlWriter writer("irl_templ.xml");
write(writer,"Params", opt.fine);
Grid_finalize();
return 0;
}else if(sarg == "--read_irl_fine"){
std::cout << GridLogMessage << "Reading fine IRL params from " << argv[i+1] << std::endl;
XmlReader reader(argv[i+1]);
read(reader, "Params", opt.fine);
}else if(sarg == "--read_irl_coarse"){
std::cout << GridLogMessage << "Reading coarse IRL params from " << argv[i+1] << std::endl;
XmlReader reader(argv[i+1]);
read(reader, "Params", opt.coarse);
}else if(sarg == "--write_fine"){
opt.write_fine = true;
opt.write_fine_file = argv[i+1];
}else if(sarg == "--read_fine"){
opt.read_fine = true;
opt.read_fine_file = argv[i+1];
}else if(sarg == "--write_coarse"){
opt.write_coarse = true;
opt.write_coarse_file = argv[i+1];
}else if(sarg == "--read_coarse"){
opt.read_coarse = true;
opt.read_coarse_file = argv[i+1];
}else if(sarg == "--smoother_ord"){
std::istringstream ss(argv[i+1]); ss >> opt.smoother_ord;
std::cout << GridLogMessage << "Set smoother order to " << opt.smoother_ord << std::endl;
}else if(sarg == "--coarse_relax_tol"){
std::istringstream ss(argv[i+1]); ss >> opt.coarse_relax_tol;
std::cout << GridLogMessage << "Set coarse IRL relaxation parameter to " << opt.coarse_relax_tol << std::endl;
}else if(sarg == "--mobius_scale"){
std::istringstream ss(argv[i+1]); ss >> opt.mobius_scale;
std::cout << GridLogMessage << "Set Mobius scale to " << opt.mobius_scale << std::endl;
}else if(sarg == "--basis_size"){
basis_size = std::stoi(argv[i+1]);
std::cout << GridLogMessage << "Set basis size to " << basis_size << std::endl;
}
}
switch(basis_size){
case 100:
runTest<100>(opt); break;
case 200:
runTest<200>(opt); break;
case 300:
runTest<300>(opt); break;
case 350:
runTest<350>(opt); break;
default:
std::cout << GridLogMessage << "Unsupported basis size " << basis_size << std::endl;
assert(0);
}
Grid_finalize();
}

View File

@ -1,582 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_evec_compression.cc
Copyright (C) 2017
Author: Christopher Kelly <ckelly@bnl.gov>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
/*
*
* This test generates eigenvectors using the Lanczos algorithm then attempts to use local coherence compression
* to express those vectors in terms of a basis formed from a subset. This test is useful for finding the optimal
* blocking and basis size for performing a Local Coherence Lanczos
*/
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/iterative/LocalCoherenceLanczos.h>
using namespace std;
using namespace Grid;
//For the CPS configurations we have to manually seed the RNG and deal with an incorrect factor of 2 in the plaquette metadata
template<typename Gimpl>
void readConfiguration(LatticeGaugeFieldD &U,
const std::string &config,
bool is_cps_cfg = false){
if(is_cps_cfg) NerscIO::exitOnReadPlaquetteMismatch() = false;
typedef GaugeStatistics<Gimpl> GaugeStats;
FieldMetaData header;
NerscIO::readConfiguration<GaugeStats>(U, header, config);
if(is_cps_cfg) NerscIO::exitOnReadPlaquetteMismatch() = true;
}
//Lanczos parameters in CPS conventions
struct CPSLanczosParams : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(CPSLanczosParams,
RealD, alpha,
RealD, beta,
int, ch_ord,
int, N_use,
int, N_get,
int, N_true_get,
RealD, stop_rsd,
int, maxits);
//Translations
ChebyParams getChebyParams() const{
ChebyParams out;
out.alpha = beta*beta; //aka lo
out.beta = alpha*alpha; //aka hi
out.Npoly = ch_ord+1;
return out;
}
int Nstop() const{ return N_true_get; }
int Nm() const{ return N_use; }
int Nk() const{ return N_get; }
};
template<class Fobj,class CComplex,int nbasis>
class LocalCoherenceCompressor{
public:
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<Fobj> FineField;
void compress(std::vector<FineField> &basis,
std::vector<CoarseField> &compressed_evecs,
const std::vector<FineField> &evecs_in,
GridBase *FineGrid,
GridBase *CoarseGrid){
int nevecs = evecs_in.size();
assert(nevecs > nbasis);
//Construct the basis
basis.resize(nbasis, FineGrid);
for(int b=0;b<nbasis;b++) basis[b] = evecs_in[b];
//Block orthonormalise basis
CoarseScalar InnerProd(CoarseGrid);
std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,basis);
std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
blockOrthogonalise(InnerProd,basis);
//The coarse grid representation is the field of vectors of block inner products
std::cout << GridLogMessage << "Compressing eigevectors" << std::endl;
compressed_evecs.resize(nevecs, CoarseGrid);
for(int i=0;i<nevecs;i++) blockProject(compressed_evecs[i], evecs_in[i], basis);
std::cout << GridLogMessage << "Compression complete" << std::endl;
}
void uncompress(FineField &evec, const int i, const std::vector<FineField> &basis, const std::vector<CoarseField> &compressed_evecs) const{
blockPromote(compressed_evecs[i],evec,basis);
}
//Test uncompressed eigenvectors of Linop.HermOp to precision 'base_tolerance' for i<nbasis and 'base_tolerance*relax' for i>=nbasis
//Because the uncompressed evec has a lot of high mode noise (unimportant for deflation) we apply a smoother before testing.
//The Chebyshev used by the Lanczos should be sufficient as a smoother
bool testCompression(LinearOperatorBase<FineField> &Linop, OperatorFunction<FineField> &smoother,
const std::vector<FineField> &basis, const std::vector<CoarseField> &compressed_evecs, const std::vector<RealD> &evals,
const RealD base_tolerance, const RealD relax){
std::cout << GridLogMessage << "Testing quality of uncompressed evecs (after smoothing)" << std::endl;
GridBase* FineGrid = basis[0].Grid();
GridBase* CoarseGrid = compressed_evecs[0].Grid();
bool fail = false;
FineField evec(FineGrid), Mevec(FineGrid), evec_sm(FineGrid);
for(int i=0;i<compressed_evecs.size();i++){
std::cout << GridLogMessage << "Uncompressing evec " << i << std::endl;
uncompress(evec, i, basis, compressed_evecs);
std::cout << GridLogMessage << "Smoothing evec " << i << std::endl;
smoother(Linop, evec, evec_sm);
std::cout << GridLogMessage << "Computing residual for evec " << i << std::endl;
std::cout << GridLogMessage << "Linop" << std::endl;
Linop.HermOp(evec_sm, Mevec);
std::cout << GridLogMessage << "Linalg" << std::endl;
Mevec = Mevec - evals[i]*evec_sm;
std::cout << GridLogMessage << "Resid" << std::endl;
RealD tol = base_tolerance * (i<nbasis ? 1. : relax);
RealD res = sqrt(norm2(Mevec));
std::cout << GridLogMessage << "Evec idx " << i << " res " << res << " tol " << tol << std::endl;
if(res > tol) fail = true;
}
return !fail; //true only if every reconstructed eigenvector met its tolerance
}
//Compare uncompressed evecs to original evecs
void compareEvecs(const std::vector<FineField> &basis, const std::vector<CoarseField> &compressed_evecs, const std::vector<FineField> &orig_evecs){
std::cout << GridLogMessage << "Comparing uncompressed evecs to original evecs" << std::endl;
GridBase* FineGrid = basis[0].Grid();
GridBase* CoarseGrid = compressed_evecs[0].Grid();
FineField evec(FineGrid), diff(FineGrid);
for(int i=0;i<compressed_evecs.size();i++){
std::cout << GridLogMessage << "Uncompressing evec " << i << std::endl;
uncompress(evec, i, basis, compressed_evecs);
diff = orig_evecs[i] - evec;
RealD res = sqrt(norm2(diff));
std::cout << GridLogMessage << "Evec idx " << i << " res " << res << std::endl;
}
}
};
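//Time blockPromote from a cold start: taking CpuWrite views of every field is intended (on GPU builds) to mark the
//device copies stale, so the subsequent blockPromote must move the data back to the device before computing.
//The second, identical pass checks that the flush really does reproduce a cold-start timing.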
template<class Fobj,class CComplex,int nbasis>
void compareBlockPromoteTimings(const std::vector<Lattice<Fobj> > &basis, const std::vector<Lattice<iVector<CComplex,nbasis > > > &compressed_evecs){
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CComplex> CoarseScalar;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<Fobj> FineField;
GridStopWatch timer;
GridBase* FineGrid = basis[0].Grid();
GridBase* CoarseGrid = compressed_evecs[0].Grid();
FineField v1(FineGrid), v2(FineGrid);
//Force a cold start: take host (CpuWrite) views of all fields so that any device copies are marked stale
for(int i=0;i<basis.size();i++){
autoView( b_ , basis[i], CpuWrite);
}
for(int i=0;i<compressed_evecs.size();i++){
autoView( b_ , compressed_evecs[i], CpuWrite);
}
{
autoView( b_, v1, CpuWrite );
}
timer.Start();
blockPromote(compressed_evecs[0],v1,basis);
timer.Stop();
std::cout << GridLogMessage << "Time for cold blockPromote v1 " << timer.Elapsed() << std::endl;
//Test to ensure it is actually doing a cold start by repeating
for(int i=0;i<basis.size();i++){
autoView( b_ , basis[i], CpuWrite);
}
for(int i=0;i<compressed_evecs.size();i++){
autoView( b_ , compressed_evecs[i], CpuWrite);
}
{
autoView( b_, v1, CpuWrite );
}
timer.Reset();
timer.Start();
blockPromote(compressed_evecs[0],v1,basis);
timer.Stop();
std::cout << GridLogMessage << "Time for cold blockPromote v1 repeat (should be the same as above) " << timer.Elapsed() << std::endl;
}
struct Args{
int Ls;
RealD mass;
RealD M5;
bool is_cps_cfg;
RealD mobius_scale; //b+c
CPSLanczosParams fine;
double coarse_relax_tol;
std::vector<int> blockSize;
std::vector<int> GparityDirs;
bool write_fine;
std::string write_fine_file;
bool read_fine;
std::string read_fine_file;
int basis_size;
Args(){
blockSize = {2,2,2,2,2};
GparityDirs = {1,1,1}; //1 for each GP direction
Ls = 12;
mass = 0.01;
M5 = 1.8;
is_cps_cfg = false;
mobius_scale = 2;
fine.alpha = 2;
fine.beta = 0.1;
fine.ch_ord = 100;
fine.N_use = 70;
fine.N_get = 60;
fine.N_true_get = 60;
fine.stop_rsd = 1e-8;
fine.maxits = 10000;
coarse_relax_tol = 1e5;
write_fine = false;
read_fine = false;
basis_size = 100;
}
};
GparityWilsonImplD::ImplParams setupGparityParams(const std::vector<int> &GparityDirs){
//Setup G-parity BCs
assert(Nd == 4);
std::vector<int> dirs4(4);
for(int i=0;i<3;i++) dirs4[i] = GparityDirs[i];
dirs4[3] = 0; //periodic gauge BC in time
std::cout << GridLogMessage << "Gauge BCs: " << dirs4 << std::endl;
ConjugateGimplD::setDirections(dirs4); //gauge BC
GparityWilsonImplD::ImplParams Params;
for(int i=0;i<Nd-1;i++) Params.twists[i] = GparityDirs[i]; //G-parity directions
Params.twists[Nd-1] = 1; //APBC in time direction
std::cout << GridLogMessage << "Fermion BCs: " << Params.twists << std::endl;
return Params;
}
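//Standard (non-G-parity) fermion BCs: periodic in the spatial directions, antiperiodic in time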
WilsonImplD::ImplParams setupParams(){
WilsonImplD::ImplParams Params;
Complex one(1.0);
Complex mone(-1.0);
for(int i=0;i<Nd-1;i++) Params.boundary_phases[i] = one;
Params.boundary_phases[Nd-1] = mone;
return Params;
}
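//run_b: build the coarse grids for the chosen blocking, obtain the fine eigenvectors (either by running the fine
//Lanczos or by reading them from disk), compress them onto an nbasis-vector local coherence basis, and test the
//quality of the reconstructed eigenvectors against the smoothed fine operator.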
template<int nbasis, typename ActionType>
void run_b(ActionType &action, const std::string &config, const Args &args){
//Fine grids
GridCartesian * UGrid = (GridCartesian*)action.GaugeGrid();
GridRedBlackCartesian * UrbGrid = (GridRedBlackCartesian*)action.GaugeRedBlackGrid();
GridCartesian * FGrid = (GridCartesian*)action.FermionGrid();
GridRedBlackCartesian * FrbGrid = (GridRedBlackCartesian*)action.FermionRedBlackGrid();
//Setup the coarse grids
auto fineLatt = GridDefaultLatt();
Coordinate coarseLatt(4);
for (int d=0;d<4;d++){
coarseLatt[d] = fineLatt[d]/args.blockSize[d]; assert(coarseLatt[d]*args.blockSize[d]==fineLatt[d]);
}
std::cout << GridLogMessage<< " 5d coarse lattice is ";
for (int i=0;i<4;i++){
std::cout << coarseLatt[i]<<"x";
}
int cLs = args.Ls/args.blockSize[4]; assert(cLs*args.blockSize[4]==args.Ls);
std::cout << cLs<<std::endl;
GridCartesian * CoarseGrid4 = SpaceTimeGrid::makeFourDimGrid(coarseLatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * CoarseGrid4rb = SpaceTimeGrid::makeFourDimRedBlackGrid(CoarseGrid4);
GridCartesian * CoarseGrid5 = SpaceTimeGrid::makeFiveDimGrid(cLs,CoarseGrid4);
typedef vTComplex CComplex;
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CComplex> CoarseScalar;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef typename ActionType::FermionField FermionField;
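//The Lanczos is run on the red-black preconditioned (Schur diag-two) operator on the odd checkerboard, so the
//eigenvectors live on the five-dimensional red-black grid FrbGrid.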
SchurDiagTwoOperator<ActionType,FermionField> SchurOp(action);
typedef typename ActionType::SiteSpinor SiteSpinor;
const CPSLanczosParams &fine = args.fine;
//Do the fine Lanczos
std::vector<RealD> evals;
std::vector<FermionField> evecs;
if(args.read_fine){
evals.resize(fine.N_true_get);
evecs.resize(fine.N_true_get, FrbGrid);
std::string evals_file = args.read_fine_file + "_evals.xml";
std::string evecs_file = args.read_fine_file + "_evecs.scidac";
std::cout << GridLogIRL<< "Reading evals from "<<evals_file<<std::endl;
XmlReader RDx(evals_file);
read(RDx,"evals",evals);
assert(evals.size()==fine.N_true_get);
std::cout << GridLogIRL<< "Reading evecs from "<<evecs_file<<std::endl;
emptyUserRecord record;
Grid::ScidacReader RD ;
RD.open(evecs_file);
for(int k=0;k<fine.N_true_get;k++) {
evecs[k].Checkerboard()=Odd;
RD.readScidacFieldRecord(evecs[k],record);
}
RD.close();
}else{
int Nstop = fine.Nstop(); //==N_true_get
int Nm = fine.Nm();
int Nk = fine.Nk();
RealD resid = fine.stop_rsd;
int MaxIt = fine.maxits;
assert(nbasis<=Nm);
Chebyshev<FermionField> Cheby(fine.getChebyParams());
FunctionHermOp<FermionField> ChebyOp(Cheby,SchurOp);
PlainHermOp<FermionField> Op(SchurOp);
evals.resize(Nm);
evecs.resize(Nm,FrbGrid);
ImplicitlyRestartedLanczos<FermionField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,0,0);
FermionField src(FrbGrid);
typedef typename FermionField::scalar_type Scalar;
src=Scalar(1.0);
src.Checkerboard() = Odd;
int Nconv;
IRL.calc(evals, evecs,src,Nconv,false);
if(Nconv < Nstop) assert(0 && "Fine lanczos failed to converge the required number of evecs"); //algorithm doesn't consider this a failure
if(Nconv > Nstop){
//Yes this potentially throws away some evecs but it is better than having a random number of evecs between Nstop and Nm!
evals.resize(Nstop);
evecs.resize(Nstop, FrbGrid);
}
if(args.write_fine){
std::string evals_file = args.write_fine_file + "_evals.xml";
std::string evecs_file = args.write_fine_file + "_evecs.scidac";
std::cout << GridLogIRL<< "Writing evecs to "<<evecs_file<<std::endl;
emptyUserRecord record;
Grid::ScidacWriter WR(FrbGrid->IsBoss());
WR.open(evecs_file);
for(int k=0;k<evecs.size();k++) {
WR.writeScidacFieldRecord(evecs[k],record);
}
WR.close();
std::cout << GridLogIRL<< "Writing evals to "<<evals_file<<std::endl;
XmlWriter WRx(evals_file);
write(WRx,"evals",evals);
}
}
//Do the compression
LocalCoherenceCompressor<SiteSpinor,vTComplex,nbasis> compressor;
std::vector<FermionField> basis(nbasis,FrbGrid);
std::vector<CoarseField> compressed_evecs(evecs.size(),CoarseGrid5);
compressor.compress(basis, compressed_evecs, evecs, FrbGrid, CoarseGrid5);
compareBlockPromoteTimings(basis, compressed_evecs);
//Compare uncompressed and original evecs
compressor.compareEvecs(basis, compressed_evecs, evecs);
//Create the smoother
Chebyshev<FermionField> smoother(fine.getChebyParams());
//Test the quality of the uncompressed evecs
assert( compressor.testCompression(SchurOp, smoother, basis, compressed_evecs, evals, fine.stop_rsd, args.coarse_relax_tol) );
}
template<typename ActionType>
void run(ActionType &action, const std::string &config, const Args &args){
switch(args.basis_size){
case 50:
return run_b<50>(action,config,args);
case 100:
return run_b<100>(action,config,args);
case 150:
return run_b<150>(action,config,args);
case 200:
return run_b<200>(action,config,args);
case 250:
return run_b<250>(action,config,args);
case 300:
return run_b<300>(action,config,args);
case 350:
return run_b<350>(action,config,args);
case 400:
return run_b<400>(action,config,args);
default:
assert(0 && "Unsupported basis size: allowed values are 50,100,200,250,300,350,400");
}
}
//Note: because we rely upon physical properties we must use a "real" gauge configuration
int main (int argc, char ** argv) {
Grid_init(&argc,&argv);
GridLogIRL.TimingMode(1);
if(argc < 3){
std::cout << GridLogMessage << "Usage: <exe> <config file> <gparity dirs> <options>" << std::endl;
std::cout << GridLogMessage << "<gparity dirs> should have the format a.b.c where a,b,c are 0,1 depending on whether there are G-parity BCs in that direction" << std::endl;
std::cout << GridLogMessage << "Options:" << std::endl;
std::cout << GridLogMessage << "--Ls <value> : Set Ls (default 12)" << std::endl;
std::cout << GridLogMessage << "--mass <value> : Set the mass (default 0.01)" << std::endl;
std::cout << GridLogMessage << "--block <value> : Set the block size. Format should be a.b.c.d.e where a-e are the block extents (default 2.2.2.2.2)" << std::endl;
std::cout << GridLogMessage << "--is_cps_cfg : Indicate that the configuration was generated with CPS where until recently the stored plaquette was wrong by a factor of 2" << std::endl;
std::cout << GridLogMessage << "--write_irl_templ: Write a template for the parameters file of the Lanczos to \"irl_templ.xml\"" << std::endl;
std::cout << GridLogMessage << "--read_irl_fine <filename>: Real the parameters file for the fine Lanczos" << std::endl;
std::cout << GridLogMessage << "--write_fine <filename stub>: Write fine evecs/evals to filename starting with the stub" << std::endl;
std::cout << GridLogMessage << "--read_fine <filename stub>: Read fine evecs/evals from filename starting with the stub" << std::endl;
std::cout << GridLogMessage << "--coarse_relax_tol : Set the relaxation parameter for evaluating the residual of the reconstructed eigenvectors outside of the basis (default 1e5)" << std::endl;
std::cout << GridLogMessage << "--action : Set the action from 'DWF', 'Mobius' (default Mobius)" << std::endl;
std::cout << GridLogMessage << "--mobius_scale : Set the Mobius scale b+c (default 2)" << std::endl;
std::cout << GridLogMessage << "--basis_size : Set the basis size from 50,100,150,200,250,300,350,400 (default 100)" << std::endl;
Grid_finalize();
return 1;
}
std::string config = argv[1];
Args args;
GridCmdOptionIntVector(argv[2], args.GparityDirs);
assert(args.GparityDirs.size() == 3);
std::string action_s = "Mobius";
for(int i=3;i<argc;i++){
std::string sarg = argv[i];
if(sarg == "--Ls"){
args.Ls = std::stoi(argv[i+1]);
std::cout << GridLogMessage << "Set Ls to " << args.Ls << std::endl;
}else if(sarg == "--mass"){
std::istringstream ss(argv[i+1]); ss >> args.mass;
std::cout << GridLogMessage << "Set quark mass to " << args.mass << std::endl;
}else if(sarg == "--block"){
GridCmdOptionIntVector(argv[i+1], args.blockSize);
assert(args.blockSize.size() == 5);
std::cout << GridLogMessage << "Set block size to ";
for(int q=0;q<5;q++) std::cout << args.blockSize[q] << " ";
std::cout << std::endl;
}else if(sarg == "--is_cps_cfg"){
args.is_cps_cfg = true;
}else if(sarg == "--write_irl_templ"){
XmlWriter writer("irl_templ.xml");
write(writer,"Params",args.fine);
Grid_finalize();
return 0;
}else if(sarg == "--read_irl_fine"){
std::cout << GridLogMessage << "Reading fine IRL params from " << argv[i+1] << std::endl;
XmlReader reader(argv[i+1]);
read(reader, "Params", args.fine);
}else if(sarg == "--write_fine"){
args.write_fine = true;
args.write_fine_file = argv[i+1];
}else if(sarg == "--read_fine"){
args.read_fine = true;
args.read_fine_file = argv[i+1];
}else if(sarg == "--coarse_relax_tol"){
std::istringstream ss(argv[i+1]); ss >> args.coarse_relax_tol;
std::cout << GridLogMessage << "Set coarse IRL relaxation parameter to " << args.coarse_relax_tol << std::endl;
}else if(sarg == "--action"){
action_s = argv[i+1];
std::cout << "Action set to " << action_s << std::endl;
}else if(sarg == "--mobius_scale"){
std::istringstream ss(argv[i+1]); ss >> args.mobius_scale;
std::cout << GridLogMessage << "Set Mobius scale to " << args.mobius_scale << std::endl;
}else if(sarg == "--basis_size"){
args.basis_size = std::stoi(argv[i+1]);
std::cout << GridLogMessage << "Set basis size to " << args.basis_size << std::endl;
}
}
//Fine grids
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(args.Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(args.Ls,UGrid);
LatticeGaugeField Umu(UGrid);
bool is_gparity = false;
for(auto g : args.GparityDirs) if(g) is_gparity = true;
double bmc = 1.; //b-c is fixed to 1
double b = (args.mobius_scale + bmc)/2.; // b = 1/2 [ (b+c) + (b-c) ]
double c = (args.mobius_scale - bmc)/2.; // c = 1/2 [ (b+c) - (b-c) ]
if(is_gparity){
GparityWilsonImplD::ImplParams Params = setupGparityParams(args.GparityDirs);
readConfiguration<ConjugateGimplD>(Umu, config, args.is_cps_cfg); //Read the gauge field
if(action_s == "DWF"){
GparityDomainWallFermionD action(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, args.mass, args.M5, Params);
run(action, config, args);
}else if(action_s == "Mobius"){
GparityMobiusFermionD action(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, args.mass, args.M5, b, c, Params);
run(action, config, args);
}
}else{
WilsonImplD::ImplParams Params = setupParams();
readConfiguration<PeriodicGimplD>(Umu, config, args.is_cps_cfg); //Read the gauge field
if(action_s == "DWF"){
DomainWallFermionD action(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, args.mass, args.M5, Params);
run(action, config, args);
}else if(action_s == "Mobius"){
MobiusFermionD action(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, args.mass, args.M5, b, c, Params);
run(action, config, args);
}
}
Grid_finalize();
}
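/* Example invocation (hypothetical file names and parameter values, shown only to illustrate the option syntax):
 *   ./Test_evec_compression ckpoint_lat.1000 1.1.0 --Ls 12 --mass 0.01 --block 2.2.2.2.2 --basis_size 100 --write_fine fine_evecs
 */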