Warning fixes

Tracing
Merge branch 'feature/dirichlet-gparity' into feature/dirichlet
2025-06-21 17:22:03 +01:00 · 2022-08-31 19:01:14 -04:00 · 2022-08-31 18:31:46 -04:00 · 2022-08-31 18:25:34 -04:00 · 2022-08-31 18:22:50 -04:00 · 2022-08-31 17:35:32 -04:00
52 changed files with 4418 additions and 853 deletions
--- a/Grid/GridCore.h
+++ b/Grid/GridCore.h
@ -44,7 +44,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridStd.h>
 #include <Grid/threads/Pragmas.h>
 #include <Grid/perfmon/Timer.h>
-#include <Grid/perfmon/PerfCount.h>
+#include <Grid/perfmon/Tracing.h>
+//#include <Grid/perfmon/PerfCount.h>
 #include <Grid/util/Util.h>
 #include <Grid/log/Log.h>
 #include <Grid/allocator/Allocator.h>
--- a/Grid/algorithms/iterative/ConjugateGradient.h
+++ b/Grid/algorithms/iterative/ConjugateGradient.h
@ -58,6 +58,7 @@ public:

  void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {

+    GRID_TRACE("ConjugateGradient");
    psi.Checkerboard() = src.Checkerboard();

    conformable(psi, src);
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
@ -84,6 +84,7 @@ public:

  void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi)
  {
+    GRID_TRACE("ConjugateGradientMultiShift");
  
    GridBase *grid = src.Grid();
  
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
@ -127,6 +127,7 @@ public:

  void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
  { 
+    GRID_TRACE("ConjugateGradientMultiShiftMixedPrec");
    GridBase *DoublePrecGrid = src_d.Grid();

    ////////////////////////////////////////////////////////////////////////
--- a/Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h
+++ b/Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h
@ -73,6 +73,7 @@ public:
  }
    
  void operator()(const FieldD &src, FieldD &psi) {
+    GRID_TRACE("ConjugateGradientReliableUpdate");
    LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
    bool using_fallback = false;
      
--- a/Grid/algorithms/iterative/LocalCoherenceLanczos.h
+++ b/Grid/algorithms/iterative/LocalCoherenceLanczos.h
@ -146,14 +146,21 @@ public:
  LinearOperatorBase<FineField> &_Linop;
  RealD                             _coarse_relax_tol;
  std::vector<FineField>        &_subspace;
+
+  int _largestEvalIdxForReport; //The convergence of the LCL is based on the evals of the coarse grid operator, not those of the underlying fine grid operator
+                                //As a result we do not know what the eval range of the fine operator is until the very end, making tuning the Cheby bounds very difficult
+                                //To work around this issue, every restart we separately reconstruct the fine operator eval for the lowest and highest evec and print these
+                                //out alongside the evals of the coarse operator. To do so we need to know the index of the largest eval (i.e. Nstop-1)
+                                //NOTE: If largestEvalIdxForReport=-1 (default) then this is not performed
  
  ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField>   &Poly,
 					   OperatorFunction<FineField>   &smoother,
 					   LinearOperatorBase<FineField> &Linop,
 					   std::vector<FineField>        &subspace,
-					   RealD coarse_relax_tol=5.0e3) 
+					   RealD coarse_relax_tol=5.0e3,
+					   int largestEvalIdxForReport=-1) 
    : _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace),
-      _coarse_relax_tol(coarse_relax_tol)  
+      _coarse_relax_tol(coarse_relax_tol), _largestEvalIdxForReport(largestEvalIdxForReport)
  {    };

  //evalMaxApprox: approximation of largest eval of the fine Chebyshev operator (suitably wrapped by block projection)
@ -179,6 +186,12 @@ public:
 	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
 	     <<std::endl;

+    if(_largestEvalIdxForReport != -1 && (j==0 || j==_largestEvalIdxForReport)){
+      std::cout<<GridLogIRL << "Estimating true eval of fine grid operator for eval idx " << j << std::endl;
+      RealD tmp_eval;
+      ReconstructEval(j,eresid,B,tmp_eval,1.0); //don't use evalMaxApprox of coarse operator! (cf below)
+    }
+    
    int conv=0;
    if( (vv<eresid*eresid) ) conv = 1;
    return conv;
@ -409,7 +422,7 @@ public:
    //////////////////////////////////////////////////////////////////////////////////////////////////

    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth); //lower order Chebyshev of fine operator on fine grid used to smooth regenerated eigenvectors
-    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax); 
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax,Nstop-1); 

    evals_coarse.resize(Nm);
    evec_coarse.resize(Nm,_CoarseGrid);
--- a/Grid/allocator/MemoryManagerCache.cc
+++ b/Grid/allocator/MemoryManagerCache.cc
@ -110,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
  
-  mprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
+  mprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
@ -118,7 +118,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
    AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
    DeviceBytes   -=AccCache.bytes;
    LRUremove(AccCache);
-    dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
+    dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
  }
  uint64_t CpuPtr = AccCache.CpuPtr;
  EntryErase(CpuPtr);
@ -132,7 +132,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
  
-  mprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
+  mprintf("MemoryManager: Evict(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  if(AccCache.state==AccDirty) {
@ -143,7 +143,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
    AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
    DeviceBytes   -=AccCache.bytes;
    LRUremove(AccCache);
-    dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
+    dprintf("MemoryManager: Free(%lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
  }
  uint64_t CpuPtr = AccCache.CpuPtr;
  EntryErase(CpuPtr);
@ -156,7 +156,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
  assert(AccCache.AccPtr!=(uint64_t)NULL);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
  acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
-  mprintf("MemoryManager: Flush  %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: Flush  %lx -> %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  DeviceToHostBytes+=AccCache.bytes;
  DeviceToHostXfer++;
  AccCache.state=Consistent;
@ -171,7 +171,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
    AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
    DeviceBytes+=AccCache.bytes;
  }
-  mprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: Clone %lx <- %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
  HostToDeviceBytes+=AccCache.bytes;
  HostToDeviceXfer++;
@ -247,7 +247,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
  assert(AccCache.cpuLock==0);  // Programming error

  if(AccCache.state!=Empty) {
-    dprintf("ViewOpen found entry %llx %llx : %lld %lld\n",
+    dprintf("ViewOpen found entry %lx %lx : %ld %ld\n",
 		    (uint64_t)AccCache.CpuPtr,
 		    (uint64_t)CpuPtr,
 		    (uint64_t)AccCache.bytes,
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@ -523,7 +523,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  }
  if ( WorldRank == 0 ){
    std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
-	      << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
+	      << "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
  }
  SharedMemoryZero(ShmCommBuf,bytes);
  std::cout<< "Setting up IPC"<<std::endl;
--- a/Grid/lattice/Lattice_arith.h
+++ b/Grid/lattice/Lattice_arith.h
@ -36,6 +36,7 @@ NAMESPACE_BEGIN(Grid);
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+  GRID_TRACE("mult");
  ret.Checkerboard() = lhs.Checkerboard();
  autoView( ret_v , ret, AcceleratorWrite);
  autoView( lhs_v , lhs, AcceleratorRead);
@ -53,6 +54,7 @@ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  
 template<class obj1,class obj2,class obj3> inline
 void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+  GRID_TRACE("mac");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
@ -70,6 +72,7 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  
 template<class obj1,class obj2,class obj3> inline
 void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+  GRID_TRACE("sub");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
@ -86,6 +89,7 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
 }
 template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+  GRID_TRACE("add");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
@ -106,6 +110,7 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+  GRID_TRACE("mult");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(lhs,ret);
  autoView( ret_v , ret, AcceleratorWrite);
@ -119,6 +124,7 @@ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  
 template<class obj1,class obj2,class obj3> inline
 void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+  GRID_TRACE("mac");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,lhs);
  autoView( ret_v , ret, AcceleratorWrite);
@ -133,6 +139,7 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  
 template<class obj1,class obj2,class obj3> inline
 void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+  GRID_TRACE("sub");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,lhs);
  autoView( ret_v , ret, AcceleratorWrite);
@ -146,6 +153,7 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
 }
 template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+  GRID_TRACE("add");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(lhs,ret);
  autoView( ret_v , ret, AcceleratorWrite);
@ -163,6 +171,7 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+  GRID_TRACE("mult");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@ -177,6 +186,7 @@ void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  
 template<class obj1,class obj2,class obj3> inline
 void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+  GRID_TRACE("mac");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@ -191,6 +201,7 @@ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  
 template<class obj1,class obj2,class obj3> inline
 void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+  GRID_TRACE("sub");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@ -204,6 +215,7 @@ void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
 }
 template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+  GRID_TRACE("add");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@ -218,6 +230,7 @@ void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  
 template<class sobj,class vobj> inline
 void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
+  GRID_TRACE("axpy");
  ret.Checkerboard() = x.Checkerboard();
  conformable(ret,x);
  conformable(x,y);
@ -231,6 +244,7 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
 }
 template<class sobj,class vobj> inline
 void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
+  GRID_TRACE("axpby");
  ret.Checkerboard() = x.Checkerboard();
  conformable(ret,x);
  conformable(x,y);
@ -246,11 +260,13 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
 template<class sobj,class vobj> inline
 RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
 {
+  GRID_TRACE("axpy_norm");
    return axpy_norm_fast(ret,a,x,y);
 }
 template<class sobj,class vobj> inline
 RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
 {
+  GRID_TRACE("axpby_norm");
    return axpby_norm_fast(ret,a,b,x,y);
 }

--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@ -117,6 +117,7 @@ public:
  ////////////////////////////////////////////////////////////////////////////////
  template <typename Op, typename T1> inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
  {
+    GRID_TRACE("ExpressionTemplateEval");
    GridBase *egrid(nullptr);
    GridFromExpression(egrid,expr);
    assert(egrid!=nullptr);
@ -140,6 +141,7 @@ public:
  }
  template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
  {
+    GRID_TRACE("ExpressionTemplateEval");
    GridBase *egrid(nullptr);
    GridFromExpression(egrid,expr);
    assert(egrid!=nullptr);
@ -163,6 +165,7 @@ public:
  }
  template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
  {
+    GRID_TRACE("ExpressionTemplateEval");
    GridBase *egrid(nullptr);
    GridFromExpression(egrid,expr);
    assert(egrid!=nullptr);
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@ -488,6 +488,14 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  int words = fd*sizeof(sobj)/sizeof(scalar_type);
  grid->GlobalSumVector(ptr, words);
 }
+// Value-returning convenience overload: forwards to the three-argument
+// sliceSum(Data, result, orthogdim) and hands back the vector it filled.
+template<class vobj>
+inline std::vector<typename vobj::scalar_object> sliceSum(const Lattice<vobj> &Data, int orthogdim)
+{
+  typedef typename vobj::scalar_object sobj;
+  std::vector<sobj> sums;   // starts empty; filled by the three-argument overload
+  sliceSum(Data, sums, orthogdim);
+  return sums;
+}

 template<class vobj>
 static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) 
--- a/Grid/perfmon/Tracing.h
+++ b/Grid/perfmon/Tracing.h
@ -0,0 +1,66 @@
+#pragma once
+#ifdef GRID_TRACING_NVTX
+#include <nvToolsExt.h>
+// Scoped NVTX range: the constructor pushes a named range and the destructor
+// pops it, so the range spans the lifetime of the object.
+class GridTracer {
+public:
+  GridTracer(const char* name) {
+    nvtxRangePushA(name);
+  }
+  ~GridTracer() {
+    nvtxRangePop();
+  }
+  // Non-copyable: a copy would issue a second pop for a single push.
+  GridTracer(const GridTracer &) = delete;
+  GridTracer &operator=(const GridTracer &) = delete;
+};
+// Free-function tracing hooks for the NVTX backend.
+inline void tracePush(const char *name) { nvtxRangePushA(name); }
+// name is not used here: nvtxRangePop() takes no argument.
+inline void tracePop(const char *name) { nvtxRangePop(); }
+// Start/stop ranges are not implemented for NVTX in this header. Return a
+// dummy ID (0, as the timer and no-op backends do) instead of flowing off the
+// end of a non-void function, which is undefined behaviour.
+inline int  traceStart(const char *name) { return 0; }
+inline void traceStop(int ID) {  }
+#endif
+
+#ifdef GRID_TRACING_ROCTX
+#include <roctracer/roctx.h>
+// Scoped roctx range: the constructor pushes a named range and the destructor
+// pops it. Each push and pop is also echoed to std::cout.
+class GridTracer {
+ public:
+  GridTracer(const char* name) {
+    roctxRangePushA(name);
+    std::cout << "roctxRangePush "<<name<<std::endl;
+  }
+  ~GridTracer() {
+    roctxRangePop();
+    std::cout << "roctxRangePop "<<std::endl;
+  }
+  // Non-copyable: a copy would issue a second pop for a single push.
+  GridTracer(const GridTracer &) = delete;
+  GridTracer &operator=(const GridTracer &) = delete;
+};
+// Free-function tracing hooks for the roctx backend.
+inline void tracePush(const char *name) { roctxRangePushA(name); }
+// name is not used here: roctxRangePop() takes no argument.
+inline void tracePop(const char *name) { roctxRangePop(); }
+// Hand the range ID back to the caller so it can be passed to traceStop().
+// Previously the ID was dropped and the function had no return statement,
+// which is undefined behaviour for a non-void function.
+// NOTE(review): the roctx ID type may be wider than int; the cast keeps the
+// existing int interface - confirm IDs fit.
+inline int  traceStart(const char *name) { return static_cast<int>(roctxRangeStart(name)); }
+inline void traceStop(int ID) { roctxRangeStop(ID); }
+#endif
+
+#ifdef GRID_TRACING_TIMER
+// Scoped wall-clock tracer: samples usecond() at construction and writes the
+// difference to the GridLogTracing stream when the object is destroyed.
+class GridTracer {
+ public:
+  const char *name;   // label printed in the log line; the pointer is stored, not copied
+  double elapsed;     // holds -start until destruction, then end-start
+  GridTracer(const char* _name) : name(_name), elapsed(-usecond()) {}
+  ~GridTracer() {
+    elapsed+=usecond();
+    std::cout << GridLogTracing << name << " took " <<elapsed<< " us" <<std::endl;
+  }
+  // Non-copyable: a copy would log the same region a second time.
+  GridTracer(const GridTracer &) = delete;
+  GridTracer &operator=(const GridTracer &) = delete;
+};
+// Timer backend: the push/pop and start/stop hooks do nothing.
+inline void tracePush(const char * /*name*/) {}
+inline void tracePop(const char * /*name*/) {}
+inline int  traceStart(const char * /*name*/) { return 0; }  // always hands back ID 0
+inline void traceStop(int /*ID*/) {}
+#endif
+
+#ifdef GRID_TRACING_NONE
+// Tracing disabled: GRID_TRACE expands to nothing and the hooks are no-ops.
+#define GRID_TRACE(name) 
+inline void tracePush(const char *name) {  }
+inline void tracePop(const char *name) {  }
+inline int  traceStart(const char *name) { return 0;  }
+inline void traceStop(int ID) {  }
+#else
+// Declare a scoped GridTracer with a variable name unique to each expansion.
+// The two helper macros force __COUNTER__ to be expanded before it is pasted;
+// pasting it directly with ## yields the fixed identifier
+// uniq_name_using_macros__COUNTER__, so two GRID_TRACE uses in one scope clash.
+#define GRID_TRACE_PASTE_(a,b) a##b
+#define GRID_TRACE_PASTE(a,b)  GRID_TRACE_PASTE_(a,b)
+#define GRID_TRACE(name) GridTracer GRID_TRACE_PASTE(uniq_name_using_macros,__COUNTER__)(name);
+#endif
--- a/Grid/qcd/action/ActionBase.h
+++ b/Grid/qcd/action/ActionBase.h
@ -52,10 +52,13 @@ public:
    deriv_us = S_us = refresh_us = 0.0;
    deriv_num=0;
    deriv_norm_sum = deriv_max_sum=0.0;
+    Fdt_max_sum =  Fdt_norm_sum = 0.0;
  }
  void  deriv_log(RealD nrm, RealD max,RealD Fdt_nrm,RealD Fdt_max) {
-    deriv_max_sum+=max; deriv_norm_sum+=nrm;
-    Fdt_max_sum+=Fdt_max; Fdt_norm_sum+=Fdt_nrm; deriv_num++;
+    deriv_max_sum+=max;
+    deriv_norm_sum+=nrm;
+    Fdt_max_sum+=Fdt_max;
+    Fdt_norm_sum+=Fdt_nrm; deriv_num++;
  }
  RealD deriv_max_average(void)       { return deriv_max_sum/deriv_num; };
  RealD deriv_norm_average(void)      { return deriv_norm_sum/deriv_num; };
@ -73,6 +76,7 @@ public:
  // Heatbath?
  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
  virtual RealD S(const GaugeField& U) = 0;                             // evaluate the action
+  // Default forwards to S(U). If an action's refresh already computes the action
+  // value, an override can return that cached result instead.
+  // Alternately refreshAndAction() ?
+  virtual RealD Sinitial(const GaugeField& U)
+  {
+    return this->S(U);
+  }
  virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0;        // evaluate the action derivative
  virtual std::string action_name()    = 0;                             // return the action name
  virtual std::string LogParameters()  = 0;                             // prints action parameters
--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@ -183,16 +183,6 @@ public:
 		  GridRedBlackCartesian &FourDimRedBlackGrid,
 		  RealD _mass,RealD _M5,const ImplParams &p= ImplParams());

-  void CayleyReport(void);
-  void CayleyZeroCounters(void);
-
-  double M5Dflops;
-  double M5Dcalls;
-  double M5Dtime;
-
-  double MooeeInvFlops;
-  double MooeeInvCalls;
-  double MooeeInvTime;

 protected:
  virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
@ -47,18 +47,6 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }

-  ////////////////////////////////////////
-  // Performance monitoring
-  ////////////////////////////////////////
-  void Report(void);
-  void ZeroCounters(void);
-  double DhopTotalTime;
-  double DhopCalls;
-  double DhopCommTime;
-  double DhopComputeTime;
-  double DhopComputeTime2;
-  double DhopFaceTime;
-
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
  ///////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@ -52,18 +52,6 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }

-  ////////////////////////////////////////
-  // Performance monitoring
-  ////////////////////////////////////////
-  void Report(void);
-  void ZeroCounters(void);
-  double DhopTotalTime;
-  double DhopCalls;
-  double DhopCommTime;
-  double DhopComputeTime;
-  double DhopComputeTime2;
-  double DhopFaceTime;
-
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
  ///////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
@ -47,18 +47,6 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }

-  ////////////////////////////////////////
-  // Performance monitoring
-  ////////////////////////////////////////
-  void Report(void);
-  void ZeroCounters(void);
-  double DhopTotalTime;
-  double DhopCalls;
-  double DhopCommTime;
-  double DhopComputeTime;
-  double DhopComputeTime2;
-  double DhopFaceTime;
-
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
  ///////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@ -294,11 +294,7 @@ public:
  typedef typename Base::View_type View_type;
  typedef typename Base::StencilVector StencilVector;

-  void ZeroCountersi(void)  {  }
-  void Reporti(int calls)  {  }
-
  //  Vector<int> surface_list;
-
  WilsonStencil(GridBase *grid,
 		int npoints,
 		int checkerboard,
@ -306,7 +302,6 @@ public:
 		const std::vector<int> &distances,Parameters p)  
    : CartesianStencil<vobj,cobj,Parameters> (grid,npoints,checkerboard,directions,distances,p) 
  { 
-    ZeroCountersi();
    //    surface_list.resize(0);
    this->same_node.resize(npoints);
  };
--- a/Grid/qcd/action/fermion/WilsonFermion.h
+++ b/Grid/qcd/action/fermion/WilsonFermion.h
@ -74,20 +74,6 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }

-  void Report(void);
-  void ZeroCounters(void);
-  double DhopCalls;
-  double DhopCommTime;
-  double DhopComputeTime;
-  double DhopComputeTime2;
-  double DhopFaceTime;
-  double DhopTotalTime;
-
-  double DerivCalls;
-  double DerivCommTime;
-  double DerivComputeTime;
-  double DerivDhopComputeTime;
-
  //////////////////////////////////////////////////////////////////
  // override multiply; cut number routines if pass dagger argument
  // and also make interface more uniformly consistent
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@ -78,21 +78,6 @@ public:
  int Dirichlet;
  Coordinate Block; 

-  /********** Deprecate timers **********/
-  void Report(void);
-  void ZeroCounters(void);
-  double DhopCalls;
-  double DhopCommTime;
-  double DhopComputeTime;
-  double DhopComputeTime2;
-  double DhopFaceTime;
-  double DhopTotalTime;
-
-  double DerivCalls;
-  double DerivCommTime;
-  double DerivComputeTime;
-  double DerivDhopComputeTime;
-
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
  ///////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@ -152,58 +152,6 @@ void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi
  }
 }

-template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
-{
-  this->Report();
-  Coordinate latt = GridDefaultLatt();          
-  RealD volume = this->Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
-  RealD NP     = this->_FourDimGrid->_Nprocessors;
-  if ( M5Dcalls > 0 ) {
-    std::cout << GridLogMessage << "#### M5D calls report " << std::endl;
-    std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls     : " << M5Dcalls   << std::endl;
-    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls       : " << M5Dtime / M5Dcalls << " us" << std::endl;
-
-    // Flops = 10.0*(Nc*Ns) *Ls*vol
-    RealD mflops = 10.0*(Nc*Ns)*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-
-    // Bytes = sizeof(Real) * (Nc*Ns*Nreim) * Ls * vol * (read+write) (/2 for red black counting)
-    // read = 2 ( psi[ss+s+1] and psi[ss+s-1] count as 1 )
-    // write = 1
-    RealD Gbytes = sizeof(Real) * (Nc*Ns*2) * volume * 3 /2. * 1.e-9;
-    std::cout << GridLogMessage << "Average bandwidth (GB/s)                 : " << Gbytes/M5Dtime*M5Dcalls*1.e6 << std::endl;
-  }
-
-  if ( MooeeInvCalls > 0 ) {
-
-    std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
-    std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls     : " << MooeeInvCalls   << std::endl;
-    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls            : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
-#ifdef GRID_CUDA
-    RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-#else
-    // Flops = MADD * Ls *Ls *4dvol * spin/colour/complex
-    RealD mflops = 2.0*24*this->Ls*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-#endif
-  }
-
-}
-template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void)
-{
-  this->ZeroCounters();
-  M5Dflops=0;
-  M5Dcalls=0;
-  M5Dtime=0;
-  MooeeInvFlops=0;
-  MooeeInvCalls=0;
-  MooeeInvTime=0;
-}
-
 template<class Impl>  
 void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
 {
@ -646,7 +594,6 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
  assert(mass_plus == mass_minus);
  RealD mass = mass_plus;
  
-#if (!defined(GRID_HIP))
  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
@ -765,7 +712,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
    else          q_out +=     C;
    
  }
-#endif
+
 }

 template <class Impl>
@ -832,7 +779,6 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  }
 #endif

-#if (!defined(GRID_HIP))
  int tshift = (mu == Nd-1) ? 1 : 0;
  unsigned int LLt    = GridDefaultLatt()[Tp];
  ////////////////////////////////////////////////
@ -952,7 +898,6 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,

    InsertSlice(L_Q, q_out, s , 0);
  }
-#endif
 }
 #undef Pp
 #undef Pm
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
@ -63,9 +63,6 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,

  // 10 = 3 complex mult + 2 complex add
  // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
-  M5Dcalls++;
-  M5Dtime-=usecond();
-
  uint64_t nloop = grid->oSites();
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t s = sss%Ls;
@ -78,7 +75,6 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
    spProj5p(tmp2,psi(idx_l));
    coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
  });
-  M5Dtime+=usecond();
 }

 template<class Impl>  
@ -104,9 +100,6 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
  int Ls=this->Ls;

  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  M5Dcalls++;
-  M5Dtime-=usecond();
-
  uint64_t nloop = grid->oSites();
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t s = sss%Ls;
@ -119,7 +112,6 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
    spProj5m(tmp2,psi(idx_l));
    coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
  });
-  M5Dtime+=usecond();
 }

 template<class Impl>
@ -140,8 +132,6 @@ CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi
  auto pleem = & leem[0];
  auto pueem = & ueem[0];

-  MooeeInvCalls++;
-  MooeeInvTime-=usecond();
  uint64_t nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
@ -178,8 +168,6 @@ CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi
      coalescedWrite(chi[ss+s],res);
    }
  });
-
-  MooeeInvTime+=usecond();
  
 }

@ -202,10 +190,6 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi

  assert(psi.Checkerboard() == psi.Checkerboard());

-  MooeeInvCalls++;
-  MooeeInvTime-=usecond();
-
-
  uint64_t nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
@ -242,7 +226,6 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
      coalescedWrite(chi[ss+s],res);
    }
  });
-  MooeeInvTime+=usecond();

 }

--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h
@ -94,10 +94,6 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
      d_p[ss] = diag[s];
    }}

-
-  M5Dcalls++;
-  M5Dtime-=usecond();
-
  assert(Nc==3);

  thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
@ -198,7 +194,6 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
    }
 #endif
  });
-  M5Dtime+=usecond();
 }

 template<class Impl>  
@ -242,8 +237,6 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
      d_p[ss] = diag[s];
    }}

-  M5Dcalls++;
-  M5Dtime-=usecond();
  thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
 #if 0
    alignas(64) SiteHalfSpinor hp;
@ -339,7 +332,6 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
    }
 #endif
  });
-  M5Dtime+=usecond();
 }


@ -813,9 +805,6 @@ CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,
  }
  assert(_Matp->size()==Ls*LLs);

-  MooeeInvCalls++;
-  MooeeInvTime-=usecond();
-
  if ( switcheroo<Coeff_t>::iscomplex() ) {
    thread_loop( (auto site=0;site<vol;site++),{
      MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm);
@ -825,7 +814,7 @@ CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,
      MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm);
    });
  }
-  MooeeInvTime+=usecond();
+
 }

 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h
@ -54,8 +54,6 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
  auto pupper = &upper[0];
  auto plower = &lower[0];
  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
  
  auto nloop=grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -71,7 +69,6 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
    }
  });

-  this->M5Dtime += usecond();
 }

 template<class Impl>
@ -91,8 +88,6 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
  auto plower = &lower[0];

  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();

  auto nloop=grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -108,7 +103,6 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
    }
  });

-  this->M5Dtime += usecond();
 }

 template<class Impl>
@ -127,8 +121,6 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
  auto pleem = & this->leem[0];
  auto pueem = & this->ueem[0];

-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
  uint64_t nloop=grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
@ -164,7 +156,6 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
      coalescedWrite(chi[ss+s],res);
    }
  });
-  this->MooeeInvTime += usecond();
 }

 template<class Impl>
@ -185,8 +176,6 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion

  assert(psi.Checkerboard() == psi.Checkerboard());

-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
  auto nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
@ -223,7 +212,6 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
    }
  });

-  this->MooeeInvTime += usecond();
 }

 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
@ -298,45 +298,33 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
  int LLs = in.Grid()->_rdimensions[0];
  int len =  U.Grid()->oSites();

-  DhopFaceTime-=usecond();
  st.Prepare();
  st.HaloGather(in,compressor);
-  DhopFaceTime+=usecond();

-  DhopCommTime -=usecond();
  std::vector<std::vector<CommsRequest_t> > requests;
  st.CommunicateBegin(requests);

  //  st.HaloExchangeOptGather(in,compressor); // Wilson compressor
-  DhopFaceTime-=usecond();
  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
-  DhopFaceTime+=usecond();

  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Remove explicit thread mapping introduced for OPA reasons.
  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  DhopComputeTime-=usecond();
  {
    int interior=1;
    int exterior=0;
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
-  DhopComputeTime+=usecond();

-  DhopFaceTime-=usecond();
  st.CommsMerge(compressor);
-  DhopFaceTime+=usecond();

  st.CommunicateComplete(requests);
-  DhopCommTime +=usecond();

-  DhopComputeTime2-=usecond();
  {
    int interior=0;
    int exterior=1;
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
-  DhopComputeTime2+=usecond();
 }

 template<class Impl>
@ -347,22 +335,14 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
  Compressor compressor;
  int LLs = in.Grid()->_rdimensions[0];

- //double t1=usecond();
-  DhopTotalTime -= usecond();
-  DhopCommTime -= usecond();
  st.HaloExchange(in,compressor);
-  DhopCommTime += usecond();
  
-  DhopComputeTime -= usecond();
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
  {
    int interior=1;
    int exterior=1;
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
-  DhopComputeTime += usecond();
-  DhopTotalTime   += usecond();
-
 }
 /*CHANGE END*/

@ -371,7 +351,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 {
-  DhopCalls+=1;
  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
  conformable(in.Grid(),out.Grid()); // drops the cb check

@ -383,7 +362,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
-  DhopCalls+=1;
  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
  conformable(in.Grid(),out.Grid()); // drops the cb check

@ -395,7 +373,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
-  DhopCalls+=2;
  conformable(in.Grid(),FermionGrid()); // verifies full grid
  conformable(in.Grid(),out.Grid());

@ -404,58 +381,6 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
  DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
 }

-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::Report(void) 
-{
-  Coordinate latt = GridDefaultLatt();          
-  RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
-  RealD NP = _FourDimGrid->_Nprocessors;
-  RealD NN = _FourDimGrid->NodeCount();
-
-  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
-
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls   : " 
-	    << DhopCalls   << std::endl;
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime   /Calls       : " 
-	    << DhopTotalTime   / DhopCalls << " us" << std::endl;
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime    /Calls       : " 
-	    << DhopCommTime    / DhopCalls << " us" << std::endl;
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls        : " 
-	    << DhopComputeTime / DhopCalls << " us" << std::endl;
-
-  // Average the compute time
-  _FourDimGrid->GlobalSum(DhopComputeTime);
-  DhopComputeTime/=NP;
-
-  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
-  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
-  
-  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
-  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
-
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil"    <<std::endl;  Stencil.Report();
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven"<<std::endl;  StencilEven.Report();
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
-}
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void) 
-{
-  DhopCalls       = 0;
-  DhopTotalTime    = 0;
-  DhopCommTime    = 0;
-  DhopComputeTime = 0;
-  DhopFaceTime    = 0;
-
-
-  Stencil.ZeroCounters();
-  StencilEven.ZeroCounters();
-  StencilOdd.ZeroCounters();
-}
-
 /////////////////////////////////////////////////////////////////////////
 // Implement the general interface. Here we use SAME mass on all slices
 /////////////////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
@ -334,7 +334,6 @@ void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionF
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) 
 {
-  DhopCalls+=2;
  conformable(in.Grid(), _grid);  // verifies full grid
  conformable(in.Grid(), out.Grid());

@ -346,7 +345,6 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) 
 {
-  DhopCalls+=1;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check

@ -359,7 +357,6 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) 
 {
-  DhopCalls+=1;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check

@ -418,47 +415,33 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
  Compressor compressor; 
  int len =  U.Grid()->oSites();

-  DhopTotalTime   -= usecond();
-
-  DhopFaceTime    -= usecond();
  st.Prepare();
  st.HaloGather(in,compressor);
-  DhopFaceTime    += usecond();

-  DhopCommTime -=usecond();
  std::vector<std::vector<CommsRequest_t> > requests;
  st.CommunicateBegin(requests);

-  DhopFaceTime-=usecond();
  st.CommsMergeSHM(compressor);
-  DhopFaceTime+= usecond();

  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Removed explicit thread comms
  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  DhopComputeTime    -= usecond();
  {
    int interior=1;
    int exterior=0;
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
-  DhopComputeTime    += usecond();

  st.CommunicateComplete(requests);
-  DhopCommTime +=usecond();

  // First to enter, last to leave timing
-  DhopFaceTime    -= usecond();
  st.CommsMerge(compressor);
-  DhopFaceTime    -= usecond();

-  DhopComputeTime2    -= usecond();
  {
    int interior=0;
    int exterior=1;
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
-  DhopComputeTime2    += usecond();
 }


@ -471,78 +454,16 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
 {
  assert((dag == DaggerNo) || (dag == DaggerYes));

-  DhopTotalTime   -= usecond();
-
-  DhopCommTime    -= usecond();
  Compressor compressor;
  st.HaloExchange(in, compressor);
-  DhopCommTime    += usecond();

-  DhopComputeTime -= usecond();
  {
    int interior=1;
    int exterior=1;
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
-  DhopComputeTime += usecond();
-  DhopTotalTime   += usecond();
 };

-  ////////////////////////////////////////////////////////////////
-  // Reporting
-  ////////////////////////////////////////////////////////////////
-template<class Impl>
-void ImprovedStaggeredFermion<Impl>::Report(void) 
-{
-  Coordinate latt = _grid->GlobalDimensions();
-  RealD volume = 1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
-  RealD NP = _grid->_Nprocessors;
-  RealD NN = _grid->NodeCount();
-
-  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
-
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion Number of DhopEO Calls   : " 
-	    << DhopCalls   << std::endl;
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion TotalTime   /Calls       : " 
-	    << DhopTotalTime   / DhopCalls << " us" << std::endl;
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion CommTime    /Calls       : " 
-	    << DhopCommTime    / DhopCalls << " us" << std::endl;
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion ComputeTime/Calls        : " 
-	    << DhopComputeTime / DhopCalls << " us" << std::endl;
-
-  // Average the compute time
-  _grid->GlobalSum(DhopComputeTime);
-  DhopComputeTime/=NP;
-
-  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
-  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
-  
-  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
-  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
-
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil"    <<std::endl;  Stencil.Report();
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilEven"<<std::endl;  StencilEven.Report();
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilOdd" <<std::endl;  StencilOdd.Report();
-}
-template<class Impl>
-void ImprovedStaggeredFermion<Impl>::ZeroCounters(void) 
-{
-  DhopCalls       = 0;
-  DhopTotalTime   = 0;
-  DhopCommTime    = 0;
-  DhopComputeTime = 0;
-  DhopFaceTime    = 0;
-
-  Stencil.ZeroCounters();
-  StencilEven.ZeroCounters();
-  StencilOdd.ZeroCounters();
-}
-
-
 //////////////////////////////////////////////////////// 
 // Conserved current - not yet implemented.
 ////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
@ -55,9 +55,6 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
  auto plower = &lower[0];

  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss = sss*Ls;
@ -73,7 +70,6 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
    }
  });

-  this->M5Dtime += usecond();
 }

 template<class Impl>
@ -99,9 +95,6 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
  auto pshift_coeffs = &shift_coeffs[0];

  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss = sss*Ls;
@ -122,7 +115,6 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
    }
  });

-  this->M5Dtime += usecond();
 }

 template<class Impl>
@ -143,9 +135,6 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
  auto plower = &lower[0];

  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(), {
    uint64_t ss = sss*Ls;
@ -161,8 +150,6 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
      coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
    }
  });
-
-  this->M5Dtime += usecond();
 }

 template<class Impl>
@ -186,9 +173,6 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
  auto pshift_coeffs = &shift_coeffs[0];

  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
  auto pm = this->pm;

  int nloop = grid->oSites()/Ls;
@ -217,7 +201,6 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
    }
  });

-  this->M5Dtime += usecond();
 }

 template<class Impl>
@ -237,9 +220,6 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &

  if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }

-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
@ -277,7 +257,6 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
    }
  });
   
-  this->MooeeInvTime += usecond();
 }

 template<class Impl>
@ -297,8 +276,6 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
  auto pueem= & this->ueem[0];
  auto pMooeeInv_shift_lc   = &MooeeInv_shift_lc[0];
  auto pMooeeInv_shift_norm = &MooeeInv_shift_norm[0];
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();

  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -343,7 +320,6 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
      }
  });

-  this->MooeeInvTime += usecond();
 }

 template<class Impl>
@ -363,9 +339,6 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
  auto pleem= & this->leem[0];
  auto pueem= & this->ueem[0];

-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
@ -402,7 +375,6 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
      coalescedWrite(chi[ss+s],res);
    }
  });
-  this->MooeeInvTime += usecond();
 }

 template<class Impl>
@ -423,9 +395,6 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
  auto pMooeeInvDag_shift_lc   = &MooeeInvDag_shift_lc[0];
  auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];

-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
      uint64_t ss=sss*Ls;
@ -469,7 +438,6 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
      }
  });

-  this->MooeeInvTime += usecond();
 }

 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h
@ -263,7 +263,6 @@ void NaiveStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionFiel
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) 
 {
-  DhopCalls+=2;
  conformable(in.Grid(), _grid);  // verifies full grid
  conformable(in.Grid(), out.Grid());

@ -275,7 +274,6 @@ void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) 
 {
-  DhopCalls+=1;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check

@ -288,7 +286,6 @@ void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &o
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) 
 {
-  DhopCalls+=1;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check

@ -345,47 +342,33 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, L
  Compressor compressor; 
  int len =  U.Grid()->oSites();

-  DhopTotalTime   -= usecond();
-
-  DhopFaceTime    -= usecond();
  st.Prepare();
  st.HaloGather(in,compressor);
-  DhopFaceTime    += usecond();

-  DhopCommTime -=usecond();
  std::vector<std::vector<CommsRequest_t> > requests;
  st.CommunicateBegin(requests);

-  DhopFaceTime-=usecond();
  st.CommsMergeSHM(compressor);
-  DhopFaceTime+= usecond();

  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Removed explicit thread comms
  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  DhopComputeTime    -= usecond();
  {
    int interior=1;
    int exterior=0;
    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
  }
-  DhopComputeTime    += usecond();

  st.CommunicateComplete(requests);
-  DhopCommTime +=usecond();

  // First to enter, last to leave timing
-  DhopFaceTime    -= usecond();
  st.CommsMerge(compressor);
-  DhopFaceTime    -= usecond();

-  DhopComputeTime2    -= usecond();
  {
    int interior=0;
    int exterior=1;
    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
  }
-  DhopComputeTime2    += usecond();
 }

 template <class Impl>
@ -396,78 +379,16 @@ void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Lebes
 {
  assert((dag == DaggerNo) || (dag == DaggerYes));

-  DhopTotalTime   -= usecond();
-
-  DhopCommTime    -= usecond();
  Compressor compressor;
  st.HaloExchange(in, compressor);
-  DhopCommTime    += usecond();

-  DhopComputeTime -= usecond();
  {
    int interior=1;
    int exterior=1;
    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
  }
-  DhopComputeTime += usecond();
-  DhopTotalTime   += usecond();
 };

-  ////////////////////////////////////////////////////////////////
-  // Reporting
-  ////////////////////////////////////////////////////////////////
-template<class Impl>
-void NaiveStaggeredFermion<Impl>::Report(void) 
-{
-  Coordinate latt = _grid->GlobalDimensions();
-  RealD volume = 1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
-  RealD NP = _grid->_Nprocessors;
-  RealD NN = _grid->NodeCount();
-
-  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
-
-  std::cout << GridLogMessage << "NaiveStaggeredFermion Number of DhopEO Calls   : " 
-	    << DhopCalls   << std::endl;
-  std::cout << GridLogMessage << "NaiveStaggeredFermion TotalTime   /Calls       : " 
-	    << DhopTotalTime   / DhopCalls << " us" << std::endl;
-  std::cout << GridLogMessage << "NaiveStaggeredFermion CommTime    /Calls       : " 
-	    << DhopCommTime    / DhopCalls << " us" << std::endl;
-  std::cout << GridLogMessage << "NaiveStaggeredFermion ComputeTime/Calls        : " 
-	    << DhopComputeTime / DhopCalls << " us" << std::endl;
-
-  // Average the compute time
-  _grid->GlobalSum(DhopComputeTime);
-  DhopComputeTime/=NP;
-
-  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
-  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
-  
-  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
-  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
-
-  std::cout << GridLogMessage << "NaiveStaggeredFermion Stencil"    <<std::endl;  Stencil.Report();
-  std::cout << GridLogMessage << "NaiveStaggeredFermion StencilEven"<<std::endl;  StencilEven.Report();
-  std::cout << GridLogMessage << "NaiveStaggeredFermion StencilOdd" <<std::endl;  StencilOdd.Report();
-}
-template<class Impl>
-void NaiveStaggeredFermion<Impl>::ZeroCounters(void) 
-{
-  DhopCalls       = 0;
-  DhopTotalTime   = 0;
-  DhopCommTime    = 0;
-  DhopComputeTime = 0;
-  DhopFaceTime    = 0;
-
-  Stencil.ZeroCounters();
-  StencilEven.ZeroCounters();
-  StencilOdd.ZeroCounters();
-}
-
-
 //////////////////////////////////////////////////////// 
 // Conserved current - not yet implemented.
 ////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@ -103,8 +103,6 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
    Block = block;
  }

-  ZeroCounters();
-
  if (Impl::LsVectorised) { 

    int nsimd = Simd::Nsimd();
@ -143,89 +141,6 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
   //                       <<" " << StencilEven.surface_list.size()<<std::endl;

 }
-     
-template<class Impl>
-void WilsonFermion5D<Impl>::Report(void)
-{
-  RealD NP     = _FourDimGrid->_Nprocessors;
-  RealD NN     = _FourDimGrid->NodeCount();
-  RealD volume = Ls;  
-  Coordinate latt = _FourDimGrid->GlobalDimensions();
-  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
-
-  if ( DhopCalls > 0 ) {
-    std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D Number of DhopEO Calls   : " << DhopCalls   << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D TotalTime   /Calls        : " << DhopTotalTime   / DhopCalls << " us" << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D CommTime    /Calls        : " << DhopCommTime    / DhopCalls << " us" << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D FaceTime    /Calls        : " << DhopFaceTime    / DhopCalls << " us" << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D ComputeTime1/Calls        : " << DhopComputeTime / DhopCalls << " us" << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D ComputeTime2/Calls        : " << DhopComputeTime2/ DhopCalls << " us" << std::endl;
-
-    // Average the compute time
-    _FourDimGrid->GlobalSum(DhopComputeTime);
-    DhopComputeTime/=NP;
-    RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
-
-    RealD Fullmflops = 1344*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
-
-   }
-
-  if ( DerivCalls > 0 ) {
-    std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D Number of Deriv Calls    : " <<DerivCalls <<std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls           : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls        : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D Dhop ComputeTime/Calls   : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
-    
-    RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
-    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NP << std::endl;
-
-    RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NP << std::endl;  }
-
-  if (DerivCalls > 0 || DhopCalls > 0){
-    std::cout << GridLogMessage << "WilsonFermion5D Stencil"    <<std::endl;  Stencil.Report();
-    std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl;  StencilEven.Report();
-    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
-  }
-  if ( DhopCalls > 0){
-    std::cout << GridLogMessage << "WilsonFermion5D Stencil     Reporti()"    <<std::endl;  Stencil.Reporti(DhopCalls);
-    std::cout << GridLogMessage << "WilsonFermion5D StencilEven Reporti()"<<std::endl;  StencilEven.Reporti(DhopCalls);
-    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd  Reporti()" <<std::endl;  StencilOdd.Reporti(DhopCalls);
-  }
-}
-
-template<class Impl>
-void WilsonFermion5D<Impl>::ZeroCounters(void) {
-  DhopCalls       = 0;
-  DhopCommTime    = 0;
-  DhopComputeTime = 0;
-  DhopComputeTime2= 0;
-  DhopFaceTime    = 0;
-  DhopTotalTime   = 0;
-
-  DerivCalls       = 0;
-  DerivCommTime    = 0;
-  DerivComputeTime = 0;
-  DerivDhopComputeTime = 0;
-
-  Stencil.ZeroCounters();
-  StencilEven.ZeroCounters();
-  StencilOdd.ZeroCounters();
-  Stencil.ZeroCountersi();
-  StencilEven.ZeroCountersi();
-  StencilOdd.ZeroCountersi();
-}
-

 template<class Impl>
 void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
@ -281,7 +196,6 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
 					  const FermionField &B,
 					  int dag)
 {
-  DerivCalls++;
  assert((dag==DaggerNo) ||(dag==DaggerYes));

  conformable(st.Grid(),A.Grid());
@ -292,15 +206,12 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
  FermionField Btilde(B.Grid());
  FermionField Atilde(B.Grid());

-  DerivCommTime-=usecond();
  st.HaloExchange(B,compressor);
-  DerivCommTime+=usecond();

  Atilde=A;
  int LLs = B.Grid()->_rdimensions[0];


-  DerivComputeTime-=usecond();
  for (int mu = 0; mu < Nd; mu++) {
    ////////////////////////////////////////////////////////////////////////
    // Flip gamma if dag
@ -312,8 +223,6 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
    // Call the single hop
    ////////////////////////

-    DerivDhopComputeTime -= usecond();
-
    int Usites = U.Grid()->oSites();

    Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, Usites, B, Btilde, mu,gamma);
@ -321,10 +230,8 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
    ////////////////////////////
    // spin trace outer product
    ////////////////////////////
-    DerivDhopComputeTime += usecond();
    Impl::InsertForce5D(mat, Btilde, Atilde, mu);
  }
-  DerivComputeTime += usecond();
 }

 template<class Impl>
@ -382,14 +289,10 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
                                         DoubledGaugeField & U,
                                         const FermionField &in, FermionField &out,int dag)
 {
-  //  std::cout << GridLogDslash<<"Dhop internal"<<std::endl;
-  DhopTotalTime=-usecond();
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
  else 
    DhopInternalSerialComms(st,lo,U,in,out,dag);
-  DhopTotalTime+=usecond();
-  //  std::cout << GridLogDslash<<"Dhop took"<<DhopTotalTime<<std::endl;
 }


@ -398,6 +301,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 							DoubledGaugeField & U,
 							const FermionField &in, FermionField &out,int dag)
 {
+  GRID_TRACE("DhopInternalOverlappedComms");
  Compressor compressor(dag);

  int LLs = in.Grid()->_rdimensions[0];
@ -406,59 +310,58 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
  /////////////////////////////
  // Start comms  // Gather intranode and extra node differentiated??
  /////////////////////////////
-  DhopFaceTime=-usecond();
-  st.HaloExchangeOptGather(in,compressor);
-  DhopFaceTime+=usecond();
-  //  std::cout << GridLogDslash<< " Dhop Gather end "<< DhopFaceTime<<" us " <<std::endl;
-
-  DhopCommTime =-usecond();
+  {
+    GRID_TRACE("Gather");
+    st.HaloExchangeOptGather(in,compressor);
+    accelerator_barrier();
+  }
+  
  std::vector<std::vector<CommsRequest_t> > requests;
+  auto id=traceStart("Communicate overlapped");
  st.CommunicateBegin(requests);

  /////////////////////////////
  // Overlap with comms
  /////////////////////////////
-  DhopFaceTime=-usecond();
-  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
-  DhopFaceTime+=usecond();
-  //  std::cout << GridLogDslash<< " Dhop Commsmerge end "<<DhopFaceTime<< " us "<<std::endl;
+  {
+    GRID_TRACE("MergeSHM");
+    st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
+  }
      
  /////////////////////////////
  // do the compute interior
  /////////////////////////////
  int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
-  DhopComputeTime=-usecond();
  if (dag == DaggerYes) {
+    GRID_TRACE("DhopDagInterior");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
  } else {
+    GRID_TRACE("DhopInterior");
    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
  }
-  DhopComputeTime+=usecond();
-  //  std::cout << GridLogDslash<< " Dhop Compute 1 end "<< DhopComputeTime<<" us" <<std::endl;

  /////////////////////////////
  // Complete comms
  /////////////////////////////
  st.CommunicateComplete(requests);
-  DhopCommTime   +=usecond();
-  //  std::cout << GridLogDslash<< " Dhop Comunicate end "<< DhopCommTime << " us" <<std::endl;
+  traceStop(id);

  /////////////////////////////
  // do the compute exterior
  /////////////////////////////
-  DhopFaceTime=-usecond();
-  st.CommsMerge(compressor);
-  DhopFaceTime+=usecond();
-  //  std::cout << GridLogDslash<< " Dhop CommsMerge2 end "<<DhopFaceTime << " us "<<std::endl;
+  {
+    GRID_TRACE("Merge");
+    st.CommsMerge(compressor);
+  }
+  

-  DhopComputeTime2=-usecond();
  if (dag == DaggerYes) {
+    GRID_TRACE("DhopDagExterior");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
  } else {
+    GRID_TRACE("DhopExterior");
    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
  }
-  DhopComputeTime2+=usecond();
-  //  std::cout << GridLogDslash<< " Dhop Ext end "<<DhopComputeTime2 <<"us  "<<std::endl;
 }


@ -468,32 +371,30 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
 						    const FermionField &in, 
 						    FermionField &out,int dag)
 {
+  GRID_TRACE("DhopInternalSerialComms");
  Compressor compressor(dag);

  int LLs = in.Grid()->_rdimensions[0];

-  //  std::cout << GridLogDslash<< " Dhop Halo exchange begine " <<std::endl;
-  DhopCommTime=-usecond();
-  st.HaloExchangeOpt(in,compressor);
-  DhopCommTime+=usecond();
-  //  std::cout << GridLogDslash<< " Dhop Comms end "<<DhopCommTime<<" us"<<std::endl;
+  {
+    GRID_TRACE("HaloExchange");
+    st.HaloExchangeOpt(in,compressor);
+  }
  
-  DhopComputeTime=-usecond();
  int Opt = WilsonKernelsStatic::Opt;
  if (dag == DaggerYes) {
+    GRID_TRACE("DhopDag");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
  } else {
+    GRID_TRACE("Dhop");
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
  }
-  DhopComputeTime+=usecond();
-  //  std::cout << GridLogDslash<< " Dhop Compute end "<<DhopComputeTime<<" us" <<std::endl;
 }


 template<class Impl>
 void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 {
-  DhopCalls++;
  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
  conformable(in.Grid(),out.Grid()); // drops the cb check

@ -505,7 +406,6 @@ void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
-  DhopCalls++;
  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
  conformable(in.Grid(),out.Grid()); // drops the cb check

@ -517,7 +417,6 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int
 template<class Impl>
 void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
-  DhopCalls+=2;
  conformable(in.Grid(),FermionGrid()); // verifies full grid
  conformable(in.Grid(),out.Grid());

@ -572,12 +471,17 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
  LatComplex    sk(_grid);  sk = Zero();
  LatComplex    sk2(_grid); sk2= Zero();
  LatComplex    W(_grid); W= Zero();
-  LatComplex    a(_grid); a= Zero();
  LatComplex    one  (_grid); one = ScalComplex(1.0,0.0);
  LatComplex 	cosha(_grid);
  LatComplex 	kmu(_grid);
  LatComplex 	Wea(_grid);
  LatComplex 	Wema(_grid);
+  LatComplex 	ea(_grid);
+  LatComplex 	ema(_grid);
+  LatComplex 	eaLs(_grid);
+  LatComplex 	emaLs(_grid);
+  LatComplex 	ea2Ls(_grid);
+  LatComplex 	ema2Ls(_grid);
  LatComplex 	sinha(_grid);
  LatComplex 	sinhaLs(_grid);
  LatComplex 	coshaLs(_grid);
@ -612,39 +516,29 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
  ////////////////////////////////////////////
  cosha = (one + W*W + sk) / (abs(W)*2.0);

-  // FIXME Need a Lattice acosh
-
-  {
-    autoView(cosha_v,cosha,CpuRead);
-    autoView(a_v,a,CpuWrite);
-    for(int idx=0;idx<_grid->lSites();idx++){
-      Coordinate lcoor(Nd);
-      Tcomplex cc;
-      //    RealD sgn;
-      _grid->LocalIndexToLocalCoor(idx,lcoor);
-      peekLocalSite(cc,cosha_v,lcoor);
-      assert((double)real(cc)>=1.0);
-      assert(fabs((double)imag(cc))<=1.0e-15);
-      cc = ScalComplex(::acosh(real(cc)),0.0);
-      pokeLocalSite(cc,a_v,lcoor);
-    }
-  }
-
-  Wea = ( exp( a) * abs(W)  );
-  Wema= ( exp(-a) * abs(W)  );
-  sinha = 0.5*(exp( a) - exp(-a));
-  sinhaLs = 0.5*(exp( a*Ls) - exp(-a*Ls));
-  coshaLs = 0.5*(exp( a*Ls) + exp(-a*Ls));
+  ea = (cosha + sqrt(cosha*cosha-one));
+  ema= (cosha - sqrt(cosha*cosha-one));
+  eaLs = pow(ea,Ls);
+  emaLs= pow(ema,Ls);
+  ea2Ls = pow(ea,2.0*Ls);
+  ema2Ls= pow(ema,2.0*Ls);
+  Wea= abs(W) * ea;
+  Wema= abs(W) * ema;
+  //  a=log(ea);
+  
+  sinha = 0.5*(ea - ema);
+  sinhaLs = 0.5*(eaLs-emaLs);
+  coshaLs = 0.5*(eaLs+emaLs);

  A = one / (abs(W) * sinha * 2.0) * one / (sinhaLs * 2.0);
-  F = exp( a*Ls) * (one - Wea + (Wema - one) * mass*mass);
-  F = F + exp(-a*Ls) * (Wema - one + (one - Wea) * mass*mass);
+  F = eaLs * (one - Wea + (Wema - one) * mass*mass);
+  F = F + emaLs * (Wema - one + (one - Wea) * mass*mass);
  F = F - abs(W) * sinha * 4.0 * mass;

-  Bpp =  (A/F) * (exp(-a*Ls*2.0) - one) * (one - Wema) * (one - mass*mass * one);
-  Bmm =  (A/F) * (one - exp(a*Ls*2.0)) * (one - Wea) * (one - mass*mass * one);
-  App =  (A/F) * (exp(-a*Ls*2.0) - one) * exp(-a) * (exp(-a) - abs(W)) * (one - mass*mass * one);
-  Amm =  (A/F) * (one - exp(a*Ls*2.0)) * exp(a) * (exp(a) - abs(W)) * (one - mass*mass * one);
+  Bpp =  (A/F) * (ema2Ls - one) * (one - Wema) * (one - mass*mass * one);
+  Bmm =  (A/F) * (one - ea2Ls)  * (one - Wea) * (one - mass*mass * one);
+  App =  (A/F) * (ema2Ls - one) * ema * (ema - abs(W)) * (one - mass*mass * one);
+  Amm =  (A/F) * (one - ea2Ls)  * ea  * (ea  - abs(W)) * (one - mass*mass * one);
  ABpm = (A/F) * abs(W) * sinha * 2.0  * (one + mass * coshaLs * 2.0 + mass*mass * one);

  //P+ source, P- source
@ -667,29 +561,29 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
      buf1_4d = Zero();
      ExtractSlice(buf1_4d, PRsource, (tt-1), 0);
      //G(s,t)
-      bufR_4d = bufR_4d + A * exp(a*Ls) * exp(-a*f) * signW * buf1_4d + A * exp(-a*Ls) * exp(a*f) * signW * buf1_4d;
+      bufR_4d = bufR_4d + A * eaLs * pow(ema,f) * signW * buf1_4d + A * emaLs * pow(ea,f) * signW * buf1_4d;
      //A++*exp(a(s+t))
-      bufR_4d = bufR_4d + App * exp(a*ss) * exp(a*tt) * signW * buf1_4d ;
+      bufR_4d = bufR_4d + App * pow(ea,ss) * pow(ea,tt) * signW * buf1_4d ;
      //A+-*exp(a(s-t))
-      bufR_4d = bufR_4d + ABpm * exp(a*ss) * exp(-a*tt) * signW * buf1_4d ;
+      bufR_4d = bufR_4d + ABpm * pow(ea,ss) * pow(ema,tt) * signW * buf1_4d ;
      //A-+*exp(a(-s+t))
-      bufR_4d = bufR_4d + ABpm * exp(-a*ss) * exp(a*tt) * signW * buf1_4d ;
+      bufR_4d = bufR_4d + ABpm * pow(ema,ss) * pow(ea,tt) * signW * buf1_4d ;
      //A--*exp(a(-s-t))
-      bufR_4d = bufR_4d + Amm * exp(-a*ss) * exp(-a*tt) * signW * buf1_4d ;
+      bufR_4d = bufR_4d + Amm * pow(ema,ss) * pow(ema,tt) * signW * buf1_4d ;

      //GL
      buf2_4d = Zero();
      ExtractSlice(buf2_4d, PLsource, (tt-1), 0);
      //G(s,t)
-      bufL_4d = bufL_4d + A * exp(a*Ls) * exp(-a*f) * signW * buf2_4d + A * exp(-a*Ls) * exp(a*f) * signW * buf2_4d;
+      bufL_4d = bufL_4d + A * eaLs * pow(ema,f) * signW * buf2_4d + A * emaLs * pow(ea,f) * signW * buf2_4d;
      //B++*exp(a(s+t))
-      bufL_4d = bufL_4d + Bpp * exp(a*ss) * exp(a*tt) * signW * buf2_4d ;
+      bufL_4d = bufL_4d + Bpp * pow(ea,ss) * pow(ea,tt) * signW * buf2_4d ;
      //B+-*exp(a(s-t))
-      bufL_4d = bufL_4d + ABpm * exp(a*ss) * exp(-a*tt) * signW * buf2_4d ;
+      bufL_4d = bufL_4d + ABpm * pow(ea,ss) * pow(ema,tt) * signW * buf2_4d ;
      //B-+*exp(a(-s+t))
-      bufL_4d = bufL_4d + ABpm * exp(-a*ss) * exp(a*tt) * signW * buf2_4d ;
+      bufL_4d = bufL_4d + ABpm * pow(ema,ss) * pow(ea,tt) * signW * buf2_4d ;
      //B--*exp(a(-s-t))
-      bufL_4d = bufL_4d + Bmm * exp(-a*ss) * exp(-a*tt) * signW * buf2_4d ;
+      bufL_4d = bufL_4d + Bmm * pow(ema,ss) * pow(ema,tt) * signW * buf2_4d ;
    }
    InsertSlice(bufR_4d, GR, (ss-1), 0);
    InsertSlice(bufL_4d, GL, (ss-1), 0);
@ -808,28 +702,12 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
  W = one - M5 + sk2;

  ////////////////////////////////////////////
-  // Cosh alpha -> alpha
+  // Cosh alpha -> exp(+/- alpha)
  ////////////////////////////////////////////
  cosha =  (one + W*W + sk) / (abs(W)*2.0);

-  // FIXME Need a Lattice acosh
-  {
-  autoView(cosha_v,cosha,CpuRead);
-  autoView(a_v,a,CpuWrite);
-  for(int idx=0;idx<_grid->lSites();idx++){
-    Coordinate lcoor(Nd);
-    Tcomplex cc;
-    //    RealD sgn;
-    _grid->LocalIndexToLocalCoor(idx,lcoor);
-    peekLocalSite(cc,cosha_v,lcoor);
-    assert((double)real(cc)>=1.0);
-    assert(fabs((double)imag(cc))<=1.0e-15);
-    cc = ScalComplex(::acosh(real(cc)),0.0);
-    pokeLocalSite(cc,a_v,lcoor);
-  }}
-  
-  Wea = ( exp( a) * abs(W)  );
-  Wema= ( exp(-a) * abs(W)  );
+  Wea = abs(W)*(cosha + sqrt(cosha*cosha-one));
+  Wema= abs(W)*(cosha - sqrt(cosha*cosha-one));
  
  num   = num + ( one - Wema ) * mass * in;
  denom= ( Wea - one ) + mass*mass * (one - Wema); 
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@ -76,91 +76,6 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
  StencilOdd.BuildSurfaceList(1,vol4);
 }

-template<class Impl>
-void WilsonFermion<Impl>::Report(void)
-{
-  RealD NP = _grid->_Nprocessors;
-  RealD NN = _grid->NodeCount();
-  RealD volume = 1;
-  Coordinate latt = _grid->GlobalDimensions();
-  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
-
-  if ( DhopCalls > 0 ) {
-    std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion Number of DhopEO Calls   : " << DhopCalls   << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion TotalTime   /Calls        : " << DhopTotalTime   / DhopCalls << " us" << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion CommTime    /Calls        : " << DhopCommTime    / DhopCalls << " us" << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion FaceTime    /Calls        : " << DhopFaceTime    / DhopCalls << " us" << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion ComputeTime1/Calls        : " << DhopComputeTime / DhopCalls << " us" << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion ComputeTime2/Calls        : " << DhopComputeTime2/ DhopCalls << " us" << std::endl;
-
-    // Average the compute time
-    _grid->GlobalSum(DhopComputeTime);
-    DhopComputeTime/=NP;
-    RealD mflops = 1320*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
-
-    RealD Fullmflops = 1320*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
-
-   }
-
-  if ( DerivCalls > 0 ) {
-    std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
-    std::cout << GridLogMessage << "WilsonFermion Number of Deriv Calls    : " <<DerivCalls <<std::endl;
-    std::cout << GridLogMessage << "WilsonFermion CommTime/Calls           : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
-    std::cout << GridLogMessage << "WilsonFermion ComputeTime/Calls        : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
-    std::cout << GridLogMessage << "WilsonFermion Dhop ComputeTime/Calls   : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
-
-    // how to count flops here?
-    RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
-    std::cout << GridLogMessage << "Average mflops/s per call               ? : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per node      ? : " << mflops/NP << std::endl;
-
-    // how to count flops here?
-    RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call (full)        ? : " << Fullmflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per node (full) ? : " << Fullmflops/NP << std::endl;  }
-
-  if (DerivCalls > 0 || DhopCalls > 0){
-    std::cout << GridLogMessage << "WilsonFermion Stencil"    <<std::endl;  Stencil.Report();
-    std::cout << GridLogMessage << "WilsonFermion StencilEven"<<std::endl;  StencilEven.Report();
-    std::cout << GridLogMessage << "WilsonFermion StencilOdd" <<std::endl;  StencilOdd.Report();
-  }
-  if ( DhopCalls > 0){
-    std::cout << GridLogMessage << "WilsonFermion Stencil     Reporti()"    <<std::endl;  Stencil.Reporti(DhopCalls);
-    std::cout << GridLogMessage << "WilsonFermion StencilEven Reporti()"<<std::endl;  StencilEven.Reporti(DhopCalls);
-    std::cout << GridLogMessage << "WilsonFermion StencilOdd  Reporti()" <<std::endl;  StencilOdd.Reporti(DhopCalls);
-  }
-}
-
-template<class Impl>
-void WilsonFermion<Impl>::ZeroCounters(void) {
-  DhopCalls       = 0; // ok
-  DhopCommTime    = 0;
-  DhopComputeTime = 0;
-  DhopComputeTime2= 0;
-  DhopFaceTime    = 0;
-  DhopTotalTime   = 0;
-
-  DerivCalls       = 0; // ok
-  DerivCommTime    = 0;
-  DerivComputeTime = 0;
-  DerivDhopComputeTime = 0;
-
-  Stencil.ZeroCounters();
-  StencilEven.ZeroCounters();
-  StencilOdd.ZeroCounters();
-  Stencil.ZeroCountersi();
-  StencilEven.ZeroCountersi();
-  StencilOdd.ZeroCountersi();
-}
-
-
 template <class Impl>
 void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
 {
@ -320,7 +235,6 @@ template <class Impl>
 void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
                                        GaugeField &mat, const FermionField &A,
                                        const FermionField &B, int dag) {
-  DerivCalls++;
  assert((dag == DaggerNo) || (dag == DaggerYes));

  Compressor compressor(dag);
@ -329,11 +243,8 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
  FermionField Atilde(B.Grid());
  Atilde = A;

-  DerivCommTime-=usecond();
  st.HaloExchange(B, compressor);
-  DerivCommTime+=usecond();

-  DerivComputeTime-=usecond();
  for (int mu = 0; mu < Nd; mu++) {
    ////////////////////////////////////////////////////////////////////////
    // Flip gamma (1+g)<->(1-g) if dag
@ -341,7 +252,6 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
    int gamma = mu;
    if (!dag) gamma += Nd;

-    DerivDhopComputeTime -= usecond();
    int Ls=1;
    Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma);

@ -349,9 +259,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
    // spin trace outer product
    //////////////////////////////////////////////////
    Impl::InsertForce4D(mat, Btilde, Atilde, mu);
-    DerivDhopComputeTime += usecond();
  }
-  DerivComputeTime += usecond();
 }

 template <class Impl>
@ -398,7 +306,6 @@ void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, co
 template <class Impl>
 void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
 {
-  DhopCalls+=2;
  conformable(in.Grid(), _grid);  // verifies full grid
  conformable(in.Grid(), out.Grid());

@ -410,7 +317,6 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
 template <class Impl>
 void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
 {
-  DhopCalls++;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check

@ -423,7 +329,6 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
 template <class Impl>
 void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
-  DhopCalls++;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check

@ -488,14 +393,12 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
                                       const FermionField &in,
                                       FermionField &out, int dag)
 {
-  DhopTotalTime-=usecond();
 #ifdef GRID_OMP
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
  else
 #endif
    DhopInternalSerial(st,lo,U,in,out,dag);
-  DhopTotalTime+=usecond();
 }

 template <class Impl>
@ -504,6 +407,7 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
 						      const FermionField &in,
 						      FermionField &out, int dag)
 {
+  GRID_TRACE("DhopOverlapped");
  assert((dag == DaggerNo) || (dag == DaggerYes));

  Compressor compressor(dag);
@ -514,53 +418,55 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
  /////////////////////////////
  std::vector<std::vector<CommsRequest_t> > requests;
  st.Prepare();
-  DhopFaceTime-=usecond();
-  st.HaloGather(in,compressor);
-  DhopFaceTime+=usecond();
+  {
+    GRID_TRACE("Gather");
+    st.HaloGather(in,compressor);
+  }

-  DhopCommTime -=usecond();
+  tracePush("Communication");
  st.CommunicateBegin(requests);

  /////////////////////////////
  // Overlap with comms
  /////////////////////////////
-  DhopFaceTime-=usecond();
-  st.CommsMergeSHM(compressor);
-  DhopFaceTime+=usecond();
+  {
+    GRID_TRACE("MergeSHM");
+    st.CommsMergeSHM(compressor);
+  }

  /////////////////////////////
  // do the compute interior
  /////////////////////////////
  int Opt = WilsonKernelsStatic::Opt;
-  DhopComputeTime-=usecond();
  if (dag == DaggerYes) {
+    GRID_TRACE("DhopDagInterior");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
  } else {
+    GRID_TRACE("DhopInterior");
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
  }
-  DhopComputeTime+=usecond();

  /////////////////////////////
  // Complete comms
  /////////////////////////////
  st.CommunicateComplete(requests);
-  DhopCommTime   +=usecond();
-
-  DhopFaceTime-=usecond();
-  st.CommsMerge(compressor);
-  DhopFaceTime+=usecond();
+  tracePop("Communication");

+  {
+    GRID_TRACE("Merge");
+    st.CommsMerge(compressor);
+  }
  /////////////////////////////
  // do the compute exterior
  /////////////////////////////

-  DhopComputeTime2-=usecond();
  if (dag == DaggerYes) {
+    GRID_TRACE("DhopDagExterior");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
  } else {
+    GRID_TRACE("DhopExterior");
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
  }
-  DhopComputeTime2+=usecond();
 };


@ -570,20 +476,22 @@ void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
                                       const FermionField &in,
                                       FermionField &out, int dag)
 {
+  GRID_TRACE("DhopSerial");
  assert((dag == DaggerNo) || (dag == DaggerYes));
  Compressor compressor(dag);
-  DhopCommTime-=usecond();
-  st.HaloExchange(in, compressor);
-  DhopCommTime+=usecond();
+  {
+    GRID_TRACE("HaloExchange");
+    st.HaloExchange(in, compressor);
+  }

-  DhopComputeTime-=usecond();
  int Opt = WilsonKernelsStatic::Opt;
  if (dag == DaggerYes) {
+    GRID_TRACE("DhopDag");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
  } else {
+    GRID_TRACE("Dhop");
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
  }
-  DhopComputeTime+=usecond();
 };
 /*Change ends */

--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@ -72,20 +72,15 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
  if (SE->_is_local) {						\
    int perm= SE->_permute;					\
    auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);	\
-    spProj(chi,tmp);						\
-  } else if ( st.same_node[Dir] ) {				\
-    chi = coalescedRead(buf[SE->_offset],lane);			\
-  }								\
-  acceleratorSynchronise();						\
-  if (SE->_is_local || st.same_node[Dir] ) {			\
-    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
-    Recon(result, Uchi);					\
-  }								\
+    spProj(chi,tmp);							\
+    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);			\
+    Recon(result, Uchi);						\
+  }									\
  acceleratorSynchronise();

 #define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon)		\
  SE = st.GetEntry(ptype, Dir, sF);				\
-  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
+  if (!SE->_is_local ) {		\
    auto chi = coalescedRead(buf[SE->_offset],lane);		\
    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
    Recon(result, Uchi);					\
--- a/Grid/qcd/action/filters/DDHMCFilter.h
+++ b/Grid/qcd/action/filters/DDHMCFilter.h
@ -91,6 +91,19 @@ struct DDHMCFilter: public MomentumFilterBase<GaugeField>
 	  U_mu = where(mod(coor,B1)==Integer(B1-4),zzz_mu,U_mu); 
 	  PokeIndex<LorentzIndex>(U, U_mu, mu);
 	}
+	if ( Width==4) { 
+	  U    = where(mod(coor,B1)==Integer(B1-4),zzz,U);
+	  U    = where(mod(coor,B1)==Integer(B1-3),zzz,U);
+	  U    = where(mod(coor,B1)==Integer(B1-2),zzz,U);
+	  U    = where(mod(coor,B1)==Integer(B1-1),zzz,U);
+	  U    = where(mod(coor,B1)==Integer(0)   ,zzz,U); 
+	  U    = where(mod(coor,B1)==Integer(1)   ,zzz,U); 
+	  U    = where(mod(coor,B1)==Integer(2)   ,zzz,U); 
+	  U    = where(mod(coor,B1)==Integer(3)   ,zzz,U); 
+	  auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
+	  U_mu = where(mod(coor,B1)==Integer(B1-5),zzz_mu,U_mu); 
+	  PokeIndex<LorentzIndex>(U, U_mu, mu);
+	}
      }

    }
--- a/Grid/qcd/action/filters/MomentumFilter.h
+++ b/Grid/qcd/action/filters/MomentumFilter.h
@ -38,6 +38,7 @@ NAMESPACE_BEGIN(Grid);
 template<typename MomentaField>
 struct MomentumFilterBase{
  virtual void applyFilter(MomentaField &P) const = 0;
+  virtual ~MomentumFilterBase(){};
 };

 //Do nothing
@ -83,7 +84,6 @@ struct MomentumFilterApplyPhase: public MomentumFilterBase<MomentaField>{
    
  }

-
 };


--- a/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h
@ -85,7 +85,12 @@ NAMESPACE_BEGIN(Grid);
 	PowerNegQuarter.Init(remez,param.tolerance,true);
      };

-      virtual std::string action_name(){return "OneFlavourRatioRationalPseudoFermionAction";}
+      virtual std::string action_name(){
+	std::stringstream sstream;
+	sstream<<"OneFlavourRatioRationalPseudoFermionAction("
+	       <<DenOp.Mass()<<") / det("<<NumOp.Mass()<<")";
+	return sstream.str();
+      }
      
      virtual std::string LogParameters(){
 	std::stringstream sstream;
--- a/Grid/qcd/hmc/integrators/Integrator.h
+++ b/Grid/qcd/hmc/integrators/Integrator.h
@ -143,9 +143,10 @@ protected:
      force = FieldImplementation::projectForce(force); // Ta for gauge fields
      double end_force = usecond();

+      //      DumpSliceNorm("force ",force,Nd-1);
      MomFilter->applyFilter(force);
      std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<< std::endl;
-      DumpSliceNorm("force ",force,Nd-1);
+      DumpSliceNorm("force filtered ",force,Nd-1);
      
      Real force_abs   = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x]) 
      Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    
@ -286,7 +287,7 @@ public:
 		  <<" force max " << as[level].actions.at(actionID)->deriv_max_average()
 		  <<" norm "      << as[level].actions.at(actionID)->deriv_norm_average()
 		  <<" Fdt max  "  << as[level].actions.at(actionID)->Fdt_max_average()
-		  <<" norm "      << as[level].actions.at(actionID)->Fdt_norm_average()
+		  <<" Fdt norm "  << as[level].actions.at(actionID)->Fdt_norm_average()
 		  <<" calls "     << as[level].actions.at(actionID)->deriv_num
 		  << std::endl;
      }
--- a/Grid/qcd/observables/topological_charge.h
+++ b/Grid/qcd/observables/topological_charge.h
@ -99,7 +99,7 @@ public:
 	// using wilson flow by default here
 	WilsonFlow<PeriodicGimplR> WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval);
 	WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau);
-	Real T0   = WF.energyDensityPlaquette(Usmear);
+	Real T0   = WF.energyDensityPlaquette(Pars.Smearing.maxTau, Usmear);
 	std::cout << GridLogMessage << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
 		  << "T0                : [ " << traj << " ] "<< T0 << std::endl;
      }
--- a/Grid/qcd/smearing/WilsonFlow.h
+++ b/Grid/qcd/smearing/WilsonFlow.h
@ -7,6 +7,7 @@ Source file: ./lib/qcd/modules/plaquette.h
 Copyright (C) 2017

 Author: Guido Cossu <guido.cossu@ed.ac.uk>
+Author: Christopher Kelly <ckelly@bnl.gov>

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@ -33,28 +34,44 @@ NAMESPACE_BEGIN(Grid);

 template <class Gimpl>
 class WilsonFlow: public Smear<Gimpl>{
+public:
+  //Store generic measurements to take during smearing process using std::function
+  typedef std::function<void(int, RealD, const typename Gimpl::GaugeField &)> FunctionType;  //int: step,  RealD: flow time,  GaugeField : the gauge field
+  
+private:
  unsigned int Nstep;
-  unsigned int measure_interval;
-  mutable RealD epsilon, taus;
-
+  RealD epsilon; //for regular smearing this is the time step, for adaptive it is the initial time step
+ 
+  std::vector< std::pair<int, FunctionType> > functions; //The int maps to the measurement frequency

  mutable WilsonGaugeAction<Gimpl> SG;

-  void evolve_step(typename Gimpl::GaugeField&) const;
-  void evolve_step_adaptive(typename Gimpl::GaugeField&, RealD);
-  RealD tau(unsigned int t)const {return epsilon*(t+1.0); }
+  //Evolve the gauge field by 1 step and update tau
+  void evolve_step(typename Gimpl::GaugeField &U, RealD &tau) const;
+  //Evolve the gauge field by 1 step and update tau and the current time step eps
+  void evolve_step_adaptive(typename Gimpl::GaugeField&U, RealD &tau, RealD &eps, RealD maxTau) const;

 public:
  INHERIT_GIMPL_TYPES(Gimpl)

+  void resetActions(){ functions.clear(); }
+
+  void addMeasurement(int meas_interval, FunctionType meas){ functions.push_back({meas_interval, meas}); }
+
+  //Set the class to perform the default measurements: 
+  //the plaquette energy density every step
+  //the plaquette topological charge every 'topq_meas_interval' steps
+  //and output to stdout
+  void setDefaultMeasurements(int topq_meas_interval = 1);
+
  explicit WilsonFlow(unsigned int Nstep, RealD epsilon, unsigned int interval = 1):
  Nstep(Nstep),
    epsilon(epsilon),
-    measure_interval(interval),
    SG(WilsonGaugeAction<Gimpl>(3.0)) {
    // WilsonGaugeAction with beta 3.0
    assert(epsilon > 0.0);
    LogMessage();
+    setDefaultMeasurements(interval);
  }

  void LogMessage() {
@ -73,9 +90,29 @@ public:
    // undefined for WilsonFlow
  }

-  void smear_adaptive(GaugeField&, const GaugeField&, RealD maxTau);
-  RealD energyDensityPlaquette(unsigned int step, const GaugeField& U) const;
-  RealD energyDensityPlaquette(const GaugeField& U) const;
+  void smear_adaptive(GaugeField&, const GaugeField&, RealD maxTau) const;
+
+  //Compute t^2 <E(t)> for time t from the plaquette
+  static RealD energyDensityPlaquette(const RealD t, const GaugeField& U);
+
+  //Compute t^2 <E(t)> for time t from the 1x1 cloverleaf form
+  //t is the Wilson flow time
+  static RealD energyDensityCloverleaf(const RealD t, const GaugeField& U);
+  
+  //Evolve the gauge field by Nstep steps of epsilon and return the energy density computed every interval steps
+  //The smeared field is output as V
+  std::vector<RealD> flowMeasureEnergyDensityPlaquette(GaugeField &V, const GaugeField& U, int measure_interval = 1);
+
+  //Version that does not return the smeared field
+  std::vector<RealD> flowMeasureEnergyDensityPlaquette(const GaugeField& U, int measure_interval = 1);
+
+
+  //Evolve the gauge field by Nstep steps of epsilon and return the Cloverleaf energy density computed every interval steps
+  //The smeared field is output as V
+  std::vector<RealD> flowMeasureEnergyDensityCloverleaf(GaugeField &V, const GaugeField& U, int measure_interval = 1);
+
+  //Version that does not return the smeared field
+  std::vector<RealD> flowMeasureEnergyDensityCloverleaf(const GaugeField& U, int measure_interval = 1);
 };


@ -83,7 +120,7 @@ public:
 // Implementations
 ////////////////////////////////////////////////////////////////////////////////
 template <class Gimpl>
-void WilsonFlow<Gimpl>::evolve_step(typename Gimpl::GaugeField &U) const{
+void WilsonFlow<Gimpl>::evolve_step(typename Gimpl::GaugeField &U, RealD &tau) const{
  GaugeField Z(U.Grid());
  GaugeField tmp(U.Grid());
  SG.deriv(U, Z);
@ -99,12 +136,13 @@ void WilsonFlow<Gimpl>::evolve_step(typename Gimpl::GaugeField &U) const{
  SG.deriv(U, tmp); Z += tmp;                 // 4/3*(17/36*Z0 -8/9*Z1) +Z2
  Z *= 3.0/4.0;                               // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2
  Gimpl::update_field(Z, U, -2.0*epsilon);    // V(t+e) = exp(ep*Z)*W2
+  tau += epsilon;
 }

 template <class Gimpl>
-void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, RealD maxTau) {
-  if (maxTau - taus < epsilon){
-    epsilon = maxTau-taus;
+void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, RealD &tau, RealD &eps, RealD maxTau) const{
+  if (maxTau - tau < eps){
+    eps = maxTau-tau;
  }
  //std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
  GaugeField Z(U.Grid());
@ -114,95 +152,151 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
  SG.deriv(U, Z);
  Zprime = -Z;
  Z *= 0.25;                                  // Z0 = 1/4 * F(U)
-  Gimpl::update_field(Z, U, -2.0*epsilon);    // U = W1 = exp(ep*Z0)*W0
+  Gimpl::update_field(Z, U, -2.0*eps);    // U = W1 = exp(ep*Z0)*W0

  Z *= -17.0/8.0;
  SG.deriv(U, tmp); Z += tmp;                 // -17/32*Z0 +Z1
  Zprime += 2.0*tmp;
  Z *= 8.0/9.0;                               // Z = -17/36*Z0 +8/9*Z1
-  Gimpl::update_field(Z, U, -2.0*epsilon);    // U_= W2 = exp(ep*Z)*W1
+  Gimpl::update_field(Z, U, -2.0*eps);    // U_= W2 = exp(ep*Z)*W1
    

  Z *= -4.0/3.0;
  SG.deriv(U, tmp); Z += tmp;                 // 4/3*(17/36*Z0 -8/9*Z1) +Z2
  Z *= 3.0/4.0;                               // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2
-  Gimpl::update_field(Z, U, -2.0*epsilon);    // V(t+e) = exp(ep*Z)*W2
+  Gimpl::update_field(Z, U, -2.0*eps);    // V(t+e) = exp(ep*Z)*W2

  // Ramos 
-  Gimpl::update_field(Zprime, Uprime, -2.0*epsilon); // V'(t+e) = exp(ep*Z')*W0
+  Gimpl::update_field(Zprime, Uprime, -2.0*eps); // V'(t+e) = exp(ep*Z')*W0
  // Compute distance as norm^2 of the difference
  GaugeField diffU = U - Uprime;
  RealD diff = norm2(diffU);
  // adjust integration step
    
-  taus += epsilon;
+  tau += eps;
  //std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
    
-  epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.);
+  eps = eps*0.95*std::pow(1e-4/diff,1./3.);
  //std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;

 }

+
+//Compute t^2 <E(t)> for flow time t from the plaquette.
+//Evaluated as 2 * t^2 * S(U) / gSites, where S is a WilsonGaugeAction constructed with argument 3.0
+//(held as a function-local static) and gSites is the global site count of U's grid.
 template <class Gimpl>
-RealD WilsonFlow<Gimpl>::energyDensityPlaquette(unsigned int step, const GaugeField& U) const {
-  RealD td = tau(step);
-  return 2.0 * td * td * SG.S(U)/U.Grid()->gSites();
+RealD WilsonFlow<Gimpl>::energyDensityPlaquette(const RealD t, const GaugeField& U){
+  static WilsonGaugeAction<Gimpl> SG(3.0);
+  return 2.0 * t * t * SG.S(U)/U.Grid()->gSites();
+}
+
+//Compute t^2 <E(t)> for time from the 1x1 cloverleaf form
+//t : flow time; enters the result as t^2
+//U : gauge field on which the field strength is evaluated
+//Returns -Re( t^2 * sum_x sum_{mu<nu} tr( F_munu(x) F_munu(x) ) ) / gSites
+template <class Gimpl>
+RealD WilsonFlow<Gimpl>::energyDensityCloverleaf(const RealD t, const GaugeField& U){
+  typedef typename Gimpl::GaugeLinkField GaugeMat;
+  typedef typename Gimpl::GaugeField GaugeLorentz; //NOTE(review): not used in this function
+
+  assert(Nd == 4); //the mu/nu loop bounds below are hard-coded for 4 dimensions
+  //E = 1/2 tr( F_munu F_munu )
+  //However as  F_numu = -F_munu, only need to sum the trace of the squares of the following 6 field strengths:
+  //F_01 F_02 F_03   F_12 F_13  F_23
+  GaugeMat F(U.Grid());
+  LatticeComplexD R(U.Grid()); //per-site accumulator of tr(F*F)
+  R = Zero();
+  
+  for(int mu=0;mu<3;mu++){
+    for(int nu=mu+1;nu<4;nu++){
+      WilsonLoops<Gimpl>::FieldStrength(F, U, mu, nu);
+      R = R + trace(F*F);
+    }
+  }
+  ComplexD out = sum(R); //sum over sites
+  out = t*t*out / RealD(U.Grid()->gSites()); //normalise by the global site count and multiply by t^2
+  return -real(out); //minus sign necessary for +ve energy
+}
+
+
+//Flow the field U (via smear), writing the smeared field to V, and return the plaquette
+//energy density t^2 <E(t)> collected every measure_interval steps.
+//The measurement list of this object is replaced for the duration of the call and is left
+//empty afterwards: the callback registered here captures the local vector 'out' by reference,
+//so it must not remain registered once this function returns.
+template <class Gimpl>
+std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityPlaquette(GaugeField &V, const GaugeField& U, int measure_interval){
+  std::vector<RealD> out;
+  resetActions();
+  addMeasurement(measure_interval, [&out](int step, RealD t, const typename Gimpl::GaugeField &U){ 
+      std::cout << GridLogMessage << "[WilsonFlow] Computing plaquette energy density for step " << step << std::endl;
+      out.push_back( energyDensityPlaquette(t,U) );
+    });      
+  smear(V,U);
+  resetActions(); //drop the callback referencing 'out' before 'out' goes out of scope
+  return out;
 }

+//Version of flowMeasureEnergyDensityPlaquette that does not return the smeared field:
+//a local copy V of U receives the flowed field and is discarded on return.
 template <class Gimpl>
-RealD WilsonFlow<Gimpl>::energyDensityPlaquette(const GaugeField& U) const {
-  return 2.0 * taus * taus * SG.S(U)/U.Grid()->gSites();
+std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityPlaquette(const GaugeField& U, int measure_interval){
+  GaugeField V(U);
+  return flowMeasureEnergyDensityPlaquette(V,U, measure_interval);
 }

+//Flow the field U (via smear), writing the smeared field to V, and return the cloverleaf
+//energy density t^2 <E(t)> collected every measure_interval steps.
+//The measurement list of this object is replaced for the duration of the call and is left
+//empty afterwards: the callback registered here captures the local vector 'out' by reference,
+//so it must not remain registered once this function returns.
+template <class Gimpl>
+std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityCloverleaf(GaugeField &V, const GaugeField& U, int measure_interval){
+  std::vector<RealD> out;
+  resetActions();
+  addMeasurement(measure_interval, [&out](int step, RealD t, const typename Gimpl::GaugeField &U){ 
+      std::cout << GridLogMessage << "[WilsonFlow] Computing Cloverleaf energy density for step " << step << std::endl;
+      out.push_back( energyDensityCloverleaf(t,U) );
+    });      
+  smear(V,U);
+  resetActions(); //drop the callback referencing 'out' before 'out' goes out of scope
+  return out;
+}
+
+//Version of flowMeasureEnergyDensityCloverleaf that does not return the smeared field:
+//a local copy V of U receives the flowed field and is discarded on return.
+template <class Gimpl>
+std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityCloverleaf(const GaugeField& U, int measure_interval){
+  GaugeField V(U);
+  return flowMeasureEnergyDensityCloverleaf(V,U, measure_interval);
+}
+
+

 //#define WF_TIMING 
-
-
-
+//Fixed-step flow: copy 'in' to 'out' and apply Nstep calls of evolve_step to 'out'.
+//taus accumulates the flow time (evolve_step adds epsilon per call). After every step each
+//registered (interval, callback) pair in 'functions' is invoked when step is a multiple of its interval.
+//NOTE(review): the step timing is taken unconditionally; it is only printed when WF_TIMING is defined.
 template <class Gimpl>
-void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const {
+void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const{
   out = in;
-  for (unsigned int step = 1; step <= Nstep; step++) {
+  RealD taus = 0.;
+  for (unsigned int step = 1; step <= Nstep; step++) { //step indicates the number of smearing steps applied at the time of measurement
     auto start = std::chrono::high_resolution_clock::now();
-    evolve_step(out);
+    evolve_step(out, taus);
     auto end = std::chrono::high_resolution_clock::now();
     std::chrono::duration<double> diff = end - start;
  #ifdef WF_TIMING
     std::cout << "Time to evolve " << diff.count() << " s\n";
  #endif
-    std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
-		  << step << "  " << tau(step) << "  " 
-	      << energyDensityPlaquette(step,out) << std::endl;
-    if( step % measure_interval == 0){
-      std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "
-		<< step << "  " 
-		<< WilsonLoops<PeriodicGimplR>::TopologicalCharge(out) << std::endl;
-    }
+    //Perform measurements
+    //meas.first is the measurement interval in steps, meas.second the callback(step, flow time, field)
+    for(auto const &meas : functions)
+      if( step % meas.first == 0 ) meas.second(step,taus,out);
   }
  }

+//Adaptive-step flow: copy 'in' to 'out' and repeatedly call evolve_step_adaptive until the
+//accumulated flow time taus reaches maxTau. The step size starts at the member epsilon and is
+//held in the local 'eps', which evolve_step_adaptive updates by reference, so the member is untouched.
+//After every step each registered (interval, callback) pair in 'functions' is invoked when
+//step is a multiple of its interval.
+//NOTE(review): being a do-while loop, at least one step is always attempted, even if maxTau <= 0.
 template <class Gimpl>
-void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, RealD maxTau){
+void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, RealD maxTau) const{
   out = in;
-  taus = epsilon;
+  RealD taus = 0.;
+  RealD eps = epsilon;
   unsigned int step = 0;
   do{
     step++;
     //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
-    evolve_step_adaptive(out, maxTau);
-    std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
-		  << step << "  " << taus << "  "
-	      << energyDensityPlaquette(out) << std::endl;
-    if( step % measure_interval == 0){
-      std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "
-		<< step << "  " 
-		<< WilsonLoops<PeriodicGimplR>::TopologicalCharge(out) << std::endl;
-    }
+    evolve_step_adaptive(out, taus, eps, maxTau);
+    //Perform measurements
+    //meas.first is the measurement interval in steps, meas.second the callback(step, flow time, field)
+    for(auto const &meas : functions)
+      if( step % meas.first == 0 ) meas.second(step,taus,out);
   } while (taus < maxTau);
-
-
-
  }

+//Register the two default measurements through addMeasurement:
+// - the plaquette energy density, logged every step together with the step number and flow time
+// - the topological charge, logged every topq_meas_interval steps
+//This function does not itself call resetActions.
+template <class Gimpl>
+void WilsonFlow<Gimpl>::setDefaultMeasurements(int topq_meas_interval){
+  addMeasurement(1, [](int step, RealD t, const typename Gimpl::GaugeField &U){
+      std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "  << step << "  " << t << "  " << energyDensityPlaquette(t,U) << std::endl;
+    });
+  addMeasurement(topq_meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
+      std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "  << step << "  " << WilsonLoops<Gimpl>::TopologicalCharge(U) << std::endl;
+    });
+}
+
+
 NAMESPACE_END(Grid);

--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@ -1251,10 +1251,6 @@ public:
    return 0;
  }

-  void ZeroCounters(void) { };
-
-  void Report(void) {   };
-
 };
 NAMESPACE_END(Grid);

--- a/Grid/tensors/Tensor_extract_merge.h
+++ b/Grid/tensors/Tensor_extract_merge.h
@ -208,5 +208,46 @@ void merge(vobj &vec,const ExtractPointerArray<sobj> &extracted, int offset)
 }


+
+//////////////////////////////////////////////////////////////////////////////////
+//Copy a single lane of a SIMD tensor type from one object to another
+//Output object must be of the same tensor type but may be of a different precision (i.e. it can have a different root data type)
+//
+// vecOut   : destination object; one scalar (lane lane_out) is written in each vector word
+// lane_out : lane index written in vecOut
+// vecIn    : source object
+// lane_in  : lane index read from vecIn
+// NOTE(review): lane indices are not range-checked here; presumably callers guarantee
+//               lane_out < oNsimd and lane_in < iNsimd - confirm
+///////////////////////////////////////////////////////////////////////////////////
+template<class vobjOut, class vobjIn>
+accelerator_inline 
+void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __restrict__ vecIn, int lane_in)
+{
+  static_assert( std::is_same<typename vobjOut::DoublePrecision, typename vobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
+
+  //Number of SIMD vector words making up each object
+  typedef typename vobjOut::vector_type ovector_type;  
+  typedef typename vobjIn::vector_type ivector_type;  
+  constexpr int owords=sizeof(vobjOut)/sizeof(ovector_type);
+  constexpr int iwords=sizeof(vobjIn)/sizeof(ivector_type);
+  static_assert( owords == iwords, "copyLane: Expected number of vector words in input and output objects to be equal" );
+
+  //Scalar types and the element types used to address individual lanes in memory
+  typedef typename vobjOut::scalar_type oscalar_type;  
+  typedef typename vobjIn::scalar_type iscalar_type;  
+  typedef typename ExtractTypeMap<oscalar_type>::extract_type oextract_type;
+  typedef typename ExtractTypeMap<iscalar_type>::extract_type iextract_type;
+
+  typedef oextract_type * opointer;
+  typedef iextract_type * ipointer;
+
+  constexpr int oNsimd=ovector_type::Nsimd();
+  constexpr int iNsimd=ivector_type::Nsimd();
+
+  iscalar_type itmp;
+  oscalar_type otmp;
+
+  //View both objects as flat arrays of lane elements: word w, lane l sits at index l + Nsimd*w
+  opointer __restrict__  op = (opointer)&vecOut;
+  ipointer __restrict__  ip = (ipointer)&vecIn;
+  for(int w=0;w<owords;w++){
+    //Read one input scalar, convert, then write one output scalar; the copies go through memcpy on raw bytes
+    //NOTE(review): presumably sizeof(extract_type) == sizeof(scalar_type) for both precisions - confirm
+    memcpy( (char*)&itmp, (char*)(ip + lane_in + iNsimd*w), sizeof(iscalar_type) );
+    otmp = itmp; //potential precision change
+    memcpy( (char*)(op + lane_out + oNsimd*w), (char*)&otmp, sizeof(oscalar_type) );
+  }
+}
+
+
 NAMESPACE_END(Grid);

--- a/HMC/Mobius2p1fIDSDRGparityEOFA_40ID.cc
+++ b/HMC/Mobius2p1fIDSDRGparityEOFA_40ID.cc
@ -0,0 +1,918 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./HMC/Mobius2p1fIDSDRGparityEOFA_40ID.cc
+
+Copyright (C) 2015-2016
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+
+//Production binary for the 40ID G-parity ensemble
+
+//User-tunable settings for a rational-quotient (RHMC) action, readable/writable through the
+//Grid serialization layer.
+struct RatQuoParameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters,
+                                  double, bnd_lo,
+                                  double, bnd_hi,
+                                  Integer, action_degree,
+                                  double, action_tolerance,
+                                  Integer, md_degree,
+                                  double, md_tolerance,
+                                  Integer, reliable_update_freq,
+                                  Integer, bnd_check_freq);
+
+  //Populate every member with its default value
+  RatQuoParameters() {
+    //bounds of the rational approximation
+    bnd_hi = 30;
+    bnd_lo = 1e-2;
+
+    //approximation used when evaluating the action
+    action_tolerance = 1e-10;
+    action_degree = 10;
+
+    //approximation used in the molecular dynamics
+    md_tolerance = 1e-8;
+    md_degree = 10;
+
+    //frequencies
+    reliable_update_freq = 50;
+    bnd_check_freq = 20;
+  }
+
+  //Copy the bounds, degrees, tolerances and bounds-check frequency into a RationalActionParams.
+  //reliable_update_freq is not transferred by this function.
+  void Export(RationalActionParams &into) const{
+    into.BoundsCheckFreq = bnd_check_freq;
+
+    into.hi = bnd_hi;
+    into.lo = bnd_lo;
+
+    into.md_tolerance = md_tolerance;
+    into.md_degree = md_degree;
+
+    into.action_tolerance = action_tolerance;
+    into.action_degree = action_degree;
+  }
+};
+
+//User-tunable settings for one EOFA (exact one-flavour) action: the rational approximation
+//parameters plus solver tolerances for the action evaluation and the molecular dynamics.
+struct EOFAparameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(EOFAparameters,
+                                  OneFlavourRationalParams, rat_params,
+                                  double, action_tolerance,
+                                  double, action_mixcg_inner_tolerance,
+                                  double, md_tolerance,
+                                  double, md_mixcg_inner_tolerance);
+
+  //Populate every member with its default value
+  EOFAparameters() {
+    //rational approximation
+    rat_params.precision= 50;
+    rat_params.degree   = 14;
+    rat_params.tolerance= 1.0e-9;
+    rat_params.MaxIter  = 50000;
+    rat_params.hi = 25.0;
+    rat_params.lo = 1.0;
+
+    //solver tolerances: action evaluation
+    action_tolerance = 1e-10;
+    action_mixcg_inner_tolerance = 1e-8;
+
+    //solver tolerances: molecular dynamics
+    md_mixcg_inner_tolerance = 1e-8;
+    md_tolerance = 1e-8;
+  }
+};
+
+//Top-level run parameters for the evolution, readable/writable through the Grid serialization layer.
+struct EvolParameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters,
+                                  Integer, StartTrajectory,
+                                  Integer, Trajectories,
+                                  Integer, SaveInterval,
+                                  Integer, Steps,
+                                  RealD, TrajectoryLength,
+                                  bool, MetropolisTest,
+                                  std::string, StartingType,
+                                  std::vector<Integer>, GparityDirs,
+                                  std::vector<EOFAparameters>, eofa_l,
+                                  RatQuoParameters, rat_quo_s,
+                                  RatQuoParameters, rat_quo_DSDR);
+
+  //Defaults. eofa_l, rat_quo_s and rat_quo_DSDR are left as default-constructed.
+  EvolParameters() {
+    //trajectory bookkeeping
+    Trajectories      = 50;
+    StartTrajectory   = 0;
+    SaveInterval = 5;
+
+    //integration
+    TrajectoryLength = 1.0;
+    Steps = 5;
+
+    //For initial thermalization; afterwards user should switch Metropolis on and use StartingType=CheckpointStart
+    StartingType      = "ColdStart";
+    MetropolisTest    = false;
+
+    //boundary conditions
+    GparityDirs.resize(3, 1); //1 for G-parity, 0 for periodic
+  }
+};
+
+//Return true if 'fn' can be opened for reading (the input stream is in a good state after opening)
+bool fileExists(const std::string &fn){
+  std::ifstream probe(fn);
+  const bool opened_ok = probe.good();
+  return opened_ok;
+}
+
+
+
+
+//Settings consumed by computeEigenvalues: Chebyshev filter parameters and the
+//implicitly-restarted Lanczos vector counts and tolerance.
+struct LanczosParameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
+                                  double, alpha,
+                                  double, beta,
+                                  double, mu,
+                                  int, ord,
+                                  int, n_stop,
+                                  int, n_want,
+                                  int, n_use,
+                                  double, tolerance);
+
+  //Populate every member with its default value
+  LanczosParameters() {
+    //Chebyshev filter
+    ord = 100;
+    mu = 0;
+    beta = 5;
+    alpha = 35;
+
+    //Lanczos
+    tolerance = 1e-6;
+    n_use = 15;
+    n_want = 10;
+    n_stop = 10;
+  }
+};
+
+
+
+//Run an implicitly restarted Lanczos on the odd-checkerboard Schur operator of 'action' and
+//print the resulting eigenvalues.
+// param_file : XML file holding a "LanczosParameters" record. If it does not exist, defaults are
+//              used and world-rank 0 writes a template to param_file + ".templ"
+// Grid/rbGrid: full and red-black grids on which the random start vector is built
+// latt       : gauge field imported into 'action' (expected to be already initialised)
+// rng        : parallel RNG used for the Gaussian start vector
+//The Chebyshev filter is built on [beta^2, alpha^2] with order ord+1; mu must be zero.
+template<typename FermionActionD, typename FermionFieldD>
+void computeEigenvalues(std::string param_file,
+                        GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
+                        FermionActionD &action, GridParallelRNG &rng){
+  
+  LanczosParameters params;
+  if(fileExists(param_file)){
+    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
+    Grid::XmlReader rd(param_file);
+    read(rd, "LanczosParameters", params);
+  }else if(!GlobalSharedMemory::WorldRank){
+    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
+    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
+    Grid::XmlWriter wr(param_file + ".templ");
+    write(wr, "LanczosParameters", params);
+  }
+
+  //Gaussian start vector restricted to the odd checkerboard
+  FermionFieldD gauss_o(rbGrid);
+  FermionFieldD gauss(Grid);
+  gaussian(rng, gauss);
+  pickCheckerboard(Odd, gauss_o, gauss);
+
+  action.ImportGauge(latt);
+
+  SchurDiagMooeeOperator<FermionActionD, FermionFieldD> hermop(action);
+  PlainHermOp<FermionFieldD> hermop_wrap(hermop);
+  //ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
+  assert(params.mu == 0.0);
+
+  Chebyshev<FermionFieldD> Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1);
+  FunctionHermOp<FermionFieldD> Cheb_wrap(Cheb, hermop);
+
+  std::cout << "IRL: alpha=" << params.alpha << " beta=" << params.beta << " mu=" << params.mu << " ord=" << params.ord << std::endl;
+  ImplicitlyRestartedLanczos<FermionFieldD> IRL(Cheb_wrap, hermop_wrap, params.n_stop, params.n_want, params.n_use, params.tolerance, 50000);
+
+  std::vector<RealD> eval(params.n_use);
+  std::vector<FermionFieldD> evec(params.n_use, rbGrid);
+  int Nconv = 0; //written by IRL.calc; initialised so it never holds an indeterminate value
+  IRL.calc(eval, evec, gauss_o, Nconv);
+
+  //eval is handed to IRL.calc by non-const reference, so do not assume it still holds n_want
+  //entries: never index past its current size
+  int n_print = params.n_want;
+  if((int)eval.size() < n_print) n_print = (int)eval.size();
+
+  std::cout << "Eigenvalues:" << std::endl;
+  for(int i=0;i<n_print;i++){
+    std::cout << i << " " << eval[i] << std::endl;
+  }
+}
+
+
+//Check the quality of the RHMC approx
+//action_or_md toggles checking the action (0), MD (1) or both (2) setups
+//Grid/rbGrid : full and red-black grids on which the random test vector is built
+//latt        : gauge field imported into both operators
+//numOp/denOp : numerator and denominator fermion operators of the rational quotient
+//rhmc        : object providing the ApproxNeg(Half)PowerAction / ApproxNeg(Half)PowerMD approximations
+//inv_pow     : the approximations are checked for powers -1/inv_pow and -1/(2*inv_pow)
+//quark_descr : label used only in the log output
+template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
+void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
+	       FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
+	       int inv_pow, const std::string &quark_descr, int action_or_md){
+  assert(action_or_md == 0 || action_or_md == 1 || action_or_md == 2);
+  
+  //Gaussian test vector restricted to the odd checkerboard
+  FermionFieldD gauss_o(rbGrid);
+  FermionFieldD gauss(Grid);
+  gaussian(rng, gauss);
+  pickCheckerboard(Odd, gauss_o, gauss);
+
+  numOp.ImportGauge(latt);
+  denOp.ImportGauge(latt);
+
+  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
+  SchurDifferentiableOperator<FermionImplPolicyD> MdagM(numOp);
+  SchurDifferentiableOperator<FermionImplPolicyD> VdagV(denOp);
+
+  //Power-method estimate of the largest eigenvalue of each operator (logged only)
+  PowerMethod<FermionFieldD> power_method;
+  RealD lambda_max;
+
+  std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " numerator" << std::endl;
+
+  lambda_max = power_method(MdagM,gauss_o);
+  std::cout << GridLogMessage << "Got lambda_max "<<lambda_max<<std::endl;
+
+  std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " denominator" << std::endl;
+  lambda_max = power_method(VdagV,gauss_o);
+  std::cout << GridLogMessage << "Got lambda_max "<<lambda_max<<std::endl;
+
+  //Action approximations
+  if(action_or_md == 0 || action_or_md == 2){
+    std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+    InversePowerBoundsCheck(inv_pow, 50000, 1e16, MdagM,gauss_o, rhmc.ApproxNegPowerAction); //use large tolerance to prevent exit on fail; we are trying to tune here!
+    std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+
+    std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
+    InversePowerBoundsCheck(2*inv_pow, 50000, 1e16, MdagM,gauss_o, rhmc.ApproxNegHalfPowerAction);
+    std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
+
+    std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+    InversePowerBoundsCheck(inv_pow, 50000, 1e16, VdagV,gauss_o, rhmc.ApproxNegPowerAction);
+    std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+
+    std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
+    InversePowerBoundsCheck(2*inv_pow, 50000, 1e16, VdagV,gauss_o, rhmc.ApproxNegHalfPowerAction);
+    std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
+  }
+
+  std::cout << "-------------------------------------------------------------------------------" << std::endl;
+
+  //Molecular-dynamics approximations
+  if(action_or_md == 1 || action_or_md == 2){
+    std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+    InversePowerBoundsCheck(inv_pow, 50000, 1e16, MdagM,gauss_o, rhmc.ApproxNegPowerMD); 
+    std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+
+    std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
+    InversePowerBoundsCheck(2*inv_pow, 50000, 1e16, MdagM,gauss_o, rhmc.ApproxNegHalfPowerMD);
+    std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
+
+    std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+    InversePowerBoundsCheck(inv_pow, 50000, 1e16, VdagV,gauss_o, rhmc.ApproxNegPowerMD);
+    std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+
+    std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
+    InversePowerBoundsCheck(2*inv_pow, 50000, 1e16, VdagV,gauss_o, rhmc.ApproxNegHalfPowerMD);
+    std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
+  }
+}
+
+
+//Exercise the EOFA pseudofermion action's own checks on gauge field 'latt'.
+//A Gaussian field eta scaled by sqrt(1/2) is passed to EOFA.refresh, then EOFA.S is evaluated;
+//the value returned by S is discarded.
+// EOFA  : action under test
+// FGrid : grid on which eta is allocated
+// rng   : parallel RNG used to draw eta
+template<typename FermionImplPolicy>
+void checkEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
+               GridCartesian* FGrid, GridParallelRNG &rng, const LatticeGaugeFieldD &latt){
+  std::cout << GridLogMessage << "Starting EOFA action/bounds check" << std::endl;
+  typename FermionImplPolicy::FermionField eta(FGrid);
+  RealD scale = std::sqrt(0.5);
+  gaussian(rng,eta); eta = eta * scale;
+
+  //Use the inbuilt check
+  EOFA.refresh(latt, eta);
+  EOFA.S(latt);
+  //Closing message now matches the opening one (it previously read "upper action/bounds check")
+  std::cout << GridLogMessage << "Finished EOFA action/bounds check" << std::endl;
+}
+
+
+//Adapter presenting the EOFA operator Meofa as a LinearOperatorBase.
+//Only HermOp is implemented; every other entry point asserts.
+template<typename FermionImplPolicy>
+class EOFAlinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
+  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA; //action providing Meofa
+  LatticeGaugeFieldD &U;                                            //gauge field passed to Meofa
+public:
+  typedef typename FermionImplPolicy::FermionField Field;
+
+  EOFAlinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &eofa_action, LatticeGaugeFieldD &gauge)
+    : EOFA(eofa_action), U(gauge)
+  {}
+
+  //The one supported operation: apply Meofa on the stored gauge field
+  void HermOp(const Field &in, Field &out){
+    EOFA.Meofa(U, in, out);
+  }
+
+  //Unsupported operations
+  void Op     (const Field &in, Field &out){ assert(0); }
+  void AdjOp  (const Field &in, Field &out){ assert(0); }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
+  void OpDiag (const Field &in, Field &out){ assert(0); }
+  void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); }
+};
+
+//Estimate and log the largest eigenvalue of the EOFA operator by running the power method
+//on Meofa, starting from a Gaussian random vector on FGrid.
+template<typename FermionImplPolicy>
+void upperBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
+                    GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
+  typedef typename FermionImplPolicy::FermionField FermField;
+
+  std::cout << GridLogMessage << "Starting EOFA upper bound compute" << std::endl;
+
+  //Operator wrapper (binds references only)
+  EOFAlinop<FermionImplPolicy> meofa_op(EOFA, latt);
+
+  //Random start vector
+  FermField start_vec(FGrid);
+  gaussian(rng,start_vec);
+
+  PowerMethod<FermField> estimator;
+  auto top_eval = estimator(meofa_op,start_vec);
+  std::cout << GridLogMessage << "Upper bound of EOFA operator " << top_eval << std::endl;
+}
+
+//Applications of M^{-1} cost the same as M for EOFA!
+//Adapter presenting the inverse EOFA operator MeofaInv as a LinearOperatorBase.
+//Only HermOp is implemented; every other entry point asserts.
+template<typename FermionImplPolicy>
+class EOFAinvLinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
+  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA; //action providing MeofaInv
+  LatticeGaugeFieldD &U;                                            //gauge field passed to MeofaInv
+public:
+  typedef typename FermionImplPolicy::FermionField Field;
+
+  EOFAinvLinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &eofa_action, LatticeGaugeFieldD &gauge)
+    : EOFA(eofa_action), U(gauge)
+  {}
+
+  //The one supported operation: apply MeofaInv on the stored gauge field
+  void HermOp(const Field &in, Field &out){
+    EOFA.MeofaInv(U, in, out);
+  }
+
+  //Unsupported operations
+  void Op     (const Field &in, Field &out){ assert(0); }
+  void AdjOp  (const Field &in, Field &out){ assert(0); }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
+  void OpDiag (const Field &in, Field &out){ assert(0); }
+  void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); }
+};
+
+//Estimate and log the smallest eigenvalue of the EOFA operator: the power method is run on
+//MeofaInv from a Gaussian random vector on FGrid, and the reciprocal of its result is reported.
+template<typename FermionImplPolicy>
+void lowerBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
+                    GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
+  typedef typename FermionImplPolicy::FermionField FermField;
+
+  std::cout << GridLogMessage << "Starting EOFA lower bound compute using power method on M^{-1}. Inverse of highest eigenvalue is the lowest eigenvalue of M" << std::endl;
+
+  //Operator wrapper (binds references only)
+  EOFAinvLinop<FermionImplPolicy> meofa_inv_op(EOFA, latt);
+
+  //Random start vector
+  FermField start_vec(FGrid);
+  gaussian(rng,start_vec);
+
+  PowerMethod<FermField> estimator;
+  auto top_eval_of_inverse = estimator(meofa_inv_op,start_vec);
+  std::cout << GridLogMessage << "Lower bound of EOFA operator " << 1./top_eval_of_inverse << std::endl;
+}
+
+
+NAMESPACE_BEGIN(Grid);
+
+  //OperatorFunction wrapper that solves with a mixed-precision (restarted) conjugate gradient.
+  //On each call the single-precision gauge field of FermOpF is refreshed from FermOpD, then a
+  //MixedPrecisionConjugateGradient built from LinOpF/LinOpD is applied to (src,psi).
+  //The operator passed to operator() must wrap the same matrix as LinOpD (asserted).
+  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
+  class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
+  public:
+    typedef typename FermionOperatorD::FermionField FieldD;
+    typedef typename FermionOperatorF::FermionField FieldF;
+
+    using OperatorFunction<FieldD>::operator();
+
+    RealD   Tolerance;
+    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
+    Integer MaxInnerIterations;
+    Integer MaxOuterIterations;
+    GridBase* SinglePrecGrid4; //Grid for single-precision fields
+    GridBase* SinglePrecGrid5; //Grid for single-precision fields
+    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
+
+    FermionOperatorF &FermOpF;
+    FermionOperatorD &FermOpD;
+    SchurOperatorF &LinOpF;
+    SchurOperatorD &LinOpD;
+
+    Integer TotalInnerIterations; //Number of inner CG iterations
+    Integer TotalOuterIterations; //Number of restarts
+    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
+
+    //Initialisers are listed in member declaration order (the order in which they actually run).
+    //The iteration counters start at zero; nothing in this wrapper updates them.
+    MixedPrecisionConjugateGradientOperatorFunction(RealD tol, 
+						    Integer maxinnerit, 
+						    Integer maxouterit, 
+						    GridBase* _sp_grid4, 
+						    GridBase* _sp_grid5, 
+						    FermionOperatorF &_FermOpF,
+						    FermionOperatorD &_FermOpD,
+						    SchurOperatorF   &_LinOpF,
+						    SchurOperatorD   &_LinOpD): 
+      Tolerance(tol), 
+      InnerTolerance(tol), 
+      MaxInnerIterations(maxinnerit), 
+      MaxOuterIterations(maxouterit), 
+      SinglePrecGrid4(_sp_grid4),
+      SinglePrecGrid5(_sp_grid5),
+      OuterLoopNormMult(100.),
+      FermOpF(_FermOpF),
+      FermOpD(_FermOpD),
+      LinOpF(_LinOpF),
+      LinOpD(_LinOpD),
+      TotalInnerIterations(0),
+      TotalOuterIterations(0),
+      TotalFinalStepIterations(0)
+    { 
+    }
+
+    //Solve LinOpU psi = src. LinOpU is assumed to be a SchurOperatorD (unchecked static_cast)
+    //wrapping the same matrix object as LinOpD.
+    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
+
+      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
+
+      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
+      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+
+      //Bring the single-precision gauge field (and its checkerboards) up to date with the double-precision one
+      precisionChange(FermOpF.Umu, FermOpD.Umu);
+
+      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
+      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
+
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Make a mixed precision conjugate gradient
+      ////////////////////////////////////////////////////////////////////////////////////
+      MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
+      MPCG.InnerTolerance = InnerTolerance;
+      std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
+      MPCG(src,psi);
+    }
+  };
+
+
+
+  //OperatorFunction wrapper that solves with the reliable-update mixed-precision conjugate gradient.
+  //On each call the single-precision gauge field of FermOpF is refreshed from FermOpD, then a
+  //ConjugateGradientReliableUpdate built from LinOpF/LinOpD is applied to (src,psi).
+  //The operator passed to operator() must wrap the same matrix as LinOpD (asserted).
+  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
+  class MixedPrecisionReliableUpdateConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
+  public:
+    typedef typename FermionOperatorD::FermionField FieldD;
+    typedef typename FermionOperatorF::FermionField FieldF;
+
+    using OperatorFunction<FieldD>::operator();
+
+    RealD Tolerance;
+    Integer MaxIterations;
+
+    RealD Delta; //reliable update parameter
+
+    GridBase* SinglePrecGrid4; //Grid for single-precision fields
+    GridBase* SinglePrecGrid5; //Grid for single-precision fields
+
+    FermionOperatorF &FermOpF;
+    FermionOperatorD &FermOpD;
+    SchurOperatorF &LinOpF;
+    SchurOperatorD &LinOpD;
+    
+    //Initialisers are listed in member declaration order (the order in which they actually run)
+    MixedPrecisionReliableUpdateConjugateGradientOperatorFunction(RealD tol, 
+								  RealD delta,
+								  Integer maxit, 
+								  GridBase* _sp_grid4, 
+								  GridBase* _sp_grid5, 
+								  FermionOperatorF &_FermOpF,
+								  FermionOperatorD &_FermOpD,
+								  SchurOperatorF   &_LinOpF,
+								  SchurOperatorD   &_LinOpD): 
+      Tolerance(tol), 
+      MaxIterations(maxit), 
+      Delta(delta),
+      SinglePrecGrid4(_sp_grid4),
+      SinglePrecGrid5(_sp_grid5),
+      FermOpF(_FermOpF),
+      FermOpD(_FermOpD),
+      LinOpF(_LinOpF),
+      LinOpD(_LinOpD)
+    { 
+    }
+
+    //Solve LinOpU psi = src. LinOpU is assumed to be a SchurOperatorD (unchecked static_cast)
+    //wrapping the same matrix object as LinOpD.
+    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
+
+      std::cout << GridLogMessage << " Mixed precision reliable CG update wrapper operator() "<<std::endl;
+
+      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
+      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+
+      //Bring the single-precision gauge field (and its checkerboards) up to date with the double-precision one
+      precisionChange(FermOpF.Umu, FermOpD.Umu);
+
+      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
+      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
+
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Make a mixed precision conjugate gradient
+      ////////////////////////////////////////////////////////////////////////////////////
+
+      ConjugateGradientReliableUpdate<FieldD,FieldF> MPCG(Tolerance,MaxIterations,Delta,SinglePrecGrid5,LinOpF,LinOpD);
+      std::cout << GridLogMessage << "Calling mixed precision reliable update Conjugate Gradient" <<std::endl;
+      MPCG(src,psi);
+    }
+  };
+
+
+
+NAMESPACE_END(Grid);
+
+
+
+
+
+// Entry point of the HMC evolution binary.
+//
+// Flow: initialise Grid, parse the command line, read the XML parameter file, build the
+// HMC runner, grids and actions (light EOFA Hasenbusch chain, strange RHMC quotient, DSDR
+// RHMC quotient, gauge action), then either run one of the optional checks and exit, or
+// run the HMC.
+//
+// Command-line options handled here (unrecognised arguments are skipped):
+//   --param_file <file>          XML parameter file (default "params.xml")
+//   --read_check                 initialise the gauge field and RNGs, then exit
+//   --set_seeds <ser> <par>      serial and parallel RNG seeds, e.g. 1.2.3.4 5.6.7.8
+//   --tune_rhmc_s <int>          run checkRHMC on the strange quotient (int is forwarded as action_or_md)
+//   --eigenrange_s <file>        run computeEigenvalues on the strange numerator operator
+//   --tune_rhmc_DSDR <int>       run checkRHMC on the DSDR quotient
+//   --eigenrange_DSDR <file>     run computeEigenvalues on the DSDR numerator operator
+//   --check_eofa <n>             run checkEOFA on Hasenbusch term n (-1 for all terms)
+//   --upper_bound_eofa <n>       run upperBoundEOFA on Hasenbusch term n
+//   --lower_bound_eofa <n>       run lowerBoundEOFA on Hasenbusch term n
+//
+// Returns 0 on completion; calls exit(1) when the input parameters are invalid.
+int main(int argc, char **argv) {
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  // here make a routine to print all the relevant information on the run
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+  std::string param_file = "params.xml";
+  bool file_load_check = false;
+
+  std::string serial_seeds = "1 2 3 4 5";
+  std::string parallel_seeds = "6 7 8 9 10";
+
+  //Convert a seed argument (parsed by GridCmdOptionIntVector) into the space-separated
+  //string form stored in RNGModuleParameters. An argument that yields no integers is
+  //rejected instead of indexing an empty vector.
+  auto seedsToString = [](const std::string &arg) -> std::string {
+    std::vector<int> vals;
+    GridCmdOptionIntVector(arg, vals);
+    if(vals.empty()){
+      std::cerr << "Error in --set_seeds: could not parse any integers from \"" << arg << "\"" << std::endl;
+      exit(1);
+    }
+    std::stringstream ss;
+    for(size_t j=0;j<vals.size();j++){
+      if(j != 0) ss << " ";
+      ss << vals[j];
+    }
+    return ss.str();
+  };
+
+  //The index is advanced inside the body because options consume differing numbers of arguments
+  for(int i=1;i<argc;){
+    std::string sarg(argv[i]);
+    if(sarg == "--param_file"){
+      assert(i!=argc-1);
+      param_file = argv[i+1];
+      i+=2;
+    }else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
+      file_load_check = true;
+      i++;
+    }else if(sarg == "--set_seeds"){ //set the rng seeds. Expects two vector args, e.g.  --set_seeds 1.2.3.4 5.6.7.8
+      assert(i < argc-2);
+      serial_seeds   = seedsToString(argv[i+1]);
+      parallel_seeds = seedsToString(argv[i+2]);
+      i+=3;
+      std::cout << GridLogMessage << "Set serial seeds to " << serial_seeds << std::endl;
+      std::cout << GridLogMessage << "Set parallel seeds to " << parallel_seeds << std::endl;
+      
+    }else{
+      i++;
+    }
+  }
+
+  
+  //Read the user parameters
+  EvolParameters user_params;
+  
+  if(fileExists(param_file)){
+    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
+    Grid::XmlReader rd(param_file);
+    read(rd, "Params", user_params);
+  }else{
+    //Only the root rank writes the template, but every rank stops here: continuing on the
+    //other ranks with default-constructed parameters would fail the eofa_l size check below.
+    if(!GlobalSharedMemory::WorldRank){
+      std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
+      std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
+      {
+	Grid::XmlWriter wr(param_file + ".templ");
+	write(wr, "Params", user_params);
+      }
+      std::cout << GridLogMessage << " Done" << std::endl;
+    }
+    Grid_finalize();
+    return 0;
+  }
+
+  //Check the parameters
+  if(user_params.GparityDirs.size() != Nd-1){
+    std::cerr << "Error in input parameters: expect GparityDirs to have size = " << Nd-1 << std::endl;
+    exit(1);
+  }
+  for(int i=0;i<Nd-1;i++)
+    if(user_params.GparityDirs[i] != 0 && user_params.GparityDirs[i] != 1){
+      std::cerr << "Error in input parameters: expect GparityDirs values to be 0 (periodic) or 1 (G-parity)" << std::endl;
+      exit(1);
+    }
+
+
+  typedef GparityMobiusEOFAFermionD EOFAactionD;
+  typedef GparityMobiusFermionD FermionActionD;
+  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
+  typedef typename FermionActionD::FermionField FermionFieldD;
+
+  typedef GparityMobiusEOFAFermionF EOFAactionF;
+  typedef GparityMobiusFermionF FermionActionF;
+  typedef typename FermionActionF::Impl_t FermionImplPolicyF;
+  typedef typename FermionActionF::FermionField FermionFieldF;
+
+  typedef GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD,FermionImplPolicyF> MixedPrecRHMC;
+  typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicyD> DoublePrecRHMC;
+
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  IntegratorParameters MD;
+  typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
+  MD.name    = std::string("MinimumNorm2");
+
+  // typedef ConjugateHMCRunnerD<ForceGradient> HMCWrapper;
+  // MD.name    = std::string("ForceGradient");
+  
+  MD.MDsteps = user_params.Steps;
+  MD.trajL   = user_params.TrajectoryLength;
+
+  typedef HMCWrapper::ImplPolicy GaugeImplPolicy;
+  
+  HMCparameters HMCparams;
+  HMCparams.StartTrajectory  = user_params.StartTrajectory;
+  HMCparams.Trajectories     = user_params.Trajectories;
+  HMCparams.NoMetropolisUntil= 0;
+  HMCparams.StartingType     = user_params.StartingType;
+  HMCparams.MetropolisTest = user_params.MetropolisTest;
+  HMCparams.MD = MD;
+  HMCWrapper TheHMC(HMCparams);
+
+  // Grid from the command line arguments --grid and --mpi
+  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
+
+  CheckpointerParameters CPparams;
+  CPparams.config_prefix = "ckpoint_lat";
+  CPparams.rng_prefix    = "ckpoint_rng";
+  CPparams.saveInterval  = user_params.SaveInterval;
+  CPparams.format        = "IEEE64BIG";
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  //Note that checkpointing saves the RNG state so that this initialization is required only for the very first configuration
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds = serial_seeds;
+  RNGpar.parallel_seeds = parallel_seeds;
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  typedef PlaquetteMod<GaugeImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+  //////////////////////////////////////////////
+  //aiming for ainv=1.723 GeV
+  //                                  me         bob
+  //Estimated  a(ml+mres) [40ID] = 0.001305    0.00131
+  //           a(mh+mres) [40ID] = 0.035910    0.03529
+  //Estimate Ls=12, b+c=2  mres~0.0011
+
+  //1/24/2022 initial mres measurement gives mres=0.001,  adjusted light quark mass to 0.0003 from 0.0001
+  
+  const int Ls      = 12;
+  Real beta         = 1.848;
+  Real light_mass   = 0.0003;
+  Real strange_mass = 0.0342;
+  Real pv_mass      = 1.0;
+  RealD M5  = 1.8;
+  RealD mobius_scale = 2.; //b+c
+
+  RealD mob_bmc = 1.0;
+  RealD mob_b = (mobius_scale + mob_bmc)/2.;
+  RealD mob_c = (mobius_scale - mob_bmc)/2.;
+
+  std::cout << GridLogMessage
+	    << "Ensemble parameters:" << std::endl
+	    << "Ls=" << Ls << std::endl
+	    << "beta=" << beta << std::endl
+	    << "light_mass=" << light_mass << std::endl
+	    << "strange_mass=" << strange_mass << std::endl
+	    << "mobius_scale=" << mobius_scale << std::endl;
+  
+  //Setup the Grids
+  auto UGridD   = TheHMC.Resources.GetCartesian();
+  auto UrbGridD = TheHMC.Resources.GetRBCartesian();
+  auto FGridD     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);
+  auto FrbGridD   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);
+
+  GridCartesian* UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
+  GridRedBlackCartesian* UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
+  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
+  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
+
+  ConjugateIwasakiGaugeActionD GaugeAction(beta);
+
+  // temporarily need a gauge field
+  LatticeGaugeFieldD Ud(UGridD);
+  LatticeGaugeFieldF Uf(UGridF);
+ 
+  //Setup the BCs
+  FermionActionD::ImplParams Params;
+  for(int i=0;i<Nd-1;i++) Params.twists[i] = user_params.GparityDirs[i]; //G-parity directions
+  Params.twists[Nd-1] = 1; //APBC in time direction
+
+  std::vector<int> dirs4(Nd);
+  for(int i=0;i<Nd-1;i++) dirs4[i] = user_params.GparityDirs[i];
+  dirs4[Nd-1] = 0; //periodic gauge BC in time
+
+  GaugeImplPolicy::setDirections(dirs4); //gauge BC
+
+  //Run optional gauge field checksum checker and exit
+  if(file_load_check){
+    TheHMC.initializeGaugeFieldAndRNGs(Ud);
+    std::cout << GridLogMessage << " Done" << std::endl;
+    Grid_finalize();
+    return 0;
+  }
+
+
+  ////////////////////////////////////
+  // Collect actions
+  ////////////////////////////////////
+  ActionLevel<HMCWrapper::Field> Level1(1); //light quark + strange quark
+  ActionLevel<HMCWrapper::Field> Level2(4); //DSDR
+  ActionLevel<HMCWrapper::Field> Level3(2); //gauge
+
+
+  /////////////////////////////////////////////////////////////
+  // Light EOFA action
+  // have to be careful with the parameters, cf. Test_dwf_gpforce_eofa.cc
+  /////////////////////////////////////////////////////////////
+  typedef SchurDiagMooeeOperator<EOFAactionD,FermionFieldD> EOFAschuropD;
+  typedef SchurDiagMooeeOperator<EOFAactionF,FermionFieldF> EOFAschuropF;
+  typedef ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction<FermionImplPolicyD, FermionImplPolicyF> EOFAmixPrecPFaction;
+  typedef MixedPrecisionConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_mxCG;
+  typedef MixedPrecisionReliableUpdateConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_relupCG;
+
+
+  std::vector<RealD> eofa_light_masses = { light_mass ,  0.004,   0.016,   0.064,   0.256    };
+  std::vector<RealD> eofa_pv_masses =    { 0.004       , 0.016,   0.064,   0.256,   1.0      };
+  const int n_light_hsb = 5;
+  //Runtime check (not an assert) so a bad parameter file cannot cause out-of-range access in NDEBUG builds
+  if(user_params.eofa_l.size() != static_cast<size_t>(n_light_hsb)){
+    std::cerr << "Error in input parameters: expect eofa_l to have size = " << n_light_hsb << std::endl;
+    exit(1);
+  }
+  
+  //One pseudofermion action per Hasenbusch term; kept so the optional checks below can address them by index
+  std::vector<EOFAmixPrecPFaction*> EOFA_pfactions(n_light_hsb);
+
+  //NOTE(review): everything created with new in this loop is never deleted; presumably
+  //intentional, as the objects are referenced by the actions until the program exits
+  for(int i=0;i<n_light_hsb;i++){
+    RealD iml = eofa_light_masses[i];
+    RealD ipv = eofa_pv_masses[i];
+
+    EOFAactionD* LopD = new EOFAactionD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, iml, iml, ipv, 0.0, -1, M5, mob_b, mob_c, Params);
+    EOFAactionF* LopF = new EOFAactionF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, iml, iml, ipv, 0.0, -1, M5, mob_b, mob_c, Params);
+    EOFAactionD* RopD = new EOFAactionD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, ipv, iml, ipv, -1.0, 1, M5, mob_b, mob_c, Params);
+    EOFAactionF* RopF = new EOFAactionF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, ipv, iml, ipv, -1.0, 1, M5, mob_b, mob_c, Params);
+
+    EOFAschuropD* linopL_D = new EOFAschuropD(*LopD);
+    EOFAschuropD* linopR_D = new EOFAschuropD(*RopD);
+    
+    EOFAschuropF* linopL_F = new EOFAschuropF(*LopF);
+    EOFAschuropF* linopR_F = new EOFAschuropF(*RopF);
+
+#if 1
+    //Note reusing user_params.eofa_l.action(|md)_mixcg_inner_tolerance  as Delta for now
+    EOFA_relupCG* ActionMCG_L = new EOFA_relupCG(user_params.eofa_l[i].action_tolerance, user_params.eofa_l[i].action_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
+    EOFA_relupCG* ActionMCG_R = new EOFA_relupCG(user_params.eofa_l[i].action_tolerance, user_params.eofa_l[i].action_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
+
+    EOFA_relupCG* DerivMCG_L = new EOFA_relupCG(user_params.eofa_l[i].md_tolerance, user_params.eofa_l[i].md_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
+    EOFA_relupCG* DerivMCG_R = new EOFA_relupCG(user_params.eofa_l[i].md_tolerance, user_params.eofa_l[i].md_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
+
+#else
+    EOFA_mxCG* ActionMCG_L = new EOFA_mxCG(user_params.eofa_l[i].action_tolerance, 50000, 1000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
+    ActionMCG_L->InnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance;
+    
+    EOFA_mxCG* ActionMCG_R = new EOFA_mxCG(user_params.eofa_l[i].action_tolerance, 50000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
+    ActionMCG_R->InnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance;
+    
+    EOFA_mxCG* DerivMCG_L = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 50000, 1000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
+    DerivMCG_L->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance;
+    
+    EOFA_mxCG* DerivMCG_R = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 50000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
+    DerivMCG_R->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance;
+    
+    std::cout << GridLogMessage << "Set EOFA action solver action tolerance outer=" << ActionMCG_L->Tolerance << " inner=" << ActionMCG_L->InnerTolerance << std::endl;
+    std::cout << GridLogMessage << "Set EOFA MD solver tolerance outer=" << DerivMCG_L->Tolerance << " inner=" << DerivMCG_L->InnerTolerance << std::endl;
+#endif
+
+    EOFAmixPrecPFaction* EOFA = new EOFAmixPrecPFaction(*LopF, *RopF,
+							*LopD, *RopD, 
+							*ActionMCG_L, *ActionMCG_R, 
+							*ActionMCG_L, *ActionMCG_R, 
+							*DerivMCG_L, *DerivMCG_R, 
+							user_params.eofa_l[i].rat_params, true);
+    EOFA_pfactions[i] = EOFA;
+    Level1.push_back(EOFA);
+  }
+
+  ////////////////////////////////////
+  // Strange action
+  ////////////////////////////////////
+  FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD,strange_mass,M5,mob_b,mob_c,Params);
+  FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD, pv_mass,M5,mob_b,mob_c,Params);
+
+  FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF,strange_mass,M5,mob_b,mob_c,Params);
+  FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF, pv_mass,M5,mob_b,mob_c,Params);
+
+  RationalActionParams rat_act_params_s;
+  rat_act_params_s.inv_pow  = 4; // (M^dag M)^{1/4}
+  rat_act_params_s.precision= 60;
+  rat_act_params_s.MaxIter  = 50000;
+  user_params.rat_quo_s.Export(rat_act_params_s);
+  std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_s.BoundsCheckFreq << " trajectories (avg)" << std::endl;
+
+  //MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq); 
+  DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s); 
+  Level1.push_back(&Quotient_s);  
+
+  ///////////////////////////////////
+  // DSDR action
+  ///////////////////////////////////
+  RealD dsdr_mass=-1.8;   
+  //Use same DSDR twists as https://arxiv.org/pdf/1208.4412.pdf
+  RealD dsdr_epsilon_f = 0.02; //numerator (in determinant)
+  RealD dsdr_epsilon_b = 0.5; 
+  GparityWilsonTMFermionD Numerator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_f, Params);
+  GparityWilsonTMFermionF Numerator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_f, Params);
+
+  GparityWilsonTMFermionD Denominator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_b, Params);
+  GparityWilsonTMFermionF Denominator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_b, Params);
+ 
+  RationalActionParams rat_act_params_DSDR;
+  rat_act_params_DSDR.inv_pow  = 2; // (M^dag M)^{1/2}
+  rat_act_params_DSDR.precision= 60;
+  rat_act_params_DSDR.MaxIter  = 50000;
+  user_params.rat_quo_DSDR.Export(rat_act_params_DSDR);
+  std::cout << GridLogMessage << "DSDR quark bounds check every " << rat_act_params_DSDR.BoundsCheckFreq << " trajectories (avg)" << std::endl;
+
+  DoublePrecRHMC Quotient_DSDR(Denominator_DSDR_D, Numerator_DSDR_D, rat_act_params_DSDR);
+  Level2.push_back(&Quotient_DSDR);
+
+  /////////////////////////////////////////////////////////////
+  // Gauge action
+  /////////////////////////////////////////////////////////////
+  Level3.push_back(&GaugeAction);
+
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+  TheHMC.TheAction.push_back(Level3);
+  std::cout << GridLogMessage << " Action complete "<< std::endl;
+
+
+  //Action tuning
+  bool tune_rhmc_s = false;
+  bool eigenrange_s = false;
+  bool tune_rhmc_DSDR = false;
+  bool eigenrange_DSDR = false;
+  bool check_eofa = false;
+  bool upper_bound_eofa = false;
+  bool lower_bound_eofa = false;
+
+  std::string lanc_params_s;
+  std::string lanc_params_DSDR;
+  //Initialised so they never hold indeterminate values; each is only read when its option was given
+  int tune_rhmc_s_action_or_md = 0;
+  int tune_rhmc_DSDR_action_or_md = 0;
+  int eofa_which_hsb = -1;
+
+  //Second pass over the arguments: every index is visited, so option values are also
+  //compared against the option names (and match none of them)
+  for(int i=1;i<argc;i++){
+    std::string sarg(argv[i]);
+    if(sarg == "--tune_rhmc_s"){
+      assert(i < argc-1);
+      tune_rhmc_s=true;
+      tune_rhmc_s_action_or_md = std::stoi(argv[i+1]);
+    }
+    else if(sarg == "--eigenrange_s"){
+      assert(i < argc-1);
+      eigenrange_s=true;
+      lanc_params_s = argv[i+1];
+    }
+    else if(sarg == "--tune_rhmc_DSDR"){
+      assert(i < argc-1);
+      tune_rhmc_DSDR=true;
+      tune_rhmc_DSDR_action_or_md = std::stoi(argv[i+1]);
+    }
+    else if(sarg == "--eigenrange_DSDR"){
+      assert(i < argc-1);
+      eigenrange_DSDR=true;
+      lanc_params_DSDR = argv[i+1];
+    }
+    else if(sarg == "--check_eofa"){
+      assert(i < argc-1);
+      check_eofa = true;
+      eofa_which_hsb = std::stoi(argv[i+1]); //-1 indicates all hasenbusch
+      assert(eofa_which_hsb == -1 || (eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb) );
+    }
+    else if(sarg == "--upper_bound_eofa"){
+      assert(i < argc-1);
+      upper_bound_eofa = true;
+      eofa_which_hsb = std::stoi(argv[i+1]);
+      assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
+    }
+    else if(sarg == "--lower_bound_eofa"){
+      assert(i < argc-1);
+      lower_bound_eofa = true;      
+      eofa_which_hsb = std::stoi(argv[i+1]);
+      assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
+    }
+  }
+  if(tune_rhmc_s || eigenrange_s || tune_rhmc_DSDR || eigenrange_DSDR ||check_eofa || upper_bound_eofa || lower_bound_eofa) {
+    std::cout << GridLogMessage << "Running checks" << std::endl;
+    TheHMC.initializeGaugeFieldAndRNGs(Ud);
+
+    //std::cout << GridLogMessage << "EOFA action solver action tolerance outer=" << ActionMCG_L.Tolerance << " inner=" << ActionMCG_L.InnerTolerance << std::endl;
+    //std::cout << GridLogMessage << "EOFA MD solver tolerance outer=" << DerivMCG_L.Tolerance << " inner=" << DerivMCG_L.InnerTolerance << std::endl;
+
+    if(check_eofa){
+      if(eofa_which_hsb >= 0){
+	std::cout << GridLogMessage << "Starting checking EOFA Hasenbusch " << eofa_which_hsb << std::endl;
+	checkEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
+	std::cout << GridLogMessage << "Finished checking EOFA Hasenbusch " << eofa_which_hsb << std::endl;
+      }else{
+	for(int i=0;i<n_light_hsb;i++){
+	  std::cout << GridLogMessage << "Starting checking EOFA Hasenbusch " << i << std::endl;
+	  checkEOFA(*EOFA_pfactions[i], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
+	  std::cout << GridLogMessage << "Finished checking EOFA Hasenbusch " << i << std::endl;
+	}
+      }
+    }	  
+    if(upper_bound_eofa) upperBoundEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
+    if(lower_bound_eofa) lowerBoundEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
+    if(eigenrange_s) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG());
+    if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_s)>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange",  tune_rhmc_s_action_or_md);
+    if(eigenrange_DSDR) computeEigenvalues<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField>(lanc_params_DSDR, UGridD, UrbGridD, Ud, Numerator_DSDR_D, TheHMC.Resources.GetParallelRNG());
+    if(tune_rhmc_DSDR) checkRHMC<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField, decltype(Quotient_DSDR)>(UGridD, UrbGridD, Ud, Numerator_DSDR_D, Denominator_DSDR_D, Quotient_DSDR, TheHMC.Resources.GetParallelRNG(), 2, "DSDR", tune_rhmc_DSDR_action_or_md);
+
+
+    std::cout << GridLogMessage << " Done" << std::endl;
+    Grid_finalize();
+    return 0;
+  }
+
+
+  //Run the HMC
+  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
+  TheHMC.Run();
+
+  std::cout << GridLogMessage << " Done" << std::endl;
+  Grid_finalize();
+  return 0;
+} // main
--- a/HMC/Mobius2p1fIDSDRGparityEOFA_48ID.cc
+++ b/HMC/Mobius2p1fIDSDRGparityEOFA_48ID.cc
@ -0,0 +1,873 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./HMC/Mobius2p1fIDSDRGparityEOFA_48ID.cc
+
+Copyright (C) 2015-2016
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+
+//Production binary for the 48ID G-parity ensemble
+
+// Serializable, user-tunable settings for one rational-quotient (RHMC) term:
+// approximation bounds, degree/tolerance for the action and MD approximations,
+// a reliable-update frequency and a bounds-check frequency.
+struct RatQuoParameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters,
+				  double, bnd_lo,
+				  double, bnd_hi,
+				  Integer, action_degree,
+				  double, action_tolerance,
+				  Integer, md_degree,
+				  double, md_tolerance,
+				  Integer, reliable_update_freq,
+				  Integer, bnd_check_freq);
+
+  // Defaults; these are what gets written out when a parameter template is generated
+  RatQuoParameters():
+    bnd_lo(1e-2),
+    bnd_hi(30),
+    action_degree(10),
+    action_tolerance(1e-10),
+    md_degree(10),
+    md_tolerance(1e-8),
+    reliable_update_freq(50),
+    bnd_check_freq(20)
+  {}
+
+  // Copy the settings into a RationalActionParams instance.
+  // reliable_update_freq has no counterpart there and is not exported;
+  // fields of 'into' not listed here are left untouched.
+  void Export(RationalActionParams &into) const{
+    into.lo              = bnd_lo;
+    into.hi              = bnd_hi;
+    into.BoundsCheckFreq = bnd_check_freq;
+
+    into.action_degree    = action_degree;
+    into.action_tolerance = action_tolerance;
+
+    into.md_degree    = md_degree;
+    into.md_tolerance = md_tolerance;
+  }
+};
+
+// Serializable settings for one EOFA term: the rational approximation parameters
+// plus outer/inner solver tolerances for the action and MD evaluations.
+struct EOFAparameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(EOFAparameters,
+				  OneFlavourRationalParams, rat_params,
+				  double, action_tolerance,
+				  double, action_mixcg_inner_tolerance,
+				  double, md_tolerance,
+				  double, md_mixcg_inner_tolerance);
+
+  // Defaults for the solver tolerances and for the rational approximation
+  EOFAparameters():
+    action_tolerance(1e-10),
+    action_mixcg_inner_tolerance(1e-8),
+    md_tolerance(1e-8),
+    md_mixcg_inner_tolerance(1e-8)
+  {
+    rat_params.lo        = 1.0;
+    rat_params.hi        = 25.0;
+    rat_params.degree    = 14;
+    rat_params.precision = 50;
+    rat_params.MaxIter   = 10000;
+    rat_params.tolerance = 1.0e-9;
+  }
+};
+
+// Top-level serializable run parameters: trajectory bookkeeping, integrator settings,
+// start type, G-parity directions and the per-action parameter sets.
+struct EvolParameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters,
+                                  Integer, StartTrajectory,
+                                  Integer, Trajectories,
+				  Integer, SaveInterval,
+				  Integer, Steps,
+				  RealD, TrajectoryLength,
+                                  bool, MetropolisTest,
+				  std::string, StartingType,
+				  std::vector<Integer>, GparityDirs,
+				  std::vector<EOFAparameters>, eofa_l,
+				  RatQuoParameters, rat_quo_s,
+				  RatQuoParameters, rat_quo_DSDR);
+
+  // Defaults are intended for initial thermalization (Metropolis off, cold start);
+  // afterwards the user should switch Metropolis on and use StartingType=CheckpointStart.
+  // eofa_l is left empty and the two RatQuoParameters keep their own defaults.
+  EvolParameters():
+    StartTrajectory(0),
+    Trajectories(50),
+    SaveInterval(5),
+    Steps(5),
+    TrajectoryLength(1.0),
+    MetropolisTest(false),
+    StartingType("ColdStart"),
+    GparityDirs(3, 1) //three entries, each 1: 1 for G-parity, 0 for periodic
+  {}
+};
+
+// Returns true when a file named fn can be opened for reading, false otherwise.
+bool fileExists(const std::string &fn){
+  std::ifstream probe;
+  probe.open(fn);
+  const bool opened_ok = probe.good();
+  return opened_ok;
+}
+
+
+
+
+// Serializable settings consumed by computeEigenvalues: the Chebyshev polynomial
+// parameters (alpha, beta, mu, ord) and the Lanczos sizes and tolerance.
+struct LanczosParameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
+				  double, alpha,
+				  double, beta,
+				  double, mu,
+				  int, ord,
+				  int, n_stop,
+				  int, n_want,
+				  int, n_use,
+				  double, tolerance);
+
+  // Default values, also used when writing a parameter template
+  LanczosParameters():
+    alpha(35),
+    beta(5),
+    mu(0),
+    ord(100),
+    n_stop(10),
+    n_want(10),
+    n_use(15),
+    tolerance(1e-6)
+  {}
+};
+
+
+
+// Run an implicitly restarted Lanczos, with a Chebyshev polynomial filter, on the
+// SchurDiagMooee operator built from 'action', and print the first n_want eigenvalues.
+//   param_file   : XML file holding a LanczosParameters block. If it does not exist, rank 0
+//                  writes "<param_file>.templ" and the defaults are used
+//   Grid, rbGrid : full and red-black grids on which the fermion fields are created
+//   latt         : gauge field imported into 'action' (expected to be initialised already)
+//   action       : fermion operator
+//   rng          : parallel RNG used to draw the Gaussian start vector
+template<typename FermionActionD, typename FermionFieldD>
+void computeEigenvalues(std::string param_file,
+			GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,
+			FermionActionD &action, GridParallelRNG &rng){
+
+  //Load the Lanczos settings, or emit a template from the root rank
+  LanczosParameters lanczos;
+  const bool have_param_file = fileExists(param_file);
+  if(have_param_file){
+    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
+    Grid::XmlReader reader(param_file);
+    read(reader, "LanczosParameters", lanczos);
+  }else if(GlobalSharedMemory::WorldRank == 0){
+    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
+    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
+    Grid::XmlWriter writer(param_file + ".templ");
+    write(writer, "LanczosParameters", lanczos);
+  }
+
+  //Gaussian start vector, restricted to the odd checkerboard
+  FermionFieldD start_odd(rbGrid);
+  FermionFieldD start_full(Grid);
+  gaussian(rng, start_full);
+  pickCheckerboard(Odd, start_odd, start_full);
+
+  action.ImportGauge(latt);
+
+  typedef SchurDiagMooeeOperator<FermionActionD, FermionFieldD> SchurOp;
+  SchurOp schur_op(action);
+  PlainHermOp<FermionFieldD> plain_op(schur_op);
+  //ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
+  assert(lanczos.mu == 0.0);
+
+  //The Chebyshev window is given by the squares of beta and alpha
+  const double cheb_lo = lanczos.beta*lanczos.beta;
+  const double cheb_hi = lanczos.alpha*lanczos.alpha;
+  Chebyshev<FermionFieldD> cheb_poly(cheb_lo, cheb_hi, lanczos.ord+1);
+  FunctionHermOp<FermionFieldD> cheb_op(cheb_poly, schur_op);
+
+  std::cout << "IRL: alpha=" << lanczos.alpha << " beta=" << lanczos.beta << " mu=" << lanczos.mu << " ord=" << lanczos.ord << std::endl;
+  ImplicitlyRestartedLanczos<FermionFieldD> irl(cheb_op, plain_op, lanczos.n_stop, lanczos.n_want, lanczos.n_use, lanczos.tolerance, 10000);
+
+  std::vector<RealD> evals(lanczos.n_use);
+  std::vector<FermionFieldD> evecs(lanczos.n_use, rbGrid);
+  int n_converged;
+  irl.calc(evals, evecs, start_odd, n_converged);
+
+  std::cout << "Eigenvalues:" << std::endl;
+  int idx = 0;
+  while(idx < lanczos.n_want){
+    std::cout << idx << " " << evals[idx] << std::endl;
+    idx++;
+  }
+}
+
+
+//Check the quality of the RHMC rational approximations held by 'rhmc'.
+//First prints the power-method estimate of the largest eigenvalue of the numerator and
+//denominator operators, then runs InversePowerBoundsCheck for powers -1/inv_pow and
+//-1/(2*inv_pow) on both operators.
+//  Grid, rbGrid : full and red-black grids for the random test vector
+//  latt         : gauge field imported into both operators (expected to be initialised already)
+//  numOp, denOp : numerator and denominator fermion operators
+//  rhmc         : action object providing the ApproxNeg(Half)Power{Action,MD} approximations
+//  quark_descr  : label used in the log output
+//  action_or_md : check the action (0), MD (1) or both (2) approximations
+template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
+void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,
+	       FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
+	       int inv_pow, const std::string &quark_descr, int action_or_md){
+  assert(action_or_md == 0 || action_or_md == 1 || action_or_md == 2);
+
+  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
+  typedef SchurDifferentiableOperator<FermionImplPolicyD> SchurOp;
+
+  const int half_pow = 2*inv_pow;
+
+  //Prints one "<stage>: Checking quality ..." progress line
+  auto report = [&quark_descr](const char *stage, const char *kind, const char *which, int power){
+    std::cout << stage << ": Checking quality of RHMC " << kind << " approx for " << quark_descr
+	      << " quark " << which << " and power -1/" << power << std::endl;
+  };
+
+  //Random test vector on the odd checkerboard
+  FermionFieldD noise_odd(rbGrid);
+  FermionFieldD noise_full(Grid);
+  gaussian(rng, noise_full);
+  pickCheckerboard(Odd, noise_odd, noise_full);
+
+  numOp.ImportGauge(latt);
+  denOp.ImportGauge(latt);
+
+  SchurOp MdagM(numOp);
+  SchurOp VdagV(denOp);
+
+  PowerMethod<FermionFieldD> power_method;
+  RealD lambda_max;
+
+  std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " numerator" << std::endl;
+  lambda_max = power_method(MdagM,noise_odd);
+  std::cout << GridLogMessage << "Got lambda_max "<<lambda_max<<std::endl;
+
+  std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " denominator" << std::endl;
+  lambda_max = power_method(VdagV,noise_odd);
+  std::cout << GridLogMessage << "Got lambda_max "<<lambda_max<<std::endl;
+
+  //use large tolerance (1e16) to prevent exit on fail; we are trying to tune here!
+  if(action_or_md == 0 || action_or_md == 2){
+    report("Starting", "action", "numerator", inv_pow);
+    InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM,noise_odd, rhmc.ApproxNegPowerAction);
+    report("Finished", "action", "numerator", inv_pow);
+
+    report("Starting", "action", "numerator", half_pow);
+    InversePowerBoundsCheck(half_pow, 10000, 1e16, MdagM,noise_odd, rhmc.ApproxNegHalfPowerAction);
+    report("Finished", "action", "numerator", half_pow);
+
+    report("Starting", "action", "denominator", inv_pow);
+    InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV,noise_odd, rhmc.ApproxNegPowerAction);
+    report("Finished", "action", "denominator", inv_pow);
+
+    report("Starting", "action", "denominator", half_pow);
+    InversePowerBoundsCheck(half_pow, 10000, 1e16, VdagV,noise_odd, rhmc.ApproxNegHalfPowerAction);
+    report("Finished", "action", "denominator", half_pow);
+  }
+
+  std::cout << "-------------------------------------------------------------------------------" << std::endl;
+
+  if(action_or_md == 1 || action_or_md == 2){
+    report("Starting", "MD", "numerator", inv_pow);
+    InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM,noise_odd, rhmc.ApproxNegPowerMD);
+    report("Finished", "MD", "numerator", inv_pow);
+
+    report("Starting", "MD", "numerator", half_pow);
+    InversePowerBoundsCheck(half_pow, 10000, 1e16, MdagM,noise_odd, rhmc.ApproxNegHalfPowerMD);
+    report("Finished", "MD", "numerator", half_pow);
+
+    report("Starting", "MD", "denominator", inv_pow);
+    InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV,noise_odd, rhmc.ApproxNegPowerMD);
+    report("Finished", "MD", "denominator", inv_pow);
+
+    report("Starting", "MD", "denominator", half_pow);
+    InversePowerBoundsCheck(half_pow, 10000, 1e16, VdagV,noise_odd, rhmc.ApproxNegHalfPowerMD);
+    report("Finished", "MD", "denominator", half_pow);
+  }
+}
+
+
+// Exercise an EOFA action on the gauge field 'latt': draw a Gaussian field scaled by
+// sqrt(0.5), pass it to EOFA.refresh() and then evaluate EOFA.S().
+//   FGrid : grid on which the random field is created
+//   rng   : parallel RNG supplying the noise
+template<typename FermionImplPolicy>
+void checkEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
+	       GridCartesian* FGrid, GridParallelRNG &rng, const LatticeGaugeFieldD &latt){
+  typedef typename FermionImplPolicy::FermionField FermionField;
+
+  std::cout << GridLogMessage << "Starting EOFA action/bounds check" << std::endl;
+
+  FermionField eta(FGrid);
+  gaussian(rng,eta);
+  RealD scale = std::sqrt(0.5);
+  eta = eta * scale;
+
+  //Use the inbuilt check
+  EOFA.refresh(latt, eta);
+  EOFA.S(latt);
+
+  std::cout << GridLogMessage << "Finished EOFA upper action/bounds check" << std::endl;
+}
+
+
+// Linear-operator adaptor whose HermOp applies EOFA.Meofa on the stored gauge field.
+// Only HermOp is implemented; every other entry point asserts.
+// Holds references to the action and the gauge field, so both must outlive this object.
+template<typename FermionImplPolicy>
+class EOFAlinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
+  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA;
+  LatticeGaugeFieldD &U;
+public:
+  typedef typename FermionImplPolicy::FermionField Field;
+
+  EOFAlinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &eofa_action, LatticeGaugeFieldD &gauge)
+    : EOFA(eofa_action), U(gauge)
+  {}
+
+  // The one supported operation
+  void HermOp(const Field &in, Field &out){ EOFA.Meofa(U, in, out); }
+
+  // Not supported by this adaptor
+  void Op     (const Field &in, Field &out){ assert(0); }
+  void AdjOp  (const Field &in, Field &out){ assert(0); }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
+
+  void OpDiag (const Field &in, Field &out){ assert(0); }
+  void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); }
+};
+
+// Print the PowerMethod result for the operator EOFAlinop (HermOp = EOFA.Meofa),
+// started from a Gaussian random field created on FGrid.
+template<typename FermionImplPolicy>
+void upperBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
+		    GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
+  typedef typename FermionImplPolicy::FermionField FermionField;
+
+  std::cout << GridLogMessage << "Starting EOFA upper bound compute" << std::endl;
+
+  EOFAlinop<FermionImplPolicy> meofa_op(EOFA, latt);
+
+  FermionField start_vec(FGrid);
+  gaussian(rng,start_vec);
+
+  PowerMethod<FermionField> power_method;
+  const auto top_eval = power_method(meofa_op,start_vec);
+
+  std::cout << GridLogMessage << "Upper bound of EOFA operator " << top_eval << std::endl;
+}
+
+//Adapter that presents the inverse of the EOFA operator as a LinearOperatorBase.
+//Applications of M^{-1} cost the same as M for EOFA!
+//
+//Only HermOp() is usable: it forwards to MeofaInv() of the wrapped action on the stored
+//gauge field. Every other entry point aborts through assert(0).
+//The action and the gauge field are held by reference and must outlive this object.
+template<typename FermionImplPolicy>
+class EOFAinvLinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
+  typedef ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> ActionType;
+
+  ActionType         &action_; //wrapped EOFA action
+  LatticeGaugeFieldD &gauge_;  //gauge field passed to MeofaInv
+public:
+  typedef typename FermionImplPolicy::FermionField Field;
+
+  EOFAinvLinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA, LatticeGaugeFieldD &U)
+    : action_(EOFA), gauge_(U)
+  {}
+
+  //The one implemented operation: out = MeofaInv(U) in
+  void HermOp(const Field &in, Field &out){
+    action_.MeofaInv(gauge_, in, out);
+  }
+
+  //Not implemented: calling any of these is a programming error
+  void Op           (const Field &in, Field &out){ assert(0); }
+  void AdjOp        (const Field &in, Field &out){ assert(0); }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
+  void OpDiag       (const Field &in, Field &out){ assert(0); }
+  void OpDir        (const Field &in, Field &out,int dir,int disp){ assert(0); }
+  void OpDirAll     (const Field &in, std::vector<Field> &out){ assert(0); }
+};
+
+//Print a lower bound for the EOFA operator on one gauge configuration.
+//
+//The action is wrapped in an EOFAinvLinop (whose HermOp applies MeofaInv) and Grid's
+//PowerMethod is run on it from a Gaussian random start vector allocated on FGrid. The
+//reciprocal of the value returned by the power method is logged as the lower bound.
+//
+//  EOFA  : action whose operator is probed
+//  FGrid : grid on which the start vector lives
+//  rng   : parallel RNG used to fill the start vector
+//  latt  : gauge configuration the operator is evaluated on
+template<typename FermionImplPolicy>
+void lowerBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
+                    GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt)
+{
+  typedef typename FermionImplPolicy::FermionField FermionField;
+
+  std::cout << GridLogMessage << "Starting EOFA lower bound compute using power method on M^{-1}. Inverse of highest eigenvalue is the lowest eigenvalue of M" << std::endl;
+
+  EOFAinvLinop<FermionImplPolicy> meofa_inv_op(EOFA, latt);
+
+  FermionField start_vec(FGrid);
+  gaussian(rng, start_vec);
+
+  PowerMethod<FermionField> power_iteration;
+  auto largest_of_inverse = power_iteration(meofa_inv_op, start_vec);
+
+  std::cout << GridLogMessage << "Lower bound of EOFA operator " << 1./largest_of_inverse << std::endl;
+}
+
+
+NAMESPACE_BEGIN(Grid);
+
+  //OperatorFunction that solves through a MixedPrecisionConjugateGradient.
+  //
+  //The object stores references to a single- and a double-precision fermion operator and
+  //to the matching Schur operators. On every call the double-precision gauge field held by
+  //FermOpD is converted into FermOpF, the even/odd single-precision links are re-picked,
+  //and the solve is delegated to MixedPrecisionConjugateGradient built on LinOpF/LinOpD.
+  //All referenced operators must outlive this object.
+  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
+  class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
+  public:
+    typedef typename FermionOperatorD::FermionField FieldD;
+    typedef typename FermionOperatorF::FermionField FieldF;
+
+    using OperatorFunction<FieldD>::operator();
+
+    RealD   Tolerance;
+    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
+    Integer MaxInnerIterations;
+    Integer MaxOuterIterations;
+    GridBase* SinglePrecGrid4; //Grid for single-precision fields
+    GridBase* SinglePrecGrid5; //Grid for single-precision fields
+    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
+
+    FermionOperatorF &FermOpF;
+    FermionOperatorD &FermOpD;
+    SchurOperatorF &LinOpF;
+    SchurOperatorD &LinOpD;
+
+    //Iteration counters. They start at zero; operator() in this class does not update them.
+    Integer TotalInnerIterations; //Number of inner CG iterations
+    Integer TotalOuterIterations; //Number of restarts
+    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
+
+    //  tol        : outer tolerance; also the initial value of InnerTolerance
+    //  maxinnerit : maximum inner iterations passed to the mixed-precision CG
+    //  maxouterit : maximum outer iterations passed to the mixed-precision CG
+    //  _sp_grid4  : single-precision grid, stored in SinglePrecGrid4
+    //  _sp_grid5  : single-precision grid, stored in SinglePrecGrid5 and passed to the mixed-precision CG
+    //  _FermOpF/_FermOpD : single/double precision fermion operators
+    //  _LinOpF/_LinOpD   : single/double precision Schur operators used by the solver
+    //
+    //The initializers are listed in member declaration order, which is the order in which
+    //C++ runs them regardless of how they are written.
+    MixedPrecisionConjugateGradientOperatorFunction(RealD tol, 
+                                                    Integer maxinnerit, 
+                                                    Integer maxouterit, 
+                                                    GridBase* _sp_grid4, 
+                                                    GridBase* _sp_grid5, 
+                                                    FermionOperatorF &_FermOpF,
+                                                    FermionOperatorD &_FermOpD,
+                                                    SchurOperatorF   &_LinOpF,
+                                                    SchurOperatorD   &_LinOpD): 
+      Tolerance(tol), 
+      InnerTolerance(tol), 
+      MaxInnerIterations(maxinnerit), 
+      MaxOuterIterations(maxouterit), 
+      SinglePrecGrid4(_sp_grid4),
+      SinglePrecGrid5(_sp_grid5),
+      OuterLoopNormMult(100.),
+      FermOpF(_FermOpF),
+      FermOpD(_FermOpD),
+      LinOpF(_LinOpF),
+      LinOpD(_LinOpD),
+      TotalInnerIterations(0),
+      TotalOuterIterations(0),
+      TotalFinalStepIterations(0)
+    { 
+    }
+
+    //Solve for psi given src.
+    //LinOpU is only used for a consistency check: it is cast to SchurOperatorD and its _Mat
+    //must be the very same object as LinOpD._Mat. The solve itself runs on LinOpF/LinOpD.
+    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
+
+      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
+
+      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
+      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+
+      //Refresh the single-precision gauge field from the double-precision one
+      precisionChange(FermOpF.Umu, FermOpD.Umu);
+
+      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
+      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
+
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Make a mixed precision conjugate gradient
+      ////////////////////////////////////////////////////////////////////////////////////
+      MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
+      MPCG.InnerTolerance = InnerTolerance;
+      std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
+      MPCG(src,psi);
+    }
+  };
+
+
+  //OperatorFunction that solves through a ConjugateGradientReliableUpdate.
+  //
+  //The object stores references to a single- and a double-precision fermion operator and
+  //to the matching Schur operators. On every call the double-precision gauge field held by
+  //FermOpD is converted into FermOpF, the even/odd single-precision links are re-picked,
+  //and the solve is delegated to ConjugateGradientReliableUpdate built on LinOpF/LinOpD.
+  //All referenced operators must outlive this object.
+  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
+  class MixedPrecisionReliableUpdateConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
+  public:
+    typedef typename FermionOperatorD::FermionField FieldD;
+    typedef typename FermionOperatorF::FermionField FieldF;
+
+    using OperatorFunction<FieldD>::operator();
+
+    RealD Tolerance;
+    Integer MaxIterations;
+
+    RealD Delta; //reliable update parameter
+
+    GridBase* SinglePrecGrid4; //Grid for single-precision fields
+    GridBase* SinglePrecGrid5; //Grid for single-precision fields
+
+    FermionOperatorF &FermOpF;
+    FermionOperatorD &FermOpD;
+    SchurOperatorF &LinOpF;
+    SchurOperatorD &LinOpD;
+    
+    //  tol       : solver tolerance
+    //  delta     : reliable update parameter
+    //  maxit     : maximum number of iterations
+    //  _sp_grid4 : single-precision grid, stored in SinglePrecGrid4
+    //  _sp_grid5 : single-precision grid, stored in SinglePrecGrid5 and passed to the solver
+    //  _FermOpF/_FermOpD : single/double precision fermion operators
+    //  _LinOpF/_LinOpD   : single/double precision Schur operators used by the solver
+    //
+    //The initializers are listed in member declaration order, which is the order in which
+    //C++ runs them regardless of how they are written.
+    MixedPrecisionReliableUpdateConjugateGradientOperatorFunction(RealD tol, 
+                                                                  RealD delta,
+                                                                  Integer maxit, 
+                                                                  GridBase* _sp_grid4, 
+                                                                  GridBase* _sp_grid5, 
+                                                                  FermionOperatorF &_FermOpF,
+                                                                  FermionOperatorD &_FermOpD,
+                                                                  SchurOperatorF   &_LinOpF,
+                                                                  SchurOperatorD   &_LinOpD): 
+      Tolerance(tol), 
+      MaxIterations(maxit), 
+      Delta(delta),
+      SinglePrecGrid4(_sp_grid4),
+      SinglePrecGrid5(_sp_grid5),
+      FermOpF(_FermOpF),
+      FermOpD(_FermOpD),
+      LinOpF(_LinOpF),
+      LinOpD(_LinOpD)
+    { 
+    }
+
+    //Solve for psi given src.
+    //LinOpU is only used for a consistency check: it is cast to SchurOperatorD and its _Mat
+    //must be the very same object as LinOpD._Mat. The solve itself runs on LinOpF/LinOpD.
+    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
+
+      std::cout << GridLogMessage << " Mixed precision reliable CG update wrapper operator() "<<std::endl;
+
+      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
+      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+
+      //Refresh the single-precision gauge field from the double-precision one
+      precisionChange(FermOpF.Umu, FermOpD.Umu);
+
+      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
+      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
+
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Make a mixed precision conjugate gradient
+      ////////////////////////////////////////////////////////////////////////////////////
+
+      ConjugateGradientReliableUpdate<FieldD,FieldF> MPCG(Tolerance,MaxIterations,Delta,SinglePrecGrid5,LinOpF,LinOpD);
+      std::cout << GridLogMessage << "Calling mixed precision reliable update Conjugate Gradient" <<std::endl;
+      MPCG(src,psi);
+    }
+  };
+
+
+
+NAMESPACE_END(Grid);
+
+
+
+
+
+//Driver for the G-parity HMC evolution.
+//
+//Command line options handled here (in addition to the ones consumed by Grid_init):
+//  --param_file <file>      XML parameter file to read (default "params.xml")
+//  --read_check             initialize the gauge field and RNGs, then exit
+//  --tune_rhmc_s <int>      run checkRHMC for the strange quark quotient
+//  --eigenrange_s <arg>     run computeEigenvalues for the strange quark numerator
+//  --tune_rhmc_DSDR <int>   run checkRHMC for the DSDR quotient
+//  --eigenrange_DSDR <arg>  run computeEigenvalues for the DSDR numerator
+//  --check_eofa <idx>       run checkEOFA on Hasenbusch term idx (-1 for all)
+//  --upper_bound_eofa <idx> run upperBoundEOFA on Hasenbusch term idx
+//  --lower_bound_eofa <idx> run lowerBoundEOFA on Hasenbusch term idx
+//If any of the tuning/check options is given, the checks are run and the program exits
+//without running the HMC.
+//
+//Returns 0 on normal completion; calls exit(1) if the GparityDirs parameter is malformed.
+int main(int argc, char **argv) {
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  // here make a routine to print all the relevant information on the run
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+  std::string param_file = "params.xml";
+  bool file_load_check = false;
+  for(int i=1;i<argc;i++){
+    std::string sarg(argv[i]);
+    if(sarg == "--param_file"){
+      assert(i!=argc-1);
+      param_file = argv[i+1];
+    }else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
+      file_load_check = true;
+    }
+  }
+
+  //Read the user parameters
+  EvolParameters user_params;
+  
+  if(fileExists(param_file)){
+    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
+    Grid::XmlReader rd(param_file);
+    read(rd, "Params", user_params);
+  }else{
+    //Without a parameter file there is nothing to run. Only world rank 0 writes the
+    //template, but every rank has to finalize and leave here; otherwise the remaining
+    //ranks would carry on with default parameters after rank 0 has finalized.
+    if(!GlobalSharedMemory::WorldRank){
+      std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
+      std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
+      {
+        Grid::XmlWriter wr(param_file + ".templ");
+        write(wr, "Params", user_params);
+      }
+      std::cout << GridLogMessage << " Done" << std::endl;
+    }
+    Grid_finalize();
+    return 0;
+  }
+
+  //Check the parameters
+  if(user_params.GparityDirs.size() != Nd-1){
+    std::cerr << "Error in input parameters: expect GparityDirs to have size = " << Nd-1 << std::endl;
+    exit(1);
+  }
+  for(int i=0;i<Nd-1;i++)
+    if(user_params.GparityDirs[i] != 0 && user_params.GparityDirs[i] != 1){
+      std::cerr << "Error in input parameters: expect GparityDirs values to be 0 (periodic) or 1 (G-parity)" << std::endl;
+      exit(1);
+    }
+
+
+  typedef GparityMobiusEOFAFermionD EOFAactionD;
+  typedef GparityMobiusFermionD FermionActionD;
+  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
+  typedef typename FermionActionD::FermionField FermionFieldD;
+
+  typedef GparityMobiusEOFAFermionF EOFAactionF;
+  typedef GparityMobiusFermionF FermionActionF;
+  typedef typename FermionActionF::Impl_t FermionImplPolicyF;
+  typedef typename FermionActionF::FermionField FermionFieldF;
+
+  typedef GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD,FermionImplPolicyF> MixedPrecRHMC;
+  typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicyD> DoublePrecRHMC;
+
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  IntegratorParameters MD;
+  typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
+  typedef HMCWrapper::ImplPolicy GaugeImplPolicy;
+  MD.name    = std::string("MinimumNorm2");
+  MD.MDsteps = user_params.Steps;
+  MD.trajL   = user_params.TrajectoryLength;
+
+  HMCparameters HMCparams;
+  HMCparams.StartTrajectory  = user_params.StartTrajectory;
+  HMCparams.Trajectories     = user_params.Trajectories;
+  HMCparams.NoMetropolisUntil= 0;
+  HMCparams.StartingType     = user_params.StartingType;
+  HMCparams.MetropolisTest = user_params.MetropolisTest;
+  HMCparams.MD = MD;
+  HMCWrapper TheHMC(HMCparams);
+
+  // Grid from the command line arguments --grid and --mpi
+  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
+
+  CheckpointerParameters CPparams;
+  CPparams.config_prefix = "ckpoint_lat";
+  CPparams.rng_prefix    = "ckpoint_rng";
+  CPparams.saveInterval  = user_params.SaveInterval;
+  CPparams.format        = "IEEE64BIG";
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  //Note that checkpointing saves the RNG state so that this initialization is required only for the very first configuration
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  typedef PlaquetteMod<GaugeImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+  //////////////////////////////////////////////
+
+  //aiming for ainv=2.068             me          Bob
+  //Estimated  a(ml+mres) [48ID] = 0.001048    0.00104 
+  //           a(mh+mres) [48ID] = 0.028847    0.02805
+  //Estimate Ls=12, b+c=2  mres~0.0003
+
+  const int Ls      = 12;
+  Real beta         = 1.946;
+  Real light_mass   = 0.00074;   //0.00104 - mres_approx;
+  Real strange_mass = 0.02775;    //0.02805 - mres_approx
+  Real pv_mass      = 1.0;
+  RealD M5  = 1.8;
+  RealD mobius_scale = 2.; //b+c
+
+  RealD mob_bmc = 1.0;
+  RealD mob_b = (mobius_scale + mob_bmc)/2.;
+  RealD mob_c = (mobius_scale - mob_bmc)/2.;
+
+  //Setup the Grids
+  auto UGridD   = TheHMC.Resources.GetCartesian();
+  auto UrbGridD = TheHMC.Resources.GetRBCartesian();
+  auto FGridD     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);
+  auto FrbGridD   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);
+
+  GridCartesian* UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
+  GridRedBlackCartesian* UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
+  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
+  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
+
+  ConjugateIwasakiGaugeActionD GaugeAction(beta);
+
+  // temporarily need a gauge field
+  LatticeGaugeFieldD Ud(UGridD);
+  LatticeGaugeFieldF Uf(UGridF);
+ 
+  //Setup the BCs
+  FermionActionD::ImplParams Params;
+  for(int i=0;i<Nd-1;i++) Params.twists[i] = user_params.GparityDirs[i]; //G-parity directions
+  Params.twists[Nd-1] = 1; //APBC in time direction
+
+  std::vector<int> dirs4(Nd);
+  for(int i=0;i<Nd-1;i++) dirs4[i] = user_params.GparityDirs[i];
+  dirs4[Nd-1] = 0; //periodic gauge BC in time
+
+  GaugeImplPolicy::setDirections(dirs4); //gauge BC
+
+  //Run optional gauge field checksum checker and exit
+  if(file_load_check){
+    TheHMC.initializeGaugeFieldAndRNGs(Ud);
+    std::cout << GridLogMessage << " Done" << std::endl;
+    Grid_finalize();
+    return 0;
+  }
+
+
+  ////////////////////////////////////
+  // Collect actions
+  ////////////////////////////////////
+  ActionLevel<HMCWrapper::Field> Level1(1); //light quark + strange quark
+  ActionLevel<HMCWrapper::Field> Level2(4); //DSDR
+  ActionLevel<HMCWrapper::Field> Level3(2); //gauge
+
+
+  /////////////////////////////////////////////////////////////
+  // Light EOFA action
+  // have to be careful with the parameters, cf. Test_dwf_gpforce_eofa.cc
+  /////////////////////////////////////////////////////////////
+  typedef SchurDiagMooeeOperator<EOFAactionD,FermionFieldD> EOFAschuropD;
+  typedef SchurDiagMooeeOperator<EOFAactionF,FermionFieldF> EOFAschuropF;
+  typedef ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction<FermionImplPolicyD, FermionImplPolicyF> EOFAmixPrecPFaction;
+  typedef MixedPrecisionConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_mxCG;
+  typedef MixedPrecisionReliableUpdateConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_relupCG;
+  
+  std::vector<RealD> eofa_light_masses = { light_mass ,  0.004,   0.016,   0.064,   0.256    };
+  std::vector<RealD> eofa_pv_masses =    { 0.004       , 0.016,   0.064,   0.256,   1.0      };
+  //const so that the array below has a compile-time bound (no variable-length array)
+  const int n_light_hsb = 5;
+  assert(user_params.eofa_l.size() == n_light_hsb);
+  
+  EOFAmixPrecPFaction* EOFA_pfactions[n_light_hsb];
+
+  //The operators, solvers and actions created in this loop are referenced by the action
+  //levels for the rest of the run and are intentionally never deleted.
+  for(int i=0;i<n_light_hsb;i++){
+    RealD iml = eofa_light_masses[i];
+    RealD ipv = eofa_pv_masses[i];
+
+    EOFAactionD* LopD = new EOFAactionD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, iml, iml, ipv, 0.0, -1, M5, mob_b, mob_c, Params);
+    EOFAactionF* LopF = new EOFAactionF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, iml, iml, ipv, 0.0, -1, M5, mob_b, mob_c, Params);
+    EOFAactionD* RopD = new EOFAactionD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, ipv, iml, ipv, -1.0, 1, M5, mob_b, mob_c, Params);
+    EOFAactionF* RopF = new EOFAactionF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, ipv, iml, ipv, -1.0, 1, M5, mob_b, mob_c, Params);
+
+    EOFAschuropD* linopL_D = new EOFAschuropD(*LopD);
+    EOFAschuropD* linopR_D = new EOFAschuropD(*RopD);
+    
+    EOFAschuropF* linopL_F = new EOFAschuropF(*LopF);
+    EOFAschuropF* linopR_F = new EOFAschuropF(*RopF);
+
+#if 1
+    //Note reusing user_params.eofa_l.action(|md)_mixcg_inner_tolerance  as Delta for now
+    EOFA_relupCG* ActionMCG_L = new EOFA_relupCG(user_params.eofa_l[i].action_tolerance, user_params.eofa_l[i].action_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
+    EOFA_relupCG* ActionMCG_R = new EOFA_relupCG(user_params.eofa_l[i].action_tolerance, user_params.eofa_l[i].action_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
+
+    EOFA_relupCG* DerivMCG_L = new EOFA_relupCG(user_params.eofa_l[i].md_tolerance, user_params.eofa_l[i].md_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
+    EOFA_relupCG* DerivMCG_R = new EOFA_relupCG(user_params.eofa_l[i].md_tolerance, user_params.eofa_l[i].md_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
+
+#else
+    
+    EOFA_mxCG* ActionMCG_L = new EOFA_mxCG(user_params.eofa_l[i].action_tolerance, 10000, 1000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
+    ActionMCG_L->InnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance;
+    
+    EOFA_mxCG* ActionMCG_R = new EOFA_mxCG(user_params.eofa_l[i].action_tolerance, 10000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
+    ActionMCG_R->InnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance;
+    
+    EOFA_mxCG* DerivMCG_L = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 10000, 1000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
+    DerivMCG_L->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance;
+    
+    EOFA_mxCG* DerivMCG_R = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 10000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
+    DerivMCG_R->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance;
+    
+    std::cout << GridLogMessage << "Set EOFA action solver action tolerance outer=" << ActionMCG_L->Tolerance << " inner=" << ActionMCG_L->InnerTolerance << std::endl;
+    std::cout << GridLogMessage << "Set EOFA MD solver tolerance outer=" << DerivMCG_L->Tolerance << " inner=" << DerivMCG_L->InnerTolerance << std::endl;
+#endif
+
+    
+    EOFAmixPrecPFaction* EOFA = new EOFAmixPrecPFaction(*LopF, *RopF,
+                                                        *LopD, *RopD, 
+                                                        *ActionMCG_L, *ActionMCG_R, 
+                                                        *ActionMCG_L, *ActionMCG_R, 
+                                                        *DerivMCG_L, *DerivMCG_R, 
+                                                        user_params.eofa_l[i].rat_params, true);
+    EOFA_pfactions[i] = EOFA;
+    Level1.push_back(EOFA);
+  }
+
+  ////////////////////////////////////
+  // Strange action
+  ////////////////////////////////////
+  FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD,strange_mass,M5,mob_b,mob_c,Params);
+  FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD, pv_mass,M5,mob_b,mob_c,Params);
+
+  FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF,strange_mass,M5,mob_b,mob_c,Params);
+  FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF, pv_mass,M5,mob_b,mob_c,Params);
+
+  RationalActionParams rat_act_params_s;
+  rat_act_params_s.inv_pow  = 4; // (M^dag M)^{1/4}
+  rat_act_params_s.precision= 60;
+  rat_act_params_s.MaxIter  = 10000;
+  user_params.rat_quo_s.Export(rat_act_params_s);
+  std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_s.BoundsCheckFreq << " trajectories (avg)" << std::endl;
+
+  //MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq); 
+  DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s); 
+  Level1.push_back(&Quotient_s);  
+
+  ///////////////////////////////////
+  // DSDR action
+  ///////////////////////////////////
+  RealD dsdr_mass=-1.8;   
+  //Use same DSDR twists as https://arxiv.org/pdf/1208.4412.pdf
+  RealD dsdr_epsilon_f = 0.02; //numerator (in determinant)
+  RealD dsdr_epsilon_b = 0.5; 
+  GparityWilsonTMFermionD Numerator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_f, Params);
+  GparityWilsonTMFermionF Numerator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_f, Params);
+
+  GparityWilsonTMFermionD Denominator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_b, Params);
+  GparityWilsonTMFermionF Denominator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_b, Params);
+ 
+  RationalActionParams rat_act_params_DSDR;
+  rat_act_params_DSDR.inv_pow  = 2; // (M^dag M)^{1/2}
+  rat_act_params_DSDR.precision= 60;
+  rat_act_params_DSDR.MaxIter  = 10000;
+  user_params.rat_quo_DSDR.Export(rat_act_params_DSDR);
+  std::cout << GridLogMessage << "DSDR quark bounds check every " << rat_act_params_DSDR.BoundsCheckFreq << " trajectories (avg)" << std::endl;
+
+  DoublePrecRHMC Quotient_DSDR(Denominator_DSDR_D, Numerator_DSDR_D, rat_act_params_DSDR);
+  Level2.push_back(&Quotient_DSDR);
+
+  /////////////////////////////////////////////////////////////
+  // Gauge action
+  /////////////////////////////////////////////////////////////
+  Level3.push_back(&GaugeAction);
+
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+  TheHMC.TheAction.push_back(Level3);
+  std::cout << GridLogMessage << " Action complete "<< std::endl;
+
+
+  //Action tuning
+  bool 
+    tune_rhmc_s=false, eigenrange_s=false, 
+    tune_rhmc_DSDR=false, eigenrange_DSDR=false, 
+    check_eofa=false, 
+    upper_bound_eofa=false, lower_bound_eofa=false;
+
+  std::string lanc_params_s;
+  std::string lanc_params_DSDR;
+  //Given defined values up front; each is overwritten by the option that uses it
+  int tune_rhmc_s_action_or_md = 0;
+  int tune_rhmc_DSDR_action_or_md = 0;
+  int eofa_which_hsb = -1;
+
+  for(int i=1;i<argc;i++){
+    std::string sarg(argv[i]);
+    if(sarg == "--tune_rhmc_s"){
+      assert(i < argc-1);
+      tune_rhmc_s=true;
+      tune_rhmc_s_action_or_md = std::stoi(argv[i+1]);
+    }
+    else if(sarg == "--eigenrange_s"){
+      assert(i < argc-1);
+      eigenrange_s=true;
+      lanc_params_s = argv[i+1];
+    }
+    else if(sarg == "--tune_rhmc_DSDR"){
+      assert(i < argc-1);
+      tune_rhmc_DSDR=true;
+      tune_rhmc_DSDR_action_or_md = std::stoi(argv[i+1]);
+    }
+    else if(sarg == "--eigenrange_DSDR"){
+      assert(i < argc-1);
+      eigenrange_DSDR=true;
+      lanc_params_DSDR = argv[i+1];
+    }
+    else if(sarg == "--check_eofa"){
+      assert(i < argc-1);
+      check_eofa = true;
+      eofa_which_hsb = std::stoi(argv[i+1]); //-1 indicates all hasenbusch
+      assert(eofa_which_hsb == -1 || (eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb) );
+    }
+    else if(sarg == "--upper_bound_eofa"){
+      assert(i < argc-1);
+      upper_bound_eofa = true;
+      eofa_which_hsb = std::stoi(argv[i+1]);
+      assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
+    }
+    else if(sarg == "--lower_bound_eofa"){
+      assert(i < argc-1);
+      lower_bound_eofa = true;      
+      eofa_which_hsb = std::stoi(argv[i+1]);
+      assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
+    }
+  }
+  if(tune_rhmc_s || eigenrange_s || tune_rhmc_DSDR || eigenrange_DSDR ||check_eofa || upper_bound_eofa || lower_bound_eofa) {
+    std::cout << GridLogMessage << "Running checks" << std::endl;
+    TheHMC.initializeGaugeFieldAndRNGs(Ud);
+
+    //std::cout << GridLogMessage << "EOFA action solver action tolerance outer=" << ActionMCG_L.Tolerance << " inner=" << ActionMCG_L.InnerTolerance << std::endl;
+    //std::cout << GridLogMessage << "EOFA MD solver tolerance outer=" << DerivMCG_L.Tolerance << " inner=" << DerivMCG_L.InnerTolerance << std::endl;
+
+
+    if(check_eofa){
+      if(eofa_which_hsb >= 0){
+        std::cout << GridLogMessage << "Starting checking EOFA Hasenbusch " << eofa_which_hsb << std::endl;
+        checkEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
+        std::cout << GridLogMessage << "Finished checking EOFA Hasenbusch " << eofa_which_hsb << std::endl;
+      }else{
+        for(int i=0;i<n_light_hsb;i++){
+          std::cout << GridLogMessage << "Starting checking EOFA Hasenbusch " << i << std::endl;
+          checkEOFA(*EOFA_pfactions[i], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
+          std::cout << GridLogMessage << "Finished checking EOFA Hasenbusch " << i << std::endl;
+        }
+      }
+    }
+    if(upper_bound_eofa) upperBoundEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
+    if(lower_bound_eofa) lowerBoundEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
+    if(eigenrange_s) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG());
+    if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_s)>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange",  tune_rhmc_s_action_or_md);
+    if(eigenrange_DSDR) computeEigenvalues<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField>(lanc_params_DSDR, UGridD, UrbGridD, Ud, Numerator_DSDR_D, TheHMC.Resources.GetParallelRNG());
+    if(tune_rhmc_DSDR) checkRHMC<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField, decltype(Quotient_DSDR)>(UGridD, UrbGridD, Ud, Numerator_DSDR_D, Denominator_DSDR_D, Quotient_DSDR, TheHMC.Resources.GetParallelRNG(), 2, "DSDR", tune_rhmc_DSDR_action_or_md);
+
+
+    std::cout << GridLogMessage << " Done" << std::endl;
+    Grid_finalize();
+    return 0;
+  }
+
+
+  //Run the HMC
+  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
+  TheHMC.Run();
+
+  std::cout << GridLogMessage << " Done" << std::endl;
+  Grid_finalize();
+  return 0;
+} // main
--- a/benchmarks/Benchmark_ITT.cc
+++ b/benchmarks/Benchmark_ITT.cc
@ -420,7 +420,6 @@ public:
 	FGrid->Broadcast(0,&ncall,sizeof(ncall));

 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
-	Dw.ZeroCounters();

 	time_statistics timestat;
 	std::vector<double> t_time(ncall);
@ -589,7 +588,6 @@ public:
 	FGrid->Broadcast(0,&ncall,sizeof(ncall));

 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
-	Ds.ZeroCounters();

 	time_statistics timestat;
 	std::vector<double> t_time(ncall);
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@ -186,7 +186,6 @@ int main (int argc, char ** argv)

  if (1) {
    FGrid->Barrier();
-    Dw.ZeroCounters();
    Dw.Dhop(src,result,0);
    std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
    double t0=usecond();
@ -231,7 +230,6 @@ int main (int argc, char ** argv)
      exit(-1);
    }
    assert (norm2(err)< 1.0e-4 );
-    Dw.Report();
  }

  if (1)
@ -306,7 +304,6 @@ int main (int argc, char ** argv)
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
  {
-    Dw.ZeroCounters();
    FGrid->Barrier();
    Dw.DhopEO(src_o,r_e,DaggerNo);
    double t0=usecond();
@ -328,7 +325,6 @@ int main (int argc, char ** argv)
    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
    std::cout<<GridLogMessage << "Deo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
    std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NN<<std::endl;
-    Dw.Report();
  }
  Dw.DhopEO(src_o,r_e,DaggerNo);
  Dw.DhopOE(src_e,r_o,DaggerNo);
--- a/benchmarks/Benchmark_gparity.cc
+++ b/benchmarks/Benchmark_gparity.cc
@ -93,7 +93,6 @@ int main (int argc, char ** argv)
  int ncall =1000;
  if (1) {
    FGrid->Barrier();
-    Dw.ZeroCounters();
    Dw.Dhop(src,result,0);
    std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
    double t0=usecond();
@ -112,7 +111,6 @@ int main (int argc, char ** argv)
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl;
-    Dw.Report();
  }


@ -134,7 +132,6 @@ int main (int argc, char ** argv)
  GparityDomainWallFermionD DwD(Umu_d,*FGrid_d,*FrbGrid_d,*UGrid_d,*UrbGrid_d,mass,M5);
  if (1) {
    FGrid_d->Barrier();
-    DwD.ZeroCounters();
    DwD.Dhop(src_d,result_d,0);
    std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
    double t0=usecond();
@ -153,7 +150,6 @@ int main (int argc, char ** argv)
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl;
-    DwD.Report();
  }
 #endif
  Grid_finalize();
--- a/benchmarks/Benchmark_mooee.cc
+++ b/benchmarks/Benchmark_mooee.cc
@ -103,35 +103,30 @@ int main (int argc, char ** argv)
 #define BENCH_DW(A,...)			\
    Dw. A (__VA_ARGS__);				\
    FGrid->Barrier();				\
-    Dw.CayleyZeroCounters();      \
    t0=usecond();				\
    for(int i=0;i<ncall;i++){			\
      Dw. A (__VA_ARGS__);				\
    }						\
    t1=usecond();				\
    FGrid->Barrier();				\
-    Dw.CayleyReport();					\
    std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
    std::cout<<GridLogMessage << "******************"<<std::endl;

 #define BENCH_ZDW(A,in,out)			\
    zDw. A (in,out);				\
    FGrid->Barrier();				\
-    zDw.CayleyZeroCounters();      \
    t0=usecond();				\
    for(int i=0;i<ncall;i++){			\
      zDw. A (in,out);				\
    }						\
    t1=usecond();				\
    FGrid->Barrier();				\
-    zDw.CayleyReport();							\
    std::cout<<GridLogMessage << "Called ZDw " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
    std::cout<<GridLogMessage << "******************"<<std::endl;

 #define BENCH_DW_SSC(A,in,out)			\
    Dw. A (in,out);				\
    FGrid->Barrier();				\
-    Dw.CayleyZeroCounters();      \
    t0=usecond();				\
    for(int i=0;i<ncall;i++){			\
      __SSC_START ;				\
@ -140,7 +135,6 @@ int main (int argc, char ** argv)
    }						\
    t1=usecond();				\
    FGrid->Barrier();				\
-    Dw.CayleyReport();					\
    std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
    std::cout<<GridLogMessage << "******************"<<std::endl;

--- a/benchmarks/Benchmark_wilson.cc
+++ b/benchmarks/Benchmark_wilson.cc
@ -155,7 +155,6 @@ int main (int argc, char ** argv)
  //int ncall=1;

  // Counters
-  Dw.ZeroCounters();
  Grid.Barrier();

  double t0=usecond();
@ -201,7 +200,6 @@ int main (int argc, char ** argv)
  err = ref-result;
  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;

-  Dw.Report();
  
  // guard
  double err0 = norm2(err);
--- a/configure.ac
+++ b/configure.ac
@ -128,6 +128,26 @@ case ${ac_LAPACK} in
        AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
 esac

+############### tracing
+# Selects the tracing backend through --enable-tracing; the default is none.
+# Exactly one GRID_TRACING_* preprocessor symbol is defined by the case statement below.
+AC_ARG_ENABLE([tracing],
+    [AC_HELP_STRING([--enable-tracing=none|nvtx|roctx|timer], [enable tracing])],
+    [ac_TRACING=${enable_tracing}], [ac_TRACING=none])
+
+# nvtx and roctx additionally append their library to LIBS; timer needs no extra library.
+# Any other value, including none, defines GRID_TRACING_NONE.
+case ${ac_TRACING} in
+    nvtx)
+        AC_DEFINE([GRID_TRACING_NVTX],[1],[use NVTX])
+	LIBS="${LIBS} -lnvToolsExt64_1"
+	;;
+    roctx)
+        AC_DEFINE([GRID_TRACING_ROCTX],[1],[use ROCTX])
+	LIBS="${LIBS} -lroctx64"
+	;;
+    timer)
+        AC_DEFINE([GRID_TRACING_TIMER],[1],[use TIMER]);;
+    *)
+	AC_DEFINE([GRID_TRACING_NONE],[1],[no tracing]);;
+esac
+
 ############### fermions
 AC_ARG_ENABLE([fermion-reps],
     [AC_HELP_STRING([--enable-fermion-reps=yes|no], [enable extra fermion representation support])],
--- a/examples/Example_christoph.cc
+++ b/examples/Example_christoph.cc
@ -0,0 +1,436 @@
+/*
+ * Warning: This code is illustrative only: it is not well tested, and is not meant for
+ * production use without regression tests being applied
+ */
+
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+
+RealD LLscale =1.0;
+RealD LCscale =1.0;
+
+// Gauge-covariant Laplacian assembled from covariant forward/backward shifts.
+// M() accumulates, for each direction mu in [0, Nd-1), the stencil
+//   2*in - CovShiftForward(U_mu,mu,in) - CovShiftBackward(U_mu,mu,in);
+// the last direction (index Nd-1) is excluded from the sum.
+template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
+{
+public:
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+  GridBase *grid; // grid taken from the gauge field passed to the constructor
+  GaugeField U;   // gauge field, stored by value (copied from the constructor argument)
+  
+  CovariantLaplacianCshift(GaugeField &_U)    :
+    grid(_U.Grid()),
+    U(_U) {  };
+
+  virtual GridBase *Grid(void) { return grid; };
+
+  // Apply the Laplacian: out = sum over mu < Nd-1 of (2*in - forward shift - backward shift).
+  virtual void  M    (const Field &in, Field &out)
+  {
+    out=Zero();
+    for(int mu=0;mu<Nd-1;mu++) {
+      GaugeLinkField Umu = PeekIndex<LorentzIndex>(U, mu); // NB: Inefficient (link field re-extracted on every call)
+      out = out - Gimpl::CovShiftForward(Umu,mu,in);    
+      out = out - Gimpl::CovShiftBackward(Umu,mu,in);    
+      out = out + 2.0*in;
+    }
+  };
+  virtual void  Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian
+  virtual  void Mdiag    (const Field &in, Field &out)                  {assert(0);}; // Unimplemented need only for multigrid
+  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid
+  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)     {assert(0);}; // Unimplemented need only for multigrid
+};
+
+// Fill `phase` with exp(i * sum_mu (2*pi/L_mu) * mom[mu] * x_mu), where L_mu are
+// the global dimensions of the grid that `phase` lives on.
+void MakePhase(Coordinate mom,LatticeComplex &phase)
+{
+  GridBase *lattice = phase.Grid();
+  auto dims = lattice->GlobalDimensions();
+  ComplexD imag_unit(0.0,1.0);
+
+  LatticeComplex x_mu(lattice);
+  phase = Zero();
+  for(int mu=0; mu<Nd; ++mu){
+    // coordinate along direction mu at every site
+    LatticeCoordinate(x_mu,mu);
+    RealD two_pi_over_L =  M_PI * 2.0/ dims[mu];
+    phase = phase + (two_pi_over_L * mom[mu]) * x_mu;
+  }
+  // turn the accumulated angle into a unit-modulus phase
+  phase = exp(phase*imag_unit);
+}
+
+// Build a point source: `source` is zeroed and a unit spin-colour matrix is
+// poked in at site `coor`.
+void PointSource(Coordinate &coor,LatticePropagator &source)
+{
+  SpinColourMatrix unit_matrix;
+  unit_matrix = 1.0;
+
+  source = Zero();
+  pokeSite(unit_matrix,source,coor);
+}
+// Build a noise wall source on timeslice `tslice`: draw bernoulli noise, shift
+// and rescale it as (2*noise - (1+i))/sqrt(2), zero it away from the timeslice,
+// and multiply it into a unit propagator. Prints norm2 of the result.
+void Z2WallSource(GridParallelRNG &RNG,int tslice,LatticePropagator &source)
+{
+  GridBase *lattice = source.Grid();
+
+  LatticeInteger tcoor(lattice);
+  LatticeComplex zero_field(lattice);
+  zero_field = Zero();
+  LatticeComplex eta(lattice);
+
+  RealD inv_sqrt2 = 1.0/sqrt(2);
+
+  // 0,1 50:50
+  bernoulli(RNG, eta);
+  eta = (2.*eta - Complex(1,1))*inv_sqrt2;
+
+  // keep the noise only on the requested timeslice
+  LatticeCoordinate(tcoor,Tdir);
+  eta = where(tcoor==Integer(tslice), eta, zero_field);
+
+  source = 1.0;
+  source = source*eta;
+  std::cout << " Z2 wall " << norm2(source) << std::endl;
+}
+// Iteratively smear `unsmeared` into `smeared` with the covariant Laplacian:
+// 40 steps of  smeared <- smeared - coeff * Laplacian(smeared),
+// with coeff = width^2 / (4 * Iterations) and width = 2. Prints norm2 per step.
+template<class Field>
+void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
+{
+  typedef CovariantLaplacianCshift <PeriodicGimplR,Field> Laplacian_t;
+
+  Integer Iterations = 40;
+  Real width = 2.0;
+  Real coeff = (width*width) / Real(4*Iterations);
+
+  Laplacian_t Laplacian(U);
+  Field lap_out(U.Grid());
+
+  //  chi = (1-p^2/2N)^N kronecker
+  smeared = unsmeared;
+  int step = 0;
+  while ( step < Iterations ) {
+    Laplacian.M(smeared,lap_out);
+    smeared = smeared - coeff*lap_out;
+    std::cout << " smear iter " << step<<" " <<norm2(smeared)<<std::endl;
+    ++step;
+  }
+}
+// Build a smeared point source: place a point source at `site`, then replace
+// `source` with its GaussianSmear-ed version. Prints norm2 before and after.
+void GaussianSource(Coordinate &site,LatticeGaugeField &U,LatticePropagator &source)
+{
+  PointSource(site,source);
+  std::cout << " GaussianSource Kronecker "<< norm2(source)<<std::endl;
+
+  // smear from a copy so input and output of GaussianSmear are distinct fields
+  LatticePropagator kronecker(source.Grid());
+  kronecker = source;
+  GaussianSmear(U,kronecker,source);
+  std::cout << " GaussianSource Smeared "<< norm2(source)<<std::endl;
+}
+// Build a smeared wall source: generate the Z2 wall on `tslice`, then smear it
+// in place (via a copy) with GaussianSmear.
+void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,LatticePropagator &source)
+{
+  Z2WallSource(RNG,tslice,source);
+  LatticePropagator unsmeared(source);
+  GaussianSmear(U,unsmeared,source);
+}
+// Build a sequential source: copy `spectator` on timeslice `tslice` (zero
+// elsewhere) and multiply by the momentum phase from MakePhase(mom).
+// Requires mom to have Nd entries with a zero time component.
+void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
+{
+  assert(mom.size()==Nd);
+  assert(mom[Tdir] == 0);
+
+  GridBase * lattice = spectator.Grid();
+
+  // time coordinate at every site, used to select the slice
+  LatticeInteger tcoor(lattice);
+  LatticeCoordinate(tcoor,Tdir);
+
+  // Stick in a slice of the spectator, zero everywhere else
+  source = Zero();
+  source = where(tcoor==Integer(tslice),spectator,source);
+
+  LatticeComplex mom_phase(lattice);
+  MakePhase(mom,mom_phase);
+
+  source = source *mom_phase;
+}
+
+// Compute the 5d free propagator of action D for every (spin, colour) column of
+// `source`, export the 4d solution into `propagator`, and print the
+// timeslice-summed conserved vector current correlator for mu = X, Y, Z.
+//
+// The printed "2 pi^2 t^3 C(t)" column is formed by multiplication (as in
+// MesonTrace) rather than as Ct/Cont, which divided by zero at t=0.
+template<class Action>
+void MasslessFreePropagator(Action &D,LatticePropagator &source,LatticePropagator &propagator)
+{			   
+  GridBase *UGrid = source.Grid();
+  GridBase *FGrid = D.FermionGrid();
+  RealD mass = D.Mass();
+  LatticeFermion src4  (UGrid);
+  LatticeFermion result4  (UGrid);
+  LatticeFermion result5(FGrid);
+  LatticeFermion src5(FGrid);
+  LatticePropagator prop5(FGrid);
+  // NOTE(review): the spin loop is bounded by Nd as in the original; confirm Nd equals the spin count here.
+  for(int s=0;s<Nd;s++){
+    for(int c=0;c<Nc;c++){
+ 
+      PropToFerm<Action>(src4,source,s,c);
+
+      D.ImportPhysicalFermionSource(src4,src5);
+      D.FreePropagator(src5,result5,mass,true);
+      std::cout<<GridLogMessage
+               <<"Free 5D prop spin "<<s<<" color "<<c
+               <<" norm2(src5d) "   <<norm2(src5)
+               <<" norm2(result5d) "<<norm2(result5)<<std::endl;
+
+      D.ExportPhysicalFermionSolution(result5,result4);
+
+      FermToProp<Action>(prop5,result5,s,c);
+      FermToProp<Action>(propagator,result4,s,c);
+    }
+  }
+
+  LatticePropagator Vector_mu(UGrid);
+  LatticeComplex    VV (UGrid);
+  std::vector<TComplex> sumVV;
+  Gamma::Algebra GammaV[3] = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ
+  };
+  for( int mu=0;mu<3;mu++ ) {
+    Gamma gV(GammaV[mu]);
+    D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu);
+    VV       = trace(gV*Vector_mu);     // (local) Vector-Vector conserved current
+    sliceSum(VV,sumVV,Tdir);
+    int Nt = sumVV.size();
+    for(int t=0;t<Nt;t++){
+      RealD Ct = real(TensorRemove(sumVV[t]))*LCscale;
+      RealD t3scale = 2 * M_PI *M_PI * t*t*t;   // 2 pi^2 t^3
+      RealD Cont=0;
+      if(t) Cont=1.0/t3scale;                   // reference value 1/(2 pi^2 t^3), left at 0 for t=0
+      std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct
+               << " 2 pi^2 t^3 C(t) "<< t3scale*Ct << " delta Ct "<< Ct-Cont <<std::endl;
+    }
+  }
+}
+// Compute the 4d free propagator of action D (FreePropagator called with its
+// last argument false) for every (spin, colour) column of `source`, storing the
+// columns in `propagator`. The unused local flag `fiveD` has been dropped.
+template<class Action>
+void MasslessFreePropagator1(Action &D,LatticePropagator &source,LatticePropagator &propagator)
+{			   
+  RealD mass = D.Mass();
+  GridBase *UGrid = source.Grid();
+  LatticeFermion src4  (UGrid); 
+  LatticeFermion result4  (UGrid); 
+  for(int s=0;s<Nd;s++){
+    for(int c=0;c<Nc;c++){
+      PropToFerm<Action>(src4,source,s,c);
+      D.FreePropagator(src4,result4,mass,false);
+      FermToProp<Action>(propagator,result4,s,c);
+    }
+  }
+}
+
+// Solve for the propagator of action D on `source`, one (spin, colour) column at
+// a time, using red-black Schur-preconditioned conjugate gradient
+// (tolerance 1.0e-7, at most 100000 iterations) from a zero initial guess.
+// The 4d solution goes to `propagator`. Afterwards the timeslice-summed PA, PP,
+// PJ5q and conserved-vector (X, Y, Z) correlators are printed.
+//
+// The printed "2 pi^2 t^3 C(t)" column is formed by multiplication (as in
+// MesonTrace) rather than as Ct/Cont, which divided by zero at t=0.
+template<class Action>
+void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
+{
+  GridBase *UGrid = D.GaugeGrid();
+  GridBase *FGrid = D.FermionGrid();
+
+  LatticeFermion src4  (UGrid); 
+  LatticeFermion src5  (FGrid); 
+  LatticeFermion result5(FGrid);
+  LatticeFermion result4(UGrid);
+  LatticePropagator prop5(FGrid);
+  
+  ConjugateGradient<LatticeFermion> CG(1.0e-7,100000);
+  SchurRedBlackDiagMooeeSolve<LatticeFermion> schur(CG);
+  ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
+  for(int s=0;s<Nd;s++){
+    for(int c=0;c<Nc;c++){
+      PropToFerm<Action>(src4,source,s,c);
+
+      D.ImportPhysicalFermionSource(src4,src5);
+
+      result5=Zero();
+      schur(D,src5,result5,ZG);
+      std::cout<<GridLogMessage
+	       <<"spin "<<s<<" color "<<c
+	       <<" norm2(src5d) "   <<norm2(src5)
+               <<" norm2(result5d) "<<norm2(result5)<<std::endl;
+
+      D.ExportPhysicalFermionSolution(result5,result4);
+
+      FermToProp<Action>(prop5,result5,s,c);
+      FermToProp<Action>(propagator,result4,s,c);
+    }
+  }
+  LatticePropagator Axial_mu(UGrid); 
+  LatticePropagator Vector_mu(UGrid); 
+
+  LatticeComplex    PA (UGrid); 
+  LatticeComplex    VV (UGrid); 
+  LatticeComplex    PJ5q(UGrid);
+  LatticeComplex    PP (UGrid);
+
+  std::vector<TComplex> sumPA;
+  std::vector<TComplex> sumVV;
+  std::vector<TComplex> sumPP;
+  std::vector<TComplex> sumPJ5q;
+
+  Gamma g5(Gamma::Algebra::Gamma5);
+  D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir);
+  PA       = trace(g5*Axial_mu);      // Pseudoscalar-Axial conserved current
+  sliceSum(PA,sumPA,Tdir);
+
+  // number of timeslices, taken from the first slice sum and reused below
+  int Nt{static_cast<int>(sumPA.size())};
+
+  for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PAc["<<t<<"] "<<real(TensorRemove(sumPA[t]))*LCscale<<std::endl;
+
+  PP       = trace(adj(propagator)*propagator); // Pseudoscalar density
+  sliceSum(PP,sumPP,Tdir);
+  for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PP["<<t<<"] "<<real(TensorRemove(sumPP[t]))*LCscale<<std::endl;
+  
+  D.ContractJ5q(prop5,PJ5q);
+  sliceSum(PJ5q,sumPJ5q,Tdir);
+  for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PJ5q["<<t<<"] "<<real(TensorRemove(sumPJ5q[t]))<<std::endl;
+
+  Gamma::Algebra GammaV[3] = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ
+  };
+  for( int mu=0;mu<3;mu++ ) {
+    Gamma gV(GammaV[mu]);
+    D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu);
+    //    auto ss=sliceSum(Vector_mu,Tdir);
+    //    for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"ss["<<mu<<"]["<<t<<"] "<<ss[t]<<std::endl;
+    VV       = trace(gV*Vector_mu);     // (local) Vector-Vector conserved current
+    sliceSum(VV,sumVV,Tdir);
+    for(int t=0;t<Nt;t++){
+      RealD Ct = real(TensorRemove(sumVV[t]))*LCscale;
+      RealD t3scale = 2 * M_PI *M_PI * t*t*t;   // 2 pi^2 t^3
+      RealD Cont=0;
+      if(t) Cont=1.0/t3scale;                   // reference value 1/(2 pi^2 t^3), left at 0 for t=0
+      std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct
+               << " 2 pi^2 t^3 C(t) "<< t3scale*Ct << " delta Ct "<< Ct-Cont <<std::endl;
+    }
+  }
+
+}
+
+// Serializable record of meson correlators written out by MesonTrace:
+// `data` holds one vector of Complex per channel, indexed by timeslice.
+class MesonFile: Serializable {
+public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector<std::vector<Complex> >, data);
+};
+
+// Contract propagators q1 and q2 into four meson channels
+// (gamma_x gamma_5, gamma_y gamma_5, gamma_z gamma_5, identity; same matrix at
+// source and sink), sum each over timeslices, print the correlator per t and
+// write all channels to `file` as XML under the tag "MesonFile".
+// The `phase` argument is not used by this function.
+void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
+{
+  const int nchannel=4;
+  // per channel: [0] is the source gamma, [1] is the sink gamma
+  Gamma::Algebra Gammas[nchannel][2] = {
+    {Gamma::Algebra::GammaXGamma5,Gamma::Algebra::GammaXGamma5},
+    {Gamma::Algebra::GammaYGamma5,Gamma::Algebra::GammaYGamma5},
+    {Gamma::Algebra::GammaZGamma5,Gamma::Algebra::GammaZGamma5},
+    {Gamma::Algebra::Identity,Gamma::Algebra::Identity}
+  };
+
+  MesonFile MF;
+  LatticeComplex correlator(q1.Grid());
+
+  for(int ch=0; ch<nchannel; ++ch){
+
+    Gamma Gsrc(Gammas[ch][0]);
+    Gamma Gsnk(Gammas[ch][1]);
+
+    correlator = trace(adj(q1)*Gsnk*q2*adj(Gsrc));
+
+    std::vector<TComplex> slice_sums;
+    sliceSum(correlator,slice_sums, Tdir);
+
+    const int nt = slice_sums.size();
+
+    std::vector<Complex> corr(nt);
+    for(int t=0; t<nt; ++t){
+      corr[t] = TensorRemove(slice_sums[t])*LLscale; // Yes this is ugly, not figured a work around
+      RealD Ct = real(corr[t]);
+      RealD t3scale = 2 * M_PI *M_PI * t*t*t;       // 2 pi^2 t^3
+      RealD Cont = 0;
+      if(t) Cont = 1.0/t3scale;
+      std::cout << " channel "<<ch<<" t "<<t<<" " <<Ct<< " 2 pi^2 t^3 C(t) "<< t3scale * Ct
+		<< " deltaC " <<Ct-Cont<<std::endl;
+    }
+    MF.data.push_back(corr);
+  }
+
+  {
+    // scoped so the writer is destroyed before the function returns
+    XmlWriter WR(file);
+    write(WR,"MesonFile",MF);
+  }
+}
+
+// Fetch a mandatory environment variable. Exits with a diagnostic when it is
+// unset, instead of handing a null pointer to atoi/atof.
+static const char *RequireEnv(const char *name)
+{
+  const char *value = getenv(name);
+  if ( value == nullptr ) {
+    std::cerr << "Required environment variable '" << name << "' is not set" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  return value;
+}
+
+// Driver: loads a NERSC gauge configuration named by argv[1], builds a Mobius
+// action per mass, solves for a point-source propagator and writes the meson
+// correlators to XML.
+// Required environment variables: Ls, M5, mass, point_x, point_y, point_z, point_t.
+int main (int argc, char ** argv)
+{
+
+  Grid_init(&argc,&argv);
+  int Ls= atoi(RequireEnv("Ls"));
+
+  // Double precision grids
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), 
+								   GridDefaultSimd(Nd,vComplex::Nsimd()),
+								   GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  //////////////////////////////////////////////////////////////////////
+  // You can manage seeds however you like.
+  // Recommend SeedUniqueString.
+  //////////////////////////////////////////////////////////////////////
+  //  std::vector<int> seeds4({1,2,3,4}); 
+  //  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+
+  LatticeGaugeField Umu(UGrid);
+  std::string config;
+  RealD M5=atof(RequireEnv("M5"));
+  RealD mq = atof(RequireEnv("mass"));
+  int   point_x = atoi(RequireEnv("point_x"));
+  int   point_y = atoi(RequireEnv("point_y"));
+  int   point_z = atoi(RequireEnv("point_z"));
+  int   point_t = atoi(RequireEnv("point_t"));
+  std::vector<RealD> masses({ mq} ); // u/d, s, c ??
+  if( argc > 1 && argv[1][0] != '-' )
+  {
+    std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl;
+    FieldMetaData header;
+    NerscIO::readConfiguration(Umu, header, argv[1]);
+    config=argv[1];
+    LLscale = 1.0;
+    LCscale = 1.0;
+  } else {
+    // A configuration file is mandatory: report on stderr and fail with a non-zero status.
+    std::cerr << "Expected a configuration" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+
+  int nmass = masses.size();
+
+  typedef MobiusFermionR FermionActionR;
+  std::vector<FermionActionR *> FermActs;
+  
+  std::cout<<GridLogMessage <<"======================"<<std::endl;
+  std::cout<<GridLogMessage <<"DomainWallFermion action"<<std::endl;
+  std::cout<<GridLogMessage <<"======================"<<std::endl;
+
+  for(auto mass: masses) {
+    std::vector<Complex> boundary = {1,1,1,-1};
+    // NOTE(review): Params is constructed but never passed to the action below;
+    // confirm whether these boundary phases were meant to be applied.
+    FermionActionR::ImplParams Params(boundary);
+    RealD b=1.5;
+    RealD c=0.5;
+    FermActs.push_back(new FermionActionR(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c));
+  }
+
+  LatticePropagator point_source(UGrid);
+
+  Coordinate Origin({point_x,point_y,point_z,point_t});
+  PointSource   (Origin,point_source);
+  
+  std::vector<LatticePropagator> PointProps(nmass,UGrid);
+
+  for(int m=0;m<nmass;m++) {
+    Solve(*FermActs[m],point_source   ,PointProps[m]);
+  }
+
+  LatticeComplex phase(UGrid);
+  Coordinate mom({0,0,0,0});
+  MakePhase(mom,phase);
+  
+  for(int m1=0 ;m1<nmass;m1++) {
+  for(int m2=m1;m2<nmass;m2++) {
+    // output file name: <config>_m<m1>_m<m2>_point_meson.xml
+    std::stringstream ssp;
+
+    ssp<<config<< "_m" << m1 << "_m"<< m2 << "_point_meson.xml";
+
+    std::cout << "CG determined VV correlation function"<<std::endl;
+    MesonTrace(ssp.str(),PointProps[m1],PointProps[m2],phase);
+    
+  }}
+
+  Grid_finalize();
+}
+
+
+
--- a/examples/Example_taku1.cc
+++ b/examples/Example_taku1.cc
@ -0,0 +1,479 @@
+/*
+ * Warning: This code is illustrative only: it is not well tested, and is not meant for
+ * production use without regression tests being applied
+ */
+
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+
+RealD LLscale =1.0;
+RealD LCscale =1.0;
+
+// Gauge-covariant Laplacian assembled from covariant forward/backward shifts.
+// M() accumulates, for each direction mu in [0, Nd-1), the stencil
+//   2*in - CovShiftForward(U_mu,mu,in) - CovShiftBackward(U_mu,mu,in);
+// the last direction (index Nd-1) is excluded from the sum.
+template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
+{
+public:
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+  GridBase *grid; // grid taken from the gauge field passed to the constructor
+  GaugeField U;   // gauge field, stored by value (copied from the constructor argument)
+  
+  CovariantLaplacianCshift(GaugeField &_U)    :
+    grid(_U.Grid()),
+    U(_U) {  };
+
+  virtual GridBase *Grid(void) { return grid; };
+
+  // Apply the Laplacian: out = sum over mu < Nd-1 of (2*in - forward shift - backward shift).
+  virtual void  M    (const Field &in, Field &out)
+  {
+    out=Zero();
+    for(int mu=0;mu<Nd-1;mu++) {
+      GaugeLinkField Umu = PeekIndex<LorentzIndex>(U, mu); // NB: Inefficient (link field re-extracted on every call)
+      out = out - Gimpl::CovShiftForward(Umu,mu,in);    
+      out = out - Gimpl::CovShiftBackward(Umu,mu,in);    
+      out = out + 2.0*in;
+    }
+  };
+  virtual void  Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian
+  virtual  void Mdiag    (const Field &in, Field &out)                  {assert(0);}; // Unimplemented need only for multigrid
+  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid
+  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)     {assert(0);}; // Unimplemented need only for multigrid
+};
+
+// Fill `phase` with exp(i * sum_mu (2*pi/L_mu) * mom[mu] * x_mu), where L_mu are
+// the global dimensions of the grid that `phase` lives on.
+void MakePhase(Coordinate mom,LatticeComplex &phase)
+{
+  GridBase *lattice = phase.Grid();
+  auto dims = lattice->GlobalDimensions();
+  ComplexD imag_unit(0.0,1.0);
+
+  LatticeComplex x_mu(lattice);
+  phase = Zero();
+  for(int mu=0; mu<Nd; ++mu){
+    // coordinate along direction mu at every site
+    LatticeCoordinate(x_mu,mu);
+    RealD two_pi_over_L =  M_PI * 2.0/ dims[mu];
+    phase = phase + (two_pi_over_L * mom[mu]) * x_mu;
+  }
+  // turn the accumulated angle into a unit-modulus phase
+  phase = exp(phase*imag_unit);
+}
+
+// Build a point source: `source` is zeroed and a unit spin-colour matrix is
+// poked in at site `coor`.
+void PointSource(Coordinate &coor,LatticePropagator &source)
+{
+  SpinColourMatrix unit_matrix;
+  unit_matrix = 1.0;
+
+  source = Zero();
+  pokeSite(unit_matrix,source,coor);
+}
+// Build a noise wall source on timeslice `tslice`: draw bernoulli noise, shift
+// and rescale it as (2*noise - (1+i))/sqrt(2), zero it away from the timeslice,
+// and multiply it into a unit propagator. Prints norm2 of the result.
+void Z2WallSource(GridParallelRNG &RNG,int tslice,LatticePropagator &source)
+{
+  GridBase *lattice = source.Grid();
+
+  LatticeInteger tcoor(lattice);
+  LatticeComplex zero_field(lattice);
+  zero_field = Zero();
+  LatticeComplex eta(lattice);
+
+  RealD inv_sqrt2 = 1.0/sqrt(2);
+
+  // 0,1 50:50
+  bernoulli(RNG, eta);
+  eta = (2.*eta - Complex(1,1))*inv_sqrt2;
+
+  // keep the noise only on the requested timeslice
+  LatticeCoordinate(tcoor,Tdir);
+  eta = where(tcoor==Integer(tslice), eta, zero_field);
+
+  source = 1.0;
+  source = source*eta;
+  std::cout << " Z2 wall " << norm2(source) << std::endl;
+}
+// Iteratively smear `unsmeared` into `smeared` with the covariant Laplacian:
+// 40 steps of  smeared <- smeared - coeff * Laplacian(smeared),
+// with coeff = width^2 / (4 * Iterations) and width = 2. Prints norm2 per step.
+template<class Field>
+void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
+{
+  typedef CovariantLaplacianCshift <PeriodicGimplR,Field> Laplacian_t;
+
+  Integer Iterations = 40;
+  Real width = 2.0;
+  Real coeff = (width*width) / Real(4*Iterations);
+
+  Laplacian_t Laplacian(U);
+  Field lap_out(U.Grid());
+
+  //  chi = (1-p^2/2N)^N kronecker
+  smeared = unsmeared;
+  int step = 0;
+  while ( step < Iterations ) {
+    Laplacian.M(smeared,lap_out);
+    smeared = smeared - coeff*lap_out;
+    std::cout << " smear iter " << step<<" " <<norm2(smeared)<<std::endl;
+    ++step;
+  }
+}
+// Build a smeared point source: place a point source at `site`, then replace
+// `source` with its GaussianSmear-ed version. Prints norm2 before and after.
+void GaussianSource(Coordinate &site,LatticeGaugeField &U,LatticePropagator &source)
+{
+  PointSource(site,source);
+  std::cout << " GaussianSource Kronecker "<< norm2(source)<<std::endl;
+
+  // smear from a copy so input and output of GaussianSmear are distinct fields
+  LatticePropagator kronecker(source.Grid());
+  kronecker = source;
+  GaussianSmear(U,kronecker,source);
+  std::cout << " GaussianSource Smeared "<< norm2(source)<<std::endl;
+}
+// Build a smeared wall source: generate the Z2 wall on `tslice`, then smear it
+// in place (via a copy) with GaussianSmear.
+void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,LatticePropagator &source)
+{
+  Z2WallSource(RNG,tslice,source);
+  LatticePropagator unsmeared(source);
+  GaussianSmear(U,unsmeared,source);
+}
+// Build a sequential source: copy `spectator` on timeslice `tslice` (zero
+// elsewhere) and multiply by the momentum phase from MakePhase(mom).
+// Requires mom to have Nd entries with a zero time component.
+void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
+{
+  assert(mom.size()==Nd);
+  assert(mom[Tdir] == 0);
+
+  GridBase * lattice = spectator.Grid();
+
+  // time coordinate at every site, used to select the slice
+  LatticeInteger tcoor(lattice);
+  LatticeCoordinate(tcoor,Tdir);
+
+  // Stick in a slice of the spectator, zero everywhere else
+  source = Zero();
+  source = where(tcoor==Integer(tslice),spectator,source);
+
+  LatticeComplex mom_phase(lattice);
+  MakePhase(mom,mom_phase);
+
+  source = source *mom_phase;
+}
+
+// Compute the 5d free propagator of action D for every (spin, colour) column of
+// `source`, export the 4d solution into `propagator`, and print the
+// timeslice-summed conserved vector current correlator for mu = X, Y, Z.
+//
+// The printed "2 pi^2 t^3 C(t)" column is formed by multiplication (as in
+// MesonTrace) rather than as Ct/Cont, which divided by zero at t=0.
+template<class Action>
+void MasslessFreePropagator(Action &D,LatticePropagator &source,LatticePropagator &propagator)
+{			   
+  GridBase *UGrid = source.Grid();
+  GridBase *FGrid = D.FermionGrid();
+  RealD mass = D.Mass();
+  LatticeFermion src4  (UGrid);
+  LatticeFermion result4  (UGrid);
+  LatticeFermion result5(FGrid);
+  LatticeFermion src5(FGrid);
+  LatticePropagator prop5(FGrid);
+  // NOTE(review): the spin loop is bounded by Nd as in the original; confirm Nd equals the spin count here.
+  for(int s=0;s<Nd;s++){
+    for(int c=0;c<Nc;c++){
+ 
+      PropToFerm<Action>(src4,source,s,c);
+
+      D.ImportPhysicalFermionSource(src4,src5);
+      D.FreePropagator(src5,result5,mass,true);
+      std::cout<<GridLogMessage
+               <<"Free 5D prop spin "<<s<<" color "<<c
+               <<" norm2(src5d) "   <<norm2(src5)
+               <<" norm2(result5d) "<<norm2(result5)<<std::endl;
+
+      D.ExportPhysicalFermionSolution(result5,result4);
+
+      FermToProp<Action>(prop5,result5,s,c);
+      FermToProp<Action>(propagator,result4,s,c);
+    }
+  }
+
+  LatticePropagator Vector_mu(UGrid);
+  LatticeComplex    VV (UGrid);
+  std::vector<TComplex> sumVV;
+  Gamma::Algebra GammaV[3] = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ
+  };
+  for( int mu=0;mu<3;mu++ ) {
+    Gamma gV(GammaV[mu]);
+    D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu);
+    VV       = trace(gV*Vector_mu);     // (local) Vector-Vector conserved current
+    sliceSum(VV,sumVV,Tdir);
+    int Nt = sumVV.size();
+    for(int t=0;t<Nt;t++){
+      RealD Ct = real(TensorRemove(sumVV[t]))*LCscale;
+      RealD t3scale = 2 * M_PI *M_PI * t*t*t;   // 2 pi^2 t^3
+      RealD Cont=0;
+      if(t) Cont=1.0/t3scale;                   // reference value 1/(2 pi^2 t^3), left at 0 for t=0
+      std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct
+               << " 2 pi^2 t^3 C(t) "<< t3scale*Ct << " delta Ct "<< Ct-Cont <<std::endl;
+    }
+  }
+}
+// Compute the 4d free propagator of action D (FreePropagator called with its
+// last argument false) for every (spin, colour) column of `source`, storing the
+// columns in `propagator`. The unused local flag `fiveD` has been dropped.
+template<class Action>
+void MasslessFreePropagator1(Action &D,LatticePropagator &source,LatticePropagator &propagator)
+{			   
+  RealD mass = D.Mass();
+  GridBase *UGrid = source.Grid();
+  LatticeFermion src4  (UGrid); 
+  LatticeFermion result4  (UGrid); 
+  for(int s=0;s<Nd;s++){
+    for(int c=0;c<Nc;c++){
+      PropToFerm<Action>(src4,source,s,c);
+      D.FreePropagator(src4,result4,mass,false);
+      FermToProp<Action>(propagator,result4,s,c);
+    }
+  }
+}
+
+// Solve for the propagator of action D on `source`, one (spin, colour) column at
+// a time, using red-black Schur-preconditioned conjugate gradient
+// (tolerance 1.0e-10, at most 100000 iterations) from a zero initial guess.
+// The 4d solution goes to `propagator`. Afterwards the timeslice-summed PA, PP,
+// PJ5q and conserved-vector (X, Y, Z) correlators are printed.
+//
+// The printed "2 pi^2 t^3 C(t)" column is formed by multiplication (as in
+// MesonTrace) rather than as Ct/Cont, which divided by zero at t=0.
+template<class Action>
+void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
+{
+  GridBase *UGrid = D.GaugeGrid();
+  GridBase *FGrid = D.FermionGrid();
+
+  LatticeFermion src4  (UGrid); 
+  LatticeFermion src5  (FGrid); 
+  LatticeFermion result5(FGrid);
+  LatticeFermion result4(UGrid);
+  LatticePropagator prop5(FGrid);
+  
+  ConjugateGradient<LatticeFermion> CG(1.0e-10,100000);
+  SchurRedBlackDiagMooeeSolve<LatticeFermion> schur(CG);
+  ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
+  for(int s=0;s<Nd;s++){
+    for(int c=0;c<Nc;c++){
+      PropToFerm<Action>(src4,source,s,c);
+
+      D.ImportPhysicalFermionSource(src4,src5);
+
+      result5=Zero();
+      schur(D,src5,result5,ZG);
+      std::cout<<GridLogMessage
+	       <<"spin "<<s<<" color "<<c
+	       <<" norm2(src5d) "   <<norm2(src5)
+               <<" norm2(result5d) "<<norm2(result5)<<std::endl;
+
+      D.ExportPhysicalFermionSolution(result5,result4);
+
+      FermToProp<Action>(prop5,result5,s,c);
+      FermToProp<Action>(propagator,result4,s,c);
+    }
+  }
+  LatticePropagator Axial_mu(UGrid); 
+  LatticePropagator Vector_mu(UGrid); 
+
+  LatticeComplex    PA (UGrid); 
+  LatticeComplex    VV (UGrid); 
+  LatticeComplex    PJ5q(UGrid);
+  LatticeComplex    PP (UGrid);
+
+  std::vector<TComplex> sumPA;
+  std::vector<TComplex> sumVV;
+  std::vector<TComplex> sumPP;
+  std::vector<TComplex> sumPJ5q;
+
+  Gamma g5(Gamma::Algebra::Gamma5);
+  D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir);
+  PA       = trace(g5*Axial_mu);      // Pseudoscalar-Axial conserved current
+  sliceSum(PA,sumPA,Tdir);
+
+  // number of timeslices, taken from the first slice sum and reused below
+  int Nt{static_cast<int>(sumPA.size())};
+
+  for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PAc["<<t<<"] "<<real(TensorRemove(sumPA[t]))*LCscale<<std::endl;
+
+  PP       = trace(adj(propagator)*propagator); // Pseudoscalar density
+  sliceSum(PP,sumPP,Tdir);
+  for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PP["<<t<<"] "<<real(TensorRemove(sumPP[t]))*LCscale<<std::endl;
+  
+  D.ContractJ5q(prop5,PJ5q);
+  sliceSum(PJ5q,sumPJ5q,Tdir);
+  for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PJ5q["<<t<<"] "<<real(TensorRemove(sumPJ5q[t]))<<std::endl;
+
+  Gamma::Algebra GammaV[3] = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ
+  };
+  for( int mu=0;mu<3;mu++ ) {
+    Gamma gV(GammaV[mu]);
+    D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu);
+    //    auto ss=sliceSum(Vector_mu,Tdir);
+    //    for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"ss["<<mu<<"]["<<t<<"] "<<ss[t]<<std::endl;
+    VV       = trace(gV*Vector_mu);     // (local) Vector-Vector conserved current
+    sliceSum(VV,sumVV,Tdir);
+    for(int t=0;t<Nt;t++){
+      RealD Ct = real(TensorRemove(sumVV[t]))*LCscale;
+      RealD t3scale = 2 * M_PI *M_PI * t*t*t;   // 2 pi^2 t^3
+      RealD Cont=0;
+      if(t) Cont=1.0/t3scale;                   // reference value 1/(2 pi^2 t^3), left at 0 for t=0
+      std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct
+               << " 2 pi^2 t^3 C(t) "<< t3scale*Ct << " delta Ct "<< Ct-Cont <<std::endl;
+    }
+  }
+
+}
+
+// Serializable record of meson correlators written out by MesonTrace:
+// `data` holds one vector of Complex per channel, indexed by timeslice.
+class MesonFile: Serializable {
+public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector<std::vector<Complex> >, data);
+};
+
+// Contract propagators q1 and q2 into four meson channels
+// (gamma_x gamma_5, gamma_y gamma_5, gamma_z gamma_5, identity; same matrix at
+// source and sink), sum each over timeslices, print the correlator per t and
+// write all channels to `file` as XML under the tag "MesonFile".
+// The `phase` argument is not used by this function.
+void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
+{
+  const int nchannel=4;
+  // per channel: [0] is the source gamma, [1] is the sink gamma
+  Gamma::Algebra Gammas[nchannel][2] = {
+    {Gamma::Algebra::GammaXGamma5,Gamma::Algebra::GammaXGamma5},
+    {Gamma::Algebra::GammaYGamma5,Gamma::Algebra::GammaYGamma5},
+    {Gamma::Algebra::GammaZGamma5,Gamma::Algebra::GammaZGamma5},
+    {Gamma::Algebra::Identity,Gamma::Algebra::Identity}
+  };
+
+  MesonFile MF;
+  LatticeComplex correlator(q1.Grid());
+
+  for(int ch=0; ch<nchannel; ++ch){
+
+    Gamma Gsrc(Gammas[ch][0]);
+    Gamma Gsnk(Gammas[ch][1]);
+
+    correlator = trace(adj(q1)*Gsnk*q2*adj(Gsrc));
+
+    std::vector<TComplex> slice_sums;
+    sliceSum(correlator,slice_sums, Tdir);
+
+    const int nt = slice_sums.size();
+
+    std::vector<Complex> corr(nt);
+    for(int t=0; t<nt; ++t){
+      corr[t] = TensorRemove(slice_sums[t])*LLscale; // Yes this is ugly, not figured a work around
+      RealD Ct = real(corr[t]);
+      RealD t3scale = 2 * M_PI *M_PI * t*t*t;       // 2 pi^2 t^3
+      RealD Cont = 0;
+      if(t) Cont = 1.0/t3scale;
+      std::cout << " channel "<<ch<<" t "<<t<<" " <<Ct<< " 2 pi^2 t^3 C(t) "<< t3scale * Ct
+		<< " deltaC " <<Ct-Cont<<std::endl;
+    }
+    MF.data.push_back(corr);
+  }
+
+  {
+    // scoped so the writer is destroyed before the function returns
+    XmlWriter WR(file);
+    write(WR,"MesonFile",MF);
+  }
+}
+
+// Fetch a mandatory environment variable. Exits with a diagnostic when it is
+// unset, instead of handing a null pointer to atoi/atof.
+static const char *RequireEnv(const char *name)
+{
+  const char *value = getenv(name);
+  if ( value == nullptr ) {
+    std::cerr << "Required environment variable '" << name << "' is not set" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  return value;
+}
+
+// Driver: uses the NERSC configuration named by argv[1] if given, otherwise a
+// cold configuration with optional tadpole rescaling; builds a domain wall
+// action per mass, solves for a point-source propagator at the origin and
+// writes the meson correlators to XML.
+// Required environment variables: M5, mass, tadpole.
+int main (int argc, char ** argv)
+{
+  const int Ls=10;
+
+  Grid_init(&argc,&argv);
+
+  // Double precision grids
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), 
+								   GridDefaultSimd(Nd,vComplex::Nsimd()),
+								   GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  //////////////////////////////////////////////////////////////////////
+  // You can manage seeds however you like.
+  // Recommend SeedUniqueString.
+  //////////////////////////////////////////////////////////////////////
+  //  std::vector<int> seeds4({1,2,3,4}); 
+  //  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+
+  LatticeGaugeField Umu(UGrid);
+  std::string config;
+  RealD M5=atof(RequireEnv("M5"));
+  RealD mq = atof(RequireEnv("mass"));
+  // tadpole is an integer mode selector (compared against 1 and 2 below), so parse it as one
+  int   tadpole = atoi(RequireEnv("tadpole"));
+  std::vector<RealD> masses({ mq} ); // u/d, s, c ??
+  if( argc > 1 && argv[1][0] != '-' )
+  {
+    std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl;
+    FieldMetaData header;
+    NerscIO::readConfiguration(Umu, header, argv[1]);
+    config=argv[1];
+    LLscale = 1.0;
+    LCscale = 1.0;
+  }
+  else
+  {
+    SU<Nc>::ColdConfiguration(Umu);
+    config="ColdConfig";
+    //    RealD P=1.0; // Don't scale
+    //    RealD P=0.6388238 // 32Ifine
+    //    RealD P=0.6153342; // 64I
+    RealD P=0.5871119; // 48I
+    RealD u0 = sqrt(sqrt(P));
+    RealD w0 = 1 - M5;
+    std::cout<<GridLogMessage <<"For plaquette P="<<P<<" u0= "<<u0<<std::endl;
+    if ( tadpole == 1 ) {
+      // rescale the links by u0; correlator scales left at 1
+      Umu = Umu * u0;
+      //      LLscale = 1.0/(1-w0*w0)/(1-w0*w0)/u0/u0;
+      //      LCscale = 1.0/(1-w0*w0)/(1-w0*w0)/u0/u0;
+      LLscale = 1.0;
+      LCscale = 1.0;
+      std::cout<<GridLogMessage <<"Gauge links are u= u0 "<<std::endl;
+      std::cout<<GridLogMessage <<"M5 =  "<<M5<<std::endl;
+    } else if ( tadpole == 2) {
+      // unit links, no rescaling at all
+      std::cout<<GridLogMessage <<"Gauge links are u=1 "<<std::endl;
+      LLscale = 1.0;
+      LCscale = 1.0;
+      std::cout<<GridLogMessage <<"M5 =  "<<M5<<std::endl;
+    } else {
+      // unit links; shift M5 by -4(1-u0) and scale correlators by 1/u0^2
+      LLscale = 1.0/u0/u0;
+      LCscale = 1.0/u0/u0;
+      M5 = M5 - 4.0 * (1-u0);
+      std::cout<<GridLogMessage <<"Gauge links are u=1 "<<std::endl;
+      std::cout<<GridLogMessage <<"M5mf =  "<<M5<<std::endl;
+    }
+    std::cout<<GridLogMessage <<"mq =  "<<mq<<std::endl;
+    std::cout<<GridLogMessage <<"LLscale =  "<<LLscale<<std::endl;
+    std::cout<<GridLogMessage <<"LCscale =  "<<LCscale<<std::endl;
+  }
+
+  int nmass = masses.size();
+
+  typedef DomainWallFermionR FermionActionR;
+  //  typedef MobiusFermionR FermionActionR;
+  std::vector<FermionActionR *> FermActs;
+  std::vector<DomainWallFermionR *> DWFActs;
+  
+  std::cout<<GridLogMessage <<"======================"<<std::endl;
+  std::cout<<GridLogMessage <<"DomainWallFermion action"<<std::endl;
+  std::cout<<GridLogMessage <<"======================"<<std::endl;
+
+  for(auto mass: masses) {
+    std::vector<Complex> boundary = {1,1,1,-1};
+    FermionActionR::ImplParams Params(boundary);
+    RealD b=1.5;
+    RealD c=0.5;
+    std::cout<<GridLogMessage <<"Making DomainWallFermion action"<<std::endl;
+    //    DWFActs.push_back(new DomainWallFermionR(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5));
+    FermActs.push_back(new FermionActionR(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,Params));
+    //    FermActs.push_back(new FermionActionR(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass+0.001,M5,b,c));
+    std::cout<<GridLogMessage <<"Made DomainWallFermion action"<<std::endl;
+  }
+
+  LatticePropagator point_source(UGrid);
+
+  Coordinate Origin({0,0,0,0});
+  PointSource   (Origin,point_source);
+  
+  std::vector<LatticePropagator> PointProps(nmass,UGrid);
+  //  std::vector<LatticePropagator> FreeProps(nmass,UGrid);
+  //  LatticePropagator delta(UGrid);
+
+  for(int m=0;m<nmass;m++) {
+    Solve(*FermActs[m],point_source   ,PointProps[m]);
+    //    MasslessFreePropagator(*FermActs[m],point_source   ,FreeProps[m]);
+
+    //    delta = PointProps[m] - FreeProps[m];
+    //    std::cout << " delta "<<norm2(delta) << " FFT "<<norm2(FreeProps[m])<< " CG " <<norm2(PointProps[m])<<std::endl;
+  }
+
+  LatticeComplex phase(UGrid);
+  Coordinate mom({0,0,0,0});
+  MakePhase(mom,phase);
+  
+  for(int m1=0 ;m1<nmass;m1++) {
+  for(int m2=m1;m2<nmass;m2++) {
+    std::stringstream ssp,ssg,ssz;
+
+    ssp<<config<< "_m" << m1 << "_m"<< m2 << "_point_meson.xml";
+    ssz<<config<< "_m" << m1 << "_m"<< m2 << "_free_meson.xml";
+
+    std::cout << "CG determined VV correlation function"<<std::endl;
+    MesonTrace(ssp.str(),PointProps[m1],PointProps[m2],phase);
+    
+    //    std::cout << "FFT derived VV correlation function"<<std::endl;
+    //    MesonTrace(ssz.str(),FreeProps[m1],FreeProps[m2],phase);
+  }}
+
+  Grid_finalize();
+}
+
+
+
--- a/tests/IO/Test_field_array_io.cc
+++ b/tests/IO/Test_field_array_io.cc
@ -0,0 +1,184 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/IO/Test_field_array_io.cc
+
+    Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+
+//This test demonstrates and checks a single-file write of an arbitrary array of fields
+
+//Write the three-line text header (field count, hex checksum, format string) at the start of an
+//existing file and return the stream position just past it, i.e. the offset of the binary payload.
+//The count and the checksum are both padded to a width of 10, so for a given format string the
+//header has the same length on every call; writeFieldArray relies on this to overwrite the header
+//in place once the checksum is known.
+//The file is opened with in|out, which does not create it: it must already exist.
+uint64_t writeHeader(const uint32_t size, const uint32_t checksum, const std::string &format, const std::string &file){
+  std::ofstream fout(file,std::ios::out|std::ios::in);
+  //A failed open would otherwise go unnoticed: tellp() would return -1, which the uint64_t return
+  //type turns into a huge bogus data offset
+  assert(fout.is_open() && "writeHeader: could not open file for header write");
+  fout.seekp(0,std::ios::beg);
+  fout << std::setw(10) << size << std::endl;
+  fout << std::hex << std::setw(10) << checksum << std::endl;
+  fout << format << std::endl;
+  return fout.tellp();
+}
+ 
+//Parse the three-line text header produced by writeHeader: a decimal field count, a hexadecimal
+//checksum and the format string (with whitespace removed). Returns the stream position after the
+//third line, which is where the binary payload starts.
+uint64_t readHeader(uint32_t &size, uint32_t &checksum, std::string &format, const std::string &file){
+  std::ifstream header_in(file);
+  std::string text;
+
+  //Line 1: number of fields, decimal
+  std::getline(header_in, text);
+  std::stringstream size_stream;
+  size_stream << text;
+  size_stream >> size;
+
+  //Line 2: checksum, hexadecimal
+  std::getline(header_in, text);
+  std::stringstream csum_stream;
+  csum_stream << text;
+  csum_stream >> std::hex >> checksum;
+
+  //Line 3: format string
+  std::getline(header_in, format);
+  removeWhitespace(format);
+
+  return header_in.tellg();
+}
+ 
+//Write an array of lattice fields to a single file: a text header (see writeHeader) followed by the
+//fields stored back to back, each occupying gSites()*sizeof(scalar_object) bytes.
+//The header is written twice: first with a zero checksum to establish the data offset, then again
+//with the combined checksum once every field has been written.
+//Requires a non-empty array whose fields all live on the same Grid.
+template<typename FieldType>
+void writeFieldArray(const std::string &file, const std::vector<FieldType> &data){
+  typedef typename FieldType::vector_object vobj;
+  typedef typename FieldType::scalar_object sobj;
+  assert(data.size() > 0); //data[0] is dereferenced below; same precondition as readFieldArray
+  GridBase* grid = data[0].Grid(); //assume all fields have the same Grid
+  BinarySimpleMunger<sobj, sobj> munge; //straight copy
+
+  //We need a 2-pass header write, first to establish the size, the second pass writes the checksum
+  std::string format = getFormatString<typename FieldType::vector_object>();
+
+  uint64_t offset = 0; //set by the boss from the header length; every other rank receives it in the broadcast
+  if ( grid->IsBoss() ) { 
+    NerscIO::truncate(file);
+    offset = writeHeader(data.size(), 0, format, file);
+  }
+  grid->Broadcast(0,(void *)&offset,sizeof(offset)); //use as a barrier
+
+  std::cout << "Data offset write " << offset << std::endl;
+  std::cout << "Data size write " << data.size() << std::endl;
+  uint64_t field_size = uint64_t(grid->gSites()) * sizeof(sobj);
+  std::cout << "Field size = " << field_size << " B" << std::endl;
+
+  uint32_t checksum = 0;
+  for(size_t i=0;i<data.size();i++){
+    std::cout << "Data field write " << i << " offset " << offset << std::endl;
+    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
+    //const_cast: writeLatticeObject is handed a non-const field reference here
+    BinaryIO::writeLatticeObject<vobj,sobj>(const_cast<FieldType &>(data[i]),file,munge,offset,format,
+					    nersc_csum,scidac_csuma,scidac_csumb);
+    offset += field_size;
+    //Fold the per-field checksum into the running value; readFieldArray repeats the same update
+    checksum ^= nersc_csum + 0x9e3779b9 + (checksum<<6) + (checksum>>2);
+  }
+  std::cout << "Write checksum " << checksum << std::endl;
+
+  //Second header pass: overwrite the placeholder checksum with the real one
+  if ( grid->IsBoss() ) { 
+    writeHeader(data.size(), checksum, format, file);
+  }
+}
+
+
+//Read back an array of lattice fields written by writeFieldArray.
+//data must already hold as many fields as the header records (asserted), all on the same Grid.
+//The combined checksum is recomputed while reading and asserted equal to the header value.
+template<typename FieldType>
+void readFieldArray(std::vector<FieldType> &data, const std::string &file){
+  typedef typename FieldType::vector_object vobj;
+  typedef typename FieldType::scalar_object sobj;
+  assert(data.size() > 0);
+  GridBase* grid = data[0].Grid(); //assume all fields have the same Grid
+  BinarySimpleUnmunger<sobj, sobj> munge; //straight copy
+
+  //Header: field count, expected checksum, format string; the return value is the payload offset
+  uint32_t hdr_checksum, hdr_size;
+  std::string format;
+  uint64_t offset = readHeader(hdr_size, hdr_checksum, format, file);
+
+  std::cout << "Data offset read " << offset << std::endl;
+  std::cout << "Data size read " << hdr_size << std::endl;
+  assert(data.size() == hdr_size);
+
+  //Each field occupies this many bytes in the file
+  const uint64_t bytes_per_field = uint64_t(grid->gSites()) * sizeof(sobj);
+
+  uint32_t running_csum = 0;
+  for(int f=0;f<data.size();f++){
+    std::cout << "Data field read " << f << " offset " << offset << std::endl;
+    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
+    BinaryIO::readLatticeObject<vobj,sobj>(data[f],file,munge,offset,format,
+					   nersc_csum,scidac_csuma,scidac_csumb);
+    offset += bytes_per_field;
+    //Same fold as used on the write side
+    const uint32_t mix = nersc_csum + 0x9e3779b9 + (running_csum<<6) + (running_csum>>2);
+    running_csum ^= mix;
+  }
+
+  std::cout << "Header checksum " << hdr_checksum << std::endl;
+  std::cout << "Read checksum " << running_csum << std::endl;
+
+  assert( hdr_checksum == running_csum );
+}
+
+
+
+
+//Round-trip check: fill an array of 5D fermion fields with Gaussian noise, write them to one file,
+//read them back into a second array and print norm2 of the difference for every field.
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  Coordinate latt   = GridDefaultLatt();
+  Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  Coordinate mpi_layout  = GridDefaultMpi();
+
+  const int Ls=8;
+
+  //4D gauge grids and the 5D grids built on top of them
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt, simd_layout, mpi_layout);
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  //Fixed seeds so the generated data are reproducible
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  GridParallelRNG RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+
+  typedef DomainWallFermionD::FermionField FermionField;
+
+  //Source data: nfield random 5D fields
+  int nfield = 20;
+  std::vector<FermionField> data(nfield, FGrid);
+  for(auto &field : data){
+    gaussian(RNG5, field);
+  }
+
+  //Write, then read back into a fresh array
+  std::string file = "test_field_array_io.0";
+  writeFieldArray(file, data);
+
+  std::vector<FermionField> data_r(nfield, FGrid);
+  readFieldArray(data_r, file);
+
+  //Report how far each loaded field is from the one that was stored
+  for(int idx=0;idx<nfield;idx++){
+    FermionField diff = data_r[idx] - data[idx];
+    RealD norm_diff = norm2(diff);
+    std::cout << "Norm2 of difference between stored and loaded data index " << idx << " : " << norm_diff << std::endl;
+  }
+
+  std::cout << "Done" << std::endl;
+
+  Grid_finalize();
+}
--- a/tests/lanczos/Test_compressed_lanczos_gparity.cc
+++ b/tests/lanczos/Test_compressed_lanczos_gparity.cc
@ -0,0 +1,485 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/lanczos/Test_compressed_lanczos_gparity.cc
+
+    Copyright (C) 2017
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+Author: Leans heavily on Christoph Lehner's code
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+/*
+ *  Reimplement the badly named "multigrid" lanczos as compressed Lanczos using the features 
+ *  in Grid that were intended to be used to support blocked Aggregates.
+ */
+#include <Grid/Grid.h>
+#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
+#include <Grid/algorithms/iterative/LocalCoherenceLanczos.h>
+
+using namespace std;
+using namespace Grid;
+
+//Read a gauge configuration through NerscIO into U, using GaugeStatistics<ConjugateGimplD> for the
+//header checks.
+//For configurations flagged as CPS-generated, where the plaquette stored in the metadata is wrong by a
+//factor of 2, the exit-on-plaquette-mismatch switch is turned off for the read and turned back on afterwards.
+//NOTE(review): the previous comment also mentioned manually seeding the RNG; nothing in this function does that.
+void readConfiguration(LatticeGaugeFieldD &U,
+		       const std::string &config,
+		       bool is_cps_cfg = false){
+  typedef GaugeStatistics<ConjugateGimplD> GaugeStats;
+
+  const bool relax_plaquette_check = is_cps_cfg;
+  if(relax_plaquette_check){
+    NerscIO::exitOnReadPlaquetteMismatch() = false;
+  }
+
+  FieldMetaData header;
+  NerscIO::readConfiguration<GaugeStats>(U, header, config);
+
+  //Note: the switch is set to true here rather than restored to whatever it held before
+  if(relax_plaquette_check){
+    NerscIO::exitOnReadPlaquetteMismatch() = true;
+  }
+}
+
+//Lanczos parameters in CPS conventions
+struct CPSLanczosParams : Serializable {
+public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(CPSLanczosParams,
+				  RealD, alpha,
+				  RealD, beta,
+				  int, ch_ord,
+				  int, N_use,
+				  int, N_get,
+				  int, N_true_get,
+				  RealD, stop_rsd,
+				  int, maxits);
+
+  //Translate to Grid's ChebyParams: the lower bound is beta^2, the upper bound is alpha^2 and the
+  //polynomial has ch_ord+1 terms
+  ChebyParams getChebyParams() const{
+    ChebyParams cheby;
+    cheby.Npoly = ch_ord+1;
+    cheby.alpha = beta*beta; //aka lo
+    cheby.beta = alpha*alpha; //aka hi
+    return cheby;
+  }
+
+  //Translate the CPS vector counts to the Nstop / Nk / Nm passed to calcFine and calcCoarse
+  int Nstop() const{
+    return N_true_get;
+  }
+  int Nk() const{
+    return N_get;
+  }
+  int Nm() const{
+    return N_use;
+  }
+};
+
+//Maybe this class should be in the main library?
+//LocalCoherenceLanczos extended with checkpointing of the fine and coarse eigenvectors through the
+//Scidac reader/writer, and of the corresponding eigenvalues through XML files.
+template<class Fobj,class CComplex,int nbasis>
+class LocalCoherenceLanczosScidac : public LocalCoherenceLanczos<Fobj,CComplex,nbasis>
+{
+public:
+  using CoarseSiteVector = iVector<CComplex,nbasis >;
+  using CoarseField      = Lattice<CoarseSiteVector>;
+  using CoarseScalar     = Lattice<CComplex>; // used for inner products on fine field
+  using FineField        = Lattice<Fobj>;
+
+  //Forwards every argument unchanged to the LocalCoherenceLanczos constructor
+  LocalCoherenceLanczosScidac(GridBase *FineGrid,GridBase *CoarseGrid,
+			      LinearOperatorBase<FineField> &FineOp,
+			      int checkerboard)
+    : LocalCoherenceLanczos<Fobj,CComplex,nbasis>(FineGrid,CoarseGrid,FineOp,checkerboard)
+  {}
+
+  //Write the nbasis fine-grid vectors to evecs_file (one Scidac record each) and the fine
+  //eigenvalues to evals_file
+  void checkpointFine(std::string evecs_file,std::string evals_file)
+  {
+    assert(this->subspace.size()==nbasis);
+    emptyUserRecord record;
+    Grid::ScidacWriter evec_writer(this->_FineGrid->IsBoss());
+    evec_writer.open(evecs_file);
+    for(int ivec=0;ivec<nbasis;ivec++) {
+      evec_writer.writeScidacFieldRecord(this->subspace[ivec],record);
+    }
+    evec_writer.close();
+
+    XmlWriter eval_writer(evals_file);
+    write(eval_writer,"evals",this->evals_fine);
+  }
+
+  //Load fine eigenvalues and eigenvectors written by checkpointFine.
+  //The files may hold more than nbasis entries: surplus eigenvalues are dropped and only the first
+  //nbasis vector records are read. Fewer than nbasis eigenvalues trips an assert.
+  void checkpointFineRestore(std::string evecs_file,std::string evals_file)
+  {
+    this->evals_fine.resize(nbasis);
+    this->subspace.resize(nbasis,this->_FineGrid);
+
+    std::cout << GridLogIRL<< "checkpointFineRestore:  Reading evals from "<<evals_file<<std::endl;
+    XmlReader eval_reader(evals_file);
+    read(eval_reader,"evals",this->evals_fine);
+
+    if(this->evals_fine.size() < nbasis){
+      assert(0 && "Not enough fine evals to complete basis");
+    }else if(this->evals_fine.size() > nbasis){ //allow the use of precomputed evecs with a larger #evecs
+      std::cout << GridLogMessage << "Truncating " << this->evals_fine.size() << " evals to basis size " << nbasis << std::endl;
+      this->evals_fine.resize(nbasis);
+    }
+
+    std::cout << GridLogIRL<< "checkpointFineRestore:  Reading evecs from "<<evecs_file<<std::endl;
+    emptyUserRecord record;
+    Grid::ScidacReader evec_reader ;
+    evec_reader.open(evecs_file);
+    for(int ivec=0;ivec<nbasis;ivec++) {
+      //The checkerboard is set on the destination before each record is read
+      this->subspace[ivec].Checkerboard()=this->_checkerboard;
+      evec_reader.readScidacFieldRecord(this->subspace[ivec],record);
+    }
+    evec_reader.close();
+  }
+
+  //Write every coarse eigenvector to evecs_file and the coarse eigenvalues to evals_file
+  void checkpointCoarse(std::string evecs_file,std::string evals_file)
+  {
+    int ncoarse = this->evec_coarse.size();
+    emptyUserRecord record;
+    Grid::ScidacWriter evec_writer(this->_CoarseGrid->IsBoss());
+    evec_writer.open(evecs_file);
+    for(int ivec=0;ivec<ncoarse;ivec++) {
+      evec_writer.writeScidacFieldRecord(this->evec_coarse[ivec],record);
+    }
+    evec_writer.close();
+
+    XmlWriter eval_writer(evals_file);
+    write(eval_writer,"evals",this->evals_coarse);
+  }
+
+  //Load nvec coarse eigenvectors and their eigenvalues; the eigenvalue file must hold exactly nvec entries
+  void checkpointCoarseRestore(std::string evecs_file,std::string evals_file,int nvec)
+  {
+    std::cout << "resizing coarse vecs to " << nvec<< std::endl;
+    this->evals_coarse.resize(nvec);
+    this->evec_coarse.resize(nvec,this->_CoarseGrid);
+
+    std::cout << GridLogIRL<< "checkpointCoarseRestore:  Reading evals from "<<evals_file<<std::endl;
+    XmlReader eval_reader(evals_file);
+    read(eval_reader,"evals",this->evals_coarse);
+    assert(this->evals_coarse.size()==nvec);
+
+    emptyUserRecord record;
+    std::cout << GridLogIRL<< "checkpointCoarseRestore:  Reading evecs from "<<evecs_file<<std::endl;
+    Grid::ScidacReader evec_reader ;
+    evec_reader.open(evecs_file);
+    for(int ivec=0;ivec<nvec;ivec++) {
+      evec_reader.readScidacFieldRecord(this->evec_coarse[ivec],record);
+    }
+    evec_reader.close();
+  }
+};
+
+//Run-time options of the test together with their default values
+struct Options{
+  std::vector<int> blockSize;   //block extents, 5 entries
+  std::vector<int> GparityDirs; //3 entries, 1 for each GP direction
+  int Ls;
+  RealD mass;
+  RealD M5;
+  RealD mobius_scale;
+  std::string config;
+  bool is_cps_cfg;
+
+  double coarse_relax_tol;
+  int smoother_ord;
+
+  //Lanczos parameters (CPS conventions) for the fine and coarse solves
+  CPSLanczosParams fine;
+  CPSLanczosParams coarse;
+
+  //Checkpoint switches and the filename stubs that go with them
+  bool write_fine = false;
+  std::string write_fine_file;
+
+  bool read_fine = false;
+  std::string read_fine_file;
+
+  bool write_coarse = false;
+  std::string write_coarse_file;
+
+  bool read_coarse = false;
+  std::string read_coarse_file;
+
+
+  Options()
+    : blockSize{2,2,2,2,2},
+      GparityDirs{1,1,1},
+      Ls(12),
+      mass(0.01),
+      M5(1.8),
+      mobius_scale(2.0),
+      is_cps_cfg(false),
+      coarse_relax_tol(1e5),
+      smoother_ord(20)
+  {
+    //The fine and coarse defaults differ only in the number of vectors used and kept
+    auto setIRLDefaults = [](CPSLanczosParams &irl, int n_use, int n_get){
+      irl.alpha = 2;
+      irl.beta = 0.1;
+      irl.ch_ord = 100;
+      irl.N_use = n_use;
+      irl.N_get = n_get;
+      irl.N_true_get = n_get;
+      irl.stop_rsd = 1e-8;
+      irl.maxits = 10000;
+    };
+    setIRLDefaults(fine,   70,  60);
+    setIRLDefaults(coarse, 200, 190);
+  }
+};  
+
+//Run the compressed Lanczos for a basis of nbasis fine vectors: set up the G-parity boundary
+//conditions, read the gauge field, build the SchurDiagTwoOperator on a GparityMobiusFermionD action,
+//compute (or restore) the fine and coarse eigenvectors, and finally print the residual of every
+//reconstructed eigenvector before and after Chebyshev smoothing.
+template<int nbasis>
+void runTest(const Options &opt){
+  //Fine grids
+  GridCartesian         * UGrid     = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),  GridDefaultSimd(Nd,vComplex::Nsimd()),   GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid   = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid     = SpaceTimeGrid::makeFiveDimGrid(opt.Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(opt.Ls,UGrid);
+
+  //Setup G-parity BCs
+  assert(Nd == 4);
+  std::vector<int> gaugeBC(4);
+  for(int mu=0;mu<3;mu++){
+    gaugeBC[mu] = opt.GparityDirs[mu];
+  }
+  gaugeBC[3] = 0; //periodic gauge BC in time
+
+  std::cout << GridLogMessage << "Gauge BCs: " << gaugeBC << std::endl;
+  ConjugateGimplD::setDirections(gaugeBC); //gauge BC
+
+  GparityWilsonImplD::ImplParams Params;
+  for(int mu=0;mu<Nd-1;mu++){
+    Params.twists[mu] = opt.GparityDirs[mu]; //G-parity directions
+  }
+  Params.twists[Nd-1] = 1; //APBC in time direction
+  std::cout << GridLogMessage << "Fermion BCs: " << Params.twists << std::endl;
+
+  //Read the gauge field
+  LatticeGaugeField Umu(UGrid);
+  readConfiguration(Umu, opt.config, opt.is_cps_cfg);
+
+  //Setup the coarse grids: every fine extent must be an exact multiple of its block extent
+  auto fineLatt     = GridDefaultLatt();
+  Coordinate coarseLatt(4);
+  for (int d=0;d<4;d++){
+    coarseLatt[d] = fineLatt[d]/opt.blockSize[d];
+    assert(coarseLatt[d]*opt.blockSize[d]==fineLatt[d]);
+  }
+
+  std::cout << GridLogMessage<< " 5d coarse lattice is ";
+  for (int d=0;d<4;d++){
+    std::cout << coarseLatt[d]<<"x";
+  }
+  int cLs = opt.Ls/opt.blockSize[4];
+  assert(cLs*opt.blockSize[4]==opt.Ls);
+  std::cout << cLs<<std::endl;
+
+  GridCartesian         * CoarseGrid4    = SpaceTimeGrid::makeFourDimGrid(coarseLatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * CoarseGrid4rb  = SpaceTimeGrid::makeFourDimRedBlackGrid(CoarseGrid4);
+  GridCartesian         * CoarseGrid5    = SpaceTimeGrid::makeFiveDimGrid(cLs,CoarseGrid4);
+
+  //Dirac operator: recover b and c from b+c = mobius_scale and b-c = 1
+  double bmc =  1.;
+  double b = (opt.mobius_scale + bmc)/2.;  // b = 1/2 [ (b+c) + (b-c) ]
+  double c = (opt.mobius_scale - bmc)/2.;  // c = 1/2 [ (b+c) - (b-c) ]
+
+  GparityMobiusFermionD action(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, opt.mass, opt.M5, b,c,Params);
+  typedef GparityMobiusFermionD::FermionField FermionField;
+
+  SchurDiagTwoOperator<GparityMobiusFermionD, FermionField> SchurOp(action);
+
+  typedef GparityWilsonImplD::SiteSpinor SiteSpinor;
+
+  const CPSLanczosParams &fineIRL = opt.fine;
+  const CPSLanczosParams &coarseIRL = opt.coarse;
+
+  std::cout << GridLogMessage << "Keep " << fineIRL.N_true_get   << " fine   vectors" << std::endl;
+  std::cout << GridLogMessage << "Keep " << coarseIRL.N_true_get << " coarse vectors" << std::endl;
+  assert(coarseIRL.N_true_get >= fineIRL.N_true_get);
+
+  assert(nbasis<=fineIRL.N_true_get);
+  LocalCoherenceLanczosScidac<SiteSpinor,vTComplex,nbasis> lanczos(FrbGrid,CoarseGrid5,SchurOp,Odd);
+  std::cout << GridLogMessage << "Constructed LocalCoherenceLanczos" << std::endl;
+
+  //Compute and/or read fine evecs
+  if(opt.read_fine){
+    lanczos.checkpointFineRestore(opt.read_fine_file + "_evecs.scidac", opt.read_fine_file + "_evals.xml");
+  }else{
+    std::cout << GridLogMessage << "Performing fine grid IRL" << std::endl;
+    std::cout << GridLogMessage << "Using Chebyshev alpha=" << fineIRL.alpha << " beta=" << fineIRL.beta << " ord=" << fineIRL.ch_ord << std::endl;
+    lanczos.calcFine(fineIRL.getChebyParams(),
+		     fineIRL.Nstop(),fineIRL.Nk(),fineIRL.Nm(),
+		     fineIRL.stop_rsd,fineIRL.maxits,0,0);
+    if(opt.write_fine){
+      std::cout << GridLogIRL<<"Checkpointing Fine evecs"<<std::endl;
+      lanczos.checkpointFine(opt.write_fine_file + "_evecs.scidac", opt.write_fine_file + "_evals.xml");
+    }
+  }
+
+  //Block orthonormalise (this should be part of calcFine?)
+  std::cout << GridLogIRL<<"Orthogonalising"<<std::endl;
+  lanczos.Orthogonalise();
+  std::cout << GridLogIRL<<"Orthogonaled"<<std::endl;
+
+  //The smoother reuses the fine Chebyshev bounds with its own polynomial order
+  ChebyParams smoother = fineIRL.getChebyParams();
+  smoother.Npoly = opt.smoother_ord+1;
+
+  //Compute and/or read coarse evecs
+  if(opt.read_coarse){
+    lanczos.checkpointCoarseRestore(opt.read_coarse_file + "_evecs.scidac", opt.read_coarse_file + "_evals.xml",coarseIRL.Nstop());
+  }else{
+    std::cout << GridLogMessage << "Performing coarse grid IRL" << std::endl;
+    std::cout << GridLogMessage << "Using Chebyshev alpha=" << coarseIRL.alpha << " beta=" << coarseIRL.beta << " ord=" << coarseIRL.ch_ord << std::endl;
+    lanczos.calcCoarse(coarseIRL.getChebyParams(), smoother, opt.coarse_relax_tol,
+		       coarseIRL.Nstop(), coarseIRL.Nk() ,coarseIRL.Nm(),
+		       coarseIRL.stop_rsd, coarseIRL.maxits,
+		       0,0);
+
+    if(opt.write_coarse){
+      std::cout << GridLogIRL<<"Checkpointing Coarse evecs"<<std::endl;
+      lanczos.checkpointCoarse(opt.write_coarse_file + "_evecs.scidac", opt.write_coarse_file + "_evals.xml");
+    }
+  }
+
+  //Test the eigenvectors
+  //To remove high-frequency noise we apply a Chebyshev smoothing
+  Chebyshev<FermionField> cheb_smoother(smoother);
+
+  FermionField evec(FrbGrid);
+  FermionField evec_sm(FrbGrid); //smoothed
+  FermionField resid(FrbGrid);   //scratch: holds Op*v and then Op*v - eval*v
+  RealD eval;
+
+  for(int iev=0;iev<coarseIRL.N_true_get;iev++){
+    lanczos.getFineEvecEval(evec, eval, iev);
+
+    //Check unsmoothed evec
+    SchurOp.HermOp(evec, resid);
+    resid = resid - eval*evec;
+    RealD norm_unsmoothed = sqrt(norm2(resid));
+
+    //Check smoothed evec
+    cheb_smoother(SchurOp, evec, evec_sm);
+    SchurOp.HermOp(evec_sm, resid);
+    resid = resid - eval*evec_sm;
+    RealD norm_smoothed = sqrt(norm2(resid));
+
+    std::cout << GridLogMessage << "Eval " << eval << " unsmoothed resid " << norm_unsmoothed << " smoothed resid " << norm_smoothed << std::endl;
+  }
+}
+
+
+//Note:  because we rely upon physical properties we must use a "real" gauge configuration
+//
+//Command line: <exe> <config> <gparity dirs> <options>
+//Parses the options into an Options object and dispatches to runTest<nbasis> for the requested basis
+//size. Prints the usage text and returns 1 when fewer than two positional arguments are given or
+//when an option that needs a value is the last argument.
+int main (int argc, char ** argv) {
+  Grid_init(&argc,&argv);
+  GridLogIRL.TimingMode(1);
+
+  Options opt;
+  int basis_size = 100;
+  
+  if(argc < 3){
+    std::cout << GridLogMessage << "Usage: <exe> <config> <gparity dirs> <options>" << std::endl;
+    std::cout << GridLogMessage << "<gparity dirs> should have the format a.b.c where a,b,c are 0,1 depending on whether there are G-parity BCs in that direction" << std::endl;
+    std::cout << GridLogMessage << "Options:" << std::endl;
+    std::cout << GridLogMessage << "--Ls <value> : Set Ls (default 12)" << std::endl;
+    std::cout << GridLogMessage << "--mass <value> : Set the mass (default 0.01)" << std::endl;
+    std::cout << GridLogMessage << "--mobius_scale <value> : Set the Mobius scale b+c (default 2.0)" << std::endl;
+    std::cout << GridLogMessage << "--block <value> : Set the block size. Format should be a.b.c.d.e where a-e are the block extents  (default 2.2.2.2.2)" << std::endl;
+    std::cout << GridLogMessage << "--is_cps_cfg : Indicate that the configuration was generated with CPS where until recently the stored plaquette was wrong by a factor of 2" << std::endl;
+    std::cout << GridLogMessage << "--write_irl_templ: Write a template for the parameters file of the Lanczos to \"irl_templ.xml\"" << std::endl;
+    std::cout << GridLogMessage << "--read_irl_fine <filename>: Read the parameters file for the fine Lanczos" << std::endl;
+    std::cout << GridLogMessage << "--read_irl_coarse <filename>: Read the parameters file for the coarse Lanczos" << std::endl;
+    std::cout << GridLogMessage << "--write_fine <filename stub>: Write fine evecs/evals to filename starting with the stub" << std::endl;
+    std::cout << GridLogMessage << "--read_fine <filename stub>: Read fine evecs/evals from filename starting with the stub" << std::endl;
+    std::cout << GridLogMessage << "--write_coarse <filename stub>: Write coarse evecs/evals to filename starting with the stub" << std::endl;
+    std::cout << GridLogMessage << "--read_coarse <filename stub>: Read coarse evecs/evals from filename starting with the stub" << std::endl;
+    std::cout << GridLogMessage << "--smoother_ord :  Set the Chebyshev order of the smoother (default 20)" << std::endl;
+    std::cout << GridLogMessage << "--coarse_relax_tol : Set the relaxation parameter for evaluating the residual of the reconstructed eigenvectors outside of the basis (default 1e5)" << std::endl;
+    std::cout << GridLogMessage << "--basis_size : Select the basis size from 100,200,300,350 (default 100)" << std::endl;
+    Grid_finalize();
+    return 1;
+  }
+  opt.config = argv[1];
+  GridCmdOptionIntVector(argv[2], opt.GparityDirs);
+  assert(opt.GparityDirs.size() == 3);
+
+  for(int i=3;i<argc;i++){
+    std::string sarg = argv[i];
+
+    //Every option listed here reads argv[i+1]. argv[argc] is a null pointer, so a value-taking
+    //option given as the last argument must be rejected before it is dereferenced.
+    const bool takes_value =
+      sarg == "--Ls"            || sarg == "--mass"             || sarg == "--block"        ||
+      sarg == "--read_irl_fine" || sarg == "--read_irl_coarse"  ||
+      sarg == "--write_fine"    || sarg == "--read_fine"        ||
+      sarg == "--write_coarse"  || sarg == "--read_coarse"      ||
+      sarg == "--smoother_ord"  || sarg == "--coarse_relax_tol" ||
+      sarg == "--mobius_scale"  || sarg == "--basis_size";
+    if(takes_value && i+1 >= argc){
+      std::cout << GridLogMessage << "Option " << sarg << " requires a value" << std::endl;
+      Grid_finalize();
+      return 1;
+    }
+
+    if(sarg == "--Ls"){
+      opt.Ls = std::stoi(argv[i+1]);
+      std::cout << GridLogMessage << "Set Ls to " << opt.Ls << std::endl;
+    }else if(sarg == "--mass"){
+      std::istringstream ss(argv[i+1]); ss >> opt.mass;
+      std::cout << GridLogMessage << "Set quark mass to " << opt.mass << std::endl;
+    }else if(sarg == "--block"){
+      GridCmdOptionIntVector(argv[i+1], opt.blockSize);
+      assert(opt.blockSize.size() == 5);
+      std::cout << GridLogMessage << "Set block size to ";
+      for(int q=0;q<5;q++) std::cout << opt.blockSize[q] << " ";
+      std::cout << std::endl;      
+    }else if(sarg == "--is_cps_cfg"){
+      opt.is_cps_cfg = true;
+    }else if(sarg == "--write_irl_templ"){
+      //Dump the current fine parameters as a template and stop without running the test
+      XmlWriter writer("irl_templ.xml");
+      write(writer,"Params", opt.fine);
+      Grid_finalize();
+      return 0;
+    }else if(sarg == "--read_irl_fine"){
+      std::cout << GridLogMessage << "Reading fine IRL params from " << argv[i+1] << std::endl;
+      XmlReader reader(argv[i+1]);
+      read(reader, "Params", opt.fine);
+    }else if(sarg == "--read_irl_coarse"){
+      std::cout << GridLogMessage << "Reading coarse IRL params from " << argv[i+1] << std::endl;
+      XmlReader reader(argv[i+1]);
+      read(reader, "Params", opt.coarse);
+    }else if(sarg == "--write_fine"){
+      opt.write_fine = true;
+      opt.write_fine_file = argv[i+1];
+    }else if(sarg == "--read_fine"){
+      opt.read_fine = true;
+      opt.read_fine_file = argv[i+1];
+    }else if(sarg == "--write_coarse"){
+      opt.write_coarse = true;
+      opt.write_coarse_file = argv[i+1];
+    }else if(sarg == "--read_coarse"){
+      opt.read_coarse = true;
+      opt.read_coarse_file = argv[i+1];
+    }else if(sarg == "--smoother_ord"){
+      std::istringstream ss(argv[i+1]); ss >> opt.smoother_ord;
+      std::cout << GridLogMessage << "Set smoother order to " << opt.smoother_ord << std::endl;
+    }else if(sarg == "--coarse_relax_tol"){
+      std::istringstream ss(argv[i+1]); ss >> opt.coarse_relax_tol;
+      std::cout << GridLogMessage << "Set coarse IRL relaxation parameter to " << opt.coarse_relax_tol << std::endl;
+    }else if(sarg == "--mobius_scale"){
+      std::istringstream ss(argv[i+1]); ss >> opt.mobius_scale;
+      std::cout << GridLogMessage << "Set Mobius scale to " << opt.mobius_scale << std::endl;
+    }else if(sarg == "--basis_size"){
+      basis_size = std::stoi(argv[i+1]);
+      std::cout << GridLogMessage << "Set basis size to " << basis_size << std::endl;
+    }
+  }
+
+  //nbasis is a template parameter, so each supported size needs its own instantiation
+  switch(basis_size){
+  case 100:
+    runTest<100>(opt); break;
+  case 200:
+    runTest<200>(opt); break;
+  case 300:
+    runTest<300>(opt); break;
+  case 350:
+    runTest<350>(opt); break;
+  default:
+    std::cout << GridLogMessage << "Unsupported basis size " << basis_size << std::endl;
+    assert(0);
+  }
+  
+  Grid_finalize();
+}
+
--- a/tests/lanczos/Test_evec_compression.cc
+++ b/tests/lanczos/Test_evec_compression.cc
@ -0,0 +1,582 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_evec_compression.cc
+
+    Copyright (C) 2017
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+/*
+ *
+ * This test generates eigenvectors using the Lanczos algorithm then attempts to use local coherence compression
+ * to express those vectors in terms of a basis formed from a subset. This test is useful for finding the optimal
+ * blocking and basis size for performing a Local Coherence Lanczos
+ */
+#include <Grid/Grid.h>
+#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
+#include <Grid/algorithms/iterative/LocalCoherenceLanczos.h>
+
+using namespace std;
+using namespace Grid;
+
+//Read the NERSC-format gauge configuration 'config' into U, with the gauge statistics type taken from Gimpl.
+//For CPS configurations (is_cps_cfg = true) the exit-on-plaquette-mismatch check is switched off for the read, to deal with an
+//incorrect factor of 2 in the plaquette metadata, and is then set back to true (whatever its value was before the call).
+template<typename Gimpl>
+void readConfiguration(LatticeGaugeFieldD &U, const std::string &config, bool is_cps_cfg = false){
+  using GaugeStats = GaugeStatistics<Gimpl>;
+
+  if(is_cps_cfg){
+    NerscIO::exitOnReadPlaquetteMismatch() = false;
+  }
+
+  FieldMetaData meta;
+  NerscIO::readConfiguration<GaugeStats>(U, meta, config);
+
+  if(is_cps_cfg){
+    NerscIO::exitOnReadPlaquetteMismatch() = true;
+  }
+}
+
+//Lanczos parameters in CPS conventions. Serializable (so it can be read from / written to a parameter file); the accessors below translate the CPS-named fields into the quantities used by the Lanczos code
+struct CPSLanczosParams : Serializable {
+public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(CPSLanczosParams,
+				  RealD, alpha, //squared to give the Chebyshev 'beta' (hi) in getChebyParams
+				  RealD, beta, //squared to give the Chebyshev 'alpha' (lo) in getChebyParams
+				  int, ch_ord, //Chebyshev Npoly is ch_ord+1
+				  int, N_use, //returned by Nm()
+				  int, N_get, //returned by Nk()
+				  int, N_true_get, //returned by Nstop()
+				  RealD, stop_rsd,
+				  int, maxits);
+
+  //Translations from the CPS-convention fields to the Grid-convention parameters
+  ChebyParams getChebyParams() const{ //note the deliberate cross-over: CPS alpha/beta map to Grid beta/alpha, each squared
+    ChebyParams out;
+    out.alpha = beta*beta; //aka lo
+    out.beta = alpha*alpha; //aka hi
+    out.Npoly = ch_ord+1;
+    return out;
+  }
+  int Nstop() const{ return N_true_get; }
+  int Nm() const{ return N_use; }
+  int Nk() const{ return N_get; }
+};
+
+
+template<class Fobj,class CComplex,int nbasis> //Fobj: fine-field site object; CComplex: coarse-field complex type; nbasis: number of basis vectors (compile-time)
+class LocalCoherenceCompressor{ //Compresses fine eigenvectors into coarse coefficient fields over a block-orthogonalised basis, and checks the quality of the reconstruction
+public:
+  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
+  typedef Lattice<CComplex>                   CoarseScalar; // used for inner products on fine field
+  typedef Lattice<CoarseSiteVector>           CoarseField;
+  typedef Lattice<Fobj>                       FineField;
+  
+  void compress(std::vector<FineField> &basis, //out: resized to nbasis, copied from the first nbasis entries of evecs_in, then block-orthogonalised twice
+		std::vector<CoarseField> &compressed_evecs, //out: resized to evecs_in.size(); entry i is blockProject of evecs_in[i] onto basis
+		const std::vector<FineField> &evecs_in, //in: must hold strictly more than nbasis vectors (asserted below)
+		GridBase *FineGrid, //grid used when resizing 'basis'
+		GridBase *CoarseGrid){ //grid used for the inner-product scratch field and when resizing 'compressed_evecs'
+    int nevecs = evecs_in.size();
+    assert(nevecs > nbasis);
+    
+    //Construct the basis from the first nbasis input eigenvectors
+    basis.resize(nbasis, FineGrid);
+    for(int b=0;b<nbasis;b++) basis[b] = evecs_in[b];
+
+    //Block orthonormalize the basis; blockOrthogonalise is applied twice (two Gram-Schmidt passes)
+    CoarseScalar InnerProd(CoarseGrid);
+    std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
+    blockOrthogonalise(InnerProd,basis);
+    std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
+    blockOrthogonalise(InnerProd,basis);
+
+    //The coarse grid representation is the field of vectors of block inner products
+    std::cout << GridLogMessage << "Compressing eigevectors" << std::endl;
+    compressed_evecs.resize(nevecs, CoarseGrid);
+    for(int i=0;i<nevecs;i++) blockProject(compressed_evecs[i], evecs_in[i], basis);
+    std::cout << GridLogMessage << "Compression complete" << std::endl;
+  }
+
+  void uncompress(FineField &evec, const int i, const std::vector<FineField> &basis, const std::vector<CoarseField> &compressed_evecs) const{ //writes blockPromote of compressed_evecs[i] over 'basis' into evec; i is not range-checked
+    blockPromote(compressed_evecs[i],evec,basis);  
+  }
+
+  //Test uncompressed eigenvectors of Linop.HermOp to precision 'base_tolerance' for i<nbasis and 'base_tolerance*relax' for i>=nbasis
+  //Because the uncompressed evec has a lot of high mode noise (unimportant for deflation) we apply a smoother before testing.
+  //The Chebyshev used by the Lanczos should be sufficient as a smoother. NOTE: the return value is true if ANY residual exceeded its tolerance, i.e. true means FAILURE
+  bool testCompression(LinearOperatorBase<FineField> &Linop, OperatorFunction<FineField>   &smoother,
+		       const std::vector<FineField> &basis, const std::vector<CoarseField> &compressed_evecs, const std::vector<RealD> &evals, //evals[i] is read for every i < compressed_evecs.size(); basis and compressed_evecs must be non-empty
+		       const RealD base_tolerance, const RealD relax){
+    std::cout << GridLogMessage << "Testing quality of uncompressed evecs (after smoothing)" << std::endl;
+   
+    GridBase* FineGrid = basis[0].Grid();
+    GridBase* CoarseGrid = compressed_evecs[0].Grid(); //not used below
+
+    bool fail = false; //latched to true by the first residual above tolerance; the loop still processes every evec
+    FineField evec(FineGrid), Mevec(FineGrid), evec_sm(FineGrid);
+    for(int i=0;i<compressed_evecs.size();i++){
+      std::cout << GridLogMessage << "Uncompressing evec " << i << std::endl;
+      uncompress(evec, i, basis, compressed_evecs);
+
+      std::cout << GridLogMessage << "Smoothing evec " << i << std::endl;
+      smoother(Linop, evec, evec_sm);
+      
+      std::cout << GridLogMessage << "Computing residual for evec " << i << std::endl;
+      std::cout << GridLogMessage << "Linop" << std::endl;
+      Linop.HermOp(evec_sm, Mevec);
+      std::cout << GridLogMessage << "Linalg" << std::endl;
+      Mevec = Mevec - evals[i]*evec_sm; //residual vector (HermOp - eval) applied to the smoothed evec; evec_sm is not renormalised here
+
+      std::cout << GridLogMessage << "Resid" << std::endl;
+      RealD tol = base_tolerance * (i<nbasis ? 1. : relax); //vectors outside the basis get the relaxed tolerance
+      RealD res = sqrt(norm2(Mevec));
+      std::cout << GridLogMessage << "Evec idx " << i << " res " << res << " tol " << tol << std::endl;
+      if(res > tol) fail = true;
+    }
+    return fail;
+  }
+
+  //Compare uncompressed evecs to original evecs: logs sqrt(norm2(orig - uncompressed)) for each index; purely diagnostic, nothing is returned or asserted
+  void compareEvecs(const std::vector<FineField> &basis, const std::vector<CoarseField> &compressed_evecs, const std::vector<FineField> &orig_evecs){ //orig_evecs[i] is read for every i < compressed_evecs.size()
+    std::cout << GridLogMessage << "Comparing uncompressed evecs to original evecs" << std::endl;
+    
+    GridBase* FineGrid = basis[0].Grid();
+    GridBase* CoarseGrid = compressed_evecs[0].Grid(); //not used below
+
+    FineField evec(FineGrid), diff(FineGrid);
+    for(int i=0;i<compressed_evecs.size();i++){
+      std::cout << GridLogMessage << "Uncompressing evec " << i << std::endl;
+      uncompress(evec, i, basis, compressed_evecs);
+      diff = orig_evecs[i] - evec;
+      RealD res = sqrt(norm2(diff));
+      std::cout << GridLogMessage << "Evec idx " << i << " res " << res << std::endl;
+    }
+  }
+  
+};
+
+template<class Fobj,class CComplex,int nbasis> //Times blockPromote of compressed_evecs[0] into a scratch field twice, each time after re-opening CpuWrite views on all inputs, and logs both timings
+void compareBlockPromoteTimings(const std::vector<Lattice<Fobj> > &basis, const std::vector<Lattice<iVector<CComplex,nbasis > > > &compressed_evecs){ //both vectors must be non-empty: element 0 of each is accessed
+  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
+  typedef Lattice<CComplex>                   CoarseScalar; 
+  typedef Lattice<CoarseSiteVector>           CoarseField;
+  typedef Lattice<Fobj>                       FineField;
+
+  GridStopWatch timer;
+  
+  GridBase* FineGrid = basis[0].Grid();
+  GridBase* CoarseGrid = compressed_evecs[0].Grid(); //not used below
+
+  FineField v1(FineGrid), v2(FineGrid); //v2 is not used below
+
+  //Start with a cold start: open (and immediately release) a CpuWrite view on every input and on the output. NOTE(review): presumably this leaves the host copy as the only valid one so the timed call pays for the data movement - confirm against the memory manager
+  for(int i=0;i<basis.size();i++){
+    autoView( b_ , basis[i], CpuWrite);
+  }
+  for(int i=0;i<compressed_evecs.size();i++){
+    autoView( b_ , compressed_evecs[i], CpuWrite);
+  }
+  {
+    autoView( b_, v1, CpuWrite );
+  }
+
+  timer.Start();
+  blockPromote(compressed_evecs[0],v1,basis);  
+  timer.Stop();
+  std::cout << GridLogMessage << "Time for cold blockPromote v1 " << timer.Elapsed() << std::endl;
+
+  //Test to ensure it is actually doing a cold start by repeating the same view sequence and the same call; the two logged times are expected to agree
+  for(int i=0;i<basis.size();i++){
+    autoView( b_ , basis[i], CpuWrite);
+  }
+  for(int i=0;i<compressed_evecs.size();i++){
+    autoView( b_ , compressed_evecs[i], CpuWrite);
+  }
+  {
+    autoView( b_, v1, CpuWrite );
+  }
+
+  timer.Reset(); //discard the first measurement so Elapsed() reports only the repeat
+  timer.Start();
+  blockPromote(compressed_evecs[0],v1,basis);  
+  timer.Stop();
+  std::cout << GridLogMessage << "Time for cold blockPromote v1 repeat (should be the same as above) " << timer.Elapsed() << std::endl;
+}
+
+//Run-time options of the test. The default constructor installs the default value of every option.
+struct Args{
+  int Ls;
+  RealD mass;
+  RealD M5;
+  bool is_cps_cfg;
+  RealD mobius_scale; //b+c
+  
+  CPSLanczosParams fine;
+  double coarse_relax_tol;
+
+  std::vector<int> blockSize;
+  std::vector<int> GparityDirs;
+
+  bool write_fine;
+  std::string write_fine_file;
+  bool read_fine;
+  std::string read_fine_file;
+
+  int basis_size;
+  
+  //Initializers are listed in member declaration order
+  Args()
+    : Ls(12),
+      mass(0.01),
+      M5(1.8),
+      is_cps_cfg(false),
+      mobius_scale(2),
+      coarse_relax_tol(1e5),
+      blockSize(5, 2),    //five block extents, each 2
+      GparityDirs(3, 1),  //1 for each GP direction
+      write_fine(false),
+      read_fine(false),
+      basis_size(100)
+  {
+    //Default parameters of the fine-grid Lanczos
+    fine.alpha      = 2;
+    fine.beta       = 0.1;
+    fine.ch_ord     = 100;
+    fine.N_use      = 70;
+    fine.N_get      = 60;
+    fine.N_true_get = 60;
+    fine.stop_rsd   = 1e-8;
+    fine.maxits     = 10000;
+  }
+};
+    
+
+//Configure G-parity boundary conditions from GparityDirs, which holds one flag per spatial direction (the first three entries are read).
+//Side effect: sets the ConjugateGimplD directions for the gauge field, with 0 in the time direction.
+//Returns the fermion implementation parameters: twists copied from GparityDirs in space, and 1 in the time direction.
+GparityWilsonImplD::ImplParams setupGparityParams(const std::vector<int> &GparityDirs){
+  assert(Nd == 4);
+
+  //Gauge BCs: the three spatial flags followed by 0 (periodic gauge BC in time)
+  std::vector<int> gauge_dirs(GparityDirs.begin(), GparityDirs.begin() + 3);
+  gauge_dirs.push_back(0);
+  std::cout << GridLogMessage << "Gauge BCs: " << gauge_dirs << std::endl;
+  ConjugateGimplD::setDirections(gauge_dirs);
+
+  //Fermion BCs
+  GparityWilsonImplD::ImplParams fermion_params;
+  const int time_dir = Nd-1;
+  for(int mu=0;mu<time_dir;mu++){
+    fermion_params.twists[mu] = GparityDirs[mu]; //G-parity directions
+  }
+  fermion_params.twists[time_dir] = 1; //APBC in time direction
+  std::cout << GridLogMessage << "Fermion BCs: " << fermion_params.twists << std::endl;
+  return fermion_params;
+}
+
+//Build the implementation parameters used for the non-G-parity actions:
+//boundary phase +1 in the first Nd-1 directions and -1 in the last direction.
+WilsonImplD::ImplParams setupParams(){
+  WilsonImplD::ImplParams params;
+  const int last_dir = Nd-1;
+  for(int mu=0;mu<last_dir;mu++){
+    params.boundary_phases[mu] = Complex(1.0);
+  }
+  params.boundary_phases[last_dir] = Complex(-1.0);
+  return params;
+}
+
+//Run the compression test for a compile-time basis size 'nbasis':
+//  1. obtain fine-grid eigenvectors/eigenvalues of the Schur operator of 'action', either by reading them from file
+//     (args.read_fine) or by running the implicitly restarted Lanczos (optionally writing the result, args.write_fine);
+//  2. compress them onto a coarse 5d grid built from args.blockSize using a basis of the first nbasis eigenvectors;
+//  3. time blockPromote, compare uncompressed and original eigenvectors, and test the residuals of the smoothed,
+//     uncompressed eigenvectors against fine.stop_rsd (relaxed by args.coarse_relax_tol outside the basis).
+//'config' is not used here. Aborts via assert if the Lanczos does not converge enough vectors or the residual test fails.
+template<int nbasis, typename ActionType>
+void run_b(ActionType &action, const std::string &config, const Args &args){
+  //Fine grids
+  GridCartesian         * UGrid     = (GridCartesian*)action.GaugeGrid();
+  GridRedBlackCartesian * UrbGrid   = (GridRedBlackCartesian*)action.GaugeRedBlackGrid();
+  GridCartesian         * FGrid     = (GridCartesian*)action.FermionGrid();
+  GridRedBlackCartesian * FrbGrid   = (GridRedBlackCartesian*)action.FermionRedBlackGrid();
+
+  //Setup the coarse grids; every fine extent must be an exact multiple of the corresponding block extent
+  auto fineLatt     = GridDefaultLatt();
+  Coordinate coarseLatt(4);
+  for (int d=0;d<4;d++){
+    coarseLatt[d] = fineLatt[d]/args.blockSize[d];    assert(coarseLatt[d]*args.blockSize[d]==fineLatt[d]);
+  }
+
+  std::cout << GridLogMessage<< " 5d coarse lattice is ";
+  for (int i=0;i<4;i++){
+    std::cout << coarseLatt[i]<<"x";
+  } 
+  int cLs = args.Ls/args.blockSize[4]; assert(cLs*args.blockSize[4]==args.Ls);
+  std::cout << cLs<<std::endl;
+  
+  GridCartesian         * CoarseGrid4    = SpaceTimeGrid::makeFourDimGrid(coarseLatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * CoarseGrid4rb  = SpaceTimeGrid::makeFourDimRedBlackGrid(CoarseGrid4);
+  GridCartesian         * CoarseGrid5    = SpaceTimeGrid::makeFiveDimGrid(cLs,CoarseGrid4);
+  typedef vTComplex CComplex; 
+  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
+  typedef Lattice<CComplex>                   CoarseScalar;
+  typedef Lattice<CoarseSiteVector>           CoarseField;
+
+  typedef typename ActionType::FermionField FermionField; 
+  
+  SchurDiagTwoOperator<ActionType,FermionField> SchurOp(action);
+
+  typedef typename ActionType::SiteSpinor SiteSpinor;
+
+  const CPSLanczosParams &fine = args.fine;
+  
+  //Do the fine Lanczos
+  std::vector<RealD> evals;
+  std::vector<FermionField> evecs;
+
+  if(args.read_fine){
+    //Read exactly N_true_get eigenvalues/eigenvectors from "<stub>_evals.xml" / "<stub>_evecs.scidac"
+    evals.resize(fine.N_true_get);
+    evecs.resize(fine.N_true_get, FrbGrid);
+
+    std::string evals_file = args.read_fine_file + "_evals.xml";
+    std::string evecs_file = args.read_fine_file + "_evecs.scidac";
+    
+    std::cout << GridLogIRL<< "Reading evals from "<<evals_file<<std::endl;
+    XmlReader RDx(evals_file);
+    read(RDx,"evals",evals);
+    
+    assert(evals.size()==fine.N_true_get);
+    
+    std::cout << GridLogIRL<< "Reading evecs from "<<evecs_file<<std::endl;
+    emptyUserRecord record;
+    Grid::ScidacReader RD ;
+    RD.open(evecs_file);
+    for(int k=0;k<fine.N_true_get;k++) {
+      evecs[k].Checkerboard()=Odd;
+      RD.readScidacFieldRecord(evecs[k],record);
+      
+    }
+    RD.close();
+  }else{ 
+    int Nstop = fine.Nstop(); //==N_true_get
+    int Nm = fine.Nm();
+    int Nk = fine.Nk();
+    RealD resid = fine.stop_rsd;
+    int MaxIt = fine.maxits;
+    
+    assert(nbasis<=Nm);    
+    Chebyshev<FermionField>      Cheby(fine.getChebyParams());
+    FunctionHermOp<FermionField> ChebyOp(Cheby,SchurOp);
+    PlainHermOp<FermionField>    Op(SchurOp);
+
+    evals.resize(Nm);
+    evecs.resize(Nm,FrbGrid);
+    
+    ImplicitlyRestartedLanczos<FermionField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,0,0);
+
+    FermionField src(FrbGrid); 
+    typedef typename FermionField::scalar_type Scalar;
+    src=Scalar(1.0); 
+    src.Checkerboard() = Odd;
+
+    int Nconv;
+    IRL.calc(evals, evecs,src,Nconv,false);
+    if(Nconv < Nstop) assert(0 && "Fine lanczos failed to converge the required number of evecs"); //algorithm doesn't consider this a failure
+    if(Nconv > Nstop){
+      //Yes this potentially throws away some evecs but it is better than having a random number of evecs between Nstop and Nm!
+      evals.resize(Nstop);
+      evecs.resize(Nstop, FrbGrid);
+    }
+    
+    if(args.write_fine){
+      std::string evals_file = args.write_fine_file + "_evals.xml";
+      std::string evecs_file = args.write_fine_file + "_evecs.scidac";
+
+      std::cout << GridLogIRL<< "Writing evecs to "<<evecs_file<<std::endl;
+
+      emptyUserRecord record;
+      Grid::ScidacWriter WR(FrbGrid->IsBoss());
+      WR.open(evecs_file);
+      for(int k=0;k<evecs.size();k++) {
+	WR.writeScidacFieldRecord(evecs[k],record);
+      }
+      WR.close();
+
+      std::cout << GridLogIRL<< "Writing evals to "<<evals_file<<std::endl;
+      
+      XmlWriter WRx(evals_file);
+      write(WRx,"evals",evals);
+    }    
+  }
+    
+  //Do the compression
+  LocalCoherenceCompressor<SiteSpinor,vTComplex,nbasis> compressor;
+  std::vector<FermionField> basis(nbasis,FrbGrid);
+  std::vector<CoarseField> compressed_evecs(evecs.size(),CoarseGrid5);
+  
+  compressor.compress(basis, compressed_evecs, evecs, FrbGrid, CoarseGrid5);
+
+  compareBlockPromoteTimings(basis, compressed_evecs);
+
+  //Compare uncompressed and original evecs
+  compressor.compareEvecs(basis, compressed_evecs, evecs);
+  
+  //Create the smoother
+  Chebyshev<FermionField> smoother(fine.getChebyParams());
+  
+  //Test the quality of the uncompressed evecs.
+  //testCompression returns true when at least one residual EXCEEDED its tolerance, so the test passes when it returns false.
+  //The call is made outside of the assert so that the check (and its logging) still runs when asserts are compiled out.
+  const bool compression_failed = compressor.testCompression(SchurOp, smoother, basis, compressed_evecs, evals, fine.stop_rsd, args.coarse_relax_tol);
+  if(compression_failed){
+    std::cout << GridLogMessage << "Uncompressed evecs failed the residual test" << std::endl;
+  }
+  assert( !compression_failed && "Uncompressed evecs failed the residual test" );
+}
+
+//Dispatch the run-time basis size args.basis_size onto the compile-time template parameter of run_b.
+//Supported values are 50,100,150,200,250,300,350,400; any other value is logged and then trips an assert.
+template<typename ActionType>
+void run(ActionType &action, const std::string &config, const Args &args){
+  switch(args.basis_size){
+  case 50:
+    return run_b<50>(action,config,args);
+  case 100:
+    return run_b<100>(action,config,args);
+  case 150:
+    return run_b<150>(action,config,args);
+  case 200:
+    return run_b<200>(action,config,args);
+  case 250:
+    return run_b<250>(action,config,args);
+  case 300:
+    return run_b<300>(action,config,args);
+  case 350:
+    return run_b<350>(action,config,args);
+  case 400:
+    return run_b<400>(action,config,args);
+  default:
+    //Log before asserting so the reason is visible even when asserts are compiled out
+    std::cout << GridLogMessage << "Unsupported basis size " << args.basis_size << std::endl;
+    assert(0 && "Unsupported basis size: allowed values are 50,100,150,200,250,300,350,400");
+  }
+}
+
+
+
+
+//Note:  because we rely upon physical properties we must use a "real" gauge configuration
+//Entry point. Usage: <exe> <config file> <gparity dirs> <options>
+//Parses the options into an Args instance, builds the fine grids, reads the gauge configuration and runs the
+//compression test with the selected action (DWF or Mobius, with or without G-parity boundary conditions).
+//Returns 1 on a usage error (too few arguments, an option missing its value, or an unknown action).
+int main (int argc, char ** argv) {
+  Grid_init(&argc,&argv);
+  GridLogIRL.TimingMode(1);
+
+  if(argc < 3){
+    std::cout << GridLogMessage << "Usage: <exe> <config file> <gparity dirs> <options>" << std::endl;
+    std::cout << GridLogMessage << "<gparity dirs> should have the format a.b.c where a,b,c are 0,1 depending on whether there are G-parity BCs in that direction" << std::endl;
+    std::cout << GridLogMessage << "Options:" << std::endl;
+    std::cout << GridLogMessage << "--Ls <value> : Set Ls (default 12)" << std::endl;
+    std::cout << GridLogMessage << "--mass <value> : Set the mass (default 0.01)" << std::endl;
+    std::cout << GridLogMessage << "--block <value> : Set the block size. Format should be a.b.c.d.e where a-e are the block extents  (default 2.2.2.2.2)" << std::endl;
+    std::cout << GridLogMessage << "--is_cps_cfg : Indicate that the configuration was generated with CPS where until recently the stored plaquette was wrong by a factor of 2" << std::endl;
+    std::cout << GridLogMessage << "--write_irl_templ: Write a template for the parameters file of the Lanczos to \"irl_templ.xml\"" << std::endl;
+    std::cout << GridLogMessage << "--read_irl_fine <filename>: Real the parameters file for the fine Lanczos" << std::endl;
+    std::cout << GridLogMessage << "--write_fine <filename stub>: Write fine evecs/evals to filename starting with the stub" << std::endl;
+    std::cout << GridLogMessage << "--read_fine <filename stub>: Read fine evecs/evals from filename starting with the stub" << std::endl;    
+    std::cout << GridLogMessage << "--coarse_relax_tol : Set the relaxation parameter for evaluating the residual of the reconstructed eigenvectors outside of the basis (default 1e5)" << std::endl;
+    std::cout << GridLogMessage << "--action : Set the action from 'DWF', 'Mobius'  (default Mobius)" << std::endl;
+    std::cout << GridLogMessage << "--mobius_scale : Set the Mobius scale b+c (default 2)" << std::endl;
+    std::cout << GridLogMessage << "--basis_size : Set the basis size from 50,100,150,200,250,300,350,400 (default 100)" << std::endl;
+
+    Grid_finalize();
+    return 1;
+  }
+  std::string config = argv[1];
+
+  Args args;
+  GridCmdOptionIntVector(argv[2], args.GparityDirs);
+  assert(args.GparityDirs.size() == 3);
+
+  std::string action_s = "Mobius"; 
+  
+  for(int i=3;i<argc;i++){
+    std::string sarg = argv[i];
+
+    //Options that consume the following argument. Without this guard a trailing option would read argv[argc],
+    //which is a null pointer, and pass it to std::stoi / std::string.
+    const bool takes_value =
+      sarg == "--Ls" || sarg == "--mass" || sarg == "--block" || sarg == "--read_irl_fine" ||
+      sarg == "--write_fine" || sarg == "--read_fine" || sarg == "--coarse_relax_tol" ||
+      sarg == "--action" || sarg == "--mobius_scale" || sarg == "--basis_size";
+    if(takes_value && i+1 >= argc){
+      std::cout << GridLogMessage << "Option " << sarg << " requires a value" << std::endl;
+      Grid_finalize();
+      return 1;
+    }
+
+    if(sarg == "--Ls"){
+      args.Ls = std::stoi(argv[i+1]);
+      std::cout << GridLogMessage << "Set Ls to " << args.Ls << std::endl;
+    }else if(sarg == "--mass"){
+      std::istringstream ss(argv[i+1]); ss >> args.mass;
+      std::cout << GridLogMessage << "Set quark mass to " << args.mass << std::endl;
+    }else if(sarg == "--block"){
+      GridCmdOptionIntVector(argv[i+1], args.blockSize);
+      assert(args.blockSize.size() == 5);
+      std::cout << GridLogMessage << "Set block size to ";
+      for(int q=0;q<5;q++) std::cout << args.blockSize[q] << " ";
+      std::cout << std::endl;      
+    }else if(sarg == "--is_cps_cfg"){
+      args.is_cps_cfg = true;
+    }else if(sarg == "--write_irl_templ"){
+      //Write the current fine Lanczos parameters as a template and exit without running the test
+      XmlWriter writer("irl_templ.xml");
+      write(writer,"Params",args.fine);
+      Grid_finalize();
+      return 0;
+    }else if(sarg == "--read_irl_fine"){
+      std::cout << GridLogMessage << "Reading fine IRL params from " << argv[i+1] << std::endl;
+      XmlReader reader(argv[i+1]);
+      read(reader, "Params", args.fine);
+    }else if(sarg == "--write_fine"){
+      args.write_fine = true;
+      args.write_fine_file = argv[i+1];
+    }else if(sarg == "--read_fine"){
+      args.read_fine = true;
+      args.read_fine_file = argv[i+1];
+    }else if(sarg == "--coarse_relax_tol"){
+      std::istringstream ss(argv[i+1]); ss >> args.coarse_relax_tol;
+      std::cout << GridLogMessage << "Set coarse IRL relaxation parameter to " << args.coarse_relax_tol << std::endl;
+    }else if(sarg == "--action"){
+      action_s = argv[i+1];
+      std::cout << "Action set to " << action_s << std::endl;
+    }else if(sarg == "--mobius_scale"){
+      std::istringstream ss(argv[i+1]); ss >> args.mobius_scale;
+      std::cout << GridLogMessage << "Set Mobius scale to " << args.mobius_scale << std::endl;
+    }else if(sarg == "--basis_size"){
+      args.basis_size = std::stoi(argv[i+1]);
+      std::cout << GridLogMessage << "Set basis size to " << args.basis_size << std::endl;
+    }
+  }
+
+  //Reject unknown actions up front; otherwise neither branch below would run and the program would exit successfully
+  //without having tested anything
+  if(action_s != "DWF" && action_s != "Mobius"){
+    std::cout << GridLogMessage << "Unknown action '" << action_s << "': allowed values are 'DWF', 'Mobius'" << std::endl;
+    Grid_finalize();
+    return 1;
+  }
+  
+  //Fine grids
+  GridCartesian         * UGrid     = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),  GridDefaultSimd(Nd,vComplex::Nsimd()),   GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid   = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid     = SpaceTimeGrid::makeFiveDimGrid(args.Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(args.Ls,UGrid);
+
+  LatticeGaugeField Umu(UGrid);  
+  
+  //G-parity code paths are used if any direction has G-parity BCs
+  bool is_gparity = false;
+  for(auto g : args.GparityDirs) if(g) is_gparity = true;
+
+  double bmc =  1.;      
+  double b = (args.mobius_scale + bmc)/2.;  // b = 1/2 [ (b+c) + (b-c) ]
+  double c = (args.mobius_scale - bmc)/2.;  // c = 1/2 [ (b+c) - (b-c) ]
+    
+  if(is_gparity){
+    GparityWilsonImplD::ImplParams Params = setupGparityParams(args.GparityDirs);
+    readConfiguration<ConjugateGimplD>(Umu, config, args.is_cps_cfg);   //Read the gauge field
+    
+    if(action_s == "DWF"){    
+      GparityDomainWallFermionD action(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, args.mass, args.M5, Params);
+      run(action, config, args);
+    }else if(action_s == "Mobius"){
+      GparityMobiusFermionD action(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, args.mass, args.M5, b, c, Params);
+      run(action, config, args);	    
+    }      
+  }else{
+    WilsonImplD::ImplParams Params = setupParams();
+    readConfiguration<PeriodicGimplD>(Umu, config, args.is_cps_cfg);   //Read the gauge field
+    
+    if(action_s == "DWF"){    
+      DomainWallFermionD action(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, args.mass, args.M5, Params);
+      run(action, config, args);
+    }else if(action_s == "Mobius"){
+      MobiusFermionD action(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, args.mass, args.M5, b, c, Params);
+      run(action, config, args);	    
+    }
+  } 
+  
+  Grid_finalize();
+}
Author	SHA1	Message	Date
Peter Boyle	e7d9b75fdd	Warning fixes	2022-08-31 19:01:14 -04:00
Peter Boyle	3d0e3ec363	Tracing	2022-08-31 18:31:46 -04:00
Peter Boyle	3c1c51f9aa	Merge branch 'feature/dirichlet-gparity' into feature/dirichlet	2022-08-31 18:25:34 -04:00
Peter Boyle	8cc3c522c3	Merge pull request #409 from giltirn/feature/dirichlet-gparity-stage Import round 5	2022-08-31 18:22:50 -04:00
Peter Boyle	5c87342108	Used in g-2 sign off	2022-08-31 17:35:32 -04:00
Peter Boyle	66177bfbe2	Used in g-2 sign off	2022-08-31 17:35:07 -04:00
Peter Boyle	5205e68963	RocTX, NVTX, text based self profiling	2022-08-31 17:34:09 -04:00
Peter Boyle	cd5cf6d614	Tracing replaces self timing hooks	2022-08-31 17:33:41 -04:00
Peter Boyle	5abb19eab0	Remove self timing	2022-08-31 17:32:49 -04:00
Peter Boyle	06d7b88c78	Force reporting improved	2022-08-31 17:32:21 -04:00
Peter Boyle	cf72799735	Better action naming	2022-08-31 17:24:11 -04:00
Peter Boyle	cdb8fcc269	Width=4 support. This is too broad; hit it on physical point run. Need to change strategy, I think.	2022-08-31 17:21:33 -04:00
Peter Boyle	b4f4130901	Defer SMP node links until after interior. Allows for DMA overlapping compute	2022-08-31 17:20:21 -04:00
Peter Boyle	bb049847d5	Tracing replaces self timing	2022-08-31 17:19:02 -04:00
Peter Boyle	fd33c835dd	Feynman rule fix and tracing replaces self timing	2022-08-31 17:18:17 -04:00
Peter Boyle	21371a7e5b	Tracing replaces self timing	2022-08-31 17:16:05 -04:00
Peter Boyle	abfaa00d3e	Tracing replaces self timing	2022-08-31 17:15:24 -04:00
Peter Boyle	efee33c55d	Tracing replaces self timing	2022-08-31 17:14:57 -04:00
Peter Boyle	db0fe6ddbb	Tracing replaces self timinng	2022-08-31 17:14:14 -04:00
Peter Boyle	8a9e647120	Tracing replaces self timing	2022-08-31 17:13:44 -04:00
Peter Boyle	e6dcb821ad	Tracing replaces self timing	2022-08-31 17:12:31 -04:00
Peter Boyle	9bff188f02	Tracing replaces self timing	2022-08-31 17:12:05 -04:00
Peter Boyle	111b30ca1d	Tracing replaces self timing	2022-08-31 17:11:48 -04:00
Peter Boyle	24182ca8bf	HIP allows conserved currents. Tracing replaces self timeing	2022-08-31 17:11:18 -04:00
Peter Boyle	ee2d7369b3	Tracing replaces self timing	2022-08-31 17:10:45 -04:00
Peter Boyle	7c686d29c9	Tracing replaces self timing	2022-08-31 17:10:17 -04:00
Peter Boyle	e8a0a1e75d	Tracing replaces self timing hooks	2022-08-31 17:09:47 -04:00
Peter Boyle	730be89abf	Remove timing hooks as tracing replaces	2022-08-31 17:08:44 -04:00
Peter Boyle	f991ad7d5c	Remove timing hooks as tracing replaces	2022-08-31 17:08:18 -04:00
Peter Boyle	b3f33f82f7	Decrease self timing hooks, use nvtx / roctx type tracing hooks instead	2022-08-31 17:06:47 -04:00
Peter Boyle	a34a6e059f	Logging improvement. Sinitial will be used to improve RHMC terms	2022-08-31 17:06:08 -04:00
Peter Boyle	1333319941	Tracing	2022-08-31 17:00:25 -04:00
Peter Boyle	9295ed8d20	Print full memory range	2022-08-31 16:59:51 -04:00
Peter Boyle	19cc7653fb	Tracing	2022-08-31 16:57:51 -04:00
Peter Boyle	5752538661	Tracing	2022-08-31 16:57:32 -04:00
Peter Boyle	ca40a1b00b	Tracing	2022-08-31 16:54:55 -04:00
Peter Boyle	659fac9dfb	Tracing hook	2022-08-31 16:54:25 -04:00
Peter Boyle	4dc3d6fce0	Buy into Nvidia/Rocm etc... tracing.	2022-08-31 16:53:19 -04:00
Christopher Kelly	33e4a0caee	Imported changes from feature/gparity_HMC branch: Rework of WilsonFlow class Fixed logic error in smear method where the step index was initialized to 1 rather than 0, resulting in the logged output value of tau being too large by epsilon Previously smear_adaptive would maintain the current value of tau as a class member variable whereas smear would compute it separately; now both methods maintain the current value internally and it is updated by the evolve_step routines. Both evolve methods are now const. smear_adaptive now also maintains the current value of epsilon internally, allowing it to be a const method and also allowing the same class instance to be reused without needing to be reset Replaced the fixed evaluation of the plaquette energy density and plaquette topological charge during the smearing with a highly flexible general strategy where the user can add arbitrary measurements as functional objects that are evaluated at an arbitrary frequency By default the same plaquette-based measurements are performed, but additional example functions are provided where the smearing is performed with different choices of measurement that are returned as an array for further processing Added a method to compute the energy density using the Cloverleaf approach which has smaller discretization errors Added a new tensor utility operation, copyLane, which allows for the copying of a single SIMD lane between two instances of the same tensor type but potentially different precisions To LocalCoherenceLanczos, added the option to compute the high/low eval of the fine operator on every restart to aid in tuning the Chebyshev Added Test_field_array_io which demonstrates and tests a single-file write of an arbitrary array of fields Added Test_evec_compression which generates evecs using Lanczos and attempts to compress them using the local coherence technique Added Test_compressed_lanczos_gparity which demonstrates the local coherence Lanczos for 
G-parity BCs Added HMC main programs for the 40ID and 48ID G-parity lattices	2022-07-01 14:12:12 -04:00