Warning fixes

Tracing
Merge branch 'feature/dirichlet-gparity' into feature/dirichlet
2025-06-23 18:22:02 +01:00 · 2022-08-31 19:01:14 -04:00 · 2022-08-31 18:31:46 -04:00 · 2022-08-31 18:25:34 -04:00 · 2022-08-31 17:35:32 -04:00 · 2022-08-31 17:35:07 -04:00
59 changed files with 1817 additions and 903 deletions
--- a/Grid/GridCore.h
+++ b/Grid/GridCore.h
@ -44,7 +44,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridStd.h>
 #include <Grid/threads/Pragmas.h>
 #include <Grid/perfmon/Timer.h>
-#include <Grid/perfmon/PerfCount.h>
+#include <Grid/perfmon/Tracing.h>
+//#include <Grid/perfmon/PerfCount.h>
 #include <Grid/util/Util.h>
 #include <Grid/log/Log.h>
 #include <Grid/allocator/Allocator.h>
--- a/Grid/algorithms/iterative/ConjugateGradient.h
+++ b/Grid/algorithms/iterative/ConjugateGradient.h
@ -58,6 +58,7 @@ public:

  void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {

+    GRID_TRACE("ConjugateGradient");
    psi.Checkerboard() = src.Checkerboard();

    conformable(psi, src);
@ -117,6 +118,7 @@ public:
    GridStopWatch MatrixTimer;
    GridStopWatch SolverTimer;

+    RealD usecs = -usecond();
    SolverTimer.Start();
    int k;
    for (k = 1; k <= MaxIterations; k++) {
@ -166,14 +168,16 @@ public:

      // Stopping condition
      if (cp <= rsq) {
+	usecs +=usecond();
        SolverTimer.Stop();
        Linop.HermOpAndNorm(psi, mmp, d, qq);
        p = mmp - src;
-
+	GridBase *grid = src.Grid();
+	RealD DwfFlops = (1452. )*grid->gSites()*4*k
+   	               + (8+4+8+4+4)*12*grid->gSites()*k; // CG linear algebra
        RealD srcnorm = std::sqrt(norm2(src));
        RealD resnorm = std::sqrt(norm2(p));
        RealD true_residual = resnorm / srcnorm;
-
        std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k 
 		  << "\tComputed residual " << std::sqrt(cp / ssq)
 		  << "\tTrue residual " << true_residual
@ -187,6 +191,8 @@ public:
 	std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
 	std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;

+	std::cout << GridLogMessage << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
+
        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);

 	IterationsToComplete = k;	
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
@ -84,6 +84,7 @@ public:

  void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi)
  {
+    GRID_TRACE("ConjugateGradientMultiShift");
  
    GridBase *grid = src.Grid();
  
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
@ -127,6 +127,7 @@ public:

  void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
  { 
+    GRID_TRACE("ConjugateGradientMultiShiftMixedPrec");
    GridBase *DoublePrecGrid = src_d.Grid();

    ////////////////////////////////////////////////////////////////////////
--- a/Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h
+++ b/Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h
@ -73,6 +73,7 @@ public:
  }
    
  void operator()(const FieldD &src, FieldD &psi) {
+    GRID_TRACE("ConjugateGradientReliableUpdate");
    LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
    bool using_fallback = false;
      
--- a/Grid/allocator/MemoryManager.cc
+++ b/Grid/allocator/MemoryManager.cc
@ -40,7 +40,7 @@ void MemoryManager::PrintBytes(void)
 //////////////////////////////////////////////////////////////////////
 MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
 int MemoryManager::Victim[MemoryManager::NallocType];
-int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 };
+int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 8, 16, 8, 16 };
 uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
 //////////////////////////////////////////////////////////////////////
 // Actual allocation and deallocation utils
--- a/Grid/allocator/MemoryManagerCache.cc
+++ b/Grid/allocator/MemoryManagerCache.cc
@ -3,8 +3,14 @@

 #warning "Using explicit device memory copies"
 NAMESPACE_BEGIN(Grid);
-//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
-#define dprintf(...)
+
+#define MAXLINE 512
+static char print_buffer [ MAXLINE ];
+
+// mprintf: format the arguments into the file-static print_buffer (snprintf truncates
+// at MAXLINE) and write the text to the GridLogMemory stream.
+// The expansion is braced so that the macro behaves as ONE statement; previously an
+// unbraced "if (x) mprintf(...);" guarded only the snprintf and always printed the buffer.
+// NOTE(review): print_buffer is a single shared buffer - concurrent callers would race; confirm
+// these messages are only issued from one thread.
+#define mprintf(...) { snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer; }
+//#define dprintf(...) printf (__VA_ARGS__ ); fflush(stdout);
+// dprintf: verbose debug output, compiled out (expands to nothing).
+#define dprintf(...) 
+


 ////////////////////////////////////////////////////////////
@ -104,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
  
-   dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
+  mprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
@ -112,7 +118,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
    AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
    DeviceBytes   -=AccCache.bytes;
    LRUremove(AccCache);
-    dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
+    dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
  }
  uint64_t CpuPtr = AccCache.CpuPtr;
  EntryErase(CpuPtr);
@ -126,7 +132,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
  
-  dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
+  mprintf("MemoryManager: Evict(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  if(AccCache.state==AccDirty) {
@ -137,7 +143,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
    AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
    DeviceBytes   -=AccCache.bytes;
    LRUremove(AccCache);
-    dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
+    dprintf("MemoryManager: Free(%lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
  }
  uint64_t CpuPtr = AccCache.CpuPtr;
  EntryErase(CpuPtr);
@ -150,7 +156,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
  assert(AccCache.AccPtr!=(uint64_t)NULL);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
  acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
-  dprintf("MemoryManager: Flush  %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: Flush  %lx -> %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  DeviceToHostBytes+=AccCache.bytes;
  DeviceToHostXfer++;
  AccCache.state=Consistent;
@ -165,7 +171,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
    AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
    DeviceBytes+=AccCache.bytes;
  }
-  dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: Clone %lx <- %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
  HostToDeviceBytes+=AccCache.bytes;
  HostToDeviceXfer++;
@ -241,7 +247,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
  assert(AccCache.cpuLock==0);  // Programming error

  if(AccCache.state!=Empty) {
-    dprintf("ViewOpen found entry %llx %llx : %lld %lld\n",
+    dprintf("ViewOpen found entry %lx %lx : %ld %ld\n",
 		    (uint64_t)AccCache.CpuPtr,
 		    (uint64_t)CpuPtr,
 		    (uint64_t)AccCache.bytes,
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@ -107,6 +107,7 @@ public:
  ////////////////////////////////////////////////////////////////////////////////
  static int  RankWorld(void) ;
  static void BroadcastWorld(int root,void* data, int bytes);
+  static void BarrierWorld(void);
  
  ////////////////////////////////////////////////////////////
  // Reduction
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@ -396,17 +396,17 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
    }
  }
  
-  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
-    this->StencilSendToRecvFromComplete(list,dir);
-    list.resize(0);
-  }
-
+  /*  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
+   *    this->StencilSendToRecvFromComplete(list,dir);
+   *    list.resize(0);
+   *  }
+   */
  return off_node_bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
-  //   std::cout << "Copy Synchronised\n"<<std::endl;
  acceleratorCopySynchronise();
+  StencilBarrier();// Synch shared memory on a single nodes

  int nreq=list.size();

@ -443,6 +443,10 @@ int CartesianCommunicator::RankWorld(void){
  MPI_Comm_rank(communicator_world,&r);
  return r;
 }
+// Barrier across communicator_world; asserts that MPI_Barrier reported success (0).
+void CartesianCommunicator::BarrierWorld(void){
+  const int status = MPI_Barrier(communicator_world);
+  assert(status==0);
+}
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
  int ierr= MPI_Bcast(data,
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@ -104,6 +104,7 @@ int  CartesianCommunicator::RankWorld(void){return 0;}
 void CartesianCommunicator::Barrier(void){}
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
+// World barrier is a no-op in this communicator implementation, like the empty
+// Barrier()/Broadcast()/BroadcastWorld() stubs next to it.
+void CartesianCommunicator::BarrierWorld(void) { }
 int  CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) {  return 0;}
 void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){  coor = _processor_coor; }
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@ -523,7 +523,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  }
  if ( WorldRank == 0 ){
    std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
-	      << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
+	      << "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
  }
  SharedMemoryZero(ShmCommBuf,bytes);
  std::cout<< "Setting up IPC"<<std::endl;
--- a/Grid/lattice/Lattice_arith.h
+++ b/Grid/lattice/Lattice_arith.h
@ -36,6 +36,7 @@ NAMESPACE_BEGIN(Grid);
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+  GRID_TRACE("mult");
  ret.Checkerboard() = lhs.Checkerboard();
  autoView( ret_v , ret, AcceleratorWrite);
  autoView( lhs_v , lhs, AcceleratorRead);
@ -53,6 +54,7 @@ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  
 template<class obj1,class obj2,class obj3> inline
 void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+  GRID_TRACE("mac");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
@ -70,6 +72,7 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  
 template<class obj1,class obj2,class obj3> inline
 void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+  GRID_TRACE("sub");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
@ -86,6 +89,7 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
 }
 template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+  GRID_TRACE("add");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
@ -106,6 +110,7 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+  GRID_TRACE("mult");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(lhs,ret);
  autoView( ret_v , ret, AcceleratorWrite);
@ -119,6 +124,7 @@ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  
 template<class obj1,class obj2,class obj3> inline
 void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+  GRID_TRACE("mac");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,lhs);
  autoView( ret_v , ret, AcceleratorWrite);
@ -133,6 +139,7 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  
 template<class obj1,class obj2,class obj3> inline
 void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+  GRID_TRACE("sub");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,lhs);
  autoView( ret_v , ret, AcceleratorWrite);
@ -146,6 +153,7 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
 }
 template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+  GRID_TRACE("add");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(lhs,ret);
  autoView( ret_v , ret, AcceleratorWrite);
@ -163,6 +171,7 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+  GRID_TRACE("mult");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@ -177,6 +186,7 @@ void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  
 template<class obj1,class obj2,class obj3> inline
 void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+  GRID_TRACE("mac");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@ -191,6 +201,7 @@ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  
 template<class obj1,class obj2,class obj3> inline
 void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+  GRID_TRACE("sub");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@ -204,6 +215,7 @@ void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
 }
 template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+  GRID_TRACE("add");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@ -218,6 +230,7 @@ void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  
 template<class sobj,class vobj> inline
 void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
+  GRID_TRACE("axpy");
  ret.Checkerboard() = x.Checkerboard();
  conformable(ret,x);
  conformable(x,y);
@ -231,6 +244,7 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
 }
 template<class sobj,class vobj> inline
 void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
+  GRID_TRACE("axpby");
  ret.Checkerboard() = x.Checkerboard();
  conformable(ret,x);
  conformable(x,y);
@ -246,11 +260,13 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
 template<class sobj,class vobj> inline
 RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
 {
+  GRID_TRACE("axpy_norm");
    return axpy_norm_fast(ret,a,x,y);
 }
 template<class sobj,class vobj> inline
 RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
 {
+  GRID_TRACE("axpby_norm");
    return axpby_norm_fast(ret,a,b,x,y);
 }

--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@ -117,6 +117,7 @@ public:
  ////////////////////////////////////////////////////////////////////////////////
  template <typename Op, typename T1> inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
  {
+    GRID_TRACE("ExpressionTemplateEval");
    GridBase *egrid(nullptr);
    GridFromExpression(egrid,expr);
    assert(egrid!=nullptr);
@ -140,6 +141,7 @@ public:
  }
  template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
  {
+    GRID_TRACE("ExpressionTemplateEval");
    GridBase *egrid(nullptr);
    GridFromExpression(egrid,expr);
    assert(egrid!=nullptr);
@ -163,6 +165,7 @@ public:
  }
  template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
  {
+    GRID_TRACE("ExpressionTemplateEval");
    GridBase *egrid(nullptr);
    GridFromExpression(egrid,expr);
    assert(egrid!=nullptr);
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@ -488,6 +488,14 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  int words = fd*sizeof(sobj)/sizeof(scalar_type);
  grid->GlobalSumVector(ptr, words);
 }
+// Convenience overload of sliceSum: forwards to the three-argument version with a
+// locally owned vector and hands that vector back by value.
+template<class vobj> inline
+std::vector<typename vobj::scalar_object> 
+sliceSum(const Lattice<vobj> &Data,int orthogdim)
+{
+  typedef typename vobj::scalar_object sobj;
+  std::vector<sobj> sums;
+  sliceSum(Data,sums,orthogdim);
+  return sums;
+}

 template<class vobj>
 static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) 
--- a/Grid/log/Log.cc
+++ b/Grid/log/Log.cc
@ -68,6 +68,7 @@ GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
 GridLogger GridLogMemory (1, "Memory", GridLogColours, "NORMAL");
 GridLogger GridLogDebug  (1, "Debug", GridLogColours, "PURPLE");
 GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
+GridLogger GridLogDslash     (1, "Dslash", GridLogColours, "BLUE");
 GridLogger GridLogIterative  (1, "Iterative", GridLogColours, "BLUE");
 GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
 GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE");
@ -80,6 +81,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
  GridLogIterative.Active(0);
  GridLogDebug.Active(0);
  GridLogPerformance.Active(0);
+  GridLogDslash.Active(0);
  GridLogIntegrator.Active(1);
  GridLogColours.Active(0);
  GridLogHMC.Active(1);
@ -91,6 +93,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
    if (logstreams[i] == std::string("Iterative"))   GridLogIterative.Active(1);
    if (logstreams[i] == std::string("Debug"))       GridLogDebug.Active(1);
    if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
+    if (logstreams[i] == std::string("Dslash"))      GridLogDslash.Active(1);
    if (logstreams[i] == std::string("NoIntegrator"))  GridLogIntegrator.Active(0);
    if (logstreams[i] == std::string("NoHMC"))         GridLogHMC.Active(0);
    if (logstreams[i] == std::string("Colours"))     GridLogColours.Active(1);
--- a/Grid/log/Log.h
+++ b/Grid/log/Log.h
@ -138,7 +138,8 @@ public:
        stream << std::setw(log.topWidth);
      }
      stream << log.topName << log.background()<< " : ";
-      stream << log.colour() <<  std::left;
+      //      stream << log.colour() <<  std::left;
+      stream <<  std::left;
      if (log.chanWidth > 0)
      {
        stream << std::setw(log.chanWidth);
@ -153,9 +154,9 @@ public:
 	stream << log.evidence()
 	       << now	       << log.background() << " : " ;
      }
-      stream << log.colour();
+      //      stream << log.colour();
+      stream <<  std::right;
      stream.flags(f);
-
      return stream;
    } else { 
      return devnull;
@ -180,6 +181,7 @@ extern GridLogger GridLogWarning;
 extern GridLogger GridLogMessage;
 extern GridLogger GridLogDebug  ;
 extern GridLogger GridLogPerformance;
+extern GridLogger GridLogDslash;
 extern GridLogger GridLogIterative  ;
 extern GridLogger GridLogIntegrator  ;
 extern GridLogger GridLogHMC;
--- a/Grid/perfmon/PerfCount.cc
+++ b/Grid/perfmon/PerfCount.cc
@ -27,10 +27,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /*  END LEGAL */

 #include <Grid/GridCore.h>
-#include <Grid/perfmon/PerfCount.h>

+#include <Grid/perfmon/Timer.h>
+#include <Grid/perfmon/PerfCount.h>
 NAMESPACE_BEGIN(Grid);

+GridTimePoint theProgramStart = GridClock::now();
+
 #define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
 #define RawConfig(A,B) (A<<8|B)
 const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = {
--- a/Grid/perfmon/Timer.h
+++ b/Grid/perfmon/Timer.h
@ -35,17 +35,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid)

-// Dress the output; use std::chrono
-// C++11 time facilities better?
-inline double usecond(void) {
-  struct timeval tv;
-  tv.tv_sec = 0;
-  tv.tv_usec = 0;
-  gettimeofday(&tv,NULL);
-  return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
-}
-
-typedef  std::chrono::system_clock          GridClock;
+//typedef  std::chrono::system_clock          GridClock;
+typedef  std::chrono::high_resolution_clock   GridClock;
 typedef  std::chrono::time_point<GridClock> GridTimePoint;

 typedef  std::chrono::seconds               GridSecs;
@ -53,6 +44,15 @@ typedef  std::chrono::milliseconds          GridMillisecs;
 typedef  std::chrono::microseconds          GridUsecs;
 typedef  std::chrono::microseconds          GridTime;

+extern GridTimePoint theProgramStart;
+// Microseconds elapsed on GridClock since theProgramStart, returned as a double.
+// The duration_cast to GridUsecs yields a whole number of microseconds.
+inline double usecond(void) {
+  const GridTimePoint now = GridClock::now();
+  const GridUsecs sinceStart = std::chrono::duration_cast<GridUsecs>(now-theProgramStart);
+  return 1.0*sinceStart.count();
+}
+
+
 inline std::ostream& operator<< (std::ostream & stream, const GridSecs & time)
 {
  stream << time.count()<<" s";
--- a/Grid/perfmon/Tracing.h
+++ b/Grid/perfmon/Tracing.h
@ -0,0 +1,66 @@
+#pragma once
+#ifdef GRID_TRACING_NVTX
+#include <nvToolsExt.h>
+// NVTX backend: a GridTracer pushes a named NVTX range when constructed and pops it
+// when destroyed, so the range covers the enclosing scope.
+class GridTracer {
+public:
+  GridTracer(const char* name) {
+    nvtxRangePushA(name);
+  }
+  ~GridTracer() {
+    nvtxRangePop();
+  }
+};
+inline void tracePush(const char *name) { nvtxRangePushA(name); }
+inline void tracePop(const char *name) { nvtxRangePop(); }
+// Start/stop ranges are not issued by this backend.  traceStart must still return a
+// value: falling off the end of a non-void function is undefined behaviour.  0 matches
+// the id returned by the timer and no-tracing backends.
+inline int  traceStart(const char *name) { return 0; }
+inline void traceStop(int ID) {  }
+#endif
+
+#ifdef GRID_TRACING_ROCTX
+#include <roctracer/roctx.h>
+// ROCTX backend: a GridTracer pushes a named roctx range when constructed and pops it
+// when destroyed; both events are also echoed to std::cout.
+class GridTracer {
+ public:
+  GridTracer(const char* name) {
+    roctxRangePushA(name);
+    std::cout << "roctxRangePush "<<name<<std::endl;
+  }
+  ~GridTracer() {
+    roctxRangePop();
+    std::cout << "roctxRangePop "<<std::endl;
+  }
+};
+inline void tracePush(const char *name) { roctxRangePushA(name); }
+inline void tracePop(const char *name) { roctxRangePop(); }
+// Returns the id produced by roctxRangeStart so the caller can hand it to traceStop.
+// Previously the id was discarded and the function returned nothing (undefined behaviour).
+// NOTE(review): the roctx range id type is presumably wider than int; it is narrowed here
+// to keep the existing int-based interface - confirm ids stay within int range.
+inline int  traceStart(const char *name) { return static_cast<int>(roctxRangeStart(name)); }
+inline void traceStop(int ID) { roctxRangeStop(ID); }
+#endif
+
+#ifdef GRID_TRACING_TIMER
+// Timer backend: a GridTracer records -usecond() at construction, adds usecond() at
+// destruction, and logs the resulting elapsed microseconds to GridLogTracing.
+class GridTracer {
+ public:
+  const char *name;
+  double elapsed;
+  GridTracer(const char* _name) : name(_name), elapsed(-usecond()) { }
+  ~GridTracer() {
+    elapsed = elapsed + usecond();
+    std::cout << GridLogTracing << name << " took " <<elapsed<< " us" <<std::endl;
+  }
+};
+// The push/pop and start/stop entry points do nothing in this backend.
+inline void tracePush(const char *name) { }
+inline void tracePop(const char *name) { }
+inline int  traceStart(const char *name) { return 0; }
+inline void traceStop(int ID) { }
+#endif
+
+#ifdef GRID_TRACING_NONE
+// Tracing disabled: GRID_TRACE expands to nothing and the trace calls are no-ops.
+#define GRID_TRACE(name) 
+inline void tracePush(const char *name) {  }
+inline void tracePop(const char *name) {  }
+inline int  traceStart(const char *name) { return 0;  }
+inline void traceStop(int ID) {  }
+#else
+// GRID_TRACE declares a GridTracer local whose lifetime spans the enclosing scope.
+// Operands of ## are not macro-expanded, so pasting __COUNTER__ directly produced the
+// literal name "uniq_name_using_macros__COUNTER__" every time and two GRID_TRACE uses in
+// one scope collided.  The extra level of indirection expands __COUNTER__ first.
+#define GRID_TRACE_CONCAT_(a,b) a##b
+#define GRID_TRACE_CONCAT(a,b)  GRID_TRACE_CONCAT_(a,b)
+#define GRID_TRACE(name) GridTracer GRID_TRACE_CONCAT(uniq_name_using_macros,__COUNTER__)(name);
+#endif
--- a/Grid/qcd/action/ActionBase.h
+++ b/Grid/qcd/action/ActionBase.h
@ -42,6 +42,8 @@ public:
  bool is_smeared = false;
  RealD deriv_norm_sum;
  RealD deriv_max_sum;
+  RealD Fdt_norm_sum;
+  RealD Fdt_max_sum;
  int   deriv_num;
  RealD deriv_us;
  RealD S_us;
@ -50,13 +52,21 @@ public:
    deriv_us = S_us = refresh_us = 0.0;
    deriv_num=0;
    deriv_norm_sum = deriv_max_sum=0.0;
+    Fdt_max_sum =  Fdt_norm_sum = 0.0;
  }
-  void  deriv_log(RealD nrm, RealD max) { deriv_max_sum+=max; deriv_norm_sum+=nrm; deriv_num++;}
-  RealD deriv_max_average(void)         { return deriv_max_sum/deriv_num; };
-  RealD deriv_norm_average(void)        { return deriv_norm_sum/deriv_num; };
+  // Record one sample: add the four supplied values to their running sums and
+  // count the sample in deriv_num.
+  void  deriv_log(RealD nrm, RealD max,RealD Fdt_nrm,RealD Fdt_max) {
+    deriv_norm_sum = deriv_norm_sum + nrm;
+    deriv_max_sum  = deriv_max_sum  + max;
+    Fdt_norm_sum   = Fdt_norm_sum   + Fdt_nrm;
+    Fdt_max_sum    = Fdt_max_sum    + Fdt_max;
+    ++deriv_num;
+  }
+  // Averages of the accumulated sums over the deriv_num recorded samples.
+  // NOTE(review): deriv_num is set to 0 by the reset code above, so calling these before any
+  // deriv_log() divides by zero (NaN/inf assuming RealD is an IEEE floating-point type) -
+  // confirm callers only read them after at least one sample.
+  RealD deriv_max_average(void)       { return deriv_max_sum/deriv_num; };
+  RealD deriv_norm_average(void)      { return deriv_norm_sum/deriv_num; };
+  RealD Fdt_max_average(void)         { return Fdt_max_sum/deriv_num; };
+  RealD Fdt_norm_average(void)        { return Fdt_norm_sum/deriv_num; };
  RealD deriv_timer(void)        { return deriv_us; };
-  RealD S_timer(void)            { return deriv_us; };
-  RealD refresh_timer(void)      { return deriv_us; };
+  RealD S_timer(void)            { return S_us; };
+  RealD refresh_timer(void)      { return refresh_us; };
  void deriv_timer_start(void)   { deriv_us-=usecond(); }
  void deriv_timer_stop(void)    { deriv_us+=usecond(); }
  void refresh_timer_start(void) { refresh_us-=usecond(); }
@ -66,6 +76,7 @@ public:
  // Heatbath?
  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
  virtual RealD S(const GaugeField& U) = 0;                             // evaluate the action
+  virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ;  // if the refresh computes the action, can cache it. Alternately refreshAndAction() ?
  virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0;        // evaluate the action derivative
  virtual std::string action_name()    = 0;                             // return the action name
  virtual std::string LogParameters()  = 0;                             // prints action parameters
--- a/Grid/qcd/action/ActionParams.h
+++ b/Grid/qcd/action/ActionParams.h
@ -39,7 +39,7 @@ struct GparityWilsonImplParams {
  Coordinate twists;
                     //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
  Coordinate dirichlet; // Blocksize of dirichlet BCs
-  GparityWilsonImplParams() : twists(Nd, 0), dirichlet(Nd, 0) {};
+  GparityWilsonImplParams() : twists(Nd, 0) { dirichlet.resize(0); };
 };
  
 struct WilsonImplParams {
@ -48,13 +48,13 @@ struct WilsonImplParams {
  AcceleratorVector<Real,Nd> twist_n_2pi_L;
  AcceleratorVector<Complex,Nd> boundary_phases;
  WilsonImplParams()  {
-    dirichlet.resize(Nd,0);
+    dirichlet.resize(0);
    boundary_phases.resize(Nd, 1.0);
      twist_n_2pi_L.resize(Nd, 0.0);
  };
  WilsonImplParams(const AcceleratorVector<Complex,Nd> phi) : boundary_phases(phi), overlapCommsCompute(false) {
    twist_n_2pi_L.resize(Nd, 0.0);
-    dirichlet.resize(Nd,0);
+    dirichlet.resize(0);
  }
 };

@ -62,7 +62,7 @@ struct StaggeredImplParams {
  Coordinate dirichlet; // Blocksize of dirichlet BCs
  StaggeredImplParams()
  {
-    dirichlet.resize(Nd,0);
+    dirichlet.resize(0);
  };
 };
  
--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@ -183,16 +183,6 @@ public:
 		  GridRedBlackCartesian &FourDimRedBlackGrid,
 		  RealD _mass,RealD _M5,const ImplParams &p= ImplParams());

-  void CayleyReport(void);
-  void CayleyZeroCounters(void);
-
-  double M5Dflops;
-  double M5Dcalls;
-  double M5Dtime;
-
-  double MooeeInvFlops;
-  double MooeeInvCalls;
-  double MooeeInvTime;

 protected:
  virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
@ -47,18 +47,6 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }

-  ////////////////////////////////////////
-  // Performance monitoring
-  ////////////////////////////////////////
-  void Report(void);
-  void ZeroCounters(void);
-  double DhopTotalTime;
-  double DhopCalls;
-  double DhopCommTime;
-  double DhopComputeTime;
-  double DhopComputeTime2;
-  double DhopFaceTime;
-
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
  ///////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@ -52,18 +52,6 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }

-  ////////////////////////////////////////
-  // Performance monitoring
-  ////////////////////////////////////////
-  void Report(void);
-  void ZeroCounters(void);
-  double DhopTotalTime;
-  double DhopCalls;
-  double DhopCommTime;
-  double DhopComputeTime;
-  double DhopComputeTime2;
-  double DhopFaceTime;
-
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
  ///////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
@ -47,18 +47,6 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }

-  ////////////////////////////////////////
-  // Performance monitoring
-  ////////////////////////////////////////
-  void Report(void);
-  void ZeroCounters(void);
-  double DhopTotalTime;
-  double DhopCalls;
-  double DhopCommTime;
-  double DhopComputeTime;
-  double DhopComputeTime2;
-  double DhopFaceTime;
-
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
  ///////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@ -294,11 +294,7 @@ public:
  typedef typename Base::View_type View_type;
  typedef typename Base::StencilVector StencilVector;

-  void ZeroCountersi(void)  {  }
-  void Reporti(int calls)  {  }
-
  //  Vector<int> surface_list;
-
  WilsonStencil(GridBase *grid,
 		int npoints,
 		int checkerboard,
@ -306,7 +302,6 @@ public:
 		const std::vector<int> &distances,Parameters p)  
    : CartesianStencil<vobj,cobj,Parameters> (grid,npoints,checkerboard,directions,distances,p) 
  { 
-    ZeroCountersi();
    //    surface_list.resize(0);
    this->same_node.resize(npoints);
  };
@ -400,7 +395,6 @@ public:
    }
    this->face_table_computed=1;
    assert(this->u_comm_offset==this->_unified_buffer_size);
-    accelerator_barrier();
  }

 };
--- a/Grid/qcd/action/fermion/WilsonFermion.h
+++ b/Grid/qcd/action/fermion/WilsonFermion.h
@ -74,20 +74,6 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }

-  void Report(void);
-  void ZeroCounters(void);
-  double DhopCalls;
-  double DhopCommTime;
-  double DhopComputeTime;
-  double DhopComputeTime2;
-  double DhopFaceTime;
-  double DhopTotalTime;
-
-  double DerivCalls;
-  double DerivCommTime;
-  double DerivComputeTime;
-  double DerivDhopComputeTime;
-
  //////////////////////////////////////////////////////////////////
  // override multiply; cut number routines if pass dagger argument
  // and also make interface more uniformly consistent
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@ -78,21 +78,6 @@ public:
  int Dirichlet;
  Coordinate Block; 

-  /********** Deprecate timers **********/
-  void Report(void);
-  void ZeroCounters(void);
-  double DhopCalls;
-  double DhopCommTime;
-  double DhopComputeTime;
-  double DhopComputeTime2;
-  double DhopFaceTime;
-  double DhopTotalTime;
-
-  double DerivCalls;
-  double DerivCommTime;
-  double DerivComputeTime;
-  double DerivDhopComputeTime;
-
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
  ///////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@ -152,58 +152,6 @@ void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi
  }
 }

-template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
-{
-  this->Report();
-  Coordinate latt = GridDefaultLatt();          
-  RealD volume = this->Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
-  RealD NP     = this->_FourDimGrid->_Nprocessors;
-  if ( M5Dcalls > 0 ) {
-    std::cout << GridLogMessage << "#### M5D calls report " << std::endl;
-    std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls     : " << M5Dcalls   << std::endl;
-    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls       : " << M5Dtime / M5Dcalls << " us" << std::endl;
-
-    // Flops = 10.0*(Nc*Ns) *Ls*vol
-    RealD mflops = 10.0*(Nc*Ns)*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-
-    // Bytes = sizeof(Real) * (Nc*Ns*Nreim) * Ls * vol * (read+write) (/2 for red black counting)
-    // read = 2 ( psi[ss+s+1] and psi[ss+s-1] count as 1 )
-    // write = 1
-    RealD Gbytes = sizeof(Real) * (Nc*Ns*2) * volume * 3 /2. * 1.e-9;
-    std::cout << GridLogMessage << "Average bandwidth (GB/s)                 : " << Gbytes/M5Dtime*M5Dcalls*1.e6 << std::endl;
-  }
-
-  if ( MooeeInvCalls > 0 ) {
-
-    std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
-    std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls     : " << MooeeInvCalls   << std::endl;
-    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls            : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
-#ifdef GRID_CUDA
-    RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-#else
-    // Flops = MADD * Ls *Ls *4dvol * spin/colour/complex
-    RealD mflops = 2.0*24*this->Ls*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-#endif
-  }
-
-}
-template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void)
-{
-  this->ZeroCounters();
-  M5Dflops=0;
-  M5Dcalls=0;
-  M5Dtime=0;
-  MooeeInvFlops=0;
-  MooeeInvCalls=0;
-  MooeeInvTime=0;
-}
-
 template<class Impl>  
 void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
 {
@ -646,7 +594,6 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
  assert(mass_plus == mass_minus);
  RealD mass = mass_plus;
  
-#if (!defined(GRID_HIP))
  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
@ -765,7 +712,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
    else          q_out +=     C;
    
  }
-#endif
+
 }

 template <class Impl>
@ -832,7 +779,6 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  }
 #endif

-#if (!defined(GRID_HIP))
  int tshift = (mu == Nd-1) ? 1 : 0;
  unsigned int LLt    = GridDefaultLatt()[Tp];
  ////////////////////////////////////////////////
@ -952,7 +898,6 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,

    InsertSlice(L_Q, q_out, s , 0);
  }
-#endif
 }
 #undef Pp
 #undef Pm
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
@ -63,9 +63,6 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,

  // 10 = 3 complex mult + 2 complex add
  // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
-  M5Dcalls++;
-  M5Dtime-=usecond();
-
  uint64_t nloop = grid->oSites();
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t s = sss%Ls;
@ -78,7 +75,6 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
    spProj5p(tmp2,psi(idx_l));
    coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
  });
-  M5Dtime+=usecond();
 }

 template<class Impl>  
@ -104,9 +100,6 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
  int Ls=this->Ls;

  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  M5Dcalls++;
-  M5Dtime-=usecond();
-
  uint64_t nloop = grid->oSites();
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t s = sss%Ls;
@ -119,7 +112,6 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
    spProj5m(tmp2,psi(idx_l));
    coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
  });
-  M5Dtime+=usecond();
 }

 template<class Impl>
@ -140,8 +132,6 @@ CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi
  auto pleem = & leem[0];
  auto pueem = & ueem[0];

-  MooeeInvCalls++;
-  MooeeInvTime-=usecond();
  uint64_t nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
@ -178,8 +168,6 @@ CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi
      coalescedWrite(chi[ss+s],res);
    }
  });
-
-  MooeeInvTime+=usecond();
  
 }

@ -202,10 +190,6 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi

  assert(psi.Checkerboard() == psi.Checkerboard());

-  MooeeInvCalls++;
-  MooeeInvTime-=usecond();
-
-
  uint64_t nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
@ -242,7 +226,6 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
      coalescedWrite(chi[ss+s],res);
    }
  });
-  MooeeInvTime+=usecond();

 }

--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h
@ -94,10 +94,6 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
      d_p[ss] = diag[s];
    }}

-
-  M5Dcalls++;
-  M5Dtime-=usecond();
-
  assert(Nc==3);

  thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
@ -198,7 +194,6 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
    }
 #endif
  });
-  M5Dtime+=usecond();
 }

 template<class Impl>  
@ -242,8 +237,6 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
      d_p[ss] = diag[s];
    }}

-  M5Dcalls++;
-  M5Dtime-=usecond();
  thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
 #if 0
    alignas(64) SiteHalfSpinor hp;
@ -339,7 +332,6 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
    }
 #endif
  });
-  M5Dtime+=usecond();
 }


@ -813,9 +805,6 @@ CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,
  }
  assert(_Matp->size()==Ls*LLs);

-  MooeeInvCalls++;
-  MooeeInvTime-=usecond();
-
  if ( switcheroo<Coeff_t>::iscomplex() ) {
    thread_loop( (auto site=0;site<vol;site++),{
      MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm);
@ -825,7 +814,7 @@ CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,
      MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm);
    });
  }
-  MooeeInvTime+=usecond();
+
 }

 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h
@ -54,8 +54,6 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
  auto pupper = &upper[0];
  auto plower = &lower[0];
  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
  
  auto nloop=grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -71,7 +69,6 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
    }
  });

-  this->M5Dtime += usecond();
 }

 template<class Impl>
@ -91,8 +88,6 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
  auto plower = &lower[0];

  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();

  auto nloop=grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -108,7 +103,6 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
    }
  });

-  this->M5Dtime += usecond();
 }

 template<class Impl>
@ -127,8 +121,6 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
  auto pleem = & this->leem[0];
  auto pueem = & this->ueem[0];

-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
  uint64_t nloop=grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
@ -164,7 +156,6 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
      coalescedWrite(chi[ss+s],res);
    }
  });
-  this->MooeeInvTime += usecond();
 }

 template<class Impl>
@ -185,8 +176,6 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion

  assert(psi.Checkerboard() == psi.Checkerboard());

-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
  auto nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
@ -223,7 +212,6 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
    }
  });

-  this->MooeeInvTime += usecond();
 }

 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
@ -298,45 +298,33 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
  int LLs = in.Grid()->_rdimensions[0];
  int len =  U.Grid()->oSites();

-  DhopFaceTime-=usecond();
  st.Prepare();
  st.HaloGather(in,compressor);
-  DhopFaceTime+=usecond();

-  DhopCommTime -=usecond();
  std::vector<std::vector<CommsRequest_t> > requests;
  st.CommunicateBegin(requests);

  //  st.HaloExchangeOptGather(in,compressor); // Wilson compressor
-  DhopFaceTime-=usecond();
  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
-  DhopFaceTime+=usecond();

  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Remove explicit thread mapping introduced for OPA reasons.
  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  DhopComputeTime-=usecond();
  {
    int interior=1;
    int exterior=0;
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
-  DhopComputeTime+=usecond();

-  DhopFaceTime-=usecond();
  st.CommsMerge(compressor);
-  DhopFaceTime+=usecond();

  st.CommunicateComplete(requests);
-  DhopCommTime +=usecond();

-  DhopComputeTime2-=usecond();
  {
    int interior=0;
    int exterior=1;
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
-  DhopComputeTime2+=usecond();
 }

 template<class Impl>
@ -347,22 +335,14 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
  Compressor compressor;
  int LLs = in.Grid()->_rdimensions[0];

- //double t1=usecond();
-  DhopTotalTime -= usecond();
-  DhopCommTime -= usecond();
  st.HaloExchange(in,compressor);
-  DhopCommTime += usecond();
  
-  DhopComputeTime -= usecond();
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
  {
    int interior=1;
    int exterior=1;
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
-  DhopComputeTime += usecond();
-  DhopTotalTime   += usecond();
-
 }
 /*CHANGE END*/

@ -371,7 +351,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 {
-  DhopCalls+=1;
  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
  conformable(in.Grid(),out.Grid()); // drops the cb check

@ -383,7 +362,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
-  DhopCalls+=1;
  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
  conformable(in.Grid(),out.Grid()); // drops the cb check

@ -395,7 +373,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
-  DhopCalls+=2;
  conformable(in.Grid(),FermionGrid()); // verifies full grid
  conformable(in.Grid(),out.Grid());

@ -404,58 +381,6 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
  DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
 }

-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::Report(void) 
-{
-  Coordinate latt = GridDefaultLatt();          
-  RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
-  RealD NP = _FourDimGrid->_Nprocessors;
-  RealD NN = _FourDimGrid->NodeCount();
-
-  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
-
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls   : " 
-	    << DhopCalls   << std::endl;
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime   /Calls       : " 
-	    << DhopTotalTime   / DhopCalls << " us" << std::endl;
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime    /Calls       : " 
-	    << DhopCommTime    / DhopCalls << " us" << std::endl;
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls        : " 
-	    << DhopComputeTime / DhopCalls << " us" << std::endl;
-
-  // Average the compute time
-  _FourDimGrid->GlobalSum(DhopComputeTime);
-  DhopComputeTime/=NP;
-
-  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
-  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
-  
-  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
-  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
-
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil"    <<std::endl;  Stencil.Report();
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven"<<std::endl;  StencilEven.Report();
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
-}
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void) 
-{
-  DhopCalls       = 0;
-  DhopTotalTime    = 0;
-  DhopCommTime    = 0;
-  DhopComputeTime = 0;
-  DhopFaceTime    = 0;
-
-
-  Stencil.ZeroCounters();
-  StencilEven.ZeroCounters();
-  StencilOdd.ZeroCounters();
-}
-
 /////////////////////////////////////////////////////////////////////////
 // Implement the general interface. Here we use SAME mass on all slices
 /////////////////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
@ -334,7 +334,6 @@ void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionF
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) 
 {
-  DhopCalls+=2;
  conformable(in.Grid(), _grid);  // verifies full grid
  conformable(in.Grid(), out.Grid());

@ -346,7 +345,6 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) 
 {
-  DhopCalls+=1;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check

@ -359,7 +357,6 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) 
 {
-  DhopCalls+=1;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check

@ -418,47 +415,33 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
  Compressor compressor; 
  int len =  U.Grid()->oSites();

-  DhopTotalTime   -= usecond();
-
-  DhopFaceTime    -= usecond();
  st.Prepare();
  st.HaloGather(in,compressor);
-  DhopFaceTime    += usecond();

-  DhopCommTime -=usecond();
  std::vector<std::vector<CommsRequest_t> > requests;
  st.CommunicateBegin(requests);

-  DhopFaceTime-=usecond();
  st.CommsMergeSHM(compressor);
-  DhopFaceTime+= usecond();

  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Removed explicit thread comms
  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  DhopComputeTime    -= usecond();
  {
    int interior=1;
    int exterior=0;
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
-  DhopComputeTime    += usecond();

  st.CommunicateComplete(requests);
-  DhopCommTime +=usecond();

  // First to enter, last to leave timing
-  DhopFaceTime    -= usecond();
  st.CommsMerge(compressor);
-  DhopFaceTime    -= usecond();

-  DhopComputeTime2    -= usecond();
  {
    int interior=0;
    int exterior=1;
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
-  DhopComputeTime2    += usecond();
 }


@ -471,78 +454,16 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
 {
  assert((dag == DaggerNo) || (dag == DaggerYes));

-  DhopTotalTime   -= usecond();
-
-  DhopCommTime    -= usecond();
  Compressor compressor;
  st.HaloExchange(in, compressor);
-  DhopCommTime    += usecond();

-  DhopComputeTime -= usecond();
  {
    int interior=1;
    int exterior=1;
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
-  DhopComputeTime += usecond();
-  DhopTotalTime   += usecond();
 };

-  ////////////////////////////////////////////////////////////////
-  // Reporting
-  ////////////////////////////////////////////////////////////////
-template<class Impl>
-void ImprovedStaggeredFermion<Impl>::Report(void) 
-{
-  Coordinate latt = _grid->GlobalDimensions();
-  RealD volume = 1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
-  RealD NP = _grid->_Nprocessors;
-  RealD NN = _grid->NodeCount();
-
-  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
-
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion Number of DhopEO Calls   : " 
-	    << DhopCalls   << std::endl;
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion TotalTime   /Calls       : " 
-	    << DhopTotalTime   / DhopCalls << " us" << std::endl;
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion CommTime    /Calls       : " 
-	    << DhopCommTime    / DhopCalls << " us" << std::endl;
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion ComputeTime/Calls        : " 
-	    << DhopComputeTime / DhopCalls << " us" << std::endl;
-
-  // Average the compute time
-  _grid->GlobalSum(DhopComputeTime);
-  DhopComputeTime/=NP;
-
-  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
-  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
-  
-  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
-  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
-
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil"    <<std::endl;  Stencil.Report();
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilEven"<<std::endl;  StencilEven.Report();
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilOdd" <<std::endl;  StencilOdd.Report();
-}
-template<class Impl>
-void ImprovedStaggeredFermion<Impl>::ZeroCounters(void) 
-{
-  DhopCalls       = 0;
-  DhopTotalTime   = 0;
-  DhopCommTime    = 0;
-  DhopComputeTime = 0;
-  DhopFaceTime    = 0;
-
-  Stencil.ZeroCounters();
-  StencilEven.ZeroCounters();
-  StencilOdd.ZeroCounters();
-}
-
-
 //////////////////////////////////////////////////////// 
 // Conserved current - not yet implemented.
 ////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
@ -55,9 +55,6 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
  auto plower = &lower[0];

  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss = sss*Ls;
@ -73,7 +70,6 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
    }
  });

-  this->M5Dtime += usecond();
 }

 template<class Impl>
@ -99,9 +95,6 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
  auto pshift_coeffs = &shift_coeffs[0];

  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss = sss*Ls;
@ -122,7 +115,6 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
    }
  });

-  this->M5Dtime += usecond();
 }

 template<class Impl>
@ -143,9 +135,6 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
  auto plower = &lower[0];

  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(), {
    uint64_t ss = sss*Ls;
@ -161,8 +150,6 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
      coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
    }
  });
-
-  this->M5Dtime += usecond();
 }

 template<class Impl>
@ -186,9 +173,6 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
  auto pshift_coeffs = &shift_coeffs[0];

  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  this->M5Dcalls++;
-  this->M5Dtime -= usecond();
-
  auto pm = this->pm;

  int nloop = grid->oSites()/Ls;
@ -217,7 +201,6 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
    }
  });

-  this->M5Dtime += usecond();
 }

 template<class Impl>
@ -237,9 +220,6 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &

  if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }

-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
@ -277,7 +257,6 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
    }
  });
   
-  this->MooeeInvTime += usecond();
 }

 template<class Impl>
@ -297,8 +276,6 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
  auto pueem= & this->ueem[0];
  auto pMooeeInv_shift_lc   = &MooeeInv_shift_lc[0];
  auto pMooeeInv_shift_norm = &MooeeInv_shift_norm[0];
-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();

  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -343,7 +320,6 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
      }
  });

-  this->MooeeInvTime += usecond();
 }

 template<class Impl>
@ -363,9 +339,6 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
  auto pleem= & this->leem[0];
  auto pueem= & this->ueem[0];

-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
@ -402,7 +375,6 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
      coalescedWrite(chi[ss+s],res);
    }
  });
-  this->MooeeInvTime += usecond();
 }

 template<class Impl>
@ -423,9 +395,6 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
  auto pMooeeInvDag_shift_lc   = &MooeeInvDag_shift_lc[0];
  auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];

-  this->MooeeInvCalls++;
-  this->MooeeInvTime -= usecond();
-
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
      uint64_t ss=sss*Ls;
@ -469,7 +438,6 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
      }
  });

-  this->MooeeInvTime += usecond();
 }

 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h
@ -263,7 +263,6 @@ void NaiveStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionFiel
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) 
 {
-  DhopCalls+=2;
  conformable(in.Grid(), _grid);  // verifies full grid
  conformable(in.Grid(), out.Grid());

@ -275,7 +274,6 @@ void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) 
 {
-  DhopCalls+=1;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check

@ -288,7 +286,6 @@ void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &o
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) 
 {
-  DhopCalls+=1;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check

@ -345,47 +342,33 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, L
  Compressor compressor; 
  int len =  U.Grid()->oSites();

-  DhopTotalTime   -= usecond();
-
-  DhopFaceTime    -= usecond();
  st.Prepare();
  st.HaloGather(in,compressor);
-  DhopFaceTime    += usecond();

-  DhopCommTime -=usecond();
  std::vector<std::vector<CommsRequest_t> > requests;
  st.CommunicateBegin(requests);

-  DhopFaceTime-=usecond();
  st.CommsMergeSHM(compressor);
-  DhopFaceTime+= usecond();

  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Removed explicit thread comms
  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  DhopComputeTime    -= usecond();
  {
    int interior=1;
    int exterior=0;
    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
  }
-  DhopComputeTime    += usecond();

  st.CommunicateComplete(requests);
-  DhopCommTime +=usecond();

  // First to enter, last to leave timing
-  DhopFaceTime    -= usecond();
  st.CommsMerge(compressor);
-  DhopFaceTime    -= usecond();

-  DhopComputeTime2    -= usecond();
  {
    int interior=0;
    int exterior=1;
    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
  }
-  DhopComputeTime2    += usecond();
 }

 template <class Impl>
@ -396,78 +379,16 @@ void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Lebes
 {
  assert((dag == DaggerNo) || (dag == DaggerYes));

-  DhopTotalTime   -= usecond();
-
-  DhopCommTime    -= usecond();
  Compressor compressor;
  st.HaloExchange(in, compressor);
-  DhopCommTime    += usecond();

-  DhopComputeTime -= usecond();
  {
    int interior=1;
    int exterior=1;
    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
  }
-  DhopComputeTime += usecond();
-  DhopTotalTime   += usecond();
 };

-  ////////////////////////////////////////////////////////////////
-  // Reporting
-  ////////////////////////////////////////////////////////////////
-template<class Impl>
-void NaiveStaggeredFermion<Impl>::Report(void) 
-{
-  Coordinate latt = _grid->GlobalDimensions();
-  RealD volume = 1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
-  RealD NP = _grid->_Nprocessors;
-  RealD NN = _grid->NodeCount();
-
-  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
-
-  std::cout << GridLogMessage << "NaiveStaggeredFermion Number of DhopEO Calls   : " 
-	    << DhopCalls   << std::endl;
-  std::cout << GridLogMessage << "NaiveStaggeredFermion TotalTime   /Calls       : " 
-	    << DhopTotalTime   / DhopCalls << " us" << std::endl;
-  std::cout << GridLogMessage << "NaiveStaggeredFermion CommTime    /Calls       : " 
-	    << DhopCommTime    / DhopCalls << " us" << std::endl;
-  std::cout << GridLogMessage << "NaiveStaggeredFermion ComputeTime/Calls        : " 
-	    << DhopComputeTime / DhopCalls << " us" << std::endl;
-
-  // Average the compute time
-  _grid->GlobalSum(DhopComputeTime);
-  DhopComputeTime/=NP;
-
-  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
-  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
-  
-  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
-  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
-
-  std::cout << GridLogMessage << "NaiveStaggeredFermion Stencil"    <<std::endl;  Stencil.Report();
-  std::cout << GridLogMessage << "NaiveStaggeredFermion StencilEven"<<std::endl;  StencilEven.Report();
-  std::cout << GridLogMessage << "NaiveStaggeredFermion StencilOdd" <<std::endl;  StencilOdd.Report();
-}
-template<class Impl>
-void NaiveStaggeredFermion<Impl>::ZeroCounters(void) 
-{
-  DhopCalls       = 0;
-  DhopTotalTime   = 0;
-  DhopCommTime    = 0;
-  DhopComputeTime = 0;
-  DhopFaceTime    = 0;
-
-  Stencil.ZeroCounters();
-  StencilEven.ZeroCounters();
-  StencilOdd.ZeroCounters();
-}
-
-
 //////////////////////////////////////////////////////// 
 // Conserved current - not yet implemented.
 ////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@ -103,8 +103,6 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
    Block = block;
  }

-  ZeroCounters();
-
  if (Impl::LsVectorised) { 

    int nsimd = Simd::Nsimd();
@ -143,89 +141,6 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
   //                       <<" " << StencilEven.surface_list.size()<<std::endl;

 }
-     
-template<class Impl>
-void WilsonFermion5D<Impl>::Report(void)
-{
-  RealD NP     = _FourDimGrid->_Nprocessors;
-  RealD NN     = _FourDimGrid->NodeCount();
-  RealD volume = Ls;  
-  Coordinate latt = _FourDimGrid->GlobalDimensions();
-  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
-
-  if ( DhopCalls > 0 ) {
-    std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D Number of DhopEO Calls   : " << DhopCalls   << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D TotalTime   /Calls        : " << DhopTotalTime   / DhopCalls << " us" << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D CommTime    /Calls        : " << DhopCommTime    / DhopCalls << " us" << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D FaceTime    /Calls        : " << DhopFaceTime    / DhopCalls << " us" << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D ComputeTime1/Calls        : " << DhopComputeTime / DhopCalls << " us" << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D ComputeTime2/Calls        : " << DhopComputeTime2/ DhopCalls << " us" << std::endl;
-
-    // Average the compute time
-    _FourDimGrid->GlobalSum(DhopComputeTime);
-    DhopComputeTime/=NP;
-    RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
-
-    RealD Fullmflops = 1344*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
-
-   }
-
-  if ( DerivCalls > 0 ) {
-    std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D Number of Deriv Calls    : " <<DerivCalls <<std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls           : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls        : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D Dhop ComputeTime/Calls   : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
-    
-    RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
-    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NP << std::endl;
-
-    RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NP << std::endl;  }
-
-  if (DerivCalls > 0 || DhopCalls > 0){
-    std::cout << GridLogMessage << "WilsonFermion5D Stencil"    <<std::endl;  Stencil.Report();
-    std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl;  StencilEven.Report();
-    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
-  }
-  if ( DhopCalls > 0){
-    std::cout << GridLogMessage << "WilsonFermion5D Stencil     Reporti()"    <<std::endl;  Stencil.Reporti(DhopCalls);
-    std::cout << GridLogMessage << "WilsonFermion5D StencilEven Reporti()"<<std::endl;  StencilEven.Reporti(DhopCalls);
-    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd  Reporti()" <<std::endl;  StencilOdd.Reporti(DhopCalls);
-  }
-}
-
-template<class Impl>
-void WilsonFermion5D<Impl>::ZeroCounters(void) {
-  DhopCalls       = 0;
-  DhopCommTime    = 0;
-  DhopComputeTime = 0;
-  DhopComputeTime2= 0;
-  DhopFaceTime    = 0;
-  DhopTotalTime   = 0;
-
-  DerivCalls       = 0;
-  DerivCommTime    = 0;
-  DerivComputeTime = 0;
-  DerivDhopComputeTime = 0;
-
-  Stencil.ZeroCounters();
-  StencilEven.ZeroCounters();
-  StencilOdd.ZeroCounters();
-  Stencil.ZeroCountersi();
-  StencilEven.ZeroCountersi();
-  StencilOdd.ZeroCountersi();
-}
-

 template<class Impl>
 void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
@ -233,10 +148,10 @@ void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
  GaugeField HUmu(_Umu.Grid());
  HUmu = _Umu*(-0.5);
  if ( Dirichlet ) {
-    std::cout << GridLogMessage << " Dirichlet BCs 5d " <<Block<<std::endl;
+    std::cout << GridLogDslash << " Dirichlet BCs 5d " <<Block<<std::endl;
    Coordinate GaugeBlock(Nd);
    for(int d=0;d<Nd;d++) GaugeBlock[d] = Block[d+1];
-    std::cout << GridLogMessage << " Dirichlet BCs 4d " <<GaugeBlock<<std::endl;
+    std::cout << GridLogDslash << " Dirichlet BCs 4d " <<GaugeBlock<<std::endl;
    DirichletFilter<GaugeField> Filter(GaugeBlock);
    Filter.applyFilter(HUmu);
  }
@ -281,7 +196,6 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
 					  const FermionField &B,
 					  int dag)
 {
-  DerivCalls++;
  assert((dag==DaggerNo) ||(dag==DaggerYes));

  conformable(st.Grid(),A.Grid());
@ -292,15 +206,12 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
  FermionField Btilde(B.Grid());
  FermionField Atilde(B.Grid());

-  DerivCommTime-=usecond();
  st.HaloExchange(B,compressor);
-  DerivCommTime+=usecond();

  Atilde=A;
  int LLs = B.Grid()->_rdimensions[0];


-  DerivComputeTime-=usecond();
  for (int mu = 0; mu < Nd; mu++) {
    ////////////////////////////////////////////////////////////////////////
    // Flip gamma if dag
@ -312,8 +223,6 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
    // Call the single hop
    ////////////////////////

-    DerivDhopComputeTime -= usecond();
-
    int Usites = U.Grid()->oSites();

    Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, Usites, B, Btilde, mu,gamma);
@ -321,10 +230,8 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
    ////////////////////////////
    // spin trace outer product
    ////////////////////////////
-    DerivDhopComputeTime += usecond();
    Impl::InsertForce5D(mat, Btilde, Atilde, mu);
  }
-  DerivComputeTime += usecond();
 }

 template<class Impl>
@ -382,12 +289,10 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
                                         DoubledGaugeField & U,
                                         const FermionField &in, FermionField &out,int dag)
 {
-  DhopTotalTime-=usecond();
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
  else 
    DhopInternalSerialComms(st,lo,U,in,out,dag);
-  DhopTotalTime+=usecond();
 }


@ -396,6 +301,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 							DoubledGaugeField & U,
 							const FermionField &in, FermionField &out,int dag)
 {
+  GRID_TRACE("DhopInternalOverlappedComms");
  Compressor compressor(dag);

  int LLs = in.Grid()->_rdimensions[0];
@ -404,53 +310,58 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
  /////////////////////////////
  // Start comms  // Gather intranode and extra node differentiated??
  /////////////////////////////
-  DhopFaceTime-=usecond();
-  st.HaloExchangeOptGather(in,compressor);
-  DhopFaceTime+=usecond();
-
-  DhopCommTime -=usecond();
+  {
+    GRID_TRACE("Gather");
+    st.HaloExchangeOptGather(in,compressor);
+    accelerator_barrier();
+  }
+  
  std::vector<std::vector<CommsRequest_t> > requests;
+  auto id=traceStart("Communicate overlapped");
  st.CommunicateBegin(requests);

  /////////////////////////////
  // Overlap with comms
  /////////////////////////////
-  DhopFaceTime-=usecond();
-  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
-  DhopFaceTime+=usecond();
+  {
+    GRID_TRACE("MergeSHM");
+    st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
+  }
      
  /////////////////////////////
  // do the compute interior
  /////////////////////////////
  int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
-  DhopComputeTime-=usecond();
  if (dag == DaggerYes) {
+    GRID_TRACE("DhopDagInterior");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
  } else {
+    GRID_TRACE("DhopInterior");
    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
  }
-  DhopComputeTime+=usecond();

  /////////////////////////////
  // Complete comms
  /////////////////////////////
  st.CommunicateComplete(requests);
-  DhopCommTime   +=usecond();
+  traceStop(id);

  /////////////////////////////
  // do the compute exterior
  /////////////////////////////
-  DhopFaceTime-=usecond();
-  st.CommsMerge(compressor);
-  DhopFaceTime+=usecond();
+  {
+    GRID_TRACE("Merge");
+    st.CommsMerge(compressor);
+  }
+  

-  DhopComputeTime2-=usecond();
  if (dag == DaggerYes) {
+    GRID_TRACE("DhopDagExterior");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
  } else {
+    GRID_TRACE("DhopExterior");
    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
  }
-  DhopComputeTime2+=usecond();
 }


@ -460,29 +371,30 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
 						    const FermionField &in, 
 						    FermionField &out,int dag)
 {
+  GRID_TRACE("DhopInternalSerialComms");
  Compressor compressor(dag);

  int LLs = in.Grid()->_rdimensions[0];
+
+  {
+    GRID_TRACE("HaloExchange");
+    st.HaloExchangeOpt(in,compressor);
+  }
  
-  DhopCommTime-=usecond();
-  st.HaloExchangeOpt(in,compressor);
-  DhopCommTime+=usecond();
-  
-  DhopComputeTime-=usecond();
  int Opt = WilsonKernelsStatic::Opt;
  if (dag == DaggerYes) {
+    GRID_TRACE("DhopDag");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
  } else {
+    GRID_TRACE("Dhop");
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
  }
-  DhopComputeTime+=usecond();
 }


 template<class Impl>
 void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 {
-  DhopCalls++;
  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
  conformable(in.Grid(),out.Grid()); // drops the cb check

@ -494,7 +406,6 @@ void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
-  DhopCalls++;
  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
  conformable(in.Grid(),out.Grid()); // drops the cb check

@ -506,7 +417,6 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int
 template<class Impl>
 void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
-  DhopCalls+=2;
  conformable(in.Grid(),FermionGrid()); // verifies full grid
  conformable(in.Grid(),out.Grid());

@ -561,12 +471,17 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
  LatComplex    sk(_grid);  sk = Zero();
  LatComplex    sk2(_grid); sk2= Zero();
  LatComplex    W(_grid); W= Zero();
-  LatComplex    a(_grid); a= Zero();
  LatComplex    one  (_grid); one = ScalComplex(1.0,0.0);
  LatComplex 	cosha(_grid);
  LatComplex 	kmu(_grid);
  LatComplex 	Wea(_grid);
  LatComplex 	Wema(_grid);
+  LatComplex 	ea(_grid);
+  LatComplex 	ema(_grid);
+  LatComplex 	eaLs(_grid);
+  LatComplex 	emaLs(_grid);
+  LatComplex 	ea2Ls(_grid);
+  LatComplex 	ema2Ls(_grid);
  LatComplex 	sinha(_grid);
  LatComplex 	sinhaLs(_grid);
  LatComplex 	coshaLs(_grid);
@ -601,39 +516,29 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
  ////////////////////////////////////////////
  cosha = (one + W*W + sk) / (abs(W)*2.0);

-  // FIXME Need a Lattice acosh
-
-  {
-    autoView(cosha_v,cosha,CpuRead);
-    autoView(a_v,a,CpuWrite);
-    for(int idx=0;idx<_grid->lSites();idx++){
-      Coordinate lcoor(Nd);
-      Tcomplex cc;
-      //    RealD sgn;
-      _grid->LocalIndexToLocalCoor(idx,lcoor);
-      peekLocalSite(cc,cosha_v,lcoor);
-      assert((double)real(cc)>=1.0);
-      assert(fabs((double)imag(cc))<=1.0e-15);
-      cc = ScalComplex(::acosh(real(cc)),0.0);
-      pokeLocalSite(cc,a_v,lcoor);
-    }
-  }
-
-  Wea = ( exp( a) * abs(W)  );
-  Wema= ( exp(-a) * abs(W)  );
-  sinha = 0.5*(exp( a) - exp(-a));
-  sinhaLs = 0.5*(exp( a*Ls) - exp(-a*Ls));
-  coshaLs = 0.5*(exp( a*Ls) + exp(-a*Ls));
+  ea = (cosha + sqrt(cosha*cosha-one));
+  ema= (cosha - sqrt(cosha*cosha-one));
+  eaLs = pow(ea,Ls);
+  emaLs= pow(ema,Ls);
+  ea2Ls = pow(ea,2.0*Ls);
+  ema2Ls= pow(ema,2.0*Ls);
+  Wea= abs(W) * ea;
+  Wema= abs(W) * ema;
+  //  a=log(ea);
+  
+  sinha = 0.5*(ea - ema);
+  sinhaLs = 0.5*(eaLs-emaLs);
+  coshaLs = 0.5*(eaLs+emaLs);

  A = one / (abs(W) * sinha * 2.0) * one / (sinhaLs * 2.0);
-  F = exp( a*Ls) * (one - Wea + (Wema - one) * mass*mass);
-  F = F + exp(-a*Ls) * (Wema - one + (one - Wea) * mass*mass);
+  F = eaLs * (one - Wea + (Wema - one) * mass*mass);
+  F = F + emaLs * (Wema - one + (one - Wea) * mass*mass);
  F = F - abs(W) * sinha * 4.0 * mass;

-  Bpp =  (A/F) * (exp(-a*Ls*2.0) - one) * (one - Wema) * (one - mass*mass * one);
-  Bmm =  (A/F) * (one - exp(a*Ls*2.0)) * (one - Wea) * (one - mass*mass * one);
-  App =  (A/F) * (exp(-a*Ls*2.0) - one) * exp(-a) * (exp(-a) - abs(W)) * (one - mass*mass * one);
-  Amm =  (A/F) * (one - exp(a*Ls*2.0)) * exp(a) * (exp(a) - abs(W)) * (one - mass*mass * one);
+  Bpp =  (A/F) * (ema2Ls - one) * (one - Wema) * (one - mass*mass * one);
+  Bmm =  (A/F) * (one - ea2Ls)  * (one - Wea) * (one - mass*mass * one);
+  App =  (A/F) * (ema2Ls - one) * ema * (ema - abs(W)) * (one - mass*mass * one);
+  Amm =  (A/F) * (one - ea2Ls)  * ea  * (ea  - abs(W)) * (one - mass*mass * one);
  ABpm = (A/F) * abs(W) * sinha * 2.0  * (one + mass * coshaLs * 2.0 + mass*mass * one);

  //P+ source, P- source
@ -656,29 +561,29 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
      buf1_4d = Zero();
      ExtractSlice(buf1_4d, PRsource, (tt-1), 0);
      //G(s,t)
-      bufR_4d = bufR_4d + A * exp(a*Ls) * exp(-a*f) * signW * buf1_4d + A * exp(-a*Ls) * exp(a*f) * signW * buf1_4d;
+      bufR_4d = bufR_4d + A * eaLs * pow(ema,f) * signW * buf1_4d + A * emaLs * pow(ea,f) * signW * buf1_4d;
      //A++*exp(a(s+t))
-      bufR_4d = bufR_4d + App * exp(a*ss) * exp(a*tt) * signW * buf1_4d ;
+      bufR_4d = bufR_4d + App * pow(ea,ss) * pow(ea,tt) * signW * buf1_4d ;
      //A+-*exp(a(s-t))
-      bufR_4d = bufR_4d + ABpm * exp(a*ss) * exp(-a*tt) * signW * buf1_4d ;
+      bufR_4d = bufR_4d + ABpm * pow(ea,ss) * pow(ema,tt) * signW * buf1_4d ;
      //A-+*exp(a(-s+t))
-      bufR_4d = bufR_4d + ABpm * exp(-a*ss) * exp(a*tt) * signW * buf1_4d ;
+      bufR_4d = bufR_4d + ABpm * pow(ema,ss) * pow(ea,tt) * signW * buf1_4d ;
      //A--*exp(a(-s-t))
-      bufR_4d = bufR_4d + Amm * exp(-a*ss) * exp(-a*tt) * signW * buf1_4d ;
+      bufR_4d = bufR_4d + Amm * pow(ema,ss) * pow(ema,tt) * signW * buf1_4d ;

      //GL
      buf2_4d = Zero();
      ExtractSlice(buf2_4d, PLsource, (tt-1), 0);
      //G(s,t)
-      bufL_4d = bufL_4d + A * exp(a*Ls) * exp(-a*f) * signW * buf2_4d + A * exp(-a*Ls) * exp(a*f) * signW * buf2_4d;
+      bufL_4d = bufL_4d + A * eaLs * pow(ema,f) * signW * buf2_4d + A * emaLs * pow(ea,f) * signW * buf2_4d;
      //B++*exp(a(s+t))
-      bufL_4d = bufL_4d + Bpp * exp(a*ss) * exp(a*tt) * signW * buf2_4d ;
+      bufL_4d = bufL_4d + Bpp * pow(ea,ss) * pow(ea,tt) * signW * buf2_4d ;
      //B+-*exp(a(s-t))
-      bufL_4d = bufL_4d + ABpm * exp(a*ss) * exp(-a*tt) * signW * buf2_4d ;
+      bufL_4d = bufL_4d + ABpm * pow(ea,ss) * pow(ema,tt) * signW * buf2_4d ;
      //B-+*exp(a(-s+t))
-      bufL_4d = bufL_4d + ABpm * exp(-a*ss) * exp(a*tt) * signW * buf2_4d ;
+      bufL_4d = bufL_4d + ABpm * pow(ema,ss) * pow(ea,tt) * signW * buf2_4d ;
      //B--*exp(a(-s-t))
-      bufL_4d = bufL_4d + Bmm * exp(-a*ss) * exp(-a*tt) * signW * buf2_4d ;
+      bufL_4d = bufL_4d + Bmm * pow(ema,ss) * pow(ema,tt) * signW * buf2_4d ;
    }
    InsertSlice(bufR_4d, GR, (ss-1), 0);
    InsertSlice(bufL_4d, GL, (ss-1), 0);
@ -797,28 +702,12 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
  W = one - M5 + sk2;

  ////////////////////////////////////////////
-  // Cosh alpha -> alpha
+  // Cosh alpha -> exp(+/- alpha)
  ////////////////////////////////////////////
  cosha =  (one + W*W + sk) / (abs(W)*2.0);

-  // FIXME Need a Lattice acosh
-  {
-  autoView(cosha_v,cosha,CpuRead);
-  autoView(a_v,a,CpuWrite);
-  for(int idx=0;idx<_grid->lSites();idx++){
-    Coordinate lcoor(Nd);
-    Tcomplex cc;
-    //    RealD sgn;
-    _grid->LocalIndexToLocalCoor(idx,lcoor);
-    peekLocalSite(cc,cosha_v,lcoor);
-    assert((double)real(cc)>=1.0);
-    assert(fabs((double)imag(cc))<=1.0e-15);
-    cc = ScalComplex(::acosh(real(cc)),0.0);
-    pokeLocalSite(cc,a_v,lcoor);
-  }}
-  
-  Wea = ( exp( a) * abs(W)  );
-  Wema= ( exp(-a) * abs(W)  );
+  Wea = abs(W)*(cosha + sqrt(cosha*cosha-one));
+  Wema= abs(W)*(cosha - sqrt(cosha*cosha-one));
  
  num   = num + ( one - Wema ) * mass * in;
  denom= ( Wea - one ) + mass*mass * (one - Wema); 
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@ -76,91 +76,6 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
  StencilOdd.BuildSurfaceList(1,vol4);
 }

-template<class Impl>
-void WilsonFermion<Impl>::Report(void)
-{
-  RealD NP = _grid->_Nprocessors;
-  RealD NN = _grid->NodeCount();
-  RealD volume = 1;
-  Coordinate latt = _grid->GlobalDimensions();
-  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
-
-  if ( DhopCalls > 0 ) {
-    std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion Number of DhopEO Calls   : " << DhopCalls   << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion TotalTime   /Calls        : " << DhopTotalTime   / DhopCalls << " us" << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion CommTime    /Calls        : " << DhopCommTime    / DhopCalls << " us" << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion FaceTime    /Calls        : " << DhopFaceTime    / DhopCalls << " us" << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion ComputeTime1/Calls        : " << DhopComputeTime / DhopCalls << " us" << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion ComputeTime2/Calls        : " << DhopComputeTime2/ DhopCalls << " us" << std::endl;
-
-    // Average the compute time
-    _grid->GlobalSum(DhopComputeTime);
-    DhopComputeTime/=NP;
-    RealD mflops = 1320*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
-
-    RealD Fullmflops = 1320*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
-
-   }
-
-  if ( DerivCalls > 0 ) {
-    std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
-    std::cout << GridLogMessage << "WilsonFermion Number of Deriv Calls    : " <<DerivCalls <<std::endl;
-    std::cout << GridLogMessage << "WilsonFermion CommTime/Calls           : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
-    std::cout << GridLogMessage << "WilsonFermion ComputeTime/Calls        : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
-    std::cout << GridLogMessage << "WilsonFermion Dhop ComputeTime/Calls   : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
-
-    // how to count flops here?
-    RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
-    std::cout << GridLogMessage << "Average mflops/s per call               ? : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per node      ? : " << mflops/NP << std::endl;
-
-    // how to count flops here?
-    RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call (full)        ? : " << Fullmflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per node (full) ? : " << Fullmflops/NP << std::endl;  }
-
-  if (DerivCalls > 0 || DhopCalls > 0){
-    std::cout << GridLogMessage << "WilsonFermion Stencil"    <<std::endl;  Stencil.Report();
-    std::cout << GridLogMessage << "WilsonFermion StencilEven"<<std::endl;  StencilEven.Report();
-    std::cout << GridLogMessage << "WilsonFermion StencilOdd" <<std::endl;  StencilOdd.Report();
-  }
-  if ( DhopCalls > 0){
-    std::cout << GridLogMessage << "WilsonFermion Stencil     Reporti()"    <<std::endl;  Stencil.Reporti(DhopCalls);
-    std::cout << GridLogMessage << "WilsonFermion StencilEven Reporti()"<<std::endl;  StencilEven.Reporti(DhopCalls);
-    std::cout << GridLogMessage << "WilsonFermion StencilOdd  Reporti()" <<std::endl;  StencilOdd.Reporti(DhopCalls);
-  }
-}
-
-template<class Impl>
-void WilsonFermion<Impl>::ZeroCounters(void) {
-  DhopCalls       = 0; // ok
-  DhopCommTime    = 0;
-  DhopComputeTime = 0;
-  DhopComputeTime2= 0;
-  DhopFaceTime    = 0;
-  DhopTotalTime   = 0;
-
-  DerivCalls       = 0; // ok
-  DerivCommTime    = 0;
-  DerivComputeTime = 0;
-  DerivDhopComputeTime = 0;
-
-  Stencil.ZeroCounters();
-  StencilEven.ZeroCounters();
-  StencilOdd.ZeroCounters();
-  Stencil.ZeroCountersi();
-  StencilEven.ZeroCountersi();
-  StencilOdd.ZeroCountersi();
-}
-
-
 template <class Impl>
 void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
 {
@ -320,7 +235,6 @@ template <class Impl>
 void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
                                        GaugeField &mat, const FermionField &A,
                                        const FermionField &B, int dag) {
-  DerivCalls++;
  assert((dag == DaggerNo) || (dag == DaggerYes));

  Compressor compressor(dag);
@ -329,11 +243,8 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
  FermionField Atilde(B.Grid());
  Atilde = A;

-  DerivCommTime-=usecond();
  st.HaloExchange(B, compressor);
-  DerivCommTime+=usecond();

-  DerivComputeTime-=usecond();
  for (int mu = 0; mu < Nd; mu++) {
    ////////////////////////////////////////////////////////////////////////
    // Flip gamma (1+g)<->(1-g) if dag
@ -341,7 +252,6 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
    int gamma = mu;
    if (!dag) gamma += Nd;

-    DerivDhopComputeTime -= usecond();
    int Ls=1;
    Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma);

@ -349,9 +259,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
    // spin trace outer product
    //////////////////////////////////////////////////
    Impl::InsertForce4D(mat, Btilde, Atilde, mu);
-    DerivDhopComputeTime += usecond();
  }
-  DerivComputeTime += usecond();
 }

 template <class Impl>
@ -398,7 +306,6 @@ void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, co
 template <class Impl>
 void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
 {
-  DhopCalls+=2;
  conformable(in.Grid(), _grid);  // verifies full grid
  conformable(in.Grid(), out.Grid());

@ -410,7 +317,6 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
 template <class Impl>
 void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
 {
-  DhopCalls++;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check

@ -423,7 +329,6 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
 template <class Impl>
 void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
-  DhopCalls++;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check

@ -488,14 +393,12 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
                                       const FermionField &in,
                                       FermionField &out, int dag)
 {
-  DhopTotalTime-=usecond();
 #ifdef GRID_OMP
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
  else
 #endif
    DhopInternalSerial(st,lo,U,in,out,dag);
-  DhopTotalTime+=usecond();
 }

 template <class Impl>
@ -504,6 +407,7 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
 						      const FermionField &in,
 						      FermionField &out, int dag)
 {
+  GRID_TRACE("DhopOverlapped");
  assert((dag == DaggerNo) || (dag == DaggerYes));

  Compressor compressor(dag);
@ -514,53 +418,55 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
  /////////////////////////////
  std::vector<std::vector<CommsRequest_t> > requests;
  st.Prepare();
-  DhopFaceTime-=usecond();
-  st.HaloGather(in,compressor);
-  DhopFaceTime+=usecond();
+  {
+    GRID_TRACE("Gather");
+    st.HaloGather(in,compressor);
+  }

-  DhopCommTime -=usecond();
+  tracePush("Communication");
  st.CommunicateBegin(requests);

  /////////////////////////////
  // Overlap with comms
  /////////////////////////////
-  DhopFaceTime-=usecond();
-  st.CommsMergeSHM(compressor);
-  DhopFaceTime+=usecond();
+  {
+    GRID_TRACE("MergeSHM");
+    st.CommsMergeSHM(compressor);
+  }

  /////////////////////////////
  // do the compute interior
  /////////////////////////////
  int Opt = WilsonKernelsStatic::Opt;
-  DhopComputeTime-=usecond();
  if (dag == DaggerYes) {
+    GRID_TRACE("DhopDagInterior");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
  } else {
+    GRID_TRACE("DhopInterior");
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
  }
-  DhopComputeTime+=usecond();

  /////////////////////////////
  // Complete comms
  /////////////////////////////
  st.CommunicateComplete(requests);
-  DhopCommTime   +=usecond();
-
-  DhopFaceTime-=usecond();
-  st.CommsMerge(compressor);
-  DhopFaceTime+=usecond();
+  tracePop("Communication");

+  {
+    GRID_TRACE("Merge");
+    st.CommsMerge(compressor);
+  }
  /////////////////////////////
  // do the compute exterior
  /////////////////////////////

-  DhopComputeTime2-=usecond();
  if (dag == DaggerYes) {
+    GRID_TRACE("DhopDagExterior");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
  } else {
+    GRID_TRACE("DhopExterior");
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
  }
-  DhopComputeTime2+=usecond();
 };


@ -570,20 +476,22 @@ void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
                                       const FermionField &in,
                                       FermionField &out, int dag)
 {
+  GRID_TRACE("DhopSerial");
  assert((dag == DaggerNo) || (dag == DaggerYes));
  Compressor compressor(dag);
-  DhopCommTime-=usecond();
-  st.HaloExchange(in, compressor);
-  DhopCommTime+=usecond();
+  {
+    GRID_TRACE("HaloExchange");
+    st.HaloExchange(in, compressor);
+  }

-  DhopComputeTime-=usecond();
  int Opt = WilsonKernelsStatic::Opt;
  if (dag == DaggerYes) {
+    GRID_TRACE("DhopDag");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
  } else {
+    GRID_TRACE("Dhop");
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
  }
-  DhopComputeTime+=usecond();
 };
 /*Change ends */

--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@ -72,20 +72,15 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
  if (SE->_is_local) {						\
    int perm= SE->_permute;					\
    auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);	\
-    spProj(chi,tmp);						\
-  } else if ( st.same_node[Dir] ) {				\
-    chi = coalescedRead(buf[SE->_offset],lane);			\
-  }								\
-  acceleratorSynchronise();						\
-  if (SE->_is_local || st.same_node[Dir] ) {			\
-    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
-    Recon(result, Uchi);					\
-  }								\
+    spProj(chi,tmp);							\
+    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);			\
+    Recon(result, Uchi);						\
+  }									\
  acceleratorSynchronise();

 #define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon)		\
  SE = st.GetEntry(ptype, Dir, sF);				\
-  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
+  if (!SE->_is_local ) {		\
    auto chi = coalescedRead(buf[SE->_offset],lane);		\
    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
    Recon(result, Uchi);					\
@ -416,19 +411,6 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
 #undef LoopBody
 }

-#define KERNEL_CALL_TMP(A) \
-  const uint64_t    NN = Nsite*Ls;					\
-  auto U_p = & U_v[0];							\
-  auto in_p = & in_v[0];						\
-  auto out_p = & out_v[0];						\
-  auto st_p = st_v._entries_p;						\
-  auto st_perm = st_v._permute_type;					\
-  accelerator_forNB( ss, NN, Simd::Nsimd(), {				\
-      int sF = ss;							\
-      int sU = ss/Ls;							\
-      WilsonKernels<Impl>::A(st_perm,st_p,U_p,buf,sF,sU,in_p,out_p);	\
-    });									\
-  accelerator_barrier();

 #define KERNEL_CALLNB(A)						\
  const uint64_t    NN = Nsite*Ls;					\
@ -448,8 +430,7 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
      int sF = ptr[ss];							\
      int sU = ss/Ls;							\
      WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\
-    });									\
-  accelerator_barrier();
+    });									

 #define ASM_CALL(A)							\
  thread_for( ss, Nsite, {						\
@ -471,7 +452,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
   if( interior && exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
 #ifdef SYCL_HACK     
-     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_TMP(HandDhopSiteSycl);    return; }
+     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteSycl);    return; }
 #else
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
 #endif     
--- a/Grid/qcd/action/filters/DDHMCFilter.h
+++ b/Grid/qcd/action/filters/DDHMCFilter.h
@ -91,6 +91,19 @@ struct DDHMCFilter: public MomentumFilterBase<GaugeField>
 	  U_mu = where(mod(coor,B1)==Integer(B1-4),zzz_mu,U_mu); 
 	  PokeIndex<LorentzIndex>(U, U_mu, mu);
 	}
+	if ( Width==4) { 
+	  U    = where(mod(coor,B1)==Integer(B1-4),zzz,U);
+	  U    = where(mod(coor,B1)==Integer(B1-3),zzz,U);
+	  U    = where(mod(coor,B1)==Integer(B1-2),zzz,U);
+	  U    = where(mod(coor,B1)==Integer(B1-1),zzz,U);
+	  U    = where(mod(coor,B1)==Integer(0)   ,zzz,U); 
+	  U    = where(mod(coor,B1)==Integer(1)   ,zzz,U); 
+	  U    = where(mod(coor,B1)==Integer(2)   ,zzz,U); 
+	  U    = where(mod(coor,B1)==Integer(3)   ,zzz,U); 
+	  auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
+	  U_mu = where(mod(coor,B1)==Integer(B1-5),zzz_mu,U_mu); 
+	  PokeIndex<LorentzIndex>(U, U_mu, mu);
+	}
      }

    }
--- a/Grid/qcd/action/filters/MomentumFilter.h
+++ b/Grid/qcd/action/filters/MomentumFilter.h
@ -38,6 +38,7 @@ NAMESPACE_BEGIN(Grid);
 template<typename MomentaField>
 struct MomentumFilterBase{
+  // Abstract interface for momentum filters: a concrete filter implements
+  // applyFilter, which receives the momentum field P by mutable reference.
  virtual void applyFilter(MomentaField &P) const = 0;
+  // Virtual destructor so that a derived filter deleted through a
+  // MomentumFilterBase pointer runs its own destructor.
+  virtual ~MomentumFilterBase() = default;
 };

 //Do nothing
@ -83,7 +84,6 @@ struct MomentumFilterApplyPhase: public MomentumFilterBase<MomentaField>{
    
  }

-
 };


--- a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
@ -67,6 +67,36 @@ NAMESPACE_BEGIN(Grid);
      virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}      
    };

+    ////////////////////////////////////////////////////////////////////////////////
+    // Mixed-precision counterpart of the one-flavour even-odd ratio rational action.
+    // Thin wrapper over GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction that
+    // accepts OneFlavourRationalParams and converts them with inv_pow fixed to 2.
+    ////////////////////////////////////////////////////////////////////////////////
+    template<class Impl,class ImplF>
+    class OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction : public GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF> {
+    public:
+      typedef OneFlavourRationalParams Params;
+    private:
+      // Convert OneFlavourRationalParams into the RationalActionParams expected by the base class.
+      // The single tolerance and degree of the input are used for both the action and the MD settings.
+      // NOTE(review): md_tolerance is taken from in.tolerance; a separate MD tolerance on the
+      // input parameters, if one exists, is not consulted here -- confirm this is intended.
+      static RationalActionParams transcribe(const Params &in){
+	RationalActionParams out;
+	out.inv_pow = 2;
+	out.lo = in.lo;
+	out.hi = in.hi;
+	out.MaxIter = in.MaxIter;
+	out.action_tolerance = out.md_tolerance = in.tolerance;
+	out.action_degree = out.md_degree = in.degree;
+	out.precision = in.precision;
+	out.BoundsCheckFreq = in.BoundsCheckFreq;
+	return out;
+      }
+
+    public:
+      // _NumOp/_DenOp and _NumOpF/_DenOpF are the numerator/denominator fermion operators for the
+      // two implementation policies (Impl and ImplF); all arguments are forwarded to the base
+      // class, with p converted by transcribe().
+      OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
+								 FermionOperator<Impl>  &_DenOp, 
+								 FermionOperator<ImplF>  &_NumOpF, 
+								 FermionOperator<ImplF>  &_DenOpF, 
+								 const Params & p, Integer ReliableUpdateFreq
+							) : 
+	GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF>(_NumOp, _DenOp,_NumOpF, _DenOpF, transcribe(p),ReliableUpdateFreq){}
+
+      // Report this class's own name so it is distinguishable from the non-mixed-precision action.
+      virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction";}      
+    };
+
 NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h
@ -85,7 +85,12 @@ NAMESPACE_BEGIN(Grid);
 	PowerNegQuarter.Init(remez,param.tolerance,true);
      };

-      virtual std::string action_name(){return "OneFlavourRatioRationalPseudoFermionAction";}
+      // Human-readable label for this action: the class name followed by the masses
+      // reported by DenOp and NumOp, written as "det(<den mass>) / det(<num mass>)".
+      virtual std::string action_name(){
+	std::stringstream sstream;
+	sstream<<"OneFlavourRatioRationalPseudoFermionAction det("
+	       <<DenOp.Mass()<<") / det("<<NumOp.Mass()<<")";
+	return sstream.str();
+      }
      
      virtual std::string LogParameters(){
 	std::stringstream sstream;
--- a/Grid/qcd/hmc/integrators/Integrator.h
+++ b/Grid/qcd/hmc/integrators/Integrator.h
@ -143,9 +143,10 @@ protected:
      force = FieldImplementation::projectForce(force); // Ta for gauge fields
      double end_force = usecond();

+      //      DumpSliceNorm("force ",force,Nd-1);
      MomFilter->applyFilter(force);
      std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<< std::endl;
-      DumpSliceNorm("force ",force,Nd-1);
+      DumpSliceNorm("force filtered ",force,Nd-1);
      
      Real force_abs   = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x]) 
      Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    
@ -153,7 +154,7 @@ protected:
      Real force_max   = std::sqrt(maxLocalNorm2(force));
      Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR;    

-      as[level].actions.at(a)->deriv_log(force_abs,force_max);
+      as[level].actions.at(a)->deriv_log(force_abs,force_max,impulse_abs,impulse_max);
      
      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Force average: " << force_abs <<" "<<name<<std::endl;
      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Force max    : " << force_max <<" "<<name<<std::endl;
@ -285,6 +286,8 @@ public:
 		  <<"["<<level<<"]["<< actionID<<"] : "
 		  <<" force max " << as[level].actions.at(actionID)->deriv_max_average()
 		  <<" norm "      << as[level].actions.at(actionID)->deriv_norm_average()
+		  <<" Fdt max  "  << as[level].actions.at(actionID)->Fdt_max_average()
+		  <<" Fdt norm "  << as[level].actions.at(actionID)->Fdt_norm_average()
 		  <<" calls "     << as[level].actions.at(actionID)->deriv_num
 		  << std::endl;
      }
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@ -290,6 +290,8 @@ public:
  std::vector<Decompress> DecompressionsSHM;
  std::vector<CopyReceiveBuffer> CopyReceiveBuffers ;
  std::vector<CachedTransfer> CachedTransfers;
+  std::vector<CommsRequest_t> MpiReqs;
+  
  ///////////////////////////////////////////////////////////
  // Unified Comms buffers for all directions
  ///////////////////////////////////////////////////////////
@ -357,9 +359,9 @@ public:
  ////////////////////////////////////////////////////////////////////////
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
-    reqs.resize(Packets.size());
+    accelerator_barrier();
    for(int i=0;i<Packets.size();i++){
-      _grid->StencilSendToRecvFromBegin(reqs[i],
+      _grid->StencilSendToRecvFromBegin(MpiReqs,
 					Packets[i].send_buf,
 					Packets[i].to_rank,Packets[i].do_send,
 					Packets[i].recv_buf,
@ -370,41 +372,19 @@ public:

  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
-    for(int i=0;i<Packets.size();i++){
-      _grid->StencilSendToRecvFromComplete(reqs[i],i);
-    }
+    _grid->StencilSendToRecvFromComplete(MpiReqs,0);
  }
  ////////////////////////////////////////////////////////////////////////
  // Blocking send and receive. Either sequential or parallel.
  ////////////////////////////////////////////////////////////////////////
  void Communicate(void)
  {
-    if ( CartesianCommunicator::CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySequential ){
-      /////////////////////////////////////////////////////////
-      // several way threaded on different communicators.
-      // Cannot combine with Dirichlet operators
-      // This scheme is needed on Intel Omnipath for best performance
-      // Deprecate once there are very few omnipath clusters
-      /////////////////////////////////////////////////////////
-      int nthreads = CartesianCommunicator::nCommThreads;
-      int old = GridThread::GetThreads();
-      GridThread::SetThreads(nthreads);
-      thread_for(i,Packets.size(),{
-	  _grid->StencilSendToRecvFrom(Packets[i].send_buf,
-				       Packets[i].to_rank,Packets[i].do_send,
-				       Packets[i].recv_buf,
-				       Packets[i].from_rank,Packets[i].do_recv,
-				       Packets[i].bytes,i);
-      });
-      GridThread::SetThreads(old);
-    } else { 
-      /////////////////////////////////////////////////////////
-      // Concurrent and non-threaded asynch calls to MPI
-      /////////////////////////////////////////////////////////
-      std::vector<std::vector<CommsRequest_t> > reqs;
-      this->CommunicateBegin(reqs);
-      this->CommunicateComplete(reqs);
-    }
+    /////////////////////////////////////////////////////////
+    // Concurrent and non-threaded asynch calls to MPI
+    /////////////////////////////////////////////////////////
+    std::vector<std::vector<CommsRequest_t> > reqs;
+    this->CommunicateBegin(reqs);
+    this->CommunicateComplete(reqs);
  }

  template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
@ -484,7 +464,6 @@ public:
    face_table_computed=1;
    assert(u_comm_offset==_unified_buffer_size);

-    accelerator_barrier();
  }

  /////////////////////////
@ -499,6 +478,7 @@ public:
    Packets.resize(0);
    CopyReceiveBuffers.resize(0);
    CachedTransfers.resize(0);
+    MpiReqs.resize(0);
  }
  void AddCopy(void *from,void * to, Integer bytes)
  {
@ -711,7 +691,9 @@ public:
    this->_comms_recv.resize(npoints); 
    this->same_node.resize(npoints);

-    if ( p.dirichlet.size() ) DirichletBlock(p.dirichlet); // comms send/recv set up
+    if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
+
+    DirichletBlock(p.dirichlet); // comms send/recv set up

    _unified_buffer_size=0;
    surface_list.resize(0);
@ -793,7 +775,6 @@ public:
      u_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
      u_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
    }
-
    PrecomputeByteOffsets();
  }

@ -1105,7 +1086,6 @@ public:
 	  // Gather locally
 	  ////////////////////////////////////////////////////////
 	  assert(send_buf!=NULL);
-
 	  Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,comm_off,so);
 	}

@ -1212,8 +1192,9 @@ public:
 				  face_table[face_idx].size()*sizeof(face_table_host[0]));
 	}

-	if ( comms_send || comms_recv )
+	if ( comms_send || comms_recv ) {
 	  Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type);
+	}
 	face_idx++;

 	//spointers[0] -- low
@ -1270,10 +1251,6 @@ public:
    return 0;
  }

-  void ZeroCounters(void) { };
-
-  void Report(void) {   };
-
 };
 NAMESPACE_END(Grid);

--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@ -1,6 +1,7 @@
 #include <Grid/GridCore.h>

 NAMESPACE_BEGIN(Grid);
+int      world_rank; // Use to control world rank for print guarding
 int      acceleratorAbortOnGpuError=1;
 uint32_t accelerator_threads=2;
 uint32_t acceleratorThreads(void)       {return accelerator_threads;};
@ -16,7 +17,7 @@ void     acceleratorThreads(uint32_t t) {accelerator_threads = t;};
 #ifdef GRID_CUDA
 cudaDeviceProp *gpu_props;
 cudaStream_t copyStream;
-cudaStream_t cpuStream;
+cudaStream_t computeStream;
 void acceleratorInit(void)
 {
  int nDevices = 1;
@ -24,7 +25,8 @@ void acceleratorInit(void)
  gpu_props = new cudaDeviceProp[nDevices];

  char * localRankStr = NULL;
-  int rank = 0, world_rank=0; 
+  int rank = 0;
+  world_rank=0; 
  if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);}
  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
  if ((localRankStr = getenv(ENV_RANK_SLURM  )) != NULL) { world_rank = atoi(localRankStr);}
@ -99,7 +101,7 @@ void acceleratorInit(void)

  cudaSetDevice(device);
  cudaStreamCreate(&copyStream);
-  cudaStreamCreate(&cpuStream);
+  cudaStreamCreate(&computeStream);
  const int len=64;
  char busid[len];
  if( rank == world_rank ) { 
@ -114,7 +116,7 @@ void acceleratorInit(void)
 #ifdef GRID_HIP
 hipDeviceProp_t *gpu_props;
 hipStream_t copyStream;
-hipStream_t cpuStream;
+hipStream_t computeStream;
 void acceleratorInit(void)
 {
  int nDevices = 1;
@ -122,7 +124,8 @@ void acceleratorInit(void)
  gpu_props = new hipDeviceProp_t[nDevices];

  char * localRankStr = NULL;
-  int rank = 0, world_rank=0; 
+  int rank = 0;
+  world_rank=0; 
  // We extract the local rank initialization using an environment variable
  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
  {
@ -183,7 +186,7 @@ void acceleratorInit(void)
 #endif
  hipSetDevice(device);
  hipStreamCreate(&copyStream);
-  hipStreamCreate(&cpuStream);
+  hipStreamCreate(&computeStream);
  const int len=64;
  char busid[len];
  if( rank == world_rank ) { 
@ -210,7 +213,8 @@ void acceleratorInit(void)
 #endif
  
  char * localRankStr = NULL;
-  int rank = 0, world_rank=0; 
+  int rank = 0;
+  world_rank=0; 

  // We extract the local rank initialization using an environment variable
  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@ -107,7 +107,7 @@ void     acceleratorInit(void);

 extern int acceleratorAbortOnGpuError;
 extern cudaStream_t copyStream;
-extern cudaStream_t cpuStream;
+extern cudaStream_t computeStream;

 accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #ifdef GRID_SIMT
@ -135,7 +135,7 @@ inline void cuda_mem(void)
    };									\
    dim3 cu_threads(nsimd,acceleratorThreads(),1);			\
    dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\
-    LambdaApply<<<cu_blocks,cu_threads,0,cpuStream>>>(num1,num2,nsimd,lambda);	\
+    LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda);	\
  }

 #define accelerator_for6dNB(iter1, num1,				\
@ -154,7 +154,7 @@ inline void cuda_mem(void)
    };									\
    dim3 cu_blocks (num1,num2,num3);					\
    dim3 cu_threads(num4,num5,num6);					\
-    Lambda6Apply<<<cu_blocks,cu_threads,0,cpuStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
+    Lambda6Apply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
  }

 template<typename lambda>  __global__
@ -190,7 +190,7 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,

 #define accelerator_barrier(dummy)					\
  {									\
-    cudaStreamSynchronize(cpuStream);					\
+    cudaStreamSynchronize(computeStream);				\
    cudaError err = cudaGetLastError();					\
    if ( cudaSuccess != err ) {						\
      printf("accelerator_barrier(): Cuda error %s \n",			\
@ -340,7 +340,7 @@ NAMESPACE_BEGIN(Grid);
 #define accelerator_inline __host__ __device__ inline

 extern hipStream_t copyStream;
-extern hipStream_t cpuStream;
+extern hipStream_t computeStream;
 /*These routines define mapping from thread grid to loop & vector lane indexing */
 accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #ifdef GRID_SIMT
@ -362,16 +362,15 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
    dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
    if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \
      hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads,		\
-			 0,cpuStream,					\
+   	                 0,computeStream,						\
 			 num1,num2,nsimd, lambda);			\
    } else { \
      hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads,		\
-			 0,cpuStream,					\
+			 0,computeStream,				\
 			 num1,num2,nsimd, lambda);			\
    } \
  }

-
 template<typename lambda>  __global__
 __launch_bounds__(64,1)
 void LambdaApply64(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
@ -400,7 +399,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)

 #define accelerator_barrier(dummy)				\
  {								\
-    hipStreamSynchronize(cpuStream);					\
+    hipStreamSynchronize(computeStream);			\
    auto err = hipGetLastError();				\
    if ( err != hipSuccess ) {					\
      printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
@ -443,7 +442,7 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas

 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
-  hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);
+  hipMemcpyDtoDAsync(to,from,bytes, copyStream);
 }
 inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); };

--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@ -356,6 +356,11 @@ void Grid_init(int *argc,char ***argv)
  //////////////////////////////////////////////////////////
  CartesianCommunicator::Init(argc,argv);

+  GridLogger::GlobalStopWatch.Stop();
+  CartesianCommunicator::BarrierWorld();
+  GridLogger::GlobalStopWatch.Reset();// Back to zero with synchronised clock
+  GridLogger::GlobalStopWatch.Start();
+
  ////////////////////////////////////
  // Banner after MPI (unless GPU)
  ////////////////////////////////////
--- a/HMC/Mobius2p1f_DD_RHMC_96I_mixed.cc
+++ b/HMC/Mobius2p1f_DD_RHMC_96I_mixed.cc
@ -128,8 +128,14 @@ template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, c
      ////////////////////////////////////////////////////////////////////////////////////
      // Make a mixed precision conjugate gradient
      ////////////////////////////////////////////////////////////////////////////////////
-      MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
+#if 1
+      RealD delta=1.e-4;
+      std::cout << GridLogMessage << "Calling reliable update Conjugate Gradient" <<std::endl;
+      ConjugateGradientReliableUpdate<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations*MaxOuterIterations,delta,SinglePrecGrid5,LinOpF,LinOpD);
+#else      
      std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
+      MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
+#endif
      MPCG(src,psi);
    }
  };
@ -141,6 +147,10 @@ int main(int argc, char **argv) {
  using namespace Grid;

  Grid_init(&argc, &argv);
+
+  CartesianCommunicator::BarrierWorld();
+  std::cout << GridLogMessage << " Clock skew check" <<std::endl;
+  
  int threads = GridThread::GetThreads();

   // Typedefs to simplify notation
@ -161,7 +171,7 @@ int main(int argc, char **argv) {
  //  MD.name    = std::string("Force Gradient");
  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
  MD.name    = std::string("MinimumNorm2");
-  MD.MDsteps =  4;
+  MD.MDsteps =  6;
  MD.trajL   = 1.0;

  HMCparameters HMCparams;
@ -183,7 +193,7 @@ int main(int argc, char **argv) {
  CPparams.saveInterval  = 1;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
-
+  std::cout << "loaded NERSC checpointer"<<std::endl;
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
@ -204,7 +214,8 @@ int main(int argc, char **argv) {
  Real light_mass   = 7.8e-4;
  Real strange_mass = 0.02132;
  Real pv_mass      = 1.0;
-  std::vector<Real> hasenbusch({ light_mass, 3.8e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass });
+  //  std::vector<Real> hasenbusch({ light_mass, 3.8e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass });
+  std::vector<Real> hasenbusch({ light_mass, 5e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass });

  // FIXME:
  // Same in MC and MD
@ -287,6 +298,7 @@ int main(int argc, char **argv) {
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file
  TheHMC.initializeGaugeFieldAndRNGs(U);
+  std::cout << "loaded NERSC gauge field"<<std::endl;


  // These lines are unecessary if BC are all periodic
--- a/HMC/Mobius2p1f_DD_RHMC_96I_mixedmshift.cc
+++ b/HMC/Mobius2p1f_DD_RHMC_96I_mixedmshift.cc
@ -0,0 +1,474 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./HMC/Mobius2p1f_DD_RHMC_96I_mixedmshift.cc
+
+Copyright (C) 2015-2016
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// Solver adaptor that exposes a mixed-precision conjugate gradient through the
+// OperatorFunction interface.  Each call first refreshes the single-precision
+// fermion operator's gauge links from the double-precision operator, then solves
+// with the LinOpF / LinOpD pair stored at construction.
+template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
+  class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
+  public:
+    typedef typename FermionOperatorD::FermionField FieldD;
+    typedef typename FermionOperatorF::FermionField FieldF;
+
+    using OperatorFunction<FieldD>::operator();
+
+    RealD   Tolerance;
+    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
+    Integer MaxInnerIterations;
+    Integer MaxOuterIterations;
+    GridBase* SinglePrecGrid4; //Grid for single-precision fields
+    GridBase* SinglePrecGrid5; //Grid for single-precision fields
+    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
+
+    FermionOperatorF &FermOpF;
+    FermionOperatorD &FermOpD;
+    SchurOperatorF &LinOpF;
+    SchurOperatorD &LinOpD;
+
+    Integer TotalInnerIterations; //Number of inner CG iterations
+    Integer TotalOuterIterations; //Number of restarts
+    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
+
+    // Stores the solver parameters, the single-precision grids and references to the
+    // single/double precision fermion and Schur operators.  The operators are held by
+    // reference, so they must outlive this object.
+    // The initialiser list follows the member declaration order (members are always
+    // initialised in declaration order, so this only removes the reorder warning).
+    MixedPrecisionConjugateGradientOperatorFunction(RealD tol, 
+						    Integer maxinnerit, 
+						    Integer maxouterit, 
+						    GridBase* _sp_grid4, 
+						    GridBase* _sp_grid5, 
+						    FermionOperatorF &_FermOpF,
+						    FermionOperatorD &_FermOpD,
+						    SchurOperatorF   &_LinOpF,
+						    SchurOperatorD   &_LinOpD): 
+      Tolerance(tol), 
+      InnerTolerance(tol), 
+      MaxInnerIterations(maxinnerit), 
+      MaxOuterIterations(maxouterit), 
+      SinglePrecGrid4(_sp_grid4),
+      SinglePrecGrid5(_sp_grid5),
+      OuterLoopNormMult(100.),
+      FermOpF(_FermOpF),
+      FermOpD(_FermOpD),
+      LinOpF(_LinOpF),
+      LinOpD(_LinOpD),
+      TotalInnerIterations(0),     // counters start from a defined value
+      TotalOuterIterations(0),
+      TotalFinalStepIterations(0)
+    { 
+      /* Debugging instances of objects; references are stored
+      std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpF " <<std::hex<< &LinOpF<<std::dec <<std::endl;
+      std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpD " <<std::hex<< &LinOpD<<std::dec <<std::endl;
+      std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpF " <<std::hex<< &FermOpF<<std::dec <<std::endl;
+      std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpD " <<std::hex<< &FermOpD<<std::dec <<std::endl;
+      */
+    }
+
+    // Solve for psi given src.
+    // LinOpU is used only to check that it wraps the same fermion matrix as the stored
+    // LinOpD; the solve itself runs on the stored LinOpF / LinOpD pair.
+    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
+
+      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
+
+      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
+      
+      //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpU " <<std::hex<< &(SchurOpU->_Mat)<<std::dec <<std::endl;
+      //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpD " <<std::hex<< &(LinOpD._Mat) <<std::dec <<std::endl;
+      // Assumption made in code to extract gauge field
+      // We could avoid storing LinopD reference altogether ?
+      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Must snarf a single precision copy of the gauge field in Linop_d argument
+      ////////////////////////////////////////////////////////////////////////////////////
+      typedef typename FermionOperatorF::GaugeLinkField GaugeLinkFieldF;
+      typedef typename FermionOperatorD::GaugeLinkField GaugeLinkFieldD;
+
+      GridBase * GridPtrF = SinglePrecGrid4;
+      GridBase * GridPtrD = FermOpD.Umu.Grid();
+      GaugeLinkFieldF Umu_f(GridPtrF);
+      //      std::cout << " Dim gauge field "<<GridPtrF->Nd()<<std::endl; // 4d
+      //      std::cout << " Dim gauge field "<<GridPtrD->Nd()<<std::endl; // 4d
+
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Moving this to a Clone method of fermion operator would allow to duplicate the 
+      // physics parameters and decrease gauge field copies
+      ////////////////////////////////////////////////////////////////////////////////////
+      GaugeLinkFieldD Umu_d(GridPtrD);
+      // Copy each of the 2*Nd link components of FermOpD.Umu into FermOpF.Umu,
+      // converting precision one component at a time.
+      for(int mu=0;mu<Nd*2;mu++){ 
+	Umu_d = PeekIndex<LorentzIndex>(FermOpD.Umu, mu);
+	precisionChange(Umu_f,Umu_d);
+	PokeIndex<LorentzIndex>(FermOpF.Umu, Umu_f, mu);
+      }
+      // Rebuild the even/odd checkerboarded copies from the refreshed links
+      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
+      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
+
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Make a mixed precision conjugate gradient
+      ////////////////////////////////////////////////////////////////////////////////////
+#if 1
+      // Reliable-update variant; MaxInnerIterations*MaxOuterIterations is passed as its
+      // second constructor argument and delta as its third.
+      RealD delta=1.e-4;
+      std::cout << GridLogMessage << "Calling reliable update Conjugate Gradient" <<std::endl;
+      ConjugateGradientReliableUpdate<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations*MaxOuterIterations,delta,SinglePrecGrid5,LinOpF,LinOpD);
+#else      
+      std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
+      MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
+#endif
+      MPCG(src,psi);
+    }
+  };
+
+NAMESPACE_END(Grid);
+
+
+int main(int argc, char **argv) {
+  using namespace Grid;
+
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+
+   // Typedefs to simplify notation
+  typedef WilsonImplR FermionImplPolicy;
+  typedef WilsonImplF FermionImplPolicyF;
+
+  typedef MobiusFermionR FermionAction;
+  typedef MobiusFermionF FermionActionF;
+  typedef typename FermionAction::FermionField FermionField;
+  typedef typename FermionActionF::FermionField FermionFieldF;
+
+  typedef Grid::XmlReader       Serialiser;
+
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  IntegratorParameters MD;
+  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
+  //  MD.name    = std::string("Leap Frog");
+  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
+  //  MD.name    = std::string("Force Gradient");
+  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
+  MD.name    = std::string("MinimumNorm2");
+  MD.MDsteps =  6;
+  MD.trajL   = 1.0;
+
+  HMCparameters HMCparams;
+  HMCparams.StartTrajectory  = 1077;
+  HMCparams.Trajectories     = 1;
+  HMCparams.NoMetropolisUntil=  0;
+  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
+  //  HMCparams.StartingType     =std::string("ColdStart");
+  HMCparams.StartingType     =std::string("CheckpointStart");
+  HMCparams.MD = MD;
+  HMCWrapper TheHMC(HMCparams);
+
+  // Grid from the command line arguments --grid and --mpi
+  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
+
+  CheckpointerParameters CPparams;
+  CPparams.config_prefix = "ckpoint_DDHMC_lat";
+  CPparams.rng_prefix    = "ckpoint_DDHMC_rng";
+  CPparams.saveInterval  = 1;
+  CPparams.format        = "IEEE64BIG";
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  // Construct observables
+  // here there is too much indirection
+  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+  //////////////////////////////////////////////
+
+  const int Ls      = 12;
+  RealD M5  = 1.8;
+  RealD b   = 1.5;
+  RealD c   = 0.5;
+  Real beta         = 2.31;
+  //  Real light_mass   = 5.4e-4;
+  Real light_mass   = 7.8e-4;
+  Real strange_mass = 0.02132;
+  Real pv_mass      = 1.0;
+  //  std::vector<Real> hasenbusch({ light_mass, 3.8e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass });
+  std::vector<Real> hasenbusch({ light_mass, 5e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass });
+
+  // FIXME:
+  // Same in MC and MD
+  // Need to mix precision too
+  OneFlavourRationalParams SFRp; // Strange
+  SFRp.lo       = 4.0e-3;
+  SFRp.hi       = 90.0;
+  SFRp.MaxIter  = 60000;
+  SFRp.tolerance= 1.0e-8;
+  SFRp.mdtolerance= 1.0e-6;
+  SFRp.degree   = 12;
+  SFRp.precision= 50;
+  SFRp.BoundsCheckFreq=0;
+
+  OneFlavourRationalParams OFRp; // Up/down
+  OFRp.lo       = 2.0e-5;
+  OFRp.hi       = 90.0;
+  OFRp.MaxIter  = 60000;
+  OFRp.tolerance= 1.0e-8;
+  OFRp.mdtolerance= 1.0e-6;
+  //  OFRp.degree   = 20; converges
+  //  OFRp.degree   = 16;
+  OFRp.degree   = 12;
+  OFRp.precision= 80;
+  OFRp.BoundsCheckFreq=0;
+
+  auto GridPtr   = TheHMC.Resources.GetCartesian();
+  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
+
+  typedef SchurDiagMooeeOperator<FermionActionF,FermionFieldF> LinearOperatorF;
+  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
+  typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusFermionD,MobiusFermionF,LinearOperatorD,LinearOperatorF> MxPCG;
+
+  ////////////////////////////////////////////////////////////////
+  // Domain decomposed
+  ////////////////////////////////////////////////////////////////
+  Coordinate latt4  = GridPtr->GlobalDimensions();
+  Coordinate mpi    = GridPtr->ProcessorGrid();
+  Coordinate shm;
+
+  GlobalSharedMemory::GetShmDims(mpi,shm);
+  
+  Coordinate CommDim(Nd);
+  for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
+
+  Coordinate NonDirichlet(Nd+1,0);
+  Coordinate Dirichlet(Nd+1,0);
+  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
+  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
+  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
+  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
+
+  Coordinate Block4(Nd);
+  Block4[0] = Dirichlet[1];
+  Block4[1] = Dirichlet[2];
+  Block4[2] = Dirichlet[3];
+  Block4[3] = Dirichlet[4];
+
+  int Width=3;
+  TheHMC.Resources.SetMomentumFilter(new DDHMCFilter<WilsonImplR::Field>(Block4,Width));
+
+  //////////////////////////
+  // Fermion Grids
+  //////////////////////////
+  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
+  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
+
+  Coordinate simdF = GridDefaultSimd(Nd,vComplexF::Nsimd());
+  auto GridPtrF   = SpaceTimeGrid::makeFourDimGrid(latt4,simdF,mpi);
+  auto GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF);
+  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF);
+  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF);
+
+  IwasakiGaugeActionR GaugeAction(beta);
+
+  // temporarily need a gauge field
+  LatticeGaugeField U(GridPtr);
+  LatticeGaugeFieldF UF(GridPtrF);
+
+  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
+  TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file
+  TheHMC.initializeGaugeFieldAndRNGs(U);
+
+
+  // These lines are unnecessary if the boundary conditions are all periodic
+  std::vector<Complex> boundary = {1,1,1,-1};
+  FermionAction::ImplParams Params(boundary);
+  Params.dirichlet=NonDirichlet;
+  FermionAction::ImplParams ParamsDir(boundary);
+  ParamsDir.dirichlet=Dirichlet;
+
+  //  double StoppingCondition = 1e-14;
+  //  double MDStoppingCondition = 1e-9;
+  double StoppingCondition = 1e-10;
+  double MDStoppingCondition = 1e-7;
+  double MDStoppingConditionLoose = 1e-6;
+  double MaxCGIterations = 300000;
+  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
+  ConjugateGradient<FermionField>  MDCG(MDStoppingCondition,MaxCGIterations);
+
+  ////////////////////////////////////
+  // Collect actions
+  ////////////////////////////////////
+  ActionLevel<HMCWrapper::Field> Level1(1);
+  ActionLevel<HMCWrapper::Field> Level2(4);
+  ActionLevel<HMCWrapper::Field> Level3(8);
+
+  ////////////////////////////////////
+  // Strange action
+  ////////////////////////////////////
+  FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params);
+  FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass,  M5,b,c, Params);
+
+  FermionAction StrangeOpDir (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, ParamsDir);
+  FermionAction StrangePauliVillarsOpDir(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass,  M5,b,c, ParamsDir);
+  
+  OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionBdy(StrangeOpDir,StrangeOp,SFRp);
+  OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionLocal(StrangePauliVillarsOpDir,StrangeOpDir,SFRp);
+  OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionPVBdy(StrangePauliVillarsOp,StrangePauliVillarsOpDir,SFRp);
+  Level1.push_back(&StrangePseudoFermionBdy); // ok
+  Level2.push_back(&StrangePseudoFermionLocal);
+  Level1.push_back(&StrangePseudoFermionPVBdy); //ok
+
+  ////////////////////////////////////
+  // up down action
+  ////////////////////////////////////
+  std::vector<Real> light_den;
+  std::vector<Real> light_num;
+  std::vector<int> dirichlet_den;
+  std::vector<int> dirichlet_num;
+
+  int n_hasenbusch = hasenbusch.size();
+  light_den.push_back(light_mass);  dirichlet_den.push_back(0);
+  for(int h=0;h<n_hasenbusch;h++){
+    light_den.push_back(hasenbusch[h]); dirichlet_den.push_back(1);
+  }
+
+  for(int h=0;h<n_hasenbusch;h++){
+    light_num.push_back(hasenbusch[h]); dirichlet_num.push_back(1);
+  }
+  light_num.push_back(pv_mass);  dirichlet_num.push_back(0);
+
+  std::vector<FermionAction *> Numerators;
+  std::vector<FermionActionF *> NumeratorsF;
+  std::vector<FermionAction *> Denominators;
+  std::vector<FermionActionF *> DenominatorsF;
+  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
+
+#define MIXED_PRECISION
+#ifdef MIXED_PRECISION
+  std::vector<OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF> *> Bdys;
+#else
+  std::vector<OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
+#endif
+  std::vector<MxPCG *> ActionMPCG;
+  std::vector<MxPCG *> MPCG;
+
+  typedef SchurDiagMooeeOperator<FermionActionF,FermionFieldF> LinearOperatorF;
+  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
+  std::vector<LinearOperatorD *> LinOpD;
+  std::vector<LinearOperatorF *> LinOpF; 
+  
+  for(int h=0;h<n_hasenbusch+1;h++){
+    std::cout << GridLogMessage
+	      << " 2f quotient Action ";
+    std::cout << "det D("<<light_den[h]<<")";
+    if ( dirichlet_den[h] ) std::cout << "^dirichlet    ";
+    std::cout << "/ det D("<<light_num[h]<<")";
+    if ( dirichlet_num[h] ) std::cout << "^dirichlet    ";
+    std::cout << std::endl;
+
+    FermionAction::ImplParams ParamsNum(boundary);
+    FermionAction::ImplParams ParamsDen(boundary);
+    FermionActionF::ImplParams ParamsNumF(boundary);
+    FermionActionF::ImplParams ParamsDenF(boundary);
+    
+    if ( dirichlet_num[h]==1) ParamsNum.dirichlet = Dirichlet;
+    else                      ParamsNum.dirichlet = NonDirichlet;
+    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
+
+    if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
+    else                      ParamsDen.dirichlet = NonDirichlet;
+
+    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
+
+    ParamsDenF.dirichlet = ParamsDen.dirichlet;
+    DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF));
+
+    ParamsNumF.dirichlet = ParamsNum.dirichlet;
+    NumeratorsF.push_back  (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumF));
+
+    LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
+    LinOpF.push_back(new LinearOperatorF(*DenominatorsF[h]));
+
+    double conv  = MDStoppingCondition;
+    if (h<3) conv= MDStoppingConditionLoose; // Relax on first two hasenbusch factors
+    const int MX_inner = 5000;
+    MPCG.push_back(new MxPCG(conv,
+			     MX_inner,
+			     MaxCGIterations,
+			     GridPtrF,
+			     FrbGridF,
+			     *DenominatorsF[h],*Denominators[h],
+			     *LinOpF[h], *LinOpD[h]) );
+
+    ActionMPCG.push_back(new MxPCG(StoppingCondition,
+				   MX_inner,
+				   MaxCGIterations,
+				   GridPtrF,
+				   FrbGridF,
+				   *DenominatorsF[h],*Denominators[h],
+				   *LinOpF[h], *LinOpD[h]) );
+
+    
+    if(h!=0) {
+      //      Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],MDCG,CG));
+      Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],CG));
+    } else {
+#ifdef MIXED_PRECISION
+      Bdys.push_back( new OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF>(
+			   *Numerators[h],*Denominators[h],
+			   *NumeratorsF[h],*DenominatorsF[h],
+			   OFRp, 500) );
+      Bdys.push_back( new OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF>(
+			   *Numerators[h],*Denominators[h],
+			   *NumeratorsF[h],*DenominatorsF[h],
+			   OFRp, 500) );
+#else
+      Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
+      Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
+#endif
+    }
+  }
+
+  int nquo=Quotients.size();
+  Level1.push_back(Bdys[0]);
+  Level1.push_back(Bdys[1]);
+  for(int h=0;h<nquo-1;h++){
+    Level2.push_back(Quotients[h]);
+  }
+  Level2.push_back(Quotients[nquo-1]);
+
+  /////////////////////////////////////////////////////////////
+  // Gauge action
+  /////////////////////////////////////////////////////////////
+  Level3.push_back(&GaugeAction);
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+  TheHMC.TheAction.push_back(Level3);
+  std::cout << GridLogMessage << " Action complete "<< std::endl;
+  /////////////////////////////////////////////////////////////
+
+  TheHMC.Run();  // no smearing
+
+  Grid_finalize();
+} // main
+
+
+
--- a/benchmarks/Benchmark_ITT.cc
+++ b/benchmarks/Benchmark_ITT.cc
@ -420,7 +420,6 @@ public:
 	FGrid->Broadcast(0,&ncall,sizeof(ncall));

 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
-	Dw.ZeroCounters();

 	time_statistics timestat;
 	std::vector<double> t_time(ncall);
@ -589,7 +588,6 @@ public:
 	FGrid->Broadcast(0,&ncall,sizeof(ncall));

 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
-	Ds.ZeroCounters();

 	time_statistics timestat;
 	std::vector<double> t_time(ncall);
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@ -186,7 +186,6 @@ int main (int argc, char ** argv)

  if (1) {
    FGrid->Barrier();
-    Dw.ZeroCounters();
    Dw.Dhop(src,result,0);
    std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
    double t0=usecond();
@ -231,7 +230,6 @@ int main (int argc, char ** argv)
      exit(-1);
    }
    assert (norm2(err)< 1.0e-4 );
-    Dw.Report();
  }

  if (1)
@ -306,7 +304,6 @@ int main (int argc, char ** argv)
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
  {
-    Dw.ZeroCounters();
    FGrid->Barrier();
    Dw.DhopEO(src_o,r_e,DaggerNo);
    double t0=usecond();
@ -328,7 +325,6 @@ int main (int argc, char ** argv)
    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
    std::cout<<GridLogMessage << "Deo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
    std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NN<<std::endl;
-    Dw.Report();
  }
  Dw.DhopEO(src_o,r_e,DaggerNo);
  Dw.DhopOE(src_e,r_o,DaggerNo);
--- a/benchmarks/Benchmark_gparity.cc
+++ b/benchmarks/Benchmark_gparity.cc
@ -93,14 +93,11 @@ int main (int argc, char ** argv)
  int ncall =1000;
  if (1) {
    FGrid->Barrier();
-    Dw.ZeroCounters();
    Dw.Dhop(src,result,0);
    std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
    double t0=usecond();
    for(int i=0;i<ncall;i++){
-      __SSC_START;
      Dw.Dhop(src,result,0);
-      __SSC_STOP;
    }
    double t1=usecond();
    FGrid->Barrier();
@ -114,7 +111,6 @@ int main (int argc, char ** argv)
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl;
-    Dw.Report();
  }


@ -136,14 +132,11 @@ int main (int argc, char ** argv)
  GparityDomainWallFermionD DwD(Umu_d,*FGrid_d,*FrbGrid_d,*UGrid_d,*UrbGrid_d,mass,M5);
  if (1) {
    FGrid_d->Barrier();
-    DwD.ZeroCounters();
    DwD.Dhop(src_d,result_d,0);
    std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
    double t0=usecond();
    for(int i=0;i<ncall;i++){
-      __SSC_START;
      DwD.Dhop(src_d,result_d,0);
-      __SSC_STOP;
    }
    double t1=usecond();
    FGrid_d->Barrier();
@ -157,7 +150,6 @@ int main (int argc, char ** argv)
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl;
-    DwD.Report();
  }
 #endif
  Grid_finalize();
--- a/benchmarks/Benchmark_mooee.cc
+++ b/benchmarks/Benchmark_mooee.cc
@ -103,35 +103,30 @@ int main (int argc, char ** argv)
 #define BENCH_DW(A,...)			\
    Dw. A (__VA_ARGS__);				\
    FGrid->Barrier();				\
-    Dw.CayleyZeroCounters();      \
    t0=usecond();				\
    for(int i=0;i<ncall;i++){			\
      Dw. A (__VA_ARGS__);				\
    }						\
    t1=usecond();				\
    FGrid->Barrier();				\
-    Dw.CayleyReport();					\
    std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
    std::cout<<GridLogMessage << "******************"<<std::endl;

 #define BENCH_ZDW(A,in,out)			\
    zDw. A (in,out);				\
    FGrid->Barrier();				\
-    zDw.CayleyZeroCounters();      \
    t0=usecond();				\
    for(int i=0;i<ncall;i++){			\
      zDw. A (in,out);				\
    }						\
    t1=usecond();				\
    FGrid->Barrier();				\
-    zDw.CayleyReport();							\
    std::cout<<GridLogMessage << "Called ZDw " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
    std::cout<<GridLogMessage << "******************"<<std::endl;

 #define BENCH_DW_SSC(A,in,out)			\
    Dw. A (in,out);				\
    FGrid->Barrier();				\
-    Dw.CayleyZeroCounters();      \
    t0=usecond();				\
    for(int i=0;i<ncall;i++){			\
      __SSC_START ;				\
@ -140,7 +135,6 @@ int main (int argc, char ** argv)
    }						\
    t1=usecond();				\
    FGrid->Barrier();				\
-    Dw.CayleyReport();					\
    std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
    std::cout<<GridLogMessage << "******************"<<std::endl;

--- a/benchmarks/Benchmark_wilson.cc
+++ b/benchmarks/Benchmark_wilson.cc
@ -155,7 +155,6 @@ int main (int argc, char ** argv)
  //int ncall=1;

  // Counters
-  Dw.ZeroCounters();
  Grid.Barrier();

  double t0=usecond();
@ -201,7 +200,6 @@ int main (int argc, char ** argv)
  err = ref-result;
  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;

-  Dw.Report();
  
  // guard
  double err0 = norm2(err);
--- a/configure.ac
+++ b/configure.ac
@ -128,6 +128,26 @@ case ${ac_LAPACK} in
        AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
 esac

+############### tracing
+# --enable-tracing selects which tracing backend is compiled in.
+# The value is stored in ac_TRACING and defaults to "none" when the option is not given.
+#   nvtx  : defines GRID_TRACING_NVTX  and appends -lnvToolsExt64_1 to LIBS
+#   roctx : defines GRID_TRACING_ROCTX and appends -lroctx64 to LIBS
+#   timer : defines GRID_TRACING_TIMER, no extra library
+# Any other value, including a misspelt one, falls through to the default arm
+# and defines GRID_TRACING_NONE without a diagnostic.
+AC_ARG_ENABLE([tracing],
+    [AC_HELP_STRING([--enable-tracing=none|nvtx|roctx|timer], [enable tracing])],
+    [ac_TRACING=${enable_tracing}], [ac_TRACING=none])
+
+case ${ac_TRACING} in
+    nvtx)
+        AC_DEFINE([GRID_TRACING_NVTX],[1],[use NVTX])
+	LIBS="${LIBS} -lnvToolsExt64_1"
+	;;
+    roctx)
+        AC_DEFINE([GRID_TRACING_ROCTX],[1],[use ROCTX])
+	LIBS="${LIBS} -lroctx64"
+	;;
+    timer)
+        AC_DEFINE([GRID_TRACING_TIMER],[1],[use TIMER]);;
+    *)
+	AC_DEFINE([GRID_TRACING_NONE],[1],[no tracing]);;
+esac
+
 ############### fermions
 AC_ARG_ENABLE([fermion-reps],
     [AC_HELP_STRING([--enable-fermion-reps=yes|no], [enable extra fermion representation support])],
--- a/examples/Example_christoph.cc
+++ b/examples/Example_christoph.cc
@ -0,0 +1,436 @@
+/*
+ * Warning: This code illustrative only: not well tested, and not meant for production use
+ * without regression / tests being applied
+ */
+
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+
+// Global multiplicative factors applied to correlators before they are printed or saved.
+// LLscale multiplies the meson correlators in MesonTrace.
+// LCscale multiplies the PA, PP and VV correlators printed by Solve and the
+// VV correlator printed by MasslessFreePropagator.
+// main() assigns 1.0 to both after a configuration has been loaded.
+RealD LLscale =1.0;
+RealD LCscale =1.0;
+
+/*
+ * Gauge-covariant Laplacian assembled from covariant shifts.
+ *
+ * M() accumulates, for every direction mu = 0 .. Nd-2 (the last direction is
+ * left out),   2*in - CovShiftForward(U_mu,mu,in) - CovShiftBackward(U_mu,mu,in).
+ *
+ * The gauge field is copied at construction time, so later changes to the
+ * caller's field are not seen by this operator.
+ */
+template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
+{
+public:
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+  GridBase *grid; // grid the gauge field lives on, returned by Grid()
+  GaugeField U;   // private copy of the gauge field
+
+  // The argument was previously spelt _U; an underscore followed by a capital
+  // letter is reserved for the implementation, so it is renamed here.
+  CovariantLaplacianCshift(GaugeField &gauge)    :
+    grid(gauge.Grid()),
+    U(gauge) {  }
+
+  virtual GridBase *Grid(void) { return grid; }
+
+  // out = sum over mu < Nd-1 of ( 2*in - forward shift - backward shift )
+  virtual void  M    (const Field &in, Field &out)
+  {
+    out=Zero();
+    for(int mu=0;mu<Nd-1;mu++) {
+      GaugeLinkField Umu = PeekIndex<LorentzIndex>(U, mu); // NB: Inefficient, the link is extracted again on every call
+      out = out - Gimpl::CovShiftForward(Umu,mu,in);
+      out = out - Gimpl::CovShiftBackward(Umu,mu,in);
+      out = out + 2.0*in;
+    }
+  }
+  virtual void  Mdag (const Field &in, Field &out) { M(in,out);} // Laplacian is hermitian
+  virtual  void Mdiag    (const Field &in, Field &out)                  {assert(0);} // Unimplemented need only for multigrid
+  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp){assert(0);} // Unimplemented need only for multigrid
+  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)     {assert(0);} // Unimplemented need only for multigrid
+};
+
+// Fill phase with exp( i * sum_mu (2 pi / L_mu) * mom[mu] * x_mu ), where L_mu
+// are the global dimensions of the grid that phase lives on.
+void MakePhase(Coordinate mom,LatticeComplex &phase)
+{
+  GridBase *lattice = phase.Grid();
+  auto dims = lattice->GlobalDimensions();
+  ComplexD imag_unit(0.0,1.0);
+
+  LatticeComplex x_mu(lattice);
+
+  // Accumulate the (real) exponent first, exponentiate once at the end
+  phase=Zero();
+  for(int dir=0; dir<Nd; ++dir){
+    LatticeCoordinate(x_mu,dir);
+    RealD two_pi_over_L =  M_PI * 2.0/ dims[dir];
+    phase = phase + (two_pi_over_L * mom[dir]) * x_mu;
+  }
+  phase = exp(phase*imag_unit);
+}
+
+// Zero the propagator everywhere, then poke a unit spin-colour matrix
+// into the single site given by coor.
+void PointSource(Coordinate &coor,LatticePropagator &source)
+{
+  SpinColourMatrix unit;
+  unit=1.0;
+
+  source=Zero();
+  pokeSite(unit,source,coor);
+}
+// Build a noise source that is non-zero only on time slice tslice.
+// The bernoulli draw is mapped through (2*noise - (1,1))/sqrt(2), masked to the
+// requested slice, and multiplied onto a unit propagator. The norm is printed.
+void Z2WallSource(GridParallelRNG &RNG,int tslice,LatticePropagator &source)
+{
+  GridBase *lattice = source.Grid();
+
+  LatticeComplex noise(lattice);
+  LatticeComplex nothing(lattice);
+  nothing=Zero();
+  LatticeInteger tcoor(lattice);
+
+  RealD scale=1.0/sqrt(2);
+
+  bernoulli(RNG, noise); // 0,1 50:50
+  noise = (2.*noise - Complex(1,1))*scale;
+
+  // Keep the noise on the chosen time slice only
+  LatticeCoordinate(tcoor,Tdir);
+  noise = where(tcoor==Integer(tslice), noise, nothing);
+
+  source = 1.0;
+  source = source*noise;
+  std::cout << " Z2 wall " << norm2(source) << std::endl;
+}
+// Smear a field by repeatedly applying  smeared <- smeared - coeff * Laplacian(smeared)
+// with coeff = width^2 / (4 * Iterations); width (2.0) and Iterations (40) are fixed here.
+// The norm of the running result is printed after every step.
+template<class Field>
+void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
+{
+  using SmearingOperator = CovariantLaplacianCshift <PeriodicGimplR,Field>;
+  SmearingOperator Lap(U);
+
+  Integer Iterations = 40;
+  Real width = 2.0;
+  Real step  = (width*width) / Real(4*Iterations);
+
+  Field lap_of_field(U.Grid());
+
+  smeared=unsmeared;
+  int iter = 0;
+  while ( iter < Iterations ) {
+    Lap.M(smeared,lap_of_field);
+    smeared = smeared - step*lap_of_field;
+    std::cout << " smear iter " << iter<<" " <<norm2(smeared)<<std::endl;
+    ++iter;
+  }
+}
+// Point source at site, smeared with GaussianSmear. The norm is printed
+// before and after smearing.
+void GaussianSource(Coordinate &site,LatticeGaugeField &U,LatticePropagator &source)
+{
+  PointSource(site,source);
+  std::cout << " GaussianSource Kronecker "<< norm2(source)<<std::endl;
+
+  // GaussianSmear needs distinct input and output fields, so smear from a copy
+  LatticePropagator delta(source.Grid());
+  delta = source;
+  GaussianSmear(U,delta,source);
+  std::cout << " GaussianSource Smeared "<< norm2(source)<<std::endl;
+}
+// Wall noise source on time slice tslice (see Z2WallSource), smeared with GaussianSmear.
+void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,LatticePropagator &source)
+{
+  Z2WallSource(RNG,tslice,source);
+
+  // Smear from a copy so that input and output are distinct fields
+  LatticePropagator wall = source;
+  GaussianSmear(U,wall,source);
+}
+// Sequential source: the spectator propagator restricted to time slice tslice
+// (zero elsewhere), multiplied by the momentum phase from MakePhase.
+// mom must have Nd entries and a zero time component.
+void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
+{
+  assert(mom.size()==Nd);
+  assert(mom[Tdir] == 0);
+
+  GridBase * lattice = spectator.Grid();
+
+  LatticeInteger tcoor(lattice);
+  LatticeCoordinate(tcoor,Tdir);
+
+  // Start from zero so that the "else" branch of where() supplies zeros
+  source = Zero();
+  source = where(tcoor==Integer(tslice),spectator,source);
+
+  LatticeComplex momentum_phase(lattice);
+  MakePhase(mom,momentum_phase);
+
+  source = source *momentum_phase;
+}
+
+/*
+ * Build a propagator from D.FreePropagator and print the vector-current correlator.
+ *
+ * For every spin-colour column of source the 4d column is imported to 5d,
+ * passed through D.FreePropagator with mass D.Mass(), and exported back to 4d.
+ * The 4d solutions are collected in propagator. The 5d solutions are kept
+ * locally and fed to D.ContractConservedCurrent for the three spatial
+ * directions; the time-sliced result is printed next to 1/(2 pi^2 t^3).
+ */
+template<class Action>
+void MasslessFreePropagator(Action &D,LatticePropagator &source,LatticePropagator &propagator)
+{
+  GridBase *UGrid = source.Grid();
+  GridBase *FGrid = D.FermionGrid();
+  const bool fiveD = true; //calculate 5d free propagator; same value the call below used to pass literally
+  RealD mass = D.Mass();
+  LatticeFermion src4  (UGrid);
+  LatticeFermion result4  (UGrid);
+  LatticeFermion result5(FGrid);
+  LatticeFermion src5(FGrid);
+  LatticePropagator prop5(FGrid);
+  // s is a spin index, so the bound is Ns rather than the number of dimensions Nd
+  for(int s=0;s<Ns;s++){
+    for(int c=0;c<Nc;c++){
+
+      PropToFerm<Action>(src4,source,s,c);
+
+      D.ImportPhysicalFermionSource(src4,src5);
+      D.FreePropagator(src5,result5,mass,fiveD);
+      std::cout<<GridLogMessage
+               <<"Free 5D prop spin "<<s<<" color "<<c
+               <<" norm2(src5d) "   <<norm2(src5)
+               <<" norm2(result5d) "<<norm2(result5)<<std::endl;
+
+      D.ExportPhysicalFermionSolution(result5,result4);
+
+      FermToProp<Action>(prop5,result5,s,c);
+      FermToProp<Action>(propagator,result4,s,c);
+    }
+  }
+
+  LatticePropagator Vector_mu(UGrid);
+  LatticeComplex    VV (UGrid);
+  std::vector<TComplex> sumVV;
+  Gamma::Algebra GammaV[3] = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ
+  };
+  for( int mu=0;mu<3;mu++ ) {
+    Gamma gV(GammaV[mu]);
+    D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu);
+    VV       = trace(gV*Vector_mu);     // (local) Vector-Vector conserved current
+    sliceSum(VV,sumVV,Tdir);
+    int Nt = sumVV.size();
+    for(int t=0;t<Nt;t++){
+      RealD Ct = real(TensorRemove(sumVV[t]))*LCscale;
+      RealD TwoPiSqT3 = 2 * M_PI *M_PI * t*t*t;
+      RealD Cont=0;
+      if(t) Cont=1.0/TwoPiSqT3;
+      // Print the product rather than Ct/Cont: Cont is zero at t=0 and the
+      // quotient would be a division by zero there.
+      std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct
+               << " 2 pi^2 t^3 C(t) "<< TwoPiSqT3*Ct << " delta Ct "<< Ct-Cont <<std::endl;
+    }
+  }
+}
+/*
+ * 4d variant of MasslessFreePropagator: every spin-colour column of source is
+ * passed directly through D.FreePropagator with mass D.Mass() and stored in
+ * propagator. No 5d import/export and no contractions are done here.
+ */
+template<class Action>
+void MasslessFreePropagator1(Action &D,LatticePropagator &source,LatticePropagator &propagator)
+{
+  const bool fiveD = false; //calculate 4d free propagator; same value the call below used to pass literally
+  RealD mass = D.Mass();
+  GridBase *UGrid = source.Grid();
+  LatticeFermion src4  (UGrid);
+  LatticeFermion result4  (UGrid);
+  // s is a spin index, so the bound is Ns rather than the number of dimensions Nd
+  for(int s=0;s<Ns;s++){
+    for(int c=0;c<Nc;c++){
+      PropToFerm<Action>(src4,source,s,c);
+      D.FreePropagator(src4,result4,mass,fiveD);
+      FermToProp<Action>(propagator,result4,s,c);
+    }
+  }
+}
+
+/*
+ * Invert D on every spin-colour column of a 4d source and print correlators.
+ *
+ * Each column is imported to 5d, solved with a red-black Schur conjugate
+ * gradient (tolerance 1e-7, at most 100000 iterations, zero initial guess) and
+ * exported back to 4d. The 4d solutions are collected in propagator.
+ * The 5d solutions are kept locally and used for the conserved axial and
+ * vector current contractions and for ContractJ5q. The following
+ * time-sliced quantities are printed:
+ *   PAc  : trace(g5 * axial current in Tdir)        (scaled by LCscale)
+ *   PP   : trace(adj(propagator) * propagator)      (scaled by LCscale)
+ *   PJ5q : D.ContractJ5q                            (not scaled)
+ *   VVc  : trace(gamma_mu * vector current), mu=x,y,z (scaled by LCscale),
+ *          printed next to 1/(2 pi^2 t^3)
+ */
+template<class Action>
+void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
+{
+  GridBase *UGrid = D.GaugeGrid();
+  GridBase *FGrid = D.FermionGrid();
+
+  LatticeFermion src4  (UGrid);
+  LatticeFermion src5  (FGrid);
+  LatticeFermion result5(FGrid);
+  LatticeFermion result4(UGrid);
+  LatticePropagator prop5(FGrid);
+
+  ConjugateGradient<LatticeFermion> CG(1.0e-7,100000);
+  SchurRedBlackDiagMooeeSolve<LatticeFermion> schur(CG);
+  ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
+  // s is a spin index, so the bound is Ns rather than the number of dimensions Nd
+  for(int s=0;s<Ns;s++){
+    for(int c=0;c<Nc;c++){
+      PropToFerm<Action>(src4,source,s,c);
+
+      D.ImportPhysicalFermionSource(src4,src5);
+
+      result5=Zero();
+      schur(D,src5,result5,ZG);
+      std::cout<<GridLogMessage
+	       <<"spin "<<s<<" color "<<c
+	       <<" norm2(src5d) "   <<norm2(src5)
+               <<" norm2(result5d) "<<norm2(result5)<<std::endl;
+
+      D.ExportPhysicalFermionSolution(result5,result4);
+
+      FermToProp<Action>(prop5,result5,s,c);
+      FermToProp<Action>(propagator,result4,s,c);
+    }
+  }
+  LatticePropagator Axial_mu(UGrid);
+  LatticePropagator Vector_mu(UGrid);
+
+  LatticeComplex    PA (UGrid);
+  LatticeComplex    VV (UGrid);
+  LatticeComplex    PJ5q(UGrid);
+  LatticeComplex    PP (UGrid);
+
+  std::vector<TComplex> sumPA;
+  std::vector<TComplex> sumVV;
+  std::vector<TComplex> sumPP;
+  std::vector<TComplex> sumPJ5q;
+
+  Gamma g5(Gamma::Algebra::Gamma5);
+  D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir);
+  PA       = trace(g5*Axial_mu);      // Pseudoscalar-Axial conserved current
+  sliceSum(PA,sumPA,Tdir);
+
+  // Number of time slices; reused for all the slice sums below
+  int Nt{static_cast<int>(sumPA.size())};
+
+  for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PAc["<<t<<"] "<<real(TensorRemove(sumPA[t]))*LCscale<<std::endl;
+
+  PP       = trace(adj(propagator)*propagator); // Pseudoscalar density
+  sliceSum(PP,sumPP,Tdir);
+  for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PP["<<t<<"] "<<real(TensorRemove(sumPP[t]))*LCscale<<std::endl;
+
+  D.ContractJ5q(prop5,PJ5q);
+  sliceSum(PJ5q,sumPJ5q,Tdir);
+  for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PJ5q["<<t<<"] "<<real(TensorRemove(sumPJ5q[t]))<<std::endl;
+
+  Gamma::Algebra GammaV[3] = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ
+  };
+  for( int mu=0;mu<3;mu++ ) {
+    Gamma gV(GammaV[mu]);
+    D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu);
+    //    auto ss=sliceSum(Vector_mu,Tdir);
+    //    for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"ss["<<mu<<"]["<<t<<"] "<<ss[t]<<std::endl;
+    VV       = trace(gV*Vector_mu);     // (local) Vector-Vector conserved current
+    sliceSum(VV,sumVV,Tdir);
+    for(int t=0;t<Nt;t++){
+      RealD Ct = real(TensorRemove(sumVV[t]))*LCscale;
+      RealD TwoPiSqT3 = 2 * M_PI *M_PI * t*t*t;
+      RealD Cont=0;
+      if(t) Cont=1.0/TwoPiSqT3;
+      // Print the product rather than Ct/Cont: Cont is zero at t=0 and the
+      // quotient would be a division by zero there.
+      std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct
+               << " 2 pi^2 t^3 C(t) "<< TwoPiSqT3*Ct << " delta Ct "<< Ct-Cont <<std::endl;
+    }
+  }
+
+}
+
+// Serialisable container for the meson correlators written by MesonTrace.
+// data holds one std::vector<Complex> per channel, in the order MesonTrace
+// pushes them; each inner vector has one entry per time slice.
+class MesonFile: Serializable {
+public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector<std::vector<Complex> >, data);
+};
+
+// Contract two propagators into four meson channels (gamma_x g5, gamma_y g5,
+// gamma_z g5 and identity, same matrix at source and sink), print each
+// time-sliced correlator next to 1/(2 pi^2 t^3), and write all channels to
+// an XML file under the tag "MesonFile".
+// The phase argument is accepted but not used by the body.
+void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
+{
+  const int nchannel=4;
+  // [channel][0] is the source matrix, [channel][1] the sink matrix
+  Gamma::Algebra Gammas[nchannel][2] = {
+    {Gamma::Algebra::GammaXGamma5,Gamma::Algebra::GammaXGamma5},
+    {Gamma::Algebra::GammaYGamma5,Gamma::Algebra::GammaYGamma5},
+    {Gamma::Algebra::GammaZGamma5,Gamma::Algebra::GammaZGamma5},
+    {Gamma::Algebra::Identity,Gamma::Algebra::Identity}
+  };
+
+  LatticeComplex correlator(q1.Grid());
+  MesonFile output;
+
+  for(int channel=0; channel<nchannel; ++channel){
+
+    Gamma SourceGamma(Gammas[channel][0]);
+    Gamma SinkGamma  (Gammas[channel][1]);
+
+    correlator = trace(adj(q1)*SinkGamma*q2*adj(SourceGamma));
+
+    std::vector<TComplex> sliced;
+    sliceSum(correlator,sliced, Tdir);
+
+    int nt=sliced.size();
+
+    std::vector<Complex> corr(nt);
+    for(int t=0; t<nt; ++t){
+      corr[t] = TensorRemove(sliced[t])*LLscale; // Yes this is ugly, not figured a work around
+      RealD Ct = real(corr[t]);
+      // Comparison value; left at zero on the t=0 slice
+      RealD Cont=0;
+      if(t) Cont=1.0/(2 * M_PI *M_PI * t*t*t);
+      std::cout << " channel "<<channel<<" t "<<t<<" " <<real(corr[t])<< " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * t*t*t * Ct
+		<< " deltaC " <<Ct-Cont<<std::endl;
+    }
+    output.data.push_back(corr);
+  }
+
+  // Scoped so that the writer is destroyed before the function returns
+  {
+    XmlWriter WR(file);
+    write(WR,"MesonFile",output);
+  }
+}
+
+/*
+ * Example driver: load a NERSC gauge configuration, solve a Mobius fermion
+ * point-source propagator for each mass, and write the meson correlators.
+ *
+ * Run parameters are read from the environment:
+ *   Ls, M5, mass, point_x, point_y, point_z, point_t
+ * The configuration file name is the first command line argument and must
+ * not start with '-'.
+ */
+int main (int argc, char ** argv)
+{
+
+  Grid_init(&argc,&argv);
+
+  // getenv returns a null pointer for an unset variable, which atoi/atof must
+  // not be given. Stop with a message naming the missing variable instead.
+  auto requireEnv = [](const char *name) -> const char * {
+    const char *value = getenv(name);
+    if ( value == nullptr ) {
+      std::cerr << "Environment variable " << name << " must be set" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    return value;
+  };
+
+  int Ls= atoi(requireEnv("Ls"));
+
+  // Double precision grids
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
+								   GridDefaultSimd(Nd,vComplex::Nsimd()),
+								   GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  //////////////////////////////////////////////////////////////////////
+  // You can manage seeds however you like.
+  // Recommend SeedUniqueString.
+  //////////////////////////////////////////////////////////////////////
+  //  std::vector<int> seeds4({1,2,3,4});
+  //  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+
+  LatticeGaugeField Umu(UGrid);
+  std::string config;
+  RealD M5=atof(requireEnv("M5"));
+  RealD mq = atof(requireEnv("mass"));
+  int   point_x = atoi(requireEnv("point_x"));
+  int   point_y = atoi(requireEnv("point_y"));
+  int   point_z = atoi(requireEnv("point_z"));
+  int   point_t = atoi(requireEnv("point_t"));
+  std::vector<RealD> masses({ mq} ); // u/d, s, c ??
+  if( argc > 1 && argv[1][0] != '-' )
+  {
+    std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl;
+    FieldMetaData header;
+    NerscIO::readConfiguration(Umu, header, argv[1]);
+    config=argv[1];
+    LLscale = 1.0;
+    LCscale = 1.0;
+  } else {
+    printf("Expected a configuration");
+    exit(EXIT_FAILURE); // a missing configuration is an error, report it through the exit status
+  }
+
+  int nmass = masses.size();
+
+  typedef MobiusFermionR FermionActionR;
+  std::vector<FermionActionR *> FermActs;
+
+  std::cout<<GridLogMessage <<"======================"<<std::endl;
+  std::cout<<GridLogMessage <<"DomainWallFermion action"<<std::endl;
+  std::cout<<GridLogMessage <<"======================"<<std::endl;
+
+  for(auto mass: masses) {
+    std::vector<Complex> boundary = {1,1,1,-1};
+    // NOTE(review): Params is built from boundary but is not handed to the
+    // constructor below, so the action is created without it. Confirm whether
+    // passing Params was intended.
+    FermionActionR::ImplParams Params(boundary);
+    RealD b=1.5;
+    RealD c=0.5;
+    FermActs.push_back(new FermionActionR(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c));
+  }
+
+  LatticePropagator point_source(UGrid);
+
+  Coordinate Origin({point_x,point_y,point_z,point_t});
+  PointSource   (Origin,point_source);
+
+  std::vector<LatticePropagator> PointProps(nmass,UGrid);
+
+  for(int m=0;m<nmass;m++) {
+    Solve(*FermActs[m],point_source   ,PointProps[m]);
+  }
+
+  LatticeComplex phase(UGrid);
+  Coordinate mom({0,0,0,0});
+  MakePhase(mom,phase);
+
+  // One output file per unordered pair of masses
+  for(int m1=0 ;m1<nmass;m1++) {
+  for(int m2=m1;m2<nmass;m2++) {
+    std::stringstream ssp;
+
+    ssp<<config<< "_m" << m1 << "_m"<< m2 << "_point_meson.xml";
+
+    std::cout << "CG determined VV correlation function"<<std::endl;
+    MesonTrace(ssp.str(),PointProps[m1],PointProps[m2],phase);
+
+  }}
+
+  Grid_finalize();
+}
+
+
+
--- a/examples/Example_taku1.cc
+++ b/examples/Example_taku1.cc
@ -0,0 +1,479 @@
+/*
+ * Warning: This code is illustrative only: it is not well tested, and is not meant for production use
+ * without regression tests being applied
+ */
+
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+
+// Global normalisation factors; both are assigned in main() before any correlator is printed.
+// LLscale multiplies the meson correlators computed in MesonTrace().
+RealD LLscale =1.0;
+// LCscale multiplies the PAc, PP and VVc values printed by Solve() and MasslessFreePropagator().
+RealD LCscale =1.0;
+
+// Gauge-covariant Laplacian built from covariant shifts, exposed through the
+// SparseMatrixBase interface. Only directions 0 .. Nd-2 enter the sum.
+template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
+{
+public:
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+  GridBase *grid; // grid of the gauge field passed to the constructor
+  GaugeField U;   // copy of that gauge field
+
+  // Copies the gauge field and records the grid it lives on.
+  CovariantLaplacianCshift(GaugeField &_U) : grid(_U.Grid()), U(_U) {}
+
+  virtual GridBase *Grid(void) { return grid; }
+
+  // out = sum_{dir < Nd-1} ( 2*in - CovShiftForward(U_dir,dir,in) - CovShiftBackward(U_dir,dir,in) )
+  virtual void M(const Field &in, Field &out)
+  {
+    const int ndir = Nd-1; // the last direction is left out of the sum
+    out=Zero();
+    for(int dir=0;dir<ndir;dir++) {
+      // NB: inefficient, the link field is re-extracted on every application
+      GaugeLinkField link = PeekIndex<LorentzIndex>(U, dir);
+      out = out - Gimpl::CovShiftForward(link,dir,in);
+      out = out - Gimpl::CovShiftBackward(link,dir,in);
+      out = out + 2.0*in;
+    }
+  }
+
+  // Laplacian is hermitian, so the adjoint simply forwards to M
+  virtual void Mdag(const Field &in, Field &out) { M(in,out); }
+
+  // The remaining interface members are unimplemented (needed only for multigrid)
+  virtual void Mdiag  (const Field &in, Field &out)                  { assert(0); }
+  virtual void Mdir   (const Field &in, Field &out,int dir, int disp){ assert(0); }
+  virtual void MdirAll(const Field &in, std::vector<Field> &out)     { assert(0); }
+};
+
+// Fills `phase` with exp( i * sum_mu (2*pi*mom[mu]/L_mu) * x_mu ), where L_mu are the
+// global dimensions of the grid `phase` lives on and x_mu is the lattice coordinate.
+void MakePhase(Coordinate mom,LatticeComplex &phase)
+{
+  GridBase *grid = phase.Grid();
+  auto dims = grid->GlobalDimensions();
+  ComplexD imag_unit(0.0,1.0);
+
+  LatticeComplex x_mu(grid);
+  phase=Zero();
+  for(int mu=0;mu<Nd;mu++){
+    LatticeCoordinate(x_mu,mu);
+    RealD twoPiOverL =  M_PI * 2.0/ dims[mu];
+    phase = phase + (twoPiOverL * mom[mu]) * x_mu;
+  }
+  // The accumulated real exponent is turned into the complex phase in one step
+  phase = exp(phase*imag_unit);
+}
+
+// Zeroes `source` and then pokes a spin-colour matrix, assigned from the
+// scalar 1.0, into the single site `coor`.
+void PointSource(Coordinate &coor,LatticePropagator &source)
+{
+  SpinColourMatrix unit;
+  unit=1.0;
+
+  source=Zero();
+  pokeSite(unit,source,coor);
+}
+// Builds a noise wall source on timeslice `tslice`: bernoulli noise is rescaled as
+// (2*noise - (1+i))/sqrt(2), zeroed away from the timeslice, and multiplied into a
+// propagator that was assigned from the scalar 1.0. Prints norm2 of the result.
+void Z2WallSource(GridParallelRNG &RNG,int tslice,LatticePropagator &source)
+{
+  GridBase *grid = source.Grid();
+
+  LatticeComplex z2(grid);
+  bernoulli(RNG, z2); // 0,1 50:50
+
+  RealD inv_sqrt2=1.0/sqrt(2);
+  z2 = (2.*z2 - Complex(1,1))*inv_sqrt2;
+
+  // Keep the noise on the requested timeslice only
+  LatticeInteger tcoor(grid);
+  LatticeCoordinate(tcoor,Tdir);
+  LatticeComplex off_slice(grid); off_slice=Zero();
+  z2 = where(tcoor==Integer(tslice), z2, off_slice);
+
+  source = 1.0;
+  source = source*z2;
+  std::cout << " Z2 wall " << norm2(source) << std::endl;
+}
+// Iteratively smears `unsmeared` into `smeared` with the covariant Laplacian built on U:
+//   smeared <- smeared - coeff * Laplacian(smeared),  coeff = width^2 / (4*Iterations),
+// repeated `Iterations` times, printing norm2(smeared) after every step.
+//
+// Iterations and width used to be hard-coded to 40 and 2.0; they are now trailing
+// parameters with those same defaults, so existing three-argument calls are unchanged.
+template<class Field>
+void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared,
+                   Integer Iterations = 40,Real width = 2.0)
+{
+  typedef CovariantLaplacianCshift <PeriodicGimplR,Field> Laplacian_t;
+  Laplacian_t Laplacian(U);
+
+  assert(Iterations > 0); // coeff divides by 4*Iterations
+  Real coeff = (width*width) / Real(4*Iterations);
+
+  Field tmp(U.Grid());
+  smeared=unsmeared;
+  //  chi = (1-p^2/2N)^N kronecker
+  for(int n = 0; n < Iterations; ++n) {
+    Laplacian.M(smeared,tmp);
+    smeared = smeared - coeff*tmp;
+    std::cout << " smear iter " << n<<" " <<norm2(smeared)<<std::endl;
+  }
+}
+// Point source at `site` followed by Gaussian smearing with gauge field U;
+// norm2 of the source is printed before and after smearing.
+void GaussianSource(Coordinate &site,LatticeGaugeField &U,LatticePropagator &source)
+{
+  PointSource(site,source);
+  std::cout << " GaussianSource Kronecker "<< norm2(source)<<std::endl;
+
+  // Smearing reads from a copy because `source` is also the output
+  LatticePropagator kronecker(source.Grid());
+  kronecker = source;
+  GaussianSmear(U,kronecker,source);
+  std::cout << " GaussianSource Smeared "<< norm2(source)<<std::endl;
+}
+// Z2 wall source on timeslice `tslice`, then Gaussian smearing with gauge field U.
+void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,LatticePropagator &source)
+{
+  Z2WallSource(RNG,tslice,source);
+
+  // Smearing reads from a copy because `source` is also the output
+  LatticePropagator wall(source);
+  GaussianSmear(U,wall,source);
+}
+// Builds a sequential source: `spectator` restricted to timeslice `tslice` (zero on all
+// other timeslices), multiplied by the phase MakePhase() produces for `mom`.
+// `mom` must have Nd entries and a zero entry in the time direction.
+void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
+{
+  assert(mom.size()==Nd);
+  assert(mom[Tdir] == 0);
+
+  GridBase * grid = spectator.Grid();
+
+  LatticeComplex mom_phase(grid);
+  MakePhase(mom,mom_phase);
+
+  // Stick in a slice of the spectator, zero everywhere else
+  LatticeInteger tcoor(grid);
+  LatticeCoordinate(tcoor,Tdir);
+  source = Zero();
+  source = where(tcoor==Integer(tslice),spectator,source);
+
+  source = source *mom_phase;
+}
+
+// Computes a free propagator through the action's five-dimensional FreePropagator()
+// for every spin/colour component of `source`, stores the exported four-dimensional
+// result in `propagator`, and prints the spatial conserved vector-current correlators.
+//
+//  D          : fermion action; supplies FermionGrid(), Mass(), FreePropagator(),
+//               Import/ExportPhysicalFermion*() and ContractConservedCurrent()
+//  source     : four-dimensional source propagator
+//  propagator : output, four-dimensional propagator
+//
+// Each VVc value is printed next to 2 pi^2 t^3 C(t) and to its difference from
+// 1/(2 pi^2 t^3) (presumably the continuum massless form -- confirm).
+template<class Action>
+void MasslessFreePropagator(Action &D,LatticePropagator &source,LatticePropagator &propagator)
+{
+  GridBase *UGrid = source.Grid();
+  GridBase *FGrid = D.FermionGrid();
+  RealD mass = D.Mass();
+  LatticeFermion src4  (UGrid);
+  LatticeFermion result4  (UGrid);
+  LatticeFermion result5(FGrid);
+  LatticeFermion src5(FGrid);
+  LatticePropagator prop5(FGrid);
+  // s is a spin index, so it runs over Ns (the original bound was Nd)
+  for(int s=0;s<Ns;s++){
+    for(int c=0;c<Nc;c++){
+
+      PropToFerm<Action>(src4,source,s,c);
+
+      D.ImportPhysicalFermionSource(src4,src5);
+      D.FreePropagator(src5,result5,mass,true);
+      std::cout<<GridLogMessage
+               <<"Free 5D prop spin "<<s<<" color "<<c
+               <<" norm2(src5d) "   <<norm2(src5)
+               <<" norm2(result5d) "<<norm2(result5)<<std::endl;
+
+      D.ExportPhysicalFermionSolution(result5,result4);
+
+      FermToProp<Action>(prop5,result5,s,c);
+      FermToProp<Action>(propagator,result4,s,c);
+    }
+  }
+
+  LatticePropagator Vector_mu(UGrid);
+  LatticeComplex    VV (UGrid);
+  std::vector<TComplex> sumVV;
+  Gamma::Algebra GammaV[3] = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ
+  };
+  for( int mu=0;mu<3;mu++ ) {
+    Gamma gV(GammaV[mu]);
+    D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu);
+    VV       = trace(gV*Vector_mu);     // (local) Vector-Vector conserved current
+    sliceSum(VV,sumVV,Tdir);
+    int Nt = sumVV.size();
+    for(int t=0;t<Nt;t++){
+      RealD Ct = real(TensorRemove(sumVV[t]))*LCscale;
+      RealD t3 = 2 * M_PI *M_PI * t*t*t;
+      RealD Cont=0;
+      if(t) Cont=1.0/t3;
+      // Multiply by 2 pi^2 t^3 rather than divide by Cont: Cont is 0 at t=0,
+      // and the product is also what MesonTrace() prints under the same label.
+      std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct
+               << " 2 pi^2 t^3 C(t) "<< t3*Ct << " delta Ct "<< Ct-Cont <<std::endl;
+    }
+  }
+}
+// Four-dimensional variant: applies D.FreePropagator() directly to every spin/colour
+// component of `source` (last argument false) and assembles the results into
+// `propagator`. No correlators are printed.
+template<class Action>
+void MasslessFreePropagator1(Action &D,LatticePropagator &source,LatticePropagator &propagator)
+{
+  RealD mass = D.Mass();
+  GridBase *UGrid = source.Grid();
+  LatticeFermion src4  (UGrid);
+  LatticeFermion result4  (UGrid);
+  // s is a spin index, so it runs over Ns (the original bound was Nd)
+  for(int s=0;s<Ns;s++){
+    for(int c=0;c<Nc;c++){
+      PropToFerm<Action>(src4,source,s,c);
+      D.FreePropagator(src4,result4,mass,false);
+      FermToProp<Action>(propagator,result4,s,c);
+    }
+  }
+}
+
+// Solves for the propagator of action D on `source`, one spin/colour component at a
+// time, using SchurRedBlackDiagMooeeSolve around a ConjugateGradient constructed with
+// (1.0e-10,100000) and a zero initial guess. The four-dimensional solution is stored
+// in `propagator`; the five-dimensional one is kept locally for the current contractions.
+//
+// After the solves it prints, per timeslice:
+//   PAc  : trace(g5 * conserved axial current in Tdir)  * LCscale
+//   PP   : trace(adj(propagator)*propagator)            * LCscale
+//   PJ5q : D.ContractJ5q()                               (not scaled)
+//   VVc  : trace(gamma_mu * conserved vector current), mu = x,y,z, * LCscale,
+//          next to 2 pi^2 t^3 C(t) and the difference from 1/(2 pi^2 t^3).
+template<class Action>
+void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
+{
+  GridBase *UGrid = D.GaugeGrid();
+  GridBase *FGrid = D.FermionGrid();
+
+  LatticeFermion src4  (UGrid);
+  LatticeFermion src5  (FGrid);
+  LatticeFermion result5(FGrid);
+  LatticeFermion result4(UGrid);
+  LatticePropagator prop5(FGrid);
+
+  ConjugateGradient<LatticeFermion> CG(1.0e-10,100000);
+  SchurRedBlackDiagMooeeSolve<LatticeFermion> schur(CG);
+  ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
+  // s is a spin index, so it runs over Ns (the original bound was Nd)
+  for(int s=0;s<Ns;s++){
+    for(int c=0;c<Nc;c++){
+      PropToFerm<Action>(src4,source,s,c);
+
+      D.ImportPhysicalFermionSource(src4,src5);
+
+      result5=Zero();
+      schur(D,src5,result5,ZG);
+      std::cout<<GridLogMessage
+               <<"spin "<<s<<" color "<<c
+               <<" norm2(src5d) "   <<norm2(src5)
+               <<" norm2(result5d) "<<norm2(result5)<<std::endl;
+
+      D.ExportPhysicalFermionSolution(result5,result4);
+
+      FermToProp<Action>(prop5,result5,s,c);
+      FermToProp<Action>(propagator,result4,s,c);
+    }
+  }
+  LatticePropagator Axial_mu(UGrid);
+  LatticePropagator Vector_mu(UGrid);
+
+  LatticeComplex    PA (UGrid);
+  LatticeComplex    VV (UGrid);
+  LatticeComplex    PJ5q(UGrid);
+  LatticeComplex    PP (UGrid);
+
+  std::vector<TComplex> sumPA;
+  std::vector<TComplex> sumVV;
+  std::vector<TComplex> sumPP;
+  std::vector<TComplex> sumPJ5q;
+
+  Gamma g5(Gamma::Algebra::Gamma5);
+  D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir);
+  PA       = trace(g5*Axial_mu);      // Pseudoscalar-Axial conserved current
+  sliceSum(PA,sumPA,Tdir);
+
+  // All slice sums below are over Tdir, so this length is reused for every loop
+  int Nt{static_cast<int>(sumPA.size())};
+
+  for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PAc["<<t<<"] "<<real(TensorRemove(sumPA[t]))*LCscale<<std::endl;
+
+  PP       = trace(adj(propagator)*propagator); // Pseudoscalar density
+  sliceSum(PP,sumPP,Tdir);
+  for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PP["<<t<<"] "<<real(TensorRemove(sumPP[t]))*LCscale<<std::endl;
+
+  D.ContractJ5q(prop5,PJ5q);
+  sliceSum(PJ5q,sumPJ5q,Tdir);
+  for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PJ5q["<<t<<"] "<<real(TensorRemove(sumPJ5q[t]))<<std::endl;
+
+  Gamma::Algebra GammaV[3] = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ
+  };
+  for( int mu=0;mu<3;mu++ ) {
+    Gamma gV(GammaV[mu]);
+    D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu);
+    //    auto ss=sliceSum(Vector_mu,Tdir);
+    //    for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"ss["<<mu<<"]["<<t<<"] "<<ss[t]<<std::endl;
+    VV       = trace(gV*Vector_mu);     // (local) Vector-Vector conserved current
+    sliceSum(VV,sumVV,Tdir);
+    for(int t=0;t<Nt;t++){
+      RealD Ct = real(TensorRemove(sumVV[t]))*LCscale;
+      RealD t3 = 2 * M_PI *M_PI * t*t*t;
+      RealD Cont=0;
+      if(t) Cont=1.0/t3;
+      // Multiply by 2 pi^2 t^3 rather than divide by Cont: Cont is 0 at t=0,
+      // and the product is also what MesonTrace() prints under the same label.
+      std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct
+               << " 2 pi^2 t^3 C(t) "<< t3*Ct << " delta Ct "<< Ct-Cont <<std::endl;
+    }
+  }
+
+}
+
+// Serializable container for meson correlators. MesonTrace() appends one
+// std::vector<Complex> per channel to `data` (one entry per timeslice) and
+// writes the object out with an XmlWriter.
+class MesonFile: Serializable {
+public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector<std::vector<Complex> >, data);
+};
+
+// Contracts propagators q1 and q2 into four meson channels (gamma_x gamma_5,
+// gamma_y gamma_5, gamma_z gamma_5 and identity, same matrix at source and sink),
+// sums each over timeslices, scales by LLscale, prints the result per timeslice and
+// writes all channels to the XML file `file` under the tag "MesonFile".
+// NOTE(review): the `phase` argument is accepted but never used here.
+void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
+{
+  const int nchannel=4;
+  // Per channel: { source gamma, sink gamma }
+  Gamma::Algebra channels[nchannel][2] = {
+    {Gamma::Algebra::GammaXGamma5,Gamma::Algebra::GammaXGamma5},
+    {Gamma::Algebra::GammaYGamma5,Gamma::Algebra::GammaYGamma5},
+    {Gamma::Algebra::GammaZGamma5,Gamma::Algebra::GammaZGamma5},
+    {Gamma::Algebra::Identity,Gamma::Algebra::Identity}
+  };
+
+  MesonFile MF;
+  LatticeComplex site_corr(q1.Grid());
+
+  for(int ch=0;ch<nchannel;ch++){
+
+    Gamma Gsrc(channels[ch][0]);
+    Gamma Gsnk(channels[ch][1]);
+
+    site_corr = trace(adj(q1)*Gsnk*q2*adj(Gsrc));
+
+    std::vector<TComplex> slice_sums;
+    sliceSum(site_corr,slice_sums, Tdir);
+
+    int nt=slice_sums.size();
+
+    std::vector<Complex> corr(nt);
+    for(int t=0;t<nt;t++){
+      // TensorRemove strips the tensor wrapper so a plain Complex can be stored
+      corr[t] = TensorRemove(slice_sums[t])*LLscale;
+      RealD Ct = real(corr[t]);
+      RealD Cont = t ? 1.0/(2 * M_PI *M_PI * t*t*t) : 0;
+      std::cout << " channel "<<ch<<" t "<<t<<" " <<Ct<< " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * t*t*t * Ct
+                << " deltaC " <<Ct-Cont<<std::endl;
+    }
+    MF.data.push_back(corr);
+  }
+
+  {
+    // Scoped so the writer is destroyed as soon as the record is written
+    XmlWriter WR(file);
+    write(WR,"MesonFile",MF);
+  }
+}
+
+// Returns the value of environment variable `name`. If the variable is unset it
+// reports that and exits: getenv() returns a null pointer in that case and passing
+// it on to atof() is undefined behaviour.
+static const char *RequiredEnv(const char *name)
+{
+  const char *value = getenv(name);
+  if ( value == nullptr ) {
+    std::cerr << "Environment variable '" << name << "' must be set" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  return value;
+}
+
+// Driver: builds the grids, obtains a gauge field (NERSC configuration named by
+// argv[1], or a cold configuration with optional tadpole handling), constructs one
+// DomainWallFermionR action per mass, solves for point-source propagators and writes
+// the meson correlators for every mass pair.
+//
+// Required environment variables:
+//   M5      : domain wall height passed to the action
+//   mass    : quark mass
+//   tadpole : cold-configuration mode; 1 rescales the links by u0, 2 leaves links and
+//             M5 unchanged, any other value shifts M5 by -4(1-u0) and sets the
+//             correlator scales to 1/u0^2
+int main (int argc, char ** argv)
+{
+  const int Ls=10;
+
+  Grid_init(&argc,&argv);
+
+  // Double precision grids
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
+                                                                   GridDefaultSimd(Nd,vComplex::Nsimd()),
+                                                                   GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  //////////////////////////////////////////////////////////////////////
+  // You can manage seeds however you like.
+  // Recommend SeedUniqueString.
+  //////////////////////////////////////////////////////////////////////
+  //  std::vector<int> seeds4({1,2,3,4});
+  //  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+
+  LatticeGaugeField Umu(UGrid);
+  std::string config;
+  RealD M5      = atof(RequiredEnv("M5"));
+  RealD mq      = atof(RequiredEnv("mass"));
+  // Parsed with atof as before; the truncation to int is now explicit
+  int   tadpole = static_cast<int>(atof(RequiredEnv("tadpole")));
+  std::vector<RealD> masses({ mq} ); // u/d, s, c ??
+  if( argc > 1 && argv[1][0] != '-' )
+  {
+    std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl;
+    FieldMetaData header;
+    NerscIO::readConfiguration(Umu, header, argv[1]);
+    config=argv[1];
+    LLscale = 1.0;
+    LCscale = 1.0;
+  }
+  else
+  {
+    SU<Nc>::ColdConfiguration(Umu);
+    config="ColdConfig";
+    //    RealD P=1.0; // Don't scale
+    //    RealD P=0.6388238 // 32Ifine
+    //    RealD P=0.6153342; // 64I
+    RealD P=0.5871119; // 48I
+    RealD u0 = sqrt(sqrt(P));
+    std::cout<<GridLogMessage <<"For plaquette P="<<P<<" u0= "<<u0<<std::endl;
+    if ( tadpole == 1 ) {
+      Umu = Umu * u0;
+      // Disabled alternative normalisation, with w0 = 1 - M5:
+      //      LLscale = 1.0/(1-w0*w0)/(1-w0*w0)/u0/u0;
+      //      LCscale = 1.0/(1-w0*w0)/(1-w0*w0)/u0/u0;
+      LLscale = 1.0;
+      LCscale = 1.0;
+      std::cout<<GridLogMessage <<"Gauge links are u= u0 "<<std::endl;
+      std::cout<<GridLogMessage <<"M5 =  "<<M5<<std::endl;
+    } else if ( tadpole == 2) {
+      std::cout<<GridLogMessage <<"Gauge links are u=1 "<<std::endl;
+      LLscale = 1.0;
+      LCscale = 1.0;
+      std::cout<<GridLogMessage <<"M5 =  "<<M5<<std::endl;
+    } else {
+      LLscale = 1.0/u0/u0;
+      LCscale = 1.0/u0/u0;
+      M5 = M5 - 4.0 * (1-u0);
+      std::cout<<GridLogMessage <<"Gauge links are u=1 "<<std::endl;
+      std::cout<<GridLogMessage <<"M5mf =  "<<M5<<std::endl;
+    }
+    std::cout<<GridLogMessage <<"mq =  "<<mq<<std::endl;
+    std::cout<<GridLogMessage <<"LLscale =  "<<LLscale<<std::endl;
+    std::cout<<GridLogMessage <<"LCscale =  "<<LCscale<<std::endl;
+  }
+
+  int nmass = masses.size();
+
+  typedef DomainWallFermionR FermionActionR;
+  //  typedef MobiusFermionR FermionActionR;
+  std::vector<FermionActionR *> FermActs;
+
+  std::cout<<GridLogMessage <<"======================"<<std::endl;
+  std::cout<<GridLogMessage <<"DomainWallFermion action"<<std::endl;
+  std::cout<<GridLogMessage <<"======================"<<std::endl;
+
+  for(auto mass: masses) {
+    std::vector<Complex> boundary = {1,1,1,-1};
+    FermionActionR::ImplParams Params(boundary);
+    std::cout<<GridLogMessage <<"Making DomainWallFermion action"<<std::endl;
+    FermActs.push_back(new FermionActionR(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,Params));
+    // Disabled Mobius alternative (b=1.5, c=0.5):
+    //    FermActs.push_back(new FermionActionR(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass+0.001,M5,1.5,0.5));
+    std::cout<<GridLogMessage <<"Made DomainWallFermion action"<<std::endl;
+  }
+
+  LatticePropagator point_source(UGrid);
+
+  Coordinate Origin({0,0,0,0});
+  PointSource   (Origin,point_source);
+
+  std::vector<LatticePropagator> PointProps(nmass,UGrid);
+  //  std::vector<LatticePropagator> FreeProps(nmass,UGrid);
+  //  LatticePropagator delta(UGrid);
+
+  for(int m=0;m<nmass;m++) {
+    Solve(*FermActs[m],point_source   ,PointProps[m]);
+    //    MasslessFreePropagator(*FermActs[m],point_source   ,FreeProps[m]);
+
+    //    delta = PointProps[m] - FreeProps[m];
+    //    std::cout << " delta "<<norm2(delta) << " FFT "<<norm2(FreeProps[m])<< " CG " <<norm2(PointProps[m])<<std::endl;
+  }
+
+  LatticeComplex phase(UGrid);
+  Coordinate mom({0,0,0,0});
+  MakePhase(mom,phase);
+
+  for(int m1=0 ;m1<nmass;m1++) {
+  for(int m2=m1;m2<nmass;m2++) {
+    std::stringstream ssp;
+
+    ssp<<config<< "_m" << m1 << "_m"<< m2 << "_point_meson.xml";
+
+    std::cout << "CG determined VV correlation function"<<std::endl;
+    MesonTrace(ssp.str(),PointProps[m1],PointProps[m2],phase);
+
+    // Disabled free-propagator cross-check:
+    //    std::stringstream ssz;
+    //    ssz<<config<< "_m" << m1 << "_m"<< m2 << "_free_meson.xml";
+    //    std::cout << "FFT derived VV correlation function"<<std::endl;
+    //    MesonTrace(ssz.str(),FreeProps[m1],FreeProps[m2],phase);
+  }}
+
+  // Release the actions allocated with new above
+  for(auto *action: FermActs) delete action;
+  FermActs.clear();
+
+  Grid_finalize();
+}
+
+
+
--- a/tests/Test_dwf_mixedcg_prec.cc
+++ b/tests/Test_dwf_mixedcg_prec.cc
@ -95,26 +95,34 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl;
  MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
  double t1,t2,flops;
+  double MdagMsiteflops = 1452; // Mobius (real coeffs)
+  // CG overhead: 8 inner product, 4+8 axpy_norm, 4+4 linear comb (2 of)
+  double CGsiteflops = (8+4+8+4+4)*Nc*Ns ;
+  std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops<<std::endl;
+  std:: cout << " CG    site flops = "<< CGsiteflops <<std::endl;
  int iters;
-  for(int i=0;i<100;i++){
+  for(int i=0;i<200;i++){
    result_o = Zero();
    t1=usecond();
    mCG(src_o,result_o);
    t2=usecond();
    iters = mCG.TotalInnerIterations; //Number of inner CG iterations
-    flops = 1320.0*2*FGrid->gSites()*iters;
+    flops = MdagMsiteflops*4*FrbGrid->gSites()*iters;
+    flops+= CGsiteflops*FrbGrid->gSites()*iters;
    std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
    std::cout << " SinglePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
  }
  std::cout << GridLogMessage << "::::::::::::: Starting regular CG" << std::endl;
  ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
-  for(int i=0;i<100;i++){
+  for(int i=0;i<1;i++){
    result_o_2 = Zero();
    t1=usecond();
    CG(HermOpEO,src_o,result_o_2);
    t2=usecond();
    iters = CG.IterationsToComplete;
-    flops = 1320.0*2*FGrid->gSites()*iters;
+    flops = MdagMsiteflops*4*FrbGrid->gSites()*iters; 
+    flops+= CGsiteflops*FrbGrid->gSites()*iters;
+    
    std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
    std::cout << " DoublePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
  }
Author	SHA1	Message	Date
Peter Boyle	e7d9b75fdd	Warning fixes	2022-08-31 19:01:14 -04:00
Peter Boyle	3d0e3ec363	Tracing	2022-08-31 18:31:46 -04:00
Peter Boyle	3c1c51f9aa	Merge branch 'feature/dirichlet-gparity' into feature/dirichlet	2022-08-31 18:25:34 -04:00
Peter Boyle	5c87342108	Used in g-2 sign off	2022-08-31 17:35:32 -04:00
Peter Boyle	66177bfbe2	Used in g-2 sign off	2022-08-31 17:35:07 -04:00
Peter Boyle	5205e68963	RocTX, NVTX, text based self profiling	2022-08-31 17:34:09 -04:00
Peter Boyle	cd5cf6d614	Tracing replaces self timing hooks	2022-08-31 17:33:41 -04:00
Peter Boyle	5abb19eab0	Remove self timing	2022-08-31 17:32:49 -04:00
Peter Boyle	06d7b88c78	Force reporting improved	2022-08-31 17:32:21 -04:00
Peter Boyle	cf72799735	Better action naming	2022-08-31 17:24:11 -04:00
Peter Boyle	cdb8fcc269	Width=4 support. This is too broad; hit it on physical point run. Need to change strategy, I think.	2022-08-31 17:21:33 -04:00
Peter Boyle	b4f4130901	Defer SMP node links until after interior. Allows for DMA overlapping compute	2022-08-31 17:20:21 -04:00
Peter Boyle	bb049847d5	Tracing replaces self timing	2022-08-31 17:19:02 -04:00
Peter Boyle	fd33c835dd	Feynman rule fix and tracing replaces self timing	2022-08-31 17:18:17 -04:00
Peter Boyle	21371a7e5b	Tracing replaces self timing	2022-08-31 17:16:05 -04:00
Peter Boyle	abfaa00d3e	Tracing replaces self timing	2022-08-31 17:15:24 -04:00
Peter Boyle	efee33c55d	Tracing replaces self timing	2022-08-31 17:14:57 -04:00
Peter Boyle	db0fe6ddbb	Tracing replaces self timinng	2022-08-31 17:14:14 -04:00
Peter Boyle	8a9e647120	Tracing replaces self timing	2022-08-31 17:13:44 -04:00
Peter Boyle	e6dcb821ad	Tracing replaces self timing	2022-08-31 17:12:31 -04:00
Peter Boyle	9bff188f02	Tracing replaces self timing	2022-08-31 17:12:05 -04:00
Peter Boyle	111b30ca1d	Tracing replaces self timing	2022-08-31 17:11:48 -04:00
Peter Boyle	24182ca8bf	HIP allows conserved currents. Tracing replaces self timeing	2022-08-31 17:11:18 -04:00
Peter Boyle	ee2d7369b3	Tracing replaces self timing	2022-08-31 17:10:45 -04:00
Peter Boyle	7c686d29c9	Tracing replaces self timing	2022-08-31 17:10:17 -04:00
Peter Boyle	e8a0a1e75d	Tracing replaces self timing hooks	2022-08-31 17:09:47 -04:00
Peter Boyle	730be89abf	Remove timing hooks as tracing replaces	2022-08-31 17:08:44 -04:00
Peter Boyle	f991ad7d5c	Remove timing hooks as tracing replaces	2022-08-31 17:08:18 -04:00
Peter Boyle	b3f33f82f7	Decrease self timing hooks, use nvtx / roctx type tracing hooks instead	2022-08-31 17:06:47 -04:00
Peter Boyle	a34a6e059f	Logging improvement. Sinitial will be used to improve RHMC terms	2022-08-31 17:06:08 -04:00
Peter Boyle	1333319941	Tracing	2022-08-31 17:00:25 -04:00
Peter Boyle	9295ed8d20	Print full memory range	2022-08-31 16:59:51 -04:00
Peter Boyle	19cc7653fb	Tracing	2022-08-31 16:57:51 -04:00
Peter Boyle	5752538661	Tracing	2022-08-31 16:57:32 -04:00
Peter Boyle	ca40a1b00b	Tracing	2022-08-31 16:54:55 -04:00
Peter Boyle	659fac9dfb	Tracing hook	2022-08-31 16:54:25 -04:00
Peter Boyle	4dc3d6fce0	Buy into Nvidia/Rocm etc... tracing.	2022-08-31 16:53:19 -04:00
Peter Boyle	95b640cb6b	10TF/s on 32^3 x 64 on single node	2022-08-04 15:43:52 -04:00
Peter Boyle	2cb5bedc15	Copy stream HIP improvements	2022-08-04 15:24:03 -04:00
Peter Boyle	806b02bddf	Simplify dead code	2022-08-04 15:23:13 -04:00
Peter Boyle	de40395773	More timing. Think I should start to use nvtx and rocmtx ??	2022-08-04 13:37:16 -04:00
Peter Boyle	7ba4788715	Fix	2022-08-04 13:36:44 -04:00
Peter Boyle	06d9ce1a02	Synch ranks on node here for GPU - GPU memcopy	2022-08-04 13:35:56 -04:00
Peter Boyle	75bb6b2b40	Move barrier into the StencilSend begin routine	2022-08-04 13:35:26 -04:00
Peter Boyle	74f10c2dc0	Move barrier into Stencil Send	2022-08-04 13:34:11 -04:00
Peter Boyle	a93d5459d4	Better mpi request completion	2022-07-28 12:18:35 -04:00
Peter Boyle	9c21add0c6	High res timer replaces getttimeofday	2022-07-28 12:14:03 -04:00
Peter Boyle	639aab6563	High res timer instead of gettimeofday	2022-07-28 12:13:35 -04:00
Peter Boyle	8137cc7049	Allways concurrent comms	2022-07-28 12:01:51 -04:00
Peter Boyle	60e63dca1d	Add memory logging channel	2022-07-28 11:39:15 -04:00
Peter Boyle	486409574e	Expanded cach to avoid any allocs in HMC	2022-07-28 11:38:34 -04:00
Peter Boyle	a913b8be12	Dslash self timing. Might want to not have this	2022-07-28 11:37:55 -04:00
Peter Boyle	2239751850	Better logging	2022-07-28 11:37:36 -04:00
Peter Boyle	9b20f1449c	Better timing	2022-07-28 11:37:12 -04:00
Peter Boyle	b99453083d	Updated timing	2022-07-28 11:37:02 -04:00
Peter Boyle	943fbb914d	Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet	2022-07-11 13:48:42 -04:00
Peter Boyle	ca4603580d	Verbose	2022-07-11 13:48:35 -04:00
Peter Boyle	f73db8f1f3	Synch clocks	2022-07-11 13:47:39 -04:00
Peter Boyle	f7217d12d2	World barrier for clock synch	2022-07-11 13:45:31 -04:00
Peter Boyle	fab50c57d9	More loggin	2022-07-11 18:42:27 +01:00
Peter Boyle	3440534fbf	MixedPrec support	2022-07-10 21:35:18 +01:00
Peter Boyle	177b1a7ec6	Mixed prec	2022-07-10 21:34:10 +01:00
Peter Boyle	58182fe345	Different approach to default dirichlet params	2022-07-10 21:32:58 +01:00
Peter Boyle	1f907d330d	Different default params for dirichlet	2022-07-10 21:31:48 +01:00
Peter Boyle	b0fe664e9d	Better force log info	2022-07-10 21:31:25 +01:00
Peter Boyle	c0f8482402	Remove SSC marks	2022-07-07 17:49:36 +01:00
Peter Boyle	3544965f54	Stream doesn't work	2022-07-07 17:49:20 +01:00