Merge branch 'develop' into gparity_HMC_merge_develop

2026-07-20 18:43:27 +01:00 · 2022-02-22 14:25:27 -05:00
parent ba974960e6 63dbaeefaa
commit deac621c2c
99 changed files with 4060 additions and 656 deletions
@@ -358,7 +358,7 @@ public:
    autoView( in_v , in, AcceleratorRead);
    autoView( out_v , out, AcceleratorWrite);
    autoView( Stencil_v  , Stencil, AcceleratorRead);
-    auto& geom_v = geom;
+    int npoint = geom.npoint;
    typedef LatticeView<Cobj> Aview;
    Vector<Aview> AcceleratorViewContainer;
@@ -380,7 +380,7 @@ public:
      int ptype;
      StencilEntry *SE;
-      for(int point=0;point<geom_v.npoint;point++){
+      for(int point=0;point<npoint;point++){
 	SE=Stencil_v.GetEntry(ptype,point,ss);
@@ -424,7 +424,7 @@ public:
    autoView( in_v , in, AcceleratorRead);
    autoView( out_v , out, AcceleratorWrite);
    autoView( Stencil_v  , Stencil, AcceleratorRead);
-    auto& geom_v = geom;
+    int npoint = geom.npoint;
    typedef LatticeView<Cobj> Aview;
    Vector<Aview> AcceleratorViewContainer;
@@ -454,7 +454,7 @@ public:
      int ptype;
      StencilEntry *SE;
-      for(int p=0;p<geom_v.npoint;p++){
+      for(int p=0;p<npoint;p++){
        int point = points_p[p];
 	SE=Stencil_v.GetEntry(ptype,point,ss);
@@ -52,6 +52,7 @@ public:
  virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
  virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
  virtual void HermOp(const Field &in, Field &out)=0;
  virtual ~LinearOperatorBase(){};
 };
@@ -507,7 +508,7 @@ class SchurStaggeredOperator :  public SchurOperatorBase<Field> {
  virtual  void MpcDag   (const Field &in, Field &out){
    Mpc(in,out);
  }
-  virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
+  virtual void MpcDagMpc(const Field &in, Field &out) {
    assert(0);// Never need with staggered
  }
 };
@@ -585,6 +586,7 @@ class HermOpOperatorFunction : public OperatorFunction<Field> {
 template<typename Field>
 class PlainHermOp : public LinearFunction<Field> {
 public:
  using LinearFunction<Field>::operator();
  LinearOperatorBase<Field> &_Linop;
  PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop) 
@@ -598,6 +600,7 @@ public:
 template<typename Field>
 class FunctionHermOp : public LinearFunction<Field> {
 public:
  using LinearFunction<Field>::operator(); 
  OperatorFunction<Field>   & _poly;
  LinearOperatorBase<Field> &_Linop;
@@ -30,13 +30,19 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
-template<class Field> class Preconditioner :  public LinearFunction<Field> { 
+template<class Field> using Preconditioner =  LinearFunction<Field> ;
 /*
 template<class Field> class Preconditioner :  public LinearFunction<Field> {
  using LinearFunction<Field>::operator();
  virtual void operator()(const Field &src, Field & psi)=0;
 };
 */
 template<class Field> class TrivialPrecon :  public Preconditioner<Field> { 
 public:
-  void operator()(const Field &src, Field & psi){
+  using Preconditioner<Field>::operator();
  virtual void operator()(const Field &src, Field & psi){
    psi = src;
  }
  TrivialPrecon(void){};
@@ -48,6 +48,7 @@ public:
  virtual  void Mdiag    (const Field &in, Field &out)=0;
  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0;
  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)=0;
  virtual ~SparseMatrixBase() {};
 };
 /////////////////////////////////////////////////////////////////////////////////////////////
@@ -72,7 +73,7 @@ public:
  virtual  void MeooeDag    (const Field &in, Field &out)=0;
  virtual  void MooeeDag    (const Field &in, Field &out)=0;
  virtual  void MooeeInvDag (const Field &in, Field &out)=0;
-
+  virtual ~CheckerBoardedSparseMatrixBase() {};
 };
 NAMESPACE_END(Grid);
@@ -36,7 +36,8 @@ NAMESPACE_BEGIN(Grid);
 template<class FieldD, class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
 class MixedPrecisionBiCGSTAB : public LinearFunction<FieldD> 
 {
-  public:                                                
+  public:
    using LinearFunction<FieldD>::operator();
    RealD   Tolerance;
    RealD   InnerTolerance; // Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
@@ -35,7 +35,8 @@ NAMESPACE_BEGIN(Grid);
    typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
    typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
  class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
-  public:                                                
+  public:
    using LinearFunction<FieldD>::operator();
    RealD   Tolerance;
    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
@@ -33,16 +33,19 @@ namespace Grid {
 // Initial-guess functor: ignores the source and sets the guess to Zero().
 template<class Field>
 class ZeroGuesser : public LinearFunction<Field> {
 public:
  // Keep the base-class operator() overloads visible next to the one declared here.
  using LinearFunction<Field>::operator();

  virtual void operator()(const Field &src, Field &guess)
  {
    guess = Zero();
  }
 };
 // Initial-guess functor that leaves the guess untouched (the body is empty).
 template<class Field>
 class DoNothingGuesser : public LinearFunction<Field> {
 public:
  // Keep the base-class operator() overloads visible next to the one declared here.
  using LinearFunction<Field>::operator();

  virtual void operator()(const Field &src, Field &guess)
  {
    // intentionally empty
  }
 };
 // Initial-guess functor that copies the source into the guess.
 template<class Field>
 class SourceGuesser : public LinearFunction<Field> {
 public:
  // Keep the base-class operator() overloads visible next to the one declared here.
  using LinearFunction<Field>::operator();

  virtual void operator()(const Field &src, Field &guess)
  {
    guess = src;
  }
 };
@@ -54,15 +57,24 @@ class DeflatedGuesser: public LinearFunction<Field> {
 private:
  const std::vector<Field> &evec;
  const std::vector<RealD> &eval;
  const unsigned int       N;
 public:
  using LinearFunction<Field>::operator();
-  DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
+  DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval)
  : DeflatedGuesser(_evec, _eval, _evec.size())
  {}
  DeflatedGuesser(const std::vector<Field> & _evec, const std::vector<RealD> & _eval, const unsigned int _N)
  : evec(_evec), eval(_eval), N(_N)
  {
    assert(evec.size()==eval.size());
    assert(N <= evec.size());
  } 
  virtual void operator()(const Field &src,Field &guess) {
    guess = Zero();
    assert(evec.size()==eval.size());
    auto N = evec.size();
    for (int i=0;i<N;i++) {
      const Field& tmp = evec[i];
      axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
@@ -79,6 +91,7 @@ private:
  const std::vector<RealD>       &eval_coarse;
 public:
  using LinearFunction<FineField>::operator();
  LocalCoherenceDeflatedGuesser(const std::vector<FineField>   &_subspace,
 				const std::vector<CoarseField> &_evec_coarse,
 				const std::vector<RealD>       &_eval_coarse)
@@ -68,6 +68,7 @@ public:
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
  using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
@@ -98,6 +99,7 @@ public:
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
  using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
@@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
 template<class Field>
 class PrecGeneralisedConjugateResidual : public LinearFunction<Field> {
 public:                                                
-
+  using LinearFunction<Field>::operator();
  RealD   Tolerance;
  Integer MaxIterations;
  int verbose;
@@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
 template<class Field>
 class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
 public:                                                
-
+  using LinearFunction<Field>::operator();
  RealD   Tolerance;
  Integer MaxIterations;
  int verbose;
@@ -119,7 +119,8 @@ public:
  RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
    RealD cp;
-    ComplexD a, b, zAz;
+    ComplexD a, b;
    //    ComplexD zAz;
    RealD zAAz;
    ComplexD rq;
@@ -146,7 +147,7 @@ public:
    //////////////////////////////////
    MatTimer.Start();
    Linop.Op(psi,Az);
-    zAz = innerProduct(Az,psi);
+    //    zAz = innerProduct(Az,psi);
    zAAz= norm2(Az);
    MatTimer.Stop();
@@ -170,7 +171,7 @@ public:
    LinalgTimer.Start();
-    zAz = innerProduct(Az,psi);
+    //    zAz = innerProduct(Az,psi);
    zAAz= norm2(Az);
    //p[0],q[0],qq[0] 
@@ -212,7 +213,7 @@ public:
      MatTimer.Start();
      Linop.Op(z,Az);
      MatTimer.Stop();
-      zAz = innerProduct(Az,psi);
+      //      zAz = innerProduct(Az,psi);
      zAAz= norm2(Az);
      LinalgTimer.Start();
@@ -9,14 +9,30 @@ NAMESPACE_BEGIN(Grid);
 #define AccSmall (3)
 #define Shared   (4)
 #define SharedSmall (5)
 #undef GRID_MM_VERBOSE 
 uint64_t total_shared;
 uint64_t total_device;
 uint64_t total_host;;
 void MemoryManager::PrintBytes(void)
 {
-  std::cout << " MemoryManager : "<<total_shared<<" shared      bytes "<<std::endl;
+  std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
-  std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
+  std::cout << " MemoryManager : PrintBytes "<<std::endl;
-  std::cout << " MemoryManager : "<<total_host  <<" cpu         bytes "<<std::endl;
+  std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
  std::cout << " MemoryManager : "<<(total_shared>>20)<<" shared      Mbytes "<<std::endl;
  std::cout << " MemoryManager : "<<(total_device>>20)<<" accelerator Mbytes "<<std::endl;
  std::cout << " MemoryManager : "<<(total_host>>20)  <<" cpu         Mbytes "<<std::endl;
  uint64_t cacheBytes;
  cacheBytes = CacheBytes[Cpu];
  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" cpu cache Mbytes "<<std::endl;
  cacheBytes = CacheBytes[Acc];
  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" acc cache Mbytes "<<std::endl;
  cacheBytes = CacheBytes[Shared];
  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" shared cache Mbytes "<<std::endl;
 #ifdef GRID_CUDA
  cuda_mem();
 #endif
 }
 //////////////////////////////////////////////////////////////////////
@@ -24,86 +40,114 @@ void MemoryManager::PrintBytes(void)
 //////////////////////////////////////////////////////////////////////
 MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
 int MemoryManager::Victim[MemoryManager::NallocType];
-int MemoryManager::Ncache[MemoryManager::NallocType] = { 8, 32, 8, 32, 8, 32 };
+int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 };
-
+uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
 //////////////////////////////////////////////////////////////////////
 // Actual allocation and deallocation utils
 //////////////////////////////////////////////////////////////////////
 // Hand out `bytes` of accelerator (device) memory.
 // A cached block from the Acc pool is reused when Lookup() returns one;
 // otherwise fresh memory is obtained from acceleratorAllocDevice().
 // total_device is increased by `bytes` exactly once per call, whether or not
 // the request was served from the cache (previously a miss counted twice).
 void *MemoryManager::AcceleratorAllocate(size_t bytes)
 {
  total_device+=bytes;
  void *ptr = (void *) Lookup(bytes,Acc);
  if ( ptr == (void *) NULL ) {
    ptr = (void *) acceleratorAllocDevice(bytes);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"AcceleratorAllocate "<<std::endl;
  PrintBytes();
 #endif
  return ptr;
 }
 // Return `bytes` of accelerator (device) memory at `ptr` to the manager.
 // The block is offered to the Acc pool via Insert(); whatever pointer Insert()
 // hands back (non-NULL) is released with acceleratorFreeDevice().
 // total_device is decreased by `bytes` exactly once per call; the extra
 // decrement on the release path was removed because it double counted.
 void  MemoryManager::AcceleratorFree    (void *ptr,size_t bytes)
 {
  total_device-=bytes;
  void *evicted = Insert(ptr,bytes,Acc); // renamed: identifiers containing "__" are reserved
  if ( evicted ) {
    acceleratorFreeDevice(evicted);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"AcceleratorFree "<<std::endl;
  PrintBytes();
 #endif
 }
 // Hand out `bytes` of shared memory.
 // A cached block from the Shared pool is reused when Lookup() returns one;
 // otherwise fresh memory is obtained from acceleratorAllocShared().
 // total_shared is increased by `bytes` exactly once per call (previously a
 // cache miss counted the request twice).
 void *MemoryManager::SharedAllocate(size_t bytes)
 {
  total_shared+=bytes;
  void *ptr = (void *) Lookup(bytes,Shared);
  if ( ptr == (void *) NULL ) {
    ptr = (void *) acceleratorAllocShared(bytes);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"SharedAllocate "<<std::endl;
  PrintBytes();
 #endif
  return ptr;
 }
 // Return `bytes` of shared memory at `ptr` to the manager.
 // The block is offered to the Shared pool via Insert(); whatever pointer
 // Insert() hands back (non-NULL) is released with acceleratorFreeShared().
 // total_shared is decreased by `bytes` exactly once per call; the extra
 // decrement on the release path was removed because it double counted.
 void  MemoryManager::SharedFree    (void *ptr,size_t bytes)
 {
  total_shared-=bytes;
  void *evicted = Insert(ptr,bytes,Shared); // renamed: identifiers containing "__" are reserved
  if ( evicted ) {
    acceleratorFreeShared(evicted);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"SharedFree "<<std::endl;
  PrintBytes();
 #endif
 }
 #ifdef GRID_UVM
 // GRID_UVM variant: hand out `bytes` of host-visible memory.
 // A cached block from the Cpu pool is reused when Lookup() returns one;
 // otherwise the memory comes from acceleratorAllocShared().
 // total_host is increased by `bytes` exactly once per call (previously a
 // cache miss counted the request twice).
 void *MemoryManager::CpuAllocate(size_t bytes)
 {
  total_host+=bytes;
  void *ptr = (void *) Lookup(bytes,Cpu);
  if ( ptr == (void *) NULL ) {
    ptr = (void *) acceleratorAllocShared(bytes);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"CpuAllocate "<<std::endl;
  PrintBytes();
 #endif
  return ptr;
 }
 // GRID_UVM variant: return `bytes` of host-visible memory at `_ptr`.
 // NotifyDeletion() is called first, then the block is offered to the Cpu pool
 // via Insert(); whatever pointer Insert() hands back (non-NULL) is released
 // with acceleratorFreeShared().
 // total_host is decreased by `bytes` exactly once per call; the extra
 // decrement on the release path was removed because it double counted.
 void  MemoryManager::CpuFree    (void *_ptr,size_t bytes)
 {
  total_host-=bytes;
  NotifyDeletion(_ptr);
  void *evicted = Insert(_ptr,bytes,Cpu); // renamed: identifiers containing "__" are reserved
  if ( evicted ) {
    acceleratorFreeShared(evicted);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"CpuFree "<<std::endl;
  PrintBytes();
 #endif
 }
 #else
 // Non-UVM variant: hand out `bytes` of host memory.
 // A cached block from the Cpu pool is reused when Lookup() returns one;
 // otherwise the memory comes from acceleratorAllocCpu().
 // total_host is increased by `bytes` exactly once per call (previously a
 // cache miss counted the request twice).
 void *MemoryManager::CpuAllocate(size_t bytes)
 {
  total_host+=bytes;
  void *ptr = (void *) Lookup(bytes,Cpu);
  if ( ptr == (void *) NULL ) {
    ptr = (void *) acceleratorAllocCpu(bytes);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"CpuAllocate "<<std::endl;
  PrintBytes();
 #endif
  return ptr;
 }
 // Non-UVM variant: return `bytes` of host memory at `_ptr`.
 // NotifyDeletion() is called first, then the block is offered to the Cpu pool
 // via Insert(); whatever pointer Insert() hands back (non-NULL) is released
 // with acceleratorFreeCpu().
 // total_host is decreased by `bytes` exactly once per call; the extra
 // decrement on the release path was removed because it double counted.
 void  MemoryManager::CpuFree    (void *_ptr,size_t bytes)
 {
  total_host-=bytes;
  NotifyDeletion(_ptr);
  void *evicted = Insert(_ptr,bytes,Cpu); // renamed: identifiers containing "__" are reserved
  if ( evicted ) {
    acceleratorFreeCpu(evicted);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"CpuFree "<<std::endl;
  PrintBytes();
 #endif
 }
 #endif
@@ -115,7 +159,6 @@ void MemoryManager::Init(void)
  char * str;
  int Nc;
  int NcS;
  str= getenv("GRID_ALLOC_NCACHE_LARGE");
  if ( str ) {
@@ -181,13 +224,13 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
 #ifdef ALLOCATION_CACHE
  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
  int cache = type + small;
-  return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);  
+  return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);  
 #else
  return ptr;
 #endif
 }
-void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) 
+void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes) 
 {
  assert(ncache>0);
 #ifdef GRID_OMP
@@ -211,6 +254,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
  if ( entries[v].valid ) {
    ret = entries[v].address;
    cacheBytes -= entries[v].bytes;
    entries[v].valid = 0;
    entries[v].address = NULL;
    entries[v].bytes = 0;
@@ -219,6 +263,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
  entries[v].address=ptr;
  entries[v].bytes  =bytes;
  entries[v].valid  =1;
  cacheBytes += bytes;
  return ret;
 }
@@ -228,13 +273,13 @@ void *MemoryManager::Lookup(size_t bytes,int type)
 #ifdef ALLOCATION_CACHE
  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
  int cache = type+small;
-  return Lookup(bytes,Entries[cache],Ncache[cache]);
+  return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
 #else
  return NULL;
 #endif
 }
-void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) 
+void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) 
 {
  assert(ncache>0);
 #ifdef GRID_OMP
@@ -243,6 +288,7 @@ void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncach
  for(int e=0;e<ncache;e++){
    if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
      entries[e].valid = 0;
      cacheBytes -= entries[e].bytes;
      return entries[e].address;
    }
  }
@@ -82,14 +82,15 @@ private:
  static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
  static int Victim[NallocType];
  static int Ncache[NallocType];
  static uint64_t CacheBytes[NallocType];
  /////////////////////////////////////////////////
  // Free pool
  /////////////////////////////////////////////////
  static void *Insert(void *ptr,size_t bytes,int type) ;
  static void *Lookup(size_t bytes,int type) ;
-  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
+  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
-  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
+  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;
  static void PrintBytes(void);
 public:
@@ -169,6 +170,7 @@ private:
 public:
  static void Print(void);
  static void PrintState( void* CpuPtr);
  static int   isOpen   (void* CpuPtr);
  static void  ViewClose(void* CpuPtr,ViewMode mode);
  static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
@@ -3,7 +3,7 @@
 #warning "Using explicit device memory copies"
 NAMESPACE_BEGIN(Grid);
-//define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
+//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
 #define dprintf(...)
@@ -429,6 +429,7 @@ void  MemoryManager::NotifyDeletion(void *_ptr)
 }
 void  MemoryManager::Print(void)
 {
  PrintBytes();
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
  std::cout << GridLogDebug << "Memory Manager                             " << std::endl;
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
@@ -473,6 +474,32 @@ int   MemoryManager::isOpen   (void* _CpuPtr)
  }
 }
 // Log the AccCache table entry recorded for the host pointer `_CpuPtr`:
 // both addresses in hex, the state name, the two lock counters and LRU_valid.
 // When the table holds no entry for the pointer a single notice is logged.
 void MemoryManager::PrintState(void* _CpuPtr)
 {
  const uint64_t key = (uint64_t)_CpuPtr;

  if ( !EntryPresent(key) ) {
    std::cout << GridLogMessage << "No Entry in AccCache table." << std::endl;
    return;
  }

  auto tableIt = EntryLookup(key);
  auto & entry = tableIt->second;

  // Translate the state value to text; left empty if none of the names match.
  std::string stateName;
  if ( entry.state==Empty     ) stateName = "Empty";
  if ( entry.state==CpuDirty  ) stateName = "CpuDirty";
  if ( entry.state==AccDirty  ) stateName = "AccDirty";
  if ( entry.state==Consistent) stateName = "Consistent";
  if ( entry.state==EvictNext ) stateName = "EvictNext";

  std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
  std::cout << GridLogMessage
            << "0x"   << std::hex << entry.CpuPtr << std::dec
            << "\t0x" << std::hex << entry.AccPtr << std::dec
            << "\t"   << stateName
            << "\t"   << entry.cpuLock
            << "\t"   << entry.accLock
            << "\t"   << entry.LRU_valid << std::endl;
 }
 NAMESPACE_END(Grid);
 #endif
@@ -16,6 +16,10 @@ uint64_t  MemoryManager::DeviceToHostXfer;
 // Stub implementations: in this build variant the view/cache entry points do
 // no bookkeeping. ViewOpen hands back the host pointer unchanged and isOpen
 // always reports 0.
 void MemoryManager::ViewClose(void* AccPtr,ViewMode mode)
 {
 }

 void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
 {
  return CpuPtr;
 }

 int MemoryManager::isOpen(void* CpuPtr)
 {
  return 0;
 }

 // Only logs a notice; there is no per-pointer state to report here.
 void MemoryManager::PrintState(void* CpuPtr)
 {
  std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl;
 }

 void MemoryManager::Print(void)
 {
 }

 void MemoryManager::NotifyDeletion(void *ptr)
 {
 }
@@ -388,8 +388,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
    // TODO : make a OMP loop on CPU, call threaded bcopy
    void *shm = (void *) this->ShmBufferTranslate(dest,recv);
    assert(shm!=NULL);
    //    std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl;
    acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
    acceleratorCopySynchronise(); // MPI prob slower
  }
  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
@@ -400,6 +400,9 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
  //   std::cout << "Copy Synchronised\n"<<std::endl;
  acceleratorCopySynchronise();
  int nreq=list.size();
  if (nreq==0) return;
@@ -88,6 +88,13 @@ public:
    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
    accessor.ViewClose();
  }
  // Debugging aid: ask the MemoryManager to print the state it holds for this
  // object's data pointer (_odata).
  void PrintCacheState()
  {
    void *data = this->_odata;
    MemoryManager::PrintState(data);
  }
  /////////////////////////////////////////////////////////////////////////////////
  // Return a view object that may be dereferenced in site loops.
  // The view is trivially copy constructible and may be copied to an accelerator device
@@ -42,7 +42,6 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
  std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl;
  std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl;
  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl;
  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << warpSize << std::endl;
  std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
  if (warpSize != WARP_SIZE) {
@@ -52,6 +51,10 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
  // let the number of threads in a block be a multiple of 2, starting from warpSize
  threads = warpSize;
  if ( threads*sizeofsobj > sharedMemPerBlock ) {
    std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
    exit(EXIT_FAILURE);
  }
  while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
  // keep all the streaming multiprocessors busy
  blocks = nextPow2(multiProcessorCount);
@@ -85,6 +85,76 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
  });
 }
 // Copy the sites of `full` whose parity equals `cb` into `half`, with the
 // site loop expressed through accelerator_for.
 //   cb               - parity to extract; also stored as half's Checkerboard()
 //   half             - destination lattice (opened AcceleratorWrite)
 //   full             - source lattice (opened AcceleratorRead)
 //   checker_dim_half - dimension whose coordinate is halved when computing the
 //                      destination index (default 0)
 template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0)
 {
  half.Checkerboard() = cb;
  autoView(half_v, half, AcceleratorWrite);
  autoView(full_v, full, AcceleratorRead);
  // Local copies of the grid geometry used inside the loop body below.
  Coordinate rdim_full             = full.Grid()->_rdimensions;
  Coordinate rdim_half             = half.Grid()->_rdimensions;
  unsigned long ndim_half          = half.Grid()->_ndimension;
  Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
  Coordinate ostride_half          = half.Grid()->_ostride;
  accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
    Coordinate coor;
    int cbos;
    int linear=0;
    // Coordinate of site index ss with respect to rdim_full.
    Lexicographic::CoorFromIndex(coor,ss,rdim_full);
    assert(coor.size()==ndim_half);
    // Parity of the site: sum of the coordinates in the masked dimensions, low bit.
    for(int d=0;d<ndim_half;d++){ 
      if(checker_dim_mask_half[d]) linear += coor[d];
    }
    cbos = (linear&0x1);
    if (cbos==cb) {
      // Index into the half lattice: stride-weighted sum of the coordinates,
      // with the coordinate along checker_dim_half divided by two.
      int ssh=0;
      for(int d=0;d<ndim_half;d++) {
        if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
        else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
      }
      coalescedWrite(half_v[ssh],full_v(ss));
    }
  });
 }
 // Write the sites of `half` back into the sites of `full` that have the same
 // parity as half.Checkerboard(), with the site loop expressed through
 // accelerator_for. Sites of `full` with the other parity are not written.
 //   full             - destination lattice (opened AcceleratorWrite)
 //   half             - source lattice (opened AcceleratorRead)
 //   checker_dim_half - dimension whose coordinate is halved when computing the
 //                      source index (default 0)
 template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0)
 {
  int cb = half.Checkerboard();
  autoView(half_v , half, AcceleratorRead);
  autoView(full_v , full, AcceleratorWrite);
  // Local copies of the grid geometry used inside the loop body below.
  Coordinate rdim_full             = full.Grid()->_rdimensions;
  Coordinate rdim_half             = half.Grid()->_rdimensions;
  unsigned long ndim_half          = half.Grid()->_ndimension;
  Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
  Coordinate ostride_half          = half.Grid()->_ostride;
  accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
    Coordinate coor;
    int cbos;
    int linear=0;
    // Coordinate of site index ss with respect to rdim_full.
    Lexicographic::CoorFromIndex(coor,ss,rdim_full);
    assert(coor.size()==ndim_half);
    // Parity of the site: sum of the coordinates in the masked dimensions, low bit.
    for(int d=0;d<ndim_half;d++){ 
      if(checker_dim_mask_half[d]) linear += coor[d];
    }
    cbos = (linear&0x1);
    if (cbos==cb) {
      // Index into the half lattice: stride-weighted sum of the coordinates,
      // with the coordinate along checker_dim_half divided by two.
      int ssh=0;
      for(int d=0;d<ndim_half;d++){
        if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
        else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
      }
      coalescedWrite(full_v[ss],half_v(ssh));
    }
  });
 }
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Flexible Type Conversion for internal promotion to double as well as graceful
 // treatment of scalar-compatible types
@@ -576,6 +576,8 @@ class ScidacReader : public GridLimeReader {
    std::string rec_name(ILDG_BINARY_DATA);
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
      if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) )  ) {
  // in principle should do the line below, but that breaks backward compatibility with old data
  // skipPastObjectRecord(std::string(GRID_FIELD_NORM));
 	skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
 	return;
      }
@@ -0,0 +1,240 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion.h
    Copyright (C) 2020 - 2022
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    Author: Nils Meyer <nils.meyer@ur.de>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 #include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
 // see Grid/qcd/action/fermion/WilsonCloverFermion.h for description
 //
 // Modifications done here:
 //
 // Original: clover term = 12x12 matrix per site
 //
 // But: Only two diagonal 6x6 hermitian blocks are non-zero (also true for original, verified by running)
 // Sufficient to store/transfer only the real parts of the diagonal and one triangular part
 // 2 * (6 + 15 * 2) = 72 real or 36 complex words to be stored/transferred
 //
 // Here: Above but diagonal as complex numbers, i.e., need to store/transfer
 // 2 * (6 * 2 + 15 * 2) = 84 real or 42 complex words
 //
 // Words per site and improvement compared to original (combined with the input and output spinors):
 //
 // - Original: 2*12 + 12*12 = 168 words -> 1.00 x less
 // - Minimal:  2*12 + 36    =  60 words -> 2.80 x less
 // - Here:     2*12 + 42    =  66 words -> 2.55 x less
 //
 // These improvements directly translate to wall-clock time
 //
 // Data layout:
 //
 // - diagonal and triangle part as separate lattice fields,
 //   this was faster than as 1 combined field on all tested machines
 // - diagonal: as expected
 // - triangle: store upper right triangle in row major order
 // - graphical:
 //        0  1  2  3  4
 //           5  6  7  8
 //              9 10 11 = upper right triangle indices
 //                12 13
 //                   14
 //     0
 //        1
 //           2
 //              3       = diagonal indices
 //                 4
 //                    5
 //     0
 //     1  5
 //     2  6  9          = lower left triangle indices
 //     3  7 10 12
 //     4  8 11 13 14
 //
 // Impact on total memory consumption:
 // - Original: (2 * 1 + 8 * 1/2) 12x12 matrices = 6 12x12 matrices = 864 complex words per site
 // - Here:     (2 * 1 + 4 * 1/2) diagonal parts = 4 diagonal parts =  24 complex words per site
 //           + (2 * 1 + 4 * 1/2) triangle parts = 4 triangle parts =  60 complex words per site
 //                                                                 =  84 complex words per site
 // Wilson clover fermion operator whose clover term is held as separate
 // "diagonal" and "triangle" lattice fields (together with their inverses),
 // each kept for the full lattice and for the even and odd checkerboards.
 // Combines WilsonFermion<Impl> with the clover and compact-clover helper classes.
 template<class Impl>
 class CompactWilsonCloverFermion : public WilsonFermion<Impl>,
                                    public WilsonCloverHelpers<Impl>,
                                    public CompactWilsonCloverHelpers<Impl> {
  /////////////////////////////////////////////
  // Sizes
  /////////////////////////////////////////////
 public:
  INHERIT_COMPACT_CLOVER_SIZES(Impl);
  /////////////////////////////////////////////
  // Type definitions
  /////////////////////////////////////////////
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);
  typedef WilsonFermion<Impl>              WilsonBase;
  typedef WilsonCloverHelpers<Impl>        Helpers;
  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
  /////////////////////////////////////////////
  // Constructors
  /////////////////////////////////////////////
 public:
  // _csw_r, _csw_t and _cF default to 0.0, 0.0 and 1.0; the anisotropy and
  // implementation parameters default to default-constructed objects.
  CompactWilsonCloverFermion(GaugeField& _Umu,
                             GridCartesian& Fgrid,
                             GridRedBlackCartesian& Hgrid,
                             const RealD _mass,
                             const RealD _csw_r = 0.0,
                             const RealD _csw_t = 0.0,
                             const RealD _cF = 1.0,
                             const WilsonAnisotropyCoefficients& clover_anisotropy = WilsonAnisotropyCoefficients(),
                             const ImplParams& impl_p = ImplParams());
  /////////////////////////////////////////////
  // Member functions (implementing interface)
  /////////////////////////////////////////////
 public:
  virtual void Instantiatable() {}
  int          ConstEE()     override { return 0; }
  int          isTrivialEE() override { return 0; }
  void Dhop(const FermionField& in, FermionField& out, int dag) override;
  void DhopOE(const FermionField& in, FermionField& out, int dag) override;
  void DhopEO(const FermionField& in, FermionField& out, int dag) override;
  void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
  void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;
  void M(const FermionField& in, FermionField& out) override;
  void Mdag(const FermionField& in, FermionField& out) override;
  void Meooe(const FermionField& in, FermionField& out) override;
  void MeooeDag(const FermionField& in, FermionField& out) override;
  void Mooee(const FermionField& in, FermionField& out) override;
  void MooeeDag(const FermionField& in, FermionField& out) override;
  void MooeeInv(const FermionField& in, FermionField& out) override;
  void MooeeInvDag(const FermionField& in, FermionField& out) override;
  void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
  void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;
  void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
  void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
  void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
  /////////////////////////////////////////////
  // Member functions (internals)
  /////////////////////////////////////////////
  void MooeeInternal(const FermionField&        in,
                     FermionField&              out,
                     const CloverDiagonalField& diagonal,
                     const CloverTriangleField& triangle);
  /////////////////////////////////////////////
  // Helpers
  /////////////////////////////////////////////
  void ImportGauge(const GaugeField& _Umu) override;
  /////////////////////////////////////////////
  // Helpers (private)
  /////////////////////////////////////////////
 private:
  // Select the boundary mask matching the grid of `in`: the odd or even mask
  // for a checkerboarded grid (by in.Checkerboard()), the full mask otherwise.
  // Always returns the address of a member, never nullptr.
  template<class Field>
  const MaskField* getCorrectMaskField(const Field &in) const {
    if(in.Grid()->_isCheckerBoarded) {
      if(in.Checkerboard() == Odd) {
        return &this->BoundaryMaskOdd;
      } else {
        return &this->BoundaryMaskEven;
      }
    } else {
      return &this->BoundaryMask;
    }
  }
  // Apply the boundary mask selected by getCorrectMaskField() to `f` in place.
  template<class Field>
  void ApplyBoundaryMask(Field& f) {
    const MaskField* m = getCorrectMaskField(f);
    assert(m != nullptr); // single check; the original asserted this twice
    CompactHelpers::ApplyBoundaryMask(f, *m);
  }
  /////////////////////////////////////////////
  // Member Data
  /////////////////////////////////////////////
 public:
  RealD csw_r;
  RealD csw_t;
  RealD cF;
  bool open_boundaries;
  CloverDiagonalField Diagonal,    DiagonalEven,    DiagonalOdd;
  CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
  CloverTriangleField Triangle,    TriangleEven,    TriangleOdd;
  CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
  FermionField Tmp;
  MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
 };
 NAMESPACE_END(Grid);
@@ -53,6 +53,7 @@ NAMESPACE_CHECK(Wilson);
 #include <Grid/qcd/action/fermion/WilsonTMFermion.h>       // 4d wilson like
 NAMESPACE_CHECK(WilsonTM);
 #include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
 NAMESPACE_CHECK(WilsonClover);
 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
 NAMESPACE_CHECK(Wilson5D);
@@ -153,6 +154,23 @@ typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoInd
 typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
 typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
 // Compact Clover fermions
 // One alias per implementation policy (WilsonImpl, WilsonAdjImpl,
 // WilsonTwoIndexSymmetricImpl, WilsonTwoIndexAntiSymmetricImpl); the trailing
 // R/F/D of each alias repeats the suffix of the Impl it instantiates
 // (presumably default / single / double precision -- TODO confirm).
 typedef CompactWilsonCloverFermion<WilsonImplR> CompactWilsonCloverFermionR;
 typedef CompactWilsonCloverFermion<WilsonImplF> CompactWilsonCloverFermionF;
 typedef CompactWilsonCloverFermion<WilsonImplD> CompactWilsonCloverFermionD;
 typedef CompactWilsonCloverFermion<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
 typedef CompactWilsonCloverFermion<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
 typedef CompactWilsonCloverFermion<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
 typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
 typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
 typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
 typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
 typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
 typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
 // Domain Wall fermions
 typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
 typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
@@ -4,10 +4,11 @@
    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.h
-    Copyright (C) 2017
+    Copyright (C) 2017 - 2022
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: David Preti <>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -29,7 +30,8 @@
 #pragma once
-#include <Grid/Grid.h>
+#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
@@ -50,18 +52,15 @@ NAMESPACE_BEGIN(Grid);
 //////////////////////////////////////////////////////////////////
 template <class Impl>
-class WilsonCloverFermion : public WilsonFermion<Impl>
+class WilsonCloverFermion : public WilsonFermion<Impl>,
                            public WilsonCloverHelpers<Impl>
 {
 public:
  // Types definitions
  INHERIT_IMPL_TYPES(Impl);
-  template <typename vtype>
+  INHERIT_CLOVER_TYPES(Impl);
  using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
  typedef iImplClover<Simd> SiteCloverType;
  typedef Lattice<SiteCloverType> CloverFieldType;
-public:
+  typedef WilsonFermion<Impl>       WilsonBase;
-  typedef WilsonFermion<Impl> WilsonBase;
+  typedef WilsonCloverHelpers<Impl> Helpers;
  virtual int    ConstEE(void)     { return 0; };
  virtual void Instantiatable(void){};
@@ -72,42 +71,7 @@ public:
                      const RealD _csw_r = 0.0,
                      const RealD _csw_t = 0.0,
                      const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(),
-                      const ImplParams &impl_p = ImplParams()) : WilsonFermion<Impl>(_Umu,
+                      const ImplParams &impl_p = ImplParams());
                                                                                     Fgrid,
                                                                                     Hgrid,
                                                                                     _mass, impl_p, clover_anisotropy),
                                                                 CloverTerm(&Fgrid),
                                                                 CloverTermInv(&Fgrid),
                                                                 CloverTermEven(&Hgrid),
                                                                 CloverTermOdd(&Hgrid),
                                                                 CloverTermInvEven(&Hgrid),
                                                                 CloverTermInvOdd(&Hgrid),
                                                                 CloverTermDagEven(&Hgrid),
                                                                 CloverTermDagOdd(&Hgrid),
                                                                 CloverTermInvDagEven(&Hgrid),
                                                                 CloverTermInvDagOdd(&Hgrid)
  {
    assert(Nd == 4); // require 4 dimensions
    if (clover_anisotropy.isAnisotropic)
    {
      csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0;
      diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
    }
    else
    {
      csw_r = _csw_r * 0.5;
      diag_mass = 4.0 + _mass;
    }
    csw_t = _csw_t * 0.5;
    if (csw_r == 0)
      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
    if (csw_t == 0)
      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
    ImportGauge(_Umu);
  }
  virtual void M(const FermionField &in, FermionField &out);
  virtual void Mdag(const FermionField &in, FermionField &out);
@@ -124,250 +88,21 @@ public:
  void ImportGauge(const GaugeField &_Umu);
  // Derivative parts unpreconditioned pseudofermions
-  void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
+  void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag);
  // Body of the in-class MDeriv: zeroes `force`, adds the Wilson hopping-term
  // derivative via DhopDeriv, then accumulates the clover-term derivative built
  // from the outer product of X and Y and adds it to `force`.
  {
    conformable(X.Grid(), Y.Grid());
    conformable(X.Grid(), force.Grid());
    GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
    GaugeField clover_force(force.Grid());
    PropagatorField Lambda(force.Grid());
-    // Guido: Here we are hitting some performance issues:
+public:
    // need to extract the components of the DoubledGaugeField
    // for each call
    // Possible solution
    // Create a vector object to store them? (cons: wasting space)
    std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
    Impl::extractLinkField(U, this->Umu);
    force = Zero();
    // Derivative of the Wilson hopping term
    this->DhopDeriv(force, X, Y, dag);
    ///////////////////////////////////////////////////////////
    // Clover term derivative
    ///////////////////////////////////////////////////////////
    Impl::outerProductImpl(Lambda, X, Y);
    //std::cout << "Lambda:" << Lambda << std::endl;
    // Twelve off-diagonal sigma matrices in the order the (mu, nu) loop below
    // visits them; `count` walks this table.
    Gamma::Algebra sigma[] = {
        Gamma::Algebra::SigmaXY,
        Gamma::Algebra::SigmaXZ,
        Gamma::Algebra::SigmaXT,
        Gamma::Algebra::MinusSigmaXY,
        Gamma::Algebra::SigmaYZ,
        Gamma::Algebra::SigmaYT,
        Gamma::Algebra::MinusSigmaXZ,
        Gamma::Algebra::MinusSigmaYZ,
        Gamma::Algebra::SigmaZT,
        Gamma::Algebra::MinusSigmaXT,
        Gamma::Algebra::MinusSigmaYT,
        Gamma::Algebra::MinusSigmaZT};
    /*
      sigma_{\mu \nu}=
      | 0         sigma[0]  sigma[1]  sigma[2] |
      | sigma[3]    0       sigma[4]  sigma[5] |
      | sigma[6]  sigma[7]     0      sigma[8] |
      | sigma[9]  sigma[10] sigma[11]   0      |
    */
    int count = 0;
    clover_force = Zero();
    for (int mu = 0; mu < 4; mu++)
    {
      force_mu = Zero();
      for (int nu = 0; nu < 4; nu++)
      {
        // Diagonal entries have no sigma matrix; skipping them keeps `count`
        // aligned with the 12-entry table above.
        if (mu == nu)
        continue;
        RealD factor;
        // NOTE(review): mu and nu only take the values 0..3 in the enclosing
        // loops, so this condition is never true and csw_t is never applied
        // here. The temporal index is presumably 3 (Nd - 1) -- confirm and fix.
        if (nu == 4 || mu == 4)
        {
          factor = 2.0 * csw_t;
        }
        else
        {
          factor = 2.0 * csw_r;
        }
        PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
        Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
        force_mu -= factor*Cmunu(U, lambda, mu, nu);                   // checked
        count++;
      }
      pokeLorentz(clover_force, U[mu] * force_mu, mu);
    }
    //clover_force *= csw;
    force += clover_force;
  }
  // Evaluates C_{\mu \nu}(x), following Eq.(B.39) of Zbigniew Sroczynski's PhD thesis.
  // Four terms (C1+ .. C4+) with lambda inserted into the upper staple are
  // added, and four terms (C1- .. C4-) with lambda inserted into the lower
  // staple are subtracted. U holds one link field per direction; lambda must
  // live on the same grid as U[0].
  GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
  {
    conformable(lambda.Grid(), U[0].Grid());
    GaugeLinkField result(lambda.Grid());
    GaugeLinkField scratch(lambda.Grid());

    // ---- upper staple ----
    // TODO (carried over from the original): check redundancy of shift operations

    // C1+
    scratch = lambda * U[nu];
    result = Impl::ShiftStaple(
               Impl::CovShiftForward(scratch, nu,
                 Impl::CovShiftBackward(U[mu], mu,
                   Impl::CovShiftIdentityBackward(U[nu], nu))),
               mu);

    // C2+
    scratch = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
    result += Impl::ShiftStaple(
                Impl::CovShiftForward(U[nu], nu,
                  Impl::CovShiftBackward(scratch, mu,
                    Impl::CovShiftIdentityBackward(U[nu], nu))),
                mu);

    // C3+
    scratch = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
    result += Impl::ShiftStaple(
                Impl::CovShiftForward(U[nu], nu,
                  Impl::CovShiftBackward(U[mu], mu,
                    Impl::CovShiftIdentityBackward(scratch, nu))),
                mu);

    // C4+
    result += Impl::ShiftStaple(
                Impl::CovShiftForward(U[nu], nu,
                  Impl::CovShiftBackward(U[mu], mu,
                    Impl::CovShiftIdentityBackward(U[nu], nu))),
                mu) * lambda;

    // ---- lower staple ----

    // C1-
    result -= Impl::ShiftStaple(lambda, mu)
            * Impl::ShiftStaple(
                Impl::CovShiftBackward(U[nu], nu,
                  Impl::CovShiftBackward(U[mu], mu, U[nu])),
                mu);

    // C2-
    scratch = adj(lambda) * U[nu];
    result -= Impl::ShiftStaple(
                Impl::CovShiftBackward(scratch, nu,
                  Impl::CovShiftBackward(U[mu], mu, U[nu])),
                mu);

    // C3-
    scratch = lambda * U[nu];
    result -= Impl::ShiftStaple(
                Impl::CovShiftBackward(U[nu], nu,
                  Impl::CovShiftBackward(U[mu], mu, scratch)),
                mu);

    // C4-
    result -= Impl::ShiftStaple(
                Impl::CovShiftBackward(U[nu], nu,
                  Impl::CovShiftBackward(U[mu], mu, U[nu])),
                mu) * lambda;

    return result;
  }
 protected:
  // here fixing the 4 dimensions, make it more general?
  RealD csw_r;                                               // Clover coefficient - spatial
  RealD csw_t;                                               // Clover coefficient - temporal
  RealD diag_mass;                                           // Mass term
-  CloverFieldType CloverTerm, CloverTermInv;                 // Clover term
+  CloverField CloverTerm, CloverTermInv;                     // Clover term
-  CloverFieldType CloverTermEven, CloverTermOdd;             // Clover term EO
+  CloverField CloverTermEven, CloverTermOdd;                 // Clover term EO
-  CloverFieldType CloverTermInvEven, CloverTermInvOdd;       // Clover term Inv EO
+  CloverField CloverTermInvEven, CloverTermInvOdd;           // Clover term Inv EO
-  CloverFieldType CloverTermDagEven, CloverTermDagOdd;       // Clover term Dag EO
+  CloverField CloverTermDagEven, CloverTermDagOdd;           // Clover term Dag EO
-  CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
+  CloverField CloverTermInvDagEven, CloverTermInvDagOdd;     // Clover term Inv Dag EO
 public:
  // eventually these can be compressed into 6x6 blocks instead of the 12x12
  // using the DeGrand-Rossi basis for the gamma matrices
  // Returns a clover field on F's grid that is zero except for the spin
  // entries (0,1), (1,0), (2,3) and (3,2), each set to -i * F at every site
  // (presumably the YZ field-strength contribution, per the function name).
  // The site loop runs over T's own grid, i.e. the grid both T and F are
  // indexed on, rather than over the grid of the CloverTerm member.
  CloverFieldType fillCloverYZ(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView(T_v,T,AcceleratorWrite);
    autoView(F_v,F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
      T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
      T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
    });
    return T;
  }
  // Returns a clover field on F's grid that is zero except for the spin
  // entries (0,1) = -F, (1,0) = +F, (2,3) = -F and (3,2) = +F at every site
  // (presumably the XZ field-strength contribution, per the function name).
  // The site loop runs over T's own grid, i.e. the grid both T and F are
  // indexed on, rather than over the grid of the CloverTerm member.
  CloverFieldType fillCloverXZ(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView(T_v, T,AcceleratorWrite);
    autoView(F_v, F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = -F_v[i]()();
      T_v[i]()(1, 0) = F_v[i]()();
      T_v[i]()(2, 3) = -F_v[i]()();
      T_v[i]()(3, 2) = F_v[i]()();
    });
    return T;
  }
  // Returns a clover field on F's grid that is zero except for the diagonal
  // spin entries (0,0) = -i F, (1,1) = +i F, (2,2) = -i F and (3,3) = +i F at
  // every site (presumably the XY field-strength contribution, per the name).
  // The site loop runs over T's own grid, i.e. the grid both T and F are
  // indexed on, rather than over the grid of the CloverTerm member.
  CloverFieldType fillCloverXY(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView(T_v,T,AcceleratorWrite);
    autoView(F_v,F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),1,
    {
      T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
      T_v[i]()(1, 1) = timesI(F_v[i]()());
      T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 3) = timesI(F_v[i]()());
    });
    return T;
  }
  // Returns a clover field on F's grid that is zero except for the spin
  // entries (0,1) = +i F, (1,0) = +i F, (2,3) = -i F and (3,2) = -i F at every
  // site (presumably the XT field-strength contribution, per the name).
  // The site loop runs over T's own grid, i.e. the grid both T and F are
  // indexed on, rather than over the grid of the CloverTerm member.
  CloverFieldType fillCloverXT(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView( T_v , T, AcceleratorWrite);
    autoView( F_v , F, AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = timesI(F_v[i]()());
      T_v[i]()(1, 0) = timesI(F_v[i]()());
      T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
    });
    return T;
  }
  // Returns a clover field on F's grid that is zero except for the spin
  // entries (0,1) = -F, (1,0) = +F, (2,3) = +F and (3,2) = -F at every site
  // (presumably the YT field-strength contribution, per the function name).
  // The site loop runs over T's own grid, i.e. the grid both T and F are
  // indexed on, rather than over the grid of the CloverTerm member.
  CloverFieldType fillCloverYT(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView( T_v ,T,AcceleratorWrite);
    autoView( F_v ,F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = -(F_v[i]()());
      T_v[i]()(1, 0) = (F_v[i]()());
      T_v[i]()(2, 3) = (F_v[i]()());
      T_v[i]()(3, 2) = -(F_v[i]()());
    });
    return T;
  }
  // Returns a clover field on F's grid that is zero except for the diagonal
  // spin entries (0,0) = +i F, (1,1) = -i F, (2,2) = -i F and (3,3) = +i F at
  // every site (presumably the ZT field-strength contribution, per the name).
  // The site loop runs over T's own grid, i.e. the grid both T and F are
  // indexed on, rather than over the grid of the CloverTerm member.
  CloverFieldType fillCloverZT(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView( T_v , T,AcceleratorWrite);
    autoView( F_v , F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),1,
    {
      T_v[i]()(0, 0) = timesI(F_v[i]()());
      T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
      T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 3) = timesI(F_v[i]()());
    });
    return T;
  }
 };
 NAMESPACE_END(Grid);
@@ -0,0 +1,761 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/WilsonCloverHelpers.h
    Copyright (C) 2021 - 2022
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 // Helper routines that implement common clover functionality
 NAMESPACE_BEGIN(Grid);
 // Stateless collection of clover building blocks. The class has no data
 // members and every function is static, so the routines can be called through
 // the class name or from a class that inherits from it.
 template<class Impl> class WilsonCloverHelpers {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
  // Four terms (C1+ .. C4+) with lambda inserted into the upper staple are
  // added and four terms (C1- .. C4-) with lambda inserted into the lower
  // staple are subtracted. U holds one link field per direction; lambda must
  // live on the same grid as U[0].
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
  {
    conformable(lambda.Grid(), U[0].Grid());
    GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());
    // insertion in upper staple
    // please check redundancy of shift operations
    // C1+
    tmp = lambda * U[nu];
    out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
    // C2+
    tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
    // C3+
    tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
    // C4+
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
    // insertion in lower staple
    // C1-
    out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
    // C2-
    tmp = adj(lambda) * U[nu];
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
    // C3-
    tmp = lambda * U[nu];
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
    // C4-
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
    return out;
  }
  // The six fillClover* routines share one shape: create a zeroed CloverField
  // on F's grid, then for every outer site write four spin entries derived
  // from F (plain, negated, or multiplied by +/- i) and return the field.
  // The two-letter suffix presumably names the field-strength plane.

  // Sets spin entries (0,1), (1,0), (2,3), (3,2) to -i * F.
  static CloverField fillCloverYZ(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView(T_v,T,AcceleratorWrite);
    autoView(F_v,F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 1), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 0), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 3), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 2), coalescedRead(timesMinusI(F_v[i]()())));
    });
    return T;
  }
  // Sets spin entries (0,1) = -F, (1,0) = +F, (2,3) = -F, (3,2) = +F.
  static CloverField fillCloverXZ(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView(T_v, T,AcceleratorWrite);
    autoView(F_v, F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 1), coalescedRead(-F_v[i]()()));
      coalescedWrite(T_v[i]()(1, 0), coalescedRead(F_v[i]()()));
      coalescedWrite(T_v[i]()(2, 3), coalescedRead(-F_v[i]()()));
      coalescedWrite(T_v[i]()(3, 2), coalescedRead(F_v[i]()()));
    });
    return T;
  }
  // Sets diagonal spin entries (0,0) = -i F, (1,1) = +i F, (2,2) = -i F, (3,3) = +i F.
  static CloverField fillCloverXY(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView(T_v,T,AcceleratorWrite);
    autoView(F_v,F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 0), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 1), coalescedRead(timesI(F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 2), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 3), coalescedRead(timesI(F_v[i]()())));
    });
    return T;
  }
  // Sets spin entries (0,1) = +i F, (1,0) = +i F, (2,3) = -i F, (3,2) = -i F.
  static CloverField fillCloverXT(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView( T_v , T, AcceleratorWrite);
    autoView( F_v , F, AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 1), coalescedRead(timesI(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 0), coalescedRead(timesI(F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 3), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 2), coalescedRead(timesMinusI(F_v[i]()())));
    });
    return T;
  }
  // Sets spin entries (0,1) = -F, (1,0) = +F, (2,3) = +F, (3,2) = -F.
  static CloverField fillCloverYT(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView( T_v ,T,AcceleratorWrite);
    autoView( F_v ,F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 1), coalescedRead(-(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 0), coalescedRead((F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 3), coalescedRead((F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 2), coalescedRead(-(F_v[i]()())));
    });
    return T;
  }
  // Sets diagonal spin entries (0,0) = +i F, (1,1) = -i F, (2,2) = -i F, (3,3) = +i F.
  static CloverField fillCloverZT(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView( T_v , T,AcceleratorWrite);
    autoView( F_v , F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 0), coalescedRead(timesI(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 1), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 2), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 3), coalescedRead(timesI(F_v[i]()())));
    });
    return T;
  }
  // Single-site kernel: reads the clover site C with coalescedRead and passes
  // it with chi to mult(), which writes phi (presumably phi = C * chi --
  // confirm against mult's definition).
  template<class _Spinor>
  static accelerator_inline void multClover(_Spinor& phi, const SiteClover& C, const _Spinor& chi) {
    auto CC = coalescedRead(C);
    mult(&phi, &CC, &chi);
  }
  // Field-level wrapper: applies multClover to every outer site of `out`,
  // reading C and phi and writing out. Static like the rest of the class: it
  // touches no instance state, and callers that already invoke it through an
  // object or from a derived class keep compiling unchanged.
  template<class _SpinorField>
  static inline void multCloverField(_SpinorField& out, const CloverField& C, const _SpinorField& phi) {
    const int Nsimd = SiteSpinor::Nsimd();
    autoView(out_v, out, AcceleratorWrite);
    autoView(phi_v, phi, AcceleratorRead);
    autoView(C_v,   C,   AcceleratorRead);
    // Per-lane spinor type as produced by coalescedRead on the output view.
    typedef decltype(coalescedRead(out_v[0])) calcSpinor;
    accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
      calcSpinor tmp;
      multClover(tmp,C_v[sss],phi_v(sss));
      coalescedWrite(out_v[sss],tmp);
    });
  }
 };
 template<class Impl> class CompactWilsonCloverHelpers {
 public:
  INHERIT_COMPACT_CLOVER_SIZES(Impl);
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);
  // Returns element (i, j), i != j, of clover block `block` from the packed
  // triangle storage. triangle_index() maps (i, j) and (j, i) to the same
  // slot; the stored value is returned as-is for i < j and conjugated for
  // i > j. Calling with i == j is a programming error (asserted).
  template<typename vobj>
  static accelerator_inline vobj triangle_elem(const iImplCloverTriangle<vobj>& triangle, int block, int i, int j) {
    assert(i != j);
    if(i < j) {
      return triangle()(block)(triangle_index(i, j));
    } else { // i > j
      return conjugate(triangle()(block)(triangle_index(i, j)));
    }
  }
  // Maps an index pair (i, j) to a slot in the packed triangle storage. The
  // result is symmetric in i and j: the pair is ordered first and the same
  // formula is applied to (smaller, larger). Equal indices return 0.
  static accelerator_inline int triangle_index(int i, int j) {
    if(i == j) return 0;
    const int lo = (i < j) ? i : j;
    const int hi = (i < j) ? j : i;
    return Nred * (Nred - 1) / 2 - (Nred - lo) * (Nred - lo - 1) / 2 + hi - lo - 1;
  }
  static void MooeeKernel_gpu(int                        Nsite,
                              int                        Ls,
                              const FermionField&        in,
                              FermionField&              out,
                              const CloverDiagonalField& diagonal,
                              const CloverTriangleField& triangle) {
    autoView(diagonal_v, diagonal, AcceleratorRead);
    autoView(triangle_v, triangle, AcceleratorRead);
    autoView(in_v,       in,       AcceleratorRead);
    autoView(out_v,      out,      AcceleratorWrite);
    typedef decltype(coalescedRead(out_v[0])) CalcSpinor;
    const uint64_t NN = Nsite * Ls;
    accelerator_for(ss, NN, Simd::Nsimd(), {
      int sF = ss;
      int sU = ss/Ls;
      CalcSpinor res;
      CalcSpinor in_t = in_v(sF);
      auto diagonal_t = diagonal_v(sU);
      auto triangle_t = triangle_v(sU);
      for(int block=0; block<Nhs; block++) {
        int s_start = block*Nhs;
        for(int i=0; i<Nred; i++) {
          int si = s_start + i/Nc, ci = i%Nc;
          res()(si)(ci) = diagonal_t()(block)(i) * in_t()(si)(ci);
          for(int j=0; j<Nred; j++) {
            if (j == i) continue;
            int sj = s_start + j/Nc, cj = j%Nc;
            res()(si)(ci) = res()(si)(ci) + triangle_elem(triangle_t, block, i, j) * in_t()(sj)(cj);
          };
        };
      };
      coalescedWrite(out_v[sF], res);
    });
  }
  static void MooeeKernel_cpu(int                        Nsite,
                              int                        Ls,
                              const FermionField&        in,
                              FermionField&              out,
                              const CloverDiagonalField& diagonal,
                              const CloverTriangleField& triangle) {
    autoView(diagonal_v, diagonal, CpuRead);
    autoView(triangle_v, triangle, CpuRead);
    autoView(in_v,       in,       CpuRead);
    autoView(out_v,      out,      CpuWrite);
    typedef SiteSpinor CalcSpinor;
 #if defined(A64FX) || defined(A64FXFIXEDSIZE)
 #define PREFETCH_CLOVER(BASE) {                                     \
    uint64_t base;                                                  \
    int pf_dist_L1 = 1;                                             \
    int pf_dist_L2 = -5; /* -> penalty -> disable */                \
                                                                    \
    if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) {           \
      base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0);               \
      svprfd(svptrue_b64(), (int64_t*)(base +    0), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  256), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  512), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  768), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL1STRM); \
    }                                                               \
                                                                    \
    if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) {           \
      base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0);               \
      svprfd(svptrue_b64(), (int64_t*)(base +    0), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  256), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  512), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  768), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL2STRM); \
    }                                                               \
  }
 // TODO: Implement/generalize this for other architectures
 // I played around a bit on KNL (see below) but didn't bring anything
 // #elif defined(AVX512)
 // #define PREFETCH_CLOVER(BASE) {                              \
 //     uint64_t base;                                           \
 //     int pf_dist_L1 = 1;                                      \
 //     int pf_dist_L2 = +4;                                     \
 //                                                              \
 //     if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) {    \
 //       base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0);        \
 //       _mm_prefetch((const char*)(base +    0), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +   64), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +  128), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +  192), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +  256), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +  320), _MM_HINT_T0); \
 //     }                                                        \
 //                                                              \
 //     if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) {    \
 //       base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0);        \
 //       _mm_prefetch((const char*)(base +    0), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +   64), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +  128), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +  192), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +  256), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +  320), _MM_HINT_T1); \
 //     }                                                        \
 //   }
 #else
 #define PREFETCH_CLOVER(BASE)
 #endif
    const uint64_t NN = Nsite * Ls;
    thread_for(ss, NN, {
      int sF = ss;
      int sU = ss/Ls;
      CalcSpinor res;
      CalcSpinor in_t = in_v[sF];
      auto diag_t     = diagonal_v[sU]; // "diag" instead of "diagonal" here to make code below easier to read
      auto triangle_t = triangle_v[sU];
      // upper half
      PREFETCH_CLOVER(0);
      auto in_cc_0_0 = conjugate(in_t()(0)(0)); // Nils: reduces number
      auto in_cc_0_1 = conjugate(in_t()(0)(1)); // of conjugates from
      auto in_cc_0_2 = conjugate(in_t()(0)(2)); // 30 to 20
      auto in_cc_1_0 = conjugate(in_t()(1)(0));
      auto in_cc_1_1 = conjugate(in_t()(1)(1));
      res()(0)(0) =               diag_t()(0)( 0) * in_t()(0)(0)
                  +           triangle_t()(0)( 0) * in_t()(0)(1)
                  +           triangle_t()(0)( 1) * in_t()(0)(2)
                  +           triangle_t()(0)( 2) * in_t()(1)(0)
                  +           triangle_t()(0)( 3) * in_t()(1)(1)
                  +           triangle_t()(0)( 4) * in_t()(1)(2);
      res()(0)(1) =           triangle_t()(0)( 0) * in_cc_0_0;
      res()(0)(1) =               diag_t()(0)( 1) * in_t()(0)(1)
                  +           triangle_t()(0)( 5) * in_t()(0)(2)
                  +           triangle_t()(0)( 6) * in_t()(1)(0)
                  +           triangle_t()(0)( 7) * in_t()(1)(1)
                  +           triangle_t()(0)( 8) * in_t()(1)(2)
                  + conjugate(       res()(0)( 1));
      res()(0)(2) =           triangle_t()(0)( 1) * in_cc_0_0
                  +           triangle_t()(0)( 5) * in_cc_0_1;
      res()(0)(2) =               diag_t()(0)( 2) * in_t()(0)(2)
                  +           triangle_t()(0)( 9) * in_t()(1)(0)
                  +           triangle_t()(0)(10) * in_t()(1)(1)
                  +           triangle_t()(0)(11) * in_t()(1)(2)
                  + conjugate(       res()(0)( 2));
      res()(1)(0) =           triangle_t()(0)( 2) * in_cc_0_0
                  +           triangle_t()(0)( 6) * in_cc_0_1
                  +           triangle_t()(0)( 9) * in_cc_0_2;
      res()(1)(0) =               diag_t()(0)( 3) * in_t()(1)(0)
                  +           triangle_t()(0)(12) * in_t()(1)(1)
                  +           triangle_t()(0)(13) * in_t()(1)(2)
                  + conjugate(       res()(1)( 0));
      res()(1)(1) =           triangle_t()(0)( 3) * in_cc_0_0
                  +           triangle_t()(0)( 7) * in_cc_0_1
                  +           triangle_t()(0)(10) * in_cc_0_2
                  +           triangle_t()(0)(12) * in_cc_1_0;
      res()(1)(1) =               diag_t()(0)( 4) * in_t()(1)(1)
                  +           triangle_t()(0)(14) * in_t()(1)(2)
                  + conjugate(       res()(1)( 1));
      res()(1)(2) =           triangle_t()(0)( 4) * in_cc_0_0
                  +           triangle_t()(0)( 8) * in_cc_0_1
                  +           triangle_t()(0)(11) * in_cc_0_2
                  +           triangle_t()(0)(13) * in_cc_1_0
                  +           triangle_t()(0)(14) * in_cc_1_1;
      res()(1)(2) =               diag_t()(0)( 5) * in_t()(1)(2)
                  + conjugate(       res()(1)( 2));
      vstream(out_v[sF]()(0)(0), res()(0)(0));
      vstream(out_v[sF]()(0)(1), res()(0)(1));
      vstream(out_v[sF]()(0)(2), res()(0)(2));
      vstream(out_v[sF]()(1)(0), res()(1)(0));
      vstream(out_v[sF]()(1)(1), res()(1)(1));
      vstream(out_v[sF]()(1)(2), res()(1)(2));
      // lower half
      PREFETCH_CLOVER(1);
      auto in_cc_2_0 = conjugate(in_t()(2)(0));
      auto in_cc_2_1 = conjugate(in_t()(2)(1));
      auto in_cc_2_2 = conjugate(in_t()(2)(2));
      auto in_cc_3_0 = conjugate(in_t()(3)(0));
      auto in_cc_3_1 = conjugate(in_t()(3)(1));
      res()(2)(0) =               diag_t()(1)( 0) * in_t()(2)(0)
                  +           triangle_t()(1)( 0) * in_t()(2)(1)
                  +           triangle_t()(1)( 1) * in_t()(2)(2)
                  +           triangle_t()(1)( 2) * in_t()(3)(0)
                  +           triangle_t()(1)( 3) * in_t()(3)(1)
                  +           triangle_t()(1)( 4) * in_t()(3)(2);
      res()(2)(1) =           triangle_t()(1)( 0) * in_cc_2_0;
      res()(2)(1) =               diag_t()(1)( 1) * in_t()(2)(1)
                  +           triangle_t()(1)( 5) * in_t()(2)(2)
                  +           triangle_t()(1)( 6) * in_t()(3)(0)
                  +           triangle_t()(1)( 7) * in_t()(3)(1)
                  +           triangle_t()(1)( 8) * in_t()(3)(2)
                  + conjugate(       res()(2)( 1));
      res()(2)(2) =           triangle_t()(1)( 1) * in_cc_2_0
                  +           triangle_t()(1)( 5) * in_cc_2_1;
      res()(2)(2) =               diag_t()(1)( 2) * in_t()(2)(2)
                  +           triangle_t()(1)( 9) * in_t()(3)(0)
                  +           triangle_t()(1)(10) * in_t()(3)(1)
                  +           triangle_t()(1)(11) * in_t()(3)(2)
                  + conjugate(       res()(2)( 2));
      res()(3)(0) =           triangle_t()(1)( 2) * in_cc_2_0
                  +           triangle_t()(1)( 6) * in_cc_2_1
                  +           triangle_t()(1)( 9) * in_cc_2_2;
      res()(3)(0) =               diag_t()(1)( 3) * in_t()(3)(0)
                  +           triangle_t()(1)(12) * in_t()(3)(1)
                  +           triangle_t()(1)(13) * in_t()(3)(2)
                  + conjugate(       res()(3)( 0));
      res()(3)(1) =           triangle_t()(1)( 3) * in_cc_2_0
                  +           triangle_t()(1)( 7) * in_cc_2_1
                  +           triangle_t()(1)(10) * in_cc_2_2
                  +           triangle_t()(1)(12) * in_cc_3_0;
      res()(3)(1) =               diag_t()(1)( 4) * in_t()(3)(1)
                  +           triangle_t()(1)(14) * in_t()(3)(2)
                  + conjugate(       res()(3)( 1));
      res()(3)(2) =           triangle_t()(1)( 4) * in_cc_2_0
                  +           triangle_t()(1)( 8) * in_cc_2_1
                  +           triangle_t()(1)(11) * in_cc_2_2
                  +           triangle_t()(1)(13) * in_cc_3_0
                  +           triangle_t()(1)(14) * in_cc_3_1;
      res()(3)(2) =               diag_t()(1)( 5) * in_t()(3)(2)
                  + conjugate(       res()(3)( 2));
      vstream(out_v[sF]()(2)(0), res()(2)(0));
      vstream(out_v[sF]()(2)(1), res()(2)(1));
      vstream(out_v[sF]()(2)(2), res()(2)(2));
      vstream(out_v[sF]()(3)(0), res()(3)(0));
      vstream(out_v[sF]()(3)(1), res()(3)(1));
      vstream(out_v[sF]()(3)(2), res()(3)(2));
    });
  }
  // Entry point for applying the compact clover term: forwards all arguments
  // unchanged to the CPU kernel on host builds and to the GPU kernel when
  // Grid is compiled for CUDA or HIP.
  static void MooeeKernel(int                        Nsite,
                          int                        Ls,
                          const FermionField&        in,
                          FermionField&              out,
                          const CloverDiagonalField& diagonal,
                          const CloverTriangleField& triangle) {
 #if !defined(GRID_CUDA) && !defined(GRID_HIP)
    MooeeKernel_cpu(Nsite, Ls, in, out, diagonal, triangle);
 #else
    MooeeKernel_gpu(Nsite, Ls, in, out, diagonal, triangle);
 #endif
  }
  static void Invert(const CloverDiagonalField& diagonal,
                     const CloverTriangleField& triangle,
                     CloverDiagonalField&       diagonalInv,
                     CloverTriangleField&       triangleInv) {
    conformable(diagonal, diagonalInv);
    conformable(triangle, triangleInv);
    conformable(diagonal, triangle);
    diagonalInv.Checkerboard() = diagonal.Checkerboard();
    triangleInv.Checkerboard() = triangle.Checkerboard();
    GridBase* grid = diagonal.Grid();
    long lsites = grid->lSites();
    typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
    typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
    autoView(diagonal_v,  diagonal,  CpuRead);
    autoView(triangle_v,  triangle,  CpuRead);
    autoView(diagonalInv_v, diagonalInv, CpuWrite);
    autoView(triangleInv_v, triangleInv, CpuWrite);
    thread_for(site, lsites, { // NOTE: Not on GPU because of Eigen & (peek/poke)LocalSite
      Eigen::MatrixXcd clover_inv_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
      Eigen::MatrixXcd clover_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
      scalar_object_diagonal diagonal_tmp     = Zero();
      scalar_object_diagonal diagonal_inv_tmp = Zero();
      scalar_object_triangle triangle_tmp     = Zero();
      scalar_object_triangle triangle_inv_tmp = Zero();
      Coordinate lcoor;
      grid->LocalIndexToLocalCoor(site, lcoor);
      peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
      peekLocalSite(triangle_tmp, triangle_v, lcoor);
      // TODO: can we save time here by inverting the two 6x6 hermitian matrices separately?
      for (long s_row=0;s_row<Ns;s_row++) {
        for (long s_col=0;s_col<Ns;s_col++) {
          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
          int block       = s_row / Nhs;
          int s_row_block = s_row % Nhs;
          int s_col_block = s_col % Nhs;
          for (long c_row=0;c_row<Nc;c_row++) {
            for (long c_col=0;c_col<Nc;c_col++) {
              int i = s_row_block * Nc + c_row;
              int j = s_col_block * Nc + c_col;
              if(i == j)
                clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
              else
                clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(triangle_elem(triangle_tmp, block, i, j)));
            }
          }
        }
      }
      clover_inv_eigen = clover_eigen.inverse();
      for (long s_row=0;s_row<Ns;s_row++) {
        for (long s_col=0;s_col<Ns;s_col++) {
          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
          int block       = s_row / Nhs;
          int s_row_block = s_row % Nhs;
          int s_col_block = s_col % Nhs;
          for (long c_row=0;c_row<Nc;c_row++) {
            for (long c_col=0;c_col<Nc;c_col++) {
              int i = s_row_block * Nc + c_row;
              int j = s_col_block * Nc + c_col;
              if(i == j)
                diagonal_inv_tmp()(block)(i) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
              else if(i < j)
                triangle_inv_tmp()(block)(triangle_index(i, j)) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
              else
                continue;
            }
          }
        }
      }
      pokeLocalSite(diagonal_inv_tmp, diagonalInv_v, lcoor);
      pokeLocalSite(triangle_inv_tmp, triangleInv_v, lcoor);
    });
  }
  static void ConvertLayout(const CloverField&   full,
                            CloverDiagonalField& diagonal,
                            CloverTriangleField& triangle) {
    conformable(full, diagonal);
    conformable(full, triangle);
    diagonal.Checkerboard() = full.Checkerboard();
    triangle.Checkerboard() = full.Checkerboard();
    autoView(full_v,     full,     AcceleratorRead);
    autoView(diagonal_v, diagonal, AcceleratorWrite);
    autoView(triangle_v, triangle, AcceleratorWrite);
    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
    accelerator_for(ss, full.Grid()->oSites(), 1, {
      for(int s_row = 0; s_row < Ns; s_row++) {
        for(int s_col = 0; s_col < Ns; s_col++) {
          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
          int block       = s_row / Nhs;
          int s_row_block = s_row % Nhs;
          int s_col_block = s_col % Nhs;
          for(int c_row = 0; c_row < Nc; c_row++) {
            for(int c_col = 0; c_col < Nc; c_col++) {
              int i = s_row_block * Nc + c_row;
              int j = s_col_block * Nc + c_col;
              if(i == j)
                diagonal_v[ss]()(block)(i) = full_v[ss]()(s_row, s_col)(c_row, c_col);
              else if(i < j)
                triangle_v[ss]()(block)(triangle_index(i, j)) = full_v[ss]()(s_row, s_col)(c_row, c_col);
              else
                continue;
            }
          }
        }
      }
    });
  }
  static void ConvertLayout(const CloverDiagonalField& diagonal,
                            const CloverTriangleField& triangle,
                            CloverField&               full) {
    conformable(full, diagonal);
    conformable(full, triangle);
    full.Checkerboard() = diagonal.Checkerboard();
    full = Zero();
    autoView(diagonal_v, diagonal, AcceleratorRead);
    autoView(triangle_v, triangle, AcceleratorRead);
    autoView(full_v,     full,     AcceleratorWrite);
    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
    accelerator_for(ss, full.Grid()->oSites(), 1, {
      for(int s_row = 0; s_row < Ns; s_row++) {
        for(int s_col = 0; s_col < Ns; s_col++) {
          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
          int block       = s_row / Nhs;
          int s_row_block = s_row % Nhs;
          int s_col_block = s_col % Nhs;
          for(int c_row = 0; c_row < Nc; c_row++) {
            for(int c_col = 0; c_col < Nc; c_col++) {
              int i = s_row_block * Nc + c_row;
              int j = s_col_block * Nc + c_col;
              if(i == j)
                full_v[ss]()(s_row, s_col)(c_row, c_col) = diagonal_v[ss]()(block)(i);
              else
                full_v[ss]()(s_row, s_col)(c_row, c_col) = triangle_elem(triangle_v[ss], block, i, j);
            }
          }
        }
      }
    });
  }
  // Adjust the compact clover term on and next to the temporal boundaries.
  //
  // On time slices 0 and T-1 the triangle part is set to zero and every
  // diagonal entry is set to (-csw_t + diag_mass). If cF differs from 1, the
  // diagonal on time slices 1 and T-2 is additionally shifted by (cF - 1).
  static void ModifyBoundaries(CloverDiagonalField& diagonal, CloverTriangleField& triangle, RealD csw_t, RealD cF, RealD diag_mass) {
    // Checks/grid
    double t0 = usecond();
    conformable(diagonal, triangle);
    GridBase* grid = diagonal.Grid();

    // Time coordinate of every site and the global temporal extent
    double t1 = usecond();
    int time_dir = Nd - 1;
    Lattice<iScalar<vInteger>> time_coor(grid);
    LatticeCoordinate(time_coor, time_dir);
    int Nt = grid->GlobalDimensions()[time_dir];

    // Off-diagonal part vanishes on both boundary slices
    double t2 = usecond();
    CloverTriangleField triangle_zero(grid);
    triangle_zero.Checkerboard() = triangle.Checkerboard();
    triangle_zero = Zero();
    triangle = where(time_coor == 0,    triangle_zero, triangle);
    triangle = where(time_coor == Nt-1, triangle_zero, triangle);

    // Diagonal part becomes a constant on both boundary slices
    double t3 = usecond();
    CloverDiagonalField scratch(grid);
    scratch.Checkerboard() = diagonal.Checkerboard();
    scratch  = -1.0 * csw_t + diag_mass;
    diagonal = where(time_coor == 0,    scratch, diagonal);
    diagonal = where(time_coor == Nt-1, scratch, diagonal);

    // Shift the diagonal on the slices adjacent to the boundary
    double t4 = usecond();
    if(cF != 1.0) {
      scratch  = cF - 1.0;
      scratch += diagonal;
      diagonal = where(time_coor == 1,    scratch, diagonal);
      diagonal = where(time_coor == Nt-2, scratch, diagonal);
    }

    // Report timings
    double t5 = usecond();
 #if 0
    std::cout << GridLogMessage << "CompactWilsonCloverHelpers::ModifyBoundaries timings:"
              << " checks = "          << (t1 - t0) / 1e6
              << ", coordinate = "     << (t2 - t1) / 1e6
              << ", off-diag zero = "  << (t3 - t2) / 1e6
              << ", diagonal unity = " << (t4 - t3) / 1e6
              << ", near-boundary = "  << (t5 - t4) / 1e6
              << ", total = "          << (t5 - t0) / 1e6
              << std::endl;
 #endif
  }
  // Multiply field `f` in place, site by site, with the mask field `m`.
  template<class Field, class Mask>
  static strong_inline void ApplyBoundaryMask(Field& f, const Mask& m) {
    conformable(f, m);

    auto           field_grid = f.Grid();
    const uint32_t Nsite      = field_grid->oSites();
    const uint32_t Nsimd      = field_grid->Nsimd();

    autoView(f_v, f, AcceleratorWrite);
    autoView(m_v, m, AcceleratorRead);

    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
    accelerator_for(ss, Nsite, Nsimd, {
      coalescedWrite(f_v[ss],
                     m_v(ss) * f_v(ss));
    });
  }
  template<class MaskField>
  static void SetupMasks(MaskField& full, MaskField& even, MaskField& odd) {
    assert(even.Grid()->_isCheckerBoarded && even.Checkerboard() == Even);
    assert(odd.Grid()->_isCheckerBoarded  && odd.Checkerboard()  == Odd);
    assert(!full.Grid()->_isCheckerBoarded);
    GridBase* grid = full.Grid();
    int t_dir = Nd-1;
    Lattice<iScalar<vInteger>> t_coor(grid);
    LatticeCoordinate(t_coor, t_dir);
    int T = grid->GlobalDimensions()[t_dir];
    MaskField zeroMask(grid); zeroMask = Zero();
    full = 1.0;
    full = where(t_coor == 0,   zeroMask, full);
    full = where(t_coor == T-1, zeroMask, full);
    pickCheckerboard(Even, even, full);
    pickCheckerboard(Odd,  odd,  full);
  }
 };
 NAMESPACE_END(Grid);
@@ -0,0 +1,92 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/WilsonCloverTypes.h
    Copyright (C) 2021 - 2022
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 // Type definitions for the clover term in the full layout: one
 // Ns x Ns spin matrix of Impl::Dimension x Impl::Dimension colour matrices per site.
 template<class Impl>
 class WilsonCloverTypes {
 public:
  INHERIT_IMPL_TYPES(Impl);

  template <typename vtype>
  using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;

  using SiteClover  = iImplClover<Simd>;
  using CloverField = Lattice<SiteClover>;
 };
 // Type definitions and sizes for the clover term in the compact layout:
 // per site, Nblock blocks, each stored as Ndiagonal diagonal entries plus
 // Ntriangle triangle entries. Also defines the mask site/field types.
 template<class Impl>
 class CompactWilsonCloverTypes {
 public:
  INHERIT_IMPL_TYPES(Impl);

  // the size constants below are only valid for this combination
  static_assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3, "Wrong dimensions");

  static constexpr int Nred      = Nc * Nhs;        // 6
  static constexpr int Nblock    = Nhs;             // 2
  static constexpr int Ndiagonal = Nred;            // 6
  static constexpr int Ntriangle = (Nred - 1) * Nc; // 15

  template<typename vtype>
  using iImplCloverDiagonal = iScalar<iVector<iVector<vtype, Ndiagonal>, Nblock>>;
  template<typename vtype>
  using iImplCloverTriangle = iScalar<iVector<iVector<vtype, Ntriangle>, Nblock>>;

  using SiteCloverDiagonal = iImplCloverDiagonal<Simd>;
  using SiteCloverTriangle = iImplCloverTriangle<Simd>;
  using SiteMask           = iSinglet<Simd>;

  using CloverDiagonalField = Lattice<SiteCloverDiagonal>;
  using CloverTriangleField = Lattice<SiteCloverTriangle>;
  using MaskField           = Lattice<SiteMask>;
 };
 // Pulls SiteClover and CloverField from WilsonCloverTypes<Impl> into the
 // scope where the macro is expanded (uses `typename`, so intended for
 // contexts in which Impl is a template parameter).
 #define INHERIT_CLOVER_TYPES(Impl)                                 \
  typedef typename WilsonCloverTypes<Impl>::SiteClover SiteClover; \
  typedef typename WilsonCloverTypes<Impl>::CloverField CloverField;
 // Pulls the site, field and mask typedefs of CompactWilsonCloverTypes<Impl>
 // into the scope where the macro is expanded, and re-declares the
 // iImplCloverDiagonal / iImplCloverTriangle alias templates there with the
 // same definitions as in CompactWilsonCloverTypes.
 #define INHERIT_COMPACT_CLOVER_TYPES(Impl) \
  typedef typename CompactWilsonCloverTypes<Impl>::SiteCloverDiagonal  SiteCloverDiagonal; \
  typedef typename CompactWilsonCloverTypes<Impl>::SiteCloverTriangle  SiteCloverTriangle; \
  typedef typename CompactWilsonCloverTypes<Impl>::SiteMask            SiteMask; \
  typedef typename CompactWilsonCloverTypes<Impl>::CloverDiagonalField CloverDiagonalField; \
  typedef typename CompactWilsonCloverTypes<Impl>::CloverTriangleField CloverTriangleField; \
  typedef typename CompactWilsonCloverTypes<Impl>::MaskField           MaskField; \
  /* ugly duplication but needed inside functionality classes */ \
  template<typename vtype> using iImplCloverDiagonal = \
    iScalar<iVector<iVector<vtype, CompactWilsonCloverTypes<Impl>::Ndiagonal>, CompactWilsonCloverTypes<Impl>::Nblock>>; \
  template<typename vtype> using iImplCloverTriangle = \
    iScalar<iVector<iVector<vtype, CompactWilsonCloverTypes<Impl>::Ntriangle>, CompactWilsonCloverTypes<Impl>::Nblock>>;
 // Re-declares the size constants Nred, Nblock, Ndiagonal and Ntriangle of
 // CompactWilsonCloverTypes<Impl> as static constexpr members of the class
 // in which the macro is expanded.
 #define INHERIT_COMPACT_CLOVER_SIZES(Impl)                                    \
  static constexpr int Nred      = CompactWilsonCloverTypes<Impl>::Nred;      \
  static constexpr int Nblock    = CompactWilsonCloverTypes<Impl>::Nblock;    \
  static constexpr int Ndiagonal = CompactWilsonCloverTypes<Impl>::Ndiagonal; \
  static constexpr int Ntriangle = CompactWilsonCloverTypes<Impl>::Ntriangle;
 NAMESPACE_END(Grid);
@@ -828,6 +828,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 #if (!defined(GRID_HIP))
  int tshift = (mu == Nd-1) ? 1 : 0;
  unsigned int LLt    = GridDefaultLatt()[Tp];
  ////////////////////////////////////////////////
  // GENERAL CAYLEY CASE
  ////////////////////////////////////////////////
@@ -880,7 +881,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  }
  std::vector<RealD> G_s(Ls,1.0);
-  RealD sign = 1; // sign flip for vector/tadpole
+  RealD sign = 1.0; // sign flip for vector/tadpole
  if ( curr_type == Current::Axial ) {
    for(int s=0;s<Ls/2;s++){
      G_s[s] = -1.0;
@@ -890,7 +891,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
    auto b=this->_b;
    auto c=this->_c;
    if ( b == 1 && c == 0 ) {
-      sign = -1;    
+      sign = -1.0;    
    }
    else {
      std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl;
@@ -934,7 +935,13 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
    tmp    = Cshift(tmp,mu,-1);
    Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
    tmp = -G_s[s]*( Utmp + gmu*Utmp );
-    tmp    = where((lcoor>=tmin+tshift),tmp,zz); // Mask the time 
+    // Mask the time
    if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
      unsigned int t0 = 0;
      tmp    = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
    } else {
      tmp    = where((lcoor>=tmin+tshift),tmp,zz);
    }
    L_Q   += where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
    InsertSlice(L_Q, q_out, s , 0);
@@ -0,0 +1,363 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermionImplementation.h
    Copyright (C) 2017 - 2022
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
 NAMESPACE_BEGIN(Grid);
 // Construct the operator: initialise the Wilson base class, allocate all
 // clover/mask fields on the full (Fgrid) or checkerboarded (Hgrid) grid,
 // rescale the clover coefficients, build the clover term from the gauge
 // field and, for open boundary conditions, set up the boundary masks.
 // Open boundaries are detected by a zero temporal boundary phase.
 template<class Impl>
 CompactWilsonCloverFermion<Impl>::CompactWilsonCloverFermion(GaugeField& _Umu,
                                                             GridCartesian& Fgrid,
                                                             GridRedBlackCartesian& Hgrid,
                                                             const RealD _mass,
                                                             const RealD _csw_r,
                                                             const RealD _csw_t,
                                                             const RealD _cF,
                                                             const WilsonAnisotropyCoefficients& clover_anisotropy,
                                                             const ImplParams& impl_p)
  : WilsonBase(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
  , csw_r(_csw_r)
  , csw_t(_csw_t)
  , cF(_cF)
  , open_boundaries(impl_p.boundary_phases[Nd-1] == 0.0)
  , Diagonal(&Fgrid)
  , Triangle(&Fgrid)
  , DiagonalEven(&Hgrid)
  , TriangleEven(&Hgrid)
  , DiagonalOdd(&Hgrid)
  , TriangleOdd(&Hgrid)
  , DiagonalInv(&Fgrid)
  , TriangleInv(&Fgrid)
  , DiagonalInvEven(&Hgrid)
  , TriangleInvEven(&Hgrid)
  , DiagonalInvOdd(&Hgrid)
  , TriangleInvOdd(&Hgrid)
  , Tmp(&Fgrid)
  , BoundaryMask(&Fgrid)
  , BoundaryMaskEven(&Hgrid)
  , BoundaryMaskOdd(&Hgrid)
 {
  // both coefficients are stored halved; the spatial one is additionally
  // divided by xi_0 in the anisotropic case
  csw_r *= 0.5;
  csw_t *= 0.5;
  if (clover_anisotropy.isAnisotropic) {
    csw_r /= clover_anisotropy.xi_0;
  }

  ImportGauge(_Umu);

  if (!open_boundaries) return;
  CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
 }
 // Forwards to WilsonBase::Dhop; with open boundaries the result is
 // additionally passed through ApplyBoundaryMask.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::Dhop(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::Dhop(in, out, dag);
  if(!open_boundaries) return;
  ApplyBoundaryMask(out);
 }
 // Forwards to WilsonBase::DhopOE; with open boundaries the result is
 // additionally passed through ApplyBoundaryMask.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::DhopOE(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::DhopOE(in, out, dag);
  if(!open_boundaries) return;
  ApplyBoundaryMask(out);
 }
 // Forwards to WilsonBase::DhopEO; with open boundaries the result is
 // additionally passed through ApplyBoundaryMask.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::DhopEO(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::DhopEO(in, out, dag);
  if(!open_boundaries) return;
  ApplyBoundaryMask(out);
 }
 // Forwards to WilsonBase::DhopDir for the given direction/displacement;
 // with open boundaries the result is additionally passed through ApplyBoundaryMask.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
  WilsonBase::DhopDir(in, out, dir, disp);
  if(!this->open_boundaries) return;
  ApplyBoundaryMask(out);
 }
 // Forwards to WilsonBase::DhopDirAll; with open boundaries every field in
 // `out` is additionally passed through ApplyBoundaryMask.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
  WilsonBase::DhopDirAll(in, out);
  if(!this->open_boundaries) return;
  for(auto& field : out) {
    ApplyBoundaryMask(field);
  }
 }
 // Full operator: out = Dhop(in) + Mooee(in), masked at the end when open
 // boundaries are in use. The member field Tmp holds the Mooee contribution.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::M(const FermionField& in, FermionField& out) {
  out.Checkerboard() = in.Checkerboard();

  // call base to save applying bc
  WilsonBase::Dhop(in, out, DaggerNo);

  Mooee(in, Tmp);
  axpy(out, 1.0, out, Tmp);

  if(!open_boundaries) return;
  ApplyBoundaryMask(out);
 }
 // Adjoint operator: out = Dhop^dag(in) + MooeeDag(in), masked at the end
 // when open boundaries are in use. The member field Tmp holds the MooeeDag contribution.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::Mdag(const FermionField& in, FermionField& out) {
  out.Checkerboard() = in.Checkerboard();

  // call base to save applying bc
  WilsonBase::Dhop(in, out, DaggerYes);

  MooeeDag(in, Tmp);
  axpy(out, 1.0, out, Tmp);

  if(!open_boundaries) return;
  ApplyBoundaryMask(out);
 }
 // Forwards to WilsonBase::Meooe; with open boundaries the result is
 // additionally passed through ApplyBoundaryMask.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::Meooe(const FermionField& in, FermionField& out) {
  WilsonBase::Meooe(in, out);
  if(!open_boundaries) return;
  ApplyBoundaryMask(out);
 }
 // Forwards to WilsonBase::MeooeDag; with open boundaries the result is
 // additionally passed through ApplyBoundaryMask.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::MeooeDag(const FermionField& in, FermionField& out) {
  WilsonBase::MeooeDag(in, out);
  if(!open_boundaries) return;
  ApplyBoundaryMask(out);
 }
 // Apply the clover term. The set of clover fields is chosen from the input:
 // the full-grid fields for a non-checkerboarded input, otherwise the odd or
 // even fields according to the input's checkerboard. With open boundaries
 // the result is masked.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::Mooee(const FermionField& in, FermionField& out) {
  if(!in.Grid()->_isCheckerBoarded) {
    MooeeInternal(in, out, Diagonal, Triangle);
  } else if(in.Checkerboard() == Odd) {
    MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
  } else {
    MooeeInternal(in, out, DiagonalEven, TriangleEven);
  }

  if(!open_boundaries) return;
  ApplyBoundaryMask(out);
 }
 // Adjoint of the clover term; identical to Mooee because the blocks are hermitian.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::MooeeDag(const FermionField& in, FermionField& out) {
  this->Mooee(in, out);
 }
 // Apply the inverse clover term. The set of inverse clover fields is chosen
 // from the input: the full-grid fields for a non-checkerboarded input,
 // otherwise the odd or even fields according to the input's checkerboard.
 // With open boundaries the result is masked.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::MooeeInv(const FermionField& in, FermionField& out) {
  if(!in.Grid()->_isCheckerBoarded) {
    MooeeInternal(in, out, DiagonalInv, TriangleInv);
  } else if(in.Checkerboard() == Odd) {
    MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
  } else {
    MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven);
  }

  if(!open_boundaries) return;
  ApplyBoundaryMask(out);
 }
 // Adjoint of the inverse clover term; identical to MooeeInv because the blocks are hermitian.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::MooeeInvDag(const FermionField& in, FermionField& out) {
  this->MooeeInv(in, out);
 }
 // Single-direction operator: delegates to DhopDir.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
  this->DhopDir(in, out, dir, disp);
 }
 // All-directions operator: delegates to DhopDirAll.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
  this->DhopDirAll(in, out);
 }
 // Derivative of the operator with respect to the gauge field.
 //
 // force : output; overwritten with the sum of the Wilson hopping-term
 //         derivative (DhopDeriv) and the clover-term derivative
 // X, Y  : fermion fields whose outer product enters the clover derivative
 // dag   : passed through to DhopDeriv
 //
 // Not available for open boundary conditions (asserted).
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
  assert(!open_boundaries); // TODO check for changes required for open bc
  // NOTE: code copied from original clover term
  conformable(X.Grid(), Y.Grid());
  conformable(X.Grid(), force.Grid());
  GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
  GaugeField clover_force(force.Grid());
  PropagatorField Lambda(force.Grid());
  // Guido: Here we are hitting some performance issues:
  // need to extract the components of the DoubledGaugeField
  // for each call
  // Possible solution
  // Create a vector object to store them? (cons: wasting space)
  std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
  Impl::extractLinkField(U, this->Umu);
  force = Zero();
  // Derivative of the Wilson hopping term
  this->DhopDeriv(force, X, Y, dag);
  ///////////////////////////////////////////////////////////
  // Clover term derivative
  ///////////////////////////////////////////////////////////
  Impl::outerProductImpl(Lambda, X, Y);
  //std::cout << "Lambda:" << Lambda << std::endl;
  Gamma::Algebra sigma[] = {
      Gamma::Algebra::SigmaXY,
      Gamma::Algebra::SigmaXZ,
      Gamma::Algebra::SigmaXT,
      Gamma::Algebra::MinusSigmaXY,
      Gamma::Algebra::SigmaYZ,
      Gamma::Algebra::SigmaYT,
      Gamma::Algebra::MinusSigmaXZ,
      Gamma::Algebra::MinusSigmaYZ,
      Gamma::Algebra::SigmaZT,
      Gamma::Algebra::MinusSigmaXT,
      Gamma::Algebra::MinusSigmaYT,
      Gamma::Algebra::MinusSigmaZT};
  /*
    sigma_{\mu \nu}=
    | 0         sigma[0]  sigma[1]  sigma[2] |
    | sigma[3]    0       sigma[4]  sigma[5] |
    | sigma[6]  sigma[7]     0      sigma[8] |
    | sigma[9]  sigma[10] sigma[11]   0      |
  */
  int count = 0; // running index into sigma[], advanced once per (mu, nu) pair with mu != nu
  clover_force = Zero();
  for (int mu = 0; mu < 4; mu++)
  {
    force_mu = Zero();
    for (int nu = 0; nu < 4; nu++)
    {
      if (mu == nu)
        continue;
      // Planes containing the time direction use the temporal clover
      // coefficient, all others the spatial one. Directions are numbered
      // 0..3, so the time direction is Tdir (the previous comparison
      // against 4 could never match and csw_t was never used).
      RealD factor;
      if (nu == Tdir || mu == Tdir)
      {
        factor = 2.0 * csw_t;
      }
      else
      {
        factor = 2.0 * csw_r;
      }
      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
      force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu);   // checked
      count++;
    }
    pokeLorentz(clover_force, U[mu] * force_mu, mu);
  }
  //clover_force *= csw;
  force += clover_force;
 }
 // Not implemented: calling this trips assert(0).
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
  assert(0);
 }
 // Not implemented: calling this trips assert(0).
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
  assert(0);
 }
 // Common implementation behind Mooee and MooeeInv: applies the clover
 // operator described by the given (diagonal, triangle) pair to `in` and
 // writes the result to `out`.
 //
 // in       : input fermion field; its checkerboard must be Odd or Even
 // out      : output fermion field; receives the checkerboard of `in`
 // diagonal : diagonal part of the clover term (or of its inverse)
 // triangle : triangle part of the clover term (or of its inverse)
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::MooeeInternal(const FermionField&        in,
                    FermionField&              out,
                    const CloverDiagonalField& diagonal,
                    const CloverTriangleField& triangle) {
  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
  out.Checkerboard() = in.Checkerboard();
  // all four fields must live on compatible grids
  conformable(in, out);
  conformable(in, diagonal);
  conformable(in, triangle);
  // site count is taken from the clover field; the second argument (Ls) is fixed to 1 here
  CompactHelpers::MooeeKernel(diagonal.oSites(), 1, in, out, diagonal, triangle);
 }
 // Import a gauge field and rebuild every clover-related member field.
 //
 // Steps: import into the Wilson base class, compute the six field-strength
 // components, assemble the clover term in the full layout (including the
 // diag_mass term), convert it to the compact layout, optionally modify it
 // for open boundaries, invert it, and finally split the clover term and its
 // inverse into even and odd checkerboards.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::ImportGauge(const GaugeField& _Umu) {
  // NOTE: parts copied from original implementation
  // Import gauge into base class
  double t0 = usecond();
  WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that
  // Initialize temporary variables
  double t1 = usecond();
  conformable(_Umu.Grid(), this->GaugeGrid());
  GridBase* grid = _Umu.Grid();
  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
  CloverField TmpOriginal(grid);
  // Compute the field strength terms mu>nu
  double t2 = usecond();
  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
  WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
  WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
  // Compute the Clover Operator acting on Colour and Spin
  // multiply here by the clover coefficients for the anisotropy
  // (purely spatial planes use csw_r, planes containing the time direction use csw_t)
  double t3 = usecond();
  TmpOriginal  = Helpers::fillCloverYZ(Bx) * csw_r;
  TmpOriginal += Helpers::fillCloverXZ(By) * csw_r;
  TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r;
  TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
  TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
  TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
  TmpOriginal += this->diag_mass;
  // Convert the data layout of the clover term
  double t4 = usecond();
  CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
  // Possibly modify the boundary values
  double t5 = usecond();
  if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
  // Invert the clover term in the improved layout
  double t6 = usecond();
  CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv);
  // Fill the remaining clover fields
  // (even/odd checkerboards of the clover term and of its inverse)
  double t7 = usecond();
  pickCheckerboard(Even, DiagonalEven,    Diagonal);
  pickCheckerboard(Even, TriangleEven,    Triangle);
  pickCheckerboard(Odd,  DiagonalOdd,     Diagonal);
  pickCheckerboard(Odd,  TriangleOdd,     Triangle);
  pickCheckerboard(Even, DiagonalInvEven, DiagonalInv);
  pickCheckerboard(Even, TriangleInvEven, TriangleInv);
  pickCheckerboard(Odd,  DiagonalInvOdd,  DiagonalInv);
  pickCheckerboard(Odd,  TriangleInvOdd,  TriangleInv);
  // Report timings
  // (the report itself is compiled out; t0..t8 are the stage timestamps it would print)
  double t8 = usecond();
 #if 0
  std::cout << GridLogMessage << "CompactWilsonCloverFermion::ImportGauge timings:"
            << " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6
            << ", allocations = "               << (t2 - t1) / 1e6
            << ", field strength = "            << (t3 - t2) / 1e6
            << ", fill clover = "               << (t4 - t3) / 1e6
            << ", convert = "                   << (t5 - t4) / 1e6
            << ", boundaries = "                << (t6 - t5) / 1e6
            << ", inversions = "                << (t7 - t6) / 1e6
            << ", pick cbs = "                  << (t8 - t7) / 1e6
            << ", total = "                     << (t8 - t0) / 1e6
            << std::endl;
 #endif
 }
 NAMESPACE_END(Grid);
@@ -2,12 +2,13 @@
    Grid physics library, www.github.com/paboyle/Grid
-    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
+    Source file: ./lib/qcd/action/fermion/WilsonCloverFermionImplementation.h
-    Copyright (C) 2017
+    Copyright (C) 2017 - 2022
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -33,6 +34,45 @@
 NAMESPACE_BEGIN(Grid);
 // Builds the clover operator on top of the plain Wilson operator: allocates the
 // clover fields on the full (Fgrid) and checkerboarded (Hgrid) grids, derives the
 // internal coefficients csw_r, csw_t and diag_mass from the user-supplied values,
 // warns when either clover coefficient is zero, and finally imports the gauge field.
 template<class Impl>
 WilsonCloverFermion<Impl>::WilsonCloverFermion(GaugeField&                         _Umu,
                                               GridCartesian&                      Fgrid,
                                               GridRedBlackCartesian&              Hgrid,
                                               const RealD                         _mass,
                                               const RealD                         _csw_r,
                                               const RealD                         _csw_t,
                                               const WilsonAnisotropyCoefficients& clover_anisotropy,
                                               const ImplParams&                   impl_p)
  : WilsonFermion<Impl>(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
  , CloverTerm(&Fgrid)
  , CloverTermInv(&Fgrid)
  , CloverTermEven(&Hgrid)
  , CloverTermOdd(&Hgrid)
  , CloverTermInvEven(&Hgrid)
  , CloverTermInvOdd(&Hgrid)
  , CloverTermDagEven(&Hgrid)
  , CloverTermDagOdd(&Hgrid)
  , CloverTermInvDagEven(&Hgrid)
  , CloverTermInvDagOdd(&Hgrid) {
  assert(Nd == 4); // require 4 dimensions

  const bool anisotropic = clover_anisotropy.isAnisotropic;

  // Both clover coefficients are stored pre-multiplied by 1/2; the spatial one
  // is additionally divided by xi_0 in the anisotropic case.
  csw_r = anisotropic ? (_csw_r * 0.5 / clover_anisotropy.xi_0)
                      : (_csw_r * 0.5);
  csw_t = _csw_t * 0.5;

  // Site-diagonal mass term added to the clover matrix in ImportGauge.
  diag_mass = anisotropic
                ? (_mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0))
                : (4.0 + _mass);

  if(csw_r == 0) {
    std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
  }
  if(csw_t == 0) {
    std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
  }

  ImportGauge(_Umu);
 }
 // *NOT* EO
 template <class Impl>
 void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
@@ -67,10 +107,13 @@ void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
 template <class Impl>
 void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
 {
  double t0 = usecond();
  WilsonFermion<Impl>::ImportGauge(_Umu);
  double t1 = usecond();
  GridBase *grid = _Umu.Grid();
  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
  double t2 = usecond();
  // Compute the field strength terms mu>nu
  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
@@ -79,19 +122,22 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
  double t3 = usecond();
  // Compute the Clover Operator acting on Colour and Spin
  // multiply here by the clover coefficients for the anisotropy
-  CloverTerm  = fillCloverYZ(Bx) * csw_r;
+  CloverTerm  = Helpers::fillCloverYZ(Bx) * csw_r;
-  CloverTerm += fillCloverXZ(By) * csw_r;
+  CloverTerm += Helpers::fillCloverXZ(By) * csw_r;
-  CloverTerm += fillCloverXY(Bz) * csw_r;
+  CloverTerm += Helpers::fillCloverXY(Bz) * csw_r;
-  CloverTerm += fillCloverXT(Ex) * csw_t;
+  CloverTerm += Helpers::fillCloverXT(Ex) * csw_t;
-  CloverTerm += fillCloverYT(Ey) * csw_t;
+  CloverTerm += Helpers::fillCloverYT(Ey) * csw_t;
-  CloverTerm += fillCloverZT(Ez) * csw_t;
+  CloverTerm += Helpers::fillCloverZT(Ez) * csw_t;
  CloverTerm += diag_mass;
  double t4 = usecond();
  int lvol = _Umu.Grid()->lSites();
  int DimRep = Impl::Dimension;
  double t5 = usecond();
  {
    autoView(CTv,CloverTerm,CpuRead);
    autoView(CTIv,CloverTermInv,CpuWrite);
@@ -100,7 +146,7 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
      grid->LocalIndexToLocalCoor(site, lcoor);
      Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
      Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
-      typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
+      typename SiteClover::scalar_object Qx = Zero(), Qxinv = Zero();
      peekLocalSite(Qx, CTv, lcoor);
      //if (csw!=0){
      for (int j = 0; j < Ns; j++)
@@ -125,6 +171,7 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
    });
  }
  double t6 = usecond();
  // Separate the even and odd parts
  pickCheckerboard(Even, CloverTermEven, CloverTerm);
  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
@@ -137,6 +184,20 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
  double t7 = usecond();
 #if 0
  std::cout << GridLogMessage << "WilsonCloverFermion::ImportGauge timings:"
            << " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6
            << ", allocations = "               << (t2 - t1) / 1e6
            << ", field strength = "            << (t3 - t2) / 1e6
            << ", fill clover = "               << (t4 - t3) / 1e6
            << ", misc = "                      << (t5 - t4) / 1e6
            << ", inversions = "                << (t6 - t5) / 1e6
            << ", pick cbs = "                  << (t7 - t6) / 1e6
            << ", total = "                     << (t7 - t0) / 1e6
            << std::endl;
 #endif
 }
 template <class Impl>
@@ -167,7 +228,7 @@ template <class Impl>
 void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
 {
  out.Checkerboard() = in.Checkerboard();
-  CloverFieldType *Clover;
+  CloverField *Clover;
  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
  if (dag)
@@ -182,12 +243,12 @@ void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionFie
      {
        Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
      }
-      out = *Clover * in;
+      Helpers::multCloverField(out, *Clover, in);
    }
    else
    {
      Clover = (inv) ? &CloverTermInv : &CloverTerm;
-      out = adj(*Clover) * in;
+      Helpers::multCloverField(out, *Clover, in); // don't bother with adj, hermitian anyway
    }
  }
  else
@@ -205,18 +266,98 @@ void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionFie
        //  std::cout << "Calling clover term Even" << std::endl;
        Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
      }
-      out = *Clover * in;
+      Helpers::multCloverField(out, *Clover, in);
      //  std::cout << GridLogMessage << "*Clover.Checkerboard() "  << (*Clover).Checkerboard() << std::endl;
    }
    else
    {
      Clover = (inv) ? &CloverTermInv : &CloverTerm;
-      out = *Clover * in;
+      Helpers::multCloverField(out, *Clover, in);
    }
  }
 } // MooeeInternal
 // Derivative parts unpreconditioned pseudofermions
 //
 // Overwrites `force` with the derivative of the Wilson hopping term (DhopDeriv)
 // plus the derivative of the clover term. The clover part is built from the
 // outer product Lambda of X and Y: for every ordered pair mu != nu the spin
 // trace of sigma_{mu nu} * Lambda is passed to Helpers::Cmunu and accumulated
 // with weight 2*csw_t when either index is the time direction and 2*csw_r
 // otherwise, matching the coefficients used to build the clover term in
 // ImportGauge.
 //
 //   force : output, overwritten
 //   X, Y  : fermion fields on the same grid as force
 //   dag   : forwarded unchanged to DhopDeriv
 template <class Impl>
 void WilsonCloverFermion<Impl>::MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
 {
  conformable(X.Grid(), Y.Grid());
  conformable(X.Grid(), force.Grid());
  GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
  GaugeField clover_force(force.Grid());
  PropagatorField Lambda(force.Grid());
  // Guido: Here we are hitting some performance issues:
  // need to extract the components of the DoubledGaugeField
  // for each call
  // Possible solution
  // Create a vector object to store them? (cons: wasting space)
  std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
  Impl::extractLinkField(U, this->Umu);
  force = Zero();
  // Derivative of the Wilson hopping term
  this->DhopDeriv(force, X, Y, dag);
  ///////////////////////////////////////////////////////////
  // Clover term derivative
  ///////////////////////////////////////////////////////////
  Impl::outerProductImpl(Lambda, X, Y);
  //std::cout << "Lambda:" << Lambda << std::endl;
  // Listed in the order the (mu,nu) loops below visit the off-diagonal pairs.
  Gamma::Algebra sigma[] = {
      Gamma::Algebra::SigmaXY,
      Gamma::Algebra::SigmaXZ,
      Gamma::Algebra::SigmaXT,
      Gamma::Algebra::MinusSigmaXY,
      Gamma::Algebra::SigmaYZ,
      Gamma::Algebra::SigmaYT,
      Gamma::Algebra::MinusSigmaXZ,
      Gamma::Algebra::MinusSigmaYZ,
      Gamma::Algebra::SigmaZT,
      Gamma::Algebra::MinusSigmaXT,
      Gamma::Algebra::MinusSigmaYT,
      Gamma::Algebra::MinusSigmaZT};
  /*
    sigma_{\mu \nu}=
    | 0         sigma[0]  sigma[1]  sigma[2] |
    | sigma[3]    0       sigma[4]  sigma[5] |
    | sigma[6]  sigma[7]     0      sigma[8] |
    | sigma[9]  sigma[10] sigma[11]   0      |
  */
  int count = 0; // running index into sigma[], advanced once per mu != nu pair
  clover_force = Zero();
  for (int mu = 0; mu < 4; mu++)
  {
    force_mu = Zero();
    for (int nu = 0; nu < 4; nu++)
    {
      if (mu == nu)
        continue;
      // Directions are indexed 0..3, so the time direction is Tdir, not 4.
      // Comparing against 4 could never match inside these loops, which meant
      // csw_t was silently ignored and csw_r was used for every component.
      RealD factor;
      if (nu == Tdir || mu == Tdir)
      {
        factor = 2.0 * csw_t;
      }
      else
      {
        factor = 2.0 * csw_r;
      }
      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
      force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu);                   // checked
      count++;
    }
    pokeLorentz(clover_force, U[mu] * force_mu, mu);
  }
  //clover_force *= csw;
  force += clover_force;
 }
 // Derivative parts
 template <class Impl>
@@ -77,23 +77,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define REGISTER
 #ifdef GRID_SIMT
-#define LOAD_CHIMU(ptype)		\
+#define LOAD_CHIMU(Ptype)		\
  {const SiteSpinor & ref (in[offset]);	\
-    Chimu_00=coalescedReadPermute<ptype>(ref()(0)(0),perm,lane);	\
+    Chimu_00=coalescedReadPermute<Ptype>(ref()(0)(0),perm,lane);	\
-    Chimu_01=coalescedReadPermute<ptype>(ref()(0)(1),perm,lane);		\
+    Chimu_01=coalescedReadPermute<Ptype>(ref()(0)(1),perm,lane);		\
-    Chimu_02=coalescedReadPermute<ptype>(ref()(0)(2),perm,lane);		\
+    Chimu_02=coalescedReadPermute<Ptype>(ref()(0)(2),perm,lane);		\
-    Chimu_10=coalescedReadPermute<ptype>(ref()(1)(0),perm,lane);		\
+    Chimu_10=coalescedReadPermute<Ptype>(ref()(1)(0),perm,lane);		\
-    Chimu_11=coalescedReadPermute<ptype>(ref()(1)(1),perm,lane);		\
+    Chimu_11=coalescedReadPermute<Ptype>(ref()(1)(1),perm,lane);		\
-    Chimu_12=coalescedReadPermute<ptype>(ref()(1)(2),perm,lane);		\
+    Chimu_12=coalescedReadPermute<Ptype>(ref()(1)(2),perm,lane);		\
-    Chimu_20=coalescedReadPermute<ptype>(ref()(2)(0),perm,lane);		\
+    Chimu_20=coalescedReadPermute<Ptype>(ref()(2)(0),perm,lane);		\
-    Chimu_21=coalescedReadPermute<ptype>(ref()(2)(1),perm,lane);		\
+    Chimu_21=coalescedReadPermute<Ptype>(ref()(2)(1),perm,lane);		\
-    Chimu_22=coalescedReadPermute<ptype>(ref()(2)(2),perm,lane);		\
+    Chimu_22=coalescedReadPermute<Ptype>(ref()(2)(2),perm,lane);		\
-    Chimu_30=coalescedReadPermute<ptype>(ref()(3)(0),perm,lane);		\
+    Chimu_30=coalescedReadPermute<Ptype>(ref()(3)(0),perm,lane);		\
-    Chimu_31=coalescedReadPermute<ptype>(ref()(3)(1),perm,lane);		\
+    Chimu_31=coalescedReadPermute<Ptype>(ref()(3)(1),perm,lane);		\
-    Chimu_32=coalescedReadPermute<ptype>(ref()(3)(2),perm,lane);	}
+    Chimu_32=coalescedReadPermute<Ptype>(ref()(3)(2),perm,lane);	}
 #define PERMUTE_DIR(dir) ;
 #else
-#define LOAD_CHIMU(ptype)		\
+#define LOAD_CHIMU(Ptype)		\
  {const SiteSpinor & ref (in[offset]);	\
    Chimu_00=ref()(0)(0);\
    Chimu_01=ref()(0)(1);\
@@ -109,12 +109,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    Chimu_32=ref()(3)(2);}
 #define PERMUTE_DIR(dir)			\
-  permute##dir(Chi_00,Chi_00);	\
+  permute##dir(Chi_00,Chi_00);			\
-      permute##dir(Chi_01,Chi_01);\
+  permute##dir(Chi_01,Chi_01);			\
-      permute##dir(Chi_02,Chi_02);\
+  permute##dir(Chi_02,Chi_02);			\
-      permute##dir(Chi_10,Chi_10);	\
+  permute##dir(Chi_10,Chi_10);			\
-      permute##dir(Chi_11,Chi_11);\
+  permute##dir(Chi_11,Chi_11);			\
-      permute##dir(Chi_12,Chi_12);
+  permute##dir(Chi_12,Chi_12);
 #endif
@@ -371,88 +371,91 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  result_32-= UChi_12;
 #define HAND_STENCIL_LEGB(PROJ,PERM,DIR,RECON)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
+  {int ptype;					\
-  offset = SE->_offset;				\
+   SE=st.GetEntry(ptype,DIR,ss);		\
-  local  = SE->_is_local;			\
+   auto offset = SE->_offset;			\
-  perm   = SE->_permute;			\
+   auto local  = SE->_is_local;			\
-  if ( local ) {				\
+   auto perm   = SE->_permute;			\
-    LOAD_CHIMU(PERM);				\
+   if ( local ) {				\
-    PROJ;					\
+     LOAD_CHIMU(PERM);				\
-    if ( perm) {				\
+     PROJ;					\
-      PERMUTE_DIR(PERM);			\
+     if ( perm) {				\
-    }						\
+       PERMUTE_DIR(PERM);			\
-  } else {					\
+     }						\
-    LOAD_CHI;					\
+   } else {					\
-  }						\
+     LOAD_CHI;					\
-  acceleratorSynchronise();			\
+   }						\
-  MULT_2SPIN(DIR);				\
+   acceleratorSynchronise();			\
-  RECON;					
+   MULT_2SPIN(DIR);				\
   RECON;					}
-#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)	\
+#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)		\
-  SE=&st_p[DIR+8*ss];				\
+  { SE=&st_p[DIR+8*ss];						\
-  ptype=st_perm[DIR];				\
+  auto ptype=st_perm[DIR];					\
-  offset = SE->_offset;				\
+  auto offset = SE->_offset;					\
-  local  = SE->_is_local;			\
+  auto local  = SE->_is_local;					\
-  perm   = SE->_permute;			\
+  auto perm   = SE->_permute;					\
-  if ( local ) {				\
+  if ( local ) {						\
-    LOAD_CHIMU(PERM);				\
+    LOAD_CHIMU(PERM);						\
-    PROJ;					\
+    PROJ;							\
-    if ( perm) {				\
+    if ( perm) {						\
-      PERMUTE_DIR(PERM);			\
+      PERMUTE_DIR(PERM);					\
-    }						\
+    }								\
-  } else {					\
+  } else {							\
-    LOAD_CHI;					\
+    LOAD_CHI;							\
-  }						\
+  }								\
-  acceleratorSynchronise();			\
+  acceleratorSynchronise();					\
-  MULT_2SPIN(DIR);				\
+  MULT_2SPIN(DIR);						\
-  RECON;					
+  RECON;					}
 #define HAND_STENCIL_LEGA(PROJ,PERM,DIR,RECON)				\
-  SE=&st_p[DIR+8*ss];							\
+  { SE=&st_p[DIR+8*ss];							\
-  ptype=st_perm[DIR];							\
+    auto ptype=st_perm[DIR];						\
- /*SE=st.GetEntry(ptype,DIR,ss);*/					\
+    /*SE=st.GetEntry(ptype,DIR,ss);*/					\
-  offset = SE->_offset;				\
+    auto offset = SE->_offset;						\
-  perm   = SE->_permute;			\
+    auto perm   = SE->_permute;						\
-  LOAD_CHIMU(PERM);				\
+    LOAD_CHIMU(PERM);							\
-  PROJ;						\
+    PROJ;								\
-  MULT_2SPIN(DIR);				\
+    MULT_2SPIN(DIR);							\
-  RECON;					
+    RECON;					}
 #define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
+  { int ptype;						\
-  offset = SE->_offset;				\
+  SE=st.GetEntry(ptype,DIR,ss);				\
-  local  = SE->_is_local;			\
+  auto offset = SE->_offset;					\
-  perm   = SE->_permute;			\
+  auto local  = SE->_is_local;					\
-  if ( local ) {				\
+  auto perm   = SE->_permute;					\
-    LOAD_CHIMU(PERM);				\
+  if ( local ) {						\
-    PROJ;					\
+    LOAD_CHIMU(PERM);						\
-    if ( perm) {				\
+    PROJ;							\
-      PERMUTE_DIR(PERM);			\
+    if ( perm) {						\
-    }						\
+      PERMUTE_DIR(PERM);					\
-  } else if ( st.same_node[DIR] ) {		\
+    }								\
-    LOAD_CHI;					\
+  } else if ( st.same_node[DIR] ) {				\
-  }						\
+    LOAD_CHI;							\
-  acceleratorSynchronise();			\
+  }								\
-  if (local || st.same_node[DIR] ) {		\
+  acceleratorSynchronise();					\
-    MULT_2SPIN(DIR);				\
+  if (local || st.same_node[DIR] ) {				\
-    RECON;					\
+    MULT_2SPIN(DIR);						\
-  }						\
+    RECON;							\
-  acceleratorSynchronise();			
+  }								\
  acceleratorSynchronise();			}
 #define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
+  { int ptype;						\
-  offset = SE->_offset;				\
+  SE=st.GetEntry(ptype,DIR,ss);				\
-  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
+  auto offset = SE->_offset;				\
-    LOAD_CHI;					\
+  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {		\
-    MULT_2SPIN(DIR);				\
+    LOAD_CHI;						\
-    RECON;					\
+    MULT_2SPIN(DIR);					\
-    nmu++;					\
+    RECON;						\
-  }						\
+    nmu++;						\
-  acceleratorSynchronise();			
+  }							\
  acceleratorSynchronise();			}
-#define HAND_RESULT(ss)				\
+#define HAND_RESULT(ss)					\
-  {						\
+  {							\
-    SiteSpinor & ref (out[ss]);			\
+    SiteSpinor & ref (out[ss]);				\
    coalescedWrite(ref()(0)(0),result_00,lane);		\
    coalescedWrite(ref()(0)(1),result_01,lane);		\
    coalescedWrite(ref()(0)(2),result_02,lane);		\
@@ -563,7 +566,6 @@ WilsonKernels<Impl>::HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p,
  HAND_DECLARATIONS(Simt);
  int offset,local,perm, ptype;
  StencilEntry *SE;
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
@@ -593,9 +595,7 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
  HAND_DECLARATIONS(Simt);
  int offset,local,perm, ptype;
  StencilEntry *SE;
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
@@ -623,8 +623,6 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
  HAND_DECLARATIONS(Simt);
  StencilEntry *SE;
  int offset,local,perm, ptype;
  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
@@ -640,8 +638,8 @@ template<class Impl>  accelerator_inline void
 WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
+  //  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_perm = st._permute_type;					
 // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@@ -652,7 +650,6 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si
  HAND_DECLARATIONS(Simt);
  int offset,local,perm, ptype;
  StencilEntry *SE;
  ZERO_RESULT;
  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
@@ -670,8 +667,8 @@ template<class Impl> accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
+  //  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_perm = st._permute_type;					
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
@@ -682,7 +679,6 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
  HAND_DECLARATIONS(Simt);
  StencilEntry *SE;
  int offset,local,perm, ptype;
  ZERO_RESULT;
  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
@@ -699,8 +695,8 @@ template<class Impl>  accelerator_inline void
 WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
+  //  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_perm = st._permute_type;					
 // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@@ -711,7 +707,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si
  HAND_DECLARATIONS(Simt);
-  int offset, ptype;
+  //  int offset, ptype;
  StencilEntry *SE;
  int nmu=0;
  ZERO_RESULT;
@@ -730,8 +726,8 @@ template<class Impl>  accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
+  //  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_perm = st._permute_type;					
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
@@ -742,7 +738,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldVi
  HAND_DECLARATIONS(Simt);
  StencilEntry *SE;
-  int offset, ptype;
+  //  int offset, ptype;
  int nmu=0;
  ZERO_RESULT;
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
@@ -0,0 +1,41 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master
    Copyright (C) 2017 - 2022
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
 #include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h>
 NAMESPACE_BEGIN(Grid);
 // impl.h is included inside the Grid namespace. IMPLEMENTATION is not defined
 // anywhere in this file, so presumably impl.h supplies it for each
 // per-implementation copy of this master file -- TODO confirm.
 #include "impl.h"
 // Explicitly instantiate the compact clover operator for that implementation.
 template class CompactWilsonCloverFermion<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../CompactWilsonCloverFermionInstantiation.cc.master
@@ -0,0 +1 @@
 ../CompactWilsonCloverFermionInstantiation.cc.master
@@ -40,7 +40,7 @@ EOF
 done
-CC_LIST="WilsonCloverFermionInstantiation WilsonFermionInstantiation WilsonKernelsInstantiation WilsonTMFermionInstantiation"
+CC_LIST="WilsonCloverFermionInstantiation CompactWilsonCloverFermionInstantiation WilsonFermionInstantiation WilsonKernelsInstantiation WilsonTMFermionInstantiation"
 for impl in $WILSON_IMPL_LIST
 do
@@ -78,6 +78,8 @@ public:
  typedef Lattice<SiteLink>    LinkField; 
  typedef Lattice<SiteField>   Field;
  typedef SU<Nrepresentation> Group;
  // Guido: we can probably separate the types from the HMC functions
  // this will create 2 kind of implementations
  // probably confusing the users
@@ -118,7 +120,7 @@ public:
    LinkField Pmu(P.Grid());
    Pmu = Zero();
    for (int mu = 0; mu < Nd; mu++) {
-      SU<Nrepresentation>::GaussianFundamentalLieAlgebraMatrix(pRNG, Pmu);
+      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pmu);
      RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR) ;
      Pmu = Pmu*scale;
      PokeIndex<LorentzIndex>(P, Pmu, mu);
@@ -159,15 +161,15 @@ public:
  }
  static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
-    SU<Nc>::HotConfiguration(pRNG, U);
+    Group::HotConfiguration(pRNG, U);
  }
  static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
-    SU<Nc>::TepidConfiguration(pRNG, U);
+    Group::TepidConfiguration(pRNG, U);
  }
  static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
-    SU<Nc>::ColdConfiguration(pRNG, U);
+    Group::ColdConfiguration(pRNG, U);
  }
 };
@@ -1,61 +1,63 @@
-Using HMC in Grid version 0.5.1
+# Using HMC in Grid
-These are the instructions to use the Generalised HMC on Grid version 0.5.1.
+These are the instructions to use the Generalised HMC on Grid as of commit `749b802`.
-Disclaimer: GRID is still under active development so any information here can be changed in future releases.
+Disclaimer: Grid is still under active development so any information here can be changed in future releases.
-Command line options
+## Command line options
-===================
+
-(relevant file GenericHMCrunner.h)
+(relevant file `GenericHMCrunner.h`)
 The initial configuration can be changed at the command line using 
--StartType <your choice>
+`--StartingType STARTING_TYPE`, where `STARTING_TYPE` is one of
-valid choices, one among these
+`HotStart`, `ColdStart`, `TepidStart`, and `CheckpointStart`.
-HotStart, ColdStart, TepidStart, CheckpointStart
+Default: `--StartingType HotStart`
 default: HotStart
-example
+Example:
-./My_hmc_exec  --StartType HotStart
+```
 ./My_hmc_exec  --StartingType HotStart
 ```
-The CheckpointStart option uses the prefix for the configurations and rng seed files defined in your executable and the initial configuration is specified by
+The `CheckpointStart` option uses the prefix for the configurations and rng seed files defined in your executable and the initial configuration is specified by
--StartTrajectory <integer>
+`--StartingTrajectory STARTING_TRAJECTORY`, where `STARTING_TRAJECTORY` is an integer.
-default: 0
+Default: `--StartingTrajectory 0`
 The number of trajectories for a specific run is specified at the command line by
--Trajectories <integer>
+`--Trajectories TRAJECTORIES`, where `TRAJECTORIES` is an integer.
-default: 1
+Default: `--Trajectories 1`
 The number of thermalization steps (i.e. steps when the Metropolis acceptance check is turned off) is specified by
--Thermalizations <integer>
+`--Thermalizations THERMALIZATIONS`, where `THERMALIZATIONS` is an integer.
-default: 10
+Default: `--Thermalizations 10`
 Any other parameter is defined in the source for the executable.
-HMC controls
+## HMC controls
 ===========
 The lines 
 ```
  std::vector<int> SerSeed({1, 2, 3, 4, 5});
  std::vector<int> ParSeed({6, 7, 8, 9, 10});
 ```
 define the seeds for the serial and the parallel RNG.
 The line 
 ```
  TheHMC.MDparameters.set(20, 1.0);// MDsteps, traj length
 ```
 declares the number of molecular dynamics steps and the total trajectory length.
-Actions
+## Actions
 ======
-Action names are defined in the file
+Action names are defined in the directory `Grid/qcd/action`.
 lib/qcd/Actions.h
-Gauge actions list:
+Gauge actions list (from `Grid/qcd/action/gauge/Gauge.h`):
 ```
 WilsonGaugeActionR;
 WilsonGaugeActionF;
 WilsonGaugeActionD;
@@ -68,8 +70,9 @@ IwasakiGaugeActionD;
 SymanzikGaugeActionR;
 SymanzikGaugeActionF;
 SymanzikGaugeActionD;
 ```
-
+```
 ConjugateWilsonGaugeActionR;
 ConjugateWilsonGaugeActionF;
 ConjugateWilsonGaugeActionD;
@@ -82,26 +85,23 @@ ConjugateIwasakiGaugeActionD;
 ConjugateSymanzikGaugeActionR;
 ConjugateSymanzikGaugeActionF;
 ConjugateSymanzikGaugeActionD;
 ```
 Each of these actions accepts a single parameter at creation time (beta).
 Example for creating a Symanzik action with beta=4.0
 ```
  SymanzikGaugeActionR(4.0)
 ```
 Scalar actions list (from `Grid/qcd/action/scalar/Scalar.h`):
 ```
 ScalarActionR;
 ScalarActionF;
 ScalarActionD;
 ```
-
+The suffixes `R`, `F`, `D` in the action names refer to the `Real`
-each of these action accept one single parameter at creation time (beta).
+(the precision is defined at compile time by the `--enable-precision` flag in the configure),
-Example for creating a Symanzik action with beta=4.0
+`Float` and `Double`, that force the precision of the action to be 32, 64 bit respectively.
 	SymanzikGaugeActionR(4.0)
 The suffixes R,F,D in the action names refer to the Real
 (the precision is defined at compile time by the --enable-precision flag in the configure),
 Float and Double, that force the precision of the action to be 32, 64 bit respectively.
@@ -322,8 +322,8 @@ public:
    int simd_layout     = _grid->_simd_layout[dimension];
    int comm_dim        = _grid->_processors[dimension] >1 ;
-    int recv_from_rank;
+    //    int recv_from_rank;
-    int xmit_to_rank;
+    //    int xmit_to_rank;
    if ( ! comm_dim ) return 1;
    if ( displacement == 0 ) return 1;
@@ -47,20 +47,20 @@ NAMESPACE_BEGIN(Grid);
  class TypePair {
  public:
    T _internal[2];
-    TypePair<T>& operator=(const Grid::Zero& o) {
+    accelerator TypePair<T>& operator=(const Grid::Zero& o) {
      _internal[0] = Zero();
      _internal[1] = Zero();
      return *this;
    }
-    TypePair<T> operator+(const TypePair<T>& o) const {
+    accelerator TypePair<T> operator+(const TypePair<T>& o) const {
      TypePair<T> r;
      r._internal[0] = _internal[0] + o._internal[0];
      r._internal[1] = _internal[1] + o._internal[1];
      return r;
    }
-    TypePair<T>& operator+=(const TypePair<T>& o) {
+    accelerator TypePair<T>& operator+=(const TypePair<T>& o) {
      _internal[0] += o._internal[0];
      _internal[1] += o._internal[1];
      return *this;
@@ -74,31 +74,43 @@ void acceleratorInit(void)
      //      GPU_PROP(singleToDoublePrecisionPerfRatio);
    }
  }
  MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
 #undef GPU_PROP_FMT    
 #undef GPU_PROP
 #ifdef GRID_DEFAULT_GPU
  int device = 0;
  // IBM Jsrun makes cuda Device numbering screwy and not match rank
  if ( world_rank == 0 ) {
    printf("AcceleratorCudaInit: using default device \n");
-    printf("AcceleratorCudaInit: assume user either uses a) IBM jsrun, or \n");
+    printf("AcceleratorCudaInit: assume user either uses\n");
    printf("AcceleratorCudaInit: a) IBM jsrun, or \n");
    printf("AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
    printf("AcceleratorCudaInit: Configure options --enable-setdevice=no \n");
  }
 #else
  int device = rank;
  printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank);
  printf("AcceleratorCudaInit: Configure options --enable-setdevice=yes \n");
  cudaSetDevice(rank);
 #endif
  cudaSetDevice(device);
  cudaStreamCreate(&copyStream);
  const int len=64;
  char busid[len];
  if( rank == world_rank ) { 
    cudaDeviceGetPCIBusId(busid, len, device);
    printf("local rank %d device %d bus id: %s\n", rank, device, busid);
  }
  if ( world_rank == 0 )  printf("AcceleratorCudaInit: ================================================\n");
 }
 #endif
 #ifdef GRID_HIP
 hipDeviceProp_t *gpu_props;
 hipStream_t copyStream;
 void acceleratorInit(void)
 {
  int nDevices = 1;
@@ -156,16 +168,25 @@ void acceleratorInit(void)
 #ifdef GRID_DEFAULT_GPU
  if ( world_rank == 0 ) {
    printf("AcceleratorHipInit: using default device \n");
-    printf("AcceleratorHipInit: assume user either uses a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
+    printf("AcceleratorHipInit: assume user or srun sets ROCR_VISIBLE_DEVICES and numa binding \n");
-    printf("AcceleratorHipInit: Configure options --enable-summit, --enable-select-gpu=no \n");
+    printf("AcceleratorHipInit: Configure options --enable-setdevice=no \n");
  }
  int device = 0;
 #else
  if ( world_rank == 0 ) {
    printf("AcceleratorHipInit: rank %d setting device to node rank %d\n",world_rank,rank);
-    printf("AcceleratorHipInit: Configure options --enable-select-gpu=yes \n");
+    printf("AcceleratorHipInit: Configure options --enable-setdevice=yes \n");
  }
-  hipSetDevice(rank);
+  int device = rank;
 #endif
  hipSetDevice(device);
  hipStreamCreate(&copyStream);
  const int len=64;
  char busid[len];
  if( rank == world_rank ) { 
    hipDeviceGetPCIBusId(busid, len, device);
    printf("local rank %d device %d bus id: %s\n", rank, device, busid);
  }
  if ( world_rank == 0 )  printf("AcceleratorHipInit: ================================================\n");
 }
 #endif
@@ -95,6 +95,7 @@ void     acceleratorInit(void);
 //////////////////////////////////////////////
 #ifdef GRID_CUDA
 #include <cuda.h>
 #ifdef __CUDA_ARCH__
@@ -115,6 +116,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #endif
 } // CUDA specific
 // Report device memory usage on std::cout: the used, free and total figures
 // are derived from the two values returned by cudaMemGetInfo.
 inline void cuda_mem(void)
 {
  size_t bytesFree, bytesTotal;
  cudaMemGetInfo(&bytesFree,&bytesTotal);
  size_t bytesUsed = bytesTotal - bytesFree;
  std::cout << " MemoryManager : GPU used " << bytesUsed
            << " free "  << bytesFree
            << " total " << bytesTotal << std::endl;
 }
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  {									\
    int nt=acceleratorThreads();					\
@@ -254,6 +263,7 @@ inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes
  cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
 }
 // Wait for the work queued on copyStream (e.g. by acceleratorCopyDeviceToDeviceAsynch) to finish.
 inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
 inline int  acceleratorIsCommunicable(void *ptr)
 {
  //  int uvm=0;
@@ -330,7 +340,7 @@ inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  {
  theGridAccelerator->memcpy(to,from,bytes);
 }
-inline void acceleratorCopySynchronise(void) {  theGridAccelerator->wait(); }
+inline void acceleratorCopySynchronise(void) {  theGridAccelerator->wait(); } // wait only: no per-call logging, matching the CUDA/HIP variants
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();}
@@ -361,10 +371,11 @@ NAMESPACE_BEGIN(Grid);
 #define accelerator        __host__ __device__
 #define accelerator_inline __host__ __device__ inline
 extern hipStream_t copyStream;
 /*These routines define mapping from thread grid to loop & vector lane indexing */
 accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #ifdef GRID_SIMT
-  return hipThreadIdx_z; 
+  return hipThreadIdx_x; 
 #else
  return 0;
 #endif
@@ -378,19 +389,41 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
      { __VA_ARGS__;}							\
    };									\
    int nt=acceleratorThreads();					\
-    dim3 hip_threads(nt,1,nsimd);					\
+    dim3 hip_threads(nsimd, nt, 1);					 \
-    dim3 hip_blocks ((num1+nt-1)/nt,num2,1);				\
+    dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
-    hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads,		\
+    if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \
-		       0,0,						\
+      hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads,		\
-		       num1,num2,nsimd,lambda);				\
+            0,0,						\
            num1,num2,nsimd, lambda);				\
    } else { \
      hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads,		\
            0,0,						\
            num1,num2,nsimd, lambda);				\
    } \
  }
 // Kernel entry point compiled with launch bounds (64,1).
 // Maps the thread/block indices onto three loop indices and invokes Lambda
 // for every index triple that lies inside (numx, numy, numz):
 //   first index  : threadIdx.y + blockDim.y*blockIdx.x
 //   second index : threadIdx.z + blockDim.z*blockIdx.y
 //   third index  : threadIdx.x
 template<typename lambda>  __global__
 __launch_bounds__(64,1)
 void LambdaApply64(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
 {
  // Following the same scheme as CUDA for now
  uint64_t ix   = threadIdx.y + blockDim.y*blockIdx.x;
  uint64_t iy   = threadIdx.z + blockDim.z*blockIdx.y;
  uint64_t lane = threadIdx.x;
  // Threads that fall outside the iteration space do nothing.
  if ( (ix >= numx) || (iy >= numy) || (lane >= numz) ) {
    return;
  }
  Lambda(ix,iy,lane);
 }
 template<typename lambda>  __global__
 __launch_bounds__(1024,1)
 void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
 {
-  uint64_t x = hipThreadIdx_x + hipBlockDim_x*hipBlockIdx_x;
+  // Following the same scheme as CUDA for now
-  uint64_t y = hipThreadIdx_y + hipBlockDim_y*hipBlockIdx_y;
+  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
-  uint64_t z = hipThreadIdx_z ;//+ hipBlockDim_z*hipBlockIdx_z;
+  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
  uint64_t z = threadIdx.x;
  if ( (x < numx) && (y<numy) && (z<numz) ) {
    Lambda(x,y,z);
  }
@@ -435,10 +468,16 @@ inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
+//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
-inline void acceleratorCopySynchronise(void) {  }
+//inline void acceleratorCopySynchronise(void) {  }
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);}
 // Queue a device-to-device copy of `bytes` bytes from `from` to `to` on copyStream.
 // The hipMemcpyAsync return status is not checked; call acceleratorCopySynchronise()
 // before relying on the destination contents.
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
  hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToDevice,copyStream);
 }
 // Wait for the copies queued on copyStream to finish.
 inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); };
 #endif
 //////////////////////////////////////////////
@@ -509,18 +548,12 @@ inline void acceleratorFreeCpu  (void *ptr){free(ptr);};
 ///////////////////////////////////////////////////
 // Synchronise across local threads for divergence resynch
 ///////////////////////////////////////////////////
-accelerator_inline void acceleratorSynchronise(void) 
+accelerator_inline void acceleratorSynchronise(void)  // Only Nvidia needs 
 {
 #ifdef GRID_SIMT
 #ifdef GRID_CUDA
  __syncwarp();
 #endif
 #ifdef GRID_SYCL
  //cl::sycl::detail::workGroupBarrier();
 #endif
 #ifdef GRID_HIP
  __syncthreads();
 #endif
 #endif
  return;
 }
@@ -88,7 +88,7 @@ public:
 // Coordinate class, maxdims = 8 for now.
 ////////////////////////////////////////////////////////////////
 #define GRID_MAX_LATTICE_DIMENSION (8)
-#define GRID_MAX_SIMD              (16)
+#define GRID_MAX_SIMD              (32)
 static constexpr int MaxDims = GRID_MAX_LATTICE_DIMENSION;
@@ -167,6 +167,13 @@ void GridCmdOptionInt(std::string &str,int & val)
  return;
 }
 // Read a floating-point command-line value: extracts the leading number
 // of `str` into `val` using formatted stream extraction.
 void GridCmdOptionFloat(std::string &str,float & val)
 {
  std::istringstream parser(str);
  parser >> val;
 }
 void GridParseLayout(char **argv,int argc,
 		     Coordinate &latt_c,
@@ -527,6 +534,7 @@ void Grid_init(int *argc,char ***argv)
 void Grid_finalize(void)
 {
 #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();
  Grid_unquiesce_nodes();
 #endif
@@ -57,6 +57,7 @@ void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
 template<class VectorInt>
 void GridCmdOptionIntVector(const std::string &str,VectorInt & vec);
 void GridCmdOptionInt(std::string &str,int & val);
 void GridCmdOptionFloat(std::string &str,float & val);
 void GridParseLayout(char **argv,int argc,
@@ -137,7 +137,7 @@ int main (int argc, char ** argv)
  Eigen::MatrixXd mean(nVol, 4), stdDev(nVol, 4), rob(nVol, 4);
  Eigen::VectorXd avMean(4), avStdDev(4), avRob(4);
-  double          n = BENCH_IO_NPASS;
+  //  double          n = BENCH_IO_NPASS;
  stats(mean, stdDev, perf);
  stats(avMean, avStdDev, avPerf);
@@ -164,7 +164,7 @@ int main (int argc, char ** argv)
                mean(volInd(l), gWrite), stdDev(volInd(l), gWrite));
  }
  MSG << std::endl;
-  MSG << "Robustness of individual results, in \%. (rob = 100\% - std dev / mean)" << std::endl;
+  MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)" << std::endl;
  MSG << std::endl;
  grid_printf("%4s %12s %12s %12s %12s\n",
              "L", "std read", "std write", "Grid read", "Grid write");
@@ -185,7 +185,7 @@ int main (int argc, char ** argv)
              avMean(sRead), avStdDev(sRead), avMean(sWrite), avStdDev(sWrite),
              avMean(gRead), avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
  MSG << std::endl;
-  MSG << "Robustness of volume-averaged results, in \%. (rob = 100\% - std dev / mean)" << std::endl;
+  MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)" << std::endl;
  MSG << std::endl;
  grid_printf("%12s %12s %12s %12s\n",
              "std read", "std write", "Grid read", "Grid write");
@@ -142,7 +142,7 @@ public:
 	  //	  bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	}
-	int ncomm;
+	//	int ncomm;
 	double dbytes;
        for(int dir=0;dir<8;dir++) {
@@ -290,7 +290,7 @@ public:
      LatticeSU4 z(&Grid); z=Zero();
      LatticeSU4 x(&Grid); x=Zero();
      LatticeSU4 y(&Grid); y=Zero();
-      double a=2.0;
+      //      double a=2.0;
      uint64_t Nloop=NLOOP;
@@ -72,7 +72,7 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
  std::vector<double> t_time(Nloop);
-  time_statistics timestat;
+  //  time_statistics timestat;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange from host memory "<<std::endl;
@@ -126,19 +126,10 @@ int main (int argc, char ** argv)
  // Naive wilson implementation
  ////////////////////////////////////
  // replicate across fifth dimension
-  LatticeGaugeFieldF Umu5d(FGrid);
+  //  LatticeGaugeFieldF Umu5d(FGrid);
-  std::vector<LatticeColourMatrixF> U(4,FGrid);
+  std::vector<LatticeColourMatrixF> U(4,UGrid);
  {
    autoView( Umu5d_v, Umu5d, CpuWrite);
    autoView( Umu_v  , Umu  , CpuRead);
    for(int ss=0;ss<Umu.Grid()->oSites();ss++){
      for(int s=0;s<Ls;s++){
 	Umu5d_v[Ls*ss+s] = Umu_v[ss];
      }
    }
  }
  for(int mu=0;mu<Nd;mu++){
-    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
+    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
  }
  std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
@@ -147,10 +138,28 @@ int main (int argc, char ** argv)
    ref = Zero();
    for(int mu=0;mu<Nd;mu++){
-      tmp = U[mu]*Cshift(src,mu+1,1);
+      tmp = Cshift(src,mu+1,1);
      {
 	autoView( tmp_v  , tmp  , CpuWrite);
 	autoView( U_v  , U[mu]  , CpuRead);
 	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
 	  for(int s=0;s<Ls;s++){
 	    tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
 	  }
 	}
      }
      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
-      tmp =adj(U[mu])*src;
+      {
 	autoView( tmp_v  , tmp  , CpuWrite);
 	autoView( U_v  , U[mu]  , CpuRead);
 	autoView( src_v, src    , CpuRead);
 	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
 	  for(int s=0;s<Ls;s++){
 	    tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
 	  }
 	}
      }
      tmp =Cshift(tmp,mu+1,-1);
      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
    }
@@ -182,7 +191,7 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-  int ncall =3000;
+  int ncall =300;
  if (1) {
    FGrid->Barrier();
@@ -242,16 +251,30 @@ int main (int argc, char ** argv)
    for(int mu=0;mu<Nd;mu++){
      //    ref =  src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
-      tmp = U[mu]*Cshift(src,mu+1,1);
+      tmp = Cshift(src,mu+1,1);
      {
 	autoView( ref_v, ref, CpuWrite);
 	autoView( tmp_v, tmp, CpuRead);
-	for(int i=0;i<ref_v.size();i++){
+	autoView( U_v  , U[mu]  , CpuRead);
-	  ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
+	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
 	  for(int s=0;s<Ls;s++){
 	    int i=s+Ls*ss;
 	    ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); ;
 	  }
 	}
      }
-
+      
-      tmp =adj(U[mu])*src;
+      {
 	autoView( tmp_v  , tmp  , CpuWrite);
 	autoView( U_v  , U[mu]  , CpuRead);
 	autoView( src_v, src    , CpuRead);
 	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
 	  for(int s=0;s<Ls;s++){
 	    tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
 	  }
 	}
      }
      //      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu+1,-1);
      {
 	autoView( ref_v, ref, CpuWrite);
@@ -184,8 +184,10 @@ int main (int argc, char ** argv)
      double bytes=1.0*vol*Nvec*sizeof(Real);
      double flops=vol*Nvec*2;// mul,add
-      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"<<(stop-start)/1000./1000.<< "\t\t " <<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"
-
+	       <<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"
 	       <<(stop-start)/1000./1000.<< "\t\t " <<std::endl;
      assert(nn==nn);
  }    
  Grid_finalize();
@@ -4,7 +4,7 @@ using namespace Grid;
 template<class Field>
 void SimpleConjugateGradient(LinearOperatorBase<Field> &HPDop,const Field &b, Field &x)
 {
-    RealD cp, c, alpha, d, beta, ssq, qq;
+    RealD cp, c, alpha, d, beta, ssq;
    RealD Tolerance=1.0e-10;
    int MaxIterations=10000;
@@ -0,0 +1,539 @@
 /*
 * Warning: This code is illustrative only: it is not well tested, and is not meant for
 * production use without regression tests being applied
 */
 #include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
 typedef SpinColourMatrix Propagator;
 typedef SpinColourVector Fermion;
 typedef PeriodicGimplR   GimplR;
 // Gauge-covariant Laplacian built from covariant shifts, exposed through the
 // SparseMatrixBase interface. It keeps its own copy of the gauge field passed
 // to the constructor. M() sums over directions mu = 0 .. Nd-2 only, i.e. the
 // last direction is excluded.
 template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
 {
 public:
  INHERIT_GIMPL_TYPES(Gimpl);
  GridBase *grid; // grid of the gauge field supplied at construction
  GaugeField U;   // private copy of the gauge field
  CovariantLaplacianCshift(GaugeField &_U)    :
    grid(_U.Grid()),
    U(_U) {  };
  virtual GridBase *Grid(void) { return grid; };
  // out = sum_{mu < Nd-1} [ 2*in - CovShiftForward(U_mu,mu,in) - CovShiftBackward(U_mu,mu,in) ]
  virtual void  M    (const Field &in, Field &out)
  {
    out=Zero();
    for(int mu=0;mu<Nd-1;mu++) {
      GaugeLinkField Umu = PeekIndex<LorentzIndex>(U, mu); // NB: Inefficient (link extracted on every call)
      out = out - Gimpl::CovShiftForward(Umu,mu,in);    
      out = out - Gimpl::CovShiftBackward(Umu,mu,in);    
      out = out + 2.0*in;
    }
  };
  virtual void  Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian
  virtual  void Mdiag    (const Field &in, Field &out)                  {assert(0);}; // Unimplemented need only for multigrid
  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid
  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)     {assert(0);}; // Unimplemented need only for multigrid
 };
 // Fill `phase` with exp( i * sum_mu (2*pi/L_mu) * mom[mu] * x_mu ), where L_mu are
 // the global lattice dimensions of phase's grid and x_mu is the field filled in by
 // LatticeCoordinate for direction mu.
 void MakePhase(Coordinate mom,LatticeComplex &phase)
 {
  GridBase *lattice = phase.Grid();
  auto dims = lattice->GlobalDimensions();
  ComplexD imag_unit(0.0,1.0);
  LatticeComplex site_coor(phase.Grid());

  // Accumulate the real exponent direction by direction.
  phase = Zero();
  for(int dir=0; dir<Nd; ++dir){
    LatticeCoordinate(site_coor,dir);
    RealD unit_momentum = M_PI * 2.0/ dims[dir];
    phase = phase + (unit_momentum * mom[dir]) * site_coor;
  }

  phase = exp(phase*imag_unit);
 }
 // Apply `nstep` successive Smear_Stout steps (parameter `rho`) to Uin and
 // return the result in Usmr. Uin itself is not modified.
 // For nstep <= 0 no smearing is applied and Usmr is set to a copy of Uin
 // (previously Usmr was left unassigned in that case).
 void LinkSmear(int nstep, RealD rho,LatticeGaugeField &Uin,LatticeGaugeField &Usmr)
 {
  Smear_Stout<GimplR> Stout(rho);
  LatticeGaugeField Utmp(Uin.Grid());
  Utmp = Uin;
  Usmr = Uin; // well-defined output even when the loop below does not execute
  for(int i=0;i<nstep;i++){
    Stout.smear(Usmr,Utmp); // Usmr <- smear(Utmp)
    Utmp = Usmr;            // feed the result into the next step
  }
 }
 // Build a point source: `source` is zeroed everywhere and a spin-colour
 // matrix assigned from 1.0 is poked in at site `coor`.
 void PointSource(Coordinate &coor,LatticePropagator &source)
 {
  SpinColourMatrix identity;
  identity = 1.0;

  source = Zero();
  pokeSite(identity,source,coor);
 }
 void GFWallSource(int tslice,LatticePropagator &source)
 {
  GridBase *grid = source.Grid();
  LatticeComplex one(grid); one = ComplexD(1.0,0.0);
  LatticeComplex zz(grid); zz=Zero();
  LatticeInteger t(grid);
  LatticeCoordinate(t,Tdir);
  one = where(t==Integer(tslice), one, zz);
  source = 1.0;
  source = source * one;
 }
 // Noise wall source: draws a bernoulli field from RNG, maps it with
 // (2*noise - (1+i))/sqrt(2), restricts it to time slice `tslice` and multiplies
 // it onto a propagator set from 1.0. Prints norm2 of the result.
 void Z2WallSource(GridParallelRNG &RNG,int tslice,LatticePropagator &source)
 {
  GridBase *lattice = source.Grid();
  RealD normalisation = 1.0/sqrt(2);

  // Time coordinate of every site, used for the slice mask below.
  LatticeInteger tcoor(lattice);
  LatticeCoordinate(tcoor,Tdir);

  LatticeComplex zeroes(lattice);
  zeroes = Zero();

  LatticeComplex z2(lattice);
  bernoulli(RNG, z2); // 0,1 50:50
  z2 = (2.*z2 - Complex(1,1))*normalisation;
  z2 = where(tcoor==Integer(tslice), z2, zeroes);

  source = 1.0;
  source = source*z2;
  std::cout << " Z2 wall " << norm2(source) << std::endl;
 }
 // Gauge fix a copy of U into Ufix using the Fourier-accelerated steepest descent
 // fixer with orthogonal direction Nd-1, printing the average plaquette before and after.
 void GaugeFix(LatticeGaugeField &U,LatticeGaugeField &Ufix)
 {
  Real plaquette = WilsonLoops<GimplR>::avgPlaquette(U);
  std::cout << " Initial plaquette " << plaquette << std::endl;

  Ufix = U;

  // Fixer inputs: step size, gauge transformation workspace, excluded direction.
  Real step_size = 0.05;
  int  excluded_dir = Nd-1;
  LatticeColourMatrix gauge_xform(U.Grid());
  FourierAcceleratedGaugeFixer<GimplR>::SteepestDescentGaugeFix(Ufix,gauge_xform,step_size,100000,1.0e-14, 1.0e-14,true,excluded_dir);

  plaquette = WilsonLoops<GimplR>::avgPlaquette(Ufix);
  std::cout << " Final plaquette " << plaquette << std::endl;
 }
 // Iterative smearing with the covariant Laplacian built from U:
 // starting from smeared = unsmeared, applies
 //   smeared <- smeared - coeff * Laplacian(smeared)
 // 40 times with coeff = width^2 / (4*Iterations), width = 2.0,
 // printing norm2(smeared) after each iteration.
 template<class Field>
 void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
 {
  using Laplacian_t = CovariantLaplacianCshift <GimplR,Field>;

  Integer Iterations = 40;
  Real smear_width = 2.0;
  Real step = (smear_width*smear_width) / Real(4*Iterations);

  Laplacian_t Lap(U);
  Field lap_of_field(U.Grid());

  smeared = unsmeared;
  //  chi = (1-p^2/2N)^N kronecker
  for(int iter = 0; iter < Iterations; ++iter) {
    Lap.M(smeared,lap_of_field);
    smeared = smeared - step*lap_of_field;
    std::cout << " smear iter " << iter<<" " <<norm2(smeared)<<std::endl;
  }
 }
 // Smeared point source: places a point source at `site`, then runs
 // GaussianSmear on a copy of it, writing the result back into `source`.
 // norm2 is printed before and after smearing.
 void GaussianSource(Coordinate &site,LatticeGaugeField &U,LatticePropagator &source)
 {
  PointSource(site,source);
  std::cout << " GaussianSource Kronecker "<< norm2(source)<<std::endl;

  // GaussianSmear needs distinct input and output fields.
  LatticePropagator point(source.Grid());
  point = source;
  GaussianSmear(U,point,source);
  std::cout << " GaussianSource Smeared "<< norm2(source)<<std::endl;
 }
 // Smeared noise wall: generates a Z2 wall source on `tslice` and replaces it
 // by the GaussianSmear of a copy of itself.
 void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,LatticePropagator &source)
 {
  Z2WallSource(RNG,tslice,source);
  LatticePropagator unsmeared = source; // copy: smearing input and output must differ
  GaussianSmear(U,unsmeared,source);
 }
 void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
 {
  assert(mom.size()==Nd);
  assert(mom[Tdir] == 0);
  GridBase * grid = spectator.Grid();
  LatticeInteger ts(grid);
  LatticeCoordinate(ts,Tdir);
  source = Zero();
  source = where(ts==Integer(tslice),spectator,source); // Stick in a slice of the spectator, zero everywhere else
  LatticeComplex phase(grid);
  MakePhase(mom,phase);
  source = source *phase;
 }
 template<class Action>
 void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
 {
  GridBase *UGrid = D.GaugeGrid();
  GridBase *FGrid = D.FermionGrid();
  LatticeFermion src4  (UGrid); 
  LatticeFermion src5  (FGrid); 
  LatticeFermion result5(FGrid);
  LatticeFermion result4(UGrid);
  ConjugateGradient<LatticeFermion> CG(1.0e-12,100000);
  SchurRedBlackDiagTwoSolve<LatticeFermion> schur(CG);
  ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
  for(int s=0;s<Nd;s++){
    for(int c=0;c<Nc;c++){
      PropToFerm<Action>(src4,source,s,c);
      D.ImportPhysicalFermionSource(src4,src5);
      result5=Zero();
      schur(D,src5,result5,ZG);
      std::cout<<GridLogMessage
 	       <<"spin "<<s<<" color "<<c
 	       <<" norm2(src5d) "   <<norm2(src5)
               <<" norm2(result5d) "<<norm2(result5)<<std::endl;
      D.ExportPhysicalFermionSolution(result5,result4);
      FermToProp<Action>(propagator,result4,s,c);
    }
  }
 }
 // Serializable record of meson correlators written by the trace routines in this file.
 // `data` holds one std::vector<Complex> per channel, appended in channel order.
 class MesonFile: Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector<std::vector<Complex> >, data);
 };
 void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
 {
  const int nchannel=4;
  Gamma::Algebra Gammas[nchannel][2] = {
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::Gamma5},
    {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::GammaTGamma5},
    {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::Gamma5},
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::GammaTGamma5}
  };
  Gamma G5(Gamma::Algebra::Gamma5);
  LatticeComplex meson_CF(q1.Grid());
  MesonFile MF;
  for(int ch=0;ch<nchannel;ch++){
    Gamma Gsrc(Gammas[ch][0]);
    Gamma Gsnk(Gammas[ch][1]);
    meson_CF = trace(G5*adj(q1)*G5*Gsnk*q2*adj(Gsrc));
    std::vector<TComplex> meson_T;
    sliceSum(meson_CF,meson_T, Tdir);
    int nt=meson_T.size();
    std::vector<Complex> corr(nt);
    for(int t=0;t<nt;t++){
      corr[t] = TensorRemove(meson_T[t]); // Yes this is ugly, not figured a work around
      std::cout << " channel "<<ch<<" t "<<t<<" " <<corr[t]<<std::endl;
    }
    MF.data.push_back(corr);
  }
  {
    XmlWriter WR(file);
    write(WR,"MesonFile",MF);
  }
 }
 void Meson3pt(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
 {
  const int nchannel=4;
  Gamma::Algebra Gammas[nchannel][2] = {
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::GammaX},
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::GammaY},
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::GammaZ},
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::GammaT}
  };
  Gamma G5(Gamma::Algebra::Gamma5);
  LatticeComplex meson_CF(q1.Grid());
  MesonFile MF;
  for(int ch=0;ch<nchannel;ch++){
    Gamma Gsrc(Gammas[ch][0]);
    Gamma Gsnk(Gammas[ch][1]);
    meson_CF = trace(G5*adj(q1)*G5*Gsnk*q2*adj(Gsrc));
    std::vector<TComplex> meson_T;
    sliceSum(meson_CF,meson_T, Tdir);
    int nt=meson_T.size();
    std::vector<Complex> corr(nt);
    for(int t=0;t<nt;t++){
      corr[t] = TensorRemove(meson_T[t]); // Yes this is ugly, not figured a work around
      std::cout << " channel "<<ch<<" t "<<t<<" " <<corr[t]<<std::endl;
    }
    MF.data.push_back(corr);
  }
  {
    XmlWriter WR(file);
    write(WR,"MesonFile",MF);
  }
 }
 void WallSinkMesonTrace(std::string file,std::vector<Propagator> &q1,std::vector<Propagator> &q2)
 {
  const int nchannel=4;
  Gamma::Algebra Gammas[nchannel][2] = {
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::Gamma5},
    {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::GammaTGamma5},
    {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::Gamma5},
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::GammaTGamma5}
  };
  Gamma G5(Gamma::Algebra::Gamma5);
  int nt=q1.size();
  std::vector<Complex> meson_CF(nt);
  MesonFile MF;
  for(int ch=0;ch<nchannel;ch++){
    Gamma Gsrc(Gammas[ch][0]);
    Gamma Gsnk(Gammas[ch][1]);
    std::vector<Complex> corr(nt);
    for(int t=0;t<nt;t++){
      meson_CF[t] = trace(G5*adj(q1[t])*G5*Gsnk*q2[t]*adj(Gsrc));
      corr[t] = TensorRemove(meson_CF[t]); // Yes this is ugly, not figured a work around
      std::cout << " channel "<<ch<<" t "<<t<<" " <<corr[t]<<std::endl;
    }
    MF.data.push_back(corr);
  }
  {
    XmlWriter WR(file);
    write(WR,"MesonFile",MF);
  }
 }
 // Flatten a (momentum, mass) pair into a single propagator index.
 //   m == 0 : index is the momentum index p (0 .. nmom-1)
 //   m != 0 : only p == 0 is allowed (asserted); index is nmom + m - 1
 int make_idx(int p, int m,int nmom)
 {
  if (m != 0) {
    assert(p==0);
    return nmom + m - 1;
  }
  return p;
 }
 // Example driver: Mobius domain wall spectrum and sequential-source contractions.
 //  1. Load a NERSC configuration named by argv[1] (when it does not start with '-'),
 //     otherwise use a cold configuration.
 //  2. Stout smear the links and build one Mobius action per quark mass.
 //  3. Solve Z2-wall and wall sources for each momentum (lightest mass) and each mass.
 //  4. Write point-sink and wall-sink two-point functions to XML files.
 //  5. Perform sequential solves through the heaviest mass and write the
 //     resulting contractions to XML files.
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);

  // Double precision grids
  auto latt = GridDefaultLatt();
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                                                   GridDefaultSimd(Nd,vComplex::Nsimd()),
                                                                   GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);

  LatticeGaugeField Umu(UGrid);
  LatticeGaugeField Utmp(UGrid);
  LatticeGaugeField Usmr(UGrid);
  std::string config;
  if( argc > 1 && argv[1][0] != '-' )
  {
    std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl;
    FieldMetaData header;
    NerscIO::readConfiguration(Umu, header, argv[1]);
    config=argv[1];
  }
  else
  {
    // The message now matches the configuration that is actually generated.
    std::cout<<GridLogMessage <<"Using cold configuration"<<std::endl;
    SU<Nc>::ColdConfiguration(Umu);
    config="ColdConfig";
  }
  //  GaugeFix(Umu,Utmp);
  //  Umu=Utmp;

  int nsmr=3;
  RealD rho=0.1;
  LinkSmear(nsmr,rho,Umu,Usmr);

  // Per-quark parameters; entry m of every vector belongs to the same quark.
  std::vector<int>   smeared_link({ 0,0,1} ); 
  std::vector<RealD> masses({ 0.004,0.02477,0.447} ); // u/d, s, c ??
  std::vector<RealD> M5s   ({ 1.8,1.8,1.0} ); 
  std::vector<RealD> bs   ({ 1.0,1.0,1.5} );  // DDM
  std::vector<RealD> cs   ({ 0.0,0.0,0.5} );  // DDM
  std::vector<int>   Ls_s ({ 16,16,12} );
  std::vector<GridCartesian *> FGrids;
  std::vector<GridRedBlackCartesian *> FrbGrids;

  std::vector<Coordinate> momenta;
  momenta.push_back(Coordinate({0,0,0,0}));
  momenta.push_back(Coordinate({1,0,0,0}));
  momenta.push_back(Coordinate({2,0,0,0}));

  int nmass = masses.size();
  int nmom  = momenta.size();
  std::vector<MobiusFermionR *> FermActs;

  std::cout<<GridLogMessage <<"======================"<<std::endl;
  std::cout<<GridLogMessage <<"MobiusFermion action as Scaled Shamir kernel"<<std::endl;
  std::cout<<GridLogMessage <<"======================"<<std::endl;
  std::vector<Complex> boundary = {1,1,1,-1};
  typedef MobiusFermionR FermionAction;
  FermionAction::ImplParams Params(boundary);

  // One action (and its 5d grids) per mass; the grids and actions live until program exit.
  for(int m=0;m<nmass;m++) {
    RealD mass = masses[m];
    RealD M5   = M5s[m];
    RealD b    = bs[m];
    RealD c    = cs[m];
    int   Ls   = Ls_s[m];
    if ( smeared_link[m] ) Utmp = Usmr;
    else                   Utmp = Umu;
    FGrids.push_back(SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid));
    FrbGrids.push_back(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid));
    FermActs.push_back(new MobiusFermionR(Utmp,*FGrids[m],*FrbGrids[m],*UGrid,*UrbGrid,mass,M5,b,c,Params));
  }

  LatticePropagator z2wall_source(UGrid);
  LatticePropagator gfwall_source(UGrid);
  LatticePropagator phased_prop(UGrid);
  int tslice = 0;
  int tseq=(tslice+16)%latt[Nd-1];
  //////////////////////////////////////////////////////////////////////
  // RNG seeded for Z2 wall
  //////////////////////////////////////////////////////////////////////
  // You can manage seeds however you like.
  // Recommend SeedUniqueString.
  //////////////////////////////////////////////////////////////////////
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString("Study2-Source_Z2_p_0_0_0_t_0-880");
  Z2WallSource  (RNG4,tslice,z2wall_source);
  GFWallSource  (tslice,gfwall_source);

  std::vector<LatticeComplex> phase(nmom,UGrid);
  for(int m=0;m<nmom;m++){
    MakePhase(momenta[m],phase[m]);
  }

  // Propagator storage, indexed with make_idx(p,m,nmom):
  //   0 .. nmom-1           : mass 0 at momentum p
  //   nmom .. nmom+nmass-2  : masses 1 .. nmass-1 at zero momentum
  std::vector<LatticePropagator> Z2Props   (nmom+nmass-1,UGrid);
  std::vector<LatticePropagator> GFProps   (nmom+nmass-1,UGrid);
  for(int p=0;p<nmom;p++) {
    int m=0;
    int idx = make_idx(p,m,nmom);
    phased_prop = z2wall_source * phase[p];
    Solve(*FermActs[m],phased_prop  ,Z2Props[idx]);
    phased_prop = gfwall_source * phase[p];
    Solve(*FermActs[m],phased_prop  ,GFProps[idx]);
  }
  for(int m=1;m<nmass;m++) {
    int p=0;
    int idx = make_idx(p,m,nmom);
    phased_prop = z2wall_source;
    Solve(*FermActs[m],phased_prop  ,Z2Props[idx]);
    phased_prop = gfwall_source;
    Solve(*FermActs[m],phased_prop  ,GFProps[idx]);
  }

  std::vector<std::vector<Propagator> > wsnk_z2Props(nmom+nmass-1);
  std::vector<std::vector<Propagator> > wsnk_gfProps(nmom+nmass-1);
  // Non-zero kaon and point and D two point
  // WW stick momentum on m1 (lighter)
  //     zero momentum on m2
  for(int m1=0;m1<nmass;m1++) {
  for(int m2=m1;m2<nmass;m2++) {
    int pmax = (m1==0)? nmom:1;
    for(int p=0;p<pmax;p++){
      std::stringstream ssg,ssz;
      std::stringstream wssg,wssz;
      int idx1 = make_idx(p,m1,nmom);
      int idx2 = make_idx(0,m2,nmom);
      /// Point sinks
      ssg<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_p_gf_meson.xml";
      ssz<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_p_z2_meson.xml";
      MesonTrace(ssz.str(),Z2Props[idx1],Z2Props[idx2],phase[p]); // Q1 is conjugated
      MesonTrace(ssg.str(),GFProps[idx1],GFProps[idx2],phase[p]); 
      /// Wall sinks
      // NOTE(review): the wall-sink code below indexes the propagator arrays with the
      // mass indices m1/m2 rather than idx1/idx2, and for m1==m2 the second sliceSum
      // overwrites the first. Both look unintended -- confirm the intended contraction.
      wssg<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_w_gf_meson.xml";
      wssz<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_w_z2_meson.xml";
      phased_prop = GFProps[m2] * phase[p];
      sliceSum(phased_prop,wsnk_gfProps[m1],Tdir);
      sliceSum(GFProps[m1],wsnk_gfProps[m2],Tdir);
      WallSinkMesonTrace(wssg.str(),wsnk_gfProps[m1],wsnk_gfProps[m2]);
      // Z2 slice sums go into the Z2 containers that are traced just below
      // (they were previously written into the gf containers, leaving these empty).
      phased_prop = Z2Props[m2] * phase[p];
      sliceSum(phased_prop,wsnk_z2Props[m1],Tdir);
      sliceSum(Z2Props[m1],wsnk_z2Props[m2],Tdir);
      WallSinkMesonTrace(wssz.str(),wsnk_z2Props[m1],wsnk_z2Props[m2]);
    }
  }}

  /////////////////////////////////////
  // Sequential solves
  /////////////////////////////////////
  LatticePropagator  seq_wsnk_z2src(UGrid);
  LatticePropagator  seq_wsnk_gfsrc(UGrid);
  LatticePropagator  seq_psnk_z2src(UGrid);
  LatticePropagator  seq_psnk_gfsrc(UGrid);
  LatticePropagator source(UGrid);
  for(int m=0;m<nmass-1;m++){
    int spect_idx = make_idx(0,m,nmom);
    int charm=nmass-1; // the last mass entry is used for the sequential solve
    SequentialSource(tseq,momenta[0],GFProps[spect_idx],source);
    Solve(*FermActs[charm],source,seq_psnk_gfsrc);
    SequentialSource(tseq,momenta[0],Z2Props[spect_idx],source);
    Solve(*FermActs[charm],source,seq_psnk_z2src);
    // Todo need wall sequential solve
    for(int p=0;p<nmom;p++){
      int active_idx = make_idx(p,0,nmom);
      std::stringstream seq_3pt_p_z2;
      std::stringstream seq_3pt_p_gf;
      std::stringstream seq_3pt_w_z2;
      std::stringstream seq_3pt_w_gf;
      seq_3pt_p_z2  <<config<<"_3pt_p"<<p<< "_m" << m << "_p_z2_meson.xml";
      seq_3pt_p_gf  <<config<<"_3pt_p"<<p<< "_m" << m << "_p_gf_meson.xml";
      seq_3pt_w_z2  <<config<<"_3pt_p"<<p<< "_m" << m << "_w_z2_meson.xml";
      seq_3pt_w_gf  <<config<<"_3pt_p"<<p<< "_m" << m << "_w_gf_meson.xml";
      Meson3pt(seq_3pt_p_gf.str(),GFProps[active_idx],seq_psnk_gfsrc,phase[p]);
      Meson3pt(seq_3pt_p_z2.str(),Z2Props[active_idx],seq_psnk_z2src,phase[p]);
    }    
  }
  Grid_finalize();
 }
@@ -9,6 +9,7 @@ using namespace std;
 using namespace Grid;
 typedef SpinColourMatrix Propagator;
 typedef SpinColourVector Fermion;
 typedef PeriodicGimplR   GimplR;
 template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
 {
@@ -55,6 +56,16 @@ void MakePhase(Coordinate mom,LatticeComplex &phase)
  }
  phase = exp(phase*ci);
 }
 // Apply `nstep` successive Smear_Stout steps (parameter `rho`) to Uin and
 // return the result in Usmr. Uin itself is not modified.
 // For nstep <= 0 no smearing is applied and Usmr is set to a copy of Uin
 // (previously Usmr was left unassigned in that case).
 void LinkSmear(int nstep, RealD rho,LatticeGaugeField &Uin,LatticeGaugeField &Usmr)
 {
  Smear_Stout<GimplR> Stout(rho);
  LatticeGaugeField Utmp(Uin.Grid());
  Utmp = Uin;
  Usmr = Uin; // well-defined output even when the loop below does not execute
  for(int i=0;i<nstep;i++){
    Stout.smear(Usmr,Utmp); // Usmr <- smear(Utmp)
    Utmp = Usmr;            // feed the result into the next step
  }
 }
 void PointSource(Coordinate &coor,LatticePropagator &source)
 {
  //  Coordinate coor({0,0,0,0});
@@ -97,23 +108,23 @@ void GaugeFix(LatticeGaugeField &U,LatticeGaugeField &Ufix)
 {
  Real alpha=0.05;
-  Real plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(U);
+  Real plaq=WilsonLoops<GimplR>::avgPlaquette(U);
  std::cout << " Initial plaquette "<<plaq << std::endl;
  LatticeColourMatrix   xform(U.Grid()); 
  Ufix = U;
  int orthog=Nd-1;
-  FourierAcceleratedGaugeFixer<PeriodicGimplR>::SteepestDescentGaugeFix(Ufix,xform,alpha,10000,1.0e-12, 1.0e-12,true,orthog);
+  FourierAcceleratedGaugeFixer<GimplR>::SteepestDescentGaugeFix(Ufix,xform,alpha,100000,1.0e-14, 1.0e-14,true,orthog);
-  plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Ufix);
+  plaq=WilsonLoops<GimplR>::avgPlaquette(Ufix);
  std::cout << " Final plaquette "<<plaq << std::endl;
 }
 template<class Field>
 void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
 {
-  typedef CovariantLaplacianCshift <PeriodicGimplR,Field> Laplacian_t;
+  typedef CovariantLaplacianCshift <GimplR,Field> Laplacian_t;
  Laplacian_t Laplacian(U);
  Integer Iterations = 40;
@@ -167,19 +178,21 @@ void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
  GridBase *UGrid = D.GaugeGrid();
  GridBase *FGrid = D.FermionGrid();
-  LatticeFermion src4  (UGrid); 
+  LatticeFermion src4  (UGrid); src4 = Zero();
  LatticeFermion src5  (FGrid); 
  LatticeFermion result5(FGrid);
  LatticeFermion result4(UGrid);
-  ConjugateGradient<LatticeFermion> CG(1.0e-8,100000);
+  ConjugateGradient<LatticeFermion> CG(1.0e-12,100000);
-  SchurRedBlackDiagMooeeSolve<LatticeFermion> schur(CG);
+  SchurRedBlackDiagTwoSolve<LatticeFermion> schur(CG);
  ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
  std::cout<<GridLogMessage<< " source4 "<<norm2(source)<<std::endl;
  for(int s=0;s<Nd;s++){
    for(int c=0;c<Nc;c++){
      PropToFerm<Action>(src4,source,s,c);
-
+      std::cout<<GridLogMessage<< s<<c<<" src4 "<<norm2(src4)<<std::endl;
      D.ImportPhysicalFermionSource(src4,src5);
      std::cout<<GridLogMessage<< s<<c<<" src5 "<<norm2(src5)<<std::endl;
      result5=Zero();
      schur(D,src5,result5,ZG);
@@ -287,15 +300,10 @@ int main (int argc, char ** argv)
 								   GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  //////////////////////////////////////////////////////////////////////
  // You can manage seeds however you like.
  // Recommend SeedUniqueString.
  //////////////////////////////////////////////////////////////////////
  std::vector<int> seeds4({1,2,3,4}); 
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
  LatticeGaugeField Umu(UGrid);
-  LatticeGaugeField Ufixed(UGrid);
+  LatticeGaugeField Utmp(UGrid);
  LatticeGaugeField Usmr(UGrid);
  std::string config;
  if( argc > 1 && argv[1][0] != '-' )
  {
@@ -308,13 +316,20 @@ int main (int argc, char ** argv)
  {
    std::cout<<GridLogMessage <<"Using hot configuration"<<std::endl;
    SU<Nc>::ColdConfiguration(Umu);
-    //    SU<Nc>::HotConfiguration(RNG4,Umu);
+    config="ColdConfig";
    config="HotConfig";
  }
-  GaugeFix(Umu,Ufixed);
+  //  GaugeFix(Umu,Utmp);
-  Umu=Ufixed;
+  //  Umu=Utmp;
  int nsmr=3;
  RealD rho=0.1;
  RealD plaq_gf =WilsonLoops<GimplR>::avgPlaquette(Umu);
  LinkSmear(nsmr,rho,Umu,Usmr);
  RealD plaq_smr=WilsonLoops<GimplR>::avgPlaquette(Usmr);
  std::cout << GridLogMessage << " GF Plaquette " <<plaq_gf<<std::endl;
  std::cout << GridLogMessage << " SM Plaquette " <<plaq_smr<<std::endl;
  std::vector<int>   smeared_link({ 0,0,1} ); 
  std::vector<RealD> masses({ 0.004,0.02477,0.447} ); // u/d, s, c ??
  std::vector<RealD> M5s   ({ 1.8,1.8,1.0} ); 
  std::vector<RealD> bs   ({ 1.0,1.0,1.5} );  // DDM
@@ -330,6 +345,9 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage <<"======================"<<std::endl;
  std::cout<<GridLogMessage <<"MobiusFermion action as Scaled Shamir kernel"<<std::endl;
  std::cout<<GridLogMessage <<"======================"<<std::endl;
  std::vector<Complex> boundary = {1,1,1,-1};
  typedef MobiusFermionR FermionAction;
  FermionAction::ImplParams Params(boundary);
  for(int m=0;m<masses.size();m++) {
@@ -339,30 +357,40 @@ int main (int argc, char ** argv)
    RealD c    = cs[m];
    int   Ls   = Ls_s[m];
    if ( smeared_link[m] ) Utmp = Usmr;
    else                   Utmp = Umu;
    FGrids.push_back(SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid));
    FrbGrids.push_back(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid));
-    FermActs.push_back(new MobiusFermionR(Umu,*FGrids[m],*FrbGrids[m],*UGrid,*UrbGrid,mass,M5,b,c));
+    FermActs.push_back(new MobiusFermionR(Utmp,*FGrids[m],*FrbGrids[m],*UGrid,*UrbGrid,mass,M5,b,c,Params));
  }
  LatticePropagator point_source(UGrid);
  LatticePropagator z2wall_source(UGrid);
  LatticePropagator gfwall_source(UGrid);
-  Coordinate Origin({0,0,0,0});
+  int tslice = 0;
-  PointSource   (Origin,point_source);
+  //////////////////////////////////////////////////////////////////////
-  Z2WallSource  (RNG4,0,z2wall_source);
+  // RNG seeded for Z2 wall
-  GFWallSource  (0,gfwall_source);
+  //////////////////////////////////////////////////////////////////////
-  
+  // You can manage seeds however you like.
-  std::vector<LatticePropagator> PointProps(nmass,UGrid);
+  // Recommend SeedUniqueString.
-  std::vector<LatticePropagator> GaussProps(nmass,UGrid);
+  //////////////////////////////////////////////////////////////////////
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString("Study2-Source_Z2_p_0_0_0_t_0-880");
  Z2WallSource  (RNG4,tslice,z2wall_source);
  GFWallSource  (tslice,gfwall_source);
  std::vector<LatticePropagator> Z2Props   (nmass,UGrid);
  std::vector<LatticePropagator> GFProps   (nmass,UGrid);
  for(int m=0;m<nmass;m++) {
    std::cout << GridLogMessage << " Mass " <<m << " z2wall source "<<norm2(z2wall_source)<<std::endl;
    Solve(*FermActs[m],z2wall_source    ,Z2Props[m]);
    std::cout << GridLogMessage << " Mass " <<m << " gfwall source "<<norm2(gfwall_source)<<std::endl;
    Solve(*FermActs[m],gfwall_source    ,GFProps[m]);
    std::cout << GridLogMessage << " Mass " <<m << " z2wall source "<<norm2(z2wall_source)<< " " << norm2(gfwall_source)<<std::endl;
  }
@@ -383,14 +411,15 @@ int main (int argc, char ** argv)
    std::stringstream wssg,wssz;
    /// Point sinks
-    ssg<<config<< "_m" << m1 << "_m"<< m2 << "p_gf_meson.xml";
+    ssg<<config<< "_m" << m1 << "_m"<< m2 << "_p_gf_meson.xml";
-    ssz<<config<< "_m" << m1 << "_m"<< m2 << "p_z2_meson.xml";
+    ssz<<config<< "_m" << m1 << "_m"<< m2 << "_p_z2_meson.xml";
    MesonTrace(ssz.str(),Z2Props[m1],Z2Props[m2],phase);
    MesonTrace(ssg.str(),GFProps[m1],GFProps[m2],phase);
    /// Wall sinks
-    wssg<<config<< "_m" << m1 << "_m"<< m2 << "w_gf_meson.xml";
+    wssg<<config<< "_m" << m1 << "_m"<< m2 << "_w_gf_meson.xml";
-    wssz<<config<< "_m" << m1 << "_m"<< m2 << "w_z2_meson.xml";
+    wssz<<config<< "_m" << m1 << "_m"<< m2 << "_w_z2_meson.xml";
    WallSinkMesonTrace(wssg.str(),wsnk_gfProps[m1],wsnk_gfProps[m2]);
    WallSinkMesonTrace(wssz.str(),wsnk_z2Props[m1],wsnk_z2Props[m2]);
@@ -0,0 +1,12 @@
 # Configure invocation for a HIP accelerator build with MPI comms, run from a
 # build directory two levels below the configure script (../../configure).
 # Reads MPICH_DIR and CRAY_MPICH_ROOTDIR from the environment.
 ../../configure --enable-comms=mpi-auto \
 --enable-unified=no \
 --enable-shm=nvlink \
 --enable-accelerator=hip \
 --enable-gen-simd-width=64 \
 --enable-simd=GPU \
 --disable-fermion-reps \
 --disable-gparity \
 CXX=hipcc MPICXX=mpicxx \
 CXXFLAGS="-fPIC -I/opt/rocm-4.5.0/include/ -std=c++14 -I${MPICH_DIR}/include " \
 LDFLAGS=" -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa "
 # NOTE(review): the line below is NOT part of the configure command (the LDFLAGS
 # line above has no trailing backslash). Because of the spaces around '=', it is
 # not a shell assignment either: executed as a script it tries to run a command
 # named HIPFLAGS. Confirm whether it is a reminder for manual use or whether the
 # flag should be passed to the build some other way.
 HIPFLAGS = --amdgpu-target=gfx90a
@@ -0,0 +1,30 @@
 #!/bin/bash
 # Batch job: runs ./benchmarks/Benchmark_dwf_fp32 as a single task with one GPU
 # on a 24.24.24.24 grid (see PARAMS and the srun line at the bottom).
 # Begin Slurm (sbatch) directives; lines starting with "##" are disabled.
 #SBATCH -A LGT104
 #SBATCH -t 01:00:00
 ##SBATCH -U openmpThu
 ##SBATCH -p ecp
 #SBATCH -J DWF
 #SBATCH -o DWF.%J
 #SBATCH -e DWF.%J
 #SBATCH -N 1
 #SBATCH -n 1
 #SBATCH --exclusive  
 # NOTE(review): DIR is assigned but not referenced anywhere in this script.
 DIR=.
 module list
 # Alternative MPI settings are kept as commented-out exports for quick switching.
 #export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
 export MPICH_GPU_SUPPORT_ENABLED=1
 export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
 #export MPICH_SMP_SINGLE_COPY_MODE=NONE
 #export MPICH_SMP_SINGLE_COPY_MODE=CMA
 export OMP_NUM_THREADS=1
 # AT is forwarded to the benchmark as --accelerator-threads.
 AT=8
 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
 # PARAMS must stay unquoted on the srun line so it splits into separate arguments.
 PARAMS=" --accelerator-threads ${AT} --grid 24.24.24.24 --shm-mpi 0 --mpi 1.1.1.1"
 # This run calls the benchmark directly, without the mpiwrapper.sh device wrapper.
 srun --gpus-per-task 1 -n1 ./benchmarks/Benchmark_dwf_fp32 $PARAMS
@@ -0,0 +1,27 @@
 #!/bin/bash
 # Batch job: runs ./benchmarks/Benchmark_dwf_fp32 with 4 tasks on one node,
 # one GPU per task, on a 32.32.64.64 grid decomposed as 1.1.2.2.
 # Begin Slurm (sbatch) directives; lines starting with "##" are disabled.
 #SBATCH -A LGT104
 #SBATCH -t 01:00:00
 ##SBATCH -U openmpThu
 #SBATCH -J DWF
 #SBATCH -o DWF.%J
 #SBATCH -e DWF.%J
 #SBATCH -N 1
 #SBATCH -n 4
 #SBATCH --exclusive
 # NOTE(review): DIR is assigned but not referenced anywhere in this script.
 DIR=.
 module list
 export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
 export MPICH_GPU_SUPPORT_ENABLED=1
 # Alternative single-copy modes are kept as commented-out exports.
 #export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
 export MPICH_SMP_SINGLE_COPY_MODE=NONE
 #export MPICH_SMP_SINGLE_COPY_MODE=CMA
 export OMP_NUM_THREADS=4
 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
 # PARAMS must stay unquoted on the srun line so it splits into separate arguments.
 PARAMS=" --accelerator-threads 8 --grid 32.32.64.64 --mpi 1.1.2.2 --comms-overlap --shm 2048 --shm-mpi 0"
 # Each task is started through ./mpiwrapper.sh, which runs the command given after it.
 srun --gpus-per-task 1 -n4 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS
@@ -0,0 +1,27 @@
 #!/bin/bash
 # Batch job: runs ./benchmarks/Benchmark_dwf_fp32 with 8 tasks on one node,
 # one GPU per task, on a 32.64.64.64 grid decomposed as 1.2.2.2.
 # Begin Slurm (sbatch) directives; lines starting with "##" are disabled.
 #SBATCH -A LGT104
 #SBATCH -t 01:00:00
 ##SBATCH -U openmpThu
 #SBATCH -J DWF
 #SBATCH -o DWF.%J
 #SBATCH -e DWF.%J
 #SBATCH -N 1
 #SBATCH -n 8
 #SBATCH --exclusive
 # NOTE(review): DIR is assigned but not referenced anywhere in this script.
 DIR=.
 module list
 export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
 export MPICH_GPU_SUPPORT_ENABLED=1
 # Alternative single-copy modes are kept as commented-out exports.
 export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
 #export MPICH_SMP_SINGLE_COPY_MODE=NONE
 #export MPICH_SMP_SINGLE_COPY_MODE=CMA
 export OMP_NUM_THREADS=1
 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
 # PARAMS must stay unquoted on the srun line so it splits into separate arguments.
 PARAMS=" --accelerator-threads 8 --grid 32.64.64.64 --mpi 1.2.2.2 --comms-overlap --shm 2048 --shm-mpi 0"
 # Each task is started through ./mpiwrapper.sh, which runs the command given after it.
 srun --gpus-per-task 1 -n8 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS
@@ -0,0 +1,12 @@
 #!/bin/bash
 # Per-task launch wrapper: exports ROCR_VISIBLE_DEVICES set to the task's
 # SLURM_LOCALID, prints the selection, then runs the wrapped command.
 # Usage: mpiwrapper.sh <command> [args...]
 lrank=$SLURM_LOCALID
 export ROCR_VISIBLE_DEVICES=$SLURM_LOCALID
 # BINDING is not set in this script; it prints empty unless the environment provides it.
 echo "$(hostname) - $lrank device=$ROCR_VISIBLE_DEVICES binding=$BINDING"
 # "$@" hands every argument through unchanged; unquoted $* would re-split
 # arguments on whitespace and expand glob characters.
 "$@"
@@ -0,0 +1,5 @@
 # Loads the environment modules used with this build: GNU programming
 # environment, ROCm 4.5.0, GMP, Cray FFTW and the gfx90a accelerator target.
 module load PrgEnv-gnu
 module load rocm/4.5.0
 module load gmp
 module load cray-fftw
 module load craype-accel-amd-gfx90a
@@ -0,0 +1,26 @@
 #!/bin/bash
 # Batch job: runs ./benchmarks/Benchmark_comms_host_device with 2 tasks on one
 # node, one GPU per task, on a 64.64.32.32 grid decomposed as 2.1.1.1.
 # Begin Slurm (sbatch) directives; lines starting with "##" are disabled.
 #SBATCH -A LGT104
 #SBATCH -t 01:00:00
 ##SBATCH -U openmpThu
 #SBATCH -p ecp
 #SBATCH -J comms
 #SBATCH -o comms.%J
 #SBATCH -e comms.%J
 #SBATCH -N 1
 #SBATCH -n 2
 # NOTE(review): DIR is assigned but not referenced anywhere in this script.
 DIR=.
 module list
 export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
 export MPICH_GPU_SUPPORT_ENABLED=1
 # Alternative single-copy modes are kept as commented-out exports.
 #export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
 #export MPICH_SMP_SINGLE_COPY_MODE=CMA
 export MPICH_SMP_SINGLE_COPY_MODE=NONE
 # OMP_NUM_THREADS is also reused as the srun -c value below.
 export OMP_NUM_THREADS=8
 # AT is forwarded to the benchmark as --accelerator-threads.
 AT=8
 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
 # PARAMS must stay unquoted on the srun line so it splits into separate arguments.
 PARAMS=" --accelerator-threads ${AT} --grid 64.64.32.32 --mpi 2.1.1.1 "
 # Each task is started through ./mpiwrapper.sh, which runs the command given after it.
 srun -n2 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_comms_host_device $PARAMS
@@ -0,0 +1,12 @@
 # Configure invocation for a HIP accelerator build with MPI comms, run from a
 # build directory two levels below the configure script (../../configure).
 # Reads MPICH_DIR and CRAY_MPICH_ROOTDIR from the environment.
 # NOTE(review): --prefix points at one user's home directory; adjust before reuse.
 ../../configure --enable-comms=mpi-auto \
 --enable-unified=no \
 --enable-shm=nvlink \
 --enable-accelerator=hip \
 --enable-gen-simd-width=64 \
 --enable-simd=GPU \
 --disable-fermion-reps \
 --disable-gparity \
 CXX=hipcc MPICXX=mpicxx \
 CXXFLAGS="-fPIC -I/opt/rocm-4.3.0/include/ -std=c++14 -I${MPICH_DIR}/include " \
 --prefix=/ccs/home/chulwoo/Grid \
 LDFLAGS=" -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa "
@@ -0,0 +1,26 @@
 #!/bin/bash
 # Batch job: runs ./benchmarks/Benchmark_dwf_fp32 as a single task with one GPU
 # on a 32.32.32.32 grid.
 # Begin Slurm (sbatch) directives; lines starting with "##" are disabled.
 #SBATCH -A LGT104
 #SBATCH -t 01:00:00
 ##SBATCH -U openmpThu
 #SBATCH -p ecp
 #SBATCH -J DWF
 #SBATCH -o DWF.%J
 #SBATCH -e DWF.%J
 #SBATCH -N 1
 #SBATCH -n 1
 # NOTE(review): DIR is assigned but not referenced anywhere in this script.
 DIR=.
 module list
 export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
 export MPICH_GPU_SUPPORT_ENABLED=1
 # Alternative single-copy modes are kept as commented-out exports.
 #export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
 #export MPICH_SMP_SINGLE_COPY_MODE=NONE
 export MPICH_SMP_SINGLE_COPY_MODE=CMA
 # OMP_NUM_THREADS is also reused as the srun -c value below.
 export OMP_NUM_THREADS=8
 # AT is forwarded to the benchmark as --accelerator-threads.
 AT=8
 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
 # PARAMS must stay unquoted on the srun line so it splits into separate arguments.
 PARAMS=" --accelerator-threads ${AT} --grid 32.32.32.32 --mpi 1.1.1.1 --comms-overlap"
 # The task is started through ./mpiwrapper.sh, which runs the command given after it.
 srun -n1 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS
@@ -0,0 +1,26 @@
 #!/bin/bash
 # Batch job: runs ./benchmarks/Benchmark_dwf_fp32 with 4 tasks on one node,
 # one GPU per task, on a 32.32.64.64 grid decomposed as 1.1.2.2.
 # Begin Slurm (sbatch) directives; lines starting with "##" are disabled.
 #SBATCH -A LGT104
 #SBATCH -t 01:00:00
 ##SBATCH -U openmpThu
 #SBATCH -p ecp
 #SBATCH -J DWF
 #SBATCH -o DWF.%J
 #SBATCH -e DWF.%J
 #SBATCH -N 1
 #SBATCH -n 4
 # NOTE(review): DIR is assigned but not referenced anywhere in this script.
 DIR=.
 module list
 export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
 export MPICH_GPU_SUPPORT_ENABLED=1
 # Alternative single-copy modes are kept as commented-out exports.
 #export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
 export MPICH_SMP_SINGLE_COPY_MODE=NONE
 #export MPICH_SMP_SINGLE_COPY_MODE=CMA
 # OMP_NUM_THREADS is also reused as the srun -c value below.
 export OMP_NUM_THREADS=8
 # AT is forwarded to the benchmark as --accelerator-threads.
 AT=8
 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
 # PARAMS must stay unquoted on the srun line so it splits into separate arguments.
 PARAMS=" --accelerator-threads ${AT} --grid 32.32.64.64 --mpi 1.1.2.2 --comms-overlap --shm 2048 --shm-mpi 0"
 # Each task is started through ./mpiwrapper.sh, which runs the command given after it.
 srun -n4 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS
@@ -0,0 +1,26 @@
 #!/bin/bash
 # Batch job: runs ./benchmarks/Benchmark_dwf_fp32 with 8 tasks across 2 nodes,
 # one GPU per task, on a 32.64.64.64 grid decomposed as 1.2.2.2.
 # Begin Slurm (sbatch) directives; lines starting with "##" are disabled.
 #SBATCH -A LGT104
 #SBATCH -t 01:00:00
 ##SBATCH -U openmpThu
 #SBATCH -p ecp
 #SBATCH -J DWF
 #SBATCH -o DWF.%J
 #SBATCH -e DWF.%J
 #SBATCH -N 2
 #SBATCH -n 8
 # NOTE(review): DIR is assigned but not referenced anywhere in this script.
 DIR=.
 module list
 export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
 export MPICH_GPU_SUPPORT_ENABLED=1
 # Alternative single-copy modes are kept as commented-out exports.
 #export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
 export MPICH_SMP_SINGLE_COPY_MODE=NONE
 #export MPICH_SMP_SINGLE_COPY_MODE=CMA
 # OMP_NUM_THREADS is also reused as the srun -c value below.
 export OMP_NUM_THREADS=8
 # AT is forwarded to the benchmark as --accelerator-threads.
 AT=8
 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
 # PARAMS must stay unquoted on the srun line so it splits into separate arguments.
 PARAMS=" --accelerator-threads ${AT} --grid 32.64.64.64 --mpi 1.2.2.2 --comms-overlap --shm 2048 --shm-mpi 0"
 # Each task is started through ./mpiwrapper.sh, which runs the command given after it.
 srun -n8 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS
@@ -0,0 +1,12 @@
 #!/bin/bash
 # Per-task launch wrapper: exports ROCR_VISIBLE_DEVICES set to the task's
 # SLURM_LOCALID, prints the selection, then runs the wrapped command.
 # Usage: mpiwrapper.sh <command> [args...]
 lrank=$SLURM_LOCALID
 export ROCR_VISIBLE_DEVICES=$SLURM_LOCALID
 # BINDING is not set in this script; it prints empty unless the environment provides it.
 echo "$(hostname) - $lrank device=$ROCR_VISIBLE_DEVICES binding=$BINDING"
 # "$@" hands every argument through unchanged; unquoted $* would re-split
 # arguments on whitespace and expand glob characters.
 "$@"
@@ -0,0 +1,5 @@
 # Loads the environment modules used with this build: GNU programming
 # environment, ROCm 4.3.0, GMP, Cray FFTW and the gfx908 accelerator target.
 module load PrgEnv-gnu
 module load rocm/4.3.0
 module load gmp
 module load cray-fftw
 module load craype-accel-amd-gfx908
@@ -0,0 +1,179 @@
 OPENMPI detected
 AcceleratorCudaInit[0]: ========================
 AcceleratorCudaInit[0]: Device Number    : 0
 AcceleratorCudaInit[0]: ========================
 AcceleratorCudaInit[0]: Device identifier: Tesla V100-SXM2-16GB
 AcceleratorCudaInit[0]:   totalGlobalMem: 16911433728 
 AcceleratorCudaInit[0]:   managedMemory: 1 
 AcceleratorCudaInit[0]:   isMultiGpuBoard: 0 
 AcceleratorCudaInit[0]:   warpSize: 32 
 AcceleratorCudaInit[0]:   pciBusID: 4 
 AcceleratorCudaInit[0]:   pciDeviceID: 0 
 AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
 AcceleratorCudaInit: rank 0 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 0 device 0 bus id: 0004:04:00.0
 AcceleratorCudaInit: ================================================
 SharedMemoryMpi:  World communicator of size 24
 SharedMemoryMpi:  Node  communicator of size 6
 0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 1073741824bytes at 0x200060000000 for comms buffers 
 Setting up IPC
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|_ |  |  |  |  |  |  |  |  |  |  |  | _|__
 __|_                                    _|__
 __|_   GGGG    RRRR    III    DDDD      _|__
 __|_  G        R   R    I     D   D     _|__
 __|_  G        R   R    I     D    D    _|__
 __|_  G  GG    RRRR     I     D    D    _|__
 __|_  G   G    R  R     I     D   D     _|__
 __|_   GGGG    R   R   III    DDDD      _|__
 __|_                                    _|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
  |  |  |  |  |  |  |  |  |  |  |  |  |  |  
 Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 Current Grid git commit hash=7cb1ff7395a5833ded6526c43891bd07a0436290: (HEAD -> develop, origin/develop, origin/HEAD) clean
 Grid : Message : ================================================ 
 Grid : Message : MPI is initialised and logging filters activated 
 Grid : Message : ================================================ 
 Grid : Message : Requested 1073741824 byte stencil comms buffers 
 AcceleratorCudaInit: rank 1 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 1 device 1 bus id: 0004:05:00.0
 AcceleratorCudaInit: rank 2 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 2 device 2 bus id: 0004:06:00.0
 AcceleratorCudaInit: rank 5 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 5 device 5 bus id: 0035:05:00.0
 AcceleratorCudaInit: rank 4 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 4 device 4 bus id: 0035:04:00.0
 AcceleratorCudaInit: rank 3 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 3 device 3 bus id: 0035:03:00.0
 Grid : Message : MemoryManager Cache 13529146982 bytes 
 Grid : Message : MemoryManager::Init() setting up
 Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
 Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
 Grid : Message : MemoryManager::Init() Using cudaMalloc
 Grid : Message : 2.137929 s : Grid is setup to use 6 threads
 Grid : Message : 2.137941 s : Number of iterations to average: 250
 Grid : Message : 2.137950 s : ====================================================================================================
 Grid : Message : 2.137958 s : = Benchmarking sequential halo exchange from host memory 
 Grid : Message : 2.137966 s : ====================================================================================================
 Grid : Message : 2.137974 s :  L  	 Ls  	    bytes		MB/s uni	MB/s bidi
 AcceleratorCudaInit: rank 22 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 10 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 15 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 21 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 20 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 7 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 9 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 11 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 8 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 6 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 19 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 23 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 18 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 12 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 16 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 13 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 14 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 17 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 Grid : Message : 2.604949 s :    8	8	     393216       89973.9  		179947.8
 Grid : Message : 2.668249 s :    8	8	     393216       18650.3  		37300.5
 Grid : Message : 2.732288 s :    8	8	     393216       18428.5  		36857.1
 Grid : Message : 2.753565 s :    8	8	     393216       55497.2  		110994.4
 Grid : Message : 2.808960 s :   12	8	    1327104       100181.5  		200363.0
 Grid : Message : 3.226900 s :   12	8	    1327104       20600.5  		41201.0
 Grid : Message : 3.167459 s :   12	8	    1327104       24104.6  		48209.2
 Grid : Message : 3.227660 s :   12	8	    1327104       66156.7  		132313.5
 Grid : Message : 3.413570 s :   16	8	    3145728       56174.4  		112348.8
 Grid : Message : 3.802697 s :   16	8	    3145728       24255.9  		48511.7
 Grid : Message : 4.190498 s :   16	8	    3145728       24336.7  		48673.4
 Grid : Message : 4.385171 s :   16	8	    3145728       48484.1  		96968.2
 Grid : Message : 4.805284 s :   20	8	    6144000       46380.5  		92761.1
 Grid : Message : 5.562975 s :   20	8	    6144000       24328.5  		48656.9
 Grid : Message : 6.322562 s :   20	8	    6144000       24266.7  		48533.4
 Grid : Message : 6.773598 s :   20	8	    6144000       40868.5  		81736.9
 Grid : Message : 7.600999 s :   24	8	   10616832       40198.3  		80396.6
 Grid : Message : 8.912917 s :   24	8	   10616832       24279.5  		48559.1
 Grid : Message : 10.220961 s :   24	8	   10616832       24350.2  		48700.4
 Grid : Message : 11.728250 s :   24	8	   10616832       37390.9  		74781.8
 Grid : Message : 12.497258 s :   28	8	   16859136       36792.2  		73584.5
 Grid : Message : 14.585387 s :   28	8	   16859136       24222.2  		48444.3
 Grid : Message : 16.664783 s :   28	8	   16859136       24323.4  		48646.8
 Grid : Message : 17.955238 s :   28	8	   16859136       39194.7  		78389.4
 Grid : Message : 20.136479 s :   32	8	   25165824       35718.3  		71436.5
 Grid : Message : 23.241958 s :   32	8	   25165824       24311.4  		48622.9
 Grid : Message : 26.344810 s :   32	8	   25165824       24331.9  		48663.7
 Grid : Message : 28.384420 s :   32	8	   25165824       37016.3  		74032.7
 Grid : Message : 28.388879 s : ====================================================================================================
 Grid : Message : 28.388894 s : = Benchmarking sequential halo exchange from GPU memory 
 Grid : Message : 28.388909 s : ====================================================================================================
 Grid : Message : 28.388924 s :  L  	 Ls  	    bytes		MB/s uni	MB/s bidi
 Grid : Message : 28.553993 s :    8	8	     393216       8272.4  		16544.7
 Grid : Message : 28.679592 s :    8	8	     393216       9395.4  		18790.8
 Grid : Message : 28.811112 s :    8	8	     393216       8971.0  		17942.0
 Grid : Message : 28.843770 s :    8	8	     393216       36145.6  		72291.2
 Grid : Message : 28.981754 s :   12	8	    1327104       49591.6  		99183.2
 Grid : Message : 29.299764 s :   12	8	    1327104       12520.8  		25041.7
 Grid : Message : 29.620288 s :   12	8	    1327104       12422.2  		24844.4
 Grid : Message : 29.657645 s :   12	8	    1327104       106637.5  		213275.1
 Grid : Message : 29.952933 s :   16	8	    3145728       43939.2  		87878.5
 Grid : Message : 30.585411 s :   16	8	    3145728       14922.1  		29844.2
 Grid : Message : 31.219781 s :   16	8	    3145728       14877.2  		29754.4
 Grid : Message : 31.285017 s :   16	8	    3145728       144724.3  		289448.7
 Grid : Message : 31.706443 s :   20	8	    6144000       54676.2  		109352.4
 Grid : Message : 32.739205 s :   20	8	    6144000       17848.0  		35696.1
 Grid : Message : 33.771852 s :   20	8	    6144000       17849.9  		35699.7
 Grid : Message : 33.871981 s :   20	8	    6144000       184141.4  		368282.8
 Grid : Message : 34.536808 s :   24	8	   10616832       55784.3  		111568.6
 Grid : Message : 36.275648 s :   24	8	   10616832       18317.6  		36635.3
 Grid : Message : 37.997181 s :   24	8	   10616832       18501.7  		37003.4
 Grid : Message : 38.140442 s :   24	8	   10616832       222383.9  		444767.9
 Grid : Message : 39.177222 s :   28	8	   16859136       56609.7  		113219.4
 Grid : Message : 41.874755 s :   28	8	   16859136       18749.9  		37499.8
 Grid : Message : 44.529381 s :   28	8	   16859136       19052.9  		38105.8
 Grid : Message : 44.742192 s :   28	8	   16859136       237717.1  		475434.2
 Grid : Message : 46.184000 s :   32	8	   25165824       57091.2  		114182.4
 Grid : Message : 50.734740 s :   32	8	   25165824       19411.0  		38821.9
 Grid : Message : 53.931228 s :   32	8	   25165824       19570.6  		39141.2
 Grid : Message : 54.238467 s :   32	8	   25165824       245765.6  		491531.2
 Grid : Message : 54.268664 s : ====================================================================================================
 Grid : Message : 54.268680 s : = All done; Bye Bye
 Grid : Message : 54.268691 s : ====================================================================================================
@@ -0,0 +1,14 @@
 # Configure invocation for a CUDA accelerator build with MPI comms, run from a
 # build directory two levels below the configure script (../../configure).
 # nvcc is the compiler, with mpicxx as its host compiler (-ccbin), targeting sm_70.
 # NOTE(review): --prefix, LDFLAGS and the include path point at one user's home
 # directory; adjust before reuse.
 ../../configure --enable-comms=mpi \
 	      --enable-simd=GPU \
 	      --enable-gen-simd-width=32 \
 	      --enable-unified=no \
 	       --enable-shm=nvlink \
 	       --disable-gparity \
 	       --enable-setdevice \
 	       --disable-fermion-reps \
 	       --enable-accelerator=cuda \
 	       --prefix /ccs/home/paboyle/prefix \
 	       CXX=nvcc \
 	       LDFLAGS=-L/ccs/home/paboyle/prefix/lib/ \
 	       CXXFLAGS="-ccbin mpicxx -gencode arch=compute_70,code=sm_70 -I/ccs/home/paboyle/prefix/include/ -std=c++14"
@@ -0,0 +1,206 @@
 OPENMPI detected
 AcceleratorCudaInit[0]: ========================
 AcceleratorCudaInit[0]: Device Number    : 0
 AcceleratorCudaInit[0]: ========================
 AcceleratorCudaInit[0]: Device identifier: Tesla V100-SXM2-16GB
 AcceleratorCudaInit[0]:   totalGlobalMem: 16911433728 
 AcceleratorCudaInit[0]:   managedMemory: 1 
 AcceleratorCudaInit[0]:   isMultiGpuBoard: 0 
 AcceleratorCudaInit[0]:   warpSize: 32 
 AcceleratorCudaInit[0]:   pciBusID: 4 
 AcceleratorCudaInit[0]:   pciDeviceID: 0 
 AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
 AcceleratorCudaInit: rank 0 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 0 device 0 bus id: 0004:04:00.0
 AcceleratorCudaInit: ================================================
 SharedMemoryMpi:  World communicator of size 24
 SharedMemoryMpi:  Node  communicator of size 6
 0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x200080000000 for comms buffers 
 AcceleratorCudaInit: rank 3 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 3 device 3 bus id: 0035:03:00.0
 AcceleratorCudaInit: rank 5 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 5 device 5 bus id: 0035:05:00.0
 Setting up IPC
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|_ |  |  |  |  |  |  |  |  |  |  |  | _|__
 __|_                                    _|__
 __|_   GGGG    RRRR    III    DDDD      _|__
 __|_  G        R   R    I     D   D     _|__
 __|_  G        R   R    I     D    D    _|__
 __|_  G  GG    RRRR     I     D    D    _|__
 __|_  G   G    R  R     I     D   D     _|__
 __|_   GGGG    R   R   III    DDDD      _|__
 __|_                                    _|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
  |  |  |  |  |  |  |  |  |  |  |  |  |  |  
 Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 AcceleratorCudaInit: rank 4 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 4 device 4 bus id: 0035:04:00.0
 AcceleratorCudaInit: rank 1 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 1 device 1 bus id: 0004:05:00.0
 AcceleratorCudaInit: rank 2 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 2 device 2 bus id: 0004:06:00.0
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 Current Grid git commit hash=7cb1ff7395a5833ded6526c43891bd07a0436290: (HEAD -> develop, origin/develop, origin/HEAD) clean
 Grid : Message : ================================================ 
 Grid : Message : MPI is initialised and logging filters activated 
 Grid : Message : ================================================ 
 Grid : Message : Requested 2147483648 byte stencil comms buffers 
 Grid : Message : MemoryManager Cache 8388608000 bytes 
 Grid : Message : MemoryManager::Init() setting up
 Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
 Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
 Grid : Message : MemoryManager::Init() Using cudaMalloc
 Grid : Message : 1.731905 s : Grid Layout
 Grid : Message : 1.731915 s : 	Global lattice size  : 48 48 48 72 
 Grid : Message : 1.731928 s : 	OpenMP threads       : 6
 Grid : Message : 1.731938 s : 	MPI tasks            : 2 2 2 3 
 AcceleratorCudaInit: rank 9 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 23 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 22 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 21 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 18 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 6 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 7 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 10 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 8 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 11 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 20 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 19 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 13 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 12 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 14 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 16 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 15 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 17 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 Grid : Message : 2.683494 s : Making s innermost grids
 Grid : Message : 2.780034 s : Initialising 4d RNG
 Grid : Message : 2.833099 s : Intialising parallel RNG with unique string 'The 4D RNG'
 Grid : Message : 2.833121 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
 Grid : Message : 2.916841 s : Initialising 5d RNG
 Grid : Message : 3.762880 s : Intialising parallel RNG with unique string 'The 5D RNG'
 Grid : Message : 3.762902 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
 Grid : Message : 5.264345 s : Initialised RNGs
 Grid : Message : 6.489904 s : Drawing gauge field
 Grid : Message : 6.729262 s : Random gauge initialised 
 Grid : Message : 7.781273 s : Setting up Cshift based reference 
 Grid : Message : 8.725313 s : *****************************************************************
 Grid : Message : 8.725332 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
 Grid : Message : 8.725342 s : *****************************************************************
 Grid : Message : 8.725352 s : *****************************************************************
 Grid : Message : 8.725362 s : * Benchmarking DomainWallFermionR::Dhop                  
 Grid : Message : 8.725372 s : * Vectorising space-time by 4
 Grid : Message : 8.725383 s : * VComplexF size is 32 B
 Grid : Message : 8.725395 s : * SINGLE precision 
 Grid : Message : 8.725405 s : * Using Overlapped Comms/Compute
 Grid : Message : 8.725415 s : * Using GENERIC Nc WilsonKernels
 Grid : Message : 8.725425 s : *****************************************************************
 Grid : Message : 9.465229 s : Called warmup
 Grid : Message : 58.646066 s : Called Dw 3000 times in 4.91764e+07 us
 Grid : Message : 58.646121 s : mflop/s =   1.02592e+07
 Grid : Message : 58.646134 s : mflop/s per rank =  427468
 Grid : Message : 58.646145 s : mflop/s per node =  2.56481e+06
 Grid : Message : 58.646156 s : RF  GiB/s (base 2) =   20846.5
 Grid : Message : 58.646166 s : mem GiB/s (base 2) =   13029.1
 Grid : Message : 58.648008 s : norm diff   1.04778e-13
 Grid : Message : 58.734885 s : #### Dhop calls report 
 Grid : Message : 58.734897 s : WilsonFermion5D Number of DhopEO Calls   : 6002
 Grid : Message : 58.734909 s : WilsonFermion5D TotalTime   /Calls        : 8217.71 us
 Grid : Message : 58.734922 s : WilsonFermion5D CommTime    /Calls        : 7109.5 us
 Grid : Message : 58.734933 s : WilsonFermion5D FaceTime    /Calls        : 446.623 us
 Grid : Message : 58.734943 s : WilsonFermion5D ComputeTime1/Calls        : 18.0558 us
 Grid : Message : 58.734953 s : WilsonFermion5D ComputeTime2/Calls        : 731.097 us
 Grid : Message : 58.734979 s : Average mflops/s per call                : 4.8157e+09
 Grid : Message : 58.734989 s : Average mflops/s per call per rank       : 2.00654e+08
 Grid : Message : 58.734999 s : Average mflops/s per call per node       : 1.20393e+09
 Grid : Message : 58.735008 s : Average mflops/s per call (full)         : 1.04183e+07
 Grid : Message : 58.735017 s : Average mflops/s per call per rank (full): 434094
 Grid : Message : 58.735026 s : Average mflops/s per call per node (full): 2.60456e+06
 Grid : Message : 58.735035 s : WilsonFermion5D Stencil
 Grid : Message : 58.735043 s : WilsonFermion5D StencilEven
 Grid : Message : 58.735051 s : WilsonFermion5D StencilOdd
 Grid : Message : 58.735059 s : WilsonFermion5D Stencil     Reporti()
 Grid : Message : 58.735067 s : WilsonFermion5D StencilEven Reporti()
 Grid : Message : 58.735075 s : WilsonFermion5D StencilOdd  Reporti()
 Grid : Message : 64.934380 s : Compare to naive wilson implementation Dag to verify correctness
 Grid : Message : 64.934740 s : Called DwDag
 Grid : Message : 64.934870 s : norm dag result 12.0422
 Grid : Message : 64.120756 s : norm dag ref    12.0422
 Grid : Message : 64.149389 s : norm dag diff   7.6644e-14
 Grid : Message : 64.317786 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
 Grid : Message : 64.465331 s : src_e0.499995
 Grid : Message : 64.524653 s : src_o0.500005
 Grid : Message : 64.558706 s : *********************************************************
 Grid : Message : 64.558717 s : * Benchmarking DomainWallFermionF::DhopEO                
 Grid : Message : 64.558727 s : * Vectorising space-time by 4
 Grid : Message : 64.558737 s : * SINGLE precision 
 Grid : Message : 64.558745 s : * Using Overlapped Comms/Compute
 Grid : Message : 64.558753 s : * Using GENERIC Nc WilsonKernels
 Grid : Message : 64.558761 s : *********************************************************
 Grid : Message : 92.702145 s : Deo mflop/s =   8.97692e+06
 Grid : Message : 92.702185 s : Deo mflop/s per rank   374038
 Grid : Message : 92.702198 s : Deo mflop/s per node   2.24423e+06
 Grid : Message : 92.702209 s : #### Dhop calls report 
 Grid : Message : 92.702223 s : WilsonFermion5D Number of DhopEO Calls   : 3001
 Grid : Message : 92.702240 s : WilsonFermion5D TotalTime   /Calls        : 9377.88 us
 Grid : Message : 92.702257 s : WilsonFermion5D CommTime    /Calls        : 8221.84 us
 Grid : Message : 92.702277 s : WilsonFermion5D FaceTime    /Calls        : 543.548 us
 Grid : Message : 92.702301 s : WilsonFermion5D ComputeTime1/Calls        : 20.936 us
 Grid : Message : 92.702322 s : WilsonFermion5D ComputeTime2/Calls        : 732.33 us
 Grid : Message : 92.702376 s : Average mflops/s per call                : 4.13001e+09
 Grid : Message : 92.702387 s : Average mflops/s per call per rank       : 1.72084e+08
 Grid : Message : 92.702397 s : Average mflops/s per call per node       : 1.0325e+09
 Grid : Message : 92.702407 s : Average mflops/s per call (full)         : 9.12937e+06
 Grid : Message : 92.702416 s : Average mflops/s per call per rank (full): 380391
 Grid : Message : 92.702426 s : Average mflops/s per call per node (full): 2.28234e+06
 Grid : Message : 92.702435 s : WilsonFermion5D Stencil
 Grid : Message : 92.702443 s : WilsonFermion5D StencilEven
 Grid : Message : 92.702451 s : WilsonFermion5D StencilOdd
 Grid : Message : 92.702459 s : WilsonFermion5D Stencil     Reporti()
 Grid : Message : 92.702467 s : WilsonFermion5D StencilEven Reporti()
 Grid : Message : 92.702475 s : WilsonFermion5D StencilOdd  Reporti()
 Grid : Message : 92.772983 s : r_e6.02121
 Grid : Message : 92.786384 s : r_o6.02102
 Grid : Message : 92.799622 s : res12.0422
 Grid : Message : 93.860500 s : norm diff   0
 Grid : Message : 93.162026 s : norm diff even  0
 Grid : Message : 93.197529 s : norm diff odd   0
@@ -0,0 +1,206 @@
 OPENMPI detected
 AcceleratorCudaInit[0]: ========================
 AcceleratorCudaInit[0]: Device Number    : 0
 AcceleratorCudaInit[0]: ========================
 AcceleratorCudaInit[0]: Device identifier: Tesla V100-SXM2-16GB
 AcceleratorCudaInit[0]:   totalGlobalMem: 16911433728 
 AcceleratorCudaInit[0]:   managedMemory: 1 
 AcceleratorCudaInit[0]:   isMultiGpuBoard: 0 
 AcceleratorCudaInit[0]:   warpSize: 32 
 AcceleratorCudaInit[0]:   pciBusID: 4 
 AcceleratorCudaInit[0]:   pciDeviceID: 0 
 AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
 AcceleratorCudaInit: rank 0 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 0 device 0 bus id: 0004:04:00.0
 AcceleratorCudaInit: ================================================
 SharedMemoryMpi:  World communicator of size 24
 SharedMemoryMpi:  Node  communicator of size 6
 0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x200080000000 for comms buffers 
 Setting up IPC
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|_ |  |  |  |  |  |  |  |  |  |  |  | _|__
 __|_                                    _|__
 __|_   GGGG    RRRR    III    DDDD      _|__
 __|_  G        R   R    I     D   D     _|__
 __|_  G        R   R    I     D    D    _|__
 __|_  G  GG    RRRR     I     D    D    _|__
 __|_  G   G    R  R     I     D   D     _|__
 __|_   GGGG    R   R   III    DDDD      _|__
 __|_                                    _|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
  |  |  |  |  |  |  |  |  |  |  |  |  |  |  
 Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 AcceleratorCudaInit: rank 2 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 2 device 2 bus id: 0004:06:00.0
 AcceleratorCudaInit: rank 1 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 1 device 1 bus id: 0004:05:00.0
 AcceleratorCudaInit: rank 4 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 4 device 4 bus id: 0035:04:00.0
 AcceleratorCudaInit: rank 3 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 3 device 3 bus id: 0035:03:00.0
 AcceleratorCudaInit: rank 5 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 5 device 5 bus id: 0035:05:00.0
 GNU General Public License for more details.
 Current Grid git commit hash=7cb1ff7395a5833ded6526c43891bd07a0436290: (HEAD -> develop, origin/develop, origin/HEAD) clean
 Grid : Message : ================================================ 
 Grid : Message : MPI is initialised and logging filters activated 
 Grid : Message : ================================================ 
 Grid : Message : Requested 2147483648 byte stencil comms buffers 
 Grid : Message : MemoryManager Cache 8388608000 bytes 
 Grid : Message : MemoryManager::Init() setting up
 Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
 Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
 Grid : Message : MemoryManager::Init() Using cudaMalloc
 Grid : Message : 1.544984 s : Grid Layout
 Grid : Message : 1.544992 s : 	Global lattice size  : 64 64 64 96 
 Grid : Message : 1.545003 s : 	OpenMP threads       : 6
 Grid : Message : 1.545011 s : 	MPI tasks            : 2 2 2 3 
 AcceleratorCudaInit: rank 8 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 6 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 11 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 16 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 17 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 13 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 12 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 21 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 23 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 22 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 19 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 18 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 7 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 10 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 9 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 14 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 15 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 20 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 Grid : Message : 2.994920 s : Making s innermost grids
 Grid : Message : 2.232502 s : Initialising 4d RNG
 Grid : Message : 2.397047 s : Intialising parallel RNG with unique string 'The 4D RNG'
 Grid : Message : 2.397069 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
 Grid : Message : 2.653140 s : Initialising 5d RNG
 Grid : Message : 5.285347 s : Intialising parallel RNG with unique string 'The 5D RNG'
 Grid : Message : 5.285369 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
 Grid : Message : 9.994738 s : Initialised RNGs
 Grid : Message : 13.153426 s : Drawing gauge field
 Grid : Message : 13.825697 s : Random gauge initialised 
 Grid : Message : 18.537657 s : Setting up Cshift based reference 
 Grid : Message : 22.296755 s : *****************************************************************
 Grid : Message : 22.296781 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
 Grid : Message : 22.296791 s : *****************************************************************
 Grid : Message : 22.296800 s : *****************************************************************
 Grid : Message : 22.296809 s : * Benchmarking DomainWallFermionR::Dhop                  
 Grid : Message : 22.296818 s : * Vectorising space-time by 4
 Grid : Message : 22.296828 s : * VComplexF size is 32 B
 Grid : Message : 22.296838 s : * SINGLE precision 
 Grid : Message : 22.296847 s : * Using Overlapped Comms/Compute
 Grid : Message : 22.296855 s : * Using GENERIC Nc WilsonKernels
 Grid : Message : 22.296863 s : *****************************************************************
 Grid : Message : 24.746452 s : Called warmup
 Grid : Message : 137.525756 s : Called Dw 3000 times in 1.12779e+08 us
 Grid : Message : 137.525818 s : mflop/s =   1.41383e+07
 Grid : Message : 137.525831 s : mflop/s per rank =  589097
 Grid : Message : 137.525843 s : mflop/s per node =  3.53458e+06
 Grid : Message : 137.525854 s : RF  GiB/s (base 2) =   28728.7
 Grid : Message : 137.525864 s : mem GiB/s (base 2) =   17955.5
 Grid : Message : 137.693645 s : norm diff   1.04885e-13
 Grid : Message : 137.965585 s : #### Dhop calls report 
 Grid : Message : 137.965598 s : WilsonFermion5D Number of DhopEO Calls   : 6002
 Grid : Message : 137.965612 s : WilsonFermion5D TotalTime   /Calls        : 18899.7 us
 Grid : Message : 137.965624 s : WilsonFermion5D CommTime    /Calls        : 16041.4 us
 Grid : Message : 137.965634 s : WilsonFermion5D FaceTime    /Calls        : 859.705 us
 Grid : Message : 137.965644 s : WilsonFermion5D ComputeTime1/Calls        : 70.5881 us
 Grid : Message : 137.965654 s : WilsonFermion5D ComputeTime2/Calls        : 2094.8 us
 Grid : Message : 137.965682 s : Average mflops/s per call                : 3.87638e+09
 Grid : Message : 137.965692 s : Average mflops/s per call per rank       : 1.61516e+08
 Grid : Message : 137.965702 s : Average mflops/s per call per node       : 9.69095e+08
 Grid : Message : 137.965712 s : Average mflops/s per call (full)         : 1.43168e+07
 Grid : Message : 137.965721 s : Average mflops/s per call per rank (full): 596533
 Grid : Message : 137.965730 s : Average mflops/s per call per node (full): 3.5792e+06
 Grid : Message : 137.965740 s : WilsonFermion5D Stencil
 Grid : Message : 137.965748 s : WilsonFermion5D StencilEven
 Grid : Message : 137.965756 s : WilsonFermion5D StencilOdd
 Grid : Message : 137.965764 s : WilsonFermion5D Stencil     Reporti()
 Grid : Message : 137.965772 s : WilsonFermion5D StencilEven Reporti()
 Grid : Message : 137.965780 s : WilsonFermion5D StencilOdd  Reporti()
 Grid : Message : 156.554605 s : Compare to naive wilson implementation Dag to verify correctness
 Grid : Message : 156.554632 s : Called DwDag
 Grid : Message : 156.554642 s : norm dag result 12.0421
 Grid : Message : 156.639265 s : norm dag ref    12.0421
 Grid : Message : 156.888281 s : norm dag diff   7.62057e-14
 Grid : Message : 157.609797 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
 Grid : Message : 158.208630 s : src_e0.499996
 Grid : Message : 158.162447 s : src_o0.500004
 Grid : Message : 158.267780 s : *********************************************************
 Grid : Message : 158.267791 s : * Benchmarking DomainWallFermionF::DhopEO                
 Grid : Message : 158.267801 s : * Vectorising space-time by 4
 Grid : Message : 158.267811 s : * SINGLE precision 
 Grid : Message : 158.267820 s : * Using Overlapped Comms/Compute
 Grid : Message : 158.267828 s : * Using GENERIC Nc WilsonKernels
 Grid : Message : 158.267836 s : *********************************************************
 Grid : Message : 216.487829 s : Deo mflop/s =   1.37283e+07
 Grid : Message : 216.487869 s : Deo mflop/s per rank   572011
 Grid : Message : 216.487881 s : Deo mflop/s per node   3.43206e+06
 Grid : Message : 216.487893 s : #### Dhop calls report 
 Grid : Message : 216.487903 s : WilsonFermion5D Number of DhopEO Calls   : 3001
 Grid : Message : 216.487913 s : WilsonFermion5D TotalTime   /Calls        : 19399.6 us
 Grid : Message : 216.487923 s : WilsonFermion5D CommTime    /Calls        : 16475.4 us
 Grid : Message : 216.487933 s : WilsonFermion5D FaceTime    /Calls        : 972.393 us
 Grid : Message : 216.487943 s : WilsonFermion5D ComputeTime1/Calls        : 49.8474 us
 Grid : Message : 216.487953 s : WilsonFermion5D ComputeTime2/Calls        : 2089.93 us
 Grid : Message : 216.488001 s : Average mflops/s per call                : 5.39682e+09
 Grid : Message : 216.488011 s : Average mflops/s per call per rank       : 2.24867e+08
 Grid : Message : 216.488020 s : Average mflops/s per call per node       : 1.3492e+09
 Grid : Message : 216.488030 s : Average mflops/s per call (full)         : 1.39479e+07
 Grid : Message : 216.488039 s : Average mflops/s per call per rank (full): 581162
 Grid : Message : 216.488048 s : Average mflops/s per call per node (full): 3.48697e+06
 Grid : Message : 216.488057 s : WilsonFermion5D Stencil
 Grid : Message : 216.488065 s : WilsonFermion5D StencilEven
 Grid : Message : 216.488073 s : WilsonFermion5D StencilOdd
 Grid : Message : 216.488081 s : WilsonFermion5D Stencil     Reporti()
 Grid : Message : 216.488089 s : WilsonFermion5D StencilEven Reporti()
 Grid : Message : 216.488097 s : WilsonFermion5D StencilOdd  Reporti()
 Grid : Message : 217.384495 s : r_e6.02113
 Grid : Message : 217.426121 s : r_o6.02096
 Grid : Message : 217.472636 s : res12.0421
 Grid : Message : 218.200068 s : norm diff   0
 Grid : Message : 218.645673 s : norm diff even  0
 Grid : Message : 218.816561 s : norm diff odd   0
@@ -0,0 +1,25 @@
 #!/bin/bash
 #BSUB -P LGT104
 #BSUB -W 2:00
 #BSUB -nnodes 16
 #BSUB -J DWF
 # LSF batch job: one host/device comms benchmark followed by two single precision
 # DWF benchmarks, each writing its output to its own log file.
 export OMP_NUM_THREADS=6
 export PAMI_IBV_ADAPTER_AFFINITY=1
 export PAMI_ENABLE_STRIPING=1
 # Communication options appended to every DWF benchmark invocation.
 export OPT="--comms-concurrent --comms-overlap "
 # Launch prefix shared by all runs; deliberately expanded unquoted so it word-splits.
 JSRUN="jsrun --nrs 16 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu"
 # NOTE(review): --mpi 4.4.4.3 is 192 ranks, whereas --nrs 16 with -a6 looks like 96
 # ranks -- confirm the intended node count / decomposition before submitting.
 APP="./benchmarks/Benchmark_comms_host_device  --mpi 4.4.4.3 "
 $JSRUN $APP > comms.16node.log
 # --shm-force-mpi used to be passed twice on the two command lines below; once is enough.
 APP="./benchmarks/Benchmark_dwf_fp32 --grid 96.96.96.72 --mpi 4.4.4.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 $OPT "
 $JSRUN $APP > dwf.16node.24.log
 APP="./benchmarks/Benchmark_dwf_fp32 --grid 128.128.128.96 --mpi 4.4.4.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 $OPT "
 $JSRUN $APP > dwf.16node.32.log
@@ -0,0 +1,25 @@
 #!/bin/bash
 #BSUB -P LGT104
 #BSUB -W 2:00
 #BSUB -nnodes 4
 #BSUB -J DWF
 # LSF batch job: one host/device comms benchmark followed by two single precision
 # DWF benchmarks, each writing its output to its own file.
 export OMP_NUM_THREADS=6
 export PAMI_IBV_ADAPTER_AFFINITY=1
 export PAMI_ENABLE_STRIPING=1
 # Communication options appended to every DWF benchmark invocation.
 export OPT="--comms-concurrent --comms-overlap "
 #export GRID_ALLOC_NCACHE_LARGE=1
 # Launch prefix shared by all runs; deliberately expanded unquoted so it word-splits.
 JSRUN="jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu"
 export APP="./benchmarks/Benchmark_comms_host_device  --mpi 2.2.2.3 "
 $JSRUN $APP > comms.4node
 # --shm-force-mpi used to be passed twice on the two command lines below; once is enough.
 APP="./benchmarks/Benchmark_dwf_fp32 --grid 48.48.48.72 --mpi 2.2.2.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 $OPT "
 $JSRUN $APP > dwf.24.4node
 APP="./benchmarks/Benchmark_dwf_fp32 --grid 64.64.64.96 --mpi 2.2.2.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 $OPT "
 $JSRUN $APP > dwf.32.4node
@@ -0,0 +1,8 @@
 # Environment for building/running on this machine: UCX settings, compiler and CUDA
 # modules, and the private install prefix holding the dependencies.
 export UCX_GDR_COPY_RCACHE=no
 export UCX_MEMTYPE_CACHE=n
 export UCX_RNDV_SCHEME=put_zcopy
 module load gcc/7.5.0
 module load cuda/10.2.89
 #cuda/11.4.0
 # Prepend the prefix; the ':' separator is only emitted when LD_LIBRARY_PATH is already
 # non-empty, because an empty path element makes the loader search the current directory.
 export LD_LIBRARY_PATH=/ccs/home/paboyle/prefix/lib/${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
@@ -5,7 +5,7 @@
    --enable-gen-simd-width=64 \
    --enable-accelerator=cuda \
    --with-lime=/mnt/lustre/tursafs1/home/tc002/tc002/dc-boyl1/spack/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/c-lime-2-3-9-e6wxqrid6rqmd45z7n32dxkvkykpvyez \
-    --disable-accelerator-cshift \
+    --enable-accelerator-cshift \
    --disable-unified \
    CXX=nvcc \
    LDFLAGS="-cudart shared " \
@@ -1,2 +1,6 @@
-spack load c-lime
+module load cuda/11.4.1  openmpi/4.1.1-cuda11.4.1  ucx/1.12.0-cuda11.4.1  
-module load cuda/11.4.1 openmpi/4.1.1 ucx/1.10.1
+#module load cuda/11.4.1 openmpi/4.1.1 ucx/1.10.1
 export PREFIX=/home/tc002/tc002/shared/env/prefix/
 export LD_LIBRARY_PATH=$PREFIX/lib/:$LD_LIBRARY_PATH
 unset SBATCH_EXPORT
@@ -0,0 +1,226 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./tests/core/Test_compact_wilson_clover_speedup.cc
    Copyright (C) 2020 - 2022
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    Author: Nils Meyer       <nils.meyer@ur.de>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 NAMESPACE_BEGIN(CommandlineHelpers);
 // Small adaptors that let callers hand over main()'s argc/argv by pointer and
 // query individual options through Grid's GridCmdOption* utilities.
 // Result of GridCmdOptionExists for `option` over the range [argv, argv + argc).
 static bool checkPresent(int* argc, char*** argv, const std::string& option) {
  char** first = *argv;
  char** last  = first + *argc;
  return GridCmdOptionExists(first, last, option);
 }
 // Result of GridCmdOptionPayload for `option` over the range [argv, argv + argc).
 static std::string getContent(int* argc, char*** argv, const std::string& option) {
  char** first = *argv;
  char** last  = first + *argc;
  return GridCmdOptionPayload(first, last, option);
 }
 // Integer value of `option`; `defaultValue` is returned when the option is absent.
 static int readInt(int* argc, char*** argv, std::string&& option, int defaultValue) {
  int value = defaultValue;
  if(!checkPresent(argc, argv, option)) return value;
  std::string payload = getContent(argc, argv, option);
  GridCmdOptionInt(payload, value);
  return value;
 }
 // Floating point value of `option`; `defaultValue` is returned when the option is absent.
 static float readFloat(int* argc, char*** argv, std::string&& option, float defaultValue) {
  float value = defaultValue;
  if(!checkPresent(argc, argv, option)) return value;
  std::string payload = getContent(argc, argv, option);
  GridCmdOptionFloat(payload, value);
  return value;
 }
 NAMESPACE_END(CommandlineHelpers);
 // printf-style logging through a Grid logger: the message is formatted into a fixed
 // 1024 byte buffer and streamed to std::cout behind the logger's prefix.
 // The body is wrapped in do { } while(0) so the macro acts as a single statement and
 // must be terminated with ';' by the caller.
 #define _grid_printf(LOGGER, ...) \
  do { \
    if((LOGGER).isActive()) { /* this makes it safe to put, e.g., norm2 in the calling code w.r.t. performance */ \
      char _printf_buf[1024]; \
      /* bounded write: messages longer than the buffer are truncated instead of overrunning it */ \
      std::snprintf(_printf_buf, sizeof(_printf_buf), __VA_ARGS__); \
      std::cout << (LOGGER) << _printf_buf; \
      fflush(stdout); \
    } \
  } while(0)
 // Convenience wrapper that logs through GridLogMessage.
 #define grid_printf_msg(...) _grid_printf(GridLogMessage, __VA_ARGS__)
 template<typename Field>
 bool resultsAgree(const Field& ref, const Field& res, const std::string& name) {
  RealD checkTolerance = (getPrecision<Field>::value == 2) ? 1e-15 : 1e-7;
  Field diff(ref.Grid());
  diff = ref - res;
  auto absDev = norm2(diff);
  auto relDev = absDev / norm2(ref);
  std::cout << GridLogMessage
            << "norm2(reference), norm2(" << name << "), abs. deviation, rel. deviation: " << norm2(ref) << " "
            << norm2(res) << " " << absDev << " " << relDev << " -> check "
            << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl;
  return relDev <= checkTolerance;
 }
 template<typename vCoeff_t>
 void runBenchmark(int* argc, char*** argv) {
  // read from command line
  const int   nIter        = CommandlineHelpers::readInt(     argc, argv, "--niter", 1000);
  const RealD mass         = CommandlineHelpers::readFloat(   argc, argv, "--mass",  0.5);
  const RealD csw          = CommandlineHelpers::readFloat(   argc, argv, "--csw",   1.0);
  const RealD cF           = CommandlineHelpers::readFloat(   argc, argv, "--cF",    1.0);
  const bool  antiPeriodic = CommandlineHelpers::checkPresent(argc, argv, "--antiperiodic");
  // precision
  static_assert(getPrecision<vCoeff_t>::value == 2 || getPrecision<vCoeff_t>::value == 1, "Incorrect precision"); // double or single
  std::string precision = (getPrecision<vCoeff_t>::value == 2 ? "double" : "single");
  // setup grids
  GridCartesian*         UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vCoeff_t::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian* UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  // clang-format on
  // setup rng
  std::vector<int> seeds({1, 2, 3, 4});
  GridParallelRNG  pRNG(UGrid);
  pRNG.SeedFixedIntegers(seeds);
  // type definitions
  typedef WilsonImpl<vCoeff_t, FundamentalRepresentation, CoeffReal> WImpl;
  typedef WilsonCloverFermion<WImpl> WilsonCloverOperator;
  typedef CompactWilsonCloverFermion<WImpl> CompactWilsonCloverOperator;
  typedef typename WilsonCloverOperator::FermionField Fermion;
  typedef typename WilsonCloverOperator::GaugeField Gauge;
  // setup fields
  Fermion src(UGrid); random(pRNG, src);
  Fermion ref(UGrid); ref = Zero();
  Fermion res(UGrid); res = Zero();
  Fermion hop(UGrid); hop = Zero();
  Fermion diff(UGrid); diff = Zero();
  Gauge   Umu(UGrid); SU3::HotConfiguration(pRNG, Umu);
  // setup boundary phases
  typename WilsonCloverOperator::ImplParams implParams;
  std::vector<Complex> boundary_phases(Nd, 1.);
  if(antiPeriodic) boundary_phases[Nd-1] = -1.;
  implParams.boundary_phases = boundary_phases;
  WilsonAnisotropyCoefficients anisParams;
  // misc stuff needed for benchmarks
  double volume=1.0; for(int mu=0; mu<Nd; mu++) volume*=UGrid->_fdimensions[mu];
  // setup fermion operators
  WilsonCloverOperator        Dwc(        Umu, *UGrid, *UrbGrid, mass, csw, csw,     anisParams, implParams);
  CompactWilsonCloverOperator Dwc_compact(Umu, *UGrid, *UrbGrid, mass, csw, csw, cF, anisParams, implParams);
  // now test the conversions
  typename CompactWilsonCloverOperator::CloverField         tmp_ref(UGrid);  tmp_ref  = Dwc.CloverTerm;
  typename CompactWilsonCloverOperator::CloverField         tmp_res(UGrid);  tmp_res  = Zero();
  typename CompactWilsonCloverOperator::CloverField         tmp_diff(UGrid); tmp_diff = Zero();
  typename CompactWilsonCloverOperator::CloverDiagonalField diagonal(UGrid); diagonal = Zero();
  typename CompactWilsonCloverOperator::CloverTriangleField triangle(UGrid); diagonal = Zero();
  CompactWilsonCloverOperator::CompactHelpers::ConvertLayout(tmp_ref, diagonal, triangle);
  CompactWilsonCloverOperator::CompactHelpers::ConvertLayout(diagonal, triangle, tmp_res);
  tmp_diff = tmp_ref - tmp_res;
  std::cout << GridLogMessage << "conversion: ref, res, diff, eps"
            << " " << norm2(tmp_ref)
            << " " << norm2(tmp_res)
            << " " << norm2(tmp_diff)
            << " " << norm2(tmp_diff) / norm2(tmp_ref)
            << std::endl;
  // performance per site (use minimal values necessary)
  double hop_flop_per_site            = 1320; // Rich's Talk + what Peter uses
  double hop_byte_per_site            = (8 * 9 + 9 * 12) * 2 * getPrecision<vCoeff_t>::value * 4;
  double clov_flop_per_site           = 504; // Rich's Talk and 1412.2629
  double clov_byte_per_site           = (2 * 18 + 12 + 12) * 2 * getPrecision<vCoeff_t>::value * 4;
  double clov_flop_per_site_performed = 1128;
  double clov_byte_per_site_performed = (12 * 12 + 12 + 12) * 2 * getPrecision<vCoeff_t>::value * 4;
  // total performance numbers
  double hop_gflop_total            = volume * nIter * hop_flop_per_site / 1e9;
  double hop_gbyte_total            = volume * nIter * hop_byte_per_site / 1e9;
  double clov_gflop_total           = volume * nIter * clov_flop_per_site / 1e9;
  double clov_gbyte_total           = volume * nIter * clov_byte_per_site / 1e9;
  double clov_gflop_performed_total = volume * nIter * clov_flop_per_site_performed / 1e9;
  double clov_gbyte_performed_total = volume * nIter * clov_byte_per_site_performed / 1e9;
  // warmup + measure dhop
  for(auto n : {1, 2, 3, 4, 5}) Dwc.Dhop(src, hop, 0);
  double t0 = usecond();
  for(int n = 0; n < nIter; n++) Dwc.Dhop(src, hop, 0);
  double t1 = usecond();
  double secs_hop = (t1-t0)/1e6;
  grid_printf_msg("Performance(%35s, %s): %2.4f s, %6.0f GFlop/s, %6.0f GByte/s, speedup vs ref = %.2f, fraction of hop = %.2f\n",
              "hop", precision.c_str(), secs_hop, hop_gflop_total/secs_hop, hop_gbyte_total/secs_hop, 0.0, secs_hop/secs_hop);
 #define BENCH_CLOVER_KERNEL(KERNEL) { \
  /* warmup + measure reference clover */ \
  for(auto n : {1, 2, 3, 4, 5}) Dwc.KERNEL(src, ref); \
  double t2 = usecond(); \
  for(int n = 0; n < nIter; n++) Dwc.KERNEL(src, ref); \
  double t3 = usecond(); \
  double secs_ref = (t3-t2)/1e6; \
  grid_printf_msg("Performance(%35s, %s): %2.4f s, %6.0f GFlop/s, %6.0f GByte/s, speedup vs ref = %.2f, fraction of hop = %.2f\n", \
                  "reference_"#KERNEL, precision.c_str(), secs_ref, clov_gflop_total/secs_ref, clov_gbyte_total/secs_ref, secs_ref/secs_ref, secs_ref/secs_hop); \
  grid_printf_msg("Performance(%35s, %s): %2.4f s, %6.0f GFlop/s, %6.0f GByte/s, speedup vs ref = %.2f, fraction of hop = %.2f\n", /* to see how well the ET performs */  \
                  "reference_"#KERNEL"_performed", precision.c_str(), secs_ref, clov_gflop_performed_total/secs_ref, clov_gbyte_performed_total/secs_ref, secs_ref/secs_ref, secs_ref/secs_hop); \
 \
  /* warmup + measure compact clover */ \
  for(auto n : {1, 2, 3, 4, 5}) Dwc_compact.KERNEL(src, res); \
  double t4 = usecond(); \
  for(int n = 0; n < nIter; n++) Dwc_compact.KERNEL(src, res); \
  double t5 = usecond(); \
  double secs_res = (t5-t4)/1e6; \
  grid_printf_msg("Performance(%35s, %s): %2.4f s, %6.0f GFlop/s, %6.0f GByte/s, speedup vs ref = %.2f, fraction of hop = %.2f\n", \
                  "compact_"#KERNEL, precision.c_str(), secs_res, clov_gflop_total/secs_res, clov_gbyte_total/secs_res, secs_ref/secs_res, secs_res/secs_hop); \
  assert(resultsAgree(ref, res, #KERNEL)); \
 }
  BENCH_CLOVER_KERNEL(Mooee);
  BENCH_CLOVER_KERNEL(MooeeDag);
  BENCH_CLOVER_KERNEL(MooeeInv);
  BENCH_CLOVER_KERNEL(MooeeInvDag);
  grid_printf_msg("finalize %s\n", precision.c_str());
 }
 // Entry point: brings Grid up, runs the benchmark for vComplexD and then for
 // vComplexF, and shuts Grid down again.
 int main(int argc, char** argv) {
  int*    argcPtr = &argc;
  char*** argvPtr = &argv;
  Grid_init(argcPtr, argvPtr);
  runBenchmark<vComplexD>(argcPtr, argvPtr);
  runBenchmark<vComplexF>(argcPtr, argvPtr);
  Grid_finalize();
  return 0;
 }
@@ -235,7 +235,6 @@ void  TestWhat(What & Ddwf,
  pickCheckerboard(Odd ,chi_o,chi);
  pickCheckerboard(Even,phi_e,phi);
  pickCheckerboard(Odd ,phi_o,phi);
  RealD t1,t2;
  SchurDiagMooeeOperator<What,LatticeFermion> HermOpEO(Ddwf);
  HermOpEO.MpcDagMpc(chi_e,dchi_e);
@@ -215,7 +215,6 @@ int main (int argc, char ** argv)
    pickCheckerboard(Odd , chi_o, chi);
    pickCheckerboard(Even, phi_e, phi);
    pickCheckerboard(Odd , phi_o, phi);
    RealD t1,t2;
    SchurDiagMooeeOperator<DomainWallEOFAFermionR,LatticeFermion> HermOpEO(Ddwf);
    HermOpEO.MpcDagMpc(chi_e, dchi_e);
@@ -212,8 +212,6 @@ int main (int argc, char ** argv)
  pickCheckerboard(Odd ,chi_o,chi);
  pickCheckerboard(Even,phi_e,phi);
  pickCheckerboard(Odd ,phi_o,phi);
  RealD t1,t2;
  SchurDiagMooeeOperator<DomainWallFermionR,LatticeFermion> HermOpEO(Ddwf);
  HermOpEO.MpcDagMpc(chi_e,dchi_e);
@@ -181,8 +181,8 @@ void checkAdj(const Gamma::Algebra a)
 void checkProject(GridSerialRNG &rng)
 {
-  SpinVector     rv, recon, full;
+  SpinVector     rv, recon;
-  HalfSpinVector hsp, hsm;
+  HalfSpinVector hsm;
  random(rng, rv);
@@ -198,7 +198,6 @@ int main (int argc, char ** argv)
  pickCheckerboard(Odd ,chi_o,chi);
  pickCheckerboard(Even,phi_e,phi);
  pickCheckerboard(Odd ,phi_o,phi);
  RealD t1,t2;
  SchurDiagMooeeOperator<GparityWilsonFermionR,FermionField> HermOpEO(Dw);
  HermOpEO.MpcDagMpc(chi_e,dchi_e);
@@ -364,14 +364,12 @@ int main(int argc, char **argv) {
      {  // Peek-ology and Poke-ology, with a little app-ology
        Complex c;
-        ColourMatrix c_m;
+        ColourMatrix c_m = Zero();
-        SpinMatrix s_m;
+        SpinMatrix s_m = Zero();
-        SpinColourMatrix sc_m;
+        SpinColourMatrix sc_m = Zero();
-        s_m = TensorIndexRecursion<ColourIndex>::traceIndex(
+        s_m = TensorIndexRecursion<ColourIndex>::traceIndex(sc_m);  // Map to traceColour
-            sc_m);  // Map to traceColour
+        c_m = TensorIndexRecursion<SpinIndex>::traceIndex(sc_m);  // map to traceSpin
        c_m = TensorIndexRecursion<SpinIndex>::traceIndex(
            sc_m);  // map to traceSpin
        c = TensorIndexRecursion<SpinIndex>::traceIndex(s_m);
        c = TensorIndexRecursion<ColourIndex>::traceIndex(c_m);
@@ -217,7 +217,6 @@ int main (int argc, char ** argv)
    pickCheckerboard(Odd , chi_o, chi);
    pickCheckerboard(Even, phi_e, phi);
    pickCheckerboard(Odd , phi_o, phi);
    RealD t1,t2;
    SchurDiagMooeeOperator<MobiusEOFAFermionR,LatticeFermion> HermOpEO(Ddwf);
    HermOpEO.MpcDagMpc(chi_e, dchi_e);
@@ -262,7 +262,6 @@ int main (int argc, char ** argv)
  pickCheckerboard(Odd ,chi_o,chi);
  pickCheckerboard(Even,phi_e,phi);
  pickCheckerboard(Odd ,phi_o,phi);
  RealD t1,t2;
  SchurDiagMooeeOperator<MobiusFermionR,LatticeFermion> HermOpEO(Ddwf);
@@ -144,7 +144,7 @@ int main (int argc, char ** argv)
    Ds.Dhop(src,result,0);
  }
  double t1=usecond();
-  double t2;
+
  double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 +  == 1146
  std::cout<<GridLogMessage << "Called Ds"<<std::endl;
@@ -162,7 +162,6 @@ int main (int argc, char ** argv)
  }
  double t1=usecond();
  double t2;
  double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 +  == 1146
  std::cout<<GridLogMessage << "Called Ds"<<std::endl;
@@ -30,7 +30,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 using namespace std;
 using namespace Grid;
 ;
 int main (int argc, char ** argv)
 {
@@ -135,7 +134,6 @@ int main (int argc, char ** argv)
    Ds.Dhop(src,result,0);
  }
  double t1=usecond();
  double t2;
  double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 +  == 1146
  std::cout<<GridLogMessage << "Called Ds"<<std::endl;
@@ -204,7 +204,6 @@ int main (int argc, char ** argv)
  pickCheckerboard(Odd ,chi_o,chi);
  pickCheckerboard(Even,phi_e,phi);
  pickCheckerboard(Odd ,phi_o,phi);
  RealD t1,t2;
  SchurDiagMooeeOperator<WilsonFermionR,LatticeFermion> HermOpEO(Dw);
  HermOpEO.MpcDagMpc(chi_e,dchi_e);
@@ -205,7 +205,6 @@ int main (int argc, char ** argv)
  pickCheckerboard(Odd ,chi_o,chi);
  pickCheckerboard(Even,phi_e,phi);
  pickCheckerboard(Odd ,phi_o,phi);
  RealD t1,t2;
  SchurDiagMooeeOperator<WilsonTMFermionR,LatticeFermion> HermOpEO(Dw);
  HermOpEO.MpcDagMpc(chi_e,dchi_e);
@@ -276,7 +276,6 @@ int main (int argc, char ** argv)
  pickCheckerboard(Odd ,chi_o,chi);
  pickCheckerboard(Even,phi_e,phi);
  pickCheckerboard(Odd ,phi_o,phi);
  RealD t1,t2;
  SchurDiagMooeeOperator<ZMobiusFermionR,LatticeFermion> HermOpEO(Ddwf);
@@ -57,7 +57,6 @@ int main (int argc, char ** argv)
  SU<Nc>::HotConfiguration(pRNG,U);
  double beta = 1.0;
  double c1   = -0.331;
  IwasakiGaugeActionR Action(beta);
  //  PlaqPlusRectangleActionR Action(beta,c1);
@@ -40,6 +40,7 @@ using namespace Grid;
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
  using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
@@ -67,6 +68,8 @@ public:
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
  using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator ();
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
@@ -55,6 +55,7 @@ RealD InverseApproximation(RealD x){
 template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
 {
 public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field>                            FineOperator;
  Matrix         & _SmootherMatrix;
  FineOperator   & _SmootherOperator;
@@ -78,6 +79,7 @@ public:
 template<class Field,class Matrix> class MirsSmoother : public LinearFunction<Field>
 {
 public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field>                            FineOperator;
  Matrix         & SmootherMatrix;
  FineOperator   & SmootherOperator;
@@ -108,6 +110,7 @@ public:
 template<class Fobj,class CComplex,int nbasis, class Matrix, class Guesser, class CoarseSolver>
 class MultiGridPreconditioner : public LinearFunction< Lattice<Fobj> > {
 public:
  using LinearFunction<Lattice<Fobj> >::operator();
  typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
  typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator;
@@ -56,9 +56,9 @@ template<class Field> class SolverWrapper : public LinearFunction<Field> {
 private:
  CheckerBoardedSparseMatrixBase<Field> & _Matrix;
  SchurRedBlackBase<Field> & _Solver;
-public:
+public: 
-
+  using LinearFunction<Field>::operator();
-  /////////////////////////////////////////////////////
+ /////////////////////////////////////////////////////
  // Wrap the usual normal equations trick
  /////////////////////////////////////////////////////
  SolverWrapper(CheckerBoardedSparseMatrixBase<Field> &Matrix,
@@ -75,6 +75,7 @@ public:
 template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
 {
 public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field>                            FineOperator;
  Matrix         & _SmootherMatrix;
  FineOperator   & _SmootherOperator;
@@ -98,6 +99,7 @@ public:
 template<class Field,class Matrix> class MirsSmoother : public LinearFunction<Field>
 {
 public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field>                            FineOperator;
  Matrix         & SmootherMatrix;
  FineOperator   & SmootherOperator;
@@ -128,6 +130,7 @@ public:
 template<class Fobj,class CComplex,int nbasis, class Matrix, class Guesser, class CoarseSolver>
 class MultiGridPreconditioner : public LinearFunction< Lattice<Fobj> > {
 public:
  using LinearFunction<Lattice<Fobj> >::operator();
  typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
  typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator;
@@ -55,6 +55,7 @@ RealD InverseApproximation(RealD x){
 template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
 {
 public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field>                            FineOperator;
  Matrix         & _SmootherMatrix;
  FineOperator   & _SmootherOperator;
@@ -78,6 +79,7 @@ public:
 template<class Field,class Matrix> class MirsSmoother : public LinearFunction<Field>
 {
 public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field>                            FineOperator;
  Matrix         & SmootherMatrix;
  FineOperator   & SmootherOperator;
@@ -108,6 +110,8 @@ public:
 template<class Fobj,class CComplex,int nbasis, class Matrix, class Guesser, class CoarseSolver>
 class MultiGridPreconditioner : public LinearFunction< Lattice<Fobj> > {
 public:
  using LinearFunction<Lattice<Fobj> >::operator();
  typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
  typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator;
@@ -56,6 +56,7 @@ RealD InverseApproximation(RealD x){
 template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
 {
 public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field>                            FineOperator;
  Matrix         & _SmootherMatrix;
  FineOperator   & _SmootherOperator;
@@ -79,6 +80,7 @@ public:
 template<class Field,class Matrix> class MirsSmoother : public LinearFunction<Field>
 {
 public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field>                            FineOperator;
  Matrix         & SmootherMatrix;
  FineOperator   & SmootherOperator;
@@ -108,6 +110,7 @@ public:
 template<class Field,class Matrix> class RedBlackSmoother : public LinearFunction<Field>
 {
 public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field>                            FineOperator;
  Matrix         & SmootherMatrix;
  RealD tol;
@@ -134,6 +137,7 @@ public:
 template<class Fobj,class CComplex,int nbasis, class Matrix, class Guesser, class CoarseSolver>
 class MultiGridPreconditioner : public LinearFunction< Lattice<Fobj> > {
 public:
  using LinearFunction<Lattice<Fobj> >::operator();
  typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
  typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator;
@@ -241,7 +245,7 @@ int main (int argc, char ** argv)
  Grid_init(&argc,&argv);
  const int Ls=16;
-  const int rLs=8;
+  //  const int rLs=8;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@@ -388,7 +392,7 @@ int main (int argc, char ** argv)
  //  RedBlackSmoother<LatticeFermion,DomainWallFermionR> FineRBSmoother(0.00,0.001,100,Ddwf);
  // Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space
-  ZeroGuesser<CoarseVector> CoarseZeroGuesser;
+  //  ZeroGuesser<CoarseVector> CoarseZeroGuesser;
  TwoLevelMG TwoLevelPrecon(Aggregates, LDOp,
 			    HermIndefOp,Ddwf,
 			    FineSmoother,
@@ -57,7 +57,7 @@ private:
  CheckerBoardedSparseMatrixBase<Field> & _Matrix;
  SchurRedBlackBase<Field> & _Solver;
 public:
-
+  using LinearFunction<Field>::operator();
  /////////////////////////////////////////////////////
  // Wrap the usual normal equations trick
  /////////////////////////////////////////////////////
@@ -75,6 +75,7 @@ public:
 template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
 {
 public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field>                            FineOperator;
  Matrix         & _SmootherMatrix;
  FineOperator   & _SmootherOperator;
@@ -98,6 +99,7 @@ public:
 template<class Field,class Matrix> class MirsSmoother : public LinearFunction<Field>
 {
 public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field>                            FineOperator;
  Matrix         & SmootherMatrix;
  FineOperator   & SmootherOperator;
@@ -128,6 +130,7 @@ public:
 template<class Fobj,class CComplex,int nbasis, class Matrix, class Guesser, class CoarseSolver>
 class MultiGridPreconditioner : public LinearFunction< Lattice<Fobj> > {
 public:
  using LinearFunction<Lattice<Fobj> >::operator();
  typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
  typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator;
@@ -55,6 +55,7 @@ RealD InverseApproximation(RealD x){
 template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
 {
 public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field>                            FineOperator;
  Matrix         & _SmootherMatrix;
  FineOperator   & _SmootherOperator;
@@ -78,6 +79,7 @@ public:
 template<class Field,class Matrix> class MirsSmoother : public LinearFunction<Field>
 {
 public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field>                            FineOperator;
  Matrix         & SmootherMatrix;
  FineOperator   & SmootherOperator;
@@ -108,6 +110,7 @@ public:
 template<class Fobj,class CComplex,int nbasis, class Matrix, class Guesser, class CoarseSolver>
 class MultiGridPreconditioner : public LinearFunction< Lattice<Fobj> > {
 public:
  using LinearFunction<Lattice<Fobj> >::operator();
  typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
  typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator;
@@ -57,6 +57,7 @@ private:
  OperatorFunction<Field> & _Solver;
  LinearFunction<Field>   & _Guess;
 public:
  using LinearFunction<Field>::operator();
  /////////////////////////////////////////////////////
  // Wrap the usual normal equations trick
@@ -118,6 +119,7 @@ RealD InverseApproximation(RealD x){
 template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
 {
 public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field>                            FineOperator;
  Matrix         & _SmootherMatrix;
  FineOperator   & _SmootherOperator;
@@ -174,6 +176,7 @@ public:
 template<class Fobj,class CComplex,int nbasis, class CoarseSolver>
 class HDCRPreconditioner : public LinearFunction< Lattice<Fobj> > {
 public:
  using LinearFunction<Lattice<Fobj> >::operator();
  typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
  typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator;
@@ -456,8 +456,8 @@ public:
    siteVector *CBp=Stencil.CommBuf();			
-    int ptype;
+    //    int ptype;
-    int nb2=nbasis/2;
+    //    int nb2=nbasis/2;
    autoView(in_v ,   in, AcceleratorRead);
    autoView(st, Stencil, AcceleratorRead);
@@ -471,7 +471,7 @@ public:
 	  typedef decltype(coalescedRead(in_v[0])) calcVector;
 	  typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
 	  int sU = sF/Ls;
-	  int  s = sF%Ls;
+	  //	  int  s = sF%Ls;
 	  calcComplex res = Zero();
 	  calcVector  nbr;
@@ -517,14 +517,14 @@ public:
    autoView(st, Stencil, AcceleratorRead);
    siteVector *CBp=Stencil.CommBuf();			
-    int ptype;
+    //    int ptype;
-    int nb2=nbasis/2;
+    //    int nb2=nbasis/2;
    accelerator_for2d(sF, Coarse5D->oSites(), b, nbasis, Nsimd, {
      typedef decltype(coalescedRead(in_v[0])) calcVector;
      typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
      int sU = sF/Ls;
-      int  s = sF%Ls;
+      //      int  s = sF%Ls;
      calcComplex res = Zero();
@@ -650,7 +650,7 @@ private:
  OperatorFunction<Field> & _Solver;
  LinearFunction<Field>   & _Guess;
 public:
-
+  using LinearFunction<Field>::operator();
  /////////////////////////////////////////////////////
  // Wrap the usual normal equations trick
  /////////////////////////////////////////////////////
@@ -712,6 +712,7 @@ RealD InverseApproximation(RealD x){
 template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
 {
 public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field>                            FineOperator;
  Matrix         & _SmootherMatrix;
  FineOperator   & _SmootherOperator;
@@ -735,6 +736,7 @@ public:
 template<class Fobj,class CComplex,int nbasis, class CoarseSolver>
 class MGPreconditioner : public LinearFunction< Lattice<Fobj> > {
 public:
  using LinearFunction<Lattice<Fobj> >::operator();
  typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
  typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
@@ -831,6 +833,7 @@ public:
 template<class Fobj,class CComplex,int nbasis, class CoarseSolver>
 class HDCRPreconditioner : public LinearFunction< Lattice<Fobj> > {
 public:
  using LinearFunction<Lattice<Fobj> >::operator();
  typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
  typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
@@ -1174,18 +1177,18 @@ int main (int argc, char ** argv)
  PlainHermOp<CoarseCoarseVector> IRLOpL2    (IRLHermOpL2);
  ImplicitlyRestartedLanczos<CoarseCoarseVector> IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20);
  int cNconv;
  cNm=0;
  std::vector<RealD>          eval2(cNm);
  std::vector<CoarseCoarseVector>   evec2(cNm,CoarseCoarse5d);
  cc_src=1.0;
  //  int cNconv;
  //  IRLL2.calc(eval2,evec2,cc_src,cNconv);
  ConjugateGradient<CoarseCoarseVector>  CoarseCoarseCG(0.02,10000);
  DeflatedGuesser<CoarseCoarseVector> DeflCoarseCoarseGuesser(evec2,eval2);
  NormalEquations<CoarseCoarseVector> DeflCoarseCoarseCGNE(cc_Dwf,CoarseCoarseCG,DeflCoarseCoarseGuesser);
-  ZeroGuesser<CoarseVector> CoarseZeroGuesser;
+  //  ZeroGuesser<CoarseVector> CoarseZeroGuesser;
  ZeroGuesser<CoarseCoarseVector>       CoarseCoarseZeroGuesser;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
@@ -456,8 +456,8 @@ public:
    siteVector *CBp=Stencil.CommBuf();			
-    int ptype;
+    //int ptype;
-    int nb2=nbasis/2;
+    //    int nb2=nbasis/2;
    autoView(in_v ,   in, AcceleratorRead);
    autoView(st, Stencil, AcceleratorRead);
@@ -471,7 +471,7 @@ public:
 	  typedef decltype(coalescedRead(in_v[0])) calcVector;
 	  typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
 	  int sU = sF/Ls;
-	  int  s = sF%Ls;
+	  //	  int  s = sF%Ls;
 	  calcComplex res = Zero();
 	  calcVector  nbr;
@@ -517,14 +517,14 @@ public:
    autoView(st, Stencil, AcceleratorRead);
    siteVector *CBp=Stencil.CommBuf();			
-    int ptype;
+    //    int ptype;
-    int nb2=nbasis/2;
+    //    int nb2=nbasis/2;
    accelerator_for2d(sF, Coarse5D->oSites(), b, nbasis, Nsimd, {
      typedef decltype(coalescedRead(in_v[0])) calcVector;
      typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
      int sU = sF/Ls;
-      int  s = sF%Ls;
+      //      int  s = sF%Ls;
      calcComplex res = Zero();
@@ -648,7 +648,7 @@ private:
  CheckerBoardedSparseMatrixBase<Field> & _Matrix;
  SchurRedBlackBase<Field> & _Solver;
 public:
-
+  using LinearFunction<Field>::operator();
  /////////////////////////////////////////////////////
  // Wrap the usual normal equations trick
  /////////////////////////////////////////////////////
@@ -669,6 +669,7 @@ private:
  OperatorFunction<Field> & _Solver;
  LinearFunction<Field>   & _Guess;
 public:
  using LinearFunction<Field>::operator();
  /////////////////////////////////////////////////////
  // Wrap the usual normal equations trick
@@ -731,6 +732,7 @@ RealD InverseApproximation(RealD x){
 template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
 {
 public:
  using LinearFunction<Field>::operator();
  typedef LinearOperatorBase<Field>                            FineOperator;
  Matrix         & _SmootherMatrix;
  FineOperator   & _SmootherOperator;
@@ -754,6 +756,7 @@ public:
 template<class Fobj,class CComplex,int nbasis, class CoarseSolver>
 class MGPreconditioner : public LinearFunction< Lattice<Fobj> > {
 public:
  using LinearFunction<Lattice<Fobj> >::operator();
  typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
  typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
@@ -850,7 +853,8 @@ public:
 template<class Fobj,class CComplex,int nbasis, class CoarseSolver>
 class HDCRPreconditioner : public LinearFunction< Lattice<Fobj> > {
 public:
-
+  using LinearFunction<Lattice<Fobj> >::operator();
  typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
  typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
  typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseMatrix CoarseMatrix;
@@ -1194,11 +1198,11 @@ int main (int argc, char ** argv)
  PlainHermOp<CoarseCoarseVector> IRLOpL2    (IRLHermOpL2);
  ImplicitlyRestartedLanczos<CoarseCoarseVector> IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20);
  int cNconv;
  cNm=0;
  std::vector<RealD>          eval2(cNm);
  std::vector<CoarseCoarseVector>   evec2(cNm,CoarseCoarse5d);
  cc_src=1.0;
  //  int cNconv;
  //  IRLL2.calc(eval2,evec2,cc_src,cNconv);
  std::vector<RealD> tols ({0.005,0.001});
@@ -1218,10 +1222,10 @@ int main (int argc, char ** argv)
  for(auto c_hi : c_his ) {
  for(auto f_lo : f_los ) {
  for(auto f_hi : f_his ) {
-  ZeroGuesser<CoarseVector> CoarseZeroGuesser;
+    //  ZeroGuesser<CoarseVector> CoarseZeroGuesser;
-  ZeroGuesser<CoarseCoarseVector>       CoarseCoarseZeroGuesser;
+    //  ZeroGuesser<CoarseCoarseVector>       CoarseCoarseZeroGuesser;
  ConjugateGradient<CoarseCoarseVector>  CoarseCoarseCG(tol,10000);
-  ZeroGuesser<CoarseCoarseVector> CoarseCoarseGuesser;
+  //  ZeroGuesser<CoarseCoarseVector> CoarseCoarseGuesser;
  SchurRedBlackDiagMooeeSolve<CoarseCoarseVector> CoarseCoarseRBCG(CoarseCoarseCG);
  SchurSolverWrapper<CoarseCoarseVector> CoarseCoarseSolver(cc_Dwf,CoarseCoarseRBCG);
@@ -143,6 +143,7 @@ public:
 template<class Field> class MultiGridPreconditionerBase : public LinearFunction<Field> {
 public:
  using LinearFunction<Field>::operator();
  virtual ~MultiGridPreconditionerBase()               = default;
  virtual void setup()                                 = 0;
  virtual void operator()(Field const &in, Field &out) = 0;
@@ -156,6 +157,7 @@ public:
  /////////////////////////////////////////////
  // Type Definitions
  /////////////////////////////////////////////
  using MultiGridPreconditionerBase<Lattice<Fobj>>::operator();
  // clang-format off
  typedef Aggregation<Fobj, CComplex, nBasis>                                                                         Aggregates;
@@ -568,6 +570,7 @@ public:
  /////////////////////////////////////////////
  // Type Definitions
  /////////////////////////////////////////////
  using MultiGridPreconditionerBase<Lattice<Fobj>>::operator();
  typedef Matrix        FineDiracMatrix;
  typedef Lattice<Fobj> FineVector;
@@ -56,7 +56,6 @@ int main (int argc, char ** argv)
  QuasiMinimalResidual<LatticeFermion> QMR(1.0e-8,10000);
  RealD mass=0.0;
  RealD M5=1.8;
  WilsonFermionR Dw(Umu,*Grid,*rbGrid,mass);
  NonHermitianLinearOperator<WilsonFermionR,LatticeFermion> NonHermOp(Dw);
		`@@ -0,0 +1 @@`
							`../CompactWilsonCloverFermionInstantiation.cc.master`