Merge pull request #396 from fjosw/fix/readd_config.h

fix: readded Config.h and Version.h to HFILEs in Grid/Makefile.am
2025-06-23 02:02:02 +01:00 · 2022-05-09 08:26:48 -04:00 · 2022-05-09 11:53:22 +01:00 · 2022-05-03 08:55:48 -04:00 · 2022-05-03 08:52:18 -04:00 · 2022-05-03 08:51:10 -04:00
136 changed files with 6142 additions and 1558 deletions
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@ -358,7 +358,7 @@ public:
    autoView( in_v , in, AcceleratorRead);
    autoView( out_v , out, AcceleratorWrite);
    autoView( Stencil_v  , Stencil, AcceleratorRead);
-    auto& geom_v = geom;
+    int npoint = geom.npoint;
    typedef LatticeView<Cobj> Aview;
    Vector<Aview> AcceleratorViewContainer;
@ -380,7 +380,7 @@ public:
      int ptype;
      StencilEntry *SE;
-      for(int point=0;point<geom_v.npoint;point++){
+      for(int point=0;point<npoint;point++){
 	SE=Stencil_v.GetEntry(ptype,point,ss);
@ -424,7 +424,7 @@ public:
    autoView( in_v , in, AcceleratorRead);
    autoView( out_v , out, AcceleratorWrite);
    autoView( Stencil_v  , Stencil, AcceleratorRead);
-    auto& geom_v = geom;
+    int npoint = geom.npoint;
    typedef LatticeView<Cobj> Aview;
    Vector<Aview> AcceleratorViewContainer;
@ -454,7 +454,7 @@ public:
      int ptype;
      StencilEntry *SE;
-      for(int p=0;p<geom_v.npoint;p++){
+      for(int p=0;p<npoint;p++){
        int point = points_p[p];
 	SE=Stencil_v.GetEntry(ptype,point,ss);
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@ -52,6 +52,7 @@ public:
  virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
  virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
  virtual void HermOp(const Field &in, Field &out)=0;
  virtual ~LinearOperatorBase(){};
 };
@ -507,7 +508,7 @@ class SchurStaggeredOperator :  public SchurOperatorBase<Field> {
  virtual  void MpcDag   (const Field &in, Field &out){
    Mpc(in,out);
  }
-  virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
+  virtual void MpcDagMpc(const Field &in, Field &out) {
    assert(0);// Never need with staggered
  }
 };
@ -585,6 +586,7 @@ class HermOpOperatorFunction : public OperatorFunction<Field> {
 template<typename Field>
 class PlainHermOp : public LinearFunction<Field> {
 public:
  using LinearFunction<Field>::operator();
  LinearOperatorBase<Field> &_Linop;
  PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop) 
@ -598,6 +600,7 @@ public:
 template<typename Field>
 class FunctionHermOp : public LinearFunction<Field> {
 public:
  using LinearFunction<Field>::operator(); 
  OperatorFunction<Field>   & _poly;
  LinearOperatorBase<Field> &_Linop;
--- a/Grid/algorithms/Preconditioner.h
+++ b/Grid/algorithms/Preconditioner.h
@ -30,13 +30,19 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
-template<class Field> class Preconditioner :  public LinearFunction<Field> { 
+template<class Field> using Preconditioner =  LinearFunction<Field> ;
 /*
 template<class Field> class Preconditioner :  public LinearFunction<Field> {
  using LinearFunction<Field>::operator();
  virtual void operator()(const Field &src, Field & psi)=0;
 };
 */
 template<class Field> class TrivialPrecon :  public Preconditioner<Field> { 
 public:
-  void operator()(const Field &src, Field & psi){
+  using Preconditioner<Field>::operator();
  virtual void operator()(const Field &src, Field & psi){
    psi = src;
  }
  TrivialPrecon(void){};
--- a/Grid/algorithms/SparseMatrix.h
+++ b/Grid/algorithms/SparseMatrix.h
@ -48,6 +48,7 @@ public:
  virtual  void Mdiag    (const Field &in, Field &out)=0;
  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0;
  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)=0;
  virtual ~SparseMatrixBase() {};
 };
 /////////////////////////////////////////////////////////////////////////////////////////////
@ -72,7 +73,7 @@ public:
  virtual  void MeooeDag    (const Field &in, Field &out)=0;
  virtual  void MooeeDag    (const Field &in, Field &out)=0;
  virtual  void MooeeInvDag (const Field &in, Field &out)=0;
-
+  virtual ~CheckerBoardedSparseMatrixBase() {};
 };
 NAMESPACE_END(Grid);
--- a/Grid/algorithms/iterative/BiCGSTABMixedPrec.h
+++ b/Grid/algorithms/iterative/BiCGSTABMixedPrec.h
@ -36,7 +36,8 @@ NAMESPACE_BEGIN(Grid);
 template<class FieldD, class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
 class MixedPrecisionBiCGSTAB : public LinearFunction<FieldD> 
 {
-  public:                                                
+  public:
    using LinearFunction<FieldD>::operator();
    RealD   Tolerance;
    RealD   InnerTolerance; // Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
--- a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
@ -35,7 +35,8 @@ NAMESPACE_BEGIN(Grid);
    typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
    typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
  class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
-  public:                                                
+  public:
    using LinearFunction<FieldD>::operator();
    RealD   Tolerance;
    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
--- a/Grid/algorithms/iterative/Deflation.h
+++ b/Grid/algorithms/iterative/Deflation.h
@ -33,16 +33,19 @@ namespace Grid {
 // Initial-guess functor: ignores the source and sets the guess to Zero().
 template<class Field>
 class ZeroGuesser: public LinearFunction<Field> {
 public:
  // Re-export the base-class operator() names so the override below does not hide them.
  using LinearFunction<Field>::operator();
  virtual void operator()(const Field &src, Field &guess)
  {
    guess = Zero();
  }
 };
 // Initial-guess functor: leaves whatever is already stored in guess untouched.
 template<class Field>
 class DoNothingGuesser: public LinearFunction<Field> {
 public:
  // Re-export the base-class operator() names so the override below does not hide them.
  using LinearFunction<Field>::operator();
  virtual void operator()(const Field &src, Field &guess)
  {
    // Intentionally empty.
  }
 };
 // Initial-guess functor: copies the source into the guess.
 template<class Field>
 class SourceGuesser: public LinearFunction<Field> {
 public:
  // Re-export the base-class operator() names so the override below does not hide them.
  using LinearFunction<Field>::operator();
  virtual void operator()(const Field &src, Field &guess)
  {
    guess = src;
  }
 };
@ -57,6 +60,7 @@ private:
  const unsigned int       N;
 public:
  using LinearFunction<Field>::operator();
  DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval)
  : DeflatedGuesser(_evec, _eval, _evec.size())
@ -87,6 +91,7 @@ private:
  const std::vector<RealD>       &eval_coarse;
 public:
  using LinearFunction<FineField>::operator();
  LocalCoherenceDeflatedGuesser(const std::vector<FineField>   &_subspace,
 				const std::vector<CoarseField> &_evec_coarse,
 				const std::vector<RealD>       &_eval_coarse)
--- a/Grid/algorithms/iterative/LocalCoherenceLanczos.h
+++ b/Grid/algorithms/iterative/LocalCoherenceLanczos.h
@ -67,6 +67,7 @@ public:
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
  using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
@ -97,6 +98,7 @@ public:
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
  using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
--- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
+++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
 template<class Field>
 class PrecGeneralisedConjugateResidual : public LinearFunction<Field> {
 public:                                                
-
+  using LinearFunction<Field>::operator();
  RealD   Tolerance;
  Integer MaxIterations;
  int verbose;
--- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h
+++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h
@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
 template<class Field>
 class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
 public:                                                
-
+  using LinearFunction<Field>::operator();
  RealD   Tolerance;
  Integer MaxIterations;
  int verbose;
@ -119,7 +119,8 @@ public:
  RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
    RealD cp;
-    ComplexD a, b, zAz;
+    ComplexD a, b;
    //    ComplexD zAz;
    RealD zAAz;
    ComplexD rq;
@ -146,7 +147,7 @@ public:
    //////////////////////////////////
    MatTimer.Start();
    Linop.Op(psi,Az);
-    zAz = innerProduct(Az,psi);
+    //    zAz = innerProduct(Az,psi);
    zAAz= norm2(Az);
    MatTimer.Stop();
@ -170,7 +171,7 @@ public:
    LinalgTimer.Start();
-    zAz = innerProduct(Az,psi);
+    //    zAz = innerProduct(Az,psi);
    zAAz= norm2(Az);
    //p[0],q[0],qq[0] 
@ -212,7 +213,7 @@ public:
      MatTimer.Start();
      Linop.Op(z,Az);
      MatTimer.Stop();
-      zAz = innerProduct(Az,psi);
+      //      zAz = innerProduct(Az,psi);
      zAAz= norm2(Az);
      LinalgTimer.Start();
--- a/Grid/allocator/MemoryManager.cc
+++ b/Grid/allocator/MemoryManager.cc
@ -9,14 +9,30 @@ NAMESPACE_BEGIN(Grid);
 #define AccSmall (3)
 #define Shared   (4)
 #define SharedSmall (5)
 #undef GRID_MM_VERBOSE 
 uint64_t total_shared;
 uint64_t total_device;
 uint64_t total_host;;
 void MemoryManager::PrintBytes(void)
 {
-  std::cout << " MemoryManager : "<<total_shared<<" shared      bytes "<<std::endl;
+  std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
-  std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
+  std::cout << " MemoryManager : PrintBytes "<<std::endl;
-  std::cout << " MemoryManager : "<<total_host  <<" cpu         bytes "<<std::endl;
+  std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
  std::cout << " MemoryManager : "<<(total_shared>>20)<<" shared      Mbytes "<<std::endl;
  std::cout << " MemoryManager : "<<(total_device>>20)<<" accelerator Mbytes "<<std::endl;
  std::cout << " MemoryManager : "<<(total_host>>20)  <<" cpu         Mbytes "<<std::endl;
  uint64_t cacheBytes;
  cacheBytes = CacheBytes[Cpu];
  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" cpu cache Mbytes "<<std::endl;
  cacheBytes = CacheBytes[Acc];
  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" acc cache Mbytes "<<std::endl;
  cacheBytes = CacheBytes[Shared];
  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" shared cache Mbytes "<<std::endl;
 #ifdef GRID_CUDA
  cuda_mem();
 #endif
 }
 //////////////////////////////////////////////////////////////////////
@ -24,86 +40,114 @@ void MemoryManager::PrintBytes(void)
 //////////////////////////////////////////////////////////////////////
 MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
 int MemoryManager::Victim[MemoryManager::NallocType];
-int MemoryManager::Ncache[MemoryManager::NallocType] = { 8, 32, 8, 32, 8, 32 };
+int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 };
-
+uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
 //////////////////////////////////////////////////////////////////////
 // Actual allocation and deallocation utils
 //////////////////////////////////////////////////////////////////////
 // Hand out `bytes` of accelerator memory.  A cached block of exactly that
 // size is reused when Lookup() finds one; otherwise a fresh block is obtained
 // from acceleratorAllocDevice().
 //
 // total_device is adjusted exactly once per call, so it tracks the bytes
 // currently handed out to callers.  Blocks parked in the free pool are
 // accounted for separately through CacheBytes.  Adding `bytes` a second time
 // on the cache-miss path counted every fresh allocation twice.
 void *MemoryManager::AcceleratorAllocate(size_t bytes)
 {
  total_device+=bytes;
  void *ptr = (void *) Lookup(bytes,Acc);
  if ( ptr == (void *) NULL ) {
    // Cache miss: fall back to a real device allocation.
    ptr = (void *) acceleratorAllocDevice(bytes);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"AcceleratorAllocate "<<std::endl;
  PrintBytes();
 #endif
  return ptr;
 }
 // Return a block of `bytes` accelerator memory.  The block is offered to the
 // free pool via Insert(); whatever Insert() hands back (an evicted cache
 // entry, or ptr itself when the allocation cache is compiled out) is released
 // with acceleratorFreeDevice().
 //
 // total_device is decremented exactly once, mirroring AcceleratorAllocate().
 // The block returned by Insert() may be a different, previously cached block,
 // so `bytes` is not necessarily its size and must not be subtracted again.
 void  MemoryManager::AcceleratorFree    (void *ptr,size_t bytes)
 {
  total_device-=bytes;
  void *__freeme = Insert(ptr,bytes,Acc);
  if ( __freeme ) {
    acceleratorFreeDevice(__freeme);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"AcceleratorFree "<<std::endl;
  PrintBytes();
 #endif
 }
 // Hand out `bytes` of shared memory.  A cached block of exactly that size is
 // reused when Lookup() finds one; otherwise a fresh block is obtained from
 // acceleratorAllocShared().
 //
 // total_shared is adjusted exactly once per call, so it tracks the bytes
 // currently handed out to callers.  Blocks parked in the free pool are
 // accounted for separately through CacheBytes.  Adding `bytes` a second time
 // on the cache-miss path counted every fresh allocation twice.
 void *MemoryManager::SharedAllocate(size_t bytes)
 {
  total_shared+=bytes;
  void *ptr = (void *) Lookup(bytes,Shared);
  if ( ptr == (void *) NULL ) {
    // Cache miss: fall back to a real shared allocation.
    ptr = (void *) acceleratorAllocShared(bytes);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"SharedAllocate "<<std::endl;
  PrintBytes();
 #endif
  return ptr;
 }
 // Return a block of `bytes` shared memory.  The block is offered to the free
 // pool via Insert(); whatever Insert() hands back (an evicted cache entry, or
 // ptr itself when the allocation cache is compiled out) is released with
 // acceleratorFreeShared().
 //
 // total_shared is decremented exactly once, mirroring SharedAllocate().  The
 // block returned by Insert() may be a different, previously cached block, so
 // `bytes` is not necessarily its size and must not be subtracted again.
 void  MemoryManager::SharedFree    (void *ptr,size_t bytes)
 {
  total_shared-=bytes;
  void *__freeme = Insert(ptr,bytes,Shared);
  if ( __freeme ) {
    acceleratorFreeShared(__freeme);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"SharedFree "<<std::endl;
  PrintBytes();
 #endif
 }
 #ifdef GRID_UVM
 // GRID_UVM build: hand out `bytes` of host-visible memory.  A cached block of
 // exactly that size is reused when Lookup() finds one; otherwise a fresh
 // block is obtained from acceleratorAllocShared().
 //
 // total_host is adjusted exactly once per call, so it tracks the bytes
 // currently handed out to callers.  Blocks parked in the free pool are
 // accounted for separately through CacheBytes.  Adding `bytes` a second time
 // on the cache-miss path counted every fresh allocation twice.
 void *MemoryManager::CpuAllocate(size_t bytes)
 {
  total_host+=bytes;
  void *ptr = (void *) Lookup(bytes,Cpu);
  if ( ptr == (void *) NULL ) {
    // Cache miss: fall back to a real allocation.
    ptr = (void *) acceleratorAllocShared(bytes);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"CpuAllocate "<<std::endl;
  PrintBytes();
 #endif
  return ptr;
 }
 // GRID_UVM build: return a block of `bytes` host-visible memory.
 // NotifyDeletion() is called first, then the block is offered to the free
 // pool via Insert(); whatever Insert() hands back (an evicted cache entry, or
 // _ptr itself when the allocation cache is compiled out) is released with
 // acceleratorFreeShared().
 //
 // total_host is decremented exactly once, mirroring CpuAllocate().  The block
 // returned by Insert() may be a different, previously cached block, so
 // `bytes` is not necessarily its size and must not be subtracted again.
 void  MemoryManager::CpuFree    (void *_ptr,size_t bytes)
 {
  total_host-=bytes;
  NotifyDeletion(_ptr);
  void *__freeme = Insert(_ptr,bytes,Cpu);
  if ( __freeme ) { 
    acceleratorFreeShared(__freeme);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"CpuFree "<<std::endl;
  PrintBytes();
 #endif
 }
 #else
 // Non-UVM build: hand out `bytes` of host memory.  A cached block of exactly
 // that size is reused when Lookup() finds one; otherwise a fresh block is
 // obtained from acceleratorAllocCpu().
 //
 // total_host is adjusted exactly once per call, so it tracks the bytes
 // currently handed out to callers.  Blocks parked in the free pool are
 // accounted for separately through CacheBytes.  Adding `bytes` a second time
 // on the cache-miss path counted every fresh allocation twice.
 void *MemoryManager::CpuAllocate(size_t bytes)
 {
  total_host+=bytes;
  void *ptr = (void *) Lookup(bytes,Cpu);
  if ( ptr == (void *) NULL ) {
    // Cache miss: fall back to a real host allocation.
    ptr = (void *) acceleratorAllocCpu(bytes);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"CpuAllocate "<<std::endl;
  PrintBytes();
 #endif
  return ptr;
 }
 // Non-UVM build: return a block of `bytes` host memory.  NotifyDeletion() is
 // called first, then the block is offered to the free pool via Insert();
 // whatever Insert() hands back (an evicted cache entry, or _ptr itself when
 // the allocation cache is compiled out) is released with acceleratorFreeCpu().
 //
 // total_host is decremented exactly once, mirroring CpuAllocate().  The block
 // returned by Insert() may be a different, previously cached block, so
 // `bytes` is not necessarily its size and must not be subtracted again.
 void  MemoryManager::CpuFree    (void *_ptr,size_t bytes)
 {
  total_host-=bytes;
  NotifyDeletion(_ptr);
  void *__freeme = Insert(_ptr,bytes,Cpu);
  if ( __freeme ) { 
    acceleratorFreeCpu(__freeme);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"CpuFree "<<std::endl;
  PrintBytes();
 #endif
 }
 #endif
@ -115,7 +159,6 @@ void MemoryManager::Init(void)
  char * str;
  int Nc;
  int NcS;
  str= getenv("GRID_ALLOC_NCACHE_LARGE");
  if ( str ) {
@ -181,13 +224,13 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
 #ifdef ALLOCATION_CACHE
  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
  int cache = type + small;
-  return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);  
+  return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);  
 #else
  return ptr;
 #endif
 }
-void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) 
+void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes) 
 {
  assert(ncache>0);
 #ifdef GRID_OMP
@ -211,6 +254,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
  if ( entries[v].valid ) {
    ret = entries[v].address;
    cacheBytes -= entries[v].bytes;
    entries[v].valid = 0;
    entries[v].address = NULL;
    entries[v].bytes = 0;
@ -219,6 +263,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
  entries[v].address=ptr;
  entries[v].bytes  =bytes;
  entries[v].valid  =1;
  cacheBytes += bytes;
  return ret;
 }
@ -228,13 +273,13 @@ void *MemoryManager::Lookup(size_t bytes,int type)
 #ifdef ALLOCATION_CACHE
  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
  int cache = type+small;
-  return Lookup(bytes,Entries[cache],Ncache[cache]);
+  return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
 #else
  return NULL;
 #endif
 }
-void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) 
+void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) 
 {
  assert(ncache>0);
 #ifdef GRID_OMP
@ -243,6 +288,7 @@ void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncach
  for(int e=0;e<ncache;e++){
    if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
      entries[e].valid = 0;
      cacheBytes -= entries[e].bytes;
      return entries[e].address;
    }
  }
--- a/Grid/allocator/MemoryManager.h
+++ b/Grid/allocator/MemoryManager.h
@ -82,14 +82,15 @@ private:
  static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
  static int Victim[NallocType];
  static int Ncache[NallocType];
  static uint64_t CacheBytes[NallocType];
  /////////////////////////////////////////////////
  // Free pool
  /////////////////////////////////////////////////
  static void *Insert(void *ptr,size_t bytes,int type) ;
  static void *Lookup(size_t bytes,int type) ;
-  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
+  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
-  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
+  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;
  static void PrintBytes(void);
 public:
@ -169,6 +170,7 @@ private:
 public:
  static void Print(void);
  static void PrintState( void* CpuPtr);
  static int   isOpen   (void* CpuPtr);
  static void  ViewClose(void* CpuPtr,ViewMode mode);
  static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
--- a/Grid/allocator/MemoryManagerCache.cc
+++ b/Grid/allocator/MemoryManagerCache.cc
@ -3,7 +3,7 @@
 #warning "Using explicit device memory copies"
 NAMESPACE_BEGIN(Grid);
-//define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
+//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
 #define dprintf(...)
@ -429,6 +429,7 @@ void  MemoryManager::NotifyDeletion(void *_ptr)
 }
 void  MemoryManager::Print(void)
 {
  PrintBytes();
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
  std::cout << GridLogDebug << "Memory Manager                             " << std::endl;
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
@ -473,6 +474,32 @@ int   MemoryManager::isOpen   (void* _CpuPtr)
  }
 }
 // Print the cache-table entry registered for the host pointer _CpuPtr: both
 // addresses, a textual state name, the two lock counters and the LRU_valid
 // flag.  If the pointer has no entry a single "No Entry" line is printed.
 void MemoryManager::PrintState(void* _CpuPtr)
 {
  uint64_t key = (uint64_t)_CpuPtr;

  // Unknown pointer: report and leave.
  if ( !EntryPresent(key) ) {
    std::cout << GridLogMessage << "No Entry in AccCache table." << std::endl; 
    return;
  }

  auto entryIt = EntryLookup(key);
  auto & entry = entryIt->second;

  // Independent tests; the label stays empty if the state matches none of them.
  std::string label;
  if ( entry.state==Empty      ) label = "Empty";
  if ( entry.state==CpuDirty   ) label = "CpuDirty";
  if ( entry.state==AccDirty   ) label = "AccDirty";
  if ( entry.state==Consistent ) label = "Consistent";
  if ( entry.state==EvictNext  ) label = "EvictNext";

  std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
  std::cout << GridLogMessage
            << "0x"   << std::hex << entry.CpuPtr << std::dec
            << "\t0x" << std::hex << entry.AccPtr << std::dec
            << "\t"   << label
            << "\t"   << entry.cpuLock
            << "\t"   << entry.accLock
            << "\t"   << entry.LRU_valid << std::endl;
 }
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/allocator/MemoryManagerShared.cc
+++ b/Grid/allocator/MemoryManagerShared.cc
@ -16,6 +16,10 @@ uint64_t  MemoryManager::DeviceToHostXfer;
 // Stub implementations of the view/cache interface for this translation unit:
 // none of them moves or tracks any data.
 // ViewClose: no-op.
 void  MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
 // ViewOpen: returns the host pointer unchanged; bytes, mode and hint are ignored.
 void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
 // isOpen: always reports 0.
 int   MemoryManager::isOpen   (void* CpuPtr) { return 0;}
 // PrintState: there is no per-pointer state here, so only a fixed message is logged.
 void  MemoryManager::PrintState(void* CpuPtr)
 {
 std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl;
 };
 // Print: no-op.
 void  MemoryManager::Print(void){};
 // NotifyDeletion: no-op.
 void  MemoryManager::NotifyDeletion(void *ptr){};
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@ -388,8 +388,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
    // TODO : make a OMP loop on CPU, call threaded bcopy
    void *shm = (void *) this->ShmBufferTranslate(dest,recv);
    assert(shm!=NULL);
    //    std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl;
    acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
    acceleratorCopySynchronise(); // MPI prob slower
  }
  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
@ -400,6 +400,9 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
  //   std::cout << "Copy Synchronised\n"<<std::endl;
  acceleratorCopySynchronise();
  int nreq=list.size();
  if (nreq==0) return;
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@ -88,6 +88,13 @@ public:
    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
    accessor.ViewClose();
  }
  // Helper function to print the state of this object in the AccCache.
  // Forwards this object's data pointer (_odata) to MemoryManager::PrintState,
  // which does the actual printing.
  void PrintCacheState(void)
  {
    MemoryManager::PrintState(this->_odata);
  }
  /////////////////////////////////////////////////////////////////////////////////
  // Return a view object that may be dereferenced in site loops.
  // The view is trivially copy constructible and may be copied to an accelerator device
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@ -142,6 +142,15 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
  return sumD_cpu(arg,osites);
 #endif  
 }
 // Sum osites objects starting at arg, returning a scalar_objectD.
 // On GRID_CUDA / GRID_HIP builds this always takes the sumD_gpu_large path;
 // on all other builds it falls back to sumD_cpu.
 template<class vobj>
 inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
 {
 #if defined(GRID_CUDA)||defined(GRID_HIP)
  return sumD_gpu_large(arg,osites);
 #else
  return sumD_cpu(arg,osites);
 #endif  
 }
 template<class vobj>
 inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
@ -159,6 +168,22 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
  return ssum;
 }
 // Sum all outer sites of the lattice arg and return a scalar_object.
 // On GRID_CUDA / GRID_HIP builds the local sum uses sum_gpu_large on an
 // AcceleratorRead view; otherwise it uses sum_cpu on a CpuRead view.
 // The local result is then passed through the grid's GlobalSum().
 template<class vobj>
 inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
 {
 #if defined(GRID_CUDA)||defined(GRID_HIP)
  autoView( arg_v, arg, AcceleratorRead);
  Integer osites = arg.Grid()->oSites();
  auto ssum= sum_gpu_large(&arg_v[0],osites);
 #else
  autoView(arg_v, arg, CpuRead);
  Integer osites = arg.Grid()->oSites();
  auto ssum= sum_cpu(&arg_v[0],osites);
 #endif
  // NOTE(review): presumably combines the per-rank partial sums -- confirm against GlobalSum.
  arg.Grid()->GlobalSum(ssum);
  return ssum;
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Deterministic Reduction operations
 ////////////////////////////////////////////////////////////////////////////////////////////////////
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@ -23,7 +23,7 @@ unsigned int nextPow2(Iterator x) {
 }
 template <class Iterator>
-void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
+int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
  int device;
 #ifdef GRID_CUDA
@ -37,14 +37,13 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
  Iterator sharedMemPerBlock   = gpu_props[device].sharedMemPerBlock;
  Iterator maxThreadsPerBlock  = gpu_props[device].maxThreadsPerBlock;
  Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;
-  
+  /*  
  std::cout << GridLogDebug << "GPU has:" << std::endl;
  std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl;
  std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl;
  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl;
  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << warpSize << std::endl;
  std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
-  
+  */  
  if (warpSize != WARP_SIZE) {
    std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
    exit(EXIT_FAILURE);
@ -52,10 +51,14 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
  // let the number of threads in a block be a multiple of 2, starting from warpSize
  threads = warpSize;
  if ( threads*sizeofsobj > sharedMemPerBlock ) {
    std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
    return 0;
  }
  while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
  // keep all the streaming multiprocessors busy
  blocks = nextPow2(multiProcessorCount);
-  
+  return 1;
 }
 template <class sobj, class Iterator>
@ -195,7 +198,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
 // Possibly promote to double and sum
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites) 
+inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites) 
 {
  typedef typename vobj::scalar_objectD sobj;
  typedef decltype(lat) Iterator;
@ -204,7 +207,9 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
  Integer size = osites*nsimd;
  Integer numThreads, numBlocks;
-  getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
+  int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
  assert(ok);
  Integer smemSize = numThreads * sizeof(sobj);
  Vector<sobj> buffer(numBlocks);
@ -215,6 +220,54 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
  auto result = buffer_v[0];
  return result;
 }
 // Sum osites objects one vector word at a time.
 // Each vobj is treated as `words` consecutive vector_type words.  For every
 // word index w, that word is gathered from every site into a contiguous
 // buffer, reduced with sumD_gpu_small, and stored in slot w of the result.
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
 {
  typedef typename vobj::vector_type  vector;
  typedef typename vobj::scalar_typeD scalarD;
  typedef typename vobj::scalar_objectD sobj;
  sobj ret;
  // The result is filled through a scalarD pointer, one slot per word.
  // NOTE(review): assumes sobj is laid out as `words` consecutive scalarD values -- confirm.
  scalarD *ret_p = (scalarD *)&ret;
  const int words = sizeof(vobj)/sizeof(vector);
  // Scratch space holding one vector word per site.
  Vector<vector> buffer(osites);
  vector *dat = (vector *)lat;
  vector *buf = &buffer[0];
  // Same storage viewed as iScalar<vector>, the element type handed to sumD_gpu_small.
  iScalar<vector> *tbuf =(iScalar<vector> *)  &buffer[0];
  for(int w=0;w<words;w++) {
    // Gather word w of every site into the contiguous buffer.
    accelerator_for(ss,osites,1,{
 	buf[ss] = dat[ss*words+w];
       });
    // Reduce that single-word buffer and store the result in slot w.
    ret_p[w] = sumD_gpu_small(tbuf,osites);
  }
  return ret;
 }
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
 {
  typedef typename vobj::vector_type  vector;
  typedef typename vobj::scalar_typeD scalarD;
  typedef typename vobj::scalar_objectD sobj;
  sobj ret;
  Integer nsimd= vobj::Nsimd();
  Integer size = osites*nsimd;
  Integer numThreads, numBlocks;
  int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
  if ( ok ) {
    ret = sumD_gpu_small(lat,osites);
  } else {
    ret = sumD_gpu_large(lat,osites);
  }
  return ret;
 }
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Return as same precision as input performing reduction in double precision though
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -227,6 +280,13 @@ inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
  return result;
 }
-
+template <class vobj>
 // Same reduction as sumD_gpu_large, returned as vobj::scalar_object.
 inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
 {
  typedef typename vobj::scalar_object sobj;
  sobj result;
  // sumD_gpu_large returns scalar_objectD; the assignment converts it to scalar_object.
  result = sumD_gpu_large(lat,osites);
  return result;
 }
 NAMESPACE_END(Grid);
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@ -85,6 +85,76 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
  });
 }
 // Copy the sites of parity cb from the full lattice into the half lattice,
 // using an accelerator loop.
 //   cb               parity to extract; also stored as half.Checkerboard()
 //   half             destination lattice (opened AcceleratorWrite)
 //   full             source lattice (opened AcceleratorRead)
 //   checker_dim_half dimension whose coordinate is halved when forming the
 //                    half-lattice site index
 template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0)
 {
  half.Checkerboard() = cb;
  autoView(half_v, half, AcceleratorWrite);
  autoView(full_v, full, AcceleratorRead);
  // Grid geometry is copied into locals before the loop; the loop body uses only these copies.
  Coordinate rdim_full             = full.Grid()->_rdimensions;
  Coordinate rdim_half             = half.Grid()->_rdimensions;
  unsigned long ndim_half          = half.Grid()->_ndimension;
  Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
  Coordinate ostride_half          = half.Grid()->_ostride;
  // One iteration per outer site ss of the full lattice.
  accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
    Coordinate coor;
    int cbos;
    int linear=0;
    // Recover the coordinate of ss from its lexicographic index.
    Lexicographic::CoorFromIndex(coor,ss,rdim_full);
    assert(coor.size()==ndim_half);
    // Parity of the site: low bit of the coordinate sum over the masked dimensions.
    for(int d=0;d<ndim_half;d++){ 
      if(checker_dim_mask_half[d]) linear += coor[d];
    }
    cbos = (linear&0x1);
    if (cbos==cb) {
      // Half-lattice site index; the coordinate in checker_dim_half is halved.
      int ssh=0;
      for(int d=0;d<ndim_half;d++) {
        if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
        else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
      }
      coalescedWrite(half_v[ssh],full_v(ss));
    }
  });
 }
 // Copy the half lattice into the sites of matching parity in the full lattice,
 // using an accelerator loop.  Sites of the other parity are not written.
 //   full             destination lattice (opened AcceleratorWrite)
 //   half             source lattice (opened AcceleratorRead); its
 //                    Checkerboard() value selects the parity to fill
 //   checker_dim_half dimension whose coordinate is halved when forming the
 //                    half-lattice site index
 template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0)
 {
  int cb = half.Checkerboard();
  autoView(half_v , half, AcceleratorRead);
  autoView(full_v , full, AcceleratorWrite);
  // Grid geometry is copied into locals before the loop; the loop body uses only these copies.
  Coordinate rdim_full             = full.Grid()->_rdimensions;
  Coordinate rdim_half             = half.Grid()->_rdimensions;
  unsigned long ndim_half          = half.Grid()->_ndimension;
  Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
  Coordinate ostride_half          = half.Grid()->_ostride;
  // One iteration per outer site ss of the full lattice.
  accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
    Coordinate coor;
    int cbos;
    int linear=0;
    // Recover the coordinate of ss from its lexicographic index.
    Lexicographic::CoorFromIndex(coor,ss,rdim_full);
    assert(coor.size()==ndim_half);
    // Parity of the site: low bit of the coordinate sum over the masked dimensions.
    for(int d=0;d<ndim_half;d++){ 
      if(checker_dim_mask_half[d]) linear += coor[d];
    }
    cbos = (linear&0x1);
    if (cbos==cb) {
      // Half-lattice site index; the coordinate in checker_dim_half is halved.
      int ssh=0;
      for(int d=0;d<ndim_half;d++){
        if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
        else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
      }
      coalescedWrite(full_v[ss],half_v(ssh));
    }
  });
 }
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Flexible Type Conversion for internal promotion to double as well as graceful
 // treatment of scalar-compatible types
--- a/Grid/parallelIO/IldgIO.h
+++ b/Grid/parallelIO/IldgIO.h
@ -31,6 +31,7 @@ directory
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <string>
 #include <map>
 #include <pwd.h>
@ -576,6 +577,8 @@ class ScidacReader : public GridLimeReader {
    std::string rec_name(ILDG_BINARY_DATA);
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
      if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) )  ) {
  // in principle should do the line below, but that breaks backward compatibility with old data
  // skipPastObjectRecord(std::string(GRID_FIELD_NORM));
 	skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
 	return;
      }
@ -652,7 +655,8 @@ class IldgWriter : public ScidacWriter {
    // Fill ILDG header data struct
    //////////////////////////////////////////////////////
    ildgFormat ildgfmt ;
-    ildgfmt.field     = std::string("su3gauge");
+    const std::string stNC = std::to_string( Nc ) ;
    ildgfmt.field          = std::string("su"+stNC+"gauge");
    if ( format == std::string("IEEE32BIG") ) { 
      ildgfmt.precision = 32;
@ -869,7 +873,8 @@ class IldgReader : public GridLimeReader {
    } else { 
      assert(found_ildgFormat);
-      assert ( ildgFormat_.field == std::string("su3gauge") );
+      const std::string stNC = std::to_string( Nc ) ;
      assert ( ildgFormat_.field == std::string("su"+stNC+"gauge") );
      ///////////////////////////////////////////////////////////////////////////////////////
      // Populate our Grid metadata as best we can
@ -877,7 +882,7 @@ class IldgReader : public GridLimeReader {
      std::ostringstream vers; vers << ildgFormat_.version;
      FieldMetaData_.hdr_version = vers.str();
-      FieldMetaData_.data_type = std::string("4D_SU3_GAUGE_3X3");
+      FieldMetaData_.data_type = std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC);
      FieldMetaData_.nd=4;
      FieldMetaData_.dimension.resize(4);
--- a/Grid/parallelIO/MetaData.h
+++ b/Grid/parallelIO/MetaData.h
@ -6,8 +6,8 @@
    Copyright (C) 2015
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@ -182,8 +182,8 @@ class GaugeStatistics
 public:
  void operator()(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
  {
-    header.link_trace=WilsonLoops<Impl>::linkTrace(data);
+    header.link_trace = WilsonLoops<Impl>::linkTrace(data);
-    header.plaquette =WilsonLoops<Impl>::avgPlaquette(data);
+    header.plaquette  = WilsonLoops<Impl>::avgPlaquette(data);
  }
 };
 typedef GaugeStatistics<PeriodicGimplD> PeriodicGaugeStatistics;
@ -203,20 +203,24 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
 //////////////////////////////////////////////////////////////////////
 inline void reconstruct3(LorentzColourMatrix & cm)
 {
-  const int x=0;
+  assert( Nc < 4 && Nc > 1 ) ;
  const int y=1;
  const int z=2;
  for(int mu=0;mu<Nd;mu++){
-    cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
+    #if Nc == 2
-    cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
+      cm(mu)()(1,0) = -adj(cm(mu)()(0,y)) ;
-    cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
+      cm(mu)()(1,1) =  adj(cm(mu)()(0,x)) ;
    #else
      const int x=0 , y=1 , z=2 ; // a little disinenuous labelling
      cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
      cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
      cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
    #endif
  }
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Some data types for intermediate storage
 ////////////////////////////////////////////////////////////////////////////////
-template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;
+template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, Nc-1>, Nd >;
 typedef iLorentzColour2x3<Complex>  LorentzColour2x3;
 typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
@ -278,7 +282,6 @@ struct GaugeSimpleMunger{
 template <class fobj, class sobj>
 struct GaugeSimpleUnmunger {
  void operator()(sobj &in, fobj &out) {
    for (int mu = 0; mu < Nd; mu++) {
      for (int i = 0; i < Nc; i++) {
@ -317,8 +320,8 @@ template<class fobj,class sobj>
 struct Gauge3x2munger{
  void operator() (fobj &in,sobj &out){
    for(int mu=0;mu<Nd;mu++){
-      for(int i=0;i<2;i++){
+      for(int i=0;i<Nc-1;i++){
-	for(int j=0;j<3;j++){
+	for(int j=0;j<Nc;j++){
 	  out(mu)()(i,j) = in(mu)(i)(j);
 	}}
    }
@ -330,8 +333,8 @@ template<class fobj,class sobj>
 struct Gauge3x2unmunger{
  void operator() (sobj &in,fobj &out){
    for(int mu=0;mu<Nd;mu++){
-      for(int i=0;i<2;i++){
+      for(int i=0;i<Nc-1;i++){
-	for(int j=0;j<3;j++){
+	for(int j=0;j<Nc;j++){
 	  out(mu)(i)(j) = in(mu)()(i,j);
 	}}
    }
--- a/Grid/parallelIO/NerscIO.h
+++ b/Grid/parallelIO/NerscIO.h
@ -9,6 +9,7 @@
    Author: Matt Spraggs <matthew.spraggs@gmail.com>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@ -30,6 +31,8 @@
 #ifndef GRID_NERSC_IO_H
 #define GRID_NERSC_IO_H
 #include <string>
 NAMESPACE_BEGIN(Grid);
 using namespace Grid;
@ -145,15 +148,17 @@ public:
    std::string format(header.floating_point);
-    int ieee32big = (format == std::string("IEEE32BIG"));
+    const int ieee32big = (format == std::string("IEEE32BIG"));
-    int ieee32    = (format == std::string("IEEE32"));
+    const int ieee32    = (format == std::string("IEEE32"));
-    int ieee64big = (format == std::string("IEEE64BIG"));
+    const int ieee64big = (format == std::string("IEEE64BIG"));
-    int ieee64    = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
+    const int ieee64    = (format == std::string("IEEE64") || \
 			   format == std::string("IEEE64LITTLE"));
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    // depending on datatype, set up munger;
    // munger is a function of <floating point, Real, data_type>
-    if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
+    const std::string stNC = std::to_string( Nc ) ;
    if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE") ) {
      if ( ieee32 || ieee32big ) {
 	BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F> 
 	  (Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
@ -164,7 +169,7 @@ public:
 	  (Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
 	   nersc_csum,scidac_csuma,scidac_csumb);
      }
-    } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
+    } else if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC) ) {
      if ( ieee32 || ieee32big ) {
 	BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF>
 	  (Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
@ -209,27 +214,29 @@ public:
  template<class GaugeStats=PeriodicGaugeStatistics>
  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
-					std::string ens_label = std::string("DWF"))
+					std::string ens_label = std::string("DWF"),
 					std::string ens_id = std::string("UKQCD"),
 					unsigned int sequence_number = 1)
  {
-    writeConfiguration(Umu,file,0,1,ens_label);
+    writeConfiguration(Umu,file,0,1,ens_label,ens_id,sequence_number);
  }
  template<class GaugeStats=PeriodicGaugeStatistics>
  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
 					int two_row,
 					int bits32,
-					std::string ens_label = std::string("DWF"))
+					std::string ens_label = std::string("DWF"),
 					std::string ens_id = std::string("UKQCD"),
 					unsigned int sequence_number = 1)
  {
    typedef vLorentzColourMatrixD vobj;
    typedef typename vobj::scalar_object sobj;
    FieldMetaData header;
-    ///////////////////////////////////////////
+    header.sequence_number = sequence_number;
-    // Following should become arguments
+    header.ensemble_id     = ens_id;
    ///////////////////////////////////////////
    header.sequence_number = 1;
    header.ensemble_id     = std::string("UKQCD");
    header.ensemble_label  = ens_label;
    header.hdr_version     = "1.0" ;
    typedef LorentzColourMatrixD fobj3D;
    typedef LorentzColour2x3D    fobj2D;
@ -243,10 +250,14 @@ public:
    uint64_t offset;
-    // Sod it -- always write 3x3 double
+    // Sod it -- always write NcxNc double
-    header.floating_point = std::string("IEEE64BIG");
+    header.floating_point  = std::string("IEEE64BIG");
-    header.data_type      = std::string("4D_SU3_GAUGE_3x3");
+    const std::string stNC = std::to_string( Nc ) ;
-    GaugeSimpleUnmunger<fobj3D,sobj> munge;
+    if( two_row ) {
      header.data_type = std::string("4D_SU" + stNC + "_GAUGE" );
    } else {
      header.data_type = std::string("4D_SU" + stNC + "_GAUGE_" + stNC + "x" + stNC );
    }
    if ( grid->IsBoss() ) { 
      truncate(file);
      offset = writeHeader(header,file);
@ -254,8 +265,15 @@ public:
    grid->Broadcast(0,(void *)&offset,sizeof(offset));
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
-    BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
+    if( two_row ) {
-					      nersc_csum,scidac_csuma,scidac_csumb);
+      Gauge3x2unmunger<fobj2D,sobj> munge;
      BinaryIO::writeLatticeObject<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point,
 						nersc_csum,scidac_csuma,scidac_csumb);
    } else {
      GaugeSimpleUnmunger<fobj3D,sobj> munge;
      BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
 						nersc_csum,scidac_csuma,scidac_csumb);
    }
    header.checksum = nersc_csum;
    if ( grid->IsBoss() ) { 
      writeHeader(header,file);
@ -287,8 +305,7 @@ public:
    header.plaquette=0.0;
    MachineCharacteristics(header);
-	uint64_t offset;
+    uint64_t offset;
 #ifdef RNG_RANLUX
    header.floating_point = std::string("UINT64");
    header.data_type      = std::string("RANLUX48");
@ -328,7 +345,7 @@ public:
    GridBase *grid = parallel.Grid();
-	uint64_t offset = readHeader(file,grid,header);
+    uint64_t offset = readHeader(file,grid,header);
    FieldMetaData clone(header);
--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@ -68,9 +68,16 @@ public:
  ///////////////////////////////////////////////////////////////
  // Support for MADWF tricks
  ///////////////////////////////////////////////////////////////
-  RealD Mass(void) { return mass; };
+  RealD Mass(void) { return (mass_plus + mass_minus) / 2.0; };
  RealD MassPlus(void) { return mass_plus; };
  RealD MassMinus(void) { return mass_minus; };
  void  SetMass(RealD _mass) { 
-    mass=_mass; 
+    mass_plus=mass_minus=_mass; 
    SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs
  } ;
  void  SetMass(RealD _mass_plus, RealD _mass_minus) { 
    mass_plus=_mass_plus;
    mass_minus=_mass_minus;
    SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs
  } ;
  void  P(const FermionField &psi, FermionField &chi);
@ -108,7 +115,7 @@ public:
  void   MeooeDag5D    (const FermionField &in, FermionField &out);
  //    protected:
-  RealD mass;
+  RealD mass_plus, mass_minus;
  // Save arguments to SetCoefficientsInternal
  Vector<Coeff_t> _gamma;
--- a/Grid/qcd/action/fermion/CloverHelpers.h
+++ b/Grid/qcd/action/fermion/CloverHelpers.h
@ -0,0 +1,432 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/CloverHelpers.h
    Copyright (C) 2017 - 2022
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    Author: Mattia Bruno <mattia.bruno@cern.ch>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 #include <Grid/Grid.h>
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
 ////////////////////////////////////////////
 // Standard Clover
 //   (4+m0) + csw * clover_term
 // Exp Clover
 //   (4+m0) * exp(csw/(4+m0) clover_term)
 //   = (4+m0) + csw * clover_term + ...
 ////////////////////////////////////////////
 NAMESPACE_BEGIN(Grid);
 //////////////////////////////////
 // Generic Standard Clover
 //////////////////////////////////
 template<class Impl>
 class CloverHelpers: public WilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  using Helpers = WilsonCloverHelpers<Impl>;

  // Adds diag_mass to CloverTerm and fills CloverTermInv with the site-by-site
  // inverse of the result. Each site's clover object is unpacked into a dense
  // (Ns*DimRep) x (Ns*DimRep) complex matrix, inverted with Eigen, and packed
  // back. csw_t is not used by this variant.
  static void Instantiate(CloverField& CloverTerm, CloverField& CloverTermInv, RealD csw_t, RealD diag_mass) {
    GridBase *grid = CloverTerm.Grid();
    CloverTerm += diag_mass;

    const int n_local_sites = grid->lSites();
    const int DimRep        = Impl::Dimension;
    {
      autoView(CTv,CloverTerm,CpuRead);
      autoView(CTIv,CloverTermInv,CpuWrite);
      thread_for(site, n_local_sites, {
        Coordinate lcoor;
        grid->LocalIndexToLocalCoor(site, lcoor);

        typename SiteClover::scalar_object Qx = Zero(), Qxinv = Zero();
        peekLocalSite(Qx, CTv, lcoor);

        // Unpack: (spin j, colour a) -> row, (spin k, colour b) -> column.
        Eigen::MatrixXcd dense = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
        for (int j = 0; j < Ns; j++) {
          for (int k = 0; k < Ns; k++) {
            for (int a = 0; a < DimRep; a++) {
              for (int b = 0; b < DimRep; b++) {
                const int row = a + j * DimRep;
                const int col = b + k * DimRep;
                dense(row, col) = std::complex<double>(Qx()(j, k)(a, b));
              }
            }
          }
        }

        Eigen::MatrixXcd dense_inv = dense.inverse();

        // Pack the inverse back into the site object using the same index map.
        for (int j = 0; j < Ns; j++) {
          for (int k = 0; k < Ns; k++) {
            for (int a = 0; a < DimRep; a++) {
              for (int b = 0; b < DimRep; b++) {
                const int row = a + j * DimRep;
                const int col = b + k * DimRep;
                Qxinv()(j, k)(a, b) = dense_inv(row, col);
              }
            }
          }
        }

        // One poke per site, after the whole inverse has been packed.
        pokeLocalSite(Qxinv, CTIv, lcoor);
      });
    }
  }

  // Forwards unchanged to WilsonCloverHelpers<Impl>::Cmunu.
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    return Helpers::Cmunu(U, lambda, mu, nu);
  }
 };
 //////////////////////////////////
 // Generic Exp Clover
 //////////////////////////////////
 template<class Impl>
 class ExpCloverHelpers: public WilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
  typedef WilsonCloverHelpers<Impl> Helpers;

  // Writes c onto every (spin,spin)(colour,colour) diagonal entry of each outer
  // site of `in`; off-diagonal entries are left untouched. The field is taken
  // by const reference but is written through an AcceleratorWrite view.
  // Can this be avoided?
  static void IdentityTimesC(const CloverField& in, RealD c) {
    int DimRep = Impl::Dimension;
    autoView(in_v, in, AcceleratorWrite);
    accelerator_for(ss, in.Grid()->oSites(), 1, {
      for (int sa=0; sa<Ns; sa++)
        for (int ca=0; ca<DimRep; ca++)
          in_v[ss]()(sa,sa)(ca,ca) = c;
    });
  }

  // Truncation order for the exponential series.
  // Starts from NMAX=1 with cond = R^2/2 and, while cond*exp(R) > prec,
  // increments NMAX and multiplies cond by R/(NMAX+1). Returns the final NMAX.
  static int getNMAX(RealD prec, RealD R) {
    /* compute stop condition for exponential */
    int NMAX=1;
    RealD cond=R*R/2.;
    while (cond*std::exp(R)>prec) {
      NMAX++;
      cond*=R/(double)(NMAX+1);
    }
    return NMAX;
  }

  // The field argument only selects the target precision by its type:
  // 1e-12 for vComplexD based fields, 1e-6 for vComplexF based fields.
  static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
  static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}

  // On entry Clover holds the clover term. On exit
  //   Clover    = diag_mass     * sum_{n<=NMAX} ( Clover_in/diag_mass)^n / n!
  //   CloverInv = (1/diag_mass) * sum_{n<=NMAX} (-Clover_in/diag_mass)^n / n!
  // with NMAX chosen by getNMAX from R = 3*csw_t/diag_mass.
  static void Instantiate(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
    GridBase* grid = Clover.Grid();
    CloverField ExpClover(grid);

    int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);

    Clover *= (1.0/diag_mass);

    // Taylor expansion, slow but generic
    // Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
    // qN = cN
    // qn = cn + qn+1 X
    std::vector<RealD> cn(NMAX+1);
    cn[0] = 1.0;
    for (int i=1; i<=NMAX; i++)
      cn[i] = cn[i-1] / RealD(i);

    ExpClover = Zero();
    IdentityTimesC(ExpClover, cn[NMAX]);
    for (int i=NMAX-1; i>=0; i--)
      ExpClover = ExpClover * Clover + cn[i];

    // prepare inverse: same series evaluated at the negated argument
    CloverInv = (-1.0)*Clover;

    Clover = ExpClover * diag_mass;

    ExpClover = Zero();
    IdentityTimesC(ExpClover, cn[NMAX]);
    for (int i=NMAX-1; i>=0; i--)
      ExpClover = ExpClover * CloverInv + cn[i];

    CloverInv = ExpClover * (1.0/diag_mass);
  }

  // Not implemented for the exponential clover term: always asserts.
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    assert(0);
    // Only reached when assertions are compiled out; a value must still be
    // returned because flowing off the end of a non-void function is undefined.
    return lambda;
  }
 };
 //////////////////////////////////
 // Compact Standard Clover
 //////////////////////////////////
 template<class Impl>
 class CompactCloverHelpers: public CompactWilsonCloverHelpers<Impl>,
                             public WilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);

  using Helpers        = WilsonCloverHelpers<Impl>;
  using CompactHelpers = CompactWilsonCloverHelpers<Impl>;

  // Standard clover: the mass term is added directly onto the clover field.
  static void MassTerm(CloverField& Clover, RealD diag_mass)
  {
    Clover += diag_mass;
  }

  // Standard clover: there is nothing to exponentiate, so every argument is
  // left untouched.
  static void Exponentiate_Clover(CloverDiagonalField& Diagonal,
                                  CloverTriangleField& Triangle,
                                  RealD csw_t, RealD diag_mass)
  {
  }

  // TODO: implement Cmunu for better performance with the compact layout; it
  // belongs in WilsonCloverHelpers.h -> CompactWilsonCloverHelpers, not here.
  // For now this forwards unchanged to WilsonCloverHelpers<Impl>::Cmunu.
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
  {
    return Helpers::Cmunu(U, lambda, mu, nu);
  }
 };
 //////////////////////////////////
 // Compact Exp Clover
 //////////////////////////////////
 template<class Impl>
 class CompactExpCloverHelpers: public CompactWilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);
  template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;

  static void MassTerm(CloverField& Clover, RealD diag_mass) {
    // do nothing!
    // mass term is multiplied to exp(Clover) below
  }

  // Truncation order for the exponential series.
  // Starts from NMAX=1 with cond = R^2/2 and, while cond*exp(R) > prec,
  // increments NMAX and multiplies cond by R/(NMAX+1). Returns the final NMAX.
  static int getNMAX(RealD prec, RealD R) {
    /* compute stop condition for exponential */
    int NMAX=1;
    RealD cond=R*R/2.;
    while (cond*std::exp(R)>prec) {
      NMAX++;
      cond*=R/(double)(NMAX+1);
    }
    return NMAX;
  }

  // The field argument only selects the target precision by its type:
  // 1e-12 for vComplexD based fields, 1e-6 for vComplexF based fields.
  static int getNMAX(Lattice<iImplCloverDiagonal<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
  static int getNMAX(Lattice<iImplCloverDiagonal<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}

  // Evaluates dest = sum_{n=0..Niter} cN[n] * (alpha*arg)^n for a 6x6 matrix.
  // Powers above the fifth are folded back onto 1, A, ..., A^5 (A = alpha*arg)
  // through the coefficients p[0..4] built from traces of powers of A, so only
  // scalar recursions are needed inside the loop.
  //   arg   - input 6x6 matrix
  //   alpha - scalar applied to arg before exponentiation
  //   cN    - series coefficients, indices 0..Niter are read
  //   Niter - highest series order
  //   dest  - output matrix
  static void ExponentiateHermitean6by6(const iMatrix<ComplexD,6> &arg, const RealD& alpha, const std::vector<RealD>& cN, const int Niter, iMatrix<ComplexD,6>& dest){

    typedef iMatrix<ComplexD,6> mat;

    RealD p[5];
    RealD trA2, trA3, trA4;

    mat A2, A3, A4, A5;
    A2 = alpha * alpha * arg * arg;
    A3 = alpha * arg * A2;
    A4 = A2 * A2;
    A5 = A2 * A3;

    trA2 = toReal( trace(A2) );
    trA3 = toReal( trace(A3) );
    trA4 = toReal( trace(A4) );

    p[0] = toReal( trace(A3 * A3)) / 6.0 - 0.125 * trA4 * trA2 - trA3 * trA3 / 18.0 + trA2 * trA2 * trA2/ 48.0;
    p[1] = toReal( trace(A5)) / 5.0 - trA3 * trA2 / 6.0;
    p[2] = trA4 / 4.0 - 0.125 * trA2 * trA2; // reuse trA4 rather than recomputing trace(A4)
    p[3] = trA3 / 3.0;
    p[4] = 0.5 * trA2;

    // q[k] is the running coefficient of A^k. Keeping a single array that is
    // initialised up front makes the result well defined even for Niter == 0.
    RealD q[6] = { cN[Niter], 0.0, 0.0, 0.0, 0.0, 0.0 };

    for(int i = Niter-1; i >= 0; i--)
    {
      const RealD q5 = q[5];
      RealD next[6];
      next[0] = p[0] * q5 + cN[i];
      next[1] = p[1] * q5 + q[0];
      next[2] = p[2] * q5 + q[1];
      next[3] = p[3] * q5 + q[2];
      next[4] = p[4] * q5 + q[3];
      next[5] = q[4];
      for(int k = 0; k < 6; k++) q[k] = next[k];
    }

    mat unit(1.0);

    dest = (q[0] * unit + q[1] * alpha * arg + q[2] * A2 + q[3] * A3 + q[4] * A4 + q[5] * A5);
  }

  // Replaces the compact clover term stored in (Diagonal, Triangle) by
  // diag_mass * exp(clover/diag_mass), site by site and 6x6 block by block,
  // using the truncated series of ExponentiateHermitean6by6.
  static void Exponentiate_Clover(CloverDiagonalField& Diagonal, CloverTriangleField& Triangle, RealD csw_t, RealD diag_mass) {

    GridBase* grid = Diagonal.Grid();
    int NMAX = getNMAX(Diagonal, 3.*csw_t/diag_mass);

    //
    // Implementation completely in Daniel's layout
    //

    // Taylor expansion with Cayley-Hamilton recursion
    // underlying Horner scheme as above
    std::vector<RealD> cn(NMAX+1);
    cn[0] = 1.0;
    for (int i=1; i<=NMAX; i++){
      cn[i] = cn[i-1] / RealD(i);
    }

    // Taken over from Daniel's implementation
    conformable(Diagonal, Triangle);

    long lsites = grid->lSites();
    typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
    typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
    typedef iMatrix<ComplexD,6> mat;

    // Read and write views are opened on the same two fields: every site is
    // peeked in full before anything is poked back to it.
    autoView(diagonal_v,  Diagonal,  CpuRead);
    autoView(triangle_v,  Triangle,  CpuRead);
    autoView(diagonalExp_v, Diagonal, CpuWrite);
    autoView(triangleExp_v, Triangle, CpuWrite);

    thread_for(site, lsites, { // NOTE: Not on GPU because of (peek/poke)LocalSite

      mat srcCloverOp(0.0);
      mat ExpCloverOp;

      scalar_object_diagonal diagonal_tmp     = Zero();
      scalar_object_diagonal diagonal_exp_tmp = Zero();
      scalar_object_triangle triangle_tmp     = Zero();
      scalar_object_triangle triangle_exp_tmp = Zero();

      Coordinate lcoor;
      grid->LocalIndexToLocalCoor(site, lcoor);

      peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
      peekLocalSite(triangle_tmp, triangle_v, lcoor);

      // block 0 = upper left 6x6 block, block 1 = lower right 6x6 block
      for(int block = 0; block < 2; block++){

        // Expand the compact storage of this block into a dense 6x6 matrix.
        for(int i = 0; i < 6; i++){
          for(int j = 0; j < 6; j++){
            if (i == j){
              srcCloverOp(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
            }
            else{
              srcCloverOp(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
            }
          }
        }

        // exp(Clover)
        ExponentiateHermitean6by6(srcCloverOp,1.0/diag_mass,cn,NMAX,ExpCloverOp);

        // Store back only the diagonal and the upper triangle (i < j).
        for(int i = 0; i < 6; i++){
          for(int j = 0; j < 6; j++){
            if (i == j){
              diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
            }
            else if(i < j){
              triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
            }
          }
        }
      }

      pokeLocalSite(diagonal_exp_tmp, diagonalExp_v, lcoor);
      pokeLocalSite(triangle_exp_tmp, triangleExp_v, lcoor);
    });

    Diagonal = Diagonal * diag_mass;
    Triangle = Triangle * diag_mass;
  }

  // Not implemented for the exponential clover term: always asserts.
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    assert(0);
    // Only reached when assertions are compiled out; a value must still be
    // returned because flowing off the end of a non-void function is undefined.
    return lambda;
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h
@ -0,0 +1,241 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion.h
    Copyright (C) 2020 - 2022
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    Author: Nils Meyer <nils.meyer@ur.de>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 #include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
 // see Grid/qcd/action/fermion/WilsonCloverFermion.h for description
 //
 // Modifications done here:
 //
 // Original: clover term = 12x12 matrix per site
 //
 // But: Only two diagonal 6x6 hermitian blocks are non-zero (also true for original, verified by running)
 // Sufficient to store/transfer only the real parts of the diagonal and one triangular part
 // 2 * (6 + 15 * 2) = 72 real or 36 complex words to be stored/transferred
 //
 // Here: Above but diagonal as complex numbers, i.e., need to store/transfer
 // 2 * (6 * 2 + 15 * 2) = 84 real or 42 complex words
 //
 // Words per site and improvement compared to original (combined with the input and output spinors):
 //
 // - Original: 2*12 + 12*12 = 168 words -> 1.00 x less
 // - Minimal:  2*12 + 36    =  60 words -> 2.80 x less
 // - Here:     2*12 + 42    =  66 words -> 2.55 x less
 //
 // These improvements directly translate to wall-clock time
 //
 // Data layout:
 //
 // - diagonal and triangle part as separate lattice fields,
 //   this was faster than as 1 combined field on all tested machines
 // - diagonal: as expected
 // - triangle: store upper right triangle in row major order
 // - graphical:
 //        0  1  2  3  4
 //           5  6  7  8
 //              9 10 11 = upper right triangle indices
 //                12 13
 //                   14
 //     0
 //        1
 //           2
 //              3       = diagonal indices
 //                 4
 //                    5
 //     0
 //     1  5
 //     2  6  9          = lower left triangle indices
 //     3  7 10 12
 //     4  8 11 13 14
 //
 // Impact on total memory consumption:
 // - Original: (2 * 1 + 8 * 1/2) 12x12 matrices = 6 12x12 matrices = 864 complex words per site
 // - Here:     (2 * 1 + 4 * 1/2) diagonal parts = 4 diagonal parts =  24 complex words per site
 //           + (2 * 1 + 4 * 1/2) triangle parts = 4 triangle parts =  60 complex words per site
 //                                                                 =  84 complex words per site
 // Wilson clover fermion operator that stores the clover term in the compact
 // layout described above: separate lattice fields for the diagonal and for the
 // upper triangle, instead of a full matrix per site.
 // The CloverHelpers template parameter is a policy class; how it is used is
 // defined in the implementation file, not in this declaration.
 template<class Impl, class CloverHelpers>
 class CompactWilsonCloverFermion : public WilsonFermion<Impl>,
                                    public WilsonCloverHelpers<Impl>,
                                    public CompactWilsonCloverHelpers<Impl> {
  /////////////////////////////////////////////
  // Sizes
  /////////////////////////////////////////////
 public:
  INHERIT_COMPACT_CLOVER_SIZES(Impl);

  /////////////////////////////////////////////
  // Type definitions
  /////////////////////////////////////////////
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);
  typedef WilsonFermion<Impl>              WilsonBase;
  typedef WilsonCloverHelpers<Impl>        Helpers;
  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;

  /////////////////////////////////////////////
  // Constructors
  /////////////////////////////////////////////
 public:
  CompactWilsonCloverFermion(GaugeField& _Umu,
                             GridCartesian& Fgrid,
                             GridRedBlackCartesian& Hgrid,
                             const RealD _mass,
                             const RealD _csw_r = 0.0,
                             const RealD _csw_t = 0.0,
                             const RealD _cF = 1.0,
                             const WilsonAnisotropyCoefficients& clover_anisotropy = WilsonAnisotropyCoefficients(),
                             const ImplParams& impl_p = ImplParams());

  /////////////////////////////////////////////
  // Member functions (implementing interface)
  /////////////////////////////////////////////
 public:
  virtual void Instantiatable() {}
  // Both report 0 ("false").
  int          ConstEE()     override { return 0; }
  int          isTrivialEE() override { return 0; }

  void Dhop(const FermionField& in, FermionField& out, int dag) override;
  void DhopOE(const FermionField& in, FermionField& out, int dag) override;
  void DhopEO(const FermionField& in, FermionField& out, int dag) override;
  void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
  void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;

  void M(const FermionField& in, FermionField& out) override;
  void Mdag(const FermionField& in, FermionField& out) override;

  void Meooe(const FermionField& in, FermionField& out) override;
  void MeooeDag(const FermionField& in, FermionField& out) override;

  void Mooee(const FermionField& in, FermionField& out) override;
  void MooeeDag(const FermionField& in, FermionField& out) override;
  void MooeeInv(const FermionField& in, FermionField& out) override;
  void MooeeInvDag(const FermionField& in, FermionField& out) override;

  void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
  void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;

  void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
  void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
  void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;

  /////////////////////////////////////////////
  // Member functions (internals)
  /////////////////////////////////////////////
  void MooeeInternal(const FermionField&        in,
                     FermionField&              out,
                     const CloverDiagonalField& diagonal,
                     const CloverTriangleField& triangle);

  /////////////////////////////////////////////
  // Helpers
  /////////////////////////////////////////////
  void ImportGauge(const GaugeField& _Umu) override;

  /////////////////////////////////////////////
  // Helpers (private)
  /////////////////////////////////////////////
 private:
  // Selects the boundary mask that matches the field's grid: the odd or even
  // mask when the grid is checkerboarded (chosen by in.Checkerboard()),
  // otherwise the full-lattice mask. Never returns a null pointer.
  template<class Field>
  const MaskField* getCorrectMaskField(const Field &in) const {
    if(in.Grid()->_isCheckerBoarded) {
      if(in.Checkerboard() == Odd) {
        return &this->BoundaryMaskOdd;
      } else {
        return &this->BoundaryMaskEven;
      }
    } else {
      return &this->BoundaryMask;
    }
  }

  // Applies the mask chosen by getCorrectMaskField to f through
  // CompactHelpers::ApplyBoundaryMask.
  template<class Field>
  void ApplyBoundaryMask(Field& f) {
    const MaskField* m = getCorrectMaskField(f);
    assert(m != nullptr);
    CompactHelpers::ApplyBoundaryMask(f, *m);
  }

  /////////////////////////////////////////////
  // Member Data
  /////////////////////////////////////////////
 public:
  // NOTE(review): presumably copies of the _csw_r / _csw_t / _cF constructor
  // arguments -- confirm against the constructor implementation.
  RealD csw_r;
  RealD csw_t;
  RealD cF;

  bool open_boundaries;

  // Compact clover term and its inverse: full-lattice, even and odd copies.
  CloverDiagonalField Diagonal,    DiagonalEven,    DiagonalOdd;
  CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;

  CloverTriangleField Triangle,    TriangleEven,    TriangleOdd;
  CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;

  FermionField Tmp;

  // Masks returned by getCorrectMaskField.
  MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/Fermion.h
+++ b/Grid/qcd/action/fermion/Fermion.h
@ -53,6 +53,7 @@ NAMESPACE_CHECK(Wilson);
 #include <Grid/qcd/action/fermion/WilsonTMFermion.h>       // 4d wilson like
 NAMESPACE_CHECK(WilsonTM);
 #include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
 NAMESPACE_CHECK(WilsonClover);
 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
 NAMESPACE_CHECK(Wilson5D);
@ -137,21 +138,52 @@ typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
 typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
 // Clover fermions
-typedef WilsonCloverFermion<WilsonImplR> WilsonCloverFermionR;
+template <typename WImpl> using WilsonClover = WilsonCloverFermion<WImpl, CloverHelpers<WImpl>>;
-typedef WilsonCloverFermion<WilsonImplF> WilsonCloverFermionF;
+template <typename WImpl> using WilsonExpClover = WilsonCloverFermion<WImpl, ExpCloverHelpers<WImpl>>;
 typedef WilsonCloverFermion<WilsonImplD> WilsonCloverFermionD;
-typedef WilsonCloverFermion<WilsonAdjImplR> WilsonCloverAdjFermionR;
+typedef WilsonClover<WilsonImplR> WilsonCloverFermionR;
-typedef WilsonCloverFermion<WilsonAdjImplF> WilsonCloverAdjFermionF;
+typedef WilsonClover<WilsonImplF> WilsonCloverFermionF;
-typedef WilsonCloverFermion<WilsonAdjImplD> WilsonCloverAdjFermionD;
+typedef WilsonClover<WilsonImplD> WilsonCloverFermionD;
-typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
+typedef WilsonExpClover<WilsonImplR> WilsonExpCloverFermionR;
-typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
+typedef WilsonExpClover<WilsonImplF> WilsonExpCloverFermionF;
-typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
+typedef WilsonExpClover<WilsonImplD> WilsonExpCloverFermionD;
-typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
+typedef WilsonClover<WilsonAdjImplR> WilsonCloverAdjFermionR;
-typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
+typedef WilsonClover<WilsonAdjImplF> WilsonCloverAdjFermionF;
-typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
+typedef WilsonClover<WilsonAdjImplD> WilsonCloverAdjFermionD;
 typedef WilsonClover<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
 typedef WilsonClover<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
 typedef WilsonClover<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
 typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
 typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
 typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
 // Compact Clover fermions
 // Alias templates that fix the second template argument of
 // CompactWilsonCloverFermion to a helper policy:
 //   CompactWilsonClover    -> CompactCloverHelpers<WImpl>
 //   CompactWilsonExpClover -> CompactExpCloverHelpers<WImpl>
 template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
 template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
 // Concrete operators; the trailing R/F/D mirrors the suffix of the Impl type they wrap.
 // Fundamental representation (WilsonImpl*), standard helpers
 typedef CompactWilsonClover<WilsonImplR> CompactWilsonCloverFermionR;
 typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
 typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
 // Fundamental representation (WilsonImpl*), Exp helpers
 typedef CompactWilsonExpClover<WilsonImplR> CompactWilsonExpCloverFermionR;
 typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
 typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
 // Adjoint implementation (WilsonAdjImpl*)
 typedef CompactWilsonClover<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
 typedef CompactWilsonClover<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
 typedef CompactWilsonClover<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
 // Two-index symmetric implementation (WilsonTwoIndexSymmetricImpl*)
 typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
 typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
 typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
 // Two-index antisymmetric implementation (WilsonTwoIndexAntiSymmetricImpl*)
 typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
 typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
 typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
 // Domain Wall fermions
 typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
--- a/Grid/qcd/action/fermion/WilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h
@ -4,10 +4,11 @@
    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.h
-    Copyright (C) 2017
+    Copyright (C) 2017 - 2022
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: David Preti <>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@ -29,7 +30,9 @@
 #pragma once
-#include <Grid/Grid.h>
+#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
@ -49,19 +52,16 @@ NAMESPACE_BEGIN(Grid);
 // csw_r = csw_t to recover the isotropic version
 //////////////////////////////////////////////////////////////////
-template <class Impl>
+template<class Impl, class CloverHelpers>
-class WilsonCloverFermion : public WilsonFermion<Impl>
+class WilsonCloverFermion : public WilsonFermion<Impl>,
                            public WilsonCloverHelpers<Impl>
 {
 public:
  // Types definitions
  INHERIT_IMPL_TYPES(Impl);
-  template <typename vtype>
+  INHERIT_CLOVER_TYPES(Impl);
  using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
  typedef iImplClover<Simd> SiteCloverType;
  typedef Lattice<SiteCloverType> CloverFieldType;
-public:
+  typedef WilsonFermion<Impl>       WilsonBase;
-  typedef WilsonFermion<Impl> WilsonBase;
+  typedef WilsonCloverHelpers<Impl> Helpers;
  virtual int    ConstEE(void)     { return 0; };
  virtual void Instantiatable(void){};
@ -72,42 +72,7 @@ public:
                      const RealD _csw_r = 0.0,
                      const RealD _csw_t = 0.0,
                      const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(),
-                      const ImplParams &impl_p = ImplParams()) : WilsonFermion<Impl>(_Umu,
+                      const ImplParams &impl_p = ImplParams());
                                                                                     Fgrid,
                                                                                     Hgrid,
                                                                                     _mass, impl_p, clover_anisotropy),
                                                                 CloverTerm(&Fgrid),
                                                                 CloverTermInv(&Fgrid),
                                                                 CloverTermEven(&Hgrid),
                                                                 CloverTermOdd(&Hgrid),
                                                                 CloverTermInvEven(&Hgrid),
                                                                 CloverTermInvOdd(&Hgrid),
                                                                 CloverTermDagEven(&Hgrid),
                                                                 CloverTermDagOdd(&Hgrid),
                                                                 CloverTermInvDagEven(&Hgrid),
                                                                 CloverTermInvDagOdd(&Hgrid)
  {
    assert(Nd == 4); // require 4 dimensions
    if (clover_anisotropy.isAnisotropic)
    {
      csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0;
      diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
    }
    else
    {
      csw_r = _csw_r * 0.5;
      diag_mass = 4.0 + _mass;
    }
    csw_t = _csw_t * 0.5;
    if (csw_r == 0)
      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
    if (csw_t == 0)
      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
    ImportGauge(_Umu);
  }
  virtual void M(const FermionField &in, FermionField &out);
  virtual void Mdag(const FermionField &in, FermionField &out);
@ -124,250 +89,21 @@ public:
  void ImportGauge(const GaugeField &_Umu);
  // Derivative parts unpreconditioned pseudofermions
-  void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
+  void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag);
  {
    // Force of the unpreconditioned operator: the Wilson hopping-term
    // derivative (DhopDeriv) plus the clover-term derivative accumulated below.
    // X, Y and force must live on the same grid.
    conformable(X.Grid(), Y.Grid());
    conformable(X.Grid(), force.Grid());
    GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
    GaugeField clover_force(force.Grid());
    PropagatorField Lambda(force.Grid());
-    // Guido: Here we are hitting some performance issues:
+public:
    // need to extract the components of the DoubledGaugeField
    // for each call
    // Possible solution
    // Create a vector object to store them? (cons: wasting space)
    std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
    Impl::extractLinkField(U, this->Umu);
    force = Zero();
    // Derivative of the Wilson hopping term
    this->DhopDeriv(force, X, Y, dag);
    ///////////////////////////////////////////////////////////
    // Clover term derivative
    ///////////////////////////////////////////////////////////
    Impl::outerProductImpl(Lambda, X, Y);
    //std::cout << "Lambda:" << Lambda << std::endl;
    Gamma::Algebra sigma[] = {
        Gamma::Algebra::SigmaXY,
        Gamma::Algebra::SigmaXZ,
        Gamma::Algebra::SigmaXT,
        Gamma::Algebra::MinusSigmaXY,
        Gamma::Algebra::SigmaYZ,
        Gamma::Algebra::SigmaYT,
        Gamma::Algebra::MinusSigmaXZ,
        Gamma::Algebra::MinusSigmaYZ,
        Gamma::Algebra::SigmaZT,
        Gamma::Algebra::MinusSigmaXT,
        Gamma::Algebra::MinusSigmaYT,
        Gamma::Algebra::MinusSigmaZT};
    /*
      sigma_{\mu \nu}=
      | 0         sigma[0]  sigma[1]  sigma[2] |
      | sigma[3]    0       sigma[4]  sigma[5] |
      | sigma[6]  sigma[7]     0      sigma[8] |
      | sigma[9]  sigma[10] sigma[11]   0      |
    */
    // Index of the temporal direction in the (X,Y,Z,T) ordering of the table
    // above: the *T entries occupy row/column 3.
    const int t_dir = 3;
    // count walks the sigma table row by row, skipping the diagonal.
    int count = 0;
    clover_force = Zero();
    for (int mu = 0; mu < 4; mu++)
    {
      force_mu = Zero();
      for (int nu = 0; nu < 4; nu++)
      {
        if (mu == nu)
          continue;
        // Planes containing the temporal direction use csw_t, purely spatial
        // planes use csw_r.  The earlier test compared against 4, a value mu
        // and nu never take here, so csw_t was never applied to the force.
        const RealD factor = 2.0 * ((nu == t_dir || mu == t_dir) ? csw_t : csw_r);
        PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
        Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
        force_mu -= factor*Cmunu(U, lambda, mu, nu);                   // checked
        count++;
      }
      pokeLorentz(clover_force, U[mu] * force_mu, mu);
    }
    //clover_force *= csw;
    force += clover_force;
  }
  // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
  //
  // Accumulates eight staple insertions of lambda in the (mu,nu) plane: the
  // four "upper" terms C1+..C4+ are added, the four "lower" terms C1-..C4- are
  // subtracted.  U holds one link field per direction and is only read.
  // The result lives on lambda's grid.
  GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
  {
    conformable(lambda.Grid(), U[0].Grid());
    GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());

    // Shifted link products shared by several terms are built once instead of
    // once per term (every one of them involves lattice shifts).
    // NOTE(review): relies on the Impl shift helpers depending only on their
    // arguments - confirm for exotic Impl types.
    const GaugeLinkField Unu_back    = Impl::CovShiftIdentityBackward(U[nu], nu);        // C1+, C2+, C4+
    const GaugeLinkField UmuUnu_back = Impl::CovShiftBackward(U[mu], mu, Unu_back);      // C1+, C4+
    const GaugeLinkField UmuUnu      = Impl::CovShiftBackward(U[mu], mu, U[nu]);         // C1-, C2-, C4-
    const GaugeLinkField lower       = Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, UmuUnu), mu); // C1-, C4-

    // insertion in upper staple
    // C1+
    tmp = lambda * U[nu];
    out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, UmuUnu_back), mu);
    // C2+
    tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Unu_back)), mu);
    // C3+
    tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
    // C4+
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, UmuUnu_back), mu) * lambda;

    // insertion in lower staple
    // C1-
    out -= Impl::ShiftStaple(lambda, mu) * lower;
    // C2-
    tmp = adj(lambda) * U[nu];
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, UmuUnu), mu);
    // C3-
    tmp = lambda * U[nu];
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
    // C4-
    out -= lower * lambda;
    return out;
  }
 protected:
  // here fixing the 4 dimensions, make it more general?
  RealD csw_r;                                               // Clover coefficient - spatial
  RealD csw_t;                                               // Clover coefficient - temporal
  RealD diag_mass;                                           // Mass term
-  CloverFieldType CloverTerm, CloverTermInv;                 // Clover term
+  CloverField CloverTerm, CloverTermInv;                     // Clover term
-  CloverFieldType CloverTermEven, CloverTermOdd;             // Clover term EO
+  CloverField CloverTermEven, CloverTermOdd;                 // Clover term EO
-  CloverFieldType CloverTermInvEven, CloverTermInvOdd;       // Clover term Inv EO
+  CloverField CloverTermInvEven, CloverTermInvOdd;           // Clover term Inv EO
-  CloverFieldType CloverTermDagEven, CloverTermDagOdd;       // Clover term Dag EO
+  CloverField CloverTermDagEven, CloverTermDagOdd;           // Clover term Dag EO
-  CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
+  CloverField CloverTermInvDagEven, CloverTermInvDagOdd;     // Clover term Inv Dag EO
 public:
  // eventually these can be compressed into 6x6 blocks instead of the 12x12
  // using the DeGrand-Rossi basis for the gamma matrices
  // Returns a clover field on F's grid that is zero except for the outer
  // (Ns-indexed) entries (0,1), (1,0), (2,3), (3,2), each set to -i * F.
  // F is presumably the YZ field-strength component (by name).
  CloverFieldType fillCloverYZ(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView(T_v,T,AcceleratorWrite);
    autoView(F_v,F,AcceleratorRead);
    // Loop over T's own sites (T is allocated on F's grid) rather than
    // CloverTerm's grid, so the bound always matches the views being indexed.
    accelerator_for(i, T.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
      T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
      T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
    });
    return T;
  }
  // Returns a clover field on F's grid that is zero except for the outer
  // (Ns-indexed) entries (0,1) = -F, (1,0) = +F, (2,3) = -F, (3,2) = +F.
  // F is presumably the XZ field-strength component (by name).
  CloverFieldType fillCloverXZ(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView(T_v, T,AcceleratorWrite);
    autoView(F_v, F,AcceleratorRead);
    // Loop over T's own sites (T is allocated on F's grid) rather than
    // CloverTerm's grid, so the bound always matches the views being indexed.
    accelerator_for(i, T.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = -F_v[i]()();
      T_v[i]()(1, 0) = F_v[i]()();
      T_v[i]()(2, 3) = -F_v[i]()();
      T_v[i]()(3, 2) = F_v[i]()();
    });
    return T;
  }
  // Returns a clover field on F's grid that is zero except for the outer
  // (Ns-indexed) diagonal: (0,0) = -i*F, (1,1) = +i*F, (2,2) = -i*F, (3,3) = +i*F.
  // F is presumably the XY field-strength component (by name).
  CloverFieldType fillCloverXY(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView(T_v,T,AcceleratorWrite);
    autoView(F_v,F,AcceleratorRead);
    // Loop over T's own sites (T is allocated on F's grid) rather than
    // CloverTerm's grid, so the bound always matches the views being indexed.
    accelerator_for(i, T.Grid()->oSites(),1,
    {
      T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
      T_v[i]()(1, 1) = timesI(F_v[i]()());
      T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 3) = timesI(F_v[i]()());
    });
    return T;
  }
  // Returns a clover field on F's grid that is zero except for the outer
  // (Ns-indexed) entries (0,1) = (1,0) = +i*F and (2,3) = (3,2) = -i*F.
  // F is presumably the XT field-strength component (by name).
  CloverFieldType fillCloverXT(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView( T_v , T, AcceleratorWrite);
    autoView( F_v , F, AcceleratorRead);
    // Loop over T's own sites (T is allocated on F's grid) rather than
    // CloverTerm's grid, so the bound always matches the views being indexed.
    accelerator_for(i, T.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = timesI(F_v[i]()());
      T_v[i]()(1, 0) = timesI(F_v[i]()());
      T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
    });
    return T;
  }
  // Returns a clover field on F's grid that is zero except for the outer
  // (Ns-indexed) entries (0,1) = -F, (1,0) = +F, (2,3) = +F, (3,2) = -F.
  // F is presumably the YT field-strength component (by name).
  CloverFieldType fillCloverYT(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView( T_v ,T,AcceleratorWrite);
    autoView( F_v ,F,AcceleratorRead);
    // Loop over T's own sites (T is allocated on F's grid) rather than
    // CloverTerm's grid, so the bound always matches the views being indexed.
    accelerator_for(i, T.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = -(F_v[i]()());
      T_v[i]()(1, 0) = (F_v[i]()());
      T_v[i]()(2, 3) = (F_v[i]()());
      T_v[i]()(3, 2) = -(F_v[i]()());
    });
    return T;
  }
  // Returns a clover field on F's grid that is zero except for the outer
  // (Ns-indexed) diagonal: (0,0) = +i*F, (1,1) = -i*F, (2,2) = -i*F, (3,3) = +i*F.
  // F is presumably the ZT field-strength component (by name).
  CloverFieldType fillCloverZT(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView( T_v , T,AcceleratorWrite);
    autoView( F_v , F,AcceleratorRead);
    // Loop over T's own sites (T is allocated on F's grid) rather than
    // CloverTerm's grid, so the bound always matches the views being indexed.
    accelerator_for(i, T.Grid()->oSites(),1,
    {
      T_v[i]()(0, 0) = timesI(F_v[i]()());
      T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
      T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 3) = timesI(F_v[i]()());
    });
    return T;
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonCloverHelpers.h
+++ b/Grid/qcd/action/fermion/WilsonCloverHelpers.h
@ -0,0 +1,763 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/WilsonCloverHelpers.h
    Copyright (C) 2021 - 2022
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 // Helper routines that implement common clover functionality
 NAMESPACE_BEGIN(Grid);
 // Stateless collection of clover helper routines parameterised on the fermion
 // implementation Impl.  Every member is static, so the routines can be used
 // either through inheritance or as WilsonCloverHelpers<Impl>::name(...).
 template<class Impl> class WilsonCloverHelpers {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);

  // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
  //
  // Accumulates eight staple insertions of lambda in the (mu,nu) plane: the
  // four "upper" terms C1+..C4+ are added, the four "lower" terms C1-..C4- are
  // subtracted.  U holds one link field per direction and is only read.
  // The result lives on lambda's grid.
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
  {
    conformable(lambda.Grid(), U[0].Grid());
    GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());

    // Shifted link products shared by several terms are built once instead of
    // once per term (every one of them involves lattice shifts).
    // NOTE(review): relies on the Impl shift helpers depending only on their
    // arguments - confirm for exotic Impl types.
    const GaugeLinkField Unu_back    = Impl::CovShiftIdentityBackward(U[nu], nu);        // C1+, C2+, C4+
    const GaugeLinkField UmuUnu_back = Impl::CovShiftBackward(U[mu], mu, Unu_back);      // C1+, C4+
    const GaugeLinkField UmuUnu      = Impl::CovShiftBackward(U[mu], mu, U[nu]);         // C1-, C2-, C4-
    const GaugeLinkField lower       = Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, UmuUnu), mu); // C1-, C4-

    // insertion in upper staple
    // C1+
    tmp = lambda * U[nu];
    out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, UmuUnu_back), mu);
    // C2+
    tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Unu_back)), mu);
    // C3+
    tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
    // C4+
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, UmuUnu_back), mu) * lambda;

    // insertion in lower staple
    // C1-
    out -= Impl::ShiftStaple(lambda, mu) * lower;
    // C2-
    tmp = adj(lambda) * U[nu];
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, UmuUnu), mu);
    // C3-
    tmp = lambda * U[nu];
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
    // C4-
    out -= lower * lambda;
    return out;
  }

  // The six fillClover?? routines share one pattern: allocate a CloverField T
  // on F's grid, zero it, then write four entries of the outer matrix index
  // per site from F (optionally multiplied by +-1 or +-i).  The loop runs over
  // T's own outer sites with Nsimd lanes, using coalesced reads/writes.

  // Entries (0,1), (1,0), (2,3), (3,2) all set to -i*F.
  static CloverField fillCloverYZ(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView(T_v,T,AcceleratorWrite);
    autoView(F_v,F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 1), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 0), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 3), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 2), coalescedRead(timesMinusI(F_v[i]()())));
    });
    return T;
  }

  // Entries (0,1) = -F, (1,0) = +F, (2,3) = -F, (3,2) = +F.
  static CloverField fillCloverXZ(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView(T_v, T,AcceleratorWrite);
    autoView(F_v, F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 1), coalescedRead(-F_v[i]()()));
      coalescedWrite(T_v[i]()(1, 0), coalescedRead(F_v[i]()()));
      coalescedWrite(T_v[i]()(2, 3), coalescedRead(-F_v[i]()()));
      coalescedWrite(T_v[i]()(3, 2), coalescedRead(F_v[i]()()));
    });
    return T;
  }

  // Diagonal entries (0,0) = -i*F, (1,1) = +i*F, (2,2) = -i*F, (3,3) = +i*F.
  static CloverField fillCloverXY(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView(T_v,T,AcceleratorWrite);
    autoView(F_v,F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 0), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 1), coalescedRead(timesI(F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 2), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 3), coalescedRead(timesI(F_v[i]()())));
    });
    return T;
  }

  // Entries (0,1) = (1,0) = +i*F and (2,3) = (3,2) = -i*F.
  static CloverField fillCloverXT(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView( T_v , T, AcceleratorWrite);
    autoView( F_v , F, AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 1), coalescedRead(timesI(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 0), coalescedRead(timesI(F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 3), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 2), coalescedRead(timesMinusI(F_v[i]()())));
    });
    return T;
  }

  // Entries (0,1) = -F, (1,0) = +F, (2,3) = +F, (3,2) = -F.
  static CloverField fillCloverYT(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView( T_v ,T,AcceleratorWrite);
    autoView( F_v ,F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 1), coalescedRead(-(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 0), coalescedRead((F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 3), coalescedRead((F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 2), coalescedRead(-(F_v[i]()())));
    });
    return T;
  }

  // Diagonal entries (0,0) = +i*F, (1,1) = -i*F, (2,2) = -i*F, (3,3) = +i*F.
  static CloverField fillCloverZT(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView( T_v , T,AcceleratorWrite);
    autoView( F_v , F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 0), coalescedRead(timesI(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 1), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 2), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 3), coalescedRead(timesI(F_v[i]()())));
    });
    return T;
  }

  // Single-site product: reads the clover matrix C through coalescedRead and
  // hands it to mult() together with chi; the result is stored in phi
  // (presumably phi = C * chi - confirm against mult()).
  template<class _Spinor>
  static accelerator_inline void multClover(_Spinor& phi, const SiteClover& C, const _Spinor& chi) {
    auto CC = coalescedRead(C);
    mult(&phi, &CC, &chi);
  }

  // Field-level product: for every outer site of out's grid, applies
  // multClover to C and phi at that site and writes the result into out.
  // Declared static like every other helper here: it touches no instance
  // state, and existing member-style calls keep working unchanged.
  template<class _SpinorField>
  static void multCloverField(_SpinorField& out, const CloverField& C, const _SpinorField& phi) {
    const int Nsimd = SiteSpinor::Nsimd();
    autoView(out_v, out, AcceleratorWrite);
    autoView(phi_v, phi, AcceleratorRead);
    autoView(C_v,   C,   AcceleratorRead);
    // Per-lane spinor type as produced by a coalesced read of the output view.
    typedef decltype(coalescedRead(out_v[0])) calcSpinor;
    accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
      calcSpinor tmp;
      multClover(tmp,C_v[sss],phi_v(sss));
      coalescedWrite(out_v[sss],tmp);
    });
  }
 };
 ////////////////////////////////////////////////////////
 template<class Impl> class CompactWilsonCloverHelpers {
 public:
  INHERIT_COMPACT_CLOVER_SIZES(Impl);
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);
  #if 0
  static accelerator_inline typename SiteCloverTriangle::vector_type triangle_elem(const SiteCloverTriangle& triangle, int block, int i, int j) {
    assert(i != j);
    if(i < j) {
      return triangle()(block)(triangle_index(i, j));
    } else { // i > j
      return conjugate(triangle()(block)(triangle_index(i, j)));
    }
  }
  #else
  // Returns the off-diagonal element (i,j) of the given block from the packed
  // triangle storage: the stored value itself for i < j, its complex conjugate
  // otherwise.  i == j is a caller error (asserted).
  template<typename vobj>
  static accelerator_inline vobj triangle_elem(const iImplCloverTriangle<vobj>& triangle, int block, int i, int j) {
    assert(i != j);
    // triangle_index is symmetric in (i,j), so one lookup serves both cases.
    const int packed = triangle_index(i, j);
    if(i < j) return triangle()(block)(packed);
    return conjugate(triangle()(block)(packed)); // i > j
  }
  #endif
  // Maps an unordered index pair (i,j) of an Nred x Nred block to its position
  // in the packed triangle storage.  The mapping is symmetric in i and j;
  // i == j yields 0.
  static accelerator_inline int triangle_index(int i, int j) {
    if(i == j) return 0;
    const int lo = (i < j) ? i : j; // smaller of the two indices
    const int hi = (i < j) ? j : i; // larger of the two indices
    return Nred * (Nred - 1) / 2 - (Nred - lo) * (Nred - lo - 1) / 2 + hi - lo - 1;
  }
  static void MooeeKernel_gpu(int                        Nsite,
                              int                        Ls,
                              const FermionField&        in,
                              FermionField&              out,
                              const CloverDiagonalField& diagonal,
                              const CloverTriangleField& triangle) {
    autoView(diagonal_v, diagonal, AcceleratorRead);
    autoView(triangle_v, triangle, AcceleratorRead);
    autoView(in_v,       in,       AcceleratorRead);
    autoView(out_v,      out,      AcceleratorWrite);
    typedef decltype(coalescedRead(out_v[0])) CalcSpinor;
    const uint64_t NN = Nsite * Ls;
    accelerator_for(ss, NN, Simd::Nsimd(), {
      int sF = ss;
      int sU = ss/Ls;
      CalcSpinor res;
      CalcSpinor in_t = in_v(sF);
      auto diagonal_t = diagonal_v(sU);
      auto triangle_t = triangle_v(sU);
      for(int block=0; block<Nhs; block++) {
        int s_start = block*Nhs;
        for(int i=0; i<Nred; i++) {
          int si = s_start + i/Nc, ci = i%Nc;
          res()(si)(ci) = diagonal_t()(block)(i) * in_t()(si)(ci);
          for(int j=0; j<Nred; j++) {
            if (j == i) continue;
            int sj = s_start + j/Nc, cj = j%Nc;
            res()(si)(ci) = res()(si)(ci) + triangle_elem(triangle_t, block, i, j) * in_t()(sj)(cj);
          };
        };
      };
      coalescedWrite(out_v[sF], res);
    });
  }
  static void MooeeKernel_cpu(int                        Nsite,
                              int                        Ls,
                              const FermionField&        in,
                              FermionField&              out,
                              const CloverDiagonalField& diagonal,
                              const CloverTriangleField& triangle) {
    autoView(diagonal_v, diagonal, CpuRead);
    autoView(triangle_v, triangle, CpuRead);
    autoView(in_v,       in,       CpuRead);
    autoView(out_v,      out,      CpuWrite);
    typedef SiteSpinor CalcSpinor;
 #if defined(A64FX) || defined(A64FXFIXEDSIZE)
 #define PREFETCH_CLOVER(BASE) {                                     \
    uint64_t base;                                                  \
    int pf_dist_L1 = 1;                                             \
    int pf_dist_L2 = -5; /* -> penalty -> disable */                \
                                                                    \
    if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) {           \
      base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0);               \
      svprfd(svptrue_b64(), (int64_t*)(base +    0), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  256), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  512), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  768), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL1STRM); \
    }                                                               \
                                                                    \
    if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) {           \
      base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0);               \
      svprfd(svptrue_b64(), (int64_t*)(base +    0), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  256), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  512), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  768), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL2STRM); \
    }                                                               \
  }
 // TODO: Implement/generalize this for other architectures
 // I played around a bit on KNL (see below) but didn't bring anything
 // #elif defined(AVX512)
 // #define PREFETCH_CLOVER(BASE) {                              \
 //     uint64_t base;                                           \
 //     int pf_dist_L1 = 1;                                      \
 //     int pf_dist_L2 = +4;                                     \
 //                                                              \
 //     if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) {    \
 //       base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0);        \
 //       _mm_prefetch((const char*)(base +    0), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +   64), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +  128), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +  192), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +  256), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +  320), _MM_HINT_T0); \
 //     }                                                        \
 //                                                              \
 //     if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) {    \
 //       base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0);        \
 //       _mm_prefetch((const char*)(base +    0), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +   64), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +  128), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +  192), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +  256), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +  320), _MM_HINT_T1); \
 //     }                                                        \
 //   }
 #else
 #define PREFETCH_CLOVER(BASE)
 #endif
    const uint64_t NN = Nsite * Ls;
    thread_for(ss, NN, {
      int sF = ss;
      int sU = ss/Ls;
      CalcSpinor res;
      CalcSpinor in_t = in_v[sF];
      auto diag_t     = diagonal_v[sU]; // "diag" instead of "diagonal" here to make code below easier to read
      auto triangle_t = triangle_v[sU];
      // upper half
      PREFETCH_CLOVER(0);
      auto in_cc_0_0 = conjugate(in_t()(0)(0)); // Nils: reduces number
      auto in_cc_0_1 = conjugate(in_t()(0)(1)); // of conjugates from
      auto in_cc_0_2 = conjugate(in_t()(0)(2)); // 30 to 20
      auto in_cc_1_0 = conjugate(in_t()(1)(0));
      auto in_cc_1_1 = conjugate(in_t()(1)(1));
      res()(0)(0) =               diag_t()(0)( 0) * in_t()(0)(0)
                  +           triangle_t()(0)( 0) * in_t()(0)(1)
                  +           triangle_t()(0)( 1) * in_t()(0)(2)
                  +           triangle_t()(0)( 2) * in_t()(1)(0)
                  +           triangle_t()(0)( 3) * in_t()(1)(1)
                  +           triangle_t()(0)( 4) * in_t()(1)(2);
      res()(0)(1) =           triangle_t()(0)( 0) * in_cc_0_0;
      res()(0)(1) =               diag_t()(0)( 1) * in_t()(0)(1)
                  +           triangle_t()(0)( 5) * in_t()(0)(2)
                  +           triangle_t()(0)( 6) * in_t()(1)(0)
                  +           triangle_t()(0)( 7) * in_t()(1)(1)
                  +           triangle_t()(0)( 8) * in_t()(1)(2)
                  + conjugate(       res()(0)( 1));
      res()(0)(2) =           triangle_t()(0)( 1) * in_cc_0_0
                  +           triangle_t()(0)( 5) * in_cc_0_1;
      res()(0)(2) =               diag_t()(0)( 2) * in_t()(0)(2)
                  +           triangle_t()(0)( 9) * in_t()(1)(0)
                  +           triangle_t()(0)(10) * in_t()(1)(1)
                  +           triangle_t()(0)(11) * in_t()(1)(2)
                  + conjugate(       res()(0)( 2));
      res()(1)(0) =           triangle_t()(0)( 2) * in_cc_0_0
                  +           triangle_t()(0)( 6) * in_cc_0_1
                  +           triangle_t()(0)( 9) * in_cc_0_2;
      res()(1)(0) =               diag_t()(0)( 3) * in_t()(1)(0)
                  +           triangle_t()(0)(12) * in_t()(1)(1)
                  +           triangle_t()(0)(13) * in_t()(1)(2)
                  + conjugate(       res()(1)( 0));
      res()(1)(1) =           triangle_t()(0)( 3) * in_cc_0_0
                  +           triangle_t()(0)( 7) * in_cc_0_1
                  +           triangle_t()(0)(10) * in_cc_0_2
                  +           triangle_t()(0)(12) * in_cc_1_0;
      res()(1)(1) =               diag_t()(0)( 4) * in_t()(1)(1)
                  +           triangle_t()(0)(14) * in_t()(1)(2)
                  + conjugate(       res()(1)( 1));
      res()(1)(2) =           triangle_t()(0)( 4) * in_cc_0_0
                  +           triangle_t()(0)( 8) * in_cc_0_1
                  +           triangle_t()(0)(11) * in_cc_0_2
                  +           triangle_t()(0)(13) * in_cc_1_0
                  +           triangle_t()(0)(14) * in_cc_1_1;
      res()(1)(2) =               diag_t()(0)( 5) * in_t()(1)(2)
                  + conjugate(       res()(1)( 2));
      vstream(out_v[sF]()(0)(0), res()(0)(0));
      vstream(out_v[sF]()(0)(1), res()(0)(1));
      vstream(out_v[sF]()(0)(2), res()(0)(2));
      vstream(out_v[sF]()(1)(0), res()(1)(0));
      vstream(out_v[sF]()(1)(1), res()(1)(1));
      vstream(out_v[sF]()(1)(2), res()(1)(2));
      // lower half
      PREFETCH_CLOVER(1);
      auto in_cc_2_0 = conjugate(in_t()(2)(0));
      auto in_cc_2_1 = conjugate(in_t()(2)(1));
      auto in_cc_2_2 = conjugate(in_t()(2)(2));
      auto in_cc_3_0 = conjugate(in_t()(3)(0));
      auto in_cc_3_1 = conjugate(in_t()(3)(1));
      res()(2)(0) =               diag_t()(1)( 0) * in_t()(2)(0)
                  +           triangle_t()(1)( 0) * in_t()(2)(1)
                  +           triangle_t()(1)( 1) * in_t()(2)(2)
                  +           triangle_t()(1)( 2) * in_t()(3)(0)
                  +           triangle_t()(1)( 3) * in_t()(3)(1)
                  +           triangle_t()(1)( 4) * in_t()(3)(2);
      res()(2)(1) =           triangle_t()(1)( 0) * in_cc_2_0;
      res()(2)(1) =               diag_t()(1)( 1) * in_t()(2)(1)
                  +           triangle_t()(1)( 5) * in_t()(2)(2)
                  +           triangle_t()(1)( 6) * in_t()(3)(0)
                  +           triangle_t()(1)( 7) * in_t()(3)(1)
                  +           triangle_t()(1)( 8) * in_t()(3)(2)
                  + conjugate(       res()(2)( 1));
      res()(2)(2) =           triangle_t()(1)( 1) * in_cc_2_0
                  +           triangle_t()(1)( 5) * in_cc_2_1;
      res()(2)(2) =               diag_t()(1)( 2) * in_t()(2)(2)
                  +           triangle_t()(1)( 9) * in_t()(3)(0)
                  +           triangle_t()(1)(10) * in_t()(3)(1)
                  +           triangle_t()(1)(11) * in_t()(3)(2)
                  + conjugate(       res()(2)( 2));
      res()(3)(0) =           triangle_t()(1)( 2) * in_cc_2_0
                  +           triangle_t()(1)( 6) * in_cc_2_1
                  +           triangle_t()(1)( 9) * in_cc_2_2;
      res()(3)(0) =               diag_t()(1)( 3) * in_t()(3)(0)
                  +           triangle_t()(1)(12) * in_t()(3)(1)
                  +           triangle_t()(1)(13) * in_t()(3)(2)
                  + conjugate(       res()(3)( 0));
      res()(3)(1) =           triangle_t()(1)( 3) * in_cc_2_0
                  +           triangle_t()(1)( 7) * in_cc_2_1
                  +           triangle_t()(1)(10) * in_cc_2_2
                  +           triangle_t()(1)(12) * in_cc_3_0;
      res()(3)(1) =               diag_t()(1)( 4) * in_t()(3)(1)
                  +           triangle_t()(1)(14) * in_t()(3)(2)
                  + conjugate(       res()(3)( 1));
      res()(3)(2) =           triangle_t()(1)( 4) * in_cc_2_0
                  +           triangle_t()(1)( 8) * in_cc_2_1
                  +           triangle_t()(1)(11) * in_cc_2_2
                  +           triangle_t()(1)(13) * in_cc_3_0
                  +           triangle_t()(1)(14) * in_cc_3_1;
      res()(3)(2) =               diag_t()(1)( 5) * in_t()(3)(2)
                  + conjugate(       res()(3)( 2));
      vstream(out_v[sF]()(2)(0), res()(2)(0));
      vstream(out_v[sF]()(2)(1), res()(2)(1));
      vstream(out_v[sF]()(2)(2), res()(2)(2));
      vstream(out_v[sF]()(3)(0), res()(3)(0));
      vstream(out_v[sF]()(3)(1), res()(3)(1));
      vstream(out_v[sF]()(3)(2), res()(3)(2));
    });
  }
  // Compile-time dispatcher: forwards all arguments unchanged to
  // MooeeKernel_gpu when GRID_CUDA or GRID_HIP is defined, and to
  // MooeeKernel_cpu otherwise.
  static void MooeeKernel(int                        Nsite,
                          int                        Ls,
                          const FermionField&        in,
                          FermionField&              out,
                          const CloverDiagonalField& diagonal,
                          const CloverTriangleField& triangle) {
 #if !defined(GRID_CUDA) && !defined(GRID_HIP)
    // Neither accelerator backend selected: use the host implementation
    MooeeKernel_cpu(Nsite, Ls, in, out, diagonal, triangle);
 #else
    MooeeKernel_gpu(Nsite, Ls, in, out, diagonal, triangle);
 #endif
  }
  static void Invert(const CloverDiagonalField& diagonal,
                     const CloverTriangleField& triangle,
                     CloverDiagonalField&       diagonalInv,
                     CloverTriangleField&       triangleInv) {
    conformable(diagonal, diagonalInv);
    conformable(triangle, triangleInv);
    conformable(diagonal, triangle);
    diagonalInv.Checkerboard() = diagonal.Checkerboard();
    triangleInv.Checkerboard() = triangle.Checkerboard();
    GridBase* grid = diagonal.Grid();
    long lsites = grid->lSites();
    typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
    typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
    autoView(diagonal_v,  diagonal,  CpuRead);
    autoView(triangle_v,  triangle,  CpuRead);
    autoView(diagonalInv_v, diagonalInv, CpuWrite);
    autoView(triangleInv_v, triangleInv, CpuWrite);
    thread_for(site, lsites, { // NOTE: Not on GPU because of Eigen & (peek/poke)LocalSite
      Eigen::MatrixXcd clover_inv_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
      Eigen::MatrixXcd clover_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
      scalar_object_diagonal diagonal_tmp     = Zero();
      scalar_object_diagonal diagonal_inv_tmp = Zero();
      scalar_object_triangle triangle_tmp     = Zero();
      scalar_object_triangle triangle_inv_tmp = Zero();
      Coordinate lcoor;
      grid->LocalIndexToLocalCoor(site, lcoor);
      peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
      peekLocalSite(triangle_tmp, triangle_v, lcoor);
      // TODO: can we save time here by inverting the two 6x6 hermitian matrices separately?
      for (long s_row=0;s_row<Ns;s_row++) {
        for (long s_col=0;s_col<Ns;s_col++) {
          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
          int block       = s_row / Nhs;
          int s_row_block = s_row % Nhs;
          int s_col_block = s_col % Nhs;
          for (long c_row=0;c_row<Nc;c_row++) {
            for (long c_col=0;c_col<Nc;c_col++) {
              int i = s_row_block * Nc + c_row;
              int j = s_col_block * Nc + c_col;
              if(i == j)
                clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
              else
                clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(triangle_elem(triangle_tmp, block, i, j)));
            }
          }
        }
      }
      clover_inv_eigen = clover_eigen.inverse();
      for (long s_row=0;s_row<Ns;s_row++) {
        for (long s_col=0;s_col<Ns;s_col++) {
          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
          int block       = s_row / Nhs;
          int s_row_block = s_row % Nhs;
          int s_col_block = s_col % Nhs;
          for (long c_row=0;c_row<Nc;c_row++) {
            for (long c_col=0;c_col<Nc;c_col++) {
              int i = s_row_block * Nc + c_row;
              int j = s_col_block * Nc + c_col;
              if(i == j)
                diagonal_inv_tmp()(block)(i) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
              else if(i < j)
                triangle_inv_tmp()(block)(triangle_index(i, j)) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
              else
                continue;
            }
          }
        }
      }
      pokeLocalSite(diagonal_inv_tmp, diagonalInv_v, lcoor);
      pokeLocalSite(triangle_inv_tmp, triangleInv_v, lcoor);
    });
  }
  static void ConvertLayout(const CloverField&   full,
                            CloverDiagonalField& diagonal,
                            CloverTriangleField& triangle) {
    conformable(full, diagonal);
    conformable(full, triangle);
    diagonal.Checkerboard() = full.Checkerboard();
    triangle.Checkerboard() = full.Checkerboard();
    autoView(full_v,     full,     AcceleratorRead);
    autoView(diagonal_v, diagonal, AcceleratorWrite);
    autoView(triangle_v, triangle, AcceleratorWrite);
    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
    accelerator_for(ss, full.Grid()->oSites(), 1, {
      for(int s_row = 0; s_row < Ns; s_row++) {
        for(int s_col = 0; s_col < Ns; s_col++) {
          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
          int block       = s_row / Nhs;
          int s_row_block = s_row % Nhs;
          int s_col_block = s_col % Nhs;
          for(int c_row = 0; c_row < Nc; c_row++) {
            for(int c_col = 0; c_col < Nc; c_col++) {
              int i = s_row_block * Nc + c_row;
              int j = s_col_block * Nc + c_col;
              if(i == j)
                diagonal_v[ss]()(block)(i) = full_v[ss]()(s_row, s_col)(c_row, c_col);
              else if(i < j)
                triangle_v[ss]()(block)(triangle_index(i, j)) = full_v[ss]()(s_row, s_col)(c_row, c_col);
              else
                continue;
            }
          }
        }
      }
    });
  }
  static void ConvertLayout(const CloverDiagonalField& diagonal,
                            const CloverTriangleField& triangle,
                            CloverField&               full) {
    conformable(full, diagonal);
    conformable(full, triangle);
    full.Checkerboard() = diagonal.Checkerboard();
    full = Zero();
    autoView(diagonal_v, diagonal, AcceleratorRead);
    autoView(triangle_v, triangle, AcceleratorRead);
    autoView(full_v,     full,     AcceleratorWrite);
    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
    accelerator_for(ss, full.Grid()->oSites(), 1, {
      for(int s_row = 0; s_row < Ns; s_row++) {
        for(int s_col = 0; s_col < Ns; s_col++) {
          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
          int block       = s_row / Nhs;
          int s_row_block = s_row % Nhs;
          int s_col_block = s_col % Nhs;
          for(int c_row = 0; c_row < Nc; c_row++) {
            for(int c_col = 0; c_col < Nc; c_col++) {
              int i = s_row_block * Nc + c_row;
              int j = s_col_block * Nc + c_col;
              if(i == j)
                full_v[ss]()(s_row, s_col)(c_row, c_col) = diagonal_v[ss]()(block)(i);
              else
                full_v[ss]()(s_row, s_col)(c_row, c_col) = triangle_elem(triangle_v[ss], block, i, j);
            }
          }
        }
      }
    });
  }
  // Adjusts the compact clover term on and next to the temporal boundaries.
  //
  //  * time slices 0 and T-1: `triangle` is set to zero and `diagonal` is set
  //    to (-csw_t + diag_mass);
  //  * time slices 1 and T-2: if cF != 1.0, (cF - 1.0) is added to `diagonal`.
  //
  // T is the global extent of direction Nd-1. The usecond() checkpoints feed
  // only the disabled timing report at the end.
  static void ModifyBoundaries(CloverDiagonalField& diagonal, CloverTriangleField& triangle, RealD csw_t, RealD cF, RealD diag_mass) {
    // Checks/grid
    double tStart = usecond();
    conformable(diagonal, triangle);
    GridBase* grid = diagonal.Grid();

    // Determine the boundary coordinates/sites
    double tChecked = usecond();
    int timeDir = Nd - 1;
    Lattice<iScalar<vInteger>> timeCoor(grid);
    LatticeCoordinate(timeCoor, timeDir);
    int T = grid->GlobalDimensions()[timeDir];

    // Set off-diagonal parts at boundary to zero -- OK
    double tCoorDone = usecond();
    CloverTriangleField noTriangle(grid);
    noTriangle.Checkerboard() = triangle.Checkerboard();
    noTriangle = Zero();
    triangle = where(timeCoor == 0,   noTriangle, triangle);
    triangle = where(timeCoor == T-1, noTriangle, triangle);

    // Set diagonal to unity (scaled correctly) -- OK
    double tTriangleDone = usecond();
    CloverDiagonalField work(grid);
    work.Checkerboard() = diagonal.Checkerboard();
    work     = -1.0 * csw_t + diag_mass;
    diagonal = where(timeCoor == 0,   work, diagonal);
    diagonal = where(timeCoor == T-1, work, diagonal);

    // Correct values next to boundary
    double tDiagonalDone = usecond();
    if(cF != 1.0) {
      work = cF - 1.0;
      work += diagonal; // work now holds diagonal + (cF - 1.0) on every site
      diagonal = where(timeCoor == 1,   work, diagonal);
      diagonal = where(timeCoor == T-2, work, diagonal);
    }

    // Report timings
    double tEnd = usecond();
 #if 0
    std::cout << GridLogMessage << "CompactWilsonCloverHelpers::ModifyBoundaries timings:"
              << " checks = "          << (tChecked - tStart) / 1e6
              << ", coordinate = "     << (tCoorDone - tChecked) / 1e6
              << ", off-diag zero = "  << (tTriangleDone - tCoorDone) / 1e6
              << ", diagonal unity = " << (tDiagonalDone - tTriangleDone) / 1e6
              << ", near-boundary = "  << (tEnd - tDiagonalDone) / 1e6
              << ", total = "          << (tEnd - tStart) / 1e6
              << std::endl;
 #endif
  }
  // Multiplies the field `f` in place, site by site, with the mask `m`
  // (f[ss] <- m[ss] * f[ss]). Both arguments must be conformable.
  template<class Field, class Mask>
  static strong_inline void ApplyBoundaryMask(Field& f, const Mask& m) {
    conformable(f, m);

    auto fieldGrid = f.Grid();
    const uint32_t numSites  = fieldGrid->oSites();
    const uint32_t simdLanes = fieldGrid->Nsimd();

    autoView(f_v, f, AcceleratorWrite);
    autoView(m_v, m, AcceleratorRead);

    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
    accelerator_for(ss, numSites, simdLanes, {
      const auto masked = m_v(ss) * f_v(ss);
      coalescedWrite(f_v[ss], masked);
    });
  }
  template<class MaskField>
  static void SetupMasks(MaskField& full, MaskField& even, MaskField& odd) {
    assert(even.Grid()->_isCheckerBoarded && even.Checkerboard() == Even);
    assert(odd.Grid()->_isCheckerBoarded  && odd.Checkerboard()  == Odd);
    assert(!full.Grid()->_isCheckerBoarded);
    GridBase* grid = full.Grid();
    int t_dir = Nd-1;
    Lattice<iScalar<vInteger>> t_coor(grid);
    LatticeCoordinate(t_coor, t_dir);
    int T = grid->GlobalDimensions()[t_dir];
    MaskField zeroMask(grid); zeroMask = Zero();
    full = 1.0;
    full = where(t_coor == 0,   zeroMask, full);
    full = where(t_coor == T-1, zeroMask, full);
    pickCheckerboard(Even, even, full);
    pickCheckerboard(Odd,  odd,  full);
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonCloverTypes.h
+++ b/Grid/qcd/action/fermion/WilsonCloverTypes.h
@ -0,0 +1,92 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/WilsonCloverTypes.h
    Copyright (C) 2021 - 2022
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 // Type bundle for the full clover term: a site object that is a scalar
 // holding an Ns x Ns matrix of Impl::Dimension x Impl::Dimension matrices,
 // and the lattice field built from it.
 template<class Impl>
 class WilsonCloverTypes {
 public:
  INHERIT_IMPL_TYPES(Impl);

  template <typename vtype>
  using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;

  using SiteClover  = iImplClover<Simd>;
  using CloverField = Lattice<SiteClover>;
 };
 // Type bundle for the compact clover term. Per site the term is stored as
 // Nblock blocks, each holding Ndiagonal diagonal entries and Ntriangle
 // entries for the index pairs i < j of an Nred x Nred block. Only the
 // dimensions fixed by the static_assert below are supported.
 template<class Impl>
 class CompactWilsonCloverTypes {
 public:
  INHERIT_IMPL_TYPES(Impl);
  static_assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3, "Wrong dimensions");
  static constexpr int Nred      = Nc * Nhs;              // 6
  static constexpr int Nblock    = Nhs;                   // 2
  static constexpr int Ndiagonal = Nred;                  // 6
  // Number of index pairs i < j in an Nred x Nred block. The previous
  // expression (Nred - 1) * Nc gives the same value only because Nhs == 2.
  static constexpr int Ntriangle = Nred * (Nred - 1) / 2; // 15
  template<typename vtype> using iImplCloverDiagonal = iScalar<iVector<iVector<vtype, Ndiagonal>, Nblock>>;
  template<typename vtype> using iImplCloverTriangle = iScalar<iVector<iVector<vtype, Ntriangle>, Nblock>>;
  typedef iImplCloverDiagonal<Simd> SiteCloverDiagonal;
  typedef iImplCloverTriangle<Simd> SiteCloverTriangle;
  typedef iSinglet<Simd>            SiteMask;
  typedef Lattice<SiteCloverDiagonal> CloverDiagonalField;
  typedef Lattice<SiteCloverTriangle> CloverTriangleField;
  typedef Lattice<SiteMask>           MaskField;
 };
 // Re-exports SiteClover and CloverField from WilsonCloverTypes<Impl> into the
 // scope where the macro is expanded.
 #define INHERIT_CLOVER_TYPES(Impl)                                   \
  using SiteClover  = typename WilsonCloverTypes<Impl>::SiteClover;  \
  using CloverField = typename WilsonCloverTypes<Impl>::CloverField;
 // Re-exports the site, field and mask types of CompactWilsonCloverTypes<Impl>
 // into the scope where the macro is expanded, and re-declares the two alias
 // templates there.
 #define INHERIT_COMPACT_CLOVER_TYPES(Impl) \
  using SiteCloverDiagonal  = typename CompactWilsonCloverTypes<Impl>::SiteCloverDiagonal; \
  using SiteCloverTriangle  = typename CompactWilsonCloverTypes<Impl>::SiteCloverTriangle; \
  using SiteMask            = typename CompactWilsonCloverTypes<Impl>::SiteMask; \
  using CloverDiagonalField = typename CompactWilsonCloverTypes<Impl>::CloverDiagonalField; \
  using CloverTriangleField = typename CompactWilsonCloverTypes<Impl>::CloverTriangleField; \
  using MaskField           = typename CompactWilsonCloverTypes<Impl>::MaskField; \
  /* ugly duplication but needed inside functionality classes */ \
  template<typename vtype> using iImplCloverDiagonal = \
    iScalar<iVector<iVector<vtype, CompactWilsonCloverTypes<Impl>::Ndiagonal>, CompactWilsonCloverTypes<Impl>::Nblock>>; \
  template<typename vtype> using iImplCloverTriangle = \
    iScalar<iVector<iVector<vtype, CompactWilsonCloverTypes<Impl>::Ntriangle>, CompactWilsonCloverTypes<Impl>::Nblock>>;
 // Re-declares the four size constants of CompactWilsonCloverTypes<Impl>
 // (Nred, Nblock, Ndiagonal, Ntriangle) as static constexpr ints in the scope
 // where the macro is expanded.
 #define INHERIT_COMPACT_CLOVER_SIZES(Impl)                                    \
  static constexpr int Nred      = CompactWilsonCloverTypes<Impl>::Nred;      \
  static constexpr int Nblock    = CompactWilsonCloverTypes<Impl>::Nblock;    \
  static constexpr int Ndiagonal = CompactWilsonCloverTypes<Impl>::Ndiagonal; \
  static constexpr int Ntriangle = CompactWilsonCloverTypes<Impl>::Ntriangle;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@ -47,7 +47,7 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
 			FiveDimRedBlackGrid,
 			FourDimGrid,
 			FourDimRedBlackGrid,_M5,p),
-  mass(_mass)
+  mass_plus(_mass), mass_minus(_mass)
 { 
 }
@ -209,8 +209,8 @@ void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
  Vector<Coeff_t> diag (Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass;
+  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
-  Vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass;
+  Vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass_plus;
  M5D(psi,chi,chi,lower,diag,upper);
 }
 template<class Impl>
@ -220,8 +220,8 @@ void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &D
  Vector<Coeff_t> diag = bs;
  Vector<Coeff_t> upper= cs;
  Vector<Coeff_t> lower= cs; 
-  upper[Ls-1]=-mass*upper[Ls-1];
+  upper[Ls-1]=-mass_minus*upper[Ls-1];
-  lower[0]   =-mass*lower[0];
+  lower[0]   =-mass_plus*lower[0];
  M5D(psi,psi,Din,lower,diag,upper);
 }
 // FIXME Redundant with the above routine; check this and eliminate
@ -235,8 +235,8 @@ template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &
    upper[i]=-ceo[i];
    lower[i]=-ceo[i];
  }
-  upper[Ls-1]=-mass*upper[Ls-1];
+  upper[Ls-1]=-mass_minus*upper[Ls-1];
-  lower[0]   =-mass*lower[0];
+  lower[0]   =-mass_plus*lower[0];
  M5D(psi,psi,chi,lower,diag,upper);
 }
 template<class Impl>
@ -250,8 +250,8 @@ void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &
    upper[i]=-cee[i];
    lower[i]=-cee[i];
  }
-  upper[Ls-1]=-mass*upper[Ls-1];
+  upper[Ls-1]=-mass_minus*upper[Ls-1];
-  lower[0]   =-mass*lower[0];
+  lower[0]   =-mass_plus*lower[0];
  M5D(psi,psi,chi,lower,diag,upper);
 }
 template<class Impl>
@ -266,9 +266,9 @@ void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &
    // Assemble the 5d matrix
    if ( s==0 ) {
      upper[s] = -cee[s+1] ;
-      lower[s] = mass*cee[Ls-1];
+      lower[s] = mass_minus*cee[Ls-1];
    } else if ( s==(Ls-1)) { 
-      upper[s] = mass*cee[0];
+      upper[s] = mass_plus*cee[0];
      lower[s] = -cee[s-1];
    } else {
      upper[s]=-cee[s+1];
@ -291,8 +291,8 @@ void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
  Vector<Coeff_t> diag(Ls,1.0);
  Vector<Coeff_t> upper(Ls,-1.0);
  Vector<Coeff_t> lower(Ls,-1.0);
-  upper[Ls-1]=-mass*upper[Ls-1];
+  upper[Ls-1]=-mass_plus*upper[Ls-1];
-  lower[0]   =-mass*lower[0];
+  lower[0]   =-mass_minus*lower[0];
  M5Ddag(psi,chi,chi,lower,diag,upper);
 }
@ -307,9 +307,9 @@ void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField
  for (int s=0;s<Ls;s++){
    if ( s== 0 ) {
      upper[s] = cs[s+1];
-      lower[s] =-mass*cs[Ls-1];
+      lower[s] =-mass_minus*cs[Ls-1];
    } else if ( s==(Ls-1) ) { 
-      upper[s] =-mass*cs[0];
+      upper[s] =-mass_plus*cs[0];
      lower[s] = cs[s-1];
    } else { 
      upper[s] = cs[s+1];
@ -552,7 +552,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
      lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
-      leem[i]=mass*cee[Ls-1]/bee[0];
+      leem[i]=mass_minus*cee[Ls-1]/bee[0];
      for(int j=0;j<i;j++) {
 	assert(bee[j+1]!=Coeff_t(0.0));
 	leem[i]*= aee[j]/bee[j+1];
@ -560,7 +560,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
      uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
-      ueem[i]=mass;
+      ueem[i]=mass_plus;
      for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
      ueem[i]*= aee[0]/bee[0];
@ -573,7 +573,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
  }
  { 
-    Coeff_t delta_d=mass*cee[Ls-1];
+    Coeff_t delta_d=mass_minus*cee[Ls-1];
    for(int j=0;j<Ls-1;j++) {
      assert(bee[j] != Coeff_t(0.0));
      delta_d *= cee[j]/bee[j];
@ -642,6 +642,10 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
 						      Current curr_type,
 						      unsigned int mu)
 {
  assert(mass_plus == mass_minus);
  RealD mass = mass_plus;
 #if (!defined(GRID_HIP))
  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
@ -777,6 +781,8 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  assert(mu>=0);
  assert(mu<Nd);
  assert(mass_plus == mass_minus);
  RealD mass = mass_plus;
 #if 0
  int tshift = (mu == Nd-1) ? 1 : 0;
@ -828,6 +834,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 #if (!defined(GRID_HIP))
  int tshift = (mu == Nd-1) ? 1 : 0;
  unsigned int LLt    = GridDefaultLatt()[Tp];
  ////////////////////////////////////////////////
  // GENERAL CAYLEY CASE
  ////////////////////////////////////////////////
@ -880,7 +887,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  }
  std::vector<RealD> G_s(Ls,1.0);
-  RealD sign = 1; // sign flip for vector/tadpole
+  RealD sign = 1.0; // sign flip for vector/tadpole
  if ( curr_type == Current::Axial ) {
    for(int s=0;s<Ls/2;s++){
      G_s[s] = -1.0;
@ -890,7 +897,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
    auto b=this->_b;
    auto c=this->_c;
    if ( b == 1 && c == 0 ) {
-      sign = -1;    
+      sign = -1.0;    
    }
    else {
      std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl;
@ -934,7 +941,13 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
    tmp    = Cshift(tmp,mu,-1);
    Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
    tmp = -G_s[s]*( Utmp + gmu*Utmp );
-    tmp    = where((lcoor>=tmin+tshift),tmp,zz); // Mask the time 
+    // Mask the time
    if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
      unsigned int t0 = 0;
      tmp    = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
    } else {
      tmp    = where((lcoor>=tmin+tshift),tmp,zz);
    }
    L_Q   += where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
    InsertSlice(L_Q, q_out, s , 0);
--- a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
@ -0,0 +1,371 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermionImplementation.h
    Copyright (C) 2017 - 2022
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
 NAMESPACE_BEGIN(Grid);
 // Constructor.
 //  - forwards gauge field, grids, mass, implementation parameters and the
 //    anisotropy coefficients to WilsonBase,
 //  - stores the clover coefficients (each is halved below; csw_r is in
 //    addition divided by xi_0 when the action is anisotropic),
 //  - enables open boundaries when the boundary phase of direction Nd-1
 //    compares equal to 0.0,
 //  - builds all clover fields through ImportGauge and, for open boundaries,
 //    sets up the full/even/odd boundary masks.
 template<class Impl, class CloverHelpers>
 CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(GaugeField& _Umu,
                                                                             GridCartesian& Fgrid,
                                                                             GridRedBlackCartesian& Hgrid,
                                                                             const RealD _mass,
                                                                             const RealD _csw_r,
                                                                             const RealD _csw_t,
                                                                             const RealD _cF,
                                                                             const WilsonAnisotropyCoefficients& clover_anisotropy,
                                                                             const ImplParams& impl_p)
  : WilsonBase(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
  , csw_r(_csw_r)
  , csw_t(_csw_t)
  , cF(_cF)
  , open_boundaries(impl_p.boundary_phases[Nd-1] == 0.0)
  , Diagonal(&Fgrid),        Triangle(&Fgrid)
  , DiagonalEven(&Hgrid),    TriangleEven(&Hgrid)
  , DiagonalOdd(&Hgrid),     TriangleOdd(&Hgrid)
  , DiagonalInv(&Fgrid),     TriangleInv(&Fgrid)
  , DiagonalInvEven(&Hgrid), TriangleInvEven(&Hgrid)
  , DiagonalInvOdd(&Hgrid),  TriangleInvOdd(&Hgrid)
  , Tmp(&Fgrid)
  , BoundaryMask(&Fgrid)
  , BoundaryMaskEven(&Hgrid), BoundaryMaskOdd(&Hgrid)
 {
  // Temporal coefficient: only the factor 1/2.
  csw_t *= 0.5;
  // Spatial coefficient: factor 1/2, then the anisotropy rescaling if requested.
  csw_r *= 0.5;
  const bool anisotropic = clover_anisotropy.isAnisotropic;
  if (anisotropic) {
    csw_r /= clover_anisotropy.xi_0;
  }
  // Build the clover term, its inverse and the checkerboarded copies.
  ImportGauge(_Umu);
  // Masks are only needed for open boundary conditions.
  if (!open_boundaries) return;
  this->BoundaryMaskEven.Checkerboard() = Even;
  this->BoundaryMaskOdd.Checkerboard() = Odd;
  CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
 }
 // Hopping term: forwards to WilsonBase::Dhop and, when open boundaries are
 // enabled, passes the result through ApplyBoundaryMask.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag)
 {
  WilsonBase::Dhop(in, out, dag);
  if(!open_boundaries) return;
  ApplyBoundaryMask(out);
 }
 // Forwards to WilsonBase::DhopOE; with open boundaries the result is
 // additionally passed through ApplyBoundaryMask.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag)
 {
  WilsonBase::DhopOE(in, out, dag);
  if(!open_boundaries) return;
  ApplyBoundaryMask(out);
 }
 // Forwards to WilsonBase::DhopEO; with open boundaries the result is
 // additionally passed through ApplyBoundaryMask.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag)
 {
  WilsonBase::DhopEO(in, out, dag);
  if(!open_boundaries) return;
  ApplyBoundaryMask(out);
 }
 // Single-direction hop: forwards (dir, disp) to WilsonBase::DhopDir and
 // masks the result when open boundaries are enabled.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp)
 {
  WilsonBase::DhopDir(in, out, dir, disp);
  if(!this->open_boundaries) return;
  ApplyBoundaryMask(out);
 }
 // All-direction hop: forwards to WilsonBase::DhopDirAll; with open
 // boundaries every field in `out` is passed through ApplyBoundaryMask.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out)
 {
  WilsonBase::DhopDirAll(in, out);
  if(!this->open_boundaries) return;
  for(auto& field : out) {
    ApplyBoundaryMask(field);
  }
 }
 // Full operator: out = Dhop(in) + Mooee(in).
 // The base-class Dhop is called directly so the boundary mask is applied
 // only once, at the end.  Tmp is the member scratch field.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField& in, FermionField& out)
 {
  out.Checkerboard() = in.Checkerboard();
  // hopping part (base class, unmasked)
  WilsonBase::Dhop(in, out, DaggerNo);
  // site-diagonal part goes into the scratch field, then out = 1.0*out + Tmp
  Mooee(in, Tmp);
  axpy(out, 1.0, out, Tmp);
  if(!open_boundaries) return;
  ApplyBoundaryMask(out);
 }
 // Daggered full operator: out = Dhop^dag(in) + MooeeDag(in).
 // The base-class Dhop is called directly so the boundary mask is applied
 // only once, at the end.  Tmp is the member scratch field.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField& in, FermionField& out)
 {
  out.Checkerboard() = in.Checkerboard();
  // hopping part (base class, unmasked)
  WilsonBase::Dhop(in, out, DaggerYes);
  // site-diagonal part goes into the scratch field, then out = 1.0*out + Tmp
  MooeeDag(in, Tmp);
  axpy(out, 1.0, out, Tmp);
  if(!open_boundaries) return;
  ApplyBoundaryMask(out);
 }
 // Forwards to WilsonBase::Meooe; with open boundaries the result is
 // additionally passed through ApplyBoundaryMask.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out)
 {
  WilsonBase::Meooe(in, out);
  if(!open_boundaries) return;
  ApplyBoundaryMask(out);
 }
 // Forwards to WilsonBase::MeooeDag; with open boundaries the result is
 // additionally passed through ApplyBoundaryMask.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out)
 {
  WilsonBase::MeooeDag(in, out);
  if(!open_boundaries) return;
  ApplyBoundaryMask(out);
 }
 // Applies the clover term to `in`.
 // The diagonal/triangle pair is chosen from the grid of `in`: the full-grid
 // fields when the grid is not checkerboarded, otherwise the Odd or Even
 // copies according to in.Checkerboard().  The result is masked when open
 // boundaries are enabled.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField& in, FermionField& out)
 {
  const bool checkerboarded = in.Grid()->_isCheckerBoarded;
  if(!checkerboarded) {
    MooeeInternal(in, out, Diagonal, Triangle);
  } else if(in.Checkerboard() == Odd) {
    MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
  } else {
    MooeeInternal(in, out, DiagonalEven, TriangleEven);
  }
  if(open_boundaries) {
    ApplyBoundaryMask(out);
  }
 }
 // Daggered clover term.  The blocks are hermitian, so this is the same
 // operation as Mooee.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeDag(const FermionField& in, FermionField& out)
 {
  this->Mooee(in, out);
 }
 // Applies the inverse clover term to `in`.
 // The inverse diagonal/triangle pair is chosen from the grid of `in`: the
 // full-grid fields when the grid is not checkerboarded, otherwise the Odd or
 // Even copies according to in.Checkerboard().  The result is masked when
 // open boundaries are enabled.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionField& in, FermionField& out)
 {
  const bool checkerboarded = in.Grid()->_isCheckerBoarded;
  if(!checkerboarded) {
    MooeeInternal(in, out, DiagonalInv, TriangleInv);
  } else if(in.Checkerboard() == Odd) {
    MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
  } else {
    MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven);
  }
  if(open_boundaries) {
    ApplyBoundaryMask(out);
  }
 }
 // Daggered inverse clover term.  The blocks are hermitian, so this is the
 // same operation as MooeeInv.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInvDag(const FermionField& in, FermionField& out)
 {
  this->MooeeInv(in, out);
 }
 // Single-direction operator: identical to DhopDir for this action.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdir(const FermionField& in, FermionField& out, int dir, int disp)
 {
  this->DhopDir(in, out, dir, disp);
 }
 // All-direction operator: identical to DhopDirAll for this action.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MdirAll(const FermionField& in, std::vector<FermionField>& out)
 {
  this->DhopDirAll(in, out);
 }
 // Force for unpreconditioned pseudofermions.
 //   force : output; zeroed, filled with the hopping-term derivative
 //           (DhopDeriv) and then incremented by the clover-term derivative
 //   X, Y  : fermion fields whose outer product (Impl::outerProductImpl)
 //           enters the clover force
 //   dag   : forwarded unchanged to DhopDeriv
 // Not available with open boundary conditions (asserted).
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
  assert(!open_boundaries); // TODO check for changes required for open bc
  // NOTE: code copied from original clover term
  conformable(X.Grid(), Y.Grid());
  conformable(X.Grid(), force.Grid());
  GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
  GaugeField clover_force(force.Grid());
  PropagatorField Lambda(force.Grid());
  // Guido: Here we are hitting some performance issues:
  // need to extract the components of the DoubledGaugeField
  // for each call
  // Possible solution
  // Create a vector object to store them? (cons: wasting space)
  std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
  Impl::extractLinkField(U, this->Umu);
  force = Zero();
  // Derivative of the Wilson hopping term
  this->DhopDeriv(force, X, Y, dag);
  ///////////////////////////////////////////////////////////
  // Clover term derivative
  ///////////////////////////////////////////////////////////
  Impl::outerProductImpl(Lambda, X, Y);
  //std::cout << "Lambda:" << Lambda << std::endl;
  Gamma::Algebra sigma[] = {
      Gamma::Algebra::SigmaXY,
      Gamma::Algebra::SigmaXZ,
      Gamma::Algebra::SigmaXT,
      Gamma::Algebra::MinusSigmaXY,
      Gamma::Algebra::SigmaYZ,
      Gamma::Algebra::SigmaYT,
      Gamma::Algebra::MinusSigmaXZ,
      Gamma::Algebra::MinusSigmaYZ,
      Gamma::Algebra::SigmaZT,
      Gamma::Algebra::MinusSigmaXT,
      Gamma::Algebra::MinusSigmaYT,
      Gamma::Algebra::MinusSigmaZT};
  /*
    sigma_{\mu \nu}=
    | 0         sigma[0]  sigma[1]  sigma[2] |
    | sigma[3]    0       sigma[4]  sigma[5] |
    | sigma[6]  sigma[7]     0      sigma[8] |
    | sigma[9]  sigma[10] sigma[11]   0      |
  */
  // Index of the time direction (last of the Nd directions; the T entries of
  // the sigma table above sit in row/column 3).
  const int tdir = Nd - 1;
  int count = 0; // running index into sigma[], advanced once per (mu,nu) pair with mu != nu
  clover_force = Zero();
  for (int mu = 0; mu < 4; mu++)
  {
    force_mu = Zero();
    for (int nu = 0; nu < 4; nu++)
    {
      if (mu == nu)
        continue;
      // Planes that contain the time direction carry the temporal clover
      // coefficient, all others the spatial one.  The previous test compared
      // mu/nu against 4, a value neither loop index ever takes, so csw_t was
      // never used.
      const bool temporal_plane = (mu == tdir) || (nu == tdir);
      const RealD factor = 2.0 * (temporal_plane ? csw_t : csw_r);
      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
      force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu);   // checked
      count++;
    }
    pokeLorentz(clover_force, U[mu] * force_mu, mu);
  }
  //clover_force *= csw;
  force += clover_force;
 }
 // Odd-odd derivative: not implemented for the compact clover action.
 // Calling it trips the assertion; no argument is read or written.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
  assert(0);
 }
 // Even-even derivative: not implemented for the compact clover action.
 // Calling it trips the assertion; no argument is read or written.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
  assert(0);
 }
 // Shared worker for Mooee/MooeeInv: runs CompactHelpers::MooeeKernel on `in`
 // with the given diagonal/triangle pair and writes the result to `out`.
 // `out` inherits the checkerboard of `in`; all four fields must be
 // conformable.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField&        in,
                    FermionField&              out,
                    const CloverDiagonalField& diagonal,
                    const CloverTriangleField& triangle)
 {
  // The input must carry a definite parity label.
  const auto cb = in.Checkerboard();
  assert(cb == Odd || cb == Even);
  out.Checkerboard() = cb;
  conformable(in, out);
  conformable(in, diagonal);
  conformable(in, triangle);
  // One kernel pass over the outer sites of the clover field.
  const auto nsites = diagonal.oSites();
  CompactHelpers::MooeeKernel(nsites, 1, in, out, diagonal, triangle);
 }
 // Rebuilds every gauge-dependent member from the gauge field _Umu:
 //  1. imports the gauge field into the Wilson base class,
 //  2. computes the six field-strength components,
 //  3. assembles the clover term in the original layout (TmpOriginal),
 //  4. converts it to the compact Diagonal/Triangle layout,
 //  5. exponentiates, applies the open-boundary modification if enabled,
 //     and inverts,
 //  6. splits the clover term and its inverse into Even/Odd copies.
 // The wall-clock time of each stage (usecond() samples t0..t9) is reported
 // on GridLogDebug, in seconds.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField& _Umu) {
  // NOTE: parts copied from original implementation
  // Import gauge into base class
  double t0 = usecond();
  WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that
  // Initialize temporary variables
  double t1 = usecond();
  conformable(_Umu.Grid(), this->GaugeGrid());
  GridBase* grid = _Umu.Grid();
  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
  CloverField TmpOriginal(grid);
  // Compute the field strength terms mu>nu
  double t2 = usecond();
  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
  WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
  WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
  // Compute the Clover Operator acting on Colour and Spin
  // multiply here by the clover coefficients for the anisotropy:
  // B components (no T index) take csw_r, E components (with T index) take csw_t
  double t3 = usecond();
  TmpOriginal  = Helpers::fillCloverYZ(Bx) * csw_r;
  TmpOriginal += Helpers::fillCloverXZ(By) * csw_r;
  TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r;
  TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
  TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
  TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
  // Handle mass term based on clover policy
  CloverHelpers::MassTerm(TmpOriginal, this->diag_mass);
  // Convert the data layout of the clover term
  double t4 = usecond();
  CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
  // Exponentiate the clover (nothing happens in case of the standard clover)
  double t5 = usecond();
  CloverHelpers::Exponentiate_Clover(Diagonal, Triangle, csw_t, this->diag_mass);
  // Possibly modify the boundary values (only with open boundaries)
  double t6 = usecond();
  if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
  // Invert the Clover term (explicit inversion needed for the improvement in case of open boundary conditions)
  double t7 = usecond();
  CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv);
  // Fill the remaining clover fields: Even/Odd copies of the term and of its inverse
  double t8 = usecond();
  pickCheckerboard(Even, DiagonalEven,    Diagonal);
  pickCheckerboard(Even, TriangleEven,    Triangle);
  pickCheckerboard(Odd,  DiagonalOdd,     Diagonal);
  pickCheckerboard(Odd,  TriangleOdd,     Triangle);
  pickCheckerboard(Even, DiagonalInvEven, DiagonalInv);
  pickCheckerboard(Even, TriangleInvEven, TriangleInv);
  pickCheckerboard(Odd,  DiagonalInvOdd,  DiagonalInv);
  pickCheckerboard(Odd,  TriangleInvOdd,  TriangleInv);
  // Report timings (usecond() differences divided by 1e6)
  double t9 = usecond();
  std::cout << GridLogDebug << "CompactWilsonCloverFermion::ImportGauge timings:" << std::endl;
  std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
  std::cout << GridLogDebug << "allocations =                " << (t2 - t1) / 1e6 << std::endl;
  std::cout << GridLogDebug << "field strength =             " << (t3 - t2) / 1e6 << std::endl;
  std::cout << GridLogDebug << "fill clover =                " << (t4 - t3) / 1e6 << std::endl;
  std::cout << GridLogDebug << "convert =                    " << (t5 - t4) / 1e6 << std::endl;
  std::cout << GridLogDebug << "exponentiation =             " << (t6 - t5) / 1e6 << std::endl;
  std::cout << GridLogDebug << "boundaries =                 " << (t7 - t6) / 1e6 << std::endl;
  std::cout << GridLogDebug << "inversions =                 " << (t8 - t7) / 1e6 << std::endl;
  std::cout << GridLogDebug << "pick cbs =                   " << (t9 - t8) / 1e6 << std::endl;
  std::cout << GridLogDebug << "total =                      " << (t9 - t0) / 1e6 << std::endl;
 }
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
@ -2,12 +2,13 @@
    Grid physics library, www.github.com/paboyle/Grid
-    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
+    Source file: ./lib/qcd/action/fermion/WilsonCloverFermionImplementation.h
-    Copyright (C) 2017
+    Copyright (C) 2017 - 2022
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@ -33,9 +34,48 @@
 NAMESPACE_BEGIN(Grid);
 // Constructor.
 //  - forwards gauge field, grids, mass, implementation parameters and the
 //    anisotropy coefficients to WilsonFermion<Impl>,
 //  - allocates the clover term, its inverse and their Even/Odd (and
 //    daggered) copies on the full / half grids,
 //  - derives csw_r, csw_t and diag_mass from the arguments,
 //  - warns when either clover coefficient is zero,
 //  - builds the clover fields through ImportGauge.
 template<class Impl, class CloverHelpers>
 WilsonCloverFermion<Impl, CloverHelpers>::WilsonCloverFermion(GaugeField&                         _Umu,
                                                GridCartesian&                      Fgrid,
                                                GridRedBlackCartesian&              Hgrid,
                                                const RealD                         _mass,
                                                const RealD                         _csw_r,
                                                const RealD                         _csw_t,
                                                const WilsonAnisotropyCoefficients& clover_anisotropy,
                                                const ImplParams&                   impl_p)
  : WilsonFermion<Impl>(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
  , CloverTerm(&Fgrid)
  , CloverTermInv(&Fgrid)
  , CloverTermEven(&Hgrid)
  , CloverTermOdd(&Hgrid)
  , CloverTermInvEven(&Hgrid)
  , CloverTermInvOdd(&Hgrid)
  , CloverTermDagEven(&Hgrid)
  , CloverTermDagOdd(&Hgrid)
  , CloverTermInvDagEven(&Hgrid)
  , CloverTermInvDagOdd(&Hgrid)
 {
  assert(Nd == 4); // require 4 dimensions
  const bool anisotropic = clover_anisotropy.isAnisotropic;
  // Temporal coefficient: only the factor 1/2.
  csw_t = _csw_t * 0.5;
  // Spatial coefficient: factor 1/2, rescaled by 1/xi_0 when anisotropic.
  csw_r = anisotropic ? (_csw_r * 0.5 / clover_anisotropy.xi_0)
                      : (_csw_r * 0.5);
  // Diagonal mass term; the anisotropic form weights the (Nd - 1) spatial
  // directions by nu / xi_0.
  diag_mass = anisotropic ? (_mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0))
                          : (4.0 + _mass);
  if(csw_r == 0) {
    std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
  }
  if(csw_t == 0) {
    std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
  }
  ImportGauge(_Umu);
 }
 // *NOT* EO
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField &in, FermionField &out)
 {
  FermionField temp(out.Grid());
@ -49,8 +89,8 @@ void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
  out += temp;
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField &in, FermionField &out)
 {
  FermionField temp(out.Grid());
@ -64,13 +104,16 @@ void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
  out += temp;
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
+void WilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField &_Umu)
 {
  double t0 = usecond();
  WilsonFermion<Impl>::ImportGauge(_Umu);
  double t1 = usecond();
  GridBase *grid = _Umu.Grid();
  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
  double t2 = usecond();
  // Compute the field strength terms mu>nu
  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
@ -79,52 +122,20 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
  double t3 = usecond();
  // Compute the Clover Operator acting on Colour and Spin
  // multiply here by the clover coefficients for the anisotropy
-  CloverTerm  = fillCloverYZ(Bx) * csw_r;
+  CloverTerm  = Helpers::fillCloverYZ(Bx) * csw_r;
-  CloverTerm += fillCloverXZ(By) * csw_r;
+  CloverTerm += Helpers::fillCloverXZ(By) * csw_r;
-  CloverTerm += fillCloverXY(Bz) * csw_r;
+  CloverTerm += Helpers::fillCloverXY(Bz) * csw_r;
-  CloverTerm += fillCloverXT(Ex) * csw_t;
+  CloverTerm += Helpers::fillCloverXT(Ex) * csw_t;
-  CloverTerm += fillCloverYT(Ey) * csw_t;
+  CloverTerm += Helpers::fillCloverYT(Ey) * csw_t;
-  CloverTerm += fillCloverZT(Ez) * csw_t;
+  CloverTerm += Helpers::fillCloverZT(Ez) * csw_t;
-  CloverTerm += diag_mass;
+   
-
+  double t4 = usecond();
-  int lvol = _Umu.Grid()->lSites();
+  CloverHelpers::Instantiate(CloverTerm, CloverTermInv, csw_t, this->diag_mass);
  int DimRep = Impl::Dimension;
  {
    autoView(CTv,CloverTerm,CpuRead);
    autoView(CTIv,CloverTermInv,CpuWrite);
    thread_for(site, lvol, {
      Coordinate lcoor;
      grid->LocalIndexToLocalCoor(site, lcoor);
      Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
      Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
      typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
      peekLocalSite(Qx, CTv, lcoor);
      //if (csw!=0){
      for (int j = 0; j < Ns; j++)
 	for (int k = 0; k < Ns; k++)
 	  for (int a = 0; a < DimRep; a++)
 	    for (int b = 0; b < DimRep; b++){
 	      auto zz =  Qx()(j, k)(a, b);
 	      EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
 	    }
      //   if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
      EigenInvCloverOp = EigenCloverOp.inverse();
      //std::cout << EigenInvCloverOp << std::endl;
      for (int j = 0; j < Ns; j++)
 	for (int k = 0; k < Ns; k++)
 	  for (int a = 0; a < DimRep; a++)
 	    for (int b = 0; b < DimRep; b++)
 	      Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
      //    if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
      //  }
      pokeLocalSite(Qxinv, CTIv, lcoor);
    });
  }
  double t5 = usecond();
  // Separate the even and odd parts
  pickCheckerboard(Even, CloverTermEven, CloverTerm);
  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
@ -137,37 +148,47 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
  double t6 = usecond();
  std::cout << GridLogDebug << "WilsonCloverFermion::ImportGauge timings:" << std::endl;
  std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
  std::cout << GridLogDebug << "allocations =                " << (t2 - t1) / 1e6 << std::endl;
  std::cout << GridLogDebug << "field strength =             " << (t3 - t2) / 1e6 << std::endl;
  std::cout << GridLogDebug << "fill clover =                " << (t4 - t3) / 1e6 << std::endl;
  std::cout << GridLogDebug << "instantiation =              " << (t5 - t4) / 1e6 << std::endl;
  std::cout << GridLogDebug << "pick cbs =                   " << (t6 - t5) / 1e6 << std::endl;
  std::cout << GridLogDebug << "total =                      " << (t6 - t0) / 1e6 << std::endl;
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerNo, InverseNo);
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl, CloverHelpers>::MooeeDag(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerYes, InverseNo);
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerNo, InverseYes);
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInvDag(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerYes, InverseYes);
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
+void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
 {
  out.Checkerboard() = in.Checkerboard();
-  CloverFieldType *Clover;
+  CloverField *Clover;
  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
  if (dag)
@ -182,12 +203,12 @@ void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionFie
      {
        Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
      }
-      out = *Clover * in;
+      Helpers::multCloverField(out, *Clover, in);
    }
    else
    {
      Clover = (inv) ? &CloverTermInv : &CloverTerm;
-      out = adj(*Clover) * in;
+      Helpers::multCloverField(out, *Clover, in); // don't bother with adj, hermitian anyway
    }
  }
  else
@ -205,29 +226,109 @@ void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionFie
        //  std::cout << "Calling clover term Even" << std::endl;
        Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
      }
-      out = *Clover * in;
+      Helpers::multCloverField(out, *Clover, in);
      //  std::cout << GridLogMessage << "*Clover.Checkerboard() "  << (*Clover).Checkerboard() << std::endl;
    }
    else
    {
      Clover = (inv) ? &CloverTermInv : &CloverTerm;
-      out = *Clover * in;
+      Helpers::multCloverField(out, *Clover, in);
    }
  }
 } // MooeeInternal
 // Derivative parts unpreconditioned pseudofermions
 //   force : output; zeroed, filled with the hopping-term derivative
 //           (DhopDeriv) and then incremented by the clover-term derivative
 //   X, Y  : fermion fields whose outer product (Impl::outerProductImpl)
 //           enters the clover force
 //   dag   : forwarded unchanged to DhopDeriv
 template<class Impl, class CloverHelpers>
 void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
 {
  conformable(X.Grid(), Y.Grid());
  conformable(X.Grid(), force.Grid());
  GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
  GaugeField clover_force(force.Grid());
  PropagatorField Lambda(force.Grid());
  // Guido: Here we are hitting some performance issues:
  // need to extract the components of the DoubledGaugeField
  // for each call
  // Possible solution
  // Create a vector object to store them? (cons: wasting space)
  std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
  Impl::extractLinkField(U, this->Umu);
  force = Zero();
  // Derivative of the Wilson hopping term
  this->DhopDeriv(force, X, Y, dag);
  ///////////////////////////////////////////////////////////
  // Clover term derivative
  ///////////////////////////////////////////////////////////
  Impl::outerProductImpl(Lambda, X, Y);
  //std::cout << "Lambda:" << Lambda << std::endl;
  Gamma::Algebra sigma[] = {
      Gamma::Algebra::SigmaXY,
      Gamma::Algebra::SigmaXZ,
      Gamma::Algebra::SigmaXT,
      Gamma::Algebra::MinusSigmaXY,
      Gamma::Algebra::SigmaYZ,
      Gamma::Algebra::SigmaYT,
      Gamma::Algebra::MinusSigmaXZ,
      Gamma::Algebra::MinusSigmaYZ,
      Gamma::Algebra::SigmaZT,
      Gamma::Algebra::MinusSigmaXT,
      Gamma::Algebra::MinusSigmaYT,
      Gamma::Algebra::MinusSigmaZT};
  /*
    sigma_{\mu \nu}=
    | 0         sigma[0]  sigma[1]  sigma[2] |
    | sigma[3]    0       sigma[4]  sigma[5] |
    | sigma[6]  sigma[7]     0      sigma[8] |
    | sigma[9]  sigma[10] sigma[11]   0      |
  */
  // Index of the time direction (last of the Nd directions; the T entries of
  // the sigma table above sit in row/column 3).
  const int tdir = Nd - 1;
  int count = 0; // running index into sigma[], advanced once per (mu,nu) pair with mu != nu
  clover_force = Zero();
  for (int mu = 0; mu < 4; mu++)
  {
    force_mu = Zero();
    for (int nu = 0; nu < 4; nu++)
    {
      if (mu == nu)
        continue;
      // Planes that contain the time direction carry the temporal clover
      // coefficient, all others the spatial one.  The previous test compared
      // mu/nu against 4, a value neither loop index ever takes, so csw_t was
      // never used.
      const bool temporal_plane = (mu == tdir) || (nu == tdir);
      const RealD factor = 2.0 * (temporal_plane ? csw_t : csw_r);
      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
      force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu);                   // checked
      count++;
    }
    pokeLorentz(clover_force, U[mu] * force_mu, mu);
  }
  //clover_force *= csw;
  force += clover_force;
 }
 // Derivative parts
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
+void WilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
 {
  assert(0);
 }
 // Derivative parts
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
+void WilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
 {
  assert(0); // not implemented yet
 }
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@ -4,12 +4,13 @@ Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonFermion.cc
-Copyright (C) 2015
+Copyright (C) 2022
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Fabian Joswig <fabian.joswig@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@ -599,11 +600,47 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
                                                   Current curr_type,
                                                   unsigned int mu)
 {
  if(curr_type != Current::Vector)
  {
    std::cout << GridLogError << "Only the conserved vector current is implemented so far." << std::endl;
    exit(1);
  }
  Gamma g5(Gamma::Algebra::Gamma5);
  conformable(_grid, q_in_1.Grid());
  conformable(_grid, q_in_2.Grid());
  conformable(_grid, q_out.Grid());
-  assert(0);
+  auto UGrid= this->GaugeGrid();
  PropagatorField tmp_shifted(UGrid);
  PropagatorField g5Lg5(UGrid);
  PropagatorField R(UGrid);
  PropagatorField gmuR(UGrid);
    Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT,
  };
  Gamma gmu=Gamma(Gmu[mu]);
  g5Lg5=g5*q_in_1*g5;
  tmp_shifted=Cshift(q_in_2,mu,1);
  Impl::multLinkField(R,this->Umu,tmp_shifted,mu);
  gmuR=gmu*R;
  q_out=adj(g5Lg5)*R;
  q_out-=adj(g5Lg5)*gmuR;
  tmp_shifted=Cshift(q_in_1,mu,1);
  Impl::multLinkField(g5Lg5,this->Umu,tmp_shifted,mu);
  g5Lg5=g5*g5Lg5*g5;
  R=q_in_2;
  gmuR=gmu*R;
  q_out-=adj(g5Lg5)*R;
  q_out-=adj(g5Lg5)*gmuR;
 }
@ -617,9 +654,51 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
                                              unsigned int tmax,
 					      ComplexField &lattice_cmplx)
 {
  if(curr_type != Current::Vector)
  {
    std::cout << GridLogError << "Only the conserved vector current is implemented so far." << std::endl;
    exit(1);
  }
  int tshift = (mu == Nd-1) ? 1 : 0;
  unsigned int LLt    = GridDefaultLatt()[Tp];
  conformable(_grid, q_in.Grid());
  conformable(_grid, q_out.Grid());
-  assert(0);
+  auto UGrid= this->GaugeGrid();
  PropagatorField tmp(UGrid);
  PropagatorField Utmp(UGrid);
  PropagatorField L(UGrid);
  PropagatorField zz (UGrid);
  zz=Zero();
  LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
    Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT,
  };
  Gamma gmu=Gamma(Gmu[mu]);
  tmp = Cshift(q_in,mu,1);
  Impl::multLinkField(Utmp,this->Umu,tmp,mu);
  tmp = ( Utmp*lattice_cmplx - gmu*Utmp*lattice_cmplx ); // Forward hop
  tmp = where((lcoor>=tmin),tmp,zz); // Mask the time
  q_out = where((lcoor<=tmax),tmp,zz); // Position of current complicated
  tmp = q_in *lattice_cmplx;
  tmp = Cshift(tmp,mu,-1);
  Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
  tmp = -( Utmp + gmu*Utmp );
  // Mask the time
  if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
    unsigned int t0 = 0;
    tmp = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
  } else {
    tmp = where((lcoor>=tmin+tshift),tmp,zz);
  }
  q_out+= where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
 }
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h
@ -77,23 +77,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define REGISTER
 #ifdef GRID_SIMT
-#define LOAD_CHIMU(ptype)		\
+#define LOAD_CHIMU(Ptype)		\
  {const SiteSpinor & ref (in[offset]);	\
-    Chimu_00=coalescedReadPermute<ptype>(ref()(0)(0),perm,lane);	\
+    Chimu_00=coalescedReadPermute<Ptype>(ref()(0)(0),perm,lane);	\
-    Chimu_01=coalescedReadPermute<ptype>(ref()(0)(1),perm,lane);		\
+    Chimu_01=coalescedReadPermute<Ptype>(ref()(0)(1),perm,lane);		\
-    Chimu_02=coalescedReadPermute<ptype>(ref()(0)(2),perm,lane);		\
+    Chimu_02=coalescedReadPermute<Ptype>(ref()(0)(2),perm,lane);		\
-    Chimu_10=coalescedReadPermute<ptype>(ref()(1)(0),perm,lane);		\
+    Chimu_10=coalescedReadPermute<Ptype>(ref()(1)(0),perm,lane);		\
-    Chimu_11=coalescedReadPermute<ptype>(ref()(1)(1),perm,lane);		\
+    Chimu_11=coalescedReadPermute<Ptype>(ref()(1)(1),perm,lane);		\
-    Chimu_12=coalescedReadPermute<ptype>(ref()(1)(2),perm,lane);		\
+    Chimu_12=coalescedReadPermute<Ptype>(ref()(1)(2),perm,lane);		\
-    Chimu_20=coalescedReadPermute<ptype>(ref()(2)(0),perm,lane);		\
+    Chimu_20=coalescedReadPermute<Ptype>(ref()(2)(0),perm,lane);		\
-    Chimu_21=coalescedReadPermute<ptype>(ref()(2)(1),perm,lane);		\
+    Chimu_21=coalescedReadPermute<Ptype>(ref()(2)(1),perm,lane);		\
-    Chimu_22=coalescedReadPermute<ptype>(ref()(2)(2),perm,lane);		\
+    Chimu_22=coalescedReadPermute<Ptype>(ref()(2)(2),perm,lane);		\
-    Chimu_30=coalescedReadPermute<ptype>(ref()(3)(0),perm,lane);		\
+    Chimu_30=coalescedReadPermute<Ptype>(ref()(3)(0),perm,lane);		\
-    Chimu_31=coalescedReadPermute<ptype>(ref()(3)(1),perm,lane);		\
+    Chimu_31=coalescedReadPermute<Ptype>(ref()(3)(1),perm,lane);		\
-    Chimu_32=coalescedReadPermute<ptype>(ref()(3)(2),perm,lane);	}
+    Chimu_32=coalescedReadPermute<Ptype>(ref()(3)(2),perm,lane);	}
 #define PERMUTE_DIR(dir) ;
 #else
-#define LOAD_CHIMU(ptype)		\
+#define LOAD_CHIMU(Ptype)		\
  {const SiteSpinor & ref (in[offset]);	\
    Chimu_00=ref()(0)(0);\
    Chimu_01=ref()(0)(1);\
@ -109,12 +109,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    Chimu_32=ref()(3)(2);}
 #define PERMUTE_DIR(dir)			\
-  permute##dir(Chi_00,Chi_00);	\
+  permute##dir(Chi_00,Chi_00);			\
-      permute##dir(Chi_01,Chi_01);\
+  permute##dir(Chi_01,Chi_01);			\
-      permute##dir(Chi_02,Chi_02);\
+  permute##dir(Chi_02,Chi_02);			\
-      permute##dir(Chi_10,Chi_10);	\
+  permute##dir(Chi_10,Chi_10);			\
-      permute##dir(Chi_11,Chi_11);\
+  permute##dir(Chi_11,Chi_11);			\
-      permute##dir(Chi_12,Chi_12);
+  permute##dir(Chi_12,Chi_12);
 #endif
@ -371,88 +371,91 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  result_32-= UChi_12;
 #define HAND_STENCIL_LEGB(PROJ,PERM,DIR,RECON)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
+  {int ptype;					\
-  offset = SE->_offset;				\
+   SE=st.GetEntry(ptype,DIR,ss);		\
-  local  = SE->_is_local;			\
+   auto offset = SE->_offset;			\
-  perm   = SE->_permute;			\
+   auto local  = SE->_is_local;			\
-  if ( local ) {				\
+   auto perm   = SE->_permute;			\
-    LOAD_CHIMU(PERM);				\
+   if ( local ) {				\
-    PROJ;					\
+     LOAD_CHIMU(PERM);				\
-    if ( perm) {				\
+     PROJ;					\
-      PERMUTE_DIR(PERM);			\
+     if ( perm) {				\
-    }						\
+       PERMUTE_DIR(PERM);			\
-  } else {					\
+     }						\
-    LOAD_CHI;					\
+   } else {					\
-  }						\
+     LOAD_CHI;					\
-  acceleratorSynchronise();			\
+   }						\
-  MULT_2SPIN(DIR);				\
+   acceleratorSynchronise();			\
-  RECON;					
+   MULT_2SPIN(DIR);				\
   RECON;					}
-#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)	\
+#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)		\
-  SE=&st_p[DIR+8*ss];				\
+  { SE=&st_p[DIR+8*ss];						\
-  ptype=st_perm[DIR];				\
+  auto ptype=st_perm[DIR];					\
-  offset = SE->_offset;				\
+  auto offset = SE->_offset;					\
-  local  = SE->_is_local;			\
+  auto local  = SE->_is_local;					\
-  perm   = SE->_permute;			\
+  auto perm   = SE->_permute;					\
-  if ( local ) {				\
+  if ( local ) {						\
-    LOAD_CHIMU(PERM);				\
+    LOAD_CHIMU(PERM);						\
-    PROJ;					\
+    PROJ;							\
-    if ( perm) {				\
+    if ( perm) {						\
-      PERMUTE_DIR(PERM);			\
+      PERMUTE_DIR(PERM);					\
-    }						\
+    }								\
-  } else {					\
+  } else {							\
-    LOAD_CHI;					\
+    LOAD_CHI;							\
-  }						\
+  }								\
-  acceleratorSynchronise();			\
+  acceleratorSynchronise();					\
-  MULT_2SPIN(DIR);				\
+  MULT_2SPIN(DIR);						\
-  RECON;					
+  RECON;					}
 #define HAND_STENCIL_LEGA(PROJ,PERM,DIR,RECON)				\
-  SE=&st_p[DIR+8*ss];							\
+  { SE=&st_p[DIR+8*ss];							\
-  ptype=st_perm[DIR];							\
+    auto ptype=st_perm[DIR];						\
- /*SE=st.GetEntry(ptype,DIR,ss);*/					\
+    /*SE=st.GetEntry(ptype,DIR,ss);*/					\
-  offset = SE->_offset;				\
+    auto offset = SE->_offset;						\
-  perm   = SE->_permute;			\
+    auto perm   = SE->_permute;						\
-  LOAD_CHIMU(PERM);				\
+    LOAD_CHIMU(PERM);							\
-  PROJ;						\
+    PROJ;								\
-  MULT_2SPIN(DIR);				\
+    MULT_2SPIN(DIR);							\
-  RECON;					
+    RECON;					}
 #define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
+  { int ptype;						\
-  offset = SE->_offset;				\
+  SE=st.GetEntry(ptype,DIR,ss);				\
-  local  = SE->_is_local;			\
+  auto offset = SE->_offset;					\
-  perm   = SE->_permute;			\
+  auto local  = SE->_is_local;					\
-  if ( local ) {				\
+  auto perm   = SE->_permute;					\
-    LOAD_CHIMU(PERM);				\
+  if ( local ) {						\
-    PROJ;					\
+    LOAD_CHIMU(PERM);						\
-    if ( perm) {				\
+    PROJ;							\
-      PERMUTE_DIR(PERM);			\
+    if ( perm) {						\
-    }						\
+      PERMUTE_DIR(PERM);					\
-  } else if ( st.same_node[DIR] ) {		\
+    }								\
-    LOAD_CHI;					\
+  } else if ( st.same_node[DIR] ) {				\
-  }						\
+    LOAD_CHI;							\
-  acceleratorSynchronise();			\
+  }								\
-  if (local || st.same_node[DIR] ) {		\
+  acceleratorSynchronise();					\
-    MULT_2SPIN(DIR);				\
+  if (local || st.same_node[DIR] ) {				\
-    RECON;					\
+    MULT_2SPIN(DIR);						\
-  }						\
+    RECON;							\
-  acceleratorSynchronise();			
+  }								\
  acceleratorSynchronise();			}
 #define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
+  { int ptype;						\
-  offset = SE->_offset;				\
+  SE=st.GetEntry(ptype,DIR,ss);				\
-  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
+  auto offset = SE->_offset;				\
-    LOAD_CHI;					\
+  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {		\
-    MULT_2SPIN(DIR);				\
+    LOAD_CHI;						\
-    RECON;					\
+    MULT_2SPIN(DIR);					\
-    nmu++;					\
+    RECON;						\
-  }						\
+    nmu++;						\
-  acceleratorSynchronise();			
+  }							\
  acceleratorSynchronise();			}
-#define HAND_RESULT(ss)				\
+#define HAND_RESULT(ss)					\
-  {						\
+  {							\
-    SiteSpinor & ref (out[ss]);			\
+    SiteSpinor & ref (out[ss]);				\
    coalescedWrite(ref()(0)(0),result_00,lane);		\
    coalescedWrite(ref()(0)(1),result_01,lane);		\
    coalescedWrite(ref()(0)(2),result_02,lane);		\
@ -563,7 +566,6 @@ WilsonKernels<Impl>::HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p,
  HAND_DECLARATIONS(Simt);
  int offset,local,perm, ptype;
  StencilEntry *SE;
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
@ -593,9 +595,7 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
  HAND_DECLARATIONS(Simt);
  int offset,local,perm, ptype;
  StencilEntry *SE;
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
@ -623,8 +623,6 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
  HAND_DECLARATIONS(Simt);
  StencilEntry *SE;
  int offset,local,perm, ptype;
  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
@ -640,8 +638,8 @@ template<class Impl>  accelerator_inline void
 WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
+  //  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_perm = st._permute_type;					
 // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@ -652,7 +650,6 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si
  HAND_DECLARATIONS(Simt);
  int offset,local,perm, ptype;
  StencilEntry *SE;
  ZERO_RESULT;
  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
@ -670,8 +667,8 @@ template<class Impl> accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
+  //  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_perm = st._permute_type;					
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
@ -682,7 +679,6 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
  HAND_DECLARATIONS(Simt);
  StencilEntry *SE;
  int offset,local,perm, ptype;
  ZERO_RESULT;
  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
@ -699,8 +695,8 @@ template<class Impl>  accelerator_inline void
 WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
+  //  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_perm = st._permute_type;					
 // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@ -711,7 +707,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si
  HAND_DECLARATIONS(Simt);
-  int offset, ptype;
+  //  int offset, ptype;
  StencilEntry *SE;
  int nmu=0;
  ZERO_RESULT;
@ -730,8 +726,8 @@ template<class Impl>  accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
+  //  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_perm = st._permute_type;					
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
@ -742,7 +738,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldVi
  HAND_DECLARATIONS(Simt);
  StencilEntry *SE;
-  int offset, ptype;
+  //  int offset, ptype;
  int nmu=0;
  ZERO_RESULT;
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
--- a/Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master
+++ b/Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master
@ -0,0 +1,44 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master
    Copyright (C) 2017 - 2022
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    Author: Mattia Bruno <mattia.bruno@cern.ch>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
 #include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class CompactWilsonCloverFermion<IMPLEMENTATION, CompactCloverHelpers<IMPLEMENTATION>>; 
 template class CompactWilsonCloverFermion<IMPLEMENTATION, CompactExpCloverHelpers<IMPLEMENTATION>>; 
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonCloverFermionInstantiation.cc.master
+++ b/Grid/qcd/action/fermion/instantiation/WilsonCloverFermionInstantiation.cc.master
@ -8,7 +8,8 @@
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
+    Author: Mattia Bruno <mattia.bruno@cern.ch>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
@ -31,10 +32,12 @@
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/WilsonCloverFermion.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
-template class WilsonCloverFermion<IMPLEMENTATION>; 
+template class WilsonCloverFermion<IMPLEMENTATION, CloverHelpers<IMPLEMENTATION>>; 
 template class WilsonCloverFermion<IMPLEMENTATION, ExpCloverHelpers<IMPLEMENTATION>>; 
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplD/CompactWilsonCloverFermionInstantiationWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD/CompactWilsonCloverFermionInstantiationWilsonImplD.cc
@ -0,0 +1 @@
 ../CompactWilsonCloverFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplF/CompactWilsonCloverFermionInstantiationWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplF/CompactWilsonCloverFermionInstantiationWilsonImplF.cc
@ -0,0 +1 @@
 ../CompactWilsonCloverFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh
+++ b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh
@ -18,6 +18,10 @@ WILSON_IMPL_LIST=" \
 	   GparityWilsonImplF \
 	   GparityWilsonImplD "
 COMPACT_WILSON_IMPL_LIST=" \
 	   WilsonImplF \
 	   WilsonImplD "
 DWF_IMPL_LIST=" \
 	   WilsonImplF \
 	   WilsonImplD \
@ -46,7 +50,17 @@ for impl in $WILSON_IMPL_LIST
 do
 for f in $CC_LIST
 do
-  ln -f -s ../$f.cc.master $impl/$f$impl.cc 
+  ln -f -s ../$f.cc.master $impl/$f$impl.cc
 done
 done
 # Compact Wilson clover instantiations: for every implementation named in
 # COMPACT_WILSON_IMPL_LIST, (re)create the symlink <impl>/<file><impl>.cc
 # pointing at the shared ../<file>.cc.master template.
 # Note: CC_LIST, impl and f are plain shell variables and stay set after the loop.
 CC_LIST="CompactWilsonCloverFermionInstantiation"
 for impl in $COMPACT_WILSON_IMPL_LIST
 do
 for f in $CC_LIST
 do
  # -f replaces an existing link, -s makes it symbolic
  ln -f -s ../$f.cc.master $impl/$f$impl.cc
 done
 done
@ -63,14 +77,14 @@ for impl in $DWF_IMPL_LIST $GDWF_IMPL_LIST
 do
 for f in $CC_LIST
 do
-  ln -f -s ../$f.cc.master $impl/$f$impl.cc 
+  ln -f -s ../$f.cc.master $impl/$f$impl.cc
 done
 done
 # overwrite the .cc file in Gparity directories
 for impl in $GDWF_IMPL_LIST
 do
-  ln -f -s ../WilsonKernelsInstantiationGparity.cc.master $impl/WilsonKernelsInstantiation$impl.cc 
+  ln -f -s ../WilsonKernelsInstantiationGparity.cc.master $impl/WilsonKernelsInstantiation$impl.cc
 done
@ -84,7 +98,7 @@ for impl in $STAG_IMPL_LIST
 do
 for f in $CC_LIST
 do
-  ln -f -s ../$f.cc.master $impl/$f$impl.cc 
+  ln -f -s ../$f.cc.master $impl/$f$impl.cc
 done
 done
--- a/Grid/qcd/hmc/UsingHMC.md
+++ b/Grid/qcd/hmc/UsingHMC.md
@ -1,61 +1,63 @@
-Using HMC in Grid version 0.5.1
+# Using HMC in Grid
-These are the instructions to use the Generalised HMC on Grid version 0.5.1.
+These are the instructions to use the Generalised HMC on Grid as of commit `749b802`.
-Disclaimer: GRID is still under active development so any information here can be changed in future releases.
+Disclaimer: Grid is still under active development so any information here can be changed in future releases.
-Command line options
+## Command line options
-===================
+
-(relevant file GenericHMCrunner.h)
+(relevant file `GenericHMCrunner.h`)
 The initial configuration can be changed at the command line using 
--StartType <your choice>
+`--StartingType STARTING_TYPE`, where `STARTING_TYPE` is one of
-valid choices, one among these
+`HotStart`, `ColdStart`, `TepidStart`, and `CheckpointStart`.
-HotStart, ColdStart, TepidStart, CheckpointStart
+Default: `--StartingType HotStart`
 default: HotStart
-example
+Example:
-./My_hmc_exec  --StartType HotStart
+```
 ./My_hmc_exec  --StartingType HotStart
 ```
-The CheckpointStart option uses the prefix for the configurations and rng seed files defined in your executable and the initial configuration is specified by
+The `CheckpointStart` option uses the prefix for the configurations and rng seed files defined in your executable and the initial configuration is specified by
--StartTrajectory <integer>
+`--StartingTrajectory STARTING_TRAJECTORY`, where `STARTING_TRAJECTORY` is an integer.
-default: 0
+Default: `--StartingTrajectory 0`
 The number of trajectories for a specific run is specified at the command line by
--Trajectories <integer>
+`--Trajectories TRAJECTORIES`, where `TRAJECTORIES` is an integer.
-default: 1
+Default: `--Trajectories 1`
 The number of thermalization steps (i.e. steps when the Metropolis acceptance check is turned off) is specified by
--Thermalizations <integer>
+`--Thermalizations THERMALIZATIONS`, where `THERMALIZATIONS` is an integer.
-default: 10
+Default: `--Thermalizations 10`
 Any other parameter is defined in the source for the executable.
-HMC controls
+## HMC controls
 ===========
 The lines 
 ```
  std::vector<int> SerSeed({1, 2, 3, 4, 5});
  std::vector<int> ParSeed({6, 7, 8, 9, 10});
 ```
 define the seeds for the serial and the parallel RNG.
 The line 
 ```
  TheHMC.MDparameters.set(20, 1.0);// MDsteps, traj length
 ```
 declares the number of molecular dynamics steps and the total trajectory length.
-Actions
+## Actions
 ======
-Action names are defined in the file
+Action names are defined in the directory `Grid/qcd/action`.
 lib/qcd/Actions.h
-Gauge actions list:
+Gauge actions list (from `Grid/qcd/action/gauge/Gauge.h`):
 ```
 WilsonGaugeActionR;
 WilsonGaugeActionF;
 WilsonGaugeActionD;
@ -68,8 +70,9 @@ IwasakiGaugeActionD;
 SymanzikGaugeActionR;
 SymanzikGaugeActionF;
 SymanzikGaugeActionD;
 ```
-
+```
 ConjugateWilsonGaugeActionR;
 ConjugateWilsonGaugeActionF;
 ConjugateWilsonGaugeActionD;
@ -82,26 +85,23 @@ ConjugateIwasakiGaugeActionD;
 ConjugateSymanzikGaugeActionR;
 ConjugateSymanzikGaugeActionF;
 ConjugateSymanzikGaugeActionD;
 ```
 Each of these actions accepts a single parameter at creation time (beta).
 Example for creating a Symanzik action with beta=4.0
 ```
  SymanzikGaugeActionR(4.0)
 ```
 Scalar actions list (from `Grid/qcd/action/scalar/Scalar.h`):
 ```
 ScalarActionR;
 ScalarActionF;
 ScalarActionD;
 ```
-
+The suffixes `R`, `F`, `D` in the action names refer to the `Real`
-each of these action accept one single parameter at creation time (beta).
+(the precision is defined at compile time by the `--enable-precision` flag in the configure),
-Example for creating a Symanzik action with beta=4.0
+`Float` and `Double`, that force the precision of the action to be 32, 64 bit respectively.
 	SymanzikGaugeActionR(4.0)
 The suffixes R,F,D in the action names refer to the Real
 (the precision is defined at compile time by the --enable-precision flag in the configure),
 Float and Double, that force the precision of the action to be 32, 64 bit respectively.
--- a/Grid/qcd/utils/GaugeFix.h
+++ b/Grid/qcd/utils/GaugeFix.h
@ -55,12 +55,12 @@ public:
    }
  }  
-  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {
+  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1,bool err_on_no_converge=true) {
    GridBase *grid = Umu.Grid();
    GaugeMat xform(grid);
-    SteepestDescentGaugeFix(Umu,xform,alpha,maxiter,Omega_tol,Phi_tol,Fourier,orthog);
+    SteepestDescentGaugeFix(Umu,xform,alpha,maxiter,Omega_tol,Phi_tol,Fourier,orthog,err_on_no_converge);
  }
-  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {
+  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1,bool err_on_no_converge=true) {
    GridBase *grid = Umu.Grid();
@ -122,6 +122,8 @@ public:
      }
    }
    std::cout << GridLogError << "Gauge fixing did not converge in " << maxiter << " iterations." << std::endl;
    if (err_on_no_converge) assert(0);
  };
  static Real SteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform,Real & alpha, GaugeMat & dmuAmu,int orthog) {
    GridBase *grid = U[0].Grid();
--- a/Grid/qcd/utils/WilsonLoops.h
+++ b/Grid/qcd/utils/WilsonLoops.h
@ -125,7 +125,6 @@ public:
    return sumplaq / vol / faces / Nc; // Nd , Nc dependent... FIXME
  }
  //////////////////////////////////////////////////
  // average over all x,y,z the temporal loop
  //////////////////////////////////////////////////
@ -165,7 +164,7 @@ public:
    double vol = Umu.Grid()->gSites();
-    return p.real() / vol / 4.0 / 3.0;
+    return p.real() / vol / (4.0 * Nc ) ;
  };
  //////////////////////////////////////////////////
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@ -322,8 +322,8 @@ public:
    int simd_layout     = _grid->_simd_layout[dimension];
    int comm_dim        = _grid->_processors[dimension] >1 ;
-    int recv_from_rank;
+    //    int recv_from_rank;
-    int xmit_to_rank;
+    //    int xmit_to_rank;
    if ( ! comm_dim ) return 1;
    if ( displacement == 0 ) return 1;
--- a/Grid/tensors/Tensor_traits.h
+++ b/Grid/tensors/Tensor_traits.h
@ -47,20 +47,20 @@ NAMESPACE_BEGIN(Grid);
  class TypePair {
  public:
    T _internal[2];
-    TypePair<T>& operator=(const Grid::Zero& o) {
+    accelerator TypePair<T>& operator=(const Grid::Zero& o) {
      _internal[0] = Zero();
      _internal[1] = Zero();
      return *this;
    }
-    TypePair<T> operator+(const TypePair<T>& o) const {
+    accelerator TypePair<T> operator+(const TypePair<T>& o) const {
      TypePair<T> r;
      r._internal[0] = _internal[0] + o._internal[0];
      r._internal[1] = _internal[1] + o._internal[1];
      return r;
    }
-    TypePair<T>& operator+=(const TypePair<T>& o) {
+    accelerator TypePair<T>& operator+=(const TypePair<T>& o) {
      _internal[0] += o._internal[0];
      _internal[1] += o._internal[1];
      return *this;
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@ -74,29 +74,43 @@ void acceleratorInit(void)
      //      GPU_PROP(singleToDoublePrecisionPerfRatio);
    }
  }
  MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
 #undef GPU_PROP_FMT    
 #undef GPU_PROP
 #ifdef GRID_DEFAULT_GPU
  int device = 0;
  // IBM Jsrun makes cuda Device numbering screwy and not match rank
  if ( world_rank == 0 ) {
    printf("AcceleratorCudaInit: using default device \n");
-    printf("AcceleratorCudaInit: assume user either uses a) IBM jsrun, or \n");
+    printf("AcceleratorCudaInit: assume user either uses\n");
    printf("AcceleratorCudaInit: a) IBM jsrun, or \n");
    printf("AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
    printf("AcceleratorCudaInit: Configure options --enable-setdevice=no \n");
  }
 #else
  int device = rank;
  printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank);
  printf("AcceleratorCudaInit: Configure options --enable-setdevice=yes \n");
  cudaSetDevice(rank);
 #endif
  cudaSetDevice(device);
  cudaStreamCreate(&copyStream);
  const int len=64;
  char busid[len];
  if( rank == world_rank ) { 
    cudaDeviceGetPCIBusId(busid, len, device);
    printf("local rank %d device %d bus id: %s\n", rank, device, busid);
  }
  if ( world_rank == 0 )  printf("AcceleratorCudaInit: ================================================\n");
 }
 #endif
 #ifdef GRID_HIP
 hipDeviceProp_t *gpu_props;
 hipStream_t copyStream;
 void acceleratorInit(void)
 {
  int nDevices = 1;
@ -154,16 +168,25 @@ void acceleratorInit(void)
 #ifdef GRID_DEFAULT_GPU
  if ( world_rank == 0 ) {
    printf("AcceleratorHipInit: using default device \n");
-    printf("AcceleratorHipInit: assume user either uses a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
+    printf("AcceleratorHipInit: assume user or srun sets ROCR_VISIBLE_DEVICES and numa binding \n");
-    printf("AcceleratorHipInit: Configure options --enable-summit, --enable-select-gpu=no \n");
+    printf("AcceleratorHipInit: Configure options --enable-setdevice=no \n");
  }
  int device = 0;
 #else
  if ( world_rank == 0 ) {
    printf("AcceleratorHipInit: rank %d setting device to node rank %d\n",world_rank,rank);
-    printf("AcceleratorHipInit: Configure options --enable-select-gpu=yes \n");
+    printf("AcceleratorHipInit: Configure options --enable-setdevice=yes \n");
  }
-  hipSetDevice(rank);
+  int device = rank;
 #endif
  hipSetDevice(device);
  hipStreamCreate(&copyStream);
  const int len=64;
  char busid[len];
  if( rank == world_rank ) { 
    hipDeviceGetPCIBusId(busid, len, device);
    printf("local rank %d device %d bus id: %s\n", rank, device, busid);
  }
  if ( world_rank == 0 )  printf("AcceleratorHipInit: ================================================\n");
 }
 #endif
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@ -95,6 +95,7 @@ void     acceleratorInit(void);
 //////////////////////////////////////////////
 #ifdef GRID_CUDA
 #include <cuda.h>
 #ifdef __CUDA_ARCH__
@ -115,6 +116,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #endif
 } // CUDA specific
 inline void cuda_mem(void)
 {
  size_t free_t,total_t,used_t;
  cudaMemGetInfo(&free_t,&total_t);
  used_t=total_t-free_t;
  std::cout << " MemoryManager : GPU used "<<used_t<<" free "<<free_t<< " total "<<total_t<<std::endl;
 }
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  {									\
    int nt=acceleratorThreads();					\
@ -221,6 +230,7 @@ inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes
  cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
 }
 inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
 inline int  acceleratorIsCommunicable(void *ptr)
 {
  //  int uvm=0;
@ -297,7 +307,7 @@ inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  {
  theGridAccelerator->memcpy(to,from,bytes);
 }
-inline void acceleratorCopySynchronise(void) {  theGridAccelerator->wait(); }
+inline void acceleratorCopySynchronise(void) {  theGridAccelerator->wait(); std::cout<<"acceleratorCopySynchronise() wait "<<std::endl; }
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();}
@ -328,10 +338,11 @@ NAMESPACE_BEGIN(Grid);
 #define accelerator        __host__ __device__
 #define accelerator_inline __host__ __device__ inline
 extern hipStream_t copyStream;
 /*These routines define mapping from thread grid to loop & vector lane indexing */
 accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #ifdef GRID_SIMT
-  return hipThreadIdx_z; 
+  return hipThreadIdx_x; 
 #else
  return 0;
 #endif
@ -345,19 +356,41 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
      { __VA_ARGS__;}							\
    };									\
    int nt=acceleratorThreads();					\
-    dim3 hip_threads(nt,1,nsimd);					\
+    dim3 hip_threads(nsimd, nt, 1);					 \
-    dim3 hip_blocks ((num1+nt-1)/nt,num2,1);				\
+    dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
-    hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads,		\
+    if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \
-		       0,0,						\
+      hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads,		\
-		       num1,num2,nsimd,lambda);				\
+            0,0,						\
            num1,num2,nsimd, lambda);				\
    } else { \
      hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads,		\
            0,0,						\
            num1,num2,nsimd, lambda);				\
    } \
  }
 // Kernel entry point compiled with __launch_bounds__(64,1).
 // Maps the thread/block indices onto a 3d iteration space (numx,numy,numz)
 // and invokes Lambda(x,y,z) for every point inside that space.
 template<typename lambda>  __global__
 __launch_bounds__(64,1)
 void LambdaApply64(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
 {
  // Index scheme follows the CUDA path: threadIdx.x carries the innermost (z) index,
  // threadIdx.y / blockIdx.x give x, and threadIdx.z / blockIdx.y give y.
  uint64_t z = threadIdx.x;
  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
  // Threads that fall outside the requested extents do nothing.
  bool inside = (x < numx) && (y<numy) && (z<numz);
  if ( !inside ) return;
  Lambda(x,y,z);
 }
 template<typename lambda>  __global__
 __launch_bounds__(1024,1)
 void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
 {
-  uint64_t x = hipThreadIdx_x + hipBlockDim_x*hipBlockIdx_x;
+  // Following the same scheme as CUDA for now
-  uint64_t y = hipThreadIdx_y + hipBlockDim_y*hipBlockIdx_y;
+  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
-  uint64_t z = hipThreadIdx_z ;//+ hipBlockDim_z*hipBlockIdx_z;
+  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
  uint64_t z = threadIdx.x;
  if ( (x < numx) && (y<numy) && (z<numz) ) {
    Lambda(x,y,z);
  }
@ -402,10 +435,16 @@ inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
+//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
-inline void acceleratorCopySynchronise(void) {  }
+//inline void acceleratorCopySynchronise(void) {  }
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);}
 // Queue a device-to-device copy of `bytes` bytes from `from` to `to` on copyStream.
 // The copy is issued through hipMemcpyAsync; call acceleratorCopySynchronise()
 // before relying on the contents of `to`.
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
  // hipMemcpyAsync takes the destination first, hence the (to,from) ordering
  hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToDevice,copyStream);
 }
 // Block until the work queued on copyStream has completed (hipStreamSynchronize).
 inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); };
 #endif
 //////////////////////////////////////////////
@ -442,9 +481,10 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas
 #define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) thread_for2d(iter1,num1,iter2,num2,{ __VA_ARGS__ });
 accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes);}
+
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); }
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes);}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes);}
 inline void acceleratorCopySynchronise(void) {};
 inline int  acceleratorIsCommunicable(void *ptr){ return 1; }
@ -476,18 +516,12 @@ inline void acceleratorFreeCpu  (void *ptr){free(ptr);};
 ///////////////////////////////////////////////////
 // Synchronise across local threads for divergence resynch
 ///////////////////////////////////////////////////
-accelerator_inline void acceleratorSynchronise(void) 
+accelerator_inline void acceleratorSynchronise(void)  // Only Nvidia needs 
 {
 #ifdef GRID_SIMT
 #ifdef GRID_CUDA
  __syncwarp();
 #endif
 #ifdef GRID_SYCL
  //cl::sycl::detail::workGroupBarrier();
 #endif
 #ifdef GRID_HIP
  __syncthreads();
 #endif
 #endif
  return;
 }
--- a/Grid/threads/Threads.h
+++ b/Grid/threads/Threads.h
@ -72,3 +72,20 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define thread_region                                       DO_PRAGMA(omp parallel)
 #define thread_critical                                     DO_PRAGMA(omp critical)
 // thread_bcopy: copy `bytes` bytes from `from` to `to` (note: source first, as in bcopy).
 // Both variants accept any byte count.
 #ifdef GRID_OMP
 // OpenMP variant: the bulk of the buffer is copied as 64-bit words inside a
 // thread_for loop; any remaining (bytes % 8) bytes are copied afterwards.
 // NOTE(review): the word copy accesses both buffers through uint64_t*, so it
 // presumably expects 8-byte aligned, non-overlapping buffers - confirm against callers.
 inline void thread_bcopy(void *from, void *to,size_t bytes)
 {
  uint64_t *ufrom = (uint64_t *)from;
  uint64_t *uto   = (uint64_t *)to;
  uint64_t words=bytes/8;
  thread_for(w,words,{
      uto[w] = ufrom[w];
  });
  // Tail: previously a length that was not a multiple of 8 tripped an assert
  // (or, with NDEBUG, silently lost the last bytes). Copy the remainder instead.
  unsigned char *cfrom = (unsigned char *)from;
  unsigned char *cto   = (unsigned char *)to;
  for(size_t b=words*8;b<bytes;b++){
    cto[b] = cfrom[b];
  }
 }
 #else
 // Fallback without OpenMP: plain bcopy of the whole buffer.
 inline void thread_bcopy(void *from, void *to,size_t bytes)
 {
  bcopy(from,to,bytes);
 }
 #endif
--- a/Grid/util/Coordinate.h
+++ b/Grid/util/Coordinate.h
@ -88,7 +88,7 @@ public:
 // Coordinate class, maxdims = 8 for now.
 ////////////////////////////////////////////////////////////////
 #define GRID_MAX_LATTICE_DIMENSION (8)
-#define GRID_MAX_SIMD              (16)
+#define GRID_MAX_SIMD              (32)
 static constexpr int MaxDims = GRID_MAX_LATTICE_DIMENSION;
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@ -167,6 +167,13 @@ void GridCmdOptionInt(std::string &str,int & val)
  return;
 }
 // Convert the textual option value in `str` to a float and store it in `val`.
 // Uses formatted stream extraction, so leading whitespace is skipped and
 // parsing stops at the first character that cannot belong to the number.
 void GridCmdOptionFloat(std::string &str,float & val)
 {
  std::istringstream parser(str);
  parser >> val;
 }
 void GridParseLayout(char **argv,int argc,
 		     Coordinate &latt_c,
@ -527,6 +534,7 @@ void Grid_init(int *argc,char ***argv)
 void Grid_finalize(void)
 {
 #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();
  Grid_unquiesce_nodes();
 #endif
--- a/Grid/util/Init.h
+++ b/Grid/util/Init.h
@ -57,6 +57,7 @@ void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
 template<class VectorInt>
 void GridCmdOptionIntVector(const std::string &str,VectorInt & vec);
 void GridCmdOptionInt(std::string &str,int & val);
 void GridCmdOptionFloat(std::string &str,float & val);
 void GridParseLayout(char **argv,int argc,
--- a/benchmarks/Benchmark_IO.cc
+++ b/benchmarks/Benchmark_IO.cc
@ -137,7 +137,7 @@ int main (int argc, char ** argv)
  Eigen::MatrixXd mean(nVol, 4), stdDev(nVol, 4), rob(nVol, 4);
  Eigen::VectorXd avMean(4), avStdDev(4), avRob(4);
-  double          n = BENCH_IO_NPASS;
+  //  double          n = BENCH_IO_NPASS;
  stats(mean, stdDev, perf);
  stats(avMean, avStdDev, avPerf);
@ -164,7 +164,7 @@ int main (int argc, char ** argv)
                mean(volInd(l), gWrite), stdDev(volInd(l), gWrite));
  }
  MSG << std::endl;
-  MSG << "Robustness of individual results, in \%. (rob = 100\% - std dev / mean)" << std::endl;
+  MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)" << std::endl;
  MSG << std::endl;
  grid_printf("%4s %12s %12s %12s %12s\n",
              "L", "std read", "std write", "Grid read", "Grid write");
@ -185,7 +185,7 @@ int main (int argc, char ** argv)
              avMean(sRead), avStdDev(sRead), avMean(sWrite), avStdDev(sWrite),
              avMean(gRead), avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
  MSG << std::endl;
-  MSG << "Robustness of volume-averaged results, in \%. (rob = 100\% - std dev / mean)" << std::endl;
+  MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)" << std::endl;
  MSG << std::endl;
  grid_printf("%12s %12s %12s %12s\n",
              "std read", "std write", "Grid read", "Grid write");
--- a/benchmarks/Benchmark_ITT.cc
+++ b/benchmarks/Benchmark_ITT.cc
@ -142,7 +142,7 @@ public:
 	  //	  bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	}
-	int ncomm;
+	//	int ncomm;
 	double dbytes;
        for(int dir=0;dir<8;dir++) {
@ -290,7 +290,7 @@ public:
      LatticeSU4 z(&Grid); z=Zero();
      LatticeSU4 x(&Grid); x=Zero();
      LatticeSU4 y(&Grid); y=Zero();
-      double a=2.0;
+      //      double a=2.0;
      uint64_t Nloop=NLOOP;
--- a/benchmarks/Benchmark_comms_host_device.cc
+++ b/benchmarks/Benchmark_comms_host_device.cc
@ -72,7 +72,7 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
  std::vector<double> t_time(Nloop);
-  time_statistics timestat;
+  //  time_statistics timestat;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange from host memory "<<std::endl;
--- a/benchmarks/Benchmark_dwf_fp32.cc
+++ b/benchmarks/Benchmark_dwf_fp32.cc
@ -126,19 +126,10 @@ int main (int argc, char ** argv)
  // Naive wilson implementation
  ////////////////////////////////////
  // replicate across fifth dimension
-  LatticeGaugeFieldF Umu5d(FGrid);
+  //  LatticeGaugeFieldF Umu5d(FGrid);
-  std::vector<LatticeColourMatrixF> U(4,FGrid);
+  std::vector<LatticeColourMatrixF> U(4,UGrid);
  {
    autoView( Umu5d_v, Umu5d, CpuWrite);
    autoView( Umu_v  , Umu  , CpuRead);
    for(int ss=0;ss<Umu.Grid()->oSites();ss++){
      for(int s=0;s<Ls;s++){
 	Umu5d_v[Ls*ss+s] = Umu_v[ss];
      }
    }
  }
  for(int mu=0;mu<Nd;mu++){
-    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
+    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
  }
  std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
@ -147,10 +138,28 @@ int main (int argc, char ** argv)
    ref = Zero();
    for(int mu=0;mu<Nd;mu++){
-      tmp = U[mu]*Cshift(src,mu+1,1);
+      tmp = Cshift(src,mu+1,1);
      {
 	autoView( tmp_v  , tmp  , CpuWrite);
 	autoView( U_v  , U[mu]  , CpuRead);
 	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
 	  for(int s=0;s<Ls;s++){
 	    tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
 	  }
 	}
      }
      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
-      tmp =adj(U[mu])*src;
+      {
 	autoView( tmp_v  , tmp  , CpuWrite);
 	autoView( U_v  , U[mu]  , CpuRead);
 	autoView( src_v, src    , CpuRead);
 	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
 	  for(int s=0;s<Ls;s++){
 	    tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
 	  }
 	}
      }
      tmp =Cshift(tmp,mu+1,-1);
      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
    }
@ -182,7 +191,7 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-  int ncall =3000;
+  int ncall =300;
  if (1) {
    FGrid->Barrier();
@ -242,16 +251,30 @@ int main (int argc, char ** argv)
    for(int mu=0;mu<Nd;mu++){
      //    ref =  src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
-      tmp = U[mu]*Cshift(src,mu+1,1);
+      tmp = Cshift(src,mu+1,1);
      {
 	autoView( ref_v, ref, CpuWrite);
 	autoView( tmp_v, tmp, CpuRead);
-	for(int i=0;i<ref_v.size();i++){
+	autoView( U_v  , U[mu]  , CpuRead);
-	  ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
+	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
 	  for(int s=0;s<Ls;s++){
 	    int i=s+Ls*ss;
 	    ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); ;
 	  }
 	}
      }
-
+      
-      tmp =adj(U[mu])*src;
+      {
 	autoView( tmp_v  , tmp  , CpuWrite);
 	autoView( U_v  , U[mu]  , CpuRead);
 	autoView( src_v, src    , CpuRead);
 	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
 	  for(int s=0;s<Ls;s++){
 	    tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
 	  }
 	}
      }
      //      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu+1,-1);
      {
 	autoView( ref_v, ref, CpuWrite);
--- a/benchmarks/Benchmark_memory_bandwidth.cc
+++ b/benchmarks/Benchmark_memory_bandwidth.cc
@ -184,8 +184,10 @@ int main (int argc, char ** argv)
      double bytes=1.0*vol*Nvec*sizeof(Real);
      double flops=vol*Nvec*2;// mul,add
-      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"<<(stop-start)/1000./1000.<< "\t\t " <<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"
-
+	       <<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"
 	       <<(stop-start)/1000./1000.<< "\t\t " <<std::endl;
      assert(nn==nn);
  }    
  Grid_finalize();
--- a/benchmarks/Benchmark_mooee.cc
+++ b/benchmarks/Benchmark_mooee.cc
@ -81,8 +81,8 @@ int main (int argc, char ** argv)
    Vector<Coeff_t> diag = Dw.bs;
    Vector<Coeff_t> upper= Dw.cs;
    Vector<Coeff_t> lower= Dw.cs;
-    upper[Ls-1]=-Dw.mass*upper[Ls-1];
+    upper[Ls-1]=-Dw.mass_minus*upper[Ls-1];
-    lower[0]   =-Dw.mass*lower[0];
+    lower[0]   =-Dw.mass_plus*lower[0];
    LatticeFermion r_eo(FGrid);
    LatticeFermion src_e (FrbGrid);
--- a/configure.ac
+++ b/configure.ac
@ -159,7 +159,7 @@ case ${ac_ZMOBIUS} in
 esac
 ############### Nc
 AC_ARG_ENABLE([Nc],
-    [AC_HELP_STRING([--enable-Nc=2|3|4], [enable number of colours])],
+    [AC_HELP_STRING([--enable-Nc=2|3|4|5], [enable number of colours])],
    [ac_Nc=${enable_Nc}], [ac_Nc=3])
 case ${ac_Nc} in
--- a/examples/Example_Laplacian_solver.cc
+++ b/examples/Example_Laplacian_solver.cc
@ -4,7 +4,7 @@ using namespace Grid;
 template<class Field>
 void SimpleConjugateGradient(LinearOperatorBase<Field> &HPDop,const Field &b, Field &x)
 {
-    RealD cp, c, alpha, d, beta, ssq, qq;
+    RealD cp, c, alpha, d, beta, ssq;
    RealD Tolerance=1.0e-10;
    int MaxIterations=10000;
--- a/examples/Example_wall_wall_3pt.cc
+++ b/examples/Example_wall_wall_3pt.cc
@ -0,0 +1,539 @@
 /*
 * Warning: This code is illustrative only: it is not well tested, and is not meant for production use
 * without regression tests being applied
 */
 #include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
 typedef SpinColourMatrix Propagator;
 typedef SpinColourVector Fermion;
 typedef PeriodicGimplR   GimplR;
 // Gauge-covariant Laplacian over the first Nd-1 directions, built from covariant shifts.
 template<class Gimpl,class Field>
 class CovariantLaplacianCshift : public SparseMatrixBase<Field>
 {
 public:
  INHERIT_GIMPL_TYPES(Gimpl);
  GridBase *grid;  // grid of the gauge field handed to the constructor
  GaugeField U;    // copy of that gauge field
  CovariantLaplacianCshift(GaugeField &gauge) : grid(gauge.Grid()), U(gauge) {}
  virtual GridBase *Grid() { return grid; }
  // out = sum over dir < Nd-1 of [ 2*in - forward covariant shift - backward covariant shift ]
  virtual void M(const Field &in, Field &out)
  {
    const int ndir = Nd-1;
    out = Zero();
    for(int dir=0; dir<ndir; dir++) {
      // NB: inefficient, the link is re-extracted from U on every application
      GaugeLinkField link = PeekIndex<LorentzIndex>(U, dir);
      out = out - Gimpl::CovShiftForward(link,dir,in);
      out = out - Gimpl::CovShiftBackward(link,dir,in);
      out = out + 2.0*in;
    }
  }
  // Mdag simply forwards to M (the Laplacian is treated as hermitian)
  virtual void Mdag(const Field &in, Field &out) { M(in,out); }
  // Unimplemented, only needed for multigrid: these abort if called
  virtual void Mdiag  (const Field &in, Field &out)                   { assert(0); }
  virtual void Mdir   (const Field &in, Field &out,int dir, int disp) { assert(0); }
  virtual void MdirAll(const Field &in, std::vector<Field> &out)      { assert(0); }
 };
 // Fills `phase` with exp(i * sum_mu (2*pi/L_mu) * mom[mu] * x_mu), where L_mu are the
 // global lattice dimensions of phase's grid and x_mu the site coordinate.
 void MakePhase(Coordinate mom,LatticeComplex &phase)
 {
  GridBase *grid = phase.Grid();
  auto dims = grid->GlobalDimensions();
  ComplexD imag_unit(0.0,1.0);
  LatticeComplex x_mu(grid);
  phase = Zero();
  for(int dir=0; dir<Nd; dir++){
    // Accumulate the real exponent one direction at a time
    LatticeCoordinate(x_mu,dir);
    RealD unit_momentum = M_PI * 2.0/ dims[dir];
    phase = phase + (unit_momentum * mom[dir]) * x_mu;
  }
  phase = exp(phase*imag_unit);
 }
 // Applies `nstep` successive Stout smearing steps with parameter `rho` to Uin and
 // returns the result in Usmr.  Uin itself is not modified.
 void LinkSmear(int nstep, RealD rho,LatticeGaugeField &Uin,LatticeGaugeField &Usmr)
 {
  Smear_Stout<GimplR> Stout(rho);
  LatticeGaugeField Utmp(Uin.Grid());
  Utmp = Uin;
  // With nstep <= 0 the loop never runs; make the output the unsmeared field
  // instead of leaving Usmr unassigned.
  Usmr = Uin;
  for(int i=0;i<nstep;i++){
    Stout.smear(Usmr,Utmp); // Usmr <- one smearing step applied to Utmp
    Utmp = Usmr;            // feed the result into the next step
  }
 }
 // Zeroes `source` and pokes a unit spin-colour matrix into it at site `coor`.
 void PointSource(Coordinate &coor,LatticePropagator &source)
 {
  //  Coordinate coor({0,0,0,0});
  SpinColourMatrix unit;
  unit = 1.0;
  source = Zero();
  pokeSite(unit,source,coor);
 }
 void GFWallSource(int tslice,LatticePropagator &source)
 {
  GridBase *grid = source.Grid();
  LatticeComplex one(grid); one = ComplexD(1.0,0.0);
  LatticeComplex zz(grid); zz=Zero();
  LatticeInteger t(grid);
  LatticeCoordinate(t,Tdir);
  one = where(t==Integer(tslice), one, zz);
  source = 1.0;
  source = source * one;
 }
 // Noise wall source: draws bernoulli noise from RNG, maps it through (2*noise - (1,1))/sqrt(2),
 // keeps it only on the Tdir==tslice sites and multiplies it into a `source` set to 1.0.
 // Prints norm2 of the resulting source.
 void Z2WallSource(GridParallelRNG &RNG,int tslice,LatticePropagator &source)
 {
  GridBase *grid = source.Grid();
  RealD nrm=1.0/sqrt(2);
  LatticeComplex zeros(grid);
  zeros = Zero();
  LatticeInteger tcoor(grid);
  LatticeCoordinate(tcoor,Tdir);
  LatticeComplex noise(grid);
  bernoulli(RNG, noise); // 0,1 50:50
  noise = (2.*noise - Complex(1,1))*nrm;
  // Zero the noise away from the requested time slice
  noise = where(tcoor==Integer(tslice), noise, zeros);
  source = 1.0;
  source = source*noise;
  std::cout << " Z2 wall " << norm2(source) << std::endl;
 }
 // Copies U into Ufix and runs the Fourier-accelerated steepest-descent gauge fixer on it
 // with orthogonal direction Nd-1, printing the average plaquette before and after.
 void GaugeFix(LatticeGaugeField &U,LatticeGaugeField &Ufix)
 {
  Real plaq=WilsonLoops<GimplR>::avgPlaquette(U);
  std::cout << " Initial plaquette "<<plaq << std::endl;
  // Fixer settings: step size, iteration cap, and the direction passed as `orthog`
  Real step=0.05;
  int  orthog_dir=Nd-1;
  LatticeColourMatrix   gauge_xform(U.Grid());
  Ufix = U;
  FourierAcceleratedGaugeFixer<GimplR>::SteepestDescentGaugeFix(Ufix,gauge_xform,step,100000,1.0e-14, 1.0e-14,true,orthog_dir);
  plaq=WilsonLoops<GimplR>::avgPlaquette(Ufix);
  std::cout << " Final plaquette "<<plaq << std::endl;
 }
 // Iterative smearing: starting from `unsmeared`, repeats
 //   smeared <- smeared - coeff * Laplacian(smeared)
 // 40 times with coeff = width^2 / (4*40), width = 2.0, printing norm2 after each step.
 template<class Field>
 void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
 {
  typedef CovariantLaplacianCshift <GimplR,Field> Laplacian_t;
  Integer nsteps = 40;
  Real width = 2.0;
  Real coeff = (width*width) / Real(4*nsteps);
  Laplacian_t Laplacian(U);
  Field lap(U.Grid());
  smeared=unsmeared;
  //  chi = (1-p^2/2N)^N kronecker
  int step = 0;
  while(step < nsteps) {
    Laplacian.M(smeared,lap);
    smeared = smeared - coeff*lap;
    std::cout << " smear iter " << step<<" " <<norm2(smeared)<<std::endl;
    ++step;
  }
 }
 // Builds a point source at `site` and smears it with GaussianSmear using gauge field U;
 // prints norm2 of the source before and after smearing.
 void GaussianSource(Coordinate &site,LatticeGaugeField &U,LatticePropagator &source)
 {
  PointSource(site,source);
  std::cout << " GaussianSource Kronecker "<< norm2(source)<<std::endl;
  // Smear from a copy so that `source` can receive the output
  LatticePropagator unsmeared(source.Grid());
  unsmeared = source;
  GaussianSmear(U,unsmeared,source);
  std::cout << " GaussianSource Smeared "<< norm2(source)<<std::endl;
 }
 // Builds a Z2 wall source on `tslice` and smears it with GaussianSmear using gauge field U.
 void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,LatticePropagator &source)
 {
  Z2WallSource(RNG,tslice,source);
  // Smear from a copy so that `source` can receive the output
  LatticePropagator unsmeared = source;
  GaussianSmear(U,unsmeared,source);
 }
 void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
 {
  assert(mom.size()==Nd);
  assert(mom[Tdir] == 0);
  GridBase * grid = spectator.Grid();
  LatticeInteger ts(grid);
  LatticeCoordinate(ts,Tdir);
  source = Zero();
  source = where(ts==Integer(tslice),spectator,source); // Stick in a slice of the spectator, zero everywhere else
  LatticeComplex phase(grid);
  MakePhase(mom,phase);
  source = source *phase;
 }
 template<class Action>
 void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
 {
  GridBase *UGrid = D.GaugeGrid();
  GridBase *FGrid = D.FermionGrid();
  LatticeFermion src4  (UGrid); 
  LatticeFermion src5  (FGrid); 
  LatticeFermion result5(FGrid);
  LatticeFermion result4(UGrid);
  ConjugateGradient<LatticeFermion> CG(1.0e-12,100000);
  SchurRedBlackDiagTwoSolve<LatticeFermion> schur(CG);
  ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
  for(int s=0;s<Nd;s++){
    for(int c=0;c<Nc;c++){
      PropToFerm<Action>(src4,source,s,c);
      D.ImportPhysicalFermionSource(src4,src5);
      result5=Zero();
      schur(D,src5,result5,ZG);
      std::cout<<GridLogMessage
 	       <<"spin "<<s<<" color "<<c
 	       <<" norm2(src5d) "   <<norm2(src5)
               <<" norm2(result5d) "<<norm2(result5)<<std::endl;
      D.ExportPhysicalFermionSolution(result5,result4);
      FermToProp<Action>(propagator,result4,s,c);
    }
  }
 }
 // Serializable output record for correlators.  `data` receives one std::vector<Complex>
 // per channel pushed by the trace routines and is written out under the tag "MesonFile".
 class MesonFile: Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector<std::vector<Complex> >, data);
 };
 void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
 {
  const int nchannel=4;
  Gamma::Algebra Gammas[nchannel][2] = {
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::Gamma5},
    {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::GammaTGamma5},
    {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::Gamma5},
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::GammaTGamma5}
  };
  Gamma G5(Gamma::Algebra::Gamma5);
  LatticeComplex meson_CF(q1.Grid());
  MesonFile MF;
  for(int ch=0;ch<nchannel;ch++){
    Gamma Gsrc(Gammas[ch][0]);
    Gamma Gsnk(Gammas[ch][1]);
    meson_CF = trace(G5*adj(q1)*G5*Gsnk*q2*adj(Gsrc));
    std::vector<TComplex> meson_T;
    sliceSum(meson_CF,meson_T, Tdir);
    int nt=meson_T.size();
    std::vector<Complex> corr(nt);
    for(int t=0;t<nt;t++){
      corr[t] = TensorRemove(meson_T[t]); // Yes this is ugly, not figured a work around
      std::cout << " channel "<<ch<<" t "<<t<<" " <<corr[t]<<std::endl;
    }
    MF.data.push_back(corr);
  }
  {
    XmlWriter WR(file);
    write(WR,"MesonFile",MF);
  }
 }
 void Meson3pt(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
 {
  const int nchannel=4;
  Gamma::Algebra Gammas[nchannel][2] = {
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::GammaX},
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::GammaY},
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::GammaZ},
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::GammaT}
  };
  Gamma G5(Gamma::Algebra::Gamma5);
  LatticeComplex meson_CF(q1.Grid());
  MesonFile MF;
  for(int ch=0;ch<nchannel;ch++){
    Gamma Gsrc(Gammas[ch][0]);
    Gamma Gsnk(Gammas[ch][1]);
    meson_CF = trace(G5*adj(q1)*G5*Gsnk*q2*adj(Gsrc));
    std::vector<TComplex> meson_T;
    sliceSum(meson_CF,meson_T, Tdir);
    int nt=meson_T.size();
    std::vector<Complex> corr(nt);
    for(int t=0;t<nt;t++){
      corr[t] = TensorRemove(meson_T[t]); // Yes this is ugly, not figured a work around
      std::cout << " channel "<<ch<<" t "<<t<<" " <<corr[t]<<std::endl;
    }
    MF.data.push_back(corr);
  }
  {
    XmlWriter WR(file);
    write(WR,"MesonFile",MF);
  }
 }
 void WallSinkMesonTrace(std::string file,std::vector<Propagator> &q1,std::vector<Propagator> &q2)
 {
  const int nchannel=4;
  Gamma::Algebra Gammas[nchannel][2] = {
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::Gamma5},
    {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::GammaTGamma5},
    {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::Gamma5},
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::GammaTGamma5}
  };
  Gamma G5(Gamma::Algebra::Gamma5);
  int nt=q1.size();
  std::vector<Complex> meson_CF(nt);
  MesonFile MF;
  for(int ch=0;ch<nchannel;ch++){
    Gamma Gsrc(Gammas[ch][0]);
    Gamma Gsnk(Gammas[ch][1]);
    std::vector<Complex> corr(nt);
    for(int t=0;t<nt;t++){
      meson_CF[t] = trace(G5*adj(q1[t])*G5*Gsnk*q2[t]*adj(Gsrc));
      corr[t] = TensorRemove(meson_CF[t]); // Yes this is ugly, not figured a work around
      std::cout << " channel "<<ch<<" t "<<t<<" " <<corr[t]<<std::endl;
    }
    MF.data.push_back(corr);
  }
  {
    XmlWriter WR(file);
    write(WR,"MesonFile",MF);
  }
 }
 // Flattens a (momentum p, mass m) pair into one propagator slot.
 // Mass 0 owns slots 0..nmom-1 (one per momentum); every other mass m is only valid
 // with p==0 and maps to slot nmom+m-1.
 int make_idx(int p, int m,int nmom)
 {
  if (m!=0) {
    // Only the m==0 entry carries non-zero momentum indices
    assert(p==0);
    return nmom + m - 1;
  }
  return p;
 }
 // Example driver for wall-source meson two- and three-point functions.
 //  - Reads a NERSC configuration from argv[1] if given (and not an option), otherwise
 //    uses a cold configuration; also builds a 3-step stout-smeared copy of the links.
 //  - Sets up one Mobius action per entry of `masses` (the third uses the smeared links).
 //  - Solves on Z2-wall and "GF" wall sources at tslice for every momentum on mass 0 and
 //    at zero momentum for the other masses.
 //  - Writes point-sink and wall-sink two-point correlators, then solves sequential
 //    sources through the last action and writes the point-sink three-point correlators.
 // Propagator slots are addressed through make_idx(p,m,nmom).
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  // Double precision grids
  auto latt = GridDefaultLatt();
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                                                   GridDefaultSimd(Nd,vComplex::Nsimd()),
                                                                   GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  LatticeGaugeField Umu(UGrid);
  LatticeGaugeField Utmp(UGrid);
  LatticeGaugeField Usmr(UGrid);
  std::string config;
  if( argc > 1 && argv[1][0] != '-' )
  {
    std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl;
    FieldMetaData header;
    NerscIO::readConfiguration(Umu, header, argv[1]);
    config=argv[1];
  }
  else
  {
    // The fallback calls ColdConfiguration and tags output files "ColdConfig",
    // so the log message says "cold" as well.
    std::cout<<GridLogMessage <<"Using cold configuration"<<std::endl;
    SU<Nc>::ColdConfiguration(Umu);
    config="ColdConfig";
  }
  //  GaugeFix(Umu,Utmp);
  //  Umu=Utmp;
  int nsmr=3;
  RealD rho=0.1;
  LinkSmear(nsmr,rho,Umu,Usmr);
  // Per-flavour action parameters; entry m of each vector belongs to mass index m
  std::vector<int>   smeared_link({ 0,0,1} );
  std::vector<RealD> masses({ 0.004,0.02477,0.447} ); // u/d, s, c ??
  std::vector<RealD> M5s   ({ 1.8,1.8,1.0} );
  std::vector<RealD> bs   ({ 1.0,1.0,1.5} );  // DDM
  std::vector<RealD> cs   ({ 0.0,0.0,0.5} );  // DDM
  std::vector<int>   Ls_s ({ 16,16,12} );
  std::vector<GridCartesian *> FGrids;
  std::vector<GridRedBlackCartesian *> FrbGrids;
  std::vector<Coordinate> momenta;
  momenta.push_back(Coordinate({0,0,0,0}));
  momenta.push_back(Coordinate({1,0,0,0}));
  momenta.push_back(Coordinate({2,0,0,0}));
  int nmass = masses.size();
  int nmom  = momenta.size();
  std::vector<MobiusFermionR *> FermActs;
  std::cout<<GridLogMessage <<"======================"<<std::endl;
  std::cout<<GridLogMessage <<"MobiusFermion action as Scaled Shamir kernel"<<std::endl;
  std::cout<<GridLogMessage <<"======================"<<std::endl;
  std::vector<Complex> boundary = {1,1,1,-1};
  typedef MobiusFermionR FermionAction;
  FermionAction::ImplParams Params(boundary);
  for(int m=0;m<nmass;m++) {
    RealD mass = masses[m];
    RealD M5   = M5s[m];
    RealD b    = bs[m];
    RealD c    = cs[m];
    int   Ls   = Ls_s[m];
    // Pick smeared or unsmeared links for this flavour
    if ( smeared_link[m] ) Utmp = Usmr;
    else                   Utmp = Umu;
    FGrids.push_back(SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid));
    FrbGrids.push_back(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid));
    FermActs.push_back(new MobiusFermionR(Utmp,*FGrids[m],*FrbGrids[m],*UGrid,*UrbGrid,mass,M5,b,c,Params));
  }
  LatticePropagator z2wall_source(UGrid);
  LatticePropagator gfwall_source(UGrid);
  LatticePropagator phased_prop(UGrid);
  int tslice = 0;
  int tseq=(tslice+16)%latt[Nd-1];
  //////////////////////////////////////////////////////////////////////
  // RNG seeded for Z2 wall
  //////////////////////////////////////////////////////////////////////
  // You can manage seeds however you like.
  // Recommend SeedUniqueString.
  //////////////////////////////////////////////////////////////////////
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString("Study2-Source_Z2_p_0_0_0_t_0-880");
  Z2WallSource  (RNG4,tslice,z2wall_source);
  GFWallSource  (tslice,gfwall_source);
  std::vector<LatticeComplex> phase(nmom,UGrid);
  for(int m=0;m<nmom;m++){
    MakePhase(momenta[m],phase[m]);
  }
  // Slots 0..nmom-1: mass 0 at each momentum; slots nmom..nmom+nmass-2: masses 1.. at p=0
  std::vector<LatticePropagator> Z2Props   (nmom+nmass-1,UGrid);
  std::vector<LatticePropagator> GFProps   (nmom+nmass-1,UGrid);
  for(int p=0;p<nmom;p++) {
    int m=0;
    int idx = make_idx(p,m,nmom);
    phased_prop = z2wall_source * phase[p];
    Solve(*FermActs[m],phased_prop  ,Z2Props[idx]);
    phased_prop = gfwall_source * phase[p];
    Solve(*FermActs[m],phased_prop  ,GFProps[idx]);
  }
  for(int m=1;m<nmass;m++) {
    int p=0;
    int idx = make_idx(p,m,nmom);
    phased_prop = z2wall_source;
    Solve(*FermActs[m],phased_prop  ,Z2Props[idx]);
    phased_prop = gfwall_source;
    Solve(*FermActs[m],phased_prop  ,GFProps[idx]);
  }
  std::vector<std::vector<Propagator> > wsnk_z2Props(nmom+nmass-1);
  std::vector<std::vector<Propagator> > wsnk_gfProps(nmom+nmass-1);
  // Non-zero kaon and point and D two point
  // WW stick momentum on m1 (lighter)
  //     zero momentum on m2
  for(int m1=0;m1<nmass;m1++) {
  for(int m2=m1;m2<nmass;m2++) {
    int pmax = (m1==0)? nmom:1;
    for(int p=0;p<pmax;p++){
      std::stringstream ssg,ssz;
      std::stringstream wssg,wssz;
      int idx1 = make_idx(p,m1,nmom);
      int idx2 = make_idx(0,m2,nmom);
      /// Point sinks
      ssg<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_p_gf_meson.xml";
      ssz<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_p_z2_meson.xml";
      MesonTrace(ssz.str(),Z2Props[idx1],Z2Props[idx2],phase[p]); // Q1 is conjugated
      MesonTrace(ssg.str(),GFProps[idx1],GFProps[idx2],phase[p]);
      /// Wall sinks
      wssg<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_w_gf_meson.xml";
      wssz<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_w_z2_meson.xml";
      // Propagators live in make_idx slots, so address them with idx1/idx2 exactly as the
      // point-sink calls above do; raw m1/m2 would select momentum slots of mass 0.
      // NOTE(review): as in the original, the sink phase multiplies the idx2 propagator
      // and its slice sum is stored in the idx1 slot - confirm this pairing is intended.
      // idx1==idx2 only happens for p==0, where phase[p] comes from zero momentum.
      phased_prop = GFProps[idx2] * phase[p];
      sliceSum(phased_prop  ,wsnk_gfProps[idx1],Tdir);
      sliceSum(GFProps[idx1],wsnk_gfProps[idx2],Tdir);
      WallSinkMesonTrace(wssg.str(),wsnk_gfProps[idx1],wsnk_gfProps[idx2]);
      // Z2 sums go into the Z2 containers that the trace below reads
      phased_prop = Z2Props[idx2] * phase[p];
      sliceSum(phased_prop  ,wsnk_z2Props[idx1],Tdir);
      sliceSum(Z2Props[idx1],wsnk_z2Props[idx2],Tdir);
      WallSinkMesonTrace(wssz.str(),wsnk_z2Props[idx1],wsnk_z2Props[idx2]);
    }
  }}
  /////////////////////////////////////
  // Sequential solves
  /////////////////////////////////////
  LatticePropagator  seq_wsnk_z2src(UGrid);
  LatticePropagator  seq_wsnk_gfsrc(UGrid);
  LatticePropagator  seq_psnk_z2src(UGrid);
  LatticePropagator  seq_psnk_gfsrc(UGrid);
  LatticePropagator source(UGrid);
  for(int m=0;m<nmass-1;m++){
    // Spectator: mass m at zero momentum; the sequential solve uses the last action
    int spect_idx = make_idx(0,m,nmom);
    int charm=nmass-1;
    SequentialSource(tseq,momenta[0],GFProps[spect_idx],source);
    Solve(*FermActs[charm],source,seq_psnk_gfsrc);
    SequentialSource(tseq,momenta[0],Z2Props[spect_idx],source);
    Solve(*FermActs[charm],source,seq_psnk_z2src);
    // Todo need wall sequential solve
    for(int p=0;p<nmom;p++){
      int active_idx = make_idx(p,0,nmom);
      std::stringstream seq_3pt_p_z2;
      std::stringstream seq_3pt_p_gf;
      std::stringstream seq_3pt_w_z2;
      std::stringstream seq_3pt_w_gf;
      seq_3pt_p_z2  <<config<<"_3pt_p"<<p<< "_m" << m << "_p_z2_meson.xml";
      seq_3pt_p_gf  <<config<<"_3pt_p"<<p<< "_m" << m << "_p_gf_meson.xml";
      seq_3pt_w_z2  <<config<<"_3pt_p"<<p<< "_m" << m << "_w_z2_meson.xml";
      seq_3pt_w_gf  <<config<<"_3pt_p"<<p<< "_m" << m << "_w_gf_meson.xml";
      Meson3pt(seq_3pt_p_gf.str(),GFProps[active_idx],seq_psnk_gfsrc,phase[p]);
      Meson3pt(seq_3pt_p_z2.str(),Z2Props[active_idx],seq_psnk_z2src,phase[p]);
    }
  }
  Grid_finalize();
 }
--- a/examples/Example_wall_wall_spectrum.cc
+++ b/examples/Example_wall_wall_spectrum.cc
@ -9,6 +9,7 @@ using namespace std;
 using namespace Grid;
 typedef SpinColourMatrix Propagator;
 typedef SpinColourVector Fermion;
 typedef PeriodicGimplR   GimplR;
 template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
 {
@ -55,6 +56,16 @@ void MakePhase(Coordinate mom,LatticeComplex &phase)
  }
  phase = exp(phase*ci);
 }
 // Applies `nstep` successive Stout smearing steps with parameter `rho` to Uin and
 // returns the result in Usmr.  Uin itself is not modified.
 void LinkSmear(int nstep, RealD rho,LatticeGaugeField &Uin,LatticeGaugeField &Usmr)
 {
  Smear_Stout<GimplR> Stout(rho);
  LatticeGaugeField Utmp(Uin.Grid());
  Utmp = Uin;
  // With nstep <= 0 the loop never runs; make the output the unsmeared field
  // instead of leaving Usmr unassigned.
  Usmr = Uin;
  for(int i=0;i<nstep;i++){
    Stout.smear(Usmr,Utmp); // Usmr <- one smearing step applied to Utmp
    Utmp = Usmr;            // feed the result into the next step
  }
 }
 void PointSource(Coordinate &coor,LatticePropagator &source)
 {
  //  Coordinate coor({0,0,0,0});
@ -97,23 +108,23 @@ void GaugeFix(LatticeGaugeField &U,LatticeGaugeField &Ufix)
 {
  Real alpha=0.05;
-  Real plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(U);
+  Real plaq=WilsonLoops<GimplR>::avgPlaquette(U);
  std::cout << " Initial plaquette "<<plaq << std::endl;
  LatticeColourMatrix   xform(U.Grid()); 
  Ufix = U;
  int orthog=Nd-1;
-  FourierAcceleratedGaugeFixer<PeriodicGimplR>::SteepestDescentGaugeFix(Ufix,xform,alpha,10000,1.0e-12, 1.0e-12,true,orthog);
+  FourierAcceleratedGaugeFixer<GimplR>::SteepestDescentGaugeFix(Ufix,xform,alpha,100000,1.0e-14, 1.0e-14,true,orthog);
-  plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Ufix);
+  plaq=WilsonLoops<GimplR>::avgPlaquette(Ufix);
  std::cout << " Final plaquette "<<plaq << std::endl;
 }
 template<class Field>
 void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
 {
-  typedef CovariantLaplacianCshift <PeriodicGimplR,Field> Laplacian_t;
+  typedef CovariantLaplacianCshift <GimplR,Field> Laplacian_t;
  Laplacian_t Laplacian(U);
  Integer Iterations = 40;
@ -167,19 +178,21 @@ void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
  GridBase *UGrid = D.GaugeGrid();
  GridBase *FGrid = D.FermionGrid();
-  LatticeFermion src4  (UGrid); 
+  LatticeFermion src4  (UGrid); src4 = Zero();
  LatticeFermion src5  (FGrid); 
  LatticeFermion result5(FGrid);
  LatticeFermion result4(UGrid);
-  ConjugateGradient<LatticeFermion> CG(1.0e-8,100000);
+  ConjugateGradient<LatticeFermion> CG(1.0e-12,100000);
-  SchurRedBlackDiagMooeeSolve<LatticeFermion> schur(CG);
+  SchurRedBlackDiagTwoSolve<LatticeFermion> schur(CG);
  ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
  std::cout<<GridLogMessage<< " source4 "<<norm2(source)<<std::endl;
  for(int s=0;s<Nd;s++){
    for(int c=0;c<Nc;c++){
      PropToFerm<Action>(src4,source,s,c);
-
+      std::cout<<GridLogMessage<< s<<c<<" src4 "<<norm2(src4)<<std::endl;
      D.ImportPhysicalFermionSource(src4,src5);
      std::cout<<GridLogMessage<< s<<c<<" src5 "<<norm2(src5)<<std::endl;
      result5=Zero();
      schur(D,src5,result5,ZG);
@ -287,15 +300,10 @@ int main (int argc, char ** argv)
 								   GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  //////////////////////////////////////////////////////////////////////
  // You can manage seeds however you like.
  // Recommend SeedUniqueString.
  //////////////////////////////////////////////////////////////////////
  std::vector<int> seeds4({1,2,3,4}); 
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
  LatticeGaugeField Umu(UGrid);
-  LatticeGaugeField Ufixed(UGrid);
+  LatticeGaugeField Utmp(UGrid);
  LatticeGaugeField Usmr(UGrid);
  std::string config;
  if( argc > 1 && argv[1][0] != '-' )
  {
@ -308,13 +316,20 @@ int main (int argc, char ** argv)
  {
    std::cout<<GridLogMessage <<"Using hot configuration"<<std::endl;
    SU<Nc>::ColdConfiguration(Umu);
-    //    SU<Nc>::HotConfiguration(RNG4,Umu);
+    config="ColdConfig";
    config="HotConfig";
  }
-  GaugeFix(Umu,Ufixed);
+  //  GaugeFix(Umu,Utmp);
-  Umu=Ufixed;
+  //  Umu=Utmp;
  int nsmr=3;
  RealD rho=0.1;
  RealD plaq_gf =WilsonLoops<GimplR>::avgPlaquette(Umu);
  LinkSmear(nsmr,rho,Umu,Usmr);
  RealD plaq_smr=WilsonLoops<GimplR>::avgPlaquette(Usmr);
  std::cout << GridLogMessage << " GF Plaquette " <<plaq_gf<<std::endl;
  std::cout << GridLogMessage << " SM Plaquette " <<plaq_smr<<std::endl;
  std::vector<int>   smeared_link({ 0,0,1} ); 
  std::vector<RealD> masses({ 0.004,0.02477,0.447} ); // u/d, s, c ??
  std::vector<RealD> M5s   ({ 1.8,1.8,1.0} ); 
  std::vector<RealD> bs   ({ 1.0,1.0,1.5} );  // DDM
@ -330,6 +345,9 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage <<"======================"<<std::endl;
  std::cout<<GridLogMessage <<"MobiusFermion action as Scaled Shamir kernel"<<std::endl;
  std::cout<<GridLogMessage <<"======================"<<std::endl;
  std::vector<Complex> boundary = {1,1,1,-1};
  typedef MobiusFermionR FermionAction;
  FermionAction::ImplParams Params(boundary);
  for(int m=0;m<masses.size();m++) {
@ -339,30 +357,40 @@ int main (int argc, char ** argv)
    RealD c    = cs[m];
    int   Ls   = Ls_s[m];
    if ( smeared_link[m] ) Utmp = Usmr;
    else                   Utmp = Umu;
    FGrids.push_back(SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid));
    FrbGrids.push_back(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid));
-    FermActs.push_back(new MobiusFermionR(Umu,*FGrids[m],*FrbGrids[m],*UGrid,*UrbGrid,mass,M5,b,c));
+    FermActs.push_back(new MobiusFermionR(Utmp,*FGrids[m],*FrbGrids[m],*UGrid,*UrbGrid,mass,M5,b,c,Params));
  }
  LatticePropagator point_source(UGrid);
  LatticePropagator z2wall_source(UGrid);
  LatticePropagator gfwall_source(UGrid);
-  Coordinate Origin({0,0,0,0});
+  int tslice = 0;
-  PointSource   (Origin,point_source);
+  //////////////////////////////////////////////////////////////////////
-  Z2WallSource  (RNG4,0,z2wall_source);
+  // RNG seeded for Z2 wall
-  GFWallSource  (0,gfwall_source);
+  //////////////////////////////////////////////////////////////////////
-  
+  // You can manage seeds however you like.
-  std::vector<LatticePropagator> PointProps(nmass,UGrid);
+  // Recommend SeedUniqueString.
-  std::vector<LatticePropagator> GaussProps(nmass,UGrid);
+  //////////////////////////////////////////////////////////////////////
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString("Study2-Source_Z2_p_0_0_0_t_0-880");
  Z2WallSource  (RNG4,tslice,z2wall_source);
  GFWallSource  (tslice,gfwall_source);
  std::vector<LatticePropagator> Z2Props   (nmass,UGrid);
  std::vector<LatticePropagator> GFProps   (nmass,UGrid);
  for(int m=0;m<nmass;m++) {
    std::cout << GridLogMessage << " Mass " <<m << " z2wall source "<<norm2(z2wall_source)<<std::endl;
    Solve(*FermActs[m],z2wall_source    ,Z2Props[m]);
    std::cout << GridLogMessage << " Mass " <<m << " gfwall source "<<norm2(gfwall_source)<<std::endl;
    Solve(*FermActs[m],gfwall_source    ,GFProps[m]);
    std::cout << GridLogMessage << " Mass " <<m << " z2wall source "<<norm2(z2wall_source)<< " " << norm2(gfwall_source)<<std::endl;
  }
@ -383,14 +411,15 @@ int main (int argc, char ** argv)
    std::stringstream wssg,wssz;
    /// Point sinks
-    ssg<<config<< "_m" << m1 << "_m"<< m2 << "p_gf_meson.xml";
+    ssg<<config<< "_m" << m1 << "_m"<< m2 << "_p_gf_meson.xml";
-    ssz<<config<< "_m" << m1 << "_m"<< m2 << "p_z2_meson.xml";
+    ssz<<config<< "_m" << m1 << "_m"<< m2 << "_p_z2_meson.xml";
    MesonTrace(ssz.str(),Z2Props[m1],Z2Props[m2],phase);
    MesonTrace(ssg.str(),GFProps[m1],GFProps[m2],phase);
    /// Wall sinks
-    wssg<<config<< "_m" << m1 << "_m"<< m2 << "w_gf_meson.xml";
+    wssg<<config<< "_m" << m1 << "_m"<< m2 << "_w_gf_meson.xml";
-    wssz<<config<< "_m" << m1 << "_m"<< m2 << "w_z2_meson.xml";
+    wssz<<config<< "_m" << m1 << "_m"<< m2 << "_w_z2_meson.xml";
    WallSinkMesonTrace(wssg.str(),wsnk_gfProps[m1],wsnk_gfProps[m2]);
    WallSinkMesonTrace(wssz.str(),wsnk_z2Props[m1],wsnk_z2Props[m2]);
--- a/systems/Crusher/config-command
+++ b/systems/Crusher/config-command
@ -0,0 +1,12 @@
 # Configure line for a HIP accelerator build (ROCm 4.5.0 headers, MPI via ${MPICH_DIR}).
 # HIPFLAGS is passed to configure like the other VAR=value settings; it used to be a
 # stray "HIPFLAGS = ..." line that the shell tried to execute as a command.
 # NOTE(review): confirm the build actually consumes HIPFLAGS.
 ../../configure --enable-comms=mpi-auto \
 --enable-unified=no \
 --enable-shm=nvlink \
 --enable-accelerator=hip \
 --enable-gen-simd-width=64 \
 --enable-simd=GPU \
 --disable-fermion-reps \
 --disable-gparity \
 CXX=hipcc MPICXX=mpicxx \
 CXXFLAGS="-fPIC -I/opt/rocm-4.5.0/include/ -std=c++14 -I${MPICH_DIR}/include " \
 LDFLAGS=" -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa " \
 HIPFLAGS="--amdgpu-target=gfx90a"
--- a/systems/Crusher/dwf.slurm
+++ b/systems/Crusher/dwf.slurm
@ -0,0 +1,30 @@
 #!/bin/bash
 # SLURM batch script: runs Benchmark_dwf_fp32 as a single task with one GPU on one node.
 # Begin SLURM directives
 #SBATCH -A LGT104
 #SBATCH -t 01:00:00
 ##SBATCH -U openmpThu
 ##SBATCH -p ecp
 #SBATCH -J DWF
 #SBATCH -o DWF.%J
 #SBATCH -e DWF.%J
 #SBATCH -N 1
 #SBATCH -n 1
 #SBATCH --exclusive  
 # DIR is set but not referenced below
 DIR=.
 module list
 # MPICH settings; the commented exports are alternatives left for experimentation
 #export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
 export MPICH_GPU_SUPPORT_ENABLED=1
 export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
 #export MPICH_SMP_SINGLE_COPY_MODE=NONE
 #export MPICH_SMP_SINGLE_COPY_MODE=CMA
 export OMP_NUM_THREADS=1
 # Value handed to --accelerator-threads
 AT=8
 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
 # $PARAMS is expanded unquoted on purpose so it splits into separate arguments
 PARAMS=" --accelerator-threads ${AT} --grid 24.24.24.24 --shm-mpi 0 --mpi 1.1.1.1"
 srun --gpus-per-task 1 -n1 ./benchmarks/Benchmark_dwf_fp32 $PARAMS
--- a/systems/Crusher/dwf4.slurm
+++ b/systems/Crusher/dwf4.slurm
@ -0,0 +1,27 @@
 #!/bin/bash
 # SLURM batch script: runs Benchmark_dwf_fp32 as 4 tasks (MPI layout 1.1.2.2) on one node,
 # one GPU per task, launched through ./mpiwrapper.sh.
 # Begin SLURM directives
 #SBATCH -A LGT104
 #SBATCH -t 01:00:00
 ##SBATCH -U openmpThu
 #SBATCH -J DWF
 #SBATCH -o DWF.%J
 #SBATCH -e DWF.%J
 #SBATCH -N 1
 #SBATCH -n 4
 #SBATCH --exclusive
 # DIR is set but not referenced below
 DIR=.
 module list
 # MPICH settings; the commented exports are alternatives left for experimentation
 export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
 export MPICH_GPU_SUPPORT_ENABLED=1
 #export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
 export MPICH_SMP_SINGLE_COPY_MODE=NONE
 #export MPICH_SMP_SINGLE_COPY_MODE=CMA
 export OMP_NUM_THREADS=4
 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
 # $PARAMS is expanded unquoted on purpose so it splits into separate arguments
 PARAMS=" --accelerator-threads 8 --grid 32.32.64.64 --mpi 1.1.2.2 --comms-overlap --shm 2048 --shm-mpi 0"
 srun --gpus-per-task 1 -n4 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS
--- a/systems/Crusher/dwf8.slurm
+++ b/systems/Crusher/dwf8.slurm
@ -0,0 +1,27 @@
 #!/bin/bash
 # SLURM batch script: runs Benchmark_dwf_fp32 as 8 tasks (MPI layout 1.2.2.2) on one node,
 # one GPU per task, launched through ./mpiwrapper.sh.
 # Begin SLURM directives
 #SBATCH -A LGT104
 #SBATCH -t 01:00:00
 ##SBATCH -U openmpThu
 #SBATCH -J DWF
 #SBATCH -o DWF.%J
 #SBATCH -e DWF.%J
 #SBATCH -N 1
 #SBATCH -n 8
 #SBATCH --exclusive
 # DIR is set but not referenced below
 DIR=.
 module list
 # MPICH settings; the commented exports are alternatives left for experimentation
 export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
 export MPICH_GPU_SUPPORT_ENABLED=1
 export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
 #export MPICH_SMP_SINGLE_COPY_MODE=NONE
 #export MPICH_SMP_SINGLE_COPY_MODE=CMA
 export OMP_NUM_THREADS=1
 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
 # $PARAMS is expanded unquoted on purpose so it splits into separate arguments
 PARAMS=" --accelerator-threads 8 --grid 32.64.64.64 --mpi 1.2.2.2 --comms-overlap --shm 2048 --shm-mpi 0"
 srun --gpus-per-task 1 -n8 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS
--- a/systems/Crusher/mpiwrapper.sh
+++ b/systems/Crusher/mpiwrapper.sh
@ -0,0 +1,12 @@
 #!/bin/bash
 # Per-task launcher: restricts the task to the GPU whose index equals its SLURM local
 # rank, prints the binding, then runs the command given as arguments.
 lrank=$SLURM_LOCALID
 export ROCR_VISIBLE_DEVICES=$SLURM_LOCALID
 # BINDING is not set in this script; it prints empty unless the environment provides it
 echo "$(hostname) - $lrank device=$ROCR_VISIBLE_DEVICES binding=$BINDING"
 # "$@" keeps every argument intact; unquoted $* would re-split arguments containing spaces
 "$@"
--- a/systems/Crusher/sourceme.sh
+++ b/systems/Crusher/sourceme.sh
@ -0,0 +1,5 @@
 # Environment setup to be sourced before configuring/building: loads the GNU programming
 # environment, ROCm 4.5.0, GMP, Cray FFTW and the gfx90a accelerator target module.
 module load PrgEnv-gnu
 module load rocm/4.5.0
 module load gmp
 module load cray-fftw
 module load craype-accel-amd-gfx90a
--- a/systems/Spock/comms.slurm
+++ b/systems/Spock/comms.slurm
@ -0,0 +1,26 @@
 #!/bin/bash
 # SLURM batch script: runs Benchmark_comms_host_device as 2 tasks (MPI layout 2.1.1.1)
 # on one node of the "ecp" partition, one GPU per task, through ./mpiwrapper.sh.
 # Begin SLURM directives
 #SBATCH -A LGT104
 #SBATCH -t 01:00:00
 ##SBATCH -U openmpThu
 #SBATCH -p ecp
 #SBATCH -J comms
 #SBATCH -o comms.%J
 #SBATCH -e comms.%J
 #SBATCH -N 1
 #SBATCH -n 2
 # DIR is set but not referenced below
 DIR=.
 module list
 # MPICH settings; the commented exports are alternatives left for experimentation
 export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
 export MPICH_GPU_SUPPORT_ENABLED=1
 #export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
 #export MPICH_SMP_SINGLE_COPY_MODE=CMA
 export MPICH_SMP_SINGLE_COPY_MODE=NONE
 # Also used as the per-task CPU count (-c) on the srun line
 export OMP_NUM_THREADS=8
 # Value handed to --accelerator-threads
 AT=8
 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
 # $PARAMS is expanded unquoted on purpose so it splits into separate arguments
 PARAMS=" --accelerator-threads ${AT} --grid 64.64.32.32 --mpi 2.1.1.1 "
 srun -n2 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_comms_host_device $PARAMS
--- a/systems/Spock/config-command
+++ b/systems/Spock/config-command
@ -0,0 +1,12 @@
 # Configure line for a HIP accelerator build (ROCm 4.3.0 headers, MPI via ${MPICH_DIR}).
 # NOTE(review): --prefix points at a user-specific home directory; adjust before use.
 ../../configure --enable-comms=mpi-auto \
 --enable-unified=no \
 --enable-shm=nvlink \
 --enable-accelerator=hip \
 --enable-gen-simd-width=64 \
 --enable-simd=GPU \
 --disable-fermion-reps \
 --disable-gparity \
 CXX=hipcc MPICXX=mpicxx \
 CXXFLAGS="-fPIC -I/opt/rocm-4.3.0/include/ -std=c++14 -I${MPICH_DIR}/include " \
 --prefix=/ccs/home/chulwoo/Grid \
 LDFLAGS=" -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa "
--- a/systems/Spock/dwf.slurm
+++ b/systems/Spock/dwf.slurm
@ -0,0 +1,26 @@
 #!/bin/bash
 # Slurm batch script: runs Benchmark_dwf_fp32 through mpiwrapper.sh on 1 node
 # with a single task and one GPU.
 # Begin Slurm (sbatch) directives
 #SBATCH -A LGT104
 #SBATCH -t 01:00:00
 # The next directive is disabled: sbatch only honours lines starting with "#SBATCH".
 ##SBATCH -U openmpThu
 #SBATCH -p ecp
 #SBATCH -J DWF
 # stdout and stderr are both directed to DWF.<jobid>.
 #SBATCH -o DWF.%J
 #SBATCH -e DWF.%J
 #SBATCH -N 1
 #SBATCH -n 1
 # NOTE(review): DIR is not referenced anywhere below in this script.
 DIR=.
 # Record the loaded modules in the job output.
 module list
 # MPI runtime settings exported to the launched task.
 export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
 export MPICH_GPU_SUPPORT_ENABLED=1
 # Active MPICH_SMP_SINGLE_COPY_MODE is CMA; the alternatives are left
 # commented out so they can be switched in by hand.
 #export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
 #export MPICH_SMP_SINGLE_COPY_MODE=NONE
 export MPICH_SMP_SINGLE_COPY_MODE=CMA
 # Also used as the per-task CPU count on the srun line (-c).
 export OMP_NUM_THREADS=8
 # AT is passed to the benchmark as --accelerator-threads.
 AT=8
 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
 # PARAMS is expanded unquoted on the srun line on purpose, so that it splits
 # into separate command-line arguments. The --mpi grid 1.1.1.1 multiplies out
 # to the single task requested with -n1.
 PARAMS=" --accelerator-threads ${AT} --grid 32.32.32.32 --mpi 1.1.1.1 --comms-overlap"
 srun -n1 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS
--- a/systems/Spock/dwf4.slurm
+++ b/systems/Spock/dwf4.slurm
@ -0,0 +1,26 @@
 #!/bin/bash
 # Slurm batch script: runs Benchmark_dwf_fp32 through mpiwrapper.sh on 1 node
 # with 4 tasks and one GPU per task.
 # Begin Slurm (sbatch) directives
 #SBATCH -A LGT104
 #SBATCH -t 01:00:00
 # The next directive is disabled: sbatch only honours lines starting with "#SBATCH".
 ##SBATCH -U openmpThu
 #SBATCH -p ecp
 #SBATCH -J DWF
 # stdout and stderr are both directed to DWF.<jobid>.
 #SBATCH -o DWF.%J
 #SBATCH -e DWF.%J
 #SBATCH -N 1
 #SBATCH -n 4
 # NOTE(review): DIR is not referenced anywhere below in this script.
 DIR=.
 # Record the loaded modules in the job output.
 module list
 # MPI runtime settings exported to the launched tasks.
 export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
 export MPICH_GPU_SUPPORT_ENABLED=1
 # Active MPICH_SMP_SINGLE_COPY_MODE is NONE; the alternatives are left
 # commented out so they can be switched in by hand.
 #export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
 export MPICH_SMP_SINGLE_COPY_MODE=NONE
 #export MPICH_SMP_SINGLE_COPY_MODE=CMA
 # Also used as the per-task CPU count on the srun line (-c).
 export OMP_NUM_THREADS=8
 # AT is passed to the benchmark as --accelerator-threads.
 AT=8
 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
 # PARAMS is expanded unquoted on the srun line on purpose, so that it splits
 # into separate command-line arguments. The --mpi grid 1.1.2.2 multiplies out
 # to the 4 tasks requested with -n4.
 PARAMS=" --accelerator-threads ${AT} --grid 32.32.64.64 --mpi 1.1.2.2 --comms-overlap --shm 2048 --shm-mpi 0"
 srun -n4 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS
--- a/systems/Spock/dwf8.slurm
+++ b/systems/Spock/dwf8.slurm
@ -0,0 +1,26 @@
 #!/bin/bash
 # Slurm batch script: runs Benchmark_dwf_fp32 through mpiwrapper.sh on 2 nodes
 # with 8 tasks in total and one GPU per task.
 # Begin Slurm (sbatch) directives
 #SBATCH -A LGT104
 #SBATCH -t 01:00:00
 # The next directive is disabled: sbatch only honours lines starting with "#SBATCH".
 ##SBATCH -U openmpThu
 #SBATCH -p ecp
 #SBATCH -J DWF
 # stdout and stderr are both directed to DWF.<jobid>.
 #SBATCH -o DWF.%J
 #SBATCH -e DWF.%J
 #SBATCH -N 2
 #SBATCH -n 8
 # NOTE(review): DIR is not referenced anywhere below in this script.
 DIR=.
 # Record the loaded modules in the job output.
 module list
 # MPI runtime settings exported to the launched tasks.
 export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
 export MPICH_GPU_SUPPORT_ENABLED=1
 # Active MPICH_SMP_SINGLE_COPY_MODE is NONE; the alternatives are left
 # commented out so they can be switched in by hand.
 #export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
 export MPICH_SMP_SINGLE_COPY_MODE=NONE
 #export MPICH_SMP_SINGLE_COPY_MODE=CMA
 # Also used as the per-task CPU count on the srun line (-c).
 export OMP_NUM_THREADS=8
 # AT is passed to the benchmark as --accelerator-threads.
 AT=8
 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
 # PARAMS is expanded unquoted on the srun line on purpose, so that it splits
 # into separate command-line arguments. The --mpi grid 1.2.2.2 multiplies out
 # to the 8 tasks requested with -n8.
 PARAMS=" --accelerator-threads ${AT} --grid 32.64.64.64 --mpi 1.2.2.2 --comms-overlap --shm 2048 --shm-mpi 0"
 srun -n8 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS
--- a/systems/Spock/mpiwrapper.sh
+++ b/systems/Spock/mpiwrapper.sh
@ -0,0 +1,12 @@
 #!/bin/bash
 # Per-task launch wrapper: selects a GPU for this task, logs the choice, then
 # runs the command given as arguments.
 #
 # Usage: mpiwrapper.sh <program> [args...]
 #
 # SLURM_LOCALID is the node-local task id that Slurm exports for each task.
 lrank=$SLURM_LOCALID
 # Set the ROCm device selection variable to the node-local task id
 # (presumably so that each task sees a single, distinct GPU -- confirm against
 # the ROCm documentation for ROCR_VISIBLE_DEVICES).
 export ROCR_VISIBLE_DEVICES=$SLURM_LOCALID
 # BINDING is not set anywhere in this script; it prints empty unless the
 # caller exports it.
 echo "$(hostname) - $lrank device=$ROCR_VISIBLE_DEVICES binding=$BINDING"
 # "$@" forwards every argument exactly as received. The previous unquoted $*
 # re-split arguments on whitespace and glob-expanded them, which corrupts any
 # argument containing spaces or wildcard characters.
 "$@"
--- a/systems/Spock/sourceme.sh
+++ b/systems/Spock/sourceme.sh
@ -0,0 +1,5 @@
 # Environment setup for this system: loads the modules listed below, in order.
 # The file name suggests it is meant to be sourced into the current shell
 # rather than executed -- confirm with the build instructions.
 # NOTE(review): ROCm is pinned to 4.3.0 and the accelerator target to gfx908;
 # revisit both when the system software stack changes.
 module load PrgEnv-gnu
 module load rocm/4.3.0
 module load gmp
 module load cray-fftw
 module load craype-accel-amd-gfx908
--- a/systems/Summit/comms.4node
+++ b/systems/Summit/comms.4node
@ -0,0 +1,179 @@
 OPENMPI detected
 AcceleratorCudaInit[0]: ========================
 AcceleratorCudaInit[0]: Device Number    : 0
 AcceleratorCudaInit[0]: ========================
 AcceleratorCudaInit[0]: Device identifier: Tesla V100-SXM2-16GB
 AcceleratorCudaInit[0]:   totalGlobalMem: 16911433728 
 AcceleratorCudaInit[0]:   managedMemory: 1 
 AcceleratorCudaInit[0]:   isMultiGpuBoard: 0 
 AcceleratorCudaInit[0]:   warpSize: 32 
 AcceleratorCudaInit[0]:   pciBusID: 4 
 AcceleratorCudaInit[0]:   pciDeviceID: 0 
 AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
 AcceleratorCudaInit: rank 0 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 0 device 0 bus id: 0004:04:00.0
 AcceleratorCudaInit: ================================================
 SharedMemoryMpi:  World communicator of size 24
 SharedMemoryMpi:  Node  communicator of size 6
 0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 1073741824bytes at 0x200060000000 for comms buffers 
 Setting up IPC
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|_ |  |  |  |  |  |  |  |  |  |  |  | _|__
 __|_                                    _|__
 __|_   GGGG    RRRR    III    DDDD      _|__
 __|_  G        R   R    I     D   D     _|__
 __|_  G        R   R    I     D    D    _|__
 __|_  G  GG    RRRR     I     D    D    _|__
 __|_  G   G    R  R     I     D   D     _|__
 __|_   GGGG    R   R   III    DDDD      _|__
 __|_                                    _|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
  |  |  |  |  |  |  |  |  |  |  |  |  |  |  
 Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 Current Grid git commit hash=7cb1ff7395a5833ded6526c43891bd07a0436290: (HEAD -> develop, origin/develop, origin/HEAD) clean
 Grid : Message : ================================================ 
 Grid : Message : MPI is initialised and logging filters activated 
 Grid : Message : ================================================ 
 Grid : Message : Requested 1073741824 byte stencil comms buffers 
 AcceleratorCudaInit: rank 1 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 1 device 1 bus id: 0004:05:00.0
 AcceleratorCudaInit: rank 2 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 2 device 2 bus id: 0004:06:00.0
 AcceleratorCudaInit: rank 5 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 5 device 5 bus id: 0035:05:00.0
 AcceleratorCudaInit: rank 4 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 4 device 4 bus id: 0035:04:00.0
 AcceleratorCudaInit: rank 3 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 3 device 3 bus id: 0035:03:00.0
 Grid : Message : MemoryManager Cache 13529146982 bytes 
 Grid : Message : MemoryManager::Init() setting up
 Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
 Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
 Grid : Message : MemoryManager::Init() Using cudaMalloc
 Grid : Message : 2.137929 s : Grid is setup to use 6 threads
 Grid : Message : 2.137941 s : Number of iterations to average: 250
 Grid : Message : 2.137950 s : ====================================================================================================
 Grid : Message : 2.137958 s : = Benchmarking sequential halo exchange from host memory 
 Grid : Message : 2.137966 s : ====================================================================================================
 Grid : Message : 2.137974 s :  L  	 Ls  	    bytes		MB/s uni	MB/s bidi
 AcceleratorCudaInit: rank 22 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 10 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 15 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 21 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 20 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 7 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 9 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 11 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 8 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 6 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 19 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 23 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 18 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 12 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 16 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 13 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 14 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 17 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 Grid : Message : 2.604949 s :    8	8	     393216       89973.9  		179947.8
 Grid : Message : 2.668249 s :    8	8	     393216       18650.3  		37300.5
 Grid : Message : 2.732288 s :    8	8	     393216       18428.5  		36857.1
 Grid : Message : 2.753565 s :    8	8	     393216       55497.2  		110994.4
 Grid : Message : 2.808960 s :   12	8	    1327104       100181.5  		200363.0
 Grid : Message : 3.226900 s :   12	8	    1327104       20600.5  		41201.0
 Grid : Message : 3.167459 s :   12	8	    1327104       24104.6  		48209.2
 Grid : Message : 3.227660 s :   12	8	    1327104       66156.7  		132313.5
 Grid : Message : 3.413570 s :   16	8	    3145728       56174.4  		112348.8
 Grid : Message : 3.802697 s :   16	8	    3145728       24255.9  		48511.7
 Grid : Message : 4.190498 s :   16	8	    3145728       24336.7  		48673.4
 Grid : Message : 4.385171 s :   16	8	    3145728       48484.1  		96968.2
 Grid : Message : 4.805284 s :   20	8	    6144000       46380.5  		92761.1
 Grid : Message : 5.562975 s :   20	8	    6144000       24328.5  		48656.9
 Grid : Message : 6.322562 s :   20	8	    6144000       24266.7  		48533.4
 Grid : Message : 6.773598 s :   20	8	    6144000       40868.5  		81736.9
 Grid : Message : 7.600999 s :   24	8	   10616832       40198.3  		80396.6
 Grid : Message : 8.912917 s :   24	8	   10616832       24279.5  		48559.1
 Grid : Message : 10.220961 s :   24	8	   10616832       24350.2  		48700.4
 Grid : Message : 11.728250 s :   24	8	   10616832       37390.9  		74781.8
 Grid : Message : 12.497258 s :   28	8	   16859136       36792.2  		73584.5
 Grid : Message : 14.585387 s :   28	8	   16859136       24222.2  		48444.3
 Grid : Message : 16.664783 s :   28	8	   16859136       24323.4  		48646.8
 Grid : Message : 17.955238 s :   28	8	   16859136       39194.7  		78389.4
 Grid : Message : 20.136479 s :   32	8	   25165824       35718.3  		71436.5
 Grid : Message : 23.241958 s :   32	8	   25165824       24311.4  		48622.9
 Grid : Message : 26.344810 s :   32	8	   25165824       24331.9  		48663.7
 Grid : Message : 28.384420 s :   32	8	   25165824       37016.3  		74032.7
 Grid : Message : 28.388879 s : ====================================================================================================
 Grid : Message : 28.388894 s : = Benchmarking sequential halo exchange from GPU memory 
 Grid : Message : 28.388909 s : ====================================================================================================
 Grid : Message : 28.388924 s :  L  	 Ls  	    bytes		MB/s uni	MB/s bidi
 Grid : Message : 28.553993 s :    8	8	     393216       8272.4  		16544.7
 Grid : Message : 28.679592 s :    8	8	     393216       9395.4  		18790.8
 Grid : Message : 28.811112 s :    8	8	     393216       8971.0  		17942.0
 Grid : Message : 28.843770 s :    8	8	     393216       36145.6  		72291.2
 Grid : Message : 28.981754 s :   12	8	    1327104       49591.6  		99183.2
 Grid : Message : 29.299764 s :   12	8	    1327104       12520.8  		25041.7
 Grid : Message : 29.620288 s :   12	8	    1327104       12422.2  		24844.4
 Grid : Message : 29.657645 s :   12	8	    1327104       106637.5  		213275.1
 Grid : Message : 29.952933 s :   16	8	    3145728       43939.2  		87878.5
 Grid : Message : 30.585411 s :   16	8	    3145728       14922.1  		29844.2
 Grid : Message : 31.219781 s :   16	8	    3145728       14877.2  		29754.4
 Grid : Message : 31.285017 s :   16	8	    3145728       144724.3  		289448.7
 Grid : Message : 31.706443 s :   20	8	    6144000       54676.2  		109352.4
 Grid : Message : 32.739205 s :   20	8	    6144000       17848.0  		35696.1
 Grid : Message : 33.771852 s :   20	8	    6144000       17849.9  		35699.7
 Grid : Message : 33.871981 s :   20	8	    6144000       184141.4  		368282.8
 Grid : Message : 34.536808 s :   24	8	   10616832       55784.3  		111568.6
 Grid : Message : 36.275648 s :   24	8	   10616832       18317.6  		36635.3
 Grid : Message : 37.997181 s :   24	8	   10616832       18501.7  		37003.4
 Grid : Message : 38.140442 s :   24	8	   10616832       222383.9  		444767.9
 Grid : Message : 39.177222 s :   28	8	   16859136       56609.7  		113219.4
 Grid : Message : 41.874755 s :   28	8	   16859136       18749.9  		37499.8
 Grid : Message : 44.529381 s :   28	8	   16859136       19052.9  		38105.8
 Grid : Message : 44.742192 s :   28	8	   16859136       237717.1  		475434.2
 Grid : Message : 46.184000 s :   32	8	   25165824       57091.2  		114182.4
 Grid : Message : 50.734740 s :   32	8	   25165824       19411.0  		38821.9
 Grid : Message : 53.931228 s :   32	8	   25165824       19570.6  		39141.2
 Grid : Message : 54.238467 s :   32	8	   25165824       245765.6  		491531.2
 Grid : Message : 54.268664 s : ====================================================================================================
 Grid : Message : 54.268680 s : = All done; Bye Bye
 Grid : Message : 54.268691 s : ====================================================================================================
--- a/systems/Summit/config-command
+++ b/systems/Summit/config-command
@ -0,0 +1,14 @@
 # Configure invocation for a CUDA accelerator build: nvcc is the compiler,
 # with mpicxx as its host compiler (-ccbin) and code generated for sm_70.
 # It calls ../../configure, so it is expected to be run from a build directory
 # two levels below the source root.
 # NOTE(review): --prefix and the -L/-I paths point into one user's home
 # directory (/ccs/home/paboyle/prefix); adjust them before reuse.
 # No comments may be placed inside the command below: a comment line would
 # break the backslash continuation.
 ../../configure --enable-comms=mpi \
 	      --enable-simd=GPU \
 	      --enable-gen-simd-width=32 \
 	      --enable-unified=no \
 	       --enable-shm=nvlink \
 	       --disable-gparity \
 	       --enable-setdevice \
 	       --disable-fermion-reps \
 	       --enable-accelerator=cuda \
 	       --prefix /ccs/home/paboyle/prefix \
 	       CXX=nvcc \
 	       LDFLAGS=-L/ccs/home/paboyle/prefix/lib/ \
 	       CXXFLAGS="-ccbin mpicxx -gencode arch=compute_70,code=sm_70 -I/ccs/home/paboyle/prefix/include/ -std=c++14"
--- a/systems/Summit/dwf.24.4node
+++ b/systems/Summit/dwf.24.4node
@ -0,0 +1,206 @@
 OPENMPI detected
 AcceleratorCudaInit[0]: ========================
 AcceleratorCudaInit[0]: Device Number    : 0
 AcceleratorCudaInit[0]: ========================
 AcceleratorCudaInit[0]: Device identifier: Tesla V100-SXM2-16GB
 AcceleratorCudaInit[0]:   totalGlobalMem: 16911433728 
 AcceleratorCudaInit[0]:   managedMemory: 1 
 AcceleratorCudaInit[0]:   isMultiGpuBoard: 0 
 AcceleratorCudaInit[0]:   warpSize: 32 
 AcceleratorCudaInit[0]:   pciBusID: 4 
 AcceleratorCudaInit[0]:   pciDeviceID: 0 
 AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
 AcceleratorCudaInit: rank 0 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 0 device 0 bus id: 0004:04:00.0
 AcceleratorCudaInit: ================================================
 SharedMemoryMpi:  World communicator of size 24
 SharedMemoryMpi:  Node  communicator of size 6
 0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x200080000000 for comms buffers 
 AcceleratorCudaInit: rank 3 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 3 device 3 bus id: 0035:03:00.0
 AcceleratorCudaInit: rank 5 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 5 device 5 bus id: 0035:05:00.0
 Setting up IPC
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|_ |  |  |  |  |  |  |  |  |  |  |  | _|__
 __|_                                    _|__
 __|_   GGGG    RRRR    III    DDDD      _|__
 __|_  G        R   R    I     D   D     _|__
 __|_  G        R   R    I     D    D    _|__
 __|_  G  GG    RRRR     I     D    D    _|__
 __|_  G   G    R  R     I     D   D     _|__
 __|_   GGGG    R   R   III    DDDD      _|__
 __|_                                    _|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
  |  |  |  |  |  |  |  |  |  |  |  |  |  |  
 Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 AcceleratorCudaInit: rank 4 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 4 device 4 bus id: 0035:04:00.0
 AcceleratorCudaInit: rank 1 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 1 device 1 bus id: 0004:05:00.0
 AcceleratorCudaInit: rank 2 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 2 device 2 bus id: 0004:06:00.0
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 Current Grid git commit hash=7cb1ff7395a5833ded6526c43891bd07a0436290: (HEAD -> develop, origin/develop, origin/HEAD) clean
 Grid : Message : ================================================ 
 Grid : Message : MPI is initialised and logging filters activated 
 Grid : Message : ================================================ 
 Grid : Message : Requested 2147483648 byte stencil comms buffers 
 Grid : Message : MemoryManager Cache 8388608000 bytes 
 Grid : Message : MemoryManager::Init() setting up
 Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
 Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
 Grid : Message : MemoryManager::Init() Using cudaMalloc
 Grid : Message : 1.731905 s : Grid Layout
 Grid : Message : 1.731915 s : 	Global lattice size  : 48 48 48 72 
 Grid : Message : 1.731928 s : 	OpenMP threads       : 6
 Grid : Message : 1.731938 s : 	MPI tasks            : 2 2 2 3 
 AcceleratorCudaInit: rank 9 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 23 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 22 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 21 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 18 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 6 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 7 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 10 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 8 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 11 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 20 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 19 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 13 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 12 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 14 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 16 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 15 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 17 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 Grid : Message : 2.683494 s : Making s innermost grids
 Grid : Message : 2.780034 s : Initialising 4d RNG
 Grid : Message : 2.833099 s : Intialising parallel RNG with unique string 'The 4D RNG'
 Grid : Message : 2.833121 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
 Grid : Message : 2.916841 s : Initialising 5d RNG
 Grid : Message : 3.762880 s : Intialising parallel RNG with unique string 'The 5D RNG'
 Grid : Message : 3.762902 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
 Grid : Message : 5.264345 s : Initialised RNGs
 Grid : Message : 6.489904 s : Drawing gauge field
 Grid : Message : 6.729262 s : Random gauge initialised 
 Grid : Message : 7.781273 s : Setting up Cshift based reference 
 Grid : Message : 8.725313 s : *****************************************************************
 Grid : Message : 8.725332 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
 Grid : Message : 8.725342 s : *****************************************************************
 Grid : Message : 8.725352 s : *****************************************************************
 Grid : Message : 8.725362 s : * Benchmarking DomainWallFermionR::Dhop                  
 Grid : Message : 8.725372 s : * Vectorising space-time by 4
 Grid : Message : 8.725383 s : * VComplexF size is 32 B
 Grid : Message : 8.725395 s : * SINGLE precision 
 Grid : Message : 8.725405 s : * Using Overlapped Comms/Compute
 Grid : Message : 8.725415 s : * Using GENERIC Nc WilsonKernels
 Grid : Message : 8.725425 s : *****************************************************************
 Grid : Message : 9.465229 s : Called warmup
 Grid : Message : 58.646066 s : Called Dw 3000 times in 4.91764e+07 us
 Grid : Message : 58.646121 s : mflop/s =   1.02592e+07
 Grid : Message : 58.646134 s : mflop/s per rank =  427468
 Grid : Message : 58.646145 s : mflop/s per node =  2.56481e+06
 Grid : Message : 58.646156 s : RF  GiB/s (base 2) =   20846.5
 Grid : Message : 58.646166 s : mem GiB/s (base 2) =   13029.1
 Grid : Message : 58.648008 s : norm diff   1.04778e-13
 Grid : Message : 58.734885 s : #### Dhop calls report 
 Grid : Message : 58.734897 s : WilsonFermion5D Number of DhopEO Calls   : 6002
 Grid : Message : 58.734909 s : WilsonFermion5D TotalTime   /Calls        : 8217.71 us
 Grid : Message : 58.734922 s : WilsonFermion5D CommTime    /Calls        : 7109.5 us
 Grid : Message : 58.734933 s : WilsonFermion5D FaceTime    /Calls        : 446.623 us
 Grid : Message : 58.734943 s : WilsonFermion5D ComputeTime1/Calls        : 18.0558 us
 Grid : Message : 58.734953 s : WilsonFermion5D ComputeTime2/Calls        : 731.097 us
 Grid : Message : 58.734979 s : Average mflops/s per call                : 4.8157e+09
 Grid : Message : 58.734989 s : Average mflops/s per call per rank       : 2.00654e+08
 Grid : Message : 58.734999 s : Average mflops/s per call per node       : 1.20393e+09
 Grid : Message : 58.735008 s : Average mflops/s per call (full)         : 1.04183e+07
 Grid : Message : 58.735017 s : Average mflops/s per call per rank (full): 434094
 Grid : Message : 58.735026 s : Average mflops/s per call per node (full): 2.60456e+06
 Grid : Message : 58.735035 s : WilsonFermion5D Stencil
 Grid : Message : 58.735043 s : WilsonFermion5D StencilEven
 Grid : Message : 58.735051 s : WilsonFermion5D StencilOdd
 Grid : Message : 58.735059 s : WilsonFermion5D Stencil     Reporti()
 Grid : Message : 58.735067 s : WilsonFermion5D StencilEven Reporti()
 Grid : Message : 58.735075 s : WilsonFermion5D StencilOdd  Reporti()
 Grid : Message : 64.934380 s : Compare to naive wilson implementation Dag to verify correctness
 Grid : Message : 64.934740 s : Called DwDag
 Grid : Message : 64.934870 s : norm dag result 12.0422
 Grid : Message : 64.120756 s : norm dag ref    12.0422
 Grid : Message : 64.149389 s : norm dag diff   7.6644e-14
 Grid : Message : 64.317786 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
 Grid : Message : 64.465331 s : src_e0.499995
 Grid : Message : 64.524653 s : src_o0.500005
 Grid : Message : 64.558706 s : *********************************************************
 Grid : Message : 64.558717 s : * Benchmarking DomainWallFermionF::DhopEO                
 Grid : Message : 64.558727 s : * Vectorising space-time by 4
 Grid : Message : 64.558737 s : * SINGLE precision 
 Grid : Message : 64.558745 s : * Using Overlapped Comms/Compute
 Grid : Message : 64.558753 s : * Using GENERIC Nc WilsonKernels
 Grid : Message : 64.558761 s : *********************************************************
 Grid : Message : 92.702145 s : Deo mflop/s =   8.97692e+06
 Grid : Message : 92.702185 s : Deo mflop/s per rank   374038
 Grid : Message : 92.702198 s : Deo mflop/s per node   2.24423e+06
 Grid : Message : 92.702209 s : #### Dhop calls report 
 Grid : Message : 92.702223 s : WilsonFermion5D Number of DhopEO Calls   : 3001
 Grid : Message : 92.702240 s : WilsonFermion5D TotalTime   /Calls        : 9377.88 us
 Grid : Message : 92.702257 s : WilsonFermion5D CommTime    /Calls        : 8221.84 us
 Grid : Message : 92.702277 s : WilsonFermion5D FaceTime    /Calls        : 543.548 us
 Grid : Message : 92.702301 s : WilsonFermion5D ComputeTime1/Calls        : 20.936 us
 Grid : Message : 92.702322 s : WilsonFermion5D ComputeTime2/Calls        : 732.33 us
 Grid : Message : 92.702376 s : Average mflops/s per call                : 4.13001e+09
 Grid : Message : 92.702387 s : Average mflops/s per call per rank       : 1.72084e+08
 Grid : Message : 92.702397 s : Average mflops/s per call per node       : 1.0325e+09
 Grid : Message : 92.702407 s : Average mflops/s per call (full)         : 9.12937e+06
 Grid : Message : 92.702416 s : Average mflops/s per call per rank (full): 380391
 Grid : Message : 92.702426 s : Average mflops/s per call per node (full): 2.28234e+06
 Grid : Message : 92.702435 s : WilsonFermion5D Stencil
 Grid : Message : 92.702443 s : WilsonFermion5D StencilEven
 Grid : Message : 92.702451 s : WilsonFermion5D StencilOdd
 Grid : Message : 92.702459 s : WilsonFermion5D Stencil     Reporti()
 Grid : Message : 92.702467 s : WilsonFermion5D StencilEven Reporti()
 Grid : Message : 92.702475 s : WilsonFermion5D StencilOdd  Reporti()
 Grid : Message : 92.772983 s : r_e6.02121
 Grid : Message : 92.786384 s : r_o6.02102
 Grid : Message : 92.799622 s : res12.0422
 Grid : Message : 93.860500 s : norm diff   0
 Grid : Message : 93.162026 s : norm diff even  0
 Grid : Message : 93.197529 s : norm diff odd   0
--- a/systems/Summit/dwf.32.4node
+++ b/systems/Summit/dwf.32.4node
@ -0,0 +1,206 @@
 OPENMPI detected
 AcceleratorCudaInit[0]: ========================
 AcceleratorCudaInit[0]: Device Number    : 0
 AcceleratorCudaInit[0]: ========================
 AcceleratorCudaInit[0]: Device identifier: Tesla V100-SXM2-16GB
 AcceleratorCudaInit[0]:   totalGlobalMem: 16911433728 
 AcceleratorCudaInit[0]:   managedMemory: 1 
 AcceleratorCudaInit[0]:   isMultiGpuBoard: 0 
 AcceleratorCudaInit[0]:   warpSize: 32 
 AcceleratorCudaInit[0]:   pciBusID: 4 
 AcceleratorCudaInit[0]:   pciDeviceID: 0 
 AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
 AcceleratorCudaInit: rank 0 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 0 device 0 bus id: 0004:04:00.0
 AcceleratorCudaInit: ================================================
 SharedMemoryMpi:  World communicator of size 24
 SharedMemoryMpi:  Node  communicator of size 6
 0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x200080000000 for comms buffers 
 Setting up IPC
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|_ |  |  |  |  |  |  |  |  |  |  |  | _|__
 __|_                                    _|__
 __|_   GGGG    RRRR    III    DDDD      _|__
 __|_  G        R   R    I     D   D     _|__
 __|_  G        R   R    I     D    D    _|__
 __|_  G  GG    RRRR     I     D    D    _|__
 __|_  G   G    R  R     I     D   D     _|__
 __|_   GGGG    R   R   III    DDDD      _|__
 __|_                                    _|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
  |  |  |  |  |  |  |  |  |  |  |  |  |  |  
 Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 AcceleratorCudaInit: rank 2 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 2 device 2 bus id: 0004:06:00.0
 AcceleratorCudaInit: rank 1 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 1 device 1 bus id: 0004:05:00.0
 AcceleratorCudaInit: rank 4 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 4 device 4 bus id: 0035:04:00.0
 AcceleratorCudaInit: rank 3 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 3 device 3 bus id: 0035:03:00.0
 AcceleratorCudaInit: rank 5 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 5 device 5 bus id: 0035:05:00.0
 GNU General Public License for more details.
 Current Grid git commit hash=7cb1ff7395a5833ded6526c43891bd07a0436290: (HEAD -> develop, origin/develop, origin/HEAD) clean
 Grid : Message : ================================================ 
 Grid : Message : MPI is initialised and logging filters activated 
 Grid : Message : ================================================ 
 Grid : Message : Requested 2147483648 byte stencil comms buffers 
 Grid : Message : MemoryManager Cache 8388608000 bytes 
 Grid : Message : MemoryManager::Init() setting up
 Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
 Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
 Grid : Message : MemoryManager::Init() Using cudaMalloc
 Grid : Message : 1.544984 s : Grid Layout
 Grid : Message : 1.544992 s : 	Global lattice size  : 64 64 64 96 
 Grid : Message : 1.545003 s : 	OpenMP threads       : 6
 Grid : Message : 1.545011 s : 	MPI tasks            : 2 2 2 3 
 AcceleratorCudaInit: rank 8 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 6 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 11 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 16 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 17 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 13 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 12 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 21 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 23 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 22 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 19 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 18 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 7 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 10 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 9 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 14 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 15 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 20 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 Grid : Message : 2.994920 s : Making s innermost grids
 Grid : Message : 2.232502 s : Initialising 4d RNG
 Grid : Message : 2.397047 s : Intialising parallel RNG with unique string 'The 4D RNG'
 Grid : Message : 2.397069 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
 Grid : Message : 2.653140 s : Initialising 5d RNG
 Grid : Message : 5.285347 s : Intialising parallel RNG with unique string 'The 5D RNG'
 Grid : Message : 5.285369 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
 Grid : Message : 9.994738 s : Initialised RNGs
 Grid : Message : 13.153426 s : Drawing gauge field
 Grid : Message : 13.825697 s : Random gauge initialised 
 Grid : Message : 18.537657 s : Setting up Cshift based reference 
 Grid : Message : 22.296755 s : *****************************************************************
 Grid : Message : 22.296781 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
 Grid : Message : 22.296791 s : *****************************************************************
 Grid : Message : 22.296800 s : *****************************************************************
 Grid : Message : 22.296809 s : * Benchmarking DomainWallFermionR::Dhop                  
 Grid : Message : 22.296818 s : * Vectorising space-time by 4
 Grid : Message : 22.296828 s : * VComplexF size is 32 B
 Grid : Message : 22.296838 s : * SINGLE precision 
 Grid : Message : 22.296847 s : * Using Overlapped Comms/Compute
 Grid : Message : 22.296855 s : * Using GENERIC Nc WilsonKernels
 Grid : Message : 22.296863 s : *****************************************************************
 Grid : Message : 24.746452 s : Called warmup
 Grid : Message : 137.525756 s : Called Dw 3000 times in 1.12779e+08 us
 Grid : Message : 137.525818 s : mflop/s =   1.41383e+07
 Grid : Message : 137.525831 s : mflop/s per rank =  589097
 Grid : Message : 137.525843 s : mflop/s per node =  3.53458e+06
 Grid : Message : 137.525854 s : RF  GiB/s (base 2) =   28728.7
 Grid : Message : 137.525864 s : mem GiB/s (base 2) =   17955.5
 Grid : Message : 137.693645 s : norm diff   1.04885e-13
 Grid : Message : 137.965585 s : #### Dhop calls report 
 Grid : Message : 137.965598 s : WilsonFermion5D Number of DhopEO Calls   : 6002
 Grid : Message : 137.965612 s : WilsonFermion5D TotalTime   /Calls        : 18899.7 us
 Grid : Message : 137.965624 s : WilsonFermion5D CommTime    /Calls        : 16041.4 us
 Grid : Message : 137.965634 s : WilsonFermion5D FaceTime    /Calls        : 859.705 us
 Grid : Message : 137.965644 s : WilsonFermion5D ComputeTime1/Calls        : 70.5881 us
 Grid : Message : 137.965654 s : WilsonFermion5D ComputeTime2/Calls        : 2094.8 us
 Grid : Message : 137.965682 s : Average mflops/s per call                : 3.87638e+09
 Grid : Message : 137.965692 s : Average mflops/s per call per rank       : 1.61516e+08
 Grid : Message : 137.965702 s : Average mflops/s per call per node       : 9.69095e+08
 Grid : Message : 137.965712 s : Average mflops/s per call (full)         : 1.43168e+07
 Grid : Message : 137.965721 s : Average mflops/s per call per rank (full): 596533
 Grid : Message : 137.965730 s : Average mflops/s per call per node (full): 3.5792e+06
 Grid : Message : 137.965740 s : WilsonFermion5D Stencil
 Grid : Message : 137.965748 s : WilsonFermion5D StencilEven
 Grid : Message : 137.965756 s : WilsonFermion5D StencilOdd
 Grid : Message : 137.965764 s : WilsonFermion5D Stencil     Reporti()
 Grid : Message : 137.965772 s : WilsonFermion5D StencilEven Reporti()
 Grid : Message : 137.965780 s : WilsonFermion5D StencilOdd  Reporti()
 Grid : Message : 156.554605 s : Compare to naive wilson implementation Dag to verify correctness
 Grid : Message : 156.554632 s : Called DwDag
 Grid : Message : 156.554642 s : norm dag result 12.0421
 Grid : Message : 156.639265 s : norm dag ref    12.0421
 Grid : Message : 156.888281 s : norm dag diff   7.62057e-14
 Grid : Message : 157.609797 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
 Grid : Message : 158.208630 s : src_e0.499996
 Grid : Message : 158.162447 s : src_o0.500004
 Grid : Message : 158.267780 s : *********************************************************
 Grid : Message : 158.267791 s : * Benchmarking DomainWallFermionF::DhopEO                
 Grid : Message : 158.267801 s : * Vectorising space-time by 4
 Grid : Message : 158.267811 s : * SINGLE precision 
 Grid : Message : 158.267820 s : * Using Overlapped Comms/Compute
 Grid : Message : 158.267828 s : * Using GENERIC Nc WilsonKernels
 Grid : Message : 158.267836 s : *********************************************************
 Grid : Message : 216.487829 s : Deo mflop/s =   1.37283e+07
 Grid : Message : 216.487869 s : Deo mflop/s per rank   572011
 Grid : Message : 216.487881 s : Deo mflop/s per node   3.43206e+06
 Grid : Message : 216.487893 s : #### Dhop calls report 
 Grid : Message : 216.487903 s : WilsonFermion5D Number of DhopEO Calls   : 3001
 Grid : Message : 216.487913 s : WilsonFermion5D TotalTime   /Calls        : 19399.6 us
 Grid : Message : 216.487923 s : WilsonFermion5D CommTime    /Calls        : 16475.4 us
 Grid : Message : 216.487933 s : WilsonFermion5D FaceTime    /Calls        : 972.393 us
 Grid : Message : 216.487943 s : WilsonFermion5D ComputeTime1/Calls        : 49.8474 us
 Grid : Message : 216.487953 s : WilsonFermion5D ComputeTime2/Calls        : 2089.93 us
 Grid : Message : 216.488001 s : Average mflops/s per call                : 5.39682e+09
 Grid : Message : 216.488011 s : Average mflops/s per call per rank       : 2.24867e+08
 Grid : Message : 216.488020 s : Average mflops/s per call per node       : 1.3492e+09
 Grid : Message : 216.488030 s : Average mflops/s per call (full)         : 1.39479e+07
 Grid : Message : 216.488039 s : Average mflops/s per call per rank (full): 581162
 Grid : Message : 216.488048 s : Average mflops/s per call per node (full): 3.48697e+06
 Grid : Message : 216.488057 s : WilsonFermion5D Stencil
 Grid : Message : 216.488065 s : WilsonFermion5D StencilEven
 Grid : Message : 216.488073 s : WilsonFermion5D StencilOdd
 Grid : Message : 216.488081 s : WilsonFermion5D Stencil     Reporti()
 Grid : Message : 216.488089 s : WilsonFermion5D StencilEven Reporti()
 Grid : Message : 216.488097 s : WilsonFermion5D StencilOdd  Reporti()
 Grid : Message : 217.384495 s : r_e6.02113
 Grid : Message : 217.426121 s : r_o6.02096
 Grid : Message : 217.472636 s : res12.0421
 Grid : Message : 218.200068 s : norm diff   0
 Grid : Message : 218.645673 s : norm diff even  0
 Grid : Message : 218.816561 s : norm diff odd   0
--- a/systems/Summit/dwf16.lsf
+++ b/systems/Summit/dwf16.lsf
@ -0,0 +1,25 @@
 #!/bin/bash
 #BSUB -P LGT104
 #BSUB -W 2:00
 #BSUB -nnodes 16
 #BSUB -J DWF
 # LSF batch job: Grid host/device comms benchmark followed by two single-precision
 # domain-wall-fermion benchmarks on 16 nodes.
 # Every jsrun below starts 16 resource sets with 6 tasks and 6 GPUs each, i.e. 96 MPI
 # ranks, so the --mpi decomposition handed to Grid has to multiply to 96.
 export OMP_NUM_THREADS=6
 export PAMI_IBV_ADAPTER_AFFINITY=1
 export PAMI_ENABLE_STRIPING=1
 # Communication flags shared by both DWF runs.
 export OPT="--comms-concurrent --comms-overlap "
 # 4.4.2.3 = 96 ranks. The earlier 4.4.4.3 asked for 192 ranks, twice what jsrun launches.
 # $APP is deliberately left unquoted on the jsrun lines so it splits into separate arguments.
 APP="./benchmarks/Benchmark_comms_host_device  --mpi 4.4.2.3 "
 jsrun --nrs 16 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > comms.16node.log
 # Global lattice chosen so each rank holds 24^4 sites (96/4, 96/4, 48/2, 72/3).
 APP="./benchmarks/Benchmark_dwf_fp32 --grid 96.96.48.72 --mpi 4.4.2.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 $OPT "
 jsrun --nrs 16 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > dwf.16node.24.log
 # Global lattice chosen so each rank holds 32^4 sites (128/4, 128/4, 64/2, 96/3).
 APP="./benchmarks/Benchmark_dwf_fp32 --grid 128.128.64.96 --mpi 4.4.2.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 $OPT "
 jsrun --nrs 16 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > dwf.16node.32.log
--- a/systems/Summit/dwf4.lsf
+++ b/systems/Summit/dwf4.lsf
@ -0,0 +1,25 @@
 #!/bin/bash
 #BSUB -P LGT104
 #BSUB -W 2:00
 #BSUB -nnodes 4
 #BSUB -J DWF
 # LSF batch job: Grid host/device comms benchmark followed by two single-precision
 # domain-wall-fermion benchmarks on 4 nodes.
 # Every jsrun below starts 4 resource sets with 6 tasks and 6 GPUs each, i.e. 24 MPI
 # ranks, which matches the 2.2.2.3 decomposition passed via --mpi.
 export OMP_NUM_THREADS=6
 export PAMI_IBV_ADAPTER_AFFINITY=1
 export PAMI_ENABLE_STRIPING=1
 # Communication flags shared by both DWF runs.
 export OPT="--comms-concurrent --comms-overlap "
 # Optional allocator-cache override, left disabled.
 #export GRID_ALLOC_NCACHE_LARGE=1
 # $APP is deliberately left unquoted on the jsrun lines so it splits into separate arguments.
 export APP="./benchmarks/Benchmark_comms_host_device  --mpi 2.2.2.3 "
 jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > comms.4node
 # 24^4 sites per rank (48/2, 48/2, 48/2, 72/3). --shm-force-mpi is given once; it used to be repeated.
 APP="./benchmarks/Benchmark_dwf_fp32 --grid 48.48.48.72 --mpi 2.2.2.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 $OPT "
 jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > dwf.24.4node
 # 32^4 sites per rank (64/2, 64/2, 64/2, 96/3).
 APP="./benchmarks/Benchmark_dwf_fp32 --grid 64.64.64.96 --mpi 2.2.2.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 $OPT "
 jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > dwf.32.4node
--- a/systems/Summit/sourceme-cuda10.sh
+++ b/systems/Summit/sourceme-cuda10.sh
@ -0,0 +1,8 @@
 # Environment setup for the CUDA 10 toolchain; meant to be sourced into the current
 # shell (the module loads and exports only take effect there), not executed.
 # UCX runtime settings.
 export UCX_GDR_COPY_RCACHE=no
 export UCX_MEMTYPE_CACHE=n
 export UCX_RNDV_SCHEME=put_zcopy
 module load gcc/7.5.0
 module load cuda/10.2.89
 # Alternative CUDA module, not loaded:
 #cuda/11.4.0
 # Prepend the private prefix. The ':' separator is added only when LD_LIBRARY_PATH is
 # already non-empty: a trailing empty entry would make the dynamic loader search the
 # current working directory as well.
 export LD_LIBRARY_PATH=/ccs/home/paboyle/prefix/lib/${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Peter Boyle	4de50ab146	Merge pull request #396 from fjosw/fix/readd_config.h fix: readded Config.h and Version.h to HFILEs in Grid/Makefile.am	2022-05-09 08:26:48 -04:00
Fabian Joswig	8b12a61097	fix: readded Config.h and Version.h to HFILEs in Grid/Makefile.am	2022-05-09 11:53:22 +01:00
Peter Boyle	79ea027c0b	Merge pull request #377 from RJHudspith/develop NERSC and ILDG for non-SU(3) configuration checkpoints	2022-05-03 08:55:48 -04:00
Peter Boyle	62339d437f	Merge pull request #387 from lehner/feature/gpt Parity mass terms for domain wall fermions to enable 4d eofa	2022-05-03 08:52:18 -04:00
Peter Boyle	698e745276	Merge pull request #390 from fjosw/feature/conserved_current_wilson Conserved current for wilson fermions	2022-05-03 08:51:10 -04:00
Peter Boyle	9a6e2c315d	Merge pull request #394 from fjosw/fix/gauge_fix_ErrorOnNoConverge SteepestDescentGaugeFix now exits when the algorithm does not converge.	2022-05-03 08:49:26 -04:00
Fabian Joswig	e61fed87db	SteepestDescentGaugeFix now exits when the algorithm does not converge. This behaviour can be altered by setting err_on_no_converge to false.	2022-04-20 15:41:55 +01:00
Fabian Joswig	b8bc560b51	Test_wilson_conserved_current implemented, all 5d references removed.	2022-04-05 17:33:45 +01:00
Fabian Joswig	6bc2483d57	Merge branch 'feature/eclover' into feature/conserved_current_wilson	2022-04-05 15:26:49 +01:00
Fabian Joswig	82aecbf4cf	Test_wilson_conserved_current added	2022-04-05 15:26:39 +01:00
Fabian Joswig	d7191e5a02	SeqConservedCurrent implemented for Wilson fermions	2022-04-05 11:48:56 +01:00
Fabian Joswig	c8a824425b	Error message added if another conserved current than vector is requested for Wilson type fermions.	2022-04-05 10:58:22 +01:00
Fabian Joswig	427c8695fe	Change signs and prefactors for conserved current to mimic the 5d version.	2022-04-01 16:20:21 +01:00
Fabian Joswig	603fd96747	Missing link multiplication added.	2022-04-01 10:58:56 +01:00
Fabian Joswig	fe993c0836	/=2 replaced by *=0.5	2022-03-31 17:08:17 +01:00
Fabian Joswig	cdf31d52c1	GaugeGrid and typo fixed	2022-03-31 17:04:35 +01:00
Fabian Joswig	0542eaf1da	First version of conserved current contraction for Wilson type quarks	2022-03-31 17:02:09 +01:00
Christoph Lehner	317bdcf158	nerscio parametrization	2022-03-24 13:10:47 +01:00
Peter Boyle	605cf401e1	Merge branch 'feature/sumd-npr' into develop	2022-03-16 22:43:12 +00:00
Peter Boyle	f99c3660d2	Merge branch 'feature/cpu-threaded-smp' into develop	2022-03-16 22:07:54 +00:00
Peter Boyle	92a83a9eb3	Performance improve for Tesseract	2022-03-16 17:14:36 +00:00
Peter Boyle	b615fa0f35	Merge pull request #388 from fjosw/feature/sumd-npr Feature/sumd npr	2022-03-15 09:05:57 -04:00
Christoph Lehner	76c294a7ba	open bc fix	2022-03-08 13:55:16 +01:00
Fabian Joswig	0c0c2b1e20	Unnecessary arguments of CloverHelpers::Exponentiate_Clover removed.	2022-03-08 09:44:51 +00:00
Christoph Lehner	e2fc3a0f04	Merge pull request #28 from paboyle/develop Sync with Upstream	2022-03-08 09:58:51 +01:00
Fabian Joswig	451e7972fd	Reintroduced explicit inversion of the Clover term in case of the CompactExpClover because of the open boundary O(a) improvement. Changed the timing output to GridLogDebug	2022-03-07 17:43:33 +00:00
Fabian Joswig	56c089d347	Removed leftover comments	2022-03-07 16:40:20 +00:00
Fabian Joswig	acf740e44d	Merge pull request #1 from FelixPGZiegler/feature/eclover Feature/eclover	2022-03-07 16:25:11 +00:00
Felix Ziegler	182f513404	Merge remote-tracking branch 'fjosw/feature/eclover' into feature/eclover	2022-03-07 15:22:04 +00:00
Felix Ziegler	d5b2323a57	included Cayley-Hamilton exponentiation for the compact Wilson exp clover, bug fix for inverse of exp clover	2022-03-07 14:44:24 +00:00
FelixPGZiegler	bad18d4417	Merge branch 'paboyle:develop' into feature/eclover	2022-03-07 13:54:10 +00:00
Fabian Joswig	d1decee4cc	Cleaned up unused variables in Lattice_reduction_gpu.h	2022-03-02 16:54:23 +00:00
Fabian Joswig	d4ae71b880	sum_gpu_large and sum_gpu templates added.	2022-03-02 15:40:18 +00:00
Peter Boyle	e16fc5b2e4	Threaded intranode comms transfer - ideally between NUMA domains	2022-03-01 11:17:24 -05:00
Peter Boyle	694306f202	Configure for mac arm	2022-03-01 10:53:44 -05:00
Peter Boyle	9aac1e6d64	Merge branch 'develop' into feature/sumd-npr	2022-03-01 10:51:38 -05:00
Peter Boyle	3e882f555d	Large / small sumD options	2022-03-01 08:54:45 -05:00
Fabian Joswig	438caab25f	generate_instantiations.sh now correctly produces instantiations for CompactClover variant, redundant instantiations removed.	2022-02-27 18:27:18 +00:00
Fabian Joswig	239e2c1ee6	tests: wilson clover cg tests now include compact variant as well as exponential wilson clover operators	2022-02-27 18:26:34 +00:00
Fabian Joswig	013dc2ef33	tests: core tests for wilson clover and wilson exp clover including compact version extended/added	2022-02-27 18:13:47 +00:00
Christoph Lehner	9616811c3d	Merge branch 'feature/gpt' of https://github.com/lehner/Grid into feature/gpt	2022-02-24 22:03:05 +01:00
Christoph Lehner	8a3002c03b	separate left and right masses for CayleyFermion5D	2022-02-24 22:02:56 +01:00
Mattia Bruno	71034f828e	attempt to fix broken WilsonExpClover; Compact version still broken will be replaced by F.Joswig	2022-02-23 01:02:27 +01:00
Mattia Bruno	11437930c5	cleaned up definitions of wilsonclover fermions	2022-02-22 10:45:16 +01:00
Mattia Bruno	3d44aa9cb9	cleaned up cloverhelpers; fixed test compact_clover which runs	2022-02-22 01:10:19 +01:00
Mattia Bruno	2851870d70	expClover support via helpers template class	2022-02-22 00:05:43 +01:00
Peter Boyle	63dbaeefaa	Extra barrier prior to finalize just in case it fixes an issue on Tursa	2022-02-16 14:01:43 +00:00
Peter Boyle	e8c187b323	SyCL happier?	2022-02-15 11:24:38 -05:00
Peter Boyle	0c1618197f	Faster intranode MPI works now	2022-02-15 08:52:07 -05:00
Peter Boyle	f49d5c2d22	Updated scripts for crusher	2022-02-14 17:55:16 -05:00
Peter Boyle	a3b022d469	Crusher compile	2022-02-14 15:09:08 -05:00
Peter Boyle	48772f0976	Merge pull request #384 from jdmaia/hip_launchbounds Changing thread block order and adding launch_bounds	2022-02-14 11:08:28 -05:00
Peter Boyle	c322420580	Dont instantiate an Nc=3 and non-GP hardwired code for other implementations	2022-02-14 16:04:08 +00:00
Julio Maia	86f4e17928	Changing thread block order and adding launch_bounds	2022-02-07 11:29:37 -06:00
Peter Boyle	215df671be	Merge pull request #382 from DanielRichtmann/feature/compact-clover Compact Clover Fermions	2022-02-01 21:45:38 -05:00
Daniel Richtmann	1b6b12589f	Get splitting up into implementation and instantiation files correct	2022-02-02 00:51:11 +01:00
Daniel Richtmann	3082ab8252	Check in compact version of wilson clover fermions	2022-02-02 00:50:05 +01:00
Daniel Richtmann	add86cd7f4	Abandon ET for clover application, use construct similar to multLink	2022-02-01 23:09:06 +01:00
Daniel Richtmann	0b6fd20c54	Enable memory coalescing in clover term generation	2022-02-01 23:09:06 +01:00
Daniel Richtmann	e83423fee6	Refactor clover to align with other files and prepare for upcoming changes	2022-02-01 23:09:06 +01:00
Daniel Richtmann	b4f8e87982	Have Grid's cli interface understand floats	2022-02-01 23:09:06 +01:00
Peter Boyle	135808dcfa	Less verbose	2021-12-07 16:24:24 -05:00
Peter Boyle	7f7d06d963	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2021-12-07 09:06:42 -08:00
Peter Boyle	2bf3b4d576	Update to reduce memory footpring in benchmark test	2021-12-07 09:02:02 -08:00
RJHudspith	0bd83cdbda	Fixes for Nc!=3 Nersc IO, Gauge and Gauge_NCxNC compatible with GLU. Trace normalisation changed in places removing explicit threes. Guards against non-su3 tests and tests failing when LIME is not compiled.	2021-11-28 21:51:03 +01:00
Peter Boyle	f34d34bd17	2 nodes	2021-11-22 22:27:16 -05:00
Peter Boyle	e32d5141b4	Updated to make MPI reliable still gives good perf, but MPI will be slow intranode	2021-11-22 21:46:31 -05:00
Peter Boyle	6d5277f2d7	Update to Spock	2021-11-22 20:58:02 -05:00
Peter Boyle	14d82777e0	Best modules for spock	2021-11-22 20:47:16 -05:00
Peter Boyle	2a4e739513	Enable XGMI copy (need to rename nvlink to cover NVLINK/XGMI/XeLink)	2021-11-22 20:46:09 -05:00
Peter Boyle	8079dc2a14	Cray MPI not working right yet	2021-11-22 20:45:44 -05:00
Peter Boyle	6ceb556684	Intranode asynch hipMemCopy	2021-11-22 20:45:12 -05:00
Peter Boyle	76cde73705	HIP improvements on messaging and intranode hipMemCopyAsynch	2021-11-22 20:44:39 -05:00
Peter Boyle	cc094366a9	Merge pull request #375 from JPRichings/develop Lattice object ACCcache probe	2021-11-09 18:19:32 -05:00
James Richings	41a575ff9b	Format edit	2021-11-09 21:56:23 +00:00
James Richings	12ef413065	fix to deflation.h	2021-11-09 21:20:36 +00:00
James Richings	829a328451	remove deflation timing	2021-11-09 20:46:57 +00:00
James Richings	402523c62e	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2021-11-09 12:57:40 +00:00
James Richings	d7bef70b5c	Helper functions to allow probe of cache state of lattice objects.	2021-11-09 12:57:09 +00:00
James Richings	2ad1811642	Added timing to deflation code.	2021-11-09 12:33:25 +00:00
Antonin Portelli	a65a497bae	Merge branch 'develop' of github.com:paboyle/Grid into develop	2021-10-29 13:01:34 +01:00
Antonin Portelli	b27b12828e	reverse previous "fix", missing statement was probably intentional, added a comment to that effect	2021-10-29 13:01:31 +01:00
Peter Boyle	42d56ea6b6	Verbosity	2021-10-29 02:23:08 +01:00
Peter Boyle	0b905a72dd	Better reduction for GPUs	2021-10-29 02:22:22 +01:00
Peter Boyle	fe9edf8526	Merge branch 'develop' of https://www.github.com/paboyle/Grid into develop	2021-10-29 02:03:27 +01:00
Peter Boyle	44204c7e06	Extra code	2021-10-29 02:02:56 +01:00
Peter Boyle	33b3789598	Merge pull request #364 from AndrewYongZhenNing/develop CayleyFermion5D Conserved current fix	2021-10-27 20:27:20 -04:00
Peter Boyle	195ab2888d	Merge branch 'develop' into develop	2021-10-27 20:26:57 -04:00
Peter Boyle	85f750d753	Merge branch 'develop' of https://www.github.com/paboyle/Grid into develop	2021-10-27 00:28:05 +01:00
Peter Boyle	a4ce6e42c7	Warning free compile on make all and make tests under nvcc	2021-10-27 00:27:03 +01:00
Peter Boyle	5398b7e7e3	Max 128 size	2021-10-26 09:16:29 -07:00
James Richings	fd13a3f2be	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2021-10-26 10:45:46 +01:00
James Richings	c144b32368	deflation timers	2021-10-26 10:37:24 +01:00
Peter Boyle	ba7e371b90	Warning free compile on Tursa. Hopefully got all reqd virtual dtors	2021-10-21 19:56:52 +01:00
Peter Boyle	99e7a5d18a	Merge pull request #371 from edbennett/hmc-documentation-update update documentation for GenericHMCRunner - thanks	2021-10-18 14:36:43 -04:00
Ed Bennett	f824d99059	update documentation for GenericHMCRunner	2021-10-18 09:50:16 +01:00
Peter Boyle	749b8022a4	Linear operator and SparseMatrix virtual destructors	2021-10-15 20:47:18 +01:00
Peter Boyle	7e0057d2c4	Merge branch 'develop' of https://www.github.com/paboyle/Grid into develop	2021-10-15 20:46:51 +01:00
Peter Boyle	cfe9e870d3	Stream	2021-10-15 20:46:44 +01:00
Peter Boyle	e9c4f06cbf	Merge pull request #370 from fjosw/bugfix/gpu_sum_shm Error Handling sum_Dgpu large objects	2021-10-14 09:12:47 -04:00
Fabian Joswig	1f9688417a	Error message added when attempting to sum object which is too large for the shared memory	2021-10-13 20:45:46 +01:00
Peter Boyle	16c2a99965	Overlap cudamemcpy - didn't set up stream right	2021-10-11 13:31:26 -07:00
Peter Boyle	cda915a345	Better options	2021-10-07 20:29:09 +01:00
Peter Boyle	7c16189e16	Merge pull request #368 from Heinrich-BR/develop Accelerated Pick-Set Checkerboard functions	2021-10-07 15:13:09 -04:00
Peter Boyle	ecbfccea43	Merge pull request #369 from paboyle/gauge-group-covariance expose gauge group in GImpl and generic Nc fix	2021-10-07 15:11:12 -04:00
Peter Boyle	a8eda8f6da	Summit scripts	2021-10-05 21:22:10 -04:00
Peter Boyle	9b1a0653cf	Summit results	2021-10-05 21:22:01 -04:00
Peter Boyle	7cb1ff7395	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2021-10-05 20:13:42 -04:00
Peter Boyle	ab6ea29913	Print removal	2021-10-05 20:13:25 -04:00
Antonin Portelli	b5c81a02b6	Merge branch 'develop' of github.com:paboyle/Grid into develop	2021-10-05 21:13:01 +01:00
Antonin Portelli	d899ee80fc	skip record fixed to include norm metadata	2021-10-05 21:12:47 +01:00
Peter Boyle	4016e705fc	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2021-10-05 14:56:57 -04:00
Peter Boyle	2f4e85e5d6	Summit set up	2021-10-05 14:56:17 -04:00
Peter Boyle	8ed0b57b09	Memory verbose and tracking, shrink default cache Print PCI device IDs on node 0	2021-10-05 11:41:03 -04:00
Henrique B.R	7e130076d6	Fixed line left behind	2021-09-24 17:26:31 +01:00
Henrique B.R	6efdad6f21	Removed Halo benchmark	2021-09-24 17:18:04 +01:00
Henrique B.R	a822c48565	Added accelerated pick-set checkerboard functions	2021-09-24 17:13:25 +01:00
Henrique B.R	014fb76e88	Merge branch 'develop' of https://github.com/Heinrich-BR/Grid into develop	2021-09-24 16:45:25 +01:00
Henrique B.R	30e5311b43	Update from the gods upstream	2021-09-24 16:39:56 +01:00
Henrique Rocha	11ee8a1061	Merge remote-tracking branch 'upstream/develop' into develop	2021-09-02 16:57:42 +01:00
Andrew Yong	770680669d	Whitespace removal.	2021-08-04 09:21:59 +01:00
Andrew Yong	0cdfc5cf22	Merge remote-tracking branch 'upstream/develop' into develop	2021-07-30 14:40:55 +01:00
Henrique B.R	428b8ba907	Updated from upstream and added halo benchmark	2021-06-29 01:05:12 +01:00
Andrew Zhen Ning Yong	54c6b1376d	Quick fix of conserved current implementation in CayleyFermion5D. Now function treats current insertion with appropriate periodic boundary conditions in the mu=3 direction.	2021-04-21 16:56:46 +01:00
Andrew Zhen Ning Yong	f3f11b586f	Tadpole sign now in front of forward hopping term to be consistent with previous implementation and analytic form.	2021-04-17 12:44:27 +01:00
Andrew Zhen Ning Yong	8083e3f7e8	Sign factor for tadpole implementation corrected.	2021-04-15 11:14:31 +01:00
Henrique B.R	364793154b	Reverted checkerboard changes	2021-04-09 15:47:17 +01:00
Henrique B.R	3e2ae1e9af	Added profiling messages to pick and set checkerboard functions	2021-04-08 16:58:47 +01:00
Henrique Rocha	d38ae2fd18	Merge branch 'develop' of https://github.com/Heinrich-BR/Grid into develop	2021-04-06 17:18:39 +01:00
Henrique Rocha	030e7754e4	Merge remote-tracking branch 'upstream/develop' into develop	2021-04-06 17:16:13 +01:00
Henrique B.R	3b7fce1e76	Reverted checkerboard changes	2021-04-02 14:38:41 +01:00
Henrique B.R	4d15417f93	Merge remote-tracking branch 'upstream/develop' into develop	2021-04-01 18:28:15 +01:00
Henrique B.R	ab3c855f65	Merge branch 'develop' of https://github.com/Heinrich-BR/Grid into develop	2021-04-01 18:22:05 +01:00
Henrique B.R	92e2c517d8	Changed pick- and setCheckerboard to use accelerator_for	2021-04-01 18:21:19 +01:00
		`@ -0,0 +1 @@`
							`../CompactWilsonCloverFermionInstantiation.cc.master`