mirror of https://github.com/paboyle/Grid.git synced 2024-11-09 23:45:36 +00:00

Merge branch 'develop' of https://github.com/paboyle/Grid into develop

Peter Boyle 2021-12-07 09:06:42 -08:00
commit 7f7d06d963
52 changed files with 850 additions and 88 deletions

View File

@ -358,7 +358,7 @@ public:
autoView( in_v , in, AcceleratorRead);
autoView( out_v , out, AcceleratorWrite);
autoView( Stencil_v , Stencil, AcceleratorRead);
auto& geom_v = geom;
int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
@ -380,7 +380,7 @@ public:
int ptype;
StencilEntry *SE;
for(int point=0;point<geom_v.npoint;point++){
for(int point=0;point<npoint;point++){
SE=Stencil_v.GetEntry(ptype,point,ss);
@ -424,7 +424,7 @@ public:
autoView( in_v , in, AcceleratorRead);
autoView( out_v , out, AcceleratorWrite);
autoView( Stencil_v , Stencil, AcceleratorRead);
auto& geom_v = geom;
int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
@ -454,7 +454,7 @@ public:
int ptype;
StencilEntry *SE;
for(int p=0;p<geom_v.npoint;p++){
for(int p=0;p<npoint;p++){
int point = points_p[p];
SE=Stencil_v.GetEntry(ptype,point,ss);
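The two hunks above replace a captured reference (auto& geom_v = geom) with a plain value (int npoint = geom.npoint) inside the accelerator kernels. Below is a minimal standalone sketch of that pattern, assuming the usual GPU-offload rule that a device lambda must not dereference host-resident objects; all names are illustrative, not Grid code.

// Minimal sketch (not Grid code): a device kernel should not capture a
// reference to a host-resident object such as "geom"; copying the
// trivially-copyable member (npoint) by value is safe.
struct Geometry { int npoint; };

template<class Kernel>
void accelerator_for_like(int nsites, Kernel kernel)  // stand-in for an accelerator_for-style launcher
{
  for (int ss = 0; ss < nsites; ss++) kernel(ss);     // on a GPU build this body would run on the device
}

void stencilLoop(const Geometry &geom, int nsites)
{
  // auto &geom_v = geom;        // BAD on GPU: the lambda would dereference host memory
  int npoint = geom.npoint;      // GOOD: capture the plain int by value
  accelerator_for_like(nsites, [=](int ss) {
    for (int point = 0; point < npoint; point++) {
      // ... visit stencil entry (ss, point) ...
      (void)ss;
    }
  });
}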

View File

@ -508,7 +508,7 @@ class SchurStaggeredOperator : public SchurOperatorBase<Field> {
virtual void MpcDag (const Field &in, Field &out){
Mpc(in,out);
}
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
virtual void MpcDagMpc(const Field &in, Field &out) {
assert(0);// Never need with staggered
}
};
@ -586,6 +586,7 @@ class HermOpOperatorFunction : public OperatorFunction<Field> {
template<typename Field>
class PlainHermOp : public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
LinearOperatorBase<Field> &_Linop;
PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop)
@ -599,6 +600,7 @@ public:
template<typename Field>
class FunctionHermOp : public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
OperatorFunction<Field> & _poly;
LinearOperatorBase<Field> &_Linop;
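Several files in this commit add "using LinearFunction<Field>::operator();" declarations. A minimal standalone C++ sketch of the language rule involved: declaring any operator() in a derived class hides all base-class operator() overloads, and the using-declaration re-exposes them. The types below are illustrative, not Grid classes.

#include <vector>

struct Base {
  virtual void operator()(int x) {}                      // overload A
  virtual void operator()(const std::vector<int> &v) {}  // overload B
  virtual ~Base() = default;
};

struct DerivedWithoutUsing : Base {
  void operator()(int x) override {}   // hides overload B as well
};

struct DerivedWithUsing : Base {
  using Base::operator();              // re-exposes overload B
  void operator()(int x) override {}
};

int main() {
  std::vector<int> v{1, 2, 3};
  DerivedWithUsing d;
  d(v);                                // OK: found via the using-declaration
  // DerivedWithoutUsing e; e(v);      // would not compile: overload B is hidden
  return 0;
}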

View File

@ -30,13 +30,19 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
template<class Field> class Preconditioner : public LinearFunction<Field> {
template<class Field> using Preconditioner = LinearFunction<Field> ;
/*
template<class Field> class Preconditioner : public LinearFunction<Field> {
using LinearFunction<Field>::operator();
virtual void operator()(const Field &src, Field & psi)=0;
};
*/
template<class Field> class TrivialPrecon : public Preconditioner<Field> {
public:
void operator()(const Field &src, Field & psi){
using Preconditioner<Field>::operator();
virtual void operator()(const Field &src, Field & psi){
psi = src;
}
TrivialPrecon(void){};

View File

@ -36,7 +36,8 @@ NAMESPACE_BEGIN(Grid);
template<class FieldD, class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionBiCGSTAB : public LinearFunction<FieldD>
{
public:
public:
using LinearFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; // Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;

View File

@ -67,6 +67,7 @@ public:
template<class Fobj,class CComplex,int nbasis>
class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
public:
using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
@ -97,6 +98,7 @@ public:
template<class Fobj,class CComplex,int nbasis>
class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
public:
using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field

View File

@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
template<class Field>
class PrecGeneralisedConjugateResidual : public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
RealD Tolerance;
Integer MaxIterations;
int verbose;

View File

@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
template<class Field>
class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
RealD Tolerance;
Integer MaxIterations;
int verbose;
@ -119,7 +119,8 @@ public:
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
RealD cp;
ComplexD a, b, zAz;
ComplexD a, b;
// ComplexD zAz;
RealD zAAz;
ComplexD rq;
@ -146,7 +147,7 @@ public:
//////////////////////////////////
MatTimer.Start();
Linop.Op(psi,Az);
zAz = innerProduct(Az,psi);
// zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
MatTimer.Stop();
@ -170,7 +171,7 @@ public:
LinalgTimer.Start();
zAz = innerProduct(Az,psi);
// zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
//p[0],q[0],qq[0]
@ -212,7 +213,7 @@ public:
MatTimer.Start();
Linop.Op(z,Az);
MatTimer.Stop();
zAz = innerProduct(Az,psi);
// zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
LinalgTimer.Start();

View File

@ -170,6 +170,7 @@ private:
public:
static void Print(void);
static void PrintState( void* CpuPtr);
static int isOpen (void* CpuPtr);
static void ViewClose(void* CpuPtr,ViewMode mode);
static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);

View File

@ -474,6 +474,32 @@ int MemoryManager::isOpen (void* _CpuPtr)
}
}
void MemoryManager::PrintState(void* _CpuPtr)
{
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if ( EntryPresent(CpuPtr) ){
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
std::string str;
if ( AccCache.state==Empty ) str = std::string("Empty");
if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
if ( AccCache.state==Consistent)str = std::string("Consistent");
if ( AccCache.state==EvictNext) str = std::string("EvictNext");
std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t" << AccCache.cpuLock
<< "\t" << AccCache.accLock
<< "\t" << AccCache.LRU_valid<<std::endl;
} else {
std::cout << GridLogMessage << "No Entry in AccCache table." << std::endl;
}
}
NAMESPACE_END(Grid);
#endif

View File

@ -16,6 +16,10 @@ uint64_t MemoryManager::DeviceToHostXfer;
void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
int MemoryManager::isOpen (void* CpuPtr) { return 0;}
void MemoryManager::PrintState(void* CpuPtr)
{
std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl;
};
void MemoryManager::Print(void){};
void MemoryManager::NotifyDeletion(void *ptr){};

View File

@ -88,6 +88,13 @@ public:
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
accessor.ViewClose();
}
// Helper function to print the state of this object in the AccCache
void PrintCacheState(void)
{
MemoryManager::PrintState(this->_odata);
}
/////////////////////////////////////////////////////////////////////////////////
// Return a view object that may be dereferenced in site loops.
// The view is trivially copy constructible and may be copied to an accelerator device
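A hypothetical usage sketch of the new diagnostic hook, assuming a working Grid build; the types and grid construction follow the patterns used elsewhere in this commit.

#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);
  GridCartesian *UGrid =
    SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                   GridDefaultSimd(Nd, vComplex::Nsimd()),
                                   GridDefaultMpi());
  LatticeComplex z(UGrid);
  z = Zero();
  z.PrintCacheState();                 // new helper: forwards to MemoryManager::PrintState
  {
    autoView(z_v, z, AcceleratorRead); // opening a view can change the AccCache state
    z.PrintCacheState();
  }
  Grid_finalize();
  return 0;
}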

View File

@ -576,7 +576,8 @@ class ScidacReader : public GridLimeReader {
std::string rec_name(ILDG_BINARY_DATA);
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) ) ) {
skipPastObjectRecord(std::string(GRID_FIELD_NORM));
// in principle should do the line below, but that breaks backward compatibility with old data
// skipPastObjectRecord(std::string(GRID_FIELD_NORM));
skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
return;
}

View File

@ -828,6 +828,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
#if (!defined(GRID_HIP))
int tshift = (mu == Nd-1) ? 1 : 0;
unsigned int LLt = GridDefaultLatt()[Tp];
////////////////////////////////////////////////
// GENERAL CAYLEY CASE
////////////////////////////////////////////////
@ -880,7 +881,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
}
std::vector<RealD> G_s(Ls,1.0);
RealD sign = 1; // sign flip for vector/tadpole
RealD sign = 1.0; // sign flip for vector/tadpole
if ( curr_type == Current::Axial ) {
for(int s=0;s<Ls/2;s++){
G_s[s] = -1.0;
@ -890,7 +891,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
auto b=this->_b;
auto c=this->_c;
if ( b == 1 && c == 0 ) {
sign = -1;
sign = -1.0;
}
else {
std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl;
@ -934,7 +935,13 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
tmp = Cshift(tmp,mu,-1);
Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
tmp = -G_s[s]*( Utmp + gmu*Utmp );
tmp = where((lcoor>=tmin+tshift),tmp,zz); // Mask the time
// Mask the time
if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
unsigned int t0 = 0;
tmp = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
} else {
tmp = where((lcoor>=tmin+tshift),tmp,zz);
}
L_Q += where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
InsertSlice(L_Q, q_out, s , 0);
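A standalone sketch of the masking logic in the hunk above: a timeslice contributes when it lies in the shifted window [tmin+tshift, tmax+tshift], and the quick fix additionally admits timeslice 0 when the window runs past the last slice. This is plain C++ for illustration, not Grid code.

#include <cstdio>

bool inShiftedWindow(unsigned t, unsigned tmin, unsigned tmax,
                     unsigned tshift, unsigned LLt)
{
  // Lower bound, with the wrap-around fix for timeslice 0
  bool lower = (t >= tmin + tshift) ||
               (tmax == LLt - 1 && tshift == 1 && t == 0);
  // Upper bound, as applied when accumulating L_Q
  bool upper = (t <= tmax + tshift);
  return lower && upper;
}

int main() {
  unsigned LLt = 8, tmin = 0, tmax = 7, tshift = 1;
  for (unsigned t = 0; t < LLt; t++)
    std::printf("t=%u in window: %d\n", t,
                (int)inShiftedWindow(t, tmin, tmax, tshift, LLt));
  return 0;
}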

View File

@ -47,20 +47,20 @@ NAMESPACE_BEGIN(Grid);
class TypePair {
public:
T _internal[2];
TypePair<T>& operator=(const Grid::Zero& o) {
accelerator TypePair<T>& operator=(const Grid::Zero& o) {
_internal[0] = Zero();
_internal[1] = Zero();
return *this;
}
TypePair<T> operator+(const TypePair<T>& o) const {
accelerator TypePair<T> operator+(const TypePair<T>& o) const {
TypePair<T> r;
r._internal[0] = _internal[0] + o._internal[0];
r._internal[1] = _internal[1] + o._internal[1];
return r;
}
TypePair<T>& operator+=(const TypePair<T>& o) {
accelerator TypePair<T>& operator+=(const TypePair<T>& o) {
_internal[0] += o._internal[0];
_internal[1] += o._internal[1];
return *this;
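The hunk above adds the accelerator qualifier to the TypePair operators. Grid defines accelerator as __host__ __device__ on GPU targets (see the accelerator header changes later in this commit), so operators called inside device kernels need it. A minimal standalone sketch, with an illustrative macro fallback for host-only builds.

#if defined(__CUDACC__) || defined(__HIPCC__)
#define accelerator __host__ __device__
#else
#define accelerator
#endif

template<class T>
struct PairLike {                       // illustrative stand-in for TypePair<T>
  T v[2];
  accelerator PairLike operator+(const PairLike &o) const {  // callable on host and device
    PairLike r;
    r.v[0] = v[0] + o.v[0];
    r.v[1] = v[1] + o.v[1];
    return r;
  }
};

int main() {
  PairLike<double> a{{1.0, 2.0}}, b{{3.0, 4.0}};
  PairLike<double> c = a + b;           // host use is unchanged
  return (c.v[0] == 4.0 && c.v[1] == 6.0) ? 0 : 1;
}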

View File

@ -84,7 +84,8 @@ void acceleratorInit(void)
// IBM jsrun makes the CUDA device numbering screwy so that it does not match the rank
if ( world_rank == 0 ) {
printf("AcceleratorCudaInit: using default device \n");
printf("AcceleratorCudaInit: assume user either uses a) IBM jsrun, or \n");
printf("AcceleratorCudaInit: assume user either uses\n");
printf("AcceleratorCudaInit: a) IBM jsrun, or \n");
printf("AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
printf("AcceleratorCudaInit: Configure options --enable-setdevice=no \n");
}
@ -109,6 +110,7 @@ void acceleratorInit(void)
#ifdef GRID_HIP
hipDeviceProp_t *gpu_props;
hipStream_t copyStream;
void acceleratorInit(void)
{
int nDevices = 1;
@ -166,16 +168,25 @@ void acceleratorInit(void)
#ifdef GRID_DEFAULT_GPU
if ( world_rank == 0 ) {
printf("AcceleratorHipInit: using default device \n");
printf("AcceleratorHipInit: assume user either uses a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
printf("AcceleratorHipInit: Configure options --enable-summit, --enable-select-gpu=no \n");
printf("AcceleratorHipInit: assume user or srun sets ROCR_VISIBLE_DEVICES and numa binding \n");
printf("AcceleratorHipInit: Configure options --enable-setdevice=no \n");
}
int device = 0;
#else
if ( world_rank == 0 ) {
printf("AcceleratorHipInit: rank %d setting device to node rank %d\n",world_rank,rank);
printf("AcceleratorHipInit: Configure options --enable-select-gpu=yes \n");
printf("AcceleratorHipInit: Configure options --enable-setdevice=yes \n");
}
hipSetDevice(rank);
int device = rank;
#endif
hipSetDevice(device);
hipStreamCreate(&copyStream);
const int len=64;
char busid[len];
if( rank == world_rank ) {
hipDeviceGetPCIBusId(busid, len, device);
printf("local rank %d device %d bus id: %s\n", rank, device, busid);
}
if ( world_rank == 0 ) printf("AcceleratorHipInit: ================================================\n");
}
#endif

View File

@ -230,6 +230,7 @@ inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
}
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
inline int acceleratorIsCommunicable(void *ptr)
{
// int uvm=0;
@ -337,6 +338,7 @@ NAMESPACE_BEGIN(Grid);
#define accelerator __host__ __device__
#define accelerator_inline __host__ __device__ inline
extern hipStream_t copyStream;
/*These routines define mapping from thread grid to loop & vector lane indexing */
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
@ -411,10 +413,16 @@ inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
inline void acceleratorCopySynchronise(void) { }
//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
//inline void acceleratorCopySynchronise(void) { }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToDevice,copyStream);
}
inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); };
#endif
//////////////////////////////////////////////
@ -485,18 +493,12 @@ inline void acceleratorFreeCpu (void *ptr){free(ptr);};
///////////////////////////////////////////////////
// Synchronise across local threads for divergence resynch
///////////////////////////////////////////////////
accelerator_inline void acceleratorSynchronise(void)
accelerator_inline void acceleratorSynchronise(void) // Only Nvidia needs this
{
#ifdef GRID_SIMT
#ifdef GRID_CUDA
__syncwarp();
#endif
#ifdef GRID_SYCL
//cl::sycl::detail::workGroupBarrier();
#endif
#ifdef GRID_HIP
__syncthreads();
#endif
#endif
return;
}
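The HIP hunk above switches the device-to-device copy to an asynchronous hipMemcpyAsync on a dedicated stream, with acceleratorCopySynchronise waiting on that stream. A minimal standalone HIP sketch of the same pattern; the variable names are illustrative, not Grid's.

#include <hip/hip_runtime.h>

hipStream_t copyStream;

void copyDeviceToDeviceAsynch(void *from, void *to, size_t bytes) {
  hipMemcpyAsync(to, from, bytes, hipMemcpyDeviceToDevice, copyStream); // returns immediately
}

void copySynchronise(void) {
  hipStreamSynchronize(copyStream); // wait for all copies queued on the stream
}

int main() {
  hipStreamCreate(&copyStream);
  float *a, *b;
  size_t bytes = 1024 * sizeof(float);
  hipMalloc((void **)&a, bytes);
  hipMalloc((void **)&b, bytes);
  copyDeviceToDeviceAsynch(a, b, bytes); // enqueue
  copySynchronise();                     // data in b is valid only after this
  hipFree(a); hipFree(b);
  hipStreamDestroy(copyStream);
  return 0;
}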

View File

@ -4,7 +4,7 @@ using namespace Grid;
template<class Field>
void SimpleConjugateGradient(LinearOperatorBase<Field> &HPDop,const Field &b, Field &x)
{
RealD cp, c, alpha, d, beta, ssq, qq;
RealD cp, c, alpha, d, beta, ssq;
RealD Tolerance=1.0e-10;
int MaxIterations=10000;

View File

@ -0,0 +1,539 @@
/*
* Warning: this code is illustrative only; it is not well tested and is not meant for production use
* without regression tests being applied
*/
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
typedef SpinColourMatrix Propagator;
typedef SpinColourVector Fermion;
typedef PeriodicGimplR GimplR;
template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
{
public:
INHERIT_GIMPL_TYPES(Gimpl);
GridBase *grid;
GaugeField U;
CovariantLaplacianCshift(GaugeField &_U) :
grid(_U.Grid()),
U(_U) { };
virtual GridBase *Grid(void) { return grid; };
virtual void M (const Field &in, Field &out)
{
out=Zero();
for(int mu=0;mu<Nd-1;mu++) {
GaugeLinkField Umu = PeekIndex<LorentzIndex>(U, mu); // NB: Inefficient
out = out - Gimpl::CovShiftForward(Umu,mu,in);
out = out - Gimpl::CovShiftBackward(Umu,mu,in);
out = out + 2.0*in;
}
};
virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian
virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid
virtual void MdirAll (const Field &in, std::vector<Field> &out) {assert(0);}; // Unimplemented need only for multigrid
};
void MakePhase(Coordinate mom,LatticeComplex &phase)
{
GridBase *grid = phase.Grid();
auto latt_size = grid->GlobalDimensions();
ComplexD ci(0.0,1.0);
phase=Zero();
LatticeComplex coor(phase.Grid());
for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
LatticeCoordinate(coor,mu);
phase = phase + (TwoPiL * mom[mu]) * coor;
}
phase = exp(phase*ci);
}
void LinkSmear(int nstep, RealD rho,LatticeGaugeField &Uin,LatticeGaugeField &Usmr)
{
Smear_Stout<GimplR> Stout(rho);
LatticeGaugeField Utmp(Uin.Grid());
Utmp = Uin;
for(int i=0;i<nstep;i++){
Stout.smear(Usmr,Utmp);
Utmp = Usmr;
}
}
void PointSource(Coordinate &coor,LatticePropagator &source)
{
// Coordinate coor({0,0,0,0});
source=Zero();
SpinColourMatrix kronecker; kronecker=1.0;
pokeSite(kronecker,source,coor);
}
void GFWallSource(int tslice,LatticePropagator &source)
{
GridBase *grid = source.Grid();
LatticeComplex one(grid); one = ComplexD(1.0,0.0);
LatticeComplex zz(grid); zz=Zero();
LatticeInteger t(grid);
LatticeCoordinate(t,Tdir);
one = where(t==Integer(tslice), one, zz);
source = 1.0;
source = source * one;
}
void Z2WallSource(GridParallelRNG &RNG,int tslice,LatticePropagator &source)
{
GridBase *grid = source.Grid();
LatticeComplex noise(grid);
LatticeComplex zz(grid); zz=Zero();
LatticeInteger t(grid);
RealD nrm=1.0/sqrt(2);
bernoulli(RNG, noise); // 0,1 50:50
noise = (2.*noise - Complex(1,1))*nrm;
LatticeCoordinate(t,Tdir);
noise = where(t==Integer(tslice), noise, zz);
source = 1.0;
source = source*noise;
std::cout << " Z2 wall " << norm2(source) << std::endl;
}
void GaugeFix(LatticeGaugeField &U,LatticeGaugeField &Ufix)
{
Real alpha=0.05;
Real plaq=WilsonLoops<GimplR>::avgPlaquette(U);
std::cout << " Initial plaquette "<<plaq << std::endl;
LatticeColourMatrix xform(U.Grid());
Ufix = U;
int orthog=Nd-1;
FourierAcceleratedGaugeFixer<GimplR>::SteepestDescentGaugeFix(Ufix,xform,alpha,100000,1.0e-14, 1.0e-14,true,orthog);
plaq=WilsonLoops<GimplR>::avgPlaquette(Ufix);
std::cout << " Final plaquette "<<plaq << std::endl;
}
template<class Field>
void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
{
typedef CovariantLaplacianCshift <GimplR,Field> Laplacian_t;
Laplacian_t Laplacian(U);
Integer Iterations = 40;
Real width = 2.0;
Real coeff = (width*width) / Real(4*Iterations);
Field tmp(U.Grid());
smeared=unsmeared;
// chi = (1-p^2/2N)^N kronecker
for(int n = 0; n < Iterations; ++n) {
Laplacian.M(smeared,tmp);
smeared = smeared - coeff*tmp;
std::cout << " smear iter " << n<<" " <<norm2(smeared)<<std::endl;
}
}
void GaussianSource(Coordinate &site,LatticeGaugeField &U,LatticePropagator &source)
{
LatticePropagator tmp(source.Grid());
PointSource(site,source);
std::cout << " GaussianSource Kronecker "<< norm2(source)<<std::endl;
tmp = source;
GaussianSmear(U,tmp,source);
std::cout << " GaussianSource Smeared "<< norm2(source)<<std::endl;
}
void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,LatticePropagator &source)
{
Z2WallSource(RNG,tslice,source);
auto tmp = source;
GaussianSmear(U,tmp,source);
}
void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
{
assert(mom.size()==Nd);
assert(mom[Tdir] == 0);
GridBase * grid = spectator.Grid();
LatticeInteger ts(grid);
LatticeCoordinate(ts,Tdir);
source = Zero();
source = where(ts==Integer(tslice),spectator,source); // Stick in a slice of the spectator, zero everywhere else
LatticeComplex phase(grid);
MakePhase(mom,phase);
source = source *phase;
}
template<class Action>
void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
{
GridBase *UGrid = D.GaugeGrid();
GridBase *FGrid = D.FermionGrid();
LatticeFermion src4 (UGrid);
LatticeFermion src5 (FGrid);
LatticeFermion result5(FGrid);
LatticeFermion result4(UGrid);
ConjugateGradient<LatticeFermion> CG(1.0e-12,100000);
SchurRedBlackDiagTwoSolve<LatticeFermion> schur(CG);
ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if we have eigenvectors
for(int s=0;s<Nd;s++){
for(int c=0;c<Nc;c++){
PropToFerm<Action>(src4,source,s,c);
D.ImportPhysicalFermionSource(src4,src5);
result5=Zero();
schur(D,src5,result5,ZG);
std::cout<<GridLogMessage
<<"spin "<<s<<" color "<<c
<<" norm2(src5d) " <<norm2(src5)
<<" norm2(result5d) "<<norm2(result5)<<std::endl;
D.ExportPhysicalFermionSolution(result5,result4);
FermToProp<Action>(propagator,result4,s,c);
}
}
}
class MesonFile: Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector<std::vector<Complex> >, data);
};
void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
{
const int nchannel=4;
Gamma::Algebra Gammas[nchannel][2] = {
{Gamma::Algebra::Gamma5 ,Gamma::Algebra::Gamma5},
{Gamma::Algebra::GammaTGamma5,Gamma::Algebra::GammaTGamma5},
{Gamma::Algebra::GammaTGamma5,Gamma::Algebra::Gamma5},
{Gamma::Algebra::Gamma5 ,Gamma::Algebra::GammaTGamma5}
};
Gamma G5(Gamma::Algebra::Gamma5);
LatticeComplex meson_CF(q1.Grid());
MesonFile MF;
for(int ch=0;ch<nchannel;ch++){
Gamma Gsrc(Gammas[ch][0]);
Gamma Gsnk(Gammas[ch][1]);
meson_CF = trace(G5*adj(q1)*G5*Gsnk*q2*adj(Gsrc));
std::vector<TComplex> meson_T;
sliceSum(meson_CF,meson_T, Tdir);
int nt=meson_T.size();
std::vector<Complex> corr(nt);
for(int t=0;t<nt;t++){
corr[t] = TensorRemove(meson_T[t]); // Yes this is ugly; not figured out a workaround
std::cout << " channel "<<ch<<" t "<<t<<" " <<corr[t]<<std::endl;
}
MF.data.push_back(corr);
}
{
XmlWriter WR(file);
write(WR,"MesonFile",MF);
}
}
void Meson3pt(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
{
const int nchannel=4;
Gamma::Algebra Gammas[nchannel][2] = {
{Gamma::Algebra::Gamma5 ,Gamma::Algebra::GammaX},
{Gamma::Algebra::Gamma5 ,Gamma::Algebra::GammaY},
{Gamma::Algebra::Gamma5 ,Gamma::Algebra::GammaZ},
{Gamma::Algebra::Gamma5 ,Gamma::Algebra::GammaT}
};
Gamma G5(Gamma::Algebra::Gamma5);
LatticeComplex meson_CF(q1.Grid());
MesonFile MF;
for(int ch=0;ch<nchannel;ch++){
Gamma Gsrc(Gammas[ch][0]);
Gamma Gsnk(Gammas[ch][1]);
meson_CF = trace(G5*adj(q1)*G5*Gsnk*q2*adj(Gsrc));
std::vector<TComplex> meson_T;
sliceSum(meson_CF,meson_T, Tdir);
int nt=meson_T.size();
std::vector<Complex> corr(nt);
for(int t=0;t<nt;t++){
corr[t] = TensorRemove(meson_T[t]); // Yes this is ugly; not figured out a workaround
std::cout << " channel "<<ch<<" t "<<t<<" " <<corr[t]<<std::endl;
}
MF.data.push_back(corr);
}
{
XmlWriter WR(file);
write(WR,"MesonFile",MF);
}
}
void WallSinkMesonTrace(std::string file,std::vector<Propagator> &q1,std::vector<Propagator> &q2)
{
const int nchannel=4;
Gamma::Algebra Gammas[nchannel][2] = {
{Gamma::Algebra::Gamma5 ,Gamma::Algebra::Gamma5},
{Gamma::Algebra::GammaTGamma5,Gamma::Algebra::GammaTGamma5},
{Gamma::Algebra::GammaTGamma5,Gamma::Algebra::Gamma5},
{Gamma::Algebra::Gamma5 ,Gamma::Algebra::GammaTGamma5}
};
Gamma G5(Gamma::Algebra::Gamma5);
int nt=q1.size();
std::vector<Complex> meson_CF(nt);
MesonFile MF;
for(int ch=0;ch<nchannel;ch++){
Gamma Gsrc(Gammas[ch][0]);
Gamma Gsnk(Gammas[ch][1]);
std::vector<Complex> corr(nt);
for(int t=0;t<nt;t++){
meson_CF[t] = trace(G5*adj(q1[t])*G5*Gsnk*q2[t]*adj(Gsrc));
corr[t] = TensorRemove(meson_CF[t]); // Yes this is ugly; not figured out a workaround
std::cout << " channel "<<ch<<" t "<<t<<" " <<corr[t]<<std::endl;
}
MF.data.push_back(corr);
}
{
XmlWriter WR(file);
write(WR,"MesonFile",MF);
}
}
int make_idx(int p, int m,int nmom)
{
if (m==0) return p;
assert(p==0);
return nmom + m - 1;
}
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
// Double precision grids
auto latt = GridDefaultLatt();
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
LatticeGaugeField Umu(UGrid);
LatticeGaugeField Utmp(UGrid);
LatticeGaugeField Usmr(UGrid);
std::string config;
if( argc > 1 && argv[1][0] != '-' )
{
std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl;
FieldMetaData header;
NerscIO::readConfiguration(Umu, header, argv[1]);
config=argv[1];
}
else
{
std::cout<<GridLogMessage <<"Using hot configuration"<<std::endl;
SU<Nc>::ColdConfiguration(Umu);
config="ColdConfig";
}
// GaugeFix(Umu,Utmp);
// Umu=Utmp;
int nsmr=3;
RealD rho=0.1;
LinkSmear(nsmr,rho,Umu,Usmr);
std::vector<int> smeared_link({ 0,0,1} );
std::vector<RealD> masses({ 0.004,0.02477,0.447} ); // u/d, s, c ??
std::vector<RealD> M5s ({ 1.8,1.8,1.0} );
std::vector<RealD> bs ({ 1.0,1.0,1.5} ); // DDM
std::vector<RealD> cs ({ 0.0,0.0,0.5} ); // DDM
std::vector<int> Ls_s ({ 16,16,12} );
std::vector<GridCartesian *> FGrids;
std::vector<GridRedBlackCartesian *> FrbGrids;
std::vector<Coordinate> momenta;
momenta.push_back(Coordinate({0,0,0,0}));
momenta.push_back(Coordinate({1,0,0,0}));
momenta.push_back(Coordinate({2,0,0,0}));
int nmass = masses.size();
int nmom = momenta.size();
std::vector<MobiusFermionR *> FermActs;
std::cout<<GridLogMessage <<"======================"<<std::endl;
std::cout<<GridLogMessage <<"MobiusFermion action as Scaled Shamir kernel"<<std::endl;
std::cout<<GridLogMessage <<"======================"<<std::endl;
std::vector<Complex> boundary = {1,1,1,-1};
typedef MobiusFermionR FermionAction;
FermionAction::ImplParams Params(boundary);
for(int m=0;m<masses.size();m++) {
RealD mass = masses[m];
RealD M5 = M5s[m];
RealD b = bs[m];
RealD c = cs[m];
int Ls = Ls_s[m];
if ( smeared_link[m] ) Utmp = Usmr;
else Utmp = Umu;
FGrids.push_back(SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid));
FrbGrids.push_back(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid));
FermActs.push_back(new MobiusFermionR(Utmp,*FGrids[m],*FrbGrids[m],*UGrid,*UrbGrid,mass,M5,b,c,Params));
}
LatticePropagator z2wall_source(UGrid);
LatticePropagator gfwall_source(UGrid);
LatticePropagator phased_prop(UGrid);
int tslice = 0;
int tseq=(tslice+16)%latt[Nd-1];
//////////////////////////////////////////////////////////////////////
// RNG seeded for Z2 wall
//////////////////////////////////////////////////////////////////////
// You can manage seeds however you like.
// Recommend SeedUniqueString.
//////////////////////////////////////////////////////////////////////
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString("Study2-Source_Z2_p_0_0_0_t_0-880");
Z2WallSource (RNG4,tslice,z2wall_source);
GFWallSource (tslice,gfwall_source);
std::vector<LatticeComplex> phase(nmom,UGrid);
for(int m=0;m<nmom;m++){
MakePhase(momenta[m],phase[m]);
}
std::vector<LatticePropagator> Z2Props (nmom+nmass-1,UGrid);
std::vector<LatticePropagator> GFProps (nmom+nmass-1,UGrid);
for(int p=0;p<nmom;p++) {
int m=0;
int idx = make_idx(p,m,nmom);
phased_prop = z2wall_source * phase[p];
Solve(*FermActs[m],phased_prop ,Z2Props[idx]);
phased_prop = gfwall_source * phase[p];
Solve(*FermActs[m],phased_prop ,GFProps[idx]);
}
for(int m=1;m<nmass;m++) {
int p=0;
int idx = make_idx(p,m,nmom);
phased_prop = z2wall_source;
Solve(*FermActs[m],phased_prop ,Z2Props[idx]);
phased_prop = gfwall_source;
Solve(*FermActs[m],phased_prop ,GFProps[idx]);
}
std::vector<std::vector<Propagator> > wsnk_z2Props(nmom+nmass-1);
std::vector<std::vector<Propagator> > wsnk_gfProps(nmom+nmass-1);
// Kaon and D two-point functions, point and wall sinks, with non-zero momentum
// WW stick momentum on m1 (lighter)
// zero momentum on m2
for(int m1=0;m1<nmass;m1++) {
for(int m2=m1;m2<nmass;m2++) {
int pmax = (m1==0)? nmom:1;
for(int p=0;p<pmax;p++){
std::stringstream ssg,ssz;
std::stringstream wssg,wssz;
int idx1 = make_idx(p,m1,nmom);
int idx2 = make_idx(0,m2,nmom);
/// Point sinks
ssg<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_p_gf_meson.xml";
ssz<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_p_z2_meson.xml";
MesonTrace(ssz.str(),Z2Props[idx1],Z2Props[idx2],phase[p]); // Q1 is conjugated
MesonTrace(ssg.str(),GFProps[idx1],GFProps[idx2],phase[p]);
/// Wall sinks
wssg<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_w_gf_meson.xml";
wssz<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_w_z2_meson.xml";
phased_prop = GFProps[m2] * phase[p];
sliceSum(phased_prop,wsnk_gfProps[m1],Tdir);
sliceSum(GFProps[m1],wsnk_gfProps[m2],Tdir);
WallSinkMesonTrace(wssg.str(),wsnk_gfProps[m1],wsnk_gfProps[m2]);
phased_prop = Z2Props[m2] * phase[p];
sliceSum(phased_prop,wsnk_z2Props[m1],Tdir);
sliceSum(Z2Props[m1],wsnk_z2Props[m2],Tdir);
WallSinkMesonTrace(wssz.str(),wsnk_z2Props[m1],wsnk_z2Props[m2]);
}
}}
/////////////////////////////////////
// Sequential solves
/////////////////////////////////////
LatticePropagator seq_wsnk_z2src(UGrid);
LatticePropagator seq_wsnk_gfsrc(UGrid);
LatticePropagator seq_psnk_z2src(UGrid);
LatticePropagator seq_psnk_gfsrc(UGrid);
LatticePropagator source(UGrid);
for(int m=0;m<nmass-1;m++){
int spect_idx = make_idx(0,m,nmom);
int charm=nmass-1;
SequentialSource(tseq,momenta[0],GFProps[spect_idx],source);
Solve(*FermActs[charm],source,seq_psnk_gfsrc);
SequentialSource(tseq,momenta[0],Z2Props[spect_idx],source);
Solve(*FermActs[charm],source,seq_psnk_z2src);
// Todo need wall sequential solve
for(int p=0;p<nmom;p++){
int active_idx = make_idx(p,0,nmom);
std::stringstream seq_3pt_p_z2;
std::stringstream seq_3pt_p_gf;
std::stringstream seq_3pt_w_z2;
std::stringstream seq_3pt_w_gf;
seq_3pt_p_z2 <<config<<"_3pt_p"<<p<< "_m" << m << "_p_z2_meson.xml";
seq_3pt_p_gf <<config<<"_3pt_p"<<p<< "_m" << m << "_p_gf_meson.xml";
seq_3pt_w_z2 <<config<<"_3pt_p"<<p<< "_m" << m << "_w_z2_meson.xml";
seq_3pt_w_gf <<config<<"_3pt_p"<<p<< "_m" << m << "_w_gf_meson.xml";
Meson3pt(seq_3pt_p_gf.str(),GFProps[active_idx],seq_psnk_gfsrc,phase[p]);
Meson3pt(seq_3pt_p_z2.str(),Z2Props[active_idx],seq_psnk_z2src,phase[p]);
}
}
Grid_finalize();
}
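A standalone sketch of the propagator indexing used by make_idx in the example above: the lightest mass occupies one slot per momentum (0..nmom-1), and the heavier masses at zero momentum follow, giving nmom+nmass-1 propagators in total.

#include <cassert>
#include <cstdio>

int make_idx(int p, int m, int nmom)
{
  if (m == 0) return p;        // lightest mass: one slot per momentum
  assert(p == 0);              // heavier masses only at zero momentum
  return nmom + m - 1;
}

int main() {
  const int nmom = 3, nmass = 3;
  for (int p = 0; p < nmom; p++)  std::printf("p=%d m=0 -> slot %d\n", p, make_idx(p, 0, nmom));
  for (int m = 1; m < nmass; m++) std::printf("p=0 m=%d -> slot %d\n", m, make_idx(0, m, nmom));
  // Total number of propagators stored: nmom + nmass - 1 = 5
  return 0;
}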

systems/Spock/comms.slurm Normal file
View File

@ -0,0 +1,26 @@
#!/bin/bash
# Begin LSF Directives
#SBATCH -A LGT104
#SBATCH -t 01:00:00
##SBATCH -U openmpThu
#SBATCH -p ecp
#SBATCH -J comms
#SBATCH -o comms.%J
#SBATCH -e comms.%J
#SBATCH -N 1
#SBATCH -n 2
DIR=.
module list
export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
export MPICH_GPU_SUPPORT_ENABLED=1
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
#export MPICH_SMP_SINGLE_COPY_MODE=CMA
export MPICH_SMP_SINGLE_COPY_MODE=NONE
export OMP_NUM_THREADS=8
AT=8
echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
PARAMS=" --accelerator-threads ${AT} --grid 64.64.32.32 --mpi 2.1.1.1 "
srun -n2 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_comms_host_device $PARAMS

View File

@ -0,0 +1,12 @@
../../configure --enable-comms=mpi-auto \
--enable-unified=no \
--enable-shm=nvlink \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--enable-simd=GPU \
--disable-fermion-reps \
--disable-gparity \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I/opt/rocm-4.3.0/include/ -std=c++14 -I${MPICH_DIR}/include " \
--prefix=/ccs/home/chulwoo/Grid \
LDFLAGS=" -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa "

systems/Spock/dwf.slurm Normal file
View File

@ -0,0 +1,26 @@
#!/bin/bash
# Begin LSF Directives
#SBATCH -A LGT104
#SBATCH -t 01:00:00
##SBATCH -U openmpThu
#SBATCH -p ecp
#SBATCH -J DWF
#SBATCH -o DWF.%J
#SBATCH -e DWF.%J
#SBATCH -N 1
#SBATCH -n 1
DIR=.
module list
export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
export MPICH_GPU_SUPPORT_ENABLED=1
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
#export MPICH_SMP_SINGLE_COPY_MODE=NONE
export MPICH_SMP_SINGLE_COPY_MODE=CMA
export OMP_NUM_THREADS=8
AT=8
echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
PARAMS=" --accelerator-threads ${AT} --grid 32.32.32.32 --mpi 1.1.1.1 --comms-overlap"
srun -n1 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS

systems/Spock/dwf4.slurm Normal file
View File

@ -0,0 +1,26 @@
#!/bin/bash
# Begin LSF Directives
#SBATCH -A LGT104
#SBATCH -t 01:00:00
##SBATCH -U openmpThu
#SBATCH -p ecp
#SBATCH -J DWF
#SBATCH -o DWF.%J
#SBATCH -e DWF.%J
#SBATCH -N 1
#SBATCH -n 4
DIR=.
module list
export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
export MPICH_GPU_SUPPORT_ENABLED=1
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
export MPICH_SMP_SINGLE_COPY_MODE=NONE
#export MPICH_SMP_SINGLE_COPY_MODE=CMA
export OMP_NUM_THREADS=8
AT=8
echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
PARAMS=" --accelerator-threads ${AT} --grid 32.32.64.64 --mpi 1.1.2.2 --comms-overlap --shm 2048 --shm-mpi 0"
srun -n4 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS

systems/Spock/dwf8.slurm Normal file
View File

@ -0,0 +1,26 @@
#!/bin/bash
# Begin LSF Directives
#SBATCH -A LGT104
#SBATCH -t 01:00:00
##SBATCH -U openmpThu
#SBATCH -p ecp
#SBATCH -J DWF
#SBATCH -o DWF.%J
#SBATCH -e DWF.%J
#SBATCH -N 2
#SBATCH -n 8
DIR=.
module list
export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
export MPICH_GPU_SUPPORT_ENABLED=1
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
export MPICH_SMP_SINGLE_COPY_MODE=NONE
#export MPICH_SMP_SINGLE_COPY_MODE=CMA
export OMP_NUM_THREADS=8
AT=8
echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
PARAMS=" --accelerator-threads ${AT} --grid 32.64.64.64 --mpi 1.2.2.2 --comms-overlap --shm 2048 --shm-mpi 0"
srun -n8 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS

systems/Spock/mpiwrapper.sh Executable file
View File

@ -0,0 +1,12 @@
#!/bin/bash
lrank=$SLURM_LOCALID
export ROCR_VISIBLE_DEVICES=$SLURM_LOCALID
echo "`hostname` - $lrank device=$ROCR_VISIBLE_DEVICES binding=$BINDING"
$*

View File

@ -0,0 +1,5 @@
module load PrgEnv-gnu
module load rocm/4.3.0
module load gmp
module load cray-fftw
module load craype-accel-amd-gfx908

View File

@ -235,7 +235,6 @@ void TestWhat(What & Ddwf,
pickCheckerboard(Odd ,chi_o,chi);
pickCheckerboard(Even,phi_e,phi);
pickCheckerboard(Odd ,phi_o,phi);
RealD t1,t2;
SchurDiagMooeeOperator<What,LatticeFermion> HermOpEO(Ddwf);
HermOpEO.MpcDagMpc(chi_e,dchi_e);

View File

@ -215,7 +215,6 @@ int main (int argc, char ** argv)
pickCheckerboard(Odd , chi_o, chi);
pickCheckerboard(Even, phi_e, phi);
pickCheckerboard(Odd , phi_o, phi);
RealD t1,t2;
SchurDiagMooeeOperator<DomainWallEOFAFermionR,LatticeFermion> HermOpEO(Ddwf);
HermOpEO.MpcDagMpc(chi_e, dchi_e);

View File

@ -212,8 +212,6 @@ int main (int argc, char ** argv)
pickCheckerboard(Odd ,chi_o,chi);
pickCheckerboard(Even,phi_e,phi);
pickCheckerboard(Odd ,phi_o,phi);
RealD t1,t2;
SchurDiagMooeeOperator<DomainWallFermionR,LatticeFermion> HermOpEO(Ddwf);
HermOpEO.MpcDagMpc(chi_e,dchi_e);

View File

@ -181,8 +181,8 @@ void checkAdj(const Gamma::Algebra a)
void checkProject(GridSerialRNG &rng)
{
SpinVector rv, recon, full;
HalfSpinVector hsp, hsm;
SpinVector rv, recon;
HalfSpinVector hsm;
random(rng, rv);

View File

@ -198,7 +198,6 @@ int main (int argc, char ** argv)
pickCheckerboard(Odd ,chi_o,chi);
pickCheckerboard(Even,phi_e,phi);
pickCheckerboard(Odd ,phi_o,phi);
RealD t1,t2;
SchurDiagMooeeOperator<GparityWilsonFermionR,FermionField> HermOpEO(Dw);
HermOpEO.MpcDagMpc(chi_e,dchi_e);

View File

@ -364,14 +364,12 @@ int main(int argc, char **argv) {
{ // Peek-ology and Poke-ology, with a little app-ology
Complex c;
ColourMatrix c_m;
SpinMatrix s_m;
SpinColourMatrix sc_m;
ColourMatrix c_m = Zero();
SpinMatrix s_m = Zero();
SpinColourMatrix sc_m = Zero();
s_m = TensorIndexRecursion<ColourIndex>::traceIndex(
sc_m); // Map to traceColour
c_m = TensorIndexRecursion<SpinIndex>::traceIndex(
sc_m); // map to traceSpin
s_m = TensorIndexRecursion<ColourIndex>::traceIndex(sc_m); // Map to traceColour
c_m = TensorIndexRecursion<SpinIndex>::traceIndex(sc_m); // map to traceSpin
c = TensorIndexRecursion<SpinIndex>::traceIndex(s_m);
c = TensorIndexRecursion<ColourIndex>::traceIndex(c_m);

View File

@ -217,7 +217,6 @@ int main (int argc, char ** argv)
pickCheckerboard(Odd , chi_o, chi);
pickCheckerboard(Even, phi_e, phi);
pickCheckerboard(Odd , phi_o, phi);
RealD t1,t2;
SchurDiagMooeeOperator<MobiusEOFAFermionR,LatticeFermion> HermOpEO(Ddwf);
HermOpEO.MpcDagMpc(chi_e, dchi_e);

View File

@ -262,7 +262,6 @@ int main (int argc, char ** argv)
pickCheckerboard(Odd ,chi_o,chi);
pickCheckerboard(Even,phi_e,phi);
pickCheckerboard(Odd ,phi_o,phi);
RealD t1,t2;
SchurDiagMooeeOperator<MobiusFermionR,LatticeFermion> HermOpEO(Ddwf);

View File

@ -144,7 +144,7 @@ int main (int argc, char ** argv)
Ds.Dhop(src,result,0);
}
double t1=usecond();
double t2;
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146
std::cout<<GridLogMessage << "Called Ds"<<std::endl;

View File

@ -162,7 +162,6 @@ int main (int argc, char ** argv)
}
double t1=usecond();
double t2;
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146
std::cout<<GridLogMessage << "Called Ds"<<std::endl;

View File

@ -30,7 +30,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
using namespace std;
using namespace Grid;
;
int main (int argc, char ** argv)
{
@ -135,7 +134,6 @@ int main (int argc, char ** argv)
Ds.Dhop(src,result,0);
}
double t1=usecond();
double t2;
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146
std::cout<<GridLogMessage << "Called Ds"<<std::endl;

View File

@ -204,7 +204,6 @@ int main (int argc, char ** argv)
pickCheckerboard(Odd ,chi_o,chi);
pickCheckerboard(Even,phi_e,phi);
pickCheckerboard(Odd ,phi_o,phi);
RealD t1,t2;
SchurDiagMooeeOperator<WilsonFermionR,LatticeFermion> HermOpEO(Dw);
HermOpEO.MpcDagMpc(chi_e,dchi_e);

View File

@ -205,7 +205,6 @@ int main (int argc, char ** argv)
pickCheckerboard(Odd ,chi_o,chi);
pickCheckerboard(Even,phi_e,phi);
pickCheckerboard(Odd ,phi_o,phi);
RealD t1,t2;
SchurDiagMooeeOperator<WilsonTMFermionR,LatticeFermion> HermOpEO(Dw);
HermOpEO.MpcDagMpc(chi_e,dchi_e);

View File

@ -276,7 +276,6 @@ int main (int argc, char ** argv)
pickCheckerboard(Odd ,chi_o,chi);
pickCheckerboard(Even,phi_e,phi);
pickCheckerboard(Odd ,phi_o,phi);
RealD t1,t2;
SchurDiagMooeeOperator<ZMobiusFermionR,LatticeFermion> HermOpEO(Ddwf);

View File

@ -57,7 +57,6 @@ int main (int argc, char ** argv)
SU<Nc>::HotConfiguration(pRNG,U);
double beta = 1.0;
double c1 = -0.331;
IwasakiGaugeActionR Action(beta);
// PlaqPlusRectangleActionR Action(beta,c1);

View File

@ -40,6 +40,7 @@ using namespace Grid;
template<class Fobj,class CComplex,int nbasis>
class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
public:
using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
@ -67,6 +68,8 @@ public:
template<class Fobj,class CComplex,int nbasis>
class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
public:
using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator ();
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field

View File

@ -55,6 +55,7 @@ RealD InverseApproximation(RealD x){
template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
{
public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator;
Matrix & _SmootherMatrix;
FineOperator & _SmootherOperator;
@ -78,6 +79,7 @@ public:
template<class Field,class Matrix> class MirsSmoother : public LinearFunction<Field>
{
public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator;
Matrix & SmootherMatrix;
FineOperator & SmootherOperator;
@ -108,6 +110,7 @@ public:
template<class Fobj,class CComplex,int nbasis, class Matrix, class Guesser, class CoarseSolver>
class MultiGridPreconditioner : public LinearFunction< Lattice<Fobj> > {
public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator;

View File

@ -56,9 +56,9 @@ template<class Field> class SolverWrapper : public LinearFunction<Field> {
private:
CheckerBoardedSparseMatrixBase<Field> & _Matrix;
SchurRedBlackBase<Field> & _Solver;
public:
/////////////////////////////////////////////////////
public:
using LinearFunction<Field>::operator();
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
SolverWrapper(CheckerBoardedSparseMatrixBase<Field> &Matrix,
@ -75,6 +75,7 @@ public:
template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
{
public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator;
Matrix & _SmootherMatrix;
FineOperator & _SmootherOperator;
@ -98,6 +99,7 @@ public:
template<class Field,class Matrix> class MirsSmoother : public LinearFunction<Field>
{
public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator;
Matrix & SmootherMatrix;
FineOperator & SmootherOperator;
@ -128,6 +130,7 @@ public:
template<class Fobj,class CComplex,int nbasis, class Matrix, class Guesser, class CoarseSolver>
class MultiGridPreconditioner : public LinearFunction< Lattice<Fobj> > {
public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator;

View File

@ -55,6 +55,7 @@ RealD InverseApproximation(RealD x){
template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
{
public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator;
Matrix & _SmootherMatrix;
FineOperator & _SmootherOperator;
@ -78,6 +79,7 @@ public:
template<class Field,class Matrix> class MirsSmoother : public LinearFunction<Field>
{
public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator;
Matrix & SmootherMatrix;
FineOperator & SmootherOperator;
@ -108,6 +110,8 @@ public:
template<class Fobj,class CComplex,int nbasis, class Matrix, class Guesser, class CoarseSolver>
class MultiGridPreconditioner : public LinearFunction< Lattice<Fobj> > {
public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator;

View File

@ -56,6 +56,7 @@ RealD InverseApproximation(RealD x){
template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
{
public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator;
Matrix & _SmootherMatrix;
FineOperator & _SmootherOperator;
@ -79,6 +80,7 @@ public:
template<class Field,class Matrix> class MirsSmoother : public LinearFunction<Field>
{
public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator;
Matrix & SmootherMatrix;
FineOperator & SmootherOperator;
@ -108,6 +110,7 @@ public:
template<class Field,class Matrix> class RedBlackSmoother : public LinearFunction<Field>
{
public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator;
Matrix & SmootherMatrix;
RealD tol;
@ -134,6 +137,7 @@ public:
template<class Fobj,class CComplex,int nbasis, class Matrix, class Guesser, class CoarseSolver>
class MultiGridPreconditioner : public LinearFunction< Lattice<Fobj> > {
public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator;
@ -241,7 +245,7 @@ int main (int argc, char ** argv)
Grid_init(&argc,&argv);
const int Ls=16;
const int rLs=8;
// const int rLs=8;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@ -388,7 +392,7 @@ int main (int argc, char ** argv)
// RedBlackSmoother<LatticeFermion,DomainWallFermionR> FineRBSmoother(0.00,0.001,100,Ddwf);
// Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space
ZeroGuesser<CoarseVector> CoarseZeroGuesser;
// ZeroGuesser<CoarseVector> CoarseZeroGuesser;
TwoLevelMG TwoLevelPrecon(Aggregates, LDOp,
HermIndefOp,Ddwf,
FineSmoother,

View File

@ -57,7 +57,7 @@ private:
CheckerBoardedSparseMatrixBase<Field> & _Matrix;
SchurRedBlackBase<Field> & _Solver;
public:
using LinearFunction<Field>::operator();
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
@ -75,6 +75,7 @@ public:
template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
{
public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator;
Matrix & _SmootherMatrix;
FineOperator & _SmootherOperator;
@ -98,6 +99,7 @@ public:
template<class Field,class Matrix> class MirsSmoother : public LinearFunction<Field>
{
public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator;
Matrix & SmootherMatrix;
FineOperator & SmootherOperator;
@ -128,6 +130,7 @@ public:
template<class Fobj,class CComplex,int nbasis, class Matrix, class Guesser, class CoarseSolver>
class MultiGridPreconditioner : public LinearFunction< Lattice<Fobj> > {
public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator;

View File

@ -55,6 +55,7 @@ RealD InverseApproximation(RealD x){
template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
{
public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator;
Matrix & _SmootherMatrix;
FineOperator & _SmootherOperator;
@ -78,6 +79,7 @@ public:
template<class Field,class Matrix> class MirsSmoother : public LinearFunction<Field>
{
public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator;
Matrix & SmootherMatrix;
FineOperator & SmootherOperator;
@ -108,6 +110,7 @@ public:
template<class Fobj,class CComplex,int nbasis, class Matrix, class Guesser, class CoarseSolver>
class MultiGridPreconditioner : public LinearFunction< Lattice<Fobj> > {
public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator;

View File

@ -57,6 +57,7 @@ private:
OperatorFunction<Field> & _Solver;
LinearFunction<Field> & _Guess;
public:
using LinearFunction<Field>::operator();
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
@ -118,6 +119,7 @@ RealD InverseApproximation(RealD x){
template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
{
public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator;
Matrix & _SmootherMatrix;
FineOperator & _SmootherOperator;
@ -174,6 +176,7 @@ public:
template<class Fobj,class CComplex,int nbasis, class CoarseSolver>
class HDCRPreconditioner : public LinearFunction< Lattice<Fobj> > {
public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator;

View File

@ -456,8 +456,8 @@ public:
siteVector *CBp=Stencil.CommBuf();
int ptype;
int nb2=nbasis/2;
// int ptype;
// int nb2=nbasis/2;
autoView(in_v , in, AcceleratorRead);
autoView(st, Stencil, AcceleratorRead);
@ -471,7 +471,7 @@ public:
typedef decltype(coalescedRead(in_v[0])) calcVector;
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
int sU = sF/Ls;
int s = sF%Ls;
// int s = sF%Ls;
calcComplex res = Zero();
calcVector nbr;
@ -517,14 +517,14 @@ public:
autoView(st, Stencil, AcceleratorRead);
siteVector *CBp=Stencil.CommBuf();
int ptype;
int nb2=nbasis/2;
// int ptype;
// int nb2=nbasis/2;
accelerator_for2d(sF, Coarse5D->oSites(), b, nbasis, Nsimd, {
typedef decltype(coalescedRead(in_v[0])) calcVector;
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
int sU = sF/Ls;
int s = sF%Ls;
// int s = sF%Ls;
calcComplex res = Zero();
@ -650,7 +650,7 @@ private:
OperatorFunction<Field> & _Solver;
LinearFunction<Field> & _Guess;
public:
using LinearFunction<Field>::operator();
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
@ -712,6 +712,7 @@ RealD InverseApproximation(RealD x){
template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
{
public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator;
Matrix & _SmootherMatrix;
FineOperator & _SmootherOperator;
@ -735,6 +736,7 @@ public:
template<class Fobj,class CComplex,int nbasis, class CoarseSolver>
class MGPreconditioner : public LinearFunction< Lattice<Fobj> > {
public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
@ -831,6 +833,7 @@ public:
template<class Fobj,class CComplex,int nbasis, class CoarseSolver>
class HDCRPreconditioner : public LinearFunction< Lattice<Fobj> > {
public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
@ -1174,18 +1177,18 @@ int main (int argc, char ** argv)
PlainHermOp<CoarseCoarseVector> IRLOpL2 (IRLHermOpL2);
ImplicitlyRestartedLanczos<CoarseCoarseVector> IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20);
int cNconv;
cNm=0;
std::vector<RealD> eval2(cNm);
std::vector<CoarseCoarseVector> evec2(cNm,CoarseCoarse5d);
cc_src=1.0;
// int cNconv;
// IRLL2.calc(eval2,evec2,cc_src,cNconv);
ConjugateGradient<CoarseCoarseVector> CoarseCoarseCG(0.02,10000);
DeflatedGuesser<CoarseCoarseVector> DeflCoarseCoarseGuesser(evec2,eval2);
NormalEquations<CoarseCoarseVector> DeflCoarseCoarseCGNE(cc_Dwf,CoarseCoarseCG,DeflCoarseCoarseGuesser);
ZeroGuesser<CoarseVector> CoarseZeroGuesser;
// ZeroGuesser<CoarseVector> CoarseZeroGuesser;
ZeroGuesser<CoarseCoarseVector> CoarseCoarseZeroGuesser;
std::cout<<GridLogMessage << "**************************************************"<< std::endl;

View File

@ -456,8 +456,8 @@ public:
siteVector *CBp=Stencil.CommBuf();
int ptype;
int nb2=nbasis/2;
//int ptype;
// int nb2=nbasis/2;
autoView(in_v , in, AcceleratorRead);
autoView(st, Stencil, AcceleratorRead);
@ -471,7 +471,7 @@ public:
typedef decltype(coalescedRead(in_v[0])) calcVector;
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
int sU = sF/Ls;
int s = sF%Ls;
// int s = sF%Ls;
calcComplex res = Zero();
calcVector nbr;
@ -517,14 +517,14 @@ public:
autoView(st, Stencil, AcceleratorRead);
siteVector *CBp=Stencil.CommBuf();
int ptype;
int nb2=nbasis/2;
// int ptype;
// int nb2=nbasis/2;
accelerator_for2d(sF, Coarse5D->oSites(), b, nbasis, Nsimd, {
typedef decltype(coalescedRead(in_v[0])) calcVector;
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
int sU = sF/Ls;
int s = sF%Ls;
// int s = sF%Ls;
calcComplex res = Zero();
@ -648,7 +648,7 @@ private:
CheckerBoardedSparseMatrixBase<Field> & _Matrix;
SchurRedBlackBase<Field> & _Solver;
public:
using LinearFunction<Field>::operator();
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
@ -669,6 +669,7 @@ private:
OperatorFunction<Field> & _Solver;
LinearFunction<Field> & _Guess;
public:
using LinearFunction<Field>::operator();
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
@ -731,6 +732,7 @@ RealD InverseApproximation(RealD x){
template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
{
public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator;
Matrix & _SmootherMatrix;
FineOperator & _SmootherOperator;
@ -754,6 +756,7 @@ public:
template<class Fobj,class CComplex,int nbasis, class CoarseSolver>
class MGPreconditioner : public LinearFunction< Lattice<Fobj> > {
public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
@ -850,7 +853,8 @@ public:
template<class Fobj,class CComplex,int nbasis, class CoarseSolver>
class HDCRPreconditioner : public LinearFunction< Lattice<Fobj> > {
public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseMatrix CoarseMatrix;
@ -1194,11 +1198,11 @@ int main (int argc, char ** argv)
PlainHermOp<CoarseCoarseVector> IRLOpL2 (IRLHermOpL2);
ImplicitlyRestartedLanczos<CoarseCoarseVector> IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20);
int cNconv;
cNm=0;
std::vector<RealD> eval2(cNm);
std::vector<CoarseCoarseVector> evec2(cNm,CoarseCoarse5d);
cc_src=1.0;
// int cNconv;
// IRLL2.calc(eval2,evec2,cc_src,cNconv);
std::vector<RealD> tols ({0.005,0.001});
@ -1218,10 +1222,10 @@ int main (int argc, char ** argv)
for(auto c_hi : c_his ) {
for(auto f_lo : f_los ) {
for(auto f_hi : f_his ) {
ZeroGuesser<CoarseVector> CoarseZeroGuesser;
ZeroGuesser<CoarseCoarseVector> CoarseCoarseZeroGuesser;
// ZeroGuesser<CoarseVector> CoarseZeroGuesser;
// ZeroGuesser<CoarseCoarseVector> CoarseCoarseZeroGuesser;
ConjugateGradient<CoarseCoarseVector> CoarseCoarseCG(tol,10000);
ZeroGuesser<CoarseCoarseVector> CoarseCoarseGuesser;
// ZeroGuesser<CoarseCoarseVector> CoarseCoarseGuesser;
SchurRedBlackDiagMooeeSolve<CoarseCoarseVector> CoarseCoarseRBCG(CoarseCoarseCG);
SchurSolverWrapper<CoarseCoarseVector> CoarseCoarseSolver(cc_Dwf,CoarseCoarseRBCG);

View File

@ -143,6 +143,7 @@ public:
template<class Field> class MultiGridPreconditionerBase : public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
virtual ~MultiGridPreconditionerBase() = default;
virtual void setup() = 0;
virtual void operator()(Field const &in, Field &out) = 0;
@ -156,6 +157,7 @@ public:
/////////////////////////////////////////////
// Type Definitions
/////////////////////////////////////////////
using MultiGridPreconditionerBase<Lattice<Fobj>>::operator();
// clang-format off
typedef Aggregation<Fobj, CComplex, nBasis> Aggregates;
@ -568,6 +570,7 @@ public:
/////////////////////////////////////////////
// Type Definitions
/////////////////////////////////////////////
using MultiGridPreconditionerBase<Lattice<Fobj>>::operator();
typedef Matrix FineDiracMatrix;
typedef Lattice<Fobj> FineVector;

View File

@ -56,7 +56,6 @@ int main (int argc, char ** argv)
QuasiMinimalResidual<LatticeFermion> QMR(1.0e-8,10000);
RealD mass=0.0;
RealD M5=1.8;
WilsonFermionR Dw(Umu,*Grid,*rbGrid,mass);
NonHermitianLinearOperator<WilsonFermionR,LatticeFermion> NonHermOp(Dw);