
Merge branch 'develop' of github.com:paboyle/Grid into develop

This commit is contained in:
Antonin Portelli 2021-10-29 13:01:34 +01:00
commit a65a497bae
64 changed files with 1580 additions and 302 deletions

View File

@@ -358,7 +358,7 @@ public:
autoView( in_v , in, AcceleratorRead);
autoView( out_v , out, AcceleratorWrite);
autoView( Stencil_v , Stencil, AcceleratorRead);
- auto& geom_v = geom;
+ int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
@@ -380,7 +380,7 @@ public:
int ptype;
StencilEntry *SE;
- for(int point=0;point<geom_v.npoint;point++){
+ for(int point=0;point<npoint;point++){
SE=Stencil_v.GetEntry(ptype,point,ss);
@@ -424,7 +424,7 @@ public:
autoView( in_v , in, AcceleratorRead);
autoView( out_v , out, AcceleratorWrite);
autoView( Stencil_v , Stencil, AcceleratorRead);
- auto& geom_v = geom;
+ int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
@@ -454,7 +454,7 @@ public:
int ptype;
StencilEntry *SE;
- for(int p=0;p<geom_v.npoint;p++){
+ for(int p=0;p<npoint;p++){
int point = points_p[p];
SE=Stencil_v.GetEntry(ptype,point,ss);

View File

@@ -52,6 +52,7 @@ public:
virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
virtual void HermOp(const Field &in, Field &out)=0;
+ virtual ~LinearOperatorBase(){};
};
@@ -507,7 +508,7 @@ class SchurStaggeredOperator : public SchurOperatorBase<Field> {
virtual void MpcDag (const Field &in, Field &out){
Mpc(in,out);
}
- virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
+ virtual void MpcDagMpc(const Field &in, Field &out) {
assert(0);// Never need with staggered
}
};
@@ -585,6 +586,7 @@ class HermOpOperatorFunction : public OperatorFunction<Field> {
template<typename Field>
class PlainHermOp : public LinearFunction<Field> {
public:
+ using LinearFunction<Field>::operator();
LinearOperatorBase<Field> &_Linop;
PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop)
@@ -598,6 +600,7 @@ public:
template<typename Field>
class FunctionHermOp : public LinearFunction<Field> {
public:
+ using LinearFunction<Field>::operator();
OperatorFunction<Field> & _poly;
LinearOperatorBase<Field> &_Linop;

View File

@@ -30,13 +30,19 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
+ template<class Field> using Preconditioner = LinearFunction<Field> ;
+ /*
template<class Field> class Preconditioner : public LinearFunction<Field> {
+ using LinearFunction<Field>::operator();
virtual void operator()(const Field &src, Field & psi)=0;
};
+ */
template<class Field> class TrivialPrecon : public Preconditioner<Field> {
public:
- void operator()(const Field &src, Field & psi){
+ using Preconditioner<Field>::operator();
+ virtual void operator()(const Field &src, Field & psi){
psi = src;
}
TrivialPrecon(void){};

View File

@@ -48,6 +48,7 @@ public:
virtual void Mdiag (const Field &in, Field &out)=0;
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;
+ virtual ~SparseMatrixBase() {};
};
/////////////////////////////////////////////////////////////////////////////////////////////
@@ -72,7 +73,7 @@ public:
virtual void MeooeDag (const Field &in, Field &out)=0;
virtual void MooeeDag (const Field &in, Field &out)=0;
virtual void MooeeInvDag (const Field &in, Field &out)=0;
+ virtual ~CheckerBoardedSparseMatrixBase() {};
};
NAMESPACE_END(Grid);

View File

@@ -36,7 +36,8 @@ NAMESPACE_BEGIN(Grid);
template<class FieldD, class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionBiCGSTAB : public LinearFunction<FieldD>
{
public:
+ using LinearFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; // Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;

View File

@@ -35,7 +35,8 @@ NAMESPACE_BEGIN(Grid);
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
public:
+ using LinearFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;

View File

@@ -33,16 +33,19 @@ namespace Grid {
template<class Field>
class ZeroGuesser: public LinearFunction<Field> {
public:
+ using LinearFunction<Field>::operator();
virtual void operator()(const Field &src, Field &guess) { guess = Zero(); };
};
template<class Field>
class DoNothingGuesser: public LinearFunction<Field> {
public:
+ using LinearFunction<Field>::operator();
virtual void operator()(const Field &src, Field &guess) { };
};
template<class Field>
class SourceGuesser: public LinearFunction<Field> {
public:
+ using LinearFunction<Field>::operator();
virtual void operator()(const Field &src, Field &guess) { guess = src; };
};
@@ -57,6 +60,7 @@ private:
const unsigned int N;
public:
+ using LinearFunction<Field>::operator();
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval)
: DeflatedGuesser(_evec, _eval, _evec.size())
@@ -87,6 +91,7 @@ private:
const std::vector<RealD> &eval_coarse;
public:
+ using LinearFunction<FineField>::operator();
LocalCoherenceDeflatedGuesser(const std::vector<FineField> &_subspace,
const std::vector<CoarseField> &_evec_coarse,
const std::vector<RealD> &_eval_coarse)

View File

@@ -67,6 +67,7 @@ public:
template<class Fobj,class CComplex,int nbasis>
class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
public:
+ using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
@@ -97,6 +98,7 @@ public:
template<class Fobj,class CComplex,int nbasis>
class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
public:
+ using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field

View File

@@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
template<class Field>
class PrecGeneralisedConjugateResidual : public LinearFunction<Field> {
public:
+ using LinearFunction<Field>::operator();
RealD Tolerance;
Integer MaxIterations;
int verbose;

View File

@@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
template<class Field>
class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
public:
+ using LinearFunction<Field>::operator();
RealD Tolerance;
Integer MaxIterations;
int verbose;
@@ -119,7 +119,8 @@ public:
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
RealD cp;
- ComplexD a, b, zAz;
+ ComplexD a, b;
+ // ComplexD zAz;
RealD zAAz;
ComplexD rq;
@@ -146,7 +147,7 @@ public:
//////////////////////////////////
MatTimer.Start();
Linop.Op(psi,Az);
- zAz = innerProduct(Az,psi);
+ // zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
MatTimer.Stop();
@@ -170,7 +171,7 @@ public:
LinalgTimer.Start();
- zAz = innerProduct(Az,psi);
+ // zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
//p[0],q[0],qq[0]
@@ -212,7 +213,7 @@ public:
MatTimer.Start();
Linop.Op(z,Az);
MatTimer.Stop();
- zAz = innerProduct(Az,psi);
+ // zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
LinalgTimer.Start();

View File

@@ -159,7 +159,6 @@ void MemoryManager::Init(void)
char * str;
int Nc;
- int NcS;
str= getenv("GRID_ALLOC_NCACHE_LARGE");
if ( str ) {

View File

@@ -389,7 +389,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
- acceleratorCopySynchronise(); // MPI prob slower
}
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
@@ -405,6 +404,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
+ acceleratorCopySynchronise();
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
list.resize(0);

View File

@@ -42,7 +42,6 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
std::cout << GridLogDebug << "\twarpSize = " << warpSize << std::endl;
std::cout << GridLogDebug << "\tsharedMemPerBlock = " << sharedMemPerBlock << std::endl;
std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << maxThreadsPerBlock << std::endl;
- std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << warpSize << std::endl;
std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
if (warpSize != WARP_SIZE) {
@@ -52,6 +51,10 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
// let the number of threads in a block be a multiple of 2, starting from warpSize
threads = warpSize;
+ if ( threads*sizeofsobj > sharedMemPerBlock ) {
+ std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
+ exit(EXIT_FAILURE);
+ }
while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
// keep all the streaming multiprocessors busy
blocks = nextPow2(multiProcessorCount);

View File

@@ -85,6 +85,76 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
});
}
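// Device-side analogues of pickCheckerboard/setCheckerboard: the two functions
// below copy the sites of one parity (cb) between a full lattice and a
// checkerboarded half lattice inside a single accelerator_for loop, so the
// data never takes a host-side round trip.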
template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0)
{
half.Checkerboard() = cb;
autoView(half_v, half, AcceleratorWrite);
autoView(full_v, full, AcceleratorRead);
Coordinate rdim_full = full.Grid()->_rdimensions;
Coordinate rdim_half = half.Grid()->_rdimensions;
unsigned long ndim_half = half.Grid()->_ndimension;
Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
Coordinate ostride_half = half.Grid()->_ostride;
accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
Coordinate coor;
int cbos;
int linear=0;
Lexicographic::CoorFromIndex(coor,ss,rdim_full);
assert(coor.size()==ndim_half);
for(int d=0;d<ndim_half;d++){
if(checker_dim_mask_half[d]) linear += coor[d];
}
cbos = (linear&0x1);
if (cbos==cb) {
int ssh=0;
for(int d=0;d<ndim_half;d++) {
if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
}
coalescedWrite(half_v[ssh],full_v(ss));
}
});
}
template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0)
{
int cb = half.Checkerboard();
autoView(half_v , half, AcceleratorRead);
autoView(full_v , full, AcceleratorWrite);
Coordinate rdim_full = full.Grid()->_rdimensions;
Coordinate rdim_half = half.Grid()->_rdimensions;
unsigned long ndim_half = half.Grid()->_ndimension;
Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
Coordinate ostride_half = half.Grid()->_ostride;
accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
Coordinate coor;
int cbos;
int linear=0;
Lexicographic::CoorFromIndex(coor,ss,rdim_full);
assert(coor.size()==ndim_half);
for(int d=0;d<ndim_half;d++){
if(checker_dim_mask_half[d]) linear += coor[d];
}
cbos = (linear&0x1);
if (cbos==cb) {
int ssh=0;
for(int d=0;d<ndim_half;d++){
if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
}
coalescedWrite(full_v[ss],half_v(ssh));
}
});
}
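// Hypothetical usage sketch (not part of this commit; grids and fields are
// assumed to be set up elsewhere, Even/Odd are the usual Grid parities):
//   LatticeFermion full(&FineGrid), half(&RBGrid);
//   acceleratorPickCheckerboard(Even, half, full); // device-side pickCheckerboard
//   acceleratorSetCheckerboard(full, half);        // device-side setCheckerboard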
////////////////////////////////////////////////////////////////////////////////////////////
// Flexible Type Conversion for internal promotion to double as well as graceful
// treatment of scalar-compatible types

View File

@@ -828,6 +828,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
#if (!defined(GRID_HIP))
int tshift = (mu == Nd-1) ? 1 : 0;
+ unsigned int LLt = GridDefaultLatt()[Tp];
////////////////////////////////////////////////
// GENERAL CAYLEY CASE
////////////////////////////////////////////////
@@ -880,7 +881,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
}
std::vector<RealD> G_s(Ls,1.0);
- RealD sign = 1; // sign flip for vector/tadpole
+ RealD sign = 1.0; // sign flip for vector/tadpole
if ( curr_type == Current::Axial ) {
for(int s=0;s<Ls/2;s++){
G_s[s] = -1.0;
@@ -890,7 +891,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
auto b=this->_b;
auto c=this->_c;
if ( b == 1 && c == 0 ) {
- sign = -1;
+ sign = -1.0;
}
else {
std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl;
@@ -934,7 +935,13 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
tmp = Cshift(tmp,mu,-1);
Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
tmp = -G_s[s]*( Utmp + gmu*Utmp );
- tmp = where((lcoor>=tmin+tshift),tmp,zz); // Mask the time
+ // Mask the time
+ if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
+ unsigned int t0 = 0;
+ tmp = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
+ } else {
+ tmp = where((lcoor>=tmin+tshift),tmp,zz);
+ }
L_Q += where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
InsertSlice(L_Q, q_out, s , 0);

View File

@ -77,23 +77,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define REGISTER #define REGISTER
#ifdef GRID_SIMT #ifdef GRID_SIMT
#define LOAD_CHIMU(ptype) \ #define LOAD_CHIMU(Ptype) \
{const SiteSpinor & ref (in[offset]); \ {const SiteSpinor & ref (in[offset]); \
Chimu_00=coalescedReadPermute<ptype>(ref()(0)(0),perm,lane); \ Chimu_00=coalescedReadPermute<Ptype>(ref()(0)(0),perm,lane); \
Chimu_01=coalescedReadPermute<ptype>(ref()(0)(1),perm,lane); \ Chimu_01=coalescedReadPermute<Ptype>(ref()(0)(1),perm,lane); \
Chimu_02=coalescedReadPermute<ptype>(ref()(0)(2),perm,lane); \ Chimu_02=coalescedReadPermute<Ptype>(ref()(0)(2),perm,lane); \
Chimu_10=coalescedReadPermute<ptype>(ref()(1)(0),perm,lane); \ Chimu_10=coalescedReadPermute<Ptype>(ref()(1)(0),perm,lane); \
Chimu_11=coalescedReadPermute<ptype>(ref()(1)(1),perm,lane); \ Chimu_11=coalescedReadPermute<Ptype>(ref()(1)(1),perm,lane); \
Chimu_12=coalescedReadPermute<ptype>(ref()(1)(2),perm,lane); \ Chimu_12=coalescedReadPermute<Ptype>(ref()(1)(2),perm,lane); \
Chimu_20=coalescedReadPermute<ptype>(ref()(2)(0),perm,lane); \ Chimu_20=coalescedReadPermute<Ptype>(ref()(2)(0),perm,lane); \
Chimu_21=coalescedReadPermute<ptype>(ref()(2)(1),perm,lane); \ Chimu_21=coalescedReadPermute<Ptype>(ref()(2)(1),perm,lane); \
Chimu_22=coalescedReadPermute<ptype>(ref()(2)(2),perm,lane); \ Chimu_22=coalescedReadPermute<Ptype>(ref()(2)(2),perm,lane); \
Chimu_30=coalescedReadPermute<ptype>(ref()(3)(0),perm,lane); \ Chimu_30=coalescedReadPermute<Ptype>(ref()(3)(0),perm,lane); \
Chimu_31=coalescedReadPermute<ptype>(ref()(3)(1),perm,lane); \ Chimu_31=coalescedReadPermute<Ptype>(ref()(3)(1),perm,lane); \
Chimu_32=coalescedReadPermute<ptype>(ref()(3)(2),perm,lane); } Chimu_32=coalescedReadPermute<Ptype>(ref()(3)(2),perm,lane); }
#define PERMUTE_DIR(dir) ; #define PERMUTE_DIR(dir) ;
#else #else
#define LOAD_CHIMU(ptype) \ #define LOAD_CHIMU(Ptype) \
{const SiteSpinor & ref (in[offset]); \ {const SiteSpinor & ref (in[offset]); \
Chimu_00=ref()(0)(0);\ Chimu_00=ref()(0)(0);\
Chimu_01=ref()(0)(1);\ Chimu_01=ref()(0)(1);\
@ -109,12 +109,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
Chimu_32=ref()(3)(2);} Chimu_32=ref()(3)(2);}
#define PERMUTE_DIR(dir) \ #define PERMUTE_DIR(dir) \
permute##dir(Chi_00,Chi_00); \ permute##dir(Chi_00,Chi_00); \
permute##dir(Chi_01,Chi_01);\ permute##dir(Chi_01,Chi_01); \
permute##dir(Chi_02,Chi_02);\ permute##dir(Chi_02,Chi_02); \
permute##dir(Chi_10,Chi_10); \ permute##dir(Chi_10,Chi_10); \
permute##dir(Chi_11,Chi_11);\ permute##dir(Chi_11,Chi_11); \
permute##dir(Chi_12,Chi_12); permute##dir(Chi_12,Chi_12);
#endif #endif
@ -371,88 +371,91 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
result_32-= UChi_12; result_32-= UChi_12;
#define HAND_STENCIL_LEGB(PROJ,PERM,DIR,RECON) \ #define HAND_STENCIL_LEGB(PROJ,PERM,DIR,RECON) \
SE=st.GetEntry(ptype,DIR,ss); \ {int ptype; \
offset = SE->_offset; \ SE=st.GetEntry(ptype,DIR,ss); \
local = SE->_is_local; \ auto offset = SE->_offset; \
perm = SE->_permute; \ auto local = SE->_is_local; \
if ( local ) { \ auto perm = SE->_permute; \
LOAD_CHIMU(PERM); \ if ( local ) { \
PROJ; \ LOAD_CHIMU(PERM); \
if ( perm) { \ PROJ; \
PERMUTE_DIR(PERM); \ if ( perm) { \
} \ PERMUTE_DIR(PERM); \
} else { \ } \
LOAD_CHI; \ } else { \
} \ LOAD_CHI; \
acceleratorSynchronise(); \ } \
MULT_2SPIN(DIR); \ acceleratorSynchronise(); \
RECON; MULT_2SPIN(DIR); \
RECON; }
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \ #define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \
SE=&st_p[DIR+8*ss]; \ { SE=&st_p[DIR+8*ss]; \
ptype=st_perm[DIR]; \ auto ptype=st_perm[DIR]; \
offset = SE->_offset; \ auto offset = SE->_offset; \
local = SE->_is_local; \ auto local = SE->_is_local; \
perm = SE->_permute; \ auto perm = SE->_permute; \
if ( local ) { \ if ( local ) { \
LOAD_CHIMU(PERM); \ LOAD_CHIMU(PERM); \
PROJ; \ PROJ; \
if ( perm) { \ if ( perm) { \
PERMUTE_DIR(PERM); \ PERMUTE_DIR(PERM); \
} \ } \
} else { \ } else { \
LOAD_CHI; \ LOAD_CHI; \
} \ } \
acceleratorSynchronise(); \ acceleratorSynchronise(); \
MULT_2SPIN(DIR); \ MULT_2SPIN(DIR); \
RECON; RECON; }
#define HAND_STENCIL_LEGA(PROJ,PERM,DIR,RECON) \ #define HAND_STENCIL_LEGA(PROJ,PERM,DIR,RECON) \
SE=&st_p[DIR+8*ss]; \ { SE=&st_p[DIR+8*ss]; \
ptype=st_perm[DIR]; \ auto ptype=st_perm[DIR]; \
/*SE=st.GetEntry(ptype,DIR,ss);*/ \ /*SE=st.GetEntry(ptype,DIR,ss);*/ \
offset = SE->_offset; \ auto offset = SE->_offset; \
perm = SE->_permute; \ auto perm = SE->_permute; \
LOAD_CHIMU(PERM); \ LOAD_CHIMU(PERM); \
PROJ; \ PROJ; \
MULT_2SPIN(DIR); \ MULT_2SPIN(DIR); \
RECON; RECON; }
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \ #define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \
SE=st.GetEntry(ptype,DIR,ss); \ { int ptype; \
offset = SE->_offset; \ SE=st.GetEntry(ptype,DIR,ss); \
local = SE->_is_local; \ auto offset = SE->_offset; \
perm = SE->_permute; \ auto local = SE->_is_local; \
if ( local ) { \ auto perm = SE->_permute; \
LOAD_CHIMU(PERM); \ if ( local ) { \
PROJ; \ LOAD_CHIMU(PERM); \
if ( perm) { \ PROJ; \
PERMUTE_DIR(PERM); \ if ( perm) { \
} \ PERMUTE_DIR(PERM); \
} else if ( st.same_node[DIR] ) { \ } \
LOAD_CHI; \ } else if ( st.same_node[DIR] ) { \
} \ LOAD_CHI; \
acceleratorSynchronise(); \ } \
if (local || st.same_node[DIR] ) { \ acceleratorSynchronise(); \
MULT_2SPIN(DIR); \ if (local || st.same_node[DIR] ) { \
RECON; \ MULT_2SPIN(DIR); \
} \ RECON; \
acceleratorSynchronise(); } \
acceleratorSynchronise(); }
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \ #define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \
SE=st.GetEntry(ptype,DIR,ss); \ { int ptype; \
offset = SE->_offset; \ SE=st.GetEntry(ptype,DIR,ss); \
if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \ auto offset = SE->_offset; \
LOAD_CHI; \ if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \
MULT_2SPIN(DIR); \ LOAD_CHI; \
RECON; \ MULT_2SPIN(DIR); \
nmu++; \ RECON; \
} \ nmu++; \
acceleratorSynchronise(); } \
acceleratorSynchronise(); }
#define HAND_RESULT(ss) \ #define HAND_RESULT(ss) \
{ \ { \
SiteSpinor & ref (out[ss]); \ SiteSpinor & ref (out[ss]); \
coalescedWrite(ref()(0)(0),result_00,lane); \ coalescedWrite(ref()(0)(0),result_00,lane); \
coalescedWrite(ref()(0)(1),result_01,lane); \ coalescedWrite(ref()(0)(1),result_01,lane); \
coalescedWrite(ref()(0)(2),result_02,lane); \ coalescedWrite(ref()(0)(2),result_02,lane); \
@ -563,7 +566,6 @@ WilsonKernels<Impl>::HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p,
HAND_DECLARATIONS(Simt); HAND_DECLARATIONS(Simt);
int offset,local,perm, ptype;
StencilEntry *SE; StencilEntry *SE;
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON); HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM); HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
@ -593,9 +595,7 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
HAND_DECLARATIONS(Simt); HAND_DECLARATIONS(Simt);
int offset,local,perm, ptype;
StencilEntry *SE; StencilEntry *SE;
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON); HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM); HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
@ -623,8 +623,6 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
HAND_DECLARATIONS(Simt); HAND_DECLARATIONS(Simt);
StencilEntry *SE; StencilEntry *SE;
int offset,local,perm, ptype;
HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON); HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM); HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
@ -640,8 +638,8 @@ template<class Impl> accelerator_inline void
WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{ {
auto st_p = st._entries_p; // auto st_p = st._entries_p;
auto st_perm = st._permute_type; // auto st_perm = st._permute_type;
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V; typedef typename Simd::vector_type V;
@ -652,7 +650,6 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si
HAND_DECLARATIONS(Simt); HAND_DECLARATIONS(Simt);
int offset,local,perm, ptype;
StencilEntry *SE; StencilEntry *SE;
ZERO_RESULT; ZERO_RESULT;
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM); HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
@ -670,8 +667,8 @@ template<class Impl> accelerator_inline
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{ {
auto st_p = st._entries_p; // auto st_p = st._entries_p;
auto st_perm = st._permute_type; // auto st_perm = st._permute_type;
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V; typedef typename Simd::vector_type V;
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
@ -682,7 +679,6 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
HAND_DECLARATIONS(Simt); HAND_DECLARATIONS(Simt);
StencilEntry *SE; StencilEntry *SE;
int offset,local,perm, ptype;
ZERO_RESULT; ZERO_RESULT;
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM); HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM); HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
@ -699,8 +695,8 @@ template<class Impl> accelerator_inline void
WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{ {
auto st_p = st._entries_p; // auto st_p = st._entries_p;
auto st_perm = st._permute_type; // auto st_perm = st._permute_type;
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V; typedef typename Simd::vector_type V;
@ -711,7 +707,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si
HAND_DECLARATIONS(Simt); HAND_DECLARATIONS(Simt);
int offset, ptype; // int offset, ptype;
StencilEntry *SE; StencilEntry *SE;
int nmu=0; int nmu=0;
ZERO_RESULT; ZERO_RESULT;
@ -730,8 +726,8 @@ template<class Impl> accelerator_inline
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{ {
auto st_p = st._entries_p; // auto st_p = st._entries_p;
auto st_perm = st._permute_type; // auto st_perm = st._permute_type;
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V; typedef typename Simd::vector_type V;
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
@ -742,7 +738,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldVi
HAND_DECLARATIONS(Simt); HAND_DECLARATIONS(Simt);
StencilEntry *SE; StencilEntry *SE;
int offset, ptype; // int offset, ptype;
int nmu=0; int nmu=0;
ZERO_RESULT; ZERO_RESULT;
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM); HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);

View File

@@ -78,6 +78,8 @@ public:
typedef Lattice<SiteLink> LinkField;
typedef Lattice<SiteField> Field;
+ typedef SU<Nrepresentation> Group;
// Guido: we can probably separate the types from the HMC functions
// this will create 2 kind of implementations
// probably confusing the users
@@ -118,7 +120,7 @@ public:
LinkField Pmu(P.Grid());
Pmu = Zero();
for (int mu = 0; mu < Nd; mu++) {
- SU<Nrepresentation>::GaussianFundamentalLieAlgebraMatrix(pRNG, Pmu);
+ Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pmu);
RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR) ;
Pmu = Pmu*scale;
PokeIndex<LorentzIndex>(P, Pmu, mu);
@@ -159,15 +161,15 @@ public:
}
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
- SU<Nc>::HotConfiguration(pRNG, U);
+ Group::HotConfiguration(pRNG, U);
}
static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
- SU<Nc>::TepidConfiguration(pRNG, U);
+ Group::TepidConfiguration(pRNG, U);
}
static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
- SU<Nc>::ColdConfiguration(pRNG, U);
+ Group::ColdConfiguration(pRNG, U);
}
};

View File

@@ -1,61 +1,63 @@
- Using HMC in Grid version 0.5.1
+ # Using HMC in Grid
- These are the instructions to use the Generalised HMC on Grid version 0.5.1.
+ These are the instructions to use the Generalised HMC on Grid as of commit `749b802`.
- Disclaimer: GRID is still under active development so any information here can be changed in future releases.
+ Disclaimer: Grid is still under active development so any information here can be changed in future releases.
- Command line options
- ===================
+ ## Command line options
- (relevant file GenericHMCrunner.h)
+ (relevant file `GenericHMCrunner.h`)
The initial configuration can be changed at the command line using
- --StartType <your choice>
- valid choices, one among these
- HotStart, ColdStart, TepidStart, CheckpointStart
- default: HotStart
+ `--StartingType STARTING_TYPE`, where `STARTING_TYPE` is one of
+ `HotStart`, `ColdStart`, `TepidStart`, and `CheckpointStart`.
+ Default: `--StartingType HotStart`
- example
- ./My_hmc_exec --StartType HotStart
+ Example:
+ ```
+ ./My_hmc_exec --StartingType HotStart
+ ```
- The CheckpointStart option uses the prefix for the configurations and rng seed files defined in your executable and the initial configuration is specified by
- --StartTrajectory <integer>
- default: 0
+ The `CheckpointStart` option uses the prefix for the configurations and rng seed files defined in your executable and the initial configuration is specified by
+ `--StartingTrajectory STARTING_TRAJECTORY`, where `STARTING_TRAJECTORY` is an integer.
+ Default: `--StartingTrajectory 0`
The number of trajectories for a specific run are specified at command line by
- --Trajectories <integer>
- default: 1
+ `--Trajectories TRAJECTORIES`, where `TRAJECTORIES` is an integer.
+ Default: `--Trajectories 1`
The number of thermalization steps (i.e. steps when the Metropolis acceptance check is turned off) is specified by
- --Thermalizations <integer>
- default: 10
+ `--Thermalizations THERMALIZATIONS`, where `THERMALIZATIONS` is an integer.
+ Default: `--Thermalizations 10`
Any other parameter is defined in the source for the executable.
- HMC controls
- ===========
+ ## HMC controls
The lines
+ ```
std::vector<int> SerSeed({1, 2, 3, 4, 5});
std::vector<int> ParSeed({6, 7, 8, 9, 10});
+ ```
define the seeds for the serial and the parallel RNG.
The line
+ ```
TheHMC.MDparameters.set(20, 1.0);// MDsteps, traj length
+ ```
declares the number of molecular dynamics steps and the total trajectory length.
- Actions
- ======
- Action names are defined in the file
- lib/qcd/Actions.h
- Gauge actions list:
+ ## Actions
+ Action names are defined in the directory `Grid/qcd/action`.
+ Gauge actions list (from `Grid/qcd/action/gauge/Gauge.h`):
+ ```
WilsonGaugeActionR;
WilsonGaugeActionF;
WilsonGaugeActionD;
@@ -68,8 +70,9 @@ IwasakiGaugeActionD;
SymanzikGaugeActionR;
SymanzikGaugeActionF;
SymanzikGaugeActionD;
+ ```
+ ```
ConjugateWilsonGaugeActionR;
ConjugateWilsonGaugeActionF;
ConjugateWilsonGaugeActionD;
@@ -82,26 +85,23 @@ ConjugateIwasakiGaugeActionD;
ConjugateSymanzikGaugeActionR;
ConjugateSymanzikGaugeActionF;
ConjugateSymanzikGaugeActionD;
+ ```
+ Each of these action accepts one single parameter at creation time (beta).
+ Example for creating a Symanzik action with beta=4.0
+ ```
+ SymanzikGaugeActionR(4.0)
+ ```
+ Scalar actions list (from `Grid/qcd/action/scalar/Scalar.h`):
+ ```
ScalarActionR;
ScalarActionF;
ScalarActionD;
+ ```
+ The suffixes `R`, `F`, `D` in the action names refer to the `Real`
+ (the precision is defined at compile time by the `--enable-precision` flag in the configure),
+ `Float` and `Double`, that force the precision of the action to be 32, 64 bit respectively.
- each of these action accept one single parameter at creation time (beta).
- Example for creating a Symanzik action with beta=4.0
- SymanzikGaugeActionR(4.0)
- The suffixes R,F,D in the action names refer to the Real
- (the precision is defined at compile time by the --enable-precision flag in the configure),
- Float and Double, that force the precision of the action to be 32, 64 bit respectively.
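To see how the command line options and the in-source controls above fit together, here is a minimal sketch of an HMC executable. Only the names quoted in this README (`GenericHMCrunner.h`, `SerSeed`/`ParSeed`, `TheHMC.MDparameters.set`, `WilsonGaugeActionR`, the `--Starting*` flags) are taken from the documentation; the runner type and the setter calls marked as assumed below differ between Grid versions, so treat this as an outline rather than a drop-in program.

```cpp
// Illustrative outline only: calls marked "assumed" are not guaranteed to match
// the runner interface of any particular Grid release.
#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv) {
  Grid_init(&argc, &argv);

  GenericHMCRunner TheHMC;                     // assumed runner type (GenericHMCrunner.h)

  std::vector<int> SerSeed({1, 2, 3, 4, 5});   // serial RNG seeds
  std::vector<int> ParSeed({6, 7, 8, 9, 10});  // parallel RNG seeds
  TheHMC.RNGSeeds(SerSeed, ParSeed);           // assumed seed setter

  WilsonGaugeActionR Waction(5.6);             // gauge action, single parameter beta
  TheHMC.TheAction.push_back(&Waction);        // assumed action registration

  TheHMC.MDparameters.set(20, 1.0);            // MDsteps, trajectory length

  // StartingType, StartingTrajectory, Trajectories and Thermalizations are
  // read from the command line, e.g.
  //   ./My_hmc_exec --StartingType CheckpointStart --StartingTrajectory 100 \
  //                 --Trajectories 50 --Thermalizations 0
  TheHMC.Run();                                // assumed entry point

  Grid_finalize();
  return 0;
}
```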

View File

@@ -322,8 +322,8 @@ public:
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
- int recv_from_rank;
- int xmit_to_rank;
+ // int recv_from_rank;
+ // int xmit_to_rank;
if ( ! comm_dim ) return 1;
if ( displacement == 0 ) return 1;

View File

@@ -47,20 +47,20 @@ NAMESPACE_BEGIN(Grid);
class TypePair {
public:
T _internal[2];
- TypePair<T>& operator=(const Grid::Zero& o) {
+ accelerator TypePair<T>& operator=(const Grid::Zero& o) {
_internal[0] = Zero();
_internal[1] = Zero();
return *this;
}
- TypePair<T> operator+(const TypePair<T>& o) const {
+ accelerator TypePair<T> operator+(const TypePair<T>& o) const {
TypePair<T> r;
r._internal[0] = _internal[0] + o._internal[0];
r._internal[1] = _internal[1] + o._internal[1];
return r;
}
- TypePair<T>& operator+=(const TypePair<T>& o) {
+ accelerator TypePair<T>& operator+=(const TypePair<T>& o) {
_internal[0] += o._internal[0];
_internal[1] += o._internal[1];
return *this;

View File

@@ -95,7 +95,7 @@ void acceleratorInit(void)
#endif
cudaSetDevice(device);
+ cudaStreamCreate(&copyStream);
const int len=64;
char busid[len];
if( rank == world_rank ) {

View File

@@ -95,6 +95,7 @@ void acceleratorInit(void);
//////////////////////////////////////////////
#ifdef GRID_CUDA
#include <cuda.h>
#ifdef __CUDA_ARCH__
@@ -133,11 +134,7 @@ inline void cuda_mem(void)
}; \
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
- std::cout << "========================== CUDA KERNEL CALL\n"; \
- cuda_mem(); \
LambdaApply<<<cu_blocks,cu_threads>>>(num1,num2,nsimd,lambda); \
- cuda_mem(); \
- std::cout << "========================== CUDA KERNEL DONE\n"; \
}
#define accelerator_for6dNB(iter1, num1, \

View File

@@ -88,7 +88,7 @@ public:
// Coordinate class, maxdims = 8 for now.
////////////////////////////////////////////////////////////////
#define GRID_MAX_LATTICE_DIMENSION (8)
- #define GRID_MAX_SIMD (16)
+ #define GRID_MAX_SIMD (32)
static constexpr int MaxDims = GRID_MAX_LATTICE_DIMENSION;

View File

@@ -137,7 +137,7 @@ int main (int argc, char ** argv)
Eigen::MatrixXd mean(nVol, 4), stdDev(nVol, 4), rob(nVol, 4);
Eigen::VectorXd avMean(4), avStdDev(4), avRob(4);
- double n = BENCH_IO_NPASS;
+ // double n = BENCH_IO_NPASS;
stats(mean, stdDev, perf);
stats(avMean, avStdDev, avPerf);
@@ -164,7 +164,7 @@ int main (int argc, char ** argv)
mean(volInd(l), gWrite), stdDev(volInd(l), gWrite));
}
MSG << std::endl;
- MSG << "Robustness of individual results, in \%. (rob = 100\% - std dev / mean)" << std::endl;
+ MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)" << std::endl;
MSG << std::endl;
grid_printf("%4s %12s %12s %12s %12s\n",
"L", "std read", "std write", "Grid read", "Grid write");
@@ -185,7 +185,7 @@ int main (int argc, char ** argv)
avMean(sRead), avStdDev(sRead), avMean(sWrite), avStdDev(sWrite),
avMean(gRead), avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
MSG << std::endl;
- MSG << "Robustness of volume-averaged results, in \%. (rob = 100\% - std dev / mean)" << std::endl;
+ MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)" << std::endl;
MSG << std::endl;
grid_printf("%12s %12s %12s %12s\n",
"std read", "std write", "Grid read", "Grid write");

View File

@@ -142,7 +142,7 @@ public:
// bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
}
- int ncomm;
+ // int ncomm;
double dbytes;
for(int dir=0;dir<8;dir++) {
@@ -290,7 +290,7 @@ public:
LatticeSU4 z(&Grid); z=Zero();
LatticeSU4 x(&Grid); x=Zero();
LatticeSU4 y(&Grid); y=Zero();
- double a=2.0;
+ // double a=2.0;
uint64_t Nloop=NLOOP;

View File

@@ -72,7 +72,7 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
std::vector<double> t_time(Nloop);
- time_statistics timestat;
+ // time_statistics timestat;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange from host memory "<<std::endl;

View File

@@ -184,8 +184,10 @@ int main (int argc, char ** argv)
double bytes=1.0*vol*Nvec*sizeof(Real);
double flops=vol*Nvec*2;// mul,add
- std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"<<(stop-start)/1000./1000.<< "\t\t " <<std::endl;
+ std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"
+ <<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"
+ <<(stop-start)/1000./1000.<< "\t\t " <<std::endl;
+ assert(nn==nn);
}
Grid_finalize();

View File

@@ -4,7 +4,7 @@ using namespace Grid;
template<class Field>
void SimpleConjugateGradient(LinearOperatorBase<Field> &HPDop,const Field &b, Field &x)
{
- RealD cp, c, alpha, d, beta, ssq, qq;
+ RealD cp, c, alpha, d, beta, ssq;
RealD Tolerance=1.0e-10;
int MaxIterations=10000;

View File

@ -0,0 +1,539 @@
/*
* Warning: This code illustrative only: not well tested, and not meant for production use
* without regression / tests being applied
*/
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
typedef SpinColourMatrix Propagator;
typedef SpinColourVector Fermion;
typedef PeriodicGimplR GimplR;
template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
{
public:
INHERIT_GIMPL_TYPES(Gimpl);
GridBase *grid;
GaugeField U;
CovariantLaplacianCshift(GaugeField &_U) :
grid(_U.Grid()),
U(_U) { };
virtual GridBase *Grid(void) { return grid; };
virtual void M (const Field &in, Field &out)
{
out=Zero();
for(int mu=0;mu<Nd-1;mu++) {
GaugeLinkField Umu = PeekIndex<LorentzIndex>(U, mu); // NB: Inefficient
out = out - Gimpl::CovShiftForward(Umu,mu,in);
out = out - Gimpl::CovShiftBackward(Umu,mu,in);
out = out + 2.0*in;
}
};
virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian
virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid
virtual void MdirAll (const Field &in, std::vector<Field> &out) {assert(0);}; // Unimplemented need only for multigrid
};
void MakePhase(Coordinate mom,LatticeComplex &phase)
{
GridBase *grid = phase.Grid();
auto latt_size = grid->GlobalDimensions();
ComplexD ci(0.0,1.0);
phase=Zero();
LatticeComplex coor(phase.Grid());
for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
LatticeCoordinate(coor,mu);
phase = phase + (TwoPiL * mom[mu]) * coor;
}
phase = exp(phase*ci);
}
void LinkSmear(int nstep, RealD rho,LatticeGaugeField &Uin,LatticeGaugeField &Usmr)
{
Smear_Stout<GimplR> Stout(rho);
LatticeGaugeField Utmp(Uin.Grid());
Utmp = Uin;
for(int i=0;i<nstep;i++){
Stout.smear(Usmr,Utmp);
Utmp = Usmr;
}
}
void PointSource(Coordinate &coor,LatticePropagator &source)
{
// Coordinate coor({0,0,0,0});
source=Zero();
SpinColourMatrix kronecker; kronecker=1.0;
pokeSite(kronecker,source,coor);
}
void GFWallSource(int tslice,LatticePropagator &source)
{
GridBase *grid = source.Grid();
LatticeComplex one(grid); one = ComplexD(1.0,0.0);
LatticeComplex zz(grid); zz=Zero();
LatticeInteger t(grid);
LatticeCoordinate(t,Tdir);
one = where(t==Integer(tslice), one, zz);
source = 1.0;
source = source * one;
}
void Z2WallSource(GridParallelRNG &RNG,int tslice,LatticePropagator &source)
{
GridBase *grid = source.Grid();
LatticeComplex noise(grid);
LatticeComplex zz(grid); zz=Zero();
LatticeInteger t(grid);
RealD nrm=1.0/sqrt(2);
bernoulli(RNG, noise); // 0,1 50:50
noise = (2.*noise - Complex(1,1))*nrm;
LatticeCoordinate(t,Tdir);
noise = where(t==Integer(tslice), noise, zz);
source = 1.0;
source = source*noise;
std::cout << " Z2 wall " << norm2(source) << std::endl;
}
void GaugeFix(LatticeGaugeField &U,LatticeGaugeField &Ufix)
{
Real alpha=0.05;
Real plaq=WilsonLoops<GimplR>::avgPlaquette(U);
std::cout << " Initial plaquette "<<plaq << std::endl;
LatticeColourMatrix xform(U.Grid());
Ufix = U;
int orthog=Nd-1;
FourierAcceleratedGaugeFixer<GimplR>::SteepestDescentGaugeFix(Ufix,xform,alpha,100000,1.0e-14, 1.0e-14,true,orthog);
plaq=WilsonLoops<GimplR>::avgPlaquette(Ufix);
std::cout << " Final plaquette "<<plaq << std::endl;
}
template<class Field>
void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
{
typedef CovariantLaplacianCshift <GimplR,Field> Laplacian_t;
Laplacian_t Laplacian(U);
Integer Iterations = 40;
Real width = 2.0;
Real coeff = (width*width) / Real(4*Iterations);
Field tmp(U.Grid());
smeared=unsmeared;
// chi = (1-p^2/2N)^N kronecker
for(int n = 0; n < Iterations; ++n) {
Laplacian.M(smeared,tmp);
smeared = smeared - coeff*tmp;
std::cout << " smear iter " << n<<" " <<norm2(smeared)<<std::endl;
}
}
void GaussianSource(Coordinate &site,LatticeGaugeField &U,LatticePropagator &source)
{
LatticePropagator tmp(source.Grid());
PointSource(site,source);
std::cout << " GaussianSource Kronecker "<< norm2(source)<<std::endl;
tmp = source;
GaussianSmear(U,tmp,source);
std::cout << " GaussianSource Smeared "<< norm2(source)<<std::endl;
}
void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,LatticePropagator &source)
{
Z2WallSource(RNG,tslice,source);
auto tmp = source;
GaussianSmear(U,tmp,source);
}
void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
{
assert(mom.size()==Nd);
assert(mom[Tdir] == 0);
GridBase * grid = spectator.Grid();
LatticeInteger ts(grid);
LatticeCoordinate(ts,Tdir);
source = Zero();
source = where(ts==Integer(tslice),spectator,source); // Stick in a slice of the spectator, zero everywhere else
LatticeComplex phase(grid);
MakePhase(mom,phase);
source = source *phase;
}
template<class Action>
void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
{
GridBase *UGrid = D.GaugeGrid();
GridBase *FGrid = D.FermionGrid();
LatticeFermion src4 (UGrid);
LatticeFermion src5 (FGrid);
LatticeFermion result5(FGrid);
LatticeFermion result4(UGrid);
ConjugateGradient<LatticeFermion> CG(1.0e-12,100000);
SchurRedBlackDiagTwoSolve<LatticeFermion> schur(CG);
ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
for(int s=0;s<Nd;s++){
for(int c=0;c<Nc;c++){
PropToFerm<Action>(src4,source,s,c);
D.ImportPhysicalFermionSource(src4,src5);
result5=Zero();
schur(D,src5,result5,ZG);
std::cout<<GridLogMessage
<<"spin "<<s<<" color "<<c
<<" norm2(src5d) " <<norm2(src5)
<<" norm2(result5d) "<<norm2(result5)<<std::endl;
D.ExportPhysicalFermionSolution(result5,result4);
FermToProp<Action>(propagator,result4,s,c);
}
}
}
class MesonFile: Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector<std::vector<Complex> >, data);
};
void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
{
const int nchannel=4;
Gamma::Algebra Gammas[nchannel][2] = {
{Gamma::Algebra::Gamma5 ,Gamma::Algebra::Gamma5},
{Gamma::Algebra::GammaTGamma5,Gamma::Algebra::GammaTGamma5},
{Gamma::Algebra::GammaTGamma5,Gamma::Algebra::Gamma5},
{Gamma::Algebra::Gamma5 ,Gamma::Algebra::GammaTGamma5}
};
Gamma G5(Gamma::Algebra::Gamma5);
LatticeComplex meson_CF(q1.Grid());
MesonFile MF;
for(int ch=0;ch<nchannel;ch++){
Gamma Gsrc(Gammas[ch][0]);
Gamma Gsnk(Gammas[ch][1]);
meson_CF = trace(G5*adj(q1)*G5*Gsnk*q2*adj(Gsrc));
std::vector<TComplex> meson_T;
sliceSum(meson_CF,meson_T, Tdir);
int nt=meson_T.size();
std::vector<Complex> corr(nt);
for(int t=0;t<nt;t++){
corr[t] = TensorRemove(meson_T[t]); // Yes this is ugly, not figured a work around
std::cout << " channel "<<ch<<" t "<<t<<" " <<corr[t]<<std::endl;
}
MF.data.push_back(corr);
}
{
XmlWriter WR(file);
write(WR,"MesonFile",MF);
}
}
void Meson3pt(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
{
const int nchannel=4;
Gamma::Algebra Gammas[nchannel][2] = {
{Gamma::Algebra::Gamma5 ,Gamma::Algebra::GammaX},
{Gamma::Algebra::Gamma5 ,Gamma::Algebra::GammaY},
{Gamma::Algebra::Gamma5 ,Gamma::Algebra::GammaZ},
{Gamma::Algebra::Gamma5 ,Gamma::Algebra::GammaT}
};
Gamma G5(Gamma::Algebra::Gamma5);
LatticeComplex meson_CF(q1.Grid());
MesonFile MF;
for(int ch=0;ch<nchannel;ch++){
Gamma Gsrc(Gammas[ch][0]);
Gamma Gsnk(Gammas[ch][1]);
meson_CF = trace(G5*adj(q1)*G5*Gsnk*q2*adj(Gsrc));
std::vector<TComplex> meson_T;
sliceSum(meson_CF,meson_T, Tdir);
int nt=meson_T.size();
std::vector<Complex> corr(nt);
for(int t=0;t<nt;t++){
corr[t] = TensorRemove(meson_T[t]); // Yes this is ugly, not figured a work around
std::cout << " channel "<<ch<<" t "<<t<<" " <<corr[t]<<std::endl;
}
MF.data.push_back(corr);
}
{
XmlWriter WR(file);
write(WR,"MesonFile",MF);
}
}
void WallSinkMesonTrace(std::string file,std::vector<Propagator> &q1,std::vector<Propagator> &q2)
{
const int nchannel=4;
Gamma::Algebra Gammas[nchannel][2] = {
{Gamma::Algebra::Gamma5 ,Gamma::Algebra::Gamma5},
{Gamma::Algebra::GammaTGamma5,Gamma::Algebra::GammaTGamma5},
{Gamma::Algebra::GammaTGamma5,Gamma::Algebra::Gamma5},
{Gamma::Algebra::Gamma5 ,Gamma::Algebra::GammaTGamma5}
};
Gamma G5(Gamma::Algebra::Gamma5);
int nt=q1.size();
std::vector<Complex> meson_CF(nt);
MesonFile MF;
for(int ch=0;ch<nchannel;ch++){
Gamma Gsrc(Gammas[ch][0]);
Gamma Gsnk(Gammas[ch][1]);
std::vector<Complex> corr(nt);
for(int t=0;t<nt;t++){
meson_CF[t] = trace(G5*adj(q1[t])*G5*Gsnk*q2[t]*adj(Gsrc));
corr[t] = TensorRemove(meson_CF[t]); // Yes this is ugly, not figured a work around
std::cout << " channel "<<ch<<" t "<<t<<" " <<corr[t]<<std::endl;
}
MF.data.push_back(corr);
}
{
XmlWriter WR(file);
write(WR,"MesonFile",MF);
}
}
int make_idx(int p, int m,int nmom)
{
if (m==0) return p;
assert(p==0);
return nmom + m - 1;
}
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
// Double precision grids
auto latt = GridDefaultLatt();
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
LatticeGaugeField Umu(UGrid);
LatticeGaugeField Utmp(UGrid);
LatticeGaugeField Usmr(UGrid);
std::string config;
if( argc > 1 && argv[1][0] != '-' )
{
std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl;
FieldMetaData header;
NerscIO::readConfiguration(Umu, header, argv[1]);
config=argv[1];
}
else
{
std::cout<<GridLogMessage <<"Using hot configuration"<<std::endl;
SU<Nc>::ColdConfiguration(Umu);
config="ColdConfig";
}
// GaugeFix(Umu,Utmp);
// Umu=Utmp;
int nsmr=3;
RealD rho=0.1;
LinkSmear(nsmr,rho,Umu,Usmr);
std::vector<int> smeared_link({ 0,0,1} );
std::vector<RealD> masses({ 0.004,0.02477,0.447} ); // u/d, s, c ??
std::vector<RealD> M5s ({ 1.8,1.8,1.0} );
std::vector<RealD> bs ({ 1.0,1.0,1.5} ); // DDM
std::vector<RealD> cs ({ 0.0,0.0,0.5} ); // DDM
std::vector<int> Ls_s ({ 16,16,12} );
std::vector<GridCartesian *> FGrids;
std::vector<GridRedBlackCartesian *> FrbGrids;
std::vector<Coordinate> momenta;
momenta.push_back(Coordinate({0,0,0,0}));
momenta.push_back(Coordinate({1,0,0,0}));
momenta.push_back(Coordinate({2,0,0,0}));
int nmass = masses.size();
int nmom = momenta.size();
std::vector<MobiusFermionR *> FermActs;
std::cout<<GridLogMessage <<"======================"<<std::endl;
std::cout<<GridLogMessage <<"MobiusFermion action as Scaled Shamir kernel"<<std::endl;
std::cout<<GridLogMessage <<"======================"<<std::endl;
std::vector<Complex> boundary = {1,1,1,-1};
typedef MobiusFermionR FermionAction;
FermionAction::ImplParams Params(boundary);
for(int m=0;m<masses.size();m++) {
RealD mass = masses[m];
RealD M5 = M5s[m];
RealD b = bs[m];
RealD c = cs[m];
int Ls = Ls_s[m];
if ( smeared_link[m] ) Utmp = Usmr;
else Utmp = Umu;
FGrids.push_back(SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid));
FrbGrids.push_back(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid));
FermActs.push_back(new MobiusFermionR(Utmp,*FGrids[m],*FrbGrids[m],*UGrid,*UrbGrid,mass,M5,b,c,Params));
}
LatticePropagator z2wall_source(UGrid);
LatticePropagator gfwall_source(UGrid);
LatticePropagator phased_prop(UGrid);
int tslice = 0;
int tseq=(tslice+16)%latt[Nd-1];
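// The sequential (current) insertion timeslice sits 16 slices after the wall source,
// wrapped around the temporal extent of the lattice.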
//////////////////////////////////////////////////////////////////////
// RNG seeded for Z2 wall
//////////////////////////////////////////////////////////////////////
// You can manage seeds however you like.
// Recommend SeedUniqueString.
//////////////////////////////////////////////////////////////////////
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString("Study2-Source_Z2_p_0_0_0_t_0-880");
Z2WallSource (RNG4,tslice,z2wall_source);
GFWallSource (tslice,gfwall_source);
std::vector<LatticeComplex> phase(nmom,UGrid);
for(int m=0;m<nmom;m++){
MakePhase(momenta[m],phase[m]);
}
std::vector<LatticePropagator> Z2Props (nmom+nmass-1,UGrid);
std::vector<LatticePropagator> GFProps (nmom+nmass-1,UGrid);
for(int p=0;p<nmom;p++) {
int m=0;
int idx = make_idx(p,m,nmom);
phased_prop = z2wall_source * phase[p];
Solve(*FermActs[m],phased_prop ,Z2Props[idx]);
phased_prop = gfwall_source * phase[p];
Solve(*FermActs[m],phased_prop ,GFProps[idx]);
}
for(int m=1;m<nmass;m++) {
int p=0;
int idx = make_idx(p,m,nmom);
phased_prop = z2wall_source;
Solve(*FermActs[m],phased_prop ,Z2Props[idx]);
phased_prop = gfwall_source;
Solve(*FermActs[m],phased_prop ,GFProps[idx]);
}
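// Summary of the two loops above: the lightest mass is solved at every momentum, while
// each heavier mass is solved at zero momentum only; no other (p,m) combination is
// needed by the contractions below.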
std::vector<std::vector<Propagator> > wsnk_z2Props(nmom+nmass-1);
std::vector<std::vector<Propagator> > wsnk_gfProps(nmom+nmass-1);
// Non-zero kaon and point and D two point
// WW stick momentum on m1 (lighter)
// zero momentum on m2
for(int m1=0;m1<nmass;m1++) {
for(int m2=m1;m2<nmass;m2++) {
int pmax = (m1==0)? nmom:1;
for(int p=0;p<pmax;p++){
std::stringstream ssg,ssz;
std::stringstream wssg,wssz;
int idx1 = make_idx(p,m1,nmom);
int idx2 = make_idx(0,m2,nmom);
/// Point sinks
ssg<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_p_gf_meson.xml";
ssz<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_p_z2_meson.xml";
MesonTrace(ssz.str(),Z2Props[idx1],Z2Props[idx2],phase[p]); // Q1 is conjugated
MesonTrace(ssg.str(),GFProps[idx1],GFProps[idx2],phase[p]);
/// Wall sinks
wssg<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_w_gf_meson.xml";
wssz<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_w_z2_meson.xml";
phased_prop = GFProps[idx2] * phase[p];
sliceSum(phased_prop,wsnk_gfProps[m1],Tdir);
sliceSum(GFProps[idx1],wsnk_gfProps[m2],Tdir);
WallSinkMesonTrace(wssg.str(),wsnk_gfProps[m1],wsnk_gfProps[m2]);
phased_prop = Z2Props[idx2] * phase[p];
sliceSum(phased_prop,wsnk_z2Props[m1],Tdir);
sliceSum(Z2Props[idx1],wsnk_z2Props[m2],Tdir);
WallSinkMesonTrace(wssz.str(),wsnk_z2Props[m1],wsnk_z2Props[m2]);
}
}}
/////////////////////////////////////
// Sequential solves
/////////////////////////////////////
LatticePropagator seq_wsnk_z2src(UGrid);
LatticePropagator seq_wsnk_gfsrc(UGrid);
LatticePropagator seq_psnk_z2src(UGrid);
LatticePropagator seq_psnk_gfsrc(UGrid);
LatticePropagator source(UGrid);
for(int m=0;m<nmass-1;m++){
int spect_idx = make_idx(0,m,nmom);
int charm=nmass-1;
SequentialSource(tseq,momenta[0],GFProps[spect_idx],source);
Solve(*FermActs[charm],source,seq_psnk_gfsrc);
SequentialSource(tseq,momenta[0],Z2Props[spect_idx],source);
Solve(*FermActs[charm],source,seq_psnk_z2src);
// Todo need wall sequential solve
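// The two solves above build sequential charm propagators: SequentialSource takes the
// zero-momentum spectator propagator at the insertion timeslice tseq as a new source
// (its conventions are defined earlier in this file), and the charm action is inverted
// on it, so Meson3pt below contracts a spectator leg with a charm leg that carries the
// current insertion.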
for(int p=0;p<nmom;p++){
int active_idx = make_idx(p,0,nmom);
std::stringstream seq_3pt_p_z2;
std::stringstream seq_3pt_p_gf;
std::stringstream seq_3pt_w_z2;
std::stringstream seq_3pt_w_gf;
seq_3pt_p_z2 <<config<<"_3pt_p"<<p<< "_m" << m << "_p_z2_meson.xml";
seq_3pt_p_gf <<config<<"_3pt_p"<<p<< "_m" << m << "_p_gf_meson.xml";
seq_3pt_w_z2 <<config<<"_3pt_p"<<p<< "_m" << m << "_w_z2_meson.xml";
seq_3pt_w_gf <<config<<"_3pt_p"<<p<< "_m" << m << "_w_gf_meson.xml";
Meson3pt(seq_3pt_p_gf.str(),GFProps[active_idx],seq_psnk_gfsrc,phase[p]);
Meson3pt(seq_3pt_p_z2.str(),Z2Props[active_idx],seq_psnk_z2src,phase[p]);
}
}
Grid_finalize();
}

View File

@ -9,6 +9,7 @@ using namespace std;
using namespace Grid;
typedef SpinColourMatrix Propagator;
typedef SpinColourVector Fermion;
+typedef PeriodicGimplR GimplR;
template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
{
@ -55,6 +56,16 @@ void MakePhase(Coordinate mom,LatticeComplex &phase)
}
phase = exp(phase*ci);
}
+void LinkSmear(int nstep, RealD rho,LatticeGaugeField &Uin,LatticeGaugeField &Usmr)
+{
+Smear_Stout<GimplR> Stout(rho);
+LatticeGaugeField Utmp(Uin.Grid());
+Utmp = Uin;
+for(int i=0;i<nstep;i++){
+Stout.smear(Usmr,Utmp);
+Utmp = Usmr;
+}
+}
void PointSource(Coordinate &coor,LatticePropagator &source)
{
// Coordinate coor({0,0,0,0});
@ -97,23 +108,23 @@ void GaugeFix(LatticeGaugeField &U,LatticeGaugeField &Ufix)
{
Real alpha=0.05;
-Real plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(U);
+Real plaq=WilsonLoops<GimplR>::avgPlaquette(U);
std::cout << " Initial plaquette "<<plaq << std::endl;
LatticeColourMatrix xform(U.Grid());
Ufix = U;
int orthog=Nd-1;
-FourierAcceleratedGaugeFixer<PeriodicGimplR>::SteepestDescentGaugeFix(Ufix,xform,alpha,10000,1.0e-12, 1.0e-12,true,orthog);
+FourierAcceleratedGaugeFixer<GimplR>::SteepestDescentGaugeFix(Ufix,xform,alpha,100000,1.0e-14, 1.0e-14,true,orthog);
-plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Ufix);
+plaq=WilsonLoops<GimplR>::avgPlaquette(Ufix);
std::cout << " Final plaquette "<<plaq << std::endl;
}
template<class Field>
void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
{
-typedef CovariantLaplacianCshift <PeriodicGimplR,Field> Laplacian_t;
+typedef CovariantLaplacianCshift <GimplR,Field> Laplacian_t;
Laplacian_t Laplacian(U);
Integer Iterations = 40;
@ -167,19 +178,21 @@ void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
GridBase *UGrid = D.GaugeGrid();
GridBase *FGrid = D.FermionGrid();
-LatticeFermion src4 (UGrid);
+LatticeFermion src4 (UGrid); src4 = Zero();
LatticeFermion src5 (FGrid);
LatticeFermion result5(FGrid);
LatticeFermion result4(UGrid);
-ConjugateGradient<LatticeFermion> CG(1.0e-8,100000);
+ConjugateGradient<LatticeFermion> CG(1.0e-12,100000);
-SchurRedBlackDiagMooeeSolve<LatticeFermion> schur(CG);
+SchurRedBlackDiagTwoSolve<LatticeFermion> schur(CG);
ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
+std::cout<<GridLogMessage<< " source4 "<<norm2(source)<<std::endl;
for(int s=0;s<Nd;s++){
for(int c=0;c<Nc;c++){
PropToFerm<Action>(src4,source,s,c);
+std::cout<<GridLogMessage<< s<<c<<" src4 "<<norm2(src4)<<std::endl;
D.ImportPhysicalFermionSource(src4,src5);
+std::cout<<GridLogMessage<< s<<c<<" src5 "<<norm2(src5)<<std::endl;
result5=Zero();
schur(D,src5,result5,ZG);
@ -287,15 +300,10 @@ int main (int argc, char ** argv)
GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
-//////////////////////////////////////////////////////////////////////
-// You can manage seeds however you like.
-// Recommend SeedUniqueString.
-//////////////////////////////////////////////////////////////////////
-std::vector<int> seeds4({1,2,3,4});
-GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
LatticeGaugeField Umu(UGrid);
-LatticeGaugeField Ufixed(UGrid);
+LatticeGaugeField Utmp(UGrid);
+LatticeGaugeField Usmr(UGrid);
std::string config;
if( argc > 1 && argv[1][0] != '-' )
{
@ -308,13 +316,20 @@ int main (int argc, char ** argv)
{
std::cout<<GridLogMessage <<"Using hot configuration"<<std::endl;
SU<Nc>::ColdConfiguration(Umu);
-// SU<Nc>::HotConfiguration(RNG4,Umu);
-config="HotConfig";
+config="ColdConfig";
}
-GaugeFix(Umu,Ufixed);
+// GaugeFix(Umu,Utmp);
-Umu=Ufixed;
+// Umu=Utmp;
+int nsmr=3;
+RealD rho=0.1;
+RealD plaq_gf =WilsonLoops<GimplR>::avgPlaquette(Umu);
+LinkSmear(nsmr,rho,Umu,Usmr);
+RealD plaq_smr=WilsonLoops<GimplR>::avgPlaquette(Usmr);
+std::cout << GridLogMessage << " GF Plaquette " <<plaq_gf<<std::endl;
+std::cout << GridLogMessage << " SM Plaquette " <<plaq_smr<<std::endl;
+std::vector<int> smeared_link({ 0,0,1} );
std::vector<RealD> masses({ 0.004,0.02477,0.447} ); // u/d, s, c ??
std::vector<RealD> M5s ({ 1.8,1.8,1.0} );
std::vector<RealD> bs ({ 1.0,1.0,1.5} ); // DDM
@ -330,6 +345,9 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage <<"======================"<<std::endl;
std::cout<<GridLogMessage <<"MobiusFermion action as Scaled Shamir kernel"<<std::endl;
std::cout<<GridLogMessage <<"======================"<<std::endl;
+std::vector<Complex> boundary = {1,1,1,-1};
+typedef MobiusFermionR FermionAction;
+FermionAction::ImplParams Params(boundary);
for(int m=0;m<masses.size();m++) {
@ -339,30 +357,40 @@ int main (int argc, char ** argv)
RealD c = cs[m];
int Ls = Ls_s[m];
+if ( smeared_link[m] ) Utmp = Usmr;
+else Utmp = Umu;
FGrids.push_back(SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid));
FrbGrids.push_back(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid));
-FermActs.push_back(new MobiusFermionR(Umu,*FGrids[m],*FrbGrids[m],*UGrid,*UrbGrid,mass,M5,b,c));
+FermActs.push_back(new MobiusFermionR(Utmp,*FGrids[m],*FrbGrids[m],*UGrid,*UrbGrid,mass,M5,b,c,Params));
}
-LatticePropagator point_source(UGrid);
LatticePropagator z2wall_source(UGrid);
LatticePropagator gfwall_source(UGrid);
-Coordinate Origin({0,0,0,0});
-PointSource (Origin,point_source);
-Z2WallSource (RNG4,0,z2wall_source);
-GFWallSource (0,gfwall_source);
-std::vector<LatticePropagator> PointProps(nmass,UGrid);
-std::vector<LatticePropagator> GaussProps(nmass,UGrid);
+int tslice = 0;
+//////////////////////////////////////////////////////////////////////
+// RNG seeded for Z2 wall
+//////////////////////////////////////////////////////////////////////
+// You can manage seeds however you like.
+// Recommend SeedUniqueString.
+//////////////////////////////////////////////////////////////////////
+GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString("Study2-Source_Z2_p_0_0_0_t_0-880");
+Z2WallSource (RNG4,tslice,z2wall_source);
+GFWallSource (tslice,gfwall_source);
std::vector<LatticePropagator> Z2Props (nmass,UGrid);
std::vector<LatticePropagator> GFProps (nmass,UGrid);
for(int m=0;m<nmass;m++) {
+std::cout << GridLogMessage << " Mass " <<m << " z2wall source "<<norm2(z2wall_source)<<std::endl;
Solve(*FermActs[m],z2wall_source ,Z2Props[m]);
+std::cout << GridLogMessage << " Mass " <<m << " gfwall source "<<norm2(gfwall_source)<<std::endl;
Solve(*FermActs[m],gfwall_source ,GFProps[m]);
+std::cout << GridLogMessage << " Mass " <<m << " z2wall source "<<norm2(z2wall_source)<< " " << norm2(gfwall_source)<<std::endl;
}
@ -383,14 +411,15 @@ int main (int argc, char ** argv)
std::stringstream wssg,wssz;
/// Point sinks
-ssg<<config<< "_m" << m1 << "_m"<< m2 << "p_gf_meson.xml";
+ssg<<config<< "_m" << m1 << "_m"<< m2 << "_p_gf_meson.xml";
-ssz<<config<< "_m" << m1 << "_m"<< m2 << "p_z2_meson.xml";
+ssz<<config<< "_m" << m1 << "_m"<< m2 << "_p_z2_meson.xml";
MesonTrace(ssz.str(),Z2Props[m1],Z2Props[m2],phase);
+MesonTrace(ssg.str(),GFProps[m1],GFProps[m2],phase);
/// Wall sinks
-wssg<<config<< "_m" << m1 << "_m"<< m2 << "w_gf_meson.xml";
+wssg<<config<< "_m" << m1 << "_m"<< m2 << "_w_gf_meson.xml";
-wssz<<config<< "_m" << m1 << "_m"<< m2 << "w_z2_meson.xml";
+wssz<<config<< "_m" << m1 << "_m"<< m2 << "_w_z2_meson.xml";
WallSinkMesonTrace(wssg.str(),wsnk_gfProps[m1],wsnk_gfProps[m2]);
WallSinkMesonTrace(wssz.str(),wsnk_z2Props[m1],wsnk_z2Props[m2]);

systems/Summit/comms.4node Normal file
View File

@ -0,0 +1,179 @@
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: Tesla V100-SXM2-16GB
AcceleratorCudaInit[0]: totalGlobalMem: 16911433728
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 4
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: rank 0 setting device to node rank 0
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 0 device 0 bus id: 0004:04:00.0
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 24
SharedMemoryMpi: Node communicator of size 6
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 1073741824bytes at 0x200060000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=7cb1ff7395a5833ded6526c43891bd07a0436290: (HEAD -> develop, origin/develop, origin/HEAD) clean
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 1073741824 byte stencil comms buffers
AcceleratorCudaInit: rank 1 setting device to node rank 1
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 1 device 1 bus id: 0004:05:00.0
AcceleratorCudaInit: rank 2 setting device to node rank 2
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 2 device 2 bus id: 0004:06:00.0
AcceleratorCudaInit: rank 5 setting device to node rank 5
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 5 device 5 bus id: 0035:05:00.0
AcceleratorCudaInit: rank 4 setting device to node rank 4
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 4 device 4 bus id: 0035:04:00.0
AcceleratorCudaInit: rank 3 setting device to node rank 3
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 3 device 3 bus id: 0035:03:00.0
Grid : Message : MemoryManager Cache 13529146982 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 2.137929 s : Grid is setup to use 6 threads
Grid : Message : 2.137941 s : Number of iterations to average: 250
Grid : Message : 2.137950 s : ====================================================================================================
Grid : Message : 2.137958 s : = Benchmarking sequential halo exchange from host memory
Grid : Message : 2.137966 s : ====================================================================================================
Grid : Message : 2.137974 s : L Ls bytes MB/s uni MB/s bidi
AcceleratorCudaInit: rank 22 setting device to node rank 4
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 10 setting device to node rank 4
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 15 setting device to node rank 3
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 21 setting device to node rank 3
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 20 setting device to node rank 2
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 7 setting device to node rank 1
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 9 setting device to node rank 3
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 11 setting device to node rank 5
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 8 setting device to node rank 2
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 6 setting device to node rank 0
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 19 setting device to node rank 1
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 23 setting device to node rank 5
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 18 setting device to node rank 0
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 12 setting device to node rank 0
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 16 setting device to node rank 4
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 13 setting device to node rank 1
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 14 setting device to node rank 2
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 17 setting device to node rank 5
AcceleratorCudaInit: Configure options --enable-setdevice=yes
Grid : Message : 2.604949 s : 8 8 393216 89973.9 179947.8
Grid : Message : 2.668249 s : 8 8 393216 18650.3 37300.5
Grid : Message : 2.732288 s : 8 8 393216 18428.5 36857.1
Grid : Message : 2.753565 s : 8 8 393216 55497.2 110994.4
Grid : Message : 2.808960 s : 12 8 1327104 100181.5 200363.0
Grid : Message : 3.226900 s : 12 8 1327104 20600.5 41201.0
Grid : Message : 3.167459 s : 12 8 1327104 24104.6 48209.2
Grid : Message : 3.227660 s : 12 8 1327104 66156.7 132313.5
Grid : Message : 3.413570 s : 16 8 3145728 56174.4 112348.8
Grid : Message : 3.802697 s : 16 8 3145728 24255.9 48511.7
Grid : Message : 4.190498 s : 16 8 3145728 24336.7 48673.4
Grid : Message : 4.385171 s : 16 8 3145728 48484.1 96968.2
Grid : Message : 4.805284 s : 20 8 6144000 46380.5 92761.1
Grid : Message : 5.562975 s : 20 8 6144000 24328.5 48656.9
Grid : Message : 6.322562 s : 20 8 6144000 24266.7 48533.4
Grid : Message : 6.773598 s : 20 8 6144000 40868.5 81736.9
Grid : Message : 7.600999 s : 24 8 10616832 40198.3 80396.6
Grid : Message : 8.912917 s : 24 8 10616832 24279.5 48559.1
Grid : Message : 10.220961 s : 24 8 10616832 24350.2 48700.4
Grid : Message : 11.728250 s : 24 8 10616832 37390.9 74781.8
Grid : Message : 12.497258 s : 28 8 16859136 36792.2 73584.5
Grid : Message : 14.585387 s : 28 8 16859136 24222.2 48444.3
Grid : Message : 16.664783 s : 28 8 16859136 24323.4 48646.8
Grid : Message : 17.955238 s : 28 8 16859136 39194.7 78389.4
Grid : Message : 20.136479 s : 32 8 25165824 35718.3 71436.5
Grid : Message : 23.241958 s : 32 8 25165824 24311.4 48622.9
Grid : Message : 26.344810 s : 32 8 25165824 24331.9 48663.7
Grid : Message : 28.384420 s : 32 8 25165824 37016.3 74032.7
Grid : Message : 28.388879 s : ====================================================================================================
Grid : Message : 28.388894 s : = Benchmarking sequential halo exchange from GPU memory
Grid : Message : 28.388909 s : ====================================================================================================
Grid : Message : 28.388924 s : L Ls bytes MB/s uni MB/s bidi
Grid : Message : 28.553993 s : 8 8 393216 8272.4 16544.7
Grid : Message : 28.679592 s : 8 8 393216 9395.4 18790.8
Grid : Message : 28.811112 s : 8 8 393216 8971.0 17942.0
Grid : Message : 28.843770 s : 8 8 393216 36145.6 72291.2
Grid : Message : 28.981754 s : 12 8 1327104 49591.6 99183.2
Grid : Message : 29.299764 s : 12 8 1327104 12520.8 25041.7
Grid : Message : 29.620288 s : 12 8 1327104 12422.2 24844.4
Grid : Message : 29.657645 s : 12 8 1327104 106637.5 213275.1
Grid : Message : 29.952933 s : 16 8 3145728 43939.2 87878.5
Grid : Message : 30.585411 s : 16 8 3145728 14922.1 29844.2
Grid : Message : 31.219781 s : 16 8 3145728 14877.2 29754.4
Grid : Message : 31.285017 s : 16 8 3145728 144724.3 289448.7
Grid : Message : 31.706443 s : 20 8 6144000 54676.2 109352.4
Grid : Message : 32.739205 s : 20 8 6144000 17848.0 35696.1
Grid : Message : 33.771852 s : 20 8 6144000 17849.9 35699.7
Grid : Message : 33.871981 s : 20 8 6144000 184141.4 368282.8
Grid : Message : 34.536808 s : 24 8 10616832 55784.3 111568.6
Grid : Message : 36.275648 s : 24 8 10616832 18317.6 36635.3
Grid : Message : 37.997181 s : 24 8 10616832 18501.7 37003.4
Grid : Message : 38.140442 s : 24 8 10616832 222383.9 444767.9
Grid : Message : 39.177222 s : 28 8 16859136 56609.7 113219.4
Grid : Message : 41.874755 s : 28 8 16859136 18749.9 37499.8
Grid : Message : 44.529381 s : 28 8 16859136 19052.9 38105.8
Grid : Message : 44.742192 s : 28 8 16859136 237717.1 475434.2
Grid : Message : 46.184000 s : 32 8 25165824 57091.2 114182.4
Grid : Message : 50.734740 s : 32 8 25165824 19411.0 38821.9
Grid : Message : 53.931228 s : 32 8 25165824 19570.6 39141.2
Grid : Message : 54.238467 s : 32 8 25165824 245765.6 491531.2
Grid : Message : 54.268664 s : ====================================================================================================
Grid : Message : 54.268680 s : = All done; Bye Bye
Grid : Message : 54.268691 s : ====================================================================================================

systems/Summit/dwf.24.4node Normal file
View File

@ -0,0 +1,206 @@
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: Tesla V100-SXM2-16GB
AcceleratorCudaInit[0]: totalGlobalMem: 16911433728
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 4
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: rank 0 setting device to node rank 0
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 0 device 0 bus id: 0004:04:00.0
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 24
SharedMemoryMpi: Node communicator of size 6
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x200080000000 for comms buffers
AcceleratorCudaInit: rank 3 setting device to node rank 3
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 3 device 3 bus id: 0035:03:00.0
AcceleratorCudaInit: rank 5 setting device to node rank 5
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 5 device 5 bus id: 0035:05:00.0
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
AcceleratorCudaInit: rank 4 setting device to node rank 4
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 4 device 4 bus id: 0035:04:00.0
AcceleratorCudaInit: rank 1 setting device to node rank 1
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 1 device 1 bus id: 0004:05:00.0
AcceleratorCudaInit: rank 2 setting device to node rank 2
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 2 device 2 bus id: 0004:06:00.0
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=7cb1ff7395a5833ded6526c43891bd07a0436290: (HEAD -> develop, origin/develop, origin/HEAD) clean
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 8388608000 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.731905 s : Grid Layout
Grid : Message : 1.731915 s : Global lattice size : 48 48 48 72
Grid : Message : 1.731928 s : OpenMP threads : 6
Grid : Message : 1.731938 s : MPI tasks : 2 2 2 3
AcceleratorCudaInit: rank 9 setting device to node rank 3
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 23 setting device to node rank 5
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 22 setting device to node rank 4
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 21 setting device to node rank 3
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 18 setting device to node rank 0
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 6 setting device to node rank 0
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 7 setting device to node rank 1
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 10 setting device to node rank 4
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 8 setting device to node rank 2
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 11 setting device to node rank 5
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 20 setting device to node rank 2
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 19 setting device to node rank 1
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 13 setting device to node rank 1
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 12 setting device to node rank 0
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 14 setting device to node rank 2
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 16 setting device to node rank 4
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 15 setting device to node rank 3
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 17 setting device to node rank 5
AcceleratorCudaInit: Configure options --enable-setdevice=yes
Grid : Message : 2.683494 s : Making s innermost grids
Grid : Message : 2.780034 s : Initialising 4d RNG
Grid : Message : 2.833099 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 2.833121 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.916841 s : Initialising 5d RNG
Grid : Message : 3.762880 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 3.762902 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 5.264345 s : Initialised RNGs
Grid : Message : 6.489904 s : Drawing gauge field
Grid : Message : 6.729262 s : Random gauge initialised
Grid : Message : 7.781273 s : Setting up Cshift based reference
Grid : Message : 8.725313 s : *****************************************************************
Grid : Message : 8.725332 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 8.725342 s : *****************************************************************
Grid : Message : 8.725352 s : *****************************************************************
Grid : Message : 8.725362 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 8.725372 s : * Vectorising space-time by 4
Grid : Message : 8.725383 s : * VComplexF size is 32 B
Grid : Message : 8.725395 s : * SINGLE precision
Grid : Message : 8.725405 s : * Using Overlapped Comms/Compute
Grid : Message : 8.725415 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 8.725425 s : *****************************************************************
Grid : Message : 9.465229 s : Called warmup
Grid : Message : 58.646066 s : Called Dw 3000 times in 4.91764e+07 us
Grid : Message : 58.646121 s : mflop/s = 1.02592e+07
Grid : Message : 58.646134 s : mflop/s per rank = 427468
Grid : Message : 58.646145 s : mflop/s per node = 2.56481e+06
Grid : Message : 58.646156 s : RF GiB/s (base 2) = 20846.5
Grid : Message : 58.646166 s : mem GiB/s (base 2) = 13029.1
Grid : Message : 58.648008 s : norm diff 1.04778e-13
Grid : Message : 58.734885 s : #### Dhop calls report
Grid : Message : 58.734897 s : WilsonFermion5D Number of DhopEO Calls : 6002
Grid : Message : 58.734909 s : WilsonFermion5D TotalTime /Calls : 8217.71 us
Grid : Message : 58.734922 s : WilsonFermion5D CommTime /Calls : 7109.5 us
Grid : Message : 58.734933 s : WilsonFermion5D FaceTime /Calls : 446.623 us
Grid : Message : 58.734943 s : WilsonFermion5D ComputeTime1/Calls : 18.0558 us
Grid : Message : 58.734953 s : WilsonFermion5D ComputeTime2/Calls : 731.097 us
Grid : Message : 58.734979 s : Average mflops/s per call : 4.8157e+09
Grid : Message : 58.734989 s : Average mflops/s per call per rank : 2.00654e+08
Grid : Message : 58.734999 s : Average mflops/s per call per node : 1.20393e+09
Grid : Message : 58.735008 s : Average mflops/s per call (full) : 1.04183e+07
Grid : Message : 58.735017 s : Average mflops/s per call per rank (full): 434094
Grid : Message : 58.735026 s : Average mflops/s per call per node (full): 2.60456e+06
Grid : Message : 58.735035 s : WilsonFermion5D Stencil
Grid : Message : 58.735043 s : WilsonFermion5D StencilEven
Grid : Message : 58.735051 s : WilsonFermion5D StencilOdd
Grid : Message : 58.735059 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 58.735067 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 58.735075 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 64.934380 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 64.934740 s : Called DwDag
Grid : Message : 64.934870 s : norm dag result 12.0422
Grid : Message : 64.120756 s : norm dag ref 12.0422
Grid : Message : 64.149389 s : norm dag diff 7.6644e-14
Grid : Message : 64.317786 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 64.465331 s : src_e0.499995
Grid : Message : 64.524653 s : src_o0.500005
Grid : Message : 64.558706 s : *********************************************************
Grid : Message : 64.558717 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 64.558727 s : * Vectorising space-time by 4
Grid : Message : 64.558737 s : * SINGLE precision
Grid : Message : 64.558745 s : * Using Overlapped Comms/Compute
Grid : Message : 64.558753 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 64.558761 s : *********************************************************
Grid : Message : 92.702145 s : Deo mflop/s = 8.97692e+06
Grid : Message : 92.702185 s : Deo mflop/s per rank 374038
Grid : Message : 92.702198 s : Deo mflop/s per node 2.24423e+06
Grid : Message : 92.702209 s : #### Dhop calls report
Grid : Message : 92.702223 s : WilsonFermion5D Number of DhopEO Calls : 3001
Grid : Message : 92.702240 s : WilsonFermion5D TotalTime /Calls : 9377.88 us
Grid : Message : 92.702257 s : WilsonFermion5D CommTime /Calls : 8221.84 us
Grid : Message : 92.702277 s : WilsonFermion5D FaceTime /Calls : 543.548 us
Grid : Message : 92.702301 s : WilsonFermion5D ComputeTime1/Calls : 20.936 us
Grid : Message : 92.702322 s : WilsonFermion5D ComputeTime2/Calls : 732.33 us
Grid : Message : 92.702376 s : Average mflops/s per call : 4.13001e+09
Grid : Message : 92.702387 s : Average mflops/s per call per rank : 1.72084e+08
Grid : Message : 92.702397 s : Average mflops/s per call per node : 1.0325e+09
Grid : Message : 92.702407 s : Average mflops/s per call (full) : 9.12937e+06
Grid : Message : 92.702416 s : Average mflops/s per call per rank (full): 380391
Grid : Message : 92.702426 s : Average mflops/s per call per node (full): 2.28234e+06
Grid : Message : 92.702435 s : WilsonFermion5D Stencil
Grid : Message : 92.702443 s : WilsonFermion5D StencilEven
Grid : Message : 92.702451 s : WilsonFermion5D StencilOdd
Grid : Message : 92.702459 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 92.702467 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 92.702475 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 92.772983 s : r_e6.02121
Grid : Message : 92.786384 s : r_o6.02102
Grid : Message : 92.799622 s : res12.0422
Grid : Message : 93.860500 s : norm diff 0
Grid : Message : 93.162026 s : norm diff even 0
Grid : Message : 93.197529 s : norm diff odd 0

systems/Summit/dwf.32.4node Normal file
View File

@ -0,0 +1,206 @@
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: Tesla V100-SXM2-16GB
AcceleratorCudaInit[0]: totalGlobalMem: 16911433728
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 4
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: rank 0 setting device to node rank 0
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 0 device 0 bus id: 0004:04:00.0
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 24
SharedMemoryMpi: Node communicator of size 6
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x200080000000 for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
AcceleratorCudaInit: rank 2 setting device to node rank 2
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 2 device 2 bus id: 0004:06:00.0
AcceleratorCudaInit: rank 1 setting device to node rank 1
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 1 device 1 bus id: 0004:05:00.0
AcceleratorCudaInit: rank 4 setting device to node rank 4
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 4 device 4 bus id: 0035:04:00.0
AcceleratorCudaInit: rank 3 setting device to node rank 3
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 3 device 3 bus id: 0035:03:00.0
AcceleratorCudaInit: rank 5 setting device to node rank 5
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 5 device 5 bus id: 0035:05:00.0
GNU General Public License for more details.
Current Grid git commit hash=7cb1ff7395a5833ded6526c43891bd07a0436290: (HEAD -> develop, origin/develop, origin/HEAD) clean
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 8388608000 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.544984 s : Grid Layout
Grid : Message : 1.544992 s : Global lattice size : 64 64 64 96
Grid : Message : 1.545003 s : OpenMP threads : 6
Grid : Message : 1.545011 s : MPI tasks : 2 2 2 3
AcceleratorCudaInit: rank 8 setting device to node rank 2
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 6 setting device to node rank 0
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 11 setting device to node rank 5
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 16 setting device to node rank 4
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 17 setting device to node rank 5
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 13 setting device to node rank 1
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 12 setting device to node rank 0
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 21 setting device to node rank 3
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 23 setting device to node rank 5
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 22 setting device to node rank 4
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 19 setting device to node rank 1
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 18 setting device to node rank 0
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 7 setting device to node rank 1
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 10 setting device to node rank 4
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 9 setting device to node rank 3
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 14 setting device to node rank 2
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 15 setting device to node rank 3
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 20 setting device to node rank 2
AcceleratorCudaInit: Configure options --enable-setdevice=yes
Grid : Message : 2.994920 s : Making s innermost grids
Grid : Message : 2.232502 s : Initialising 4d RNG
Grid : Message : 2.397047 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 2.397069 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.653140 s : Initialising 5d RNG
Grid : Message : 5.285347 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 5.285369 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 9.994738 s : Initialised RNGs
Grid : Message : 13.153426 s : Drawing gauge field
Grid : Message : 13.825697 s : Random gauge initialised
Grid : Message : 18.537657 s : Setting up Cshift based reference
Grid : Message : 22.296755 s : *****************************************************************
Grid : Message : 22.296781 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 22.296791 s : *****************************************************************
Grid : Message : 22.296800 s : *****************************************************************
Grid : Message : 22.296809 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 22.296818 s : * Vectorising space-time by 4
Grid : Message : 22.296828 s : * VComplexF size is 32 B
Grid : Message : 22.296838 s : * SINGLE precision
Grid : Message : 22.296847 s : * Using Overlapped Comms/Compute
Grid : Message : 22.296855 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 22.296863 s : *****************************************************************
Grid : Message : 24.746452 s : Called warmup
Grid : Message : 137.525756 s : Called Dw 3000 times in 1.12779e+08 us
Grid : Message : 137.525818 s : mflop/s = 1.41383e+07
Grid : Message : 137.525831 s : mflop/s per rank = 589097
Grid : Message : 137.525843 s : mflop/s per node = 3.53458e+06
Grid : Message : 137.525854 s : RF GiB/s (base 2) = 28728.7
Grid : Message : 137.525864 s : mem GiB/s (base 2) = 17955.5
Grid : Message : 137.693645 s : norm diff 1.04885e-13
Grid : Message : 137.965585 s : #### Dhop calls report
Grid : Message : 137.965598 s : WilsonFermion5D Number of DhopEO Calls : 6002
Grid : Message : 137.965612 s : WilsonFermion5D TotalTime /Calls : 18899.7 us
Grid : Message : 137.965624 s : WilsonFermion5D CommTime /Calls : 16041.4 us
Grid : Message : 137.965634 s : WilsonFermion5D FaceTime /Calls : 859.705 us
Grid : Message : 137.965644 s : WilsonFermion5D ComputeTime1/Calls : 70.5881 us
Grid : Message : 137.965654 s : WilsonFermion5D ComputeTime2/Calls : 2094.8 us
Grid : Message : 137.965682 s : Average mflops/s per call : 3.87638e+09
Grid : Message : 137.965692 s : Average mflops/s per call per rank : 1.61516e+08
Grid : Message : 137.965702 s : Average mflops/s per call per node : 9.69095e+08
Grid : Message : 137.965712 s : Average mflops/s per call (full) : 1.43168e+07
Grid : Message : 137.965721 s : Average mflops/s per call per rank (full): 596533
Grid : Message : 137.965730 s : Average mflops/s per call per node (full): 3.5792e+06
Grid : Message : 137.965740 s : WilsonFermion5D Stencil
Grid : Message : 137.965748 s : WilsonFermion5D StencilEven
Grid : Message : 137.965756 s : WilsonFermion5D StencilOdd
Grid : Message : 137.965764 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 137.965772 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 137.965780 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 156.554605 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 156.554632 s : Called DwDag
Grid : Message : 156.554642 s : norm dag result 12.0421
Grid : Message : 156.639265 s : norm dag ref 12.0421
Grid : Message : 156.888281 s : norm dag diff 7.62057e-14
Grid : Message : 157.609797 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 158.208630 s : src_e0.499996
Grid : Message : 158.162447 s : src_o0.500004
Grid : Message : 158.267780 s : *********************************************************
Grid : Message : 158.267791 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 158.267801 s : * Vectorising space-time by 4
Grid : Message : 158.267811 s : * SINGLE precision
Grid : Message : 158.267820 s : * Using Overlapped Comms/Compute
Grid : Message : 158.267828 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 158.267836 s : *********************************************************
Grid : Message : 216.487829 s : Deo mflop/s = 1.37283e+07
Grid : Message : 216.487869 s : Deo mflop/s per rank 572011
Grid : Message : 216.487881 s : Deo mflop/s per node 3.43206e+06
Grid : Message : 216.487893 s : #### Dhop calls report
Grid : Message : 216.487903 s : WilsonFermion5D Number of DhopEO Calls : 3001
Grid : Message : 216.487913 s : WilsonFermion5D TotalTime /Calls : 19399.6 us
Grid : Message : 216.487923 s : WilsonFermion5D CommTime /Calls : 16475.4 us
Grid : Message : 216.487933 s : WilsonFermion5D FaceTime /Calls : 972.393 us
Grid : Message : 216.487943 s : WilsonFermion5D ComputeTime1/Calls : 49.8474 us
Grid : Message : 216.487953 s : WilsonFermion5D ComputeTime2/Calls : 2089.93 us
Grid : Message : 216.488001 s : Average mflops/s per call : 5.39682e+09
Grid : Message : 216.488011 s : Average mflops/s per call per rank : 2.24867e+08
Grid : Message : 216.488020 s : Average mflops/s per call per node : 1.3492e+09
Grid : Message : 216.488030 s : Average mflops/s per call (full) : 1.39479e+07
Grid : Message : 216.488039 s : Average mflops/s per call per rank (full): 581162
Grid : Message : 216.488048 s : Average mflops/s per call per node (full): 3.48697e+06
Grid : Message : 216.488057 s : WilsonFermion5D Stencil
Grid : Message : 216.488065 s : WilsonFermion5D StencilEven
Grid : Message : 216.488073 s : WilsonFermion5D StencilOdd
Grid : Message : 216.488081 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 216.488089 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 216.488097 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 217.384495 s : r_e6.02113
Grid : Message : 217.426121 s : r_o6.02096
Grid : Message : 217.472636 s : res12.0421
Grid : Message : 218.200068 s : norm diff 0
Grid : Message : 218.645673 s : norm diff even 0
Grid : Message : 218.816561 s : norm diff odd 0

View File

@ -1,7 +1,7 @@
#!/bin/bash
#BSUB -P LGT104
#BSUB -W 2:00
-#BSUB -nnodes 4
+#BSUB -nnodes 16
#BSUB -J DWF
export OMP_NUM_THREADS=6
@ -9,14 +9,14 @@ export PAMI_IBV_ADAPTER_AFFINITY=1
export PAMI_ENABLE_STRIPING=1
export OPT="--comms-concurrent --comms-overlap "
-APP="./benchmarks/Benchmark_comms_host_device --mpi 2.2.2.3 "
+APP="./benchmarks/Benchmark_comms_host_device --mpi 4.4.4.3 "
-jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
+jsrun --nrs 16 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > comms.16node.log
-APP="./benchmarks/Benchmark_dwf_fp32 --grid 48.48.48.72 --mpi 2.2.2.3 --shm 1024 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
+APP="./benchmarks/Benchmark_dwf_fp32 --grid 96.96.96.72 --mpi 4.4.4.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
-jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
+jsrun --nrs 16 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > dwf.16node.24.log
-APP="./benchmarks/Benchmark_dwf_fp32 --grid 64.64.64.96 --mpi 2.2.2.3 --shm 1024 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
+APP="./benchmarks/Benchmark_dwf_fp32 --grid 128.128.128.96 --mpi 4.4.4.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
-jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
+jsrun --nrs 16 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > dwf.16node.32.log

View File

@ -10,13 +10,13 @@ export PAMI_ENABLE_STRIPING=1
export OPT="--comms-concurrent --comms-overlap "
#export GRID_ALLOC_NCACHE_LARGE=1
export APP="./benchmarks/Benchmark_comms_host_device --mpi 2.2.2.3 "
-jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
+jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > comms.4node
-APP="./benchmarks/Benchmark_dwf_fp32 --grid 48.48.48.72 --mpi 2.2.2.3 --shm 1024 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
+APP="./benchmarks/Benchmark_dwf_fp32 --grid 48.48.48.72 --mpi 2.2.2.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
-jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
+jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > dwf.24.4node
-APP="./benchmarks/Benchmark_dwf_fp32 --grid 64.64.64.96 --mpi 2.2.2.3 --shm 1024 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
+APP="./benchmarks/Benchmark_dwf_fp32 --grid 64.64.64.96 --mpi 2.2.2.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
-jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
+jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > dwf.32.4node

View File

@ -5,7 +5,7 @@
--enable-gen-simd-width=64 \
--enable-accelerator=cuda \
--with-lime=/mnt/lustre/tursafs1/home/tc002/tc002/dc-boyl1/spack/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/c-lime-2-3-9-e6wxqrid6rqmd45z7n32dxkvkykpvyez \
---disable-accelerator-cshift \
+--enable-accelerator-cshift \
--disable-unified \
CXX=nvcc \
LDFLAGS="-cudart shared " \

View File

@ -1,2 +1,6 @@
-spack load c-lime
-module load cuda/11.4.1 openmpi/4.1.1 ucx/1.10.1
+module load cuda/11.4.1 openmpi/4.1.1-cuda11.4.1 ucx/1.12.0-cuda11.4.1
+#module load cuda/11.4.1 openmpi/4.1.1 ucx/1.10.1
+export PREFIX=/home/tc002/tc002/shared/env/prefix/
+export LD_LIBRARY_PATH=$PREFIX/lib/:$LD_LIBRARY_PATH
+unset SBATCH_EXPORT

View File

@ -235,7 +235,6 @@ void TestWhat(What & Ddwf,
pickCheckerboard(Odd ,chi_o,chi);
pickCheckerboard(Even,phi_e,phi);
pickCheckerboard(Odd ,phi_o,phi);
-RealD t1,t2;
SchurDiagMooeeOperator<What,LatticeFermion> HermOpEO(Ddwf);
HermOpEO.MpcDagMpc(chi_e,dchi_e);

View File

@ -215,7 +215,6 @@ int main (int argc, char ** argv)
pickCheckerboard(Odd , chi_o, chi);
pickCheckerboard(Even, phi_e, phi);
pickCheckerboard(Odd , phi_o, phi);
-RealD t1,t2;
SchurDiagMooeeOperator<DomainWallEOFAFermionR,LatticeFermion> HermOpEO(Ddwf);
HermOpEO.MpcDagMpc(chi_e, dchi_e);

View File

@ -212,8 +212,6 @@ int main (int argc, char ** argv)
pickCheckerboard(Odd ,chi_o,chi);
pickCheckerboard(Even,phi_e,phi);
pickCheckerboard(Odd ,phi_o,phi);
-RealD t1,t2;
SchurDiagMooeeOperator<DomainWallFermionR,LatticeFermion> HermOpEO(Ddwf);
HermOpEO.MpcDagMpc(chi_e,dchi_e);

View File

@ -181,8 +181,8 @@ void checkAdj(const Gamma::Algebra a)
void checkProject(GridSerialRNG &rng)
{
-SpinVector rv, recon, full;
+SpinVector rv, recon;
-HalfSpinVector hsp, hsm;
+HalfSpinVector hsm;
random(rng, rv);

View File

@ -198,7 +198,6 @@ int main (int argc, char ** argv)
pickCheckerboard(Odd ,chi_o,chi);
pickCheckerboard(Even,phi_e,phi);
pickCheckerboard(Odd ,phi_o,phi);
-RealD t1,t2;
SchurDiagMooeeOperator<GparityWilsonFermionR,FermionField> HermOpEO(Dw);
HermOpEO.MpcDagMpc(chi_e,dchi_e);

View File

@ -364,14 +364,12 @@ int main(int argc, char **argv) {
{ // Peek-ology and Poke-ology, with a little app-ology
Complex c;
-ColourMatrix c_m;
+ColourMatrix c_m = Zero();
-SpinMatrix s_m;
+SpinMatrix s_m = Zero();
-SpinColourMatrix sc_m;
+SpinColourMatrix sc_m = Zero();
-s_m = TensorIndexRecursion<ColourIndex>::traceIndex(
-    sc_m); // Map to traceColour
+s_m = TensorIndexRecursion<ColourIndex>::traceIndex(sc_m); // Map to traceColour
-c_m = TensorIndexRecursion<SpinIndex>::traceIndex(
-    sc_m); // map to traceSpin
+c_m = TensorIndexRecursion<SpinIndex>::traceIndex(sc_m); // map to traceSpin
c = TensorIndexRecursion<SpinIndex>::traceIndex(s_m);
c = TensorIndexRecursion<ColourIndex>::traceIndex(c_m);

View File

@ -217,7 +217,6 @@ int main (int argc, char ** argv)
pickCheckerboard(Odd , chi_o, chi); pickCheckerboard(Odd , chi_o, chi);
pickCheckerboard(Even, phi_e, phi); pickCheckerboard(Even, phi_e, phi);
pickCheckerboard(Odd , phi_o, phi); pickCheckerboard(Odd , phi_o, phi);
RealD t1,t2;
SchurDiagMooeeOperator<MobiusEOFAFermionR,LatticeFermion> HermOpEO(Ddwf); SchurDiagMooeeOperator<MobiusEOFAFermionR,LatticeFermion> HermOpEO(Ddwf);
HermOpEO.MpcDagMpc(chi_e, dchi_e); HermOpEO.MpcDagMpc(chi_e, dchi_e);

View File

@ -262,7 +262,6 @@ int main (int argc, char ** argv)
pickCheckerboard(Odd ,chi_o,chi); pickCheckerboard(Odd ,chi_o,chi);
pickCheckerboard(Even,phi_e,phi); pickCheckerboard(Even,phi_e,phi);
pickCheckerboard(Odd ,phi_o,phi); pickCheckerboard(Odd ,phi_o,phi);
RealD t1,t2;
SchurDiagMooeeOperator<MobiusFermionR,LatticeFermion> HermOpEO(Ddwf); SchurDiagMooeeOperator<MobiusFermionR,LatticeFermion> HermOpEO(Ddwf);

View File

@ -144,7 +144,7 @@ int main (int argc, char ** argv)
Ds.Dhop(src,result,0); Ds.Dhop(src,result,0);
} }
double t1=usecond(); double t1=usecond();
double t2;
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146 double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146
std::cout<<GridLogMessage << "Called Ds"<<std::endl; std::cout<<GridLogMessage << "Called Ds"<<std::endl;
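
The flop count used in these staggered Dhop timings is compact. One hedged reading of the arithmetic behind the 1146 flops-per-site figure, and of how the rate is then formed (t0 is assumed to be the usecond() stamp taken before the ncall loop, as elsewhere in these benchmarks):

  // 16 SU(3) matrix-vector products per site, 3 rows each costing 6+8+8 flops:
  //   16 * (3*(6+8+8)) = 16 * 66 = 1056
  // plus 15 complex 3-vector accumulations: 15*3*2 = 90, so 1056 + 90 = 1146
  double flops  = (16*(3*(6+8+8)) + 15*3*2) * volume * ncall;
  double mflops = flops / (t1 - t0);   // t1-t0 is in microseconds, so this is Mflop/s
  std::cout << GridLogMessage << "mflop/s = " << mflops << std::endl;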

View File

@ -162,7 +162,6 @@ int main (int argc, char ** argv)
} }
double t1=usecond(); double t1=usecond();
double t2;
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146 double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146
std::cout<<GridLogMessage << "Called Ds"<<std::endl; std::cout<<GridLogMessage << "Called Ds"<<std::endl;

View File

@ -30,7 +30,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
using namespace std; using namespace std;
using namespace Grid; using namespace Grid;
;
int main (int argc, char ** argv) int main (int argc, char ** argv)
{ {
@ -135,7 +134,6 @@ int main (int argc, char ** argv)
Ds.Dhop(src,result,0); Ds.Dhop(src,result,0);
} }
double t1=usecond(); double t1=usecond();
double t2;
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146 double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146
std::cout<<GridLogMessage << "Called Ds"<<std::endl; std::cout<<GridLogMessage << "Called Ds"<<std::endl;

View File

@ -204,7 +204,6 @@ int main (int argc, char ** argv)
pickCheckerboard(Odd ,chi_o,chi); pickCheckerboard(Odd ,chi_o,chi);
pickCheckerboard(Even,phi_e,phi); pickCheckerboard(Even,phi_e,phi);
pickCheckerboard(Odd ,phi_o,phi); pickCheckerboard(Odd ,phi_o,phi);
RealD t1,t2;
SchurDiagMooeeOperator<WilsonFermionR,LatticeFermion> HermOpEO(Dw); SchurDiagMooeeOperator<WilsonFermionR,LatticeFermion> HermOpEO(Dw);
HermOpEO.MpcDagMpc(chi_e,dchi_e); HermOpEO.MpcDagMpc(chi_e,dchi_e);

View File

@ -205,7 +205,6 @@ int main (int argc, char ** argv)
pickCheckerboard(Odd ,chi_o,chi); pickCheckerboard(Odd ,chi_o,chi);
pickCheckerboard(Even,phi_e,phi); pickCheckerboard(Even,phi_e,phi);
pickCheckerboard(Odd ,phi_o,phi); pickCheckerboard(Odd ,phi_o,phi);
RealD t1,t2;
SchurDiagMooeeOperator<WilsonTMFermionR,LatticeFermion> HermOpEO(Dw); SchurDiagMooeeOperator<WilsonTMFermionR,LatticeFermion> HermOpEO(Dw);
HermOpEO.MpcDagMpc(chi_e,dchi_e); HermOpEO.MpcDagMpc(chi_e,dchi_e);

View File

@ -276,7 +276,6 @@ int main (int argc, char ** argv)
pickCheckerboard(Odd ,chi_o,chi); pickCheckerboard(Odd ,chi_o,chi);
pickCheckerboard(Even,phi_e,phi); pickCheckerboard(Even,phi_e,phi);
pickCheckerboard(Odd ,phi_o,phi); pickCheckerboard(Odd ,phi_o,phi);
RealD t1,t2;
SchurDiagMooeeOperator<ZMobiusFermionR,LatticeFermion> HermOpEO(Ddwf); SchurDiagMooeeOperator<ZMobiusFermionR,LatticeFermion> HermOpEO(Ddwf);

View File

@ -57,7 +57,6 @@ int main (int argc, char ** argv)
SU<Nc>::HotConfiguration(pRNG,U); SU<Nc>::HotConfiguration(pRNG,U);
double beta = 1.0; double beta = 1.0;
double c1 = -0.331;
IwasakiGaugeActionR Action(beta); IwasakiGaugeActionR Action(beta);
// PlaqPlusRectangleActionR Action(beta,c1); // PlaqPlusRectangleActionR Action(beta,c1);
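
Dropping the local c1 is safe here because the Iwasaki constructor carries its own rectangle coefficient; the commented-out line shows the generic form it specialises. A minimal sketch of the two constructions, assuming (as the removed line suggested) c1 = -0.331 for the Iwasaki choice:

  double beta = 1.0;
  IwasakiGaugeActionR      ActionIwasaki(beta);           // rectangle coefficient fixed internally
  PlaqPlusRectangleActionR ActionGeneric(beta, -0.331);   // the generic plaquette+rectangle form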

View File

@ -40,6 +40,7 @@ using namespace Grid;
template<class Fobj,class CComplex,int nbasis> template<class Fobj,class CComplex,int nbasis>
class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > { class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
public: public:
using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
typedef iVector<CComplex,nbasis > CoarseSiteVector; typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField; typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
@ -67,6 +68,8 @@ public:
template<class Fobj,class CComplex,int nbasis> template<class Fobj,class CComplex,int nbasis>
class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > { class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
public: public:
using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator ();
typedef iVector<CComplex,nbasis > CoarseSiteVector; typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField; typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
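
The `using LinearFunction<...>::operator();` lines added here, and in the smoother and multigrid classes below, all address the same C++ rule: declaring any operator() in a derived class hides every base-class overload of that name, so a class that overrides only the single-field form would lose any additional overloads the base now declares. A minimal self-contained sketch of the problem and the fix, with hypothetical Base/Derived names rather than the Grid classes:

  #include <vector>

  struct Base {
    virtual void operator()(int x) = 0;                    // the overload everyone overrides
    virtual void operator()(const std::vector<int> &xs) {  // an overload added later
      for (int x : xs) (*this)(x);
    }
    virtual ~Base() = default;
  };

  struct Derived : Base {
    using Base::operator();                 // re-expose the hidden overload set
    void operator()(int) override {}        // only overrides the single-argument form
  };

  int main() {
    Derived d;
    d(std::vector<int>{1, 2, 3});           // compiles only because of the using-declaration
    return 0;
  }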

View File

@ -55,6 +55,7 @@ RealD InverseApproximation(RealD x){
template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field> template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
{ {
public: public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator; typedef LinearOperatorBase<Field> FineOperator;
Matrix & _SmootherMatrix; Matrix & _SmootherMatrix;
FineOperator & _SmootherOperator; FineOperator & _SmootherOperator;
@ -78,6 +79,7 @@ public:
template<class Field,class Matrix> class MirsSmoother : public LinearFunction<Field> template<class Field,class Matrix> class MirsSmoother : public LinearFunction<Field>
{ {
public: public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator; typedef LinearOperatorBase<Field> FineOperator;
Matrix & SmootherMatrix; Matrix & SmootherMatrix;
FineOperator & SmootherOperator; FineOperator & SmootherOperator;
@ -108,6 +110,7 @@ public:
template<class Fobj,class CComplex,int nbasis, class Matrix, class Guesser, class CoarseSolver> template<class Fobj,class CComplex,int nbasis, class Matrix, class Guesser, class CoarseSolver>
class MultiGridPreconditioner : public LinearFunction< Lattice<Fobj> > { class MultiGridPreconditioner : public LinearFunction< Lattice<Fobj> > {
public: public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates; typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator; typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator;

View File

@ -56,9 +56,9 @@ template<class Field> class SolverWrapper : public LinearFunction<Field> {
private: private:
CheckerBoardedSparseMatrixBase<Field> & _Matrix; CheckerBoardedSparseMatrixBase<Field> & _Matrix;
SchurRedBlackBase<Field> & _Solver; SchurRedBlackBase<Field> & _Solver;
public: public:
using LinearFunction<Field>::operator();
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// Wrap the usual normal equations trick // Wrap the usual normal equations trick
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
SolverWrapper(CheckerBoardedSparseMatrixBase<Field> &Matrix, SolverWrapper(CheckerBoardedSparseMatrixBase<Field> &Matrix,
@ -75,6 +75,7 @@ public:
template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field> template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
{ {
public: public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator; typedef LinearOperatorBase<Field> FineOperator;
Matrix & _SmootherMatrix; Matrix & _SmootherMatrix;
FineOperator & _SmootherOperator; FineOperator & _SmootherOperator;
@ -98,6 +99,7 @@ public:
template<class Field,class Matrix> class MirsSmoother : public LinearFunction<Field> template<class Field,class Matrix> class MirsSmoother : public LinearFunction<Field>
{ {
public: public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator; typedef LinearOperatorBase<Field> FineOperator;
Matrix & SmootherMatrix; Matrix & SmootherMatrix;
FineOperator & SmootherOperator; FineOperator & SmootherOperator;
@ -128,6 +130,7 @@ public:
template<class Fobj,class CComplex,int nbasis, class Matrix, class Guesser, class CoarseSolver> template<class Fobj,class CComplex,int nbasis, class Matrix, class Guesser, class CoarseSolver>
class MultiGridPreconditioner : public LinearFunction< Lattice<Fobj> > { class MultiGridPreconditioner : public LinearFunction< Lattice<Fobj> > {
public: public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates; typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator; typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator;
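
The "wrap the usual normal equations trick" comment recurring in these wrapper classes refers to making a general matrix solvable by conjugate gradient: CG wants a Hermitian positive-definite operator, so one solves Mdag M x = Mdag b instead of M x = b. A hedged sketch of such a wrapper in the style of these files; the Grid-like names (SparseMatrixBase, OperatorFunction, MdagMLinearOperator) and their signatures should be treated as assumptions, not the exact classes used here:

  template<class Field> class NormalEquationsSketch : public LinearFunction<Field> {
    SparseMatrixBase<Field> &_Matrix;      // supplies M and Mdag
    OperatorFunction<Field> &_HermSolver;  // e.g. ConjugateGradient, applied to MdagM
  public:
    using LinearFunction<Field>::operator();
    NormalEquationsSketch(SparseMatrixBase<Field> &M, OperatorFunction<Field> &S)
      : _Matrix(M), _HermSolver(S) {}
    void operator()(const Field &b, Field &x) {
      Field src(b.Grid());
      _Matrix.Mdag(b, src);                                             // src = Mdag b
      MdagMLinearOperator<SparseMatrixBase<Field>, Field> MdagM(_Matrix);
      _HermSolver(MdagM, src, x);                                       // solve (Mdag M) x = Mdag b
    }
  };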

View File

@ -55,6 +55,7 @@ RealD InverseApproximation(RealD x){
template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field> template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
{ {
public: public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator; typedef LinearOperatorBase<Field> FineOperator;
Matrix & _SmootherMatrix; Matrix & _SmootherMatrix;
FineOperator & _SmootherOperator; FineOperator & _SmootherOperator;
@ -78,6 +79,7 @@ public:
template<class Field,class Matrix> class MirsSmoother : public LinearFunction<Field> template<class Field,class Matrix> class MirsSmoother : public LinearFunction<Field>
{ {
public: public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator; typedef LinearOperatorBase<Field> FineOperator;
Matrix & SmootherMatrix; Matrix & SmootherMatrix;
FineOperator & SmootherOperator; FineOperator & SmootherOperator;
@ -108,6 +110,8 @@ public:
template<class Fobj,class CComplex,int nbasis, class Matrix, class Guesser, class CoarseSolver> template<class Fobj,class CComplex,int nbasis, class Matrix, class Guesser, class CoarseSolver>
class MultiGridPreconditioner : public LinearFunction< Lattice<Fobj> > { class MultiGridPreconditioner : public LinearFunction< Lattice<Fobj> > {
public: public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates; typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator; typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator;

View File

@ -56,6 +56,7 @@ RealD InverseApproximation(RealD x){
template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field> template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
{ {
public: public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator; typedef LinearOperatorBase<Field> FineOperator;
Matrix & _SmootherMatrix; Matrix & _SmootherMatrix;
FineOperator & _SmootherOperator; FineOperator & _SmootherOperator;
@ -79,6 +80,7 @@ public:
template<class Field,class Matrix> class MirsSmoother : public LinearFunction<Field> template<class Field,class Matrix> class MirsSmoother : public LinearFunction<Field>
{ {
public: public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator; typedef LinearOperatorBase<Field> FineOperator;
Matrix & SmootherMatrix; Matrix & SmootherMatrix;
FineOperator & SmootherOperator; FineOperator & SmootherOperator;
@ -108,6 +110,7 @@ public:
template<class Field,class Matrix> class RedBlackSmoother : public LinearFunction<Field> template<class Field,class Matrix> class RedBlackSmoother : public LinearFunction<Field>
{ {
public: public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator; typedef LinearOperatorBase<Field> FineOperator;
Matrix & SmootherMatrix; Matrix & SmootherMatrix;
RealD tol; RealD tol;
@ -134,6 +137,7 @@ public:
template<class Fobj,class CComplex,int nbasis, class Matrix, class Guesser, class CoarseSolver> template<class Fobj,class CComplex,int nbasis, class Matrix, class Guesser, class CoarseSolver>
class MultiGridPreconditioner : public LinearFunction< Lattice<Fobj> > { class MultiGridPreconditioner : public LinearFunction< Lattice<Fobj> > {
public: public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates; typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator; typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator;
@ -241,7 +245,7 @@ int main (int argc, char ** argv)
Grid_init(&argc,&argv); Grid_init(&argc,&argv);
const int Ls=16; const int Ls=16;
const int rLs=8; // const int rLs=8;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@ -388,7 +392,7 @@ int main (int argc, char ** argv)
// RedBlackSmoother<LatticeFermion,DomainWallFermionR> FineRBSmoother(0.00,0.001,100,Ddwf); // RedBlackSmoother<LatticeFermion,DomainWallFermionR> FineRBSmoother(0.00,0.001,100,Ddwf);
// Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space // Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space
ZeroGuesser<CoarseVector> CoarseZeroGuesser; // ZeroGuesser<CoarseVector> CoarseZeroGuesser;
TwoLevelMG TwoLevelPrecon(Aggregates, LDOp, TwoLevelMG TwoLevelPrecon(Aggregates, LDOp,
HermIndefOp,Ddwf, HermIndefOp,Ddwf,
FineSmoother, FineSmoother,

View File

@ -57,7 +57,7 @@ private:
CheckerBoardedSparseMatrixBase<Field> & _Matrix; CheckerBoardedSparseMatrixBase<Field> & _Matrix;
SchurRedBlackBase<Field> & _Solver; SchurRedBlackBase<Field> & _Solver;
public: public:
using LinearFunction<Field>::operator();
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// Wrap the usual normal equations trick // Wrap the usual normal equations trick
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
@ -75,6 +75,7 @@ public:
template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field> template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
{ {
public: public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator; typedef LinearOperatorBase<Field> FineOperator;
Matrix & _SmootherMatrix; Matrix & _SmootherMatrix;
FineOperator & _SmootherOperator; FineOperator & _SmootherOperator;
@ -98,6 +99,7 @@ public:
template<class Field,class Matrix> class MirsSmoother : public LinearFunction<Field> template<class Field,class Matrix> class MirsSmoother : public LinearFunction<Field>
{ {
public: public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator; typedef LinearOperatorBase<Field> FineOperator;
Matrix & SmootherMatrix; Matrix & SmootherMatrix;
FineOperator & SmootherOperator; FineOperator & SmootherOperator;
@ -128,6 +130,7 @@ public:
template<class Fobj,class CComplex,int nbasis, class Matrix, class Guesser, class CoarseSolver> template<class Fobj,class CComplex,int nbasis, class Matrix, class Guesser, class CoarseSolver>
class MultiGridPreconditioner : public LinearFunction< Lattice<Fobj> > { class MultiGridPreconditioner : public LinearFunction< Lattice<Fobj> > {
public: public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates; typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator; typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator;

View File

@ -55,6 +55,7 @@ RealD InverseApproximation(RealD x){
template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field> template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
{ {
public: public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator; typedef LinearOperatorBase<Field> FineOperator;
Matrix & _SmootherMatrix; Matrix & _SmootherMatrix;
FineOperator & _SmootherOperator; FineOperator & _SmootherOperator;
@ -78,6 +79,7 @@ public:
template<class Field,class Matrix> class MirsSmoother : public LinearFunction<Field> template<class Field,class Matrix> class MirsSmoother : public LinearFunction<Field>
{ {
public: public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator; typedef LinearOperatorBase<Field> FineOperator;
Matrix & SmootherMatrix; Matrix & SmootherMatrix;
FineOperator & SmootherOperator; FineOperator & SmootherOperator;
@ -108,6 +110,7 @@ public:
template<class Fobj,class CComplex,int nbasis, class Matrix, class Guesser, class CoarseSolver> template<class Fobj,class CComplex,int nbasis, class Matrix, class Guesser, class CoarseSolver>
class MultiGridPreconditioner : public LinearFunction< Lattice<Fobj> > { class MultiGridPreconditioner : public LinearFunction< Lattice<Fobj> > {
public: public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates; typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator; typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator;

View File

@ -57,6 +57,7 @@ private:
OperatorFunction<Field> & _Solver; OperatorFunction<Field> & _Solver;
LinearFunction<Field> & _Guess; LinearFunction<Field> & _Guess;
public: public:
using LinearFunction<Field>::operator();
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// Wrap the usual normal equations trick // Wrap the usual normal equations trick
@ -118,6 +119,7 @@ RealD InverseApproximation(RealD x){
template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field> template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
{ {
public: public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator; typedef LinearOperatorBase<Field> FineOperator;
Matrix & _SmootherMatrix; Matrix & _SmootherMatrix;
FineOperator & _SmootherOperator; FineOperator & _SmootherOperator;
@ -174,6 +176,7 @@ public:
template<class Fobj,class CComplex,int nbasis, class CoarseSolver> template<class Fobj,class CComplex,int nbasis, class CoarseSolver>
class HDCRPreconditioner : public LinearFunction< Lattice<Fobj> > { class HDCRPreconditioner : public LinearFunction< Lattice<Fobj> > {
public: public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates; typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator; typedef CoarsenedMatrix<Fobj,CComplex,nbasis> CoarseOperator;

View File

@ -456,8 +456,8 @@ public:
siteVector *CBp=Stencil.CommBuf(); siteVector *CBp=Stencil.CommBuf();
int ptype; // int ptype;
int nb2=nbasis/2; // int nb2=nbasis/2;
autoView(in_v , in, AcceleratorRead); autoView(in_v , in, AcceleratorRead);
autoView(st, Stencil, AcceleratorRead); autoView(st, Stencil, AcceleratorRead);
@ -471,7 +471,7 @@ public:
typedef decltype(coalescedRead(in_v[0])) calcVector; typedef decltype(coalescedRead(in_v[0])) calcVector;
typedef decltype(coalescedRead(in_v[0](0))) calcComplex; typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
int sU = sF/Ls; int sU = sF/Ls;
int s = sF%Ls; // int s = sF%Ls;
calcComplex res = Zero(); calcComplex res = Zero();
calcVector nbr; calcVector nbr;
@ -517,14 +517,14 @@ public:
autoView(st, Stencil, AcceleratorRead); autoView(st, Stencil, AcceleratorRead);
siteVector *CBp=Stencil.CommBuf(); siteVector *CBp=Stencil.CommBuf();
int ptype; // int ptype;
int nb2=nbasis/2; // int nb2=nbasis/2;
accelerator_for2d(sF, Coarse5D->oSites(), b, nbasis, Nsimd, { accelerator_for2d(sF, Coarse5D->oSites(), b, nbasis, Nsimd, {
typedef decltype(coalescedRead(in_v[0])) calcVector; typedef decltype(coalescedRead(in_v[0])) calcVector;
typedef decltype(coalescedRead(in_v[0](0))) calcComplex; typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
int sU = sF/Ls; int sU = sF/Ls;
int s = sF%Ls; // int s = sF%Ls;
calcComplex res = Zero(); calcComplex res = Zero();
@ -650,7 +650,7 @@ private:
OperatorFunction<Field> & _Solver; OperatorFunction<Field> & _Solver;
LinearFunction<Field> & _Guess; LinearFunction<Field> & _Guess;
public: public:
using LinearFunction<Field>::operator();
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// Wrap the usual normal equations trick // Wrap the usual normal equations trick
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
@ -712,6 +712,7 @@ RealD InverseApproximation(RealD x){
template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field> template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
{ {
public: public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator; typedef LinearOperatorBase<Field> FineOperator;
Matrix & _SmootherMatrix; Matrix & _SmootherMatrix;
FineOperator & _SmootherOperator; FineOperator & _SmootherOperator;
@ -735,6 +736,7 @@ public:
template<class Fobj,class CComplex,int nbasis, class CoarseSolver> template<class Fobj,class CComplex,int nbasis, class CoarseSolver>
class MGPreconditioner : public LinearFunction< Lattice<Fobj> > { class MGPreconditioner : public LinearFunction< Lattice<Fobj> > {
public: public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates; typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector; typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
@ -831,6 +833,7 @@ public:
template<class Fobj,class CComplex,int nbasis, class CoarseSolver> template<class Fobj,class CComplex,int nbasis, class CoarseSolver>
class HDCRPreconditioner : public LinearFunction< Lattice<Fobj> > { class HDCRPreconditioner : public LinearFunction< Lattice<Fobj> > {
public: public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates; typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector; typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
@ -1174,18 +1177,18 @@ int main (int argc, char ** argv)
PlainHermOp<CoarseCoarseVector> IRLOpL2 (IRLHermOpL2); PlainHermOp<CoarseCoarseVector> IRLOpL2 (IRLHermOpL2);
ImplicitlyRestartedLanczos<CoarseCoarseVector> IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20); ImplicitlyRestartedLanczos<CoarseCoarseVector> IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20);
int cNconv;
cNm=0; cNm=0;
std::vector<RealD> eval2(cNm); std::vector<RealD> eval2(cNm);
std::vector<CoarseCoarseVector> evec2(cNm,CoarseCoarse5d); std::vector<CoarseCoarseVector> evec2(cNm,CoarseCoarse5d);
cc_src=1.0; cc_src=1.0;
// int cNconv;
// IRLL2.calc(eval2,evec2,cc_src,cNconv); // IRLL2.calc(eval2,evec2,cc_src,cNconv);
ConjugateGradient<CoarseCoarseVector> CoarseCoarseCG(0.02,10000); ConjugateGradient<CoarseCoarseVector> CoarseCoarseCG(0.02,10000);
DeflatedGuesser<CoarseCoarseVector> DeflCoarseCoarseGuesser(evec2,eval2); DeflatedGuesser<CoarseCoarseVector> DeflCoarseCoarseGuesser(evec2,eval2);
NormalEquations<CoarseCoarseVector> DeflCoarseCoarseCGNE(cc_Dwf,CoarseCoarseCG,DeflCoarseCoarseGuesser); NormalEquations<CoarseCoarseVector> DeflCoarseCoarseCGNE(cc_Dwf,CoarseCoarseCG,DeflCoarseCoarseGuesser);
ZeroGuesser<CoarseVector> CoarseZeroGuesser; // ZeroGuesser<CoarseVector> CoarseZeroGuesser;
ZeroGuesser<CoarseCoarseVector> CoarseCoarseZeroGuesser; ZeroGuesser<CoarseCoarseVector> CoarseCoarseZeroGuesser;
std::cout<<GridLogMessage << "**************************************************"<< std::endl; std::cout<<GridLogMessage << "**************************************************"<< std::endl;
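
The guesser plumbing kept above (DeflatedGuesser feeding NormalEquations) supplies the solver's starting vector from the stored low modes. A hedged sketch of what that initial guess is, assuming eigenpairs (eval[i], evec[i]) of the Hermitian operator; this illustrates the idea only, not the Grid implementation:

  // guess = sum_i  evec[i] * <evec[i], src> / eval[i]
  template<class Field>
  void DeflatedGuessSketch(const std::vector<Field> &evec,
                           const std::vector<RealD> &eval,
                           const Field &src, Field &guess) {
    guess = Zero();
    for (size_t i = 0; i < evec.size(); i++) {
      ComplexD dot = innerProduct(evec[i], src);   // <v_i, b>
      axpy(guess, dot / eval[i], evec[i], guess);  // guess += (dot/eval_i) * evec_i
    }
  }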

View File

@ -456,8 +456,8 @@ public:
siteVector *CBp=Stencil.CommBuf(); siteVector *CBp=Stencil.CommBuf();
int ptype; //int ptype;
int nb2=nbasis/2; // int nb2=nbasis/2;
autoView(in_v , in, AcceleratorRead); autoView(in_v , in, AcceleratorRead);
autoView(st, Stencil, AcceleratorRead); autoView(st, Stencil, AcceleratorRead);
@ -471,7 +471,7 @@ public:
typedef decltype(coalescedRead(in_v[0])) calcVector; typedef decltype(coalescedRead(in_v[0])) calcVector;
typedef decltype(coalescedRead(in_v[0](0))) calcComplex; typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
int sU = sF/Ls; int sU = sF/Ls;
int s = sF%Ls; // int s = sF%Ls;
calcComplex res = Zero(); calcComplex res = Zero();
calcVector nbr; calcVector nbr;
@ -517,14 +517,14 @@ public:
autoView(st, Stencil, AcceleratorRead); autoView(st, Stencil, AcceleratorRead);
siteVector *CBp=Stencil.CommBuf(); siteVector *CBp=Stencil.CommBuf();
int ptype; // int ptype;
int nb2=nbasis/2; // int nb2=nbasis/2;
accelerator_for2d(sF, Coarse5D->oSites(), b, nbasis, Nsimd, { accelerator_for2d(sF, Coarse5D->oSites(), b, nbasis, Nsimd, {
typedef decltype(coalescedRead(in_v[0])) calcVector; typedef decltype(coalescedRead(in_v[0])) calcVector;
typedef decltype(coalescedRead(in_v[0](0))) calcComplex; typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
int sU = sF/Ls; int sU = sF/Ls;
int s = sF%Ls; // int s = sF%Ls;
calcComplex res = Zero(); calcComplex res = Zero();
@ -648,7 +648,7 @@ private:
CheckerBoardedSparseMatrixBase<Field> & _Matrix; CheckerBoardedSparseMatrixBase<Field> & _Matrix;
SchurRedBlackBase<Field> & _Solver; SchurRedBlackBase<Field> & _Solver;
public: public:
using LinearFunction<Field>::operator();
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// Wrap the usual normal equations trick // Wrap the usual normal equations trick
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
@ -669,6 +669,7 @@ private:
OperatorFunction<Field> & _Solver; OperatorFunction<Field> & _Solver;
LinearFunction<Field> & _Guess; LinearFunction<Field> & _Guess;
public: public:
using LinearFunction<Field>::operator();
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// Wrap the usual normal equations trick // Wrap the usual normal equations trick
@ -731,6 +732,7 @@ RealD InverseApproximation(RealD x){
template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field> template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
{ {
public: public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator; typedef LinearOperatorBase<Field> FineOperator;
Matrix & _SmootherMatrix; Matrix & _SmootherMatrix;
FineOperator & _SmootherOperator; FineOperator & _SmootherOperator;
@ -754,6 +756,7 @@ public:
template<class Fobj,class CComplex,int nbasis, class CoarseSolver> template<class Fobj,class CComplex,int nbasis, class CoarseSolver>
class MGPreconditioner : public LinearFunction< Lattice<Fobj> > { class MGPreconditioner : public LinearFunction< Lattice<Fobj> > {
public: public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates; typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector; typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
@ -850,7 +853,8 @@ public:
template<class Fobj,class CComplex,int nbasis, class CoarseSolver> template<class Fobj,class CComplex,int nbasis, class CoarseSolver>
class HDCRPreconditioner : public LinearFunction< Lattice<Fobj> > { class HDCRPreconditioner : public LinearFunction< Lattice<Fobj> > {
public: public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates; typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector; typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseMatrix CoarseMatrix; typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseMatrix CoarseMatrix;
@ -1194,11 +1198,11 @@ int main (int argc, char ** argv)
PlainHermOp<CoarseCoarseVector> IRLOpL2 (IRLHermOpL2); PlainHermOp<CoarseCoarseVector> IRLOpL2 (IRLHermOpL2);
ImplicitlyRestartedLanczos<CoarseCoarseVector> IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20); ImplicitlyRestartedLanczos<CoarseCoarseVector> IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20);
int cNconv;
cNm=0; cNm=0;
std::vector<RealD> eval2(cNm); std::vector<RealD> eval2(cNm);
std::vector<CoarseCoarseVector> evec2(cNm,CoarseCoarse5d); std::vector<CoarseCoarseVector> evec2(cNm,CoarseCoarse5d);
cc_src=1.0; cc_src=1.0;
// int cNconv;
// IRLL2.calc(eval2,evec2,cc_src,cNconv); // IRLL2.calc(eval2,evec2,cc_src,cNconv);
std::vector<RealD> tols ({0.005,0.001}); std::vector<RealD> tols ({0.005,0.001});
@ -1218,10 +1222,10 @@ int main (int argc, char ** argv)
for(auto c_hi : c_his ) { for(auto c_hi : c_his ) {
for(auto f_lo : f_los ) { for(auto f_lo : f_los ) {
for(auto f_hi : f_his ) { for(auto f_hi : f_his ) {
ZeroGuesser<CoarseVector> CoarseZeroGuesser; // ZeroGuesser<CoarseVector> CoarseZeroGuesser;
ZeroGuesser<CoarseCoarseVector> CoarseCoarseZeroGuesser; // ZeroGuesser<CoarseCoarseVector> CoarseCoarseZeroGuesser;
ConjugateGradient<CoarseCoarseVector> CoarseCoarseCG(tol,10000); ConjugateGradient<CoarseCoarseVector> CoarseCoarseCG(tol,10000);
ZeroGuesser<CoarseCoarseVector> CoarseCoarseGuesser; // ZeroGuesser<CoarseCoarseVector> CoarseCoarseGuesser;
SchurRedBlackDiagMooeeSolve<CoarseCoarseVector> CoarseCoarseRBCG(CoarseCoarseCG); SchurRedBlackDiagMooeeSolve<CoarseCoarseVector> CoarseCoarseRBCG(CoarseCoarseCG);
SchurSolverWrapper<CoarseCoarseVector> CoarseCoarseSolver(cc_Dwf,CoarseCoarseRBCG); SchurSolverWrapper<CoarseCoarseVector> CoarseCoarseSolver(cc_Dwf,CoarseCoarseRBCG);

View File

@ -143,6 +143,7 @@ public:
template<class Field> class MultiGridPreconditionerBase : public LinearFunction<Field> { template<class Field> class MultiGridPreconditionerBase : public LinearFunction<Field> {
public: public:
using LinearFunction<Field>::operator();
virtual ~MultiGridPreconditionerBase() = default; virtual ~MultiGridPreconditionerBase() = default;
virtual void setup() = 0; virtual void setup() = 0;
virtual void operator()(Field const &in, Field &out) = 0; virtual void operator()(Field const &in, Field &out) = 0;
@ -156,6 +157,7 @@ public:
///////////////////////////////////////////// /////////////////////////////////////////////
// Type Definitions // Type Definitions
///////////////////////////////////////////// /////////////////////////////////////////////
using MultiGridPreconditionerBase<Lattice<Fobj>>::operator();
// clang-format off // clang-format off
typedef Aggregation<Fobj, CComplex, nBasis> Aggregates; typedef Aggregation<Fobj, CComplex, nBasis> Aggregates;
@ -568,6 +570,7 @@ public:
///////////////////////////////////////////// /////////////////////////////////////////////
// Type Definitions // Type Definitions
///////////////////////////////////////////// /////////////////////////////////////////////
using MultiGridPreconditionerBase<Lattice<Fobj>>::operator();
typedef Matrix FineDiracMatrix; typedef Matrix FineDiracMatrix;
typedef Lattice<Fobj> FineVector; typedef Lattice<Fobj> FineVector;

View File

@ -56,7 +56,6 @@ int main (int argc, char ** argv)
QuasiMinimalResidual<LatticeFermion> QMR(1.0e-8,10000); QuasiMinimalResidual<LatticeFermion> QMR(1.0e-8,10000);
RealD mass=0.0; RealD mass=0.0;
RealD M5=1.8;
WilsonFermionR Dw(Umu,*Grid,*rbGrid,mass); WilsonFermionR Dw(Umu,*Grid,*rbGrid,mass);
NonHermitianLinearOperator<WilsonFermionR,LatticeFermion> NonHermOp(Dw); NonHermitianLinearOperator<WilsonFermionR,LatticeFermion> NonHermOp(Dw);