mirror of https://github.com/paboyle/Grid.git synced 2025-06-23 10:12:02 +01:00

Compare commits


1 Commit

Author SHA1 Message Date
8b91b61b61 First cut at faster GPU slice sum via atomics 2022-12-22 15:13:45 -05:00
493 changed files with 7184 additions and 36554 deletions

View File

@ -1,54 +0,0 @@
name: Bug report
description: Report a bug.
title: "<insert title>"
labels: [bug]
body:
- type: markdown
attributes:
value: >
Thank you for taking the time to file a bug report.
Please check that the code is pointing to the HEAD of develop
or any commit in master which is tagged with a version number.
- type: textarea
attributes:
label: "Describe the issue:"
description: >
Describe the issue and any previous attempt to solve it.
validations:
required: true
- type: textarea
attributes:
label: "Code example:"
description: >
If relevant, show how to reproduce the issue using a minimal working
example.
placeholder: |
<< your code here >>
render: shell
validations:
required: false
- type: textarea
attributes:
label: "Target platform:"
description: >
Give a description of the target platform (CPU, network, compiler).
Please give the full CPU part description, using for example
`cat /proc/cpuinfo | grep 'model name' | uniq` (Linux)
or `sysctl machdep.cpu.brand_string` (macOS), and the full output of
the `--version` option of your compiler.
validations:
required: true
- type: textarea
attributes:
label: "Configure options:"
description: >
Please give the exact configure command used and attach
`config.log`, `grid.config.summary` and the output of `make V=1`.
render: shell
validations:
required: true

View File

@ -45,7 +45,7 @@ directory
//disables nvcc specific warning in json.hpp //disables nvcc specific warning in json.hpp
#pragma clang diagnostic ignored "-Wdeprecated-register" #pragma clang diagnostic ignored "-Wdeprecated-register"
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ #if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
//disables nvcc specific warning in json.hpp //disables nvcc specific warning in json.hpp
#pragma nv_diag_suppress unsigned_compare_with_zero #pragma nv_diag_suppress unsigned_compare_with_zero
#pragma nv_diag_suppress cast_to_qualified_type #pragma nv_diag_suppress cast_to_qualified_type
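
Aside: the feature-test macro is the more robust guard here. A raw version test such as (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5) is false for CUDA 12.0 (minor version 0), whereas nvcc defines __NVCC_DIAG_PRAGMA_SUPPORT__ itself whenever the nv_diag_* spellings are available. A minimal sketch of the pattern, assuming only standard nvcc predefined macros:

#ifdef __NVCC__
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ // nv_diag_* spelling available
#pragma nv_diag_suppress code_is_unreachable
#else // older nvcc spelling
#pragma diag_suppress code_is_unreachable
#endif
#endif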

View File

@ -44,10 +44,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridStd.h> #include <Grid/GridStd.h>
#include <Grid/threads/Pragmas.h> #include <Grid/threads/Pragmas.h>
#include <Grid/perfmon/Timer.h> #include <Grid/perfmon/Timer.h>
//#include <Grid/perfmon/PerfCount.h> #include <Grid/perfmon/PerfCount.h>
#include <Grid/util/Util.h> #include <Grid/util/Util.h>
#include <Grid/log/Log.h> #include <Grid/log/Log.h>
#include <Grid/perfmon/Tracing.h>
#include <Grid/allocator/Allocator.h> #include <Grid/allocator/Allocator.h>
#include <Grid/simd/Simd.h> #include <Grid/simd/Simd.h>
#include <Grid/threads/ThreadReduction.h> #include <Grid/threads/ThreadReduction.h>

View File

@ -36,7 +36,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <Grid/qcd/QCD.h> #include <Grid/qcd/QCD.h>
#include <Grid/qcd/spin/Spin.h> #include <Grid/qcd/spin/Spin.h>
#include <Grid/qcd/gparity/Gparity.h>
#include <Grid/qcd/utils/Utils.h> #include <Grid/qcd/utils/Utils.h>
#include <Grid/qcd/representations/Representations.h> #include <Grid/qcd/representations/Representations.h>
NAMESPACE_CHECK(GridQCDCore); NAMESPACE_CHECK(GridQCDCore);

View File

@ -14,7 +14,7 @@
/* NVCC save and restore compile environment*/ /* NVCC save and restore compile environment*/
#ifdef __NVCC__ #ifdef __NVCC__
#pragma push #pragma push
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ #if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
#pragma nv_diag_suppress code_is_unreachable #pragma nv_diag_suppress code_is_unreachable
#else #else
#pragma diag_suppress code_is_unreachable #pragma diag_suppress code_is_unreachable

View File

@ -66,10 +66,6 @@ if BUILD_FERMION_REPS
extra_sources+=$(ADJ_FERMION_FILES) extra_sources+=$(ADJ_FERMION_FILES)
extra_sources+=$(TWOIND_FERMION_FILES) extra_sources+=$(TWOIND_FERMION_FILES)
endif endif
if BUILD_SP
extra_sources+=$(SP_FERMION_FILES)
extra_sources+=$(SP_TWOIND_FERMION_FILES)
endif
lib_LIBRARIES = libGrid.a lib_LIBRARIES = libGrid.a

View File

@ -54,8 +54,6 @@ NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/SchurRedBlack.h> #include <Grid/algorithms/iterative/SchurRedBlack.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h> #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h> #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h>
#include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h> #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h> #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h> #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
@ -69,8 +67,7 @@ NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/PowerMethod.h> #include <Grid/algorithms/iterative/PowerMethod.h>
NAMESPACE_CHECK(PowerMethod); NAMESPACE_CHECK(PowerMethod);
#include <Grid/algorithms/multigrid/MultiGrid.h> #include <Grid/algorithms/CoarsenedMatrix.h>
NAMESPACE_CHECK(CoarsendMatrix); NAMESPACE_CHECK(CoarsendMatrix);
#include <Grid/algorithms/FFT.h> #include <Grid/algorithms/FFT.h>

View File

@ -56,6 +56,243 @@ inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner,
blockSum(CoarseInner,fine_inner_msk); blockSum(CoarseInner,fine_inner_msk);
} }
class Geometry {
public:
int npoint;
int base;
std::vector<int> directions ;
std::vector<int> displacements;
std::vector<int> points_dagger;
Geometry(int _d) {
base = (_d==5) ? 1:0;
// make coarse grid stencil for 4d , not 5d
if ( _d==5 ) _d=4;
npoint = 2*_d+1;
directions.resize(npoint);
displacements.resize(npoint);
points_dagger.resize(npoint);
for(int d=0;d<_d;d++){
directions[d ] = d+base;
directions[d+_d] = d+base;
displacements[d ] = +1;
displacements[d+_d]= -1;
points_dagger[d ] = d+_d;
points_dagger[d+_d] = d;
}
directions [2*_d]=0;
displacements[2*_d]=0;
points_dagger[2*_d]=2*_d;
}
int point(int dir, int disp) {
assert(disp == -1 || disp == 0 || disp == 1);
assert(base+0 <= dir && dir < base+4);
// directions faster index = new indexing
// 4d (base = 0):
// point 0 1 2 3 4 5 6 7 8
// dir 0 1 2 3 0 1 2 3 0
// disp +1 +1 +1 +1 -1 -1 -1 -1 0
// 5d (base = 1):
// point 0 1 2 3 4 5 6 7 8
// dir 1 2 3 4 1 2 3 4 0
// disp +1 +1 +1 +1 -1 -1 -1 -1 0
// displacements faster index = old indexing
// 4d (base = 0):
// point 0 1 2 3 4 5 6 7 8
// dir 0 0 1 1 2 2 3 3 0
// disp +1 -1 +1 -1 +1 -1 +1 -1 0
// 5d (base = 1):
// point 0 1 2 3 4 5 6 7 8
// dir 1 1 2 2 3 3 4 4 0
// disp +1 -1 +1 -1 +1 -1 +1 -1 0
if(dir == 0 and disp == 0)
return 8;
else // New indexing
return (1 - disp) / 2 * 4 + dir - base;
// else // Old indexing
// return (4 * (dir - base) + 1 - disp) / 2;
}
};
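// Hedged aside (illustrative, not part of the file): sanity check of the
// "new indexing" formula above for the 4d case (base==0), using a
// hypothetical standalone helper.
#include <cassert>
static int point4d(int dir,int disp){
  if(dir==0 && disp==0) return 8; // self-coupling point
  return (1-disp)/2*4 + dir;      // disp=+1 -> points 0..3, disp=-1 -> 4..7
}
static void check_point4d(void){
  assert(point4d(2,+1)==2); // forward hop in direction 2
  assert(point4d(2,-1)==6); // backward hop in direction 2
  assert(point4d(0, 0)==8); // self point
}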
template<class Fobj,class CComplex,int nbasis>
class Aggregation {
public:
typedef iVector<CComplex,nbasis > siteVector;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
GridBase *CoarseGrid;
GridBase *FineGrid;
std::vector<Lattice<Fobj> > subspace;
int checkerboard;
int Checkerboard(void){return checkerboard;}
Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) :
CoarseGrid(_CoarseGrid),
FineGrid(_FineGrid),
subspace(nbasis,_FineGrid),
checkerboard(_checkerboard)
{
};
void Orthogonalise(void){
CoarseScalar InnerProd(CoarseGrid);
std::cout << GridLogMessage <<" Block Gram-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
}
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
blockProject(CoarseVec,FineVec,subspace);
}
void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
FineVec.Checkerboard() = subspace[0].Checkerboard();
blockPromote(CoarseVec,FineVec,subspace);
}
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
RealD scale;
ConjugateGradient<FineField> CG(1.0e-2,100,false);
FineField noise(FineGrid);
FineField Mn(FineGrid);
for(int b=0;b<nn;b++){
subspace[b] = Zero();
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
for(int i=0;i<1;i++){
CG(hermop,noise,subspace[b]);
noise = subspace[b];
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
}
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
subspace[b] = noise;
}
}
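// Hedged aside: the loose CG above (tolerance 1.0e-2, single pass) is one
// step of inexact inverse iteration, v_b ~ (MdagM)^{-1} eta_b followed by
// normalisation, so each Gaussian noise vector is enriched in the low modes
// of MdagM before being stored in subspace[b].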
////////////////////////////////////////////////////////////////////////////////////////////////
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
// and this is the best I found
////////////////////////////////////////////////////////////////////////////////////////////////
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo,
int orderfilter,
int ordermin,
int orderstep,
double filterlo
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
{
// Filter
Chebyshev<FineField> Cheb(lo,hi,orderfilter);
Cheb(hermop,noise,Mn);
// normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
b++;
}
// Generate a full sequence of Chebyshevs
{
lo=filterlo;
noise=Mn;
FineField T0(FineGrid); T0 = noise;
FineField T1(FineGrid);
FineField T2(FineGrid);
FineField y(FineGrid);
FineField *Tnm = &T0;
FineField *Tn = &T1;
FineField *Tnp = &T2;
// Tn=T1 = (xscale M + mscale)in
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
hermop.HermOp(T0,y);
T1=y*xscale+noise*mscale;
for(int n=2;n<=ordermin+orderstep*(nn-2);n++){
hermop.HermOp(*Tn,y);
autoView( y_v , y, AcceleratorWrite);
autoView( Tn_v , (*Tn), AcceleratorWrite);
autoView( Tnp_v , (*Tnp), AcceleratorWrite);
autoView( Tnm_v , (*Tnm), AcceleratorWrite);
const int Nsimd = CComplex::Nsimd();
accelerator_for(ss, FineGrid->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
});
// Possibly more fine-grained control is needed than a linear sweep,
// but there is a huge productivity gain if this is a simple algorithm and not a tunable
int m =1;
if ( n>=ordermin ) m=n-ordermin;
if ( (m%orderstep)==0 ) {
Mn=*Tnp;
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
b++;
}
// Cycle pointers to avoid copies
FineField *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
}
assert(b==nn);
}
};
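// Hedged aside: the accelerator_for loop in CreateSubspaceChebyshev above is
// the standard Chebyshev three-term recurrence with the spectrum of MdagM
// mapped onto [-1,1]:
//   y    = xscale*(MdagM*Tn) + mscale*Tn,  xscale =  2/(hi-lo),
//                                          mscale = -(hi+lo)/(hi-lo)
//   Tn+1 = 2*y - Tn-1
// Every orderstep-th polynomial beyond ordermin is normalised and kept as a
// subspace vector, giving nn vectors in total (hence the assert(b==nn)).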
// Fine Object == (per site) type of fine field // Fine Object == (per site) type of fine field
// nbasis == number of deflation vectors // nbasis == number of deflation vectors
template<class Fobj,class CComplex,int nbasis> template<class Fobj,class CComplex,int nbasis>
@ -87,9 +324,9 @@ public:
GridBase* _cbgrid; GridBase* _cbgrid;
int hermitian; int hermitian;
CartesianStencil<siteVector,siteVector,DefaultImplParams> Stencil; CartesianStencil<siteVector,siteVector,int> Stencil;
CartesianStencil<siteVector,siteVector,DefaultImplParams> StencilEven; CartesianStencil<siteVector,siteVector,int> StencilEven;
CartesianStencil<siteVector,siteVector,DefaultImplParams> StencilOdd; CartesianStencil<siteVector,siteVector,int> StencilOdd;
std::vector<CoarseMatrix> A; std::vector<CoarseMatrix> A;
std::vector<CoarseMatrix> Aeven; std::vector<CoarseMatrix> Aeven;
@ -394,7 +631,7 @@ public:
assert(Aself != nullptr); assert(Aself != nullptr);
} }
void DselfInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, CoarseMatrix &a, void DselfInternal(CartesianStencil<siteVector,siteVector,int> &st, CoarseMatrix &a,
const CoarseVector &in, CoarseVector &out, int dag) { const CoarseVector &in, CoarseVector &out, int dag) {
int point = geom.npoint-1; int point = geom.npoint-1;
autoView( out_v, out, AcceleratorWrite); autoView( out_v, out, AcceleratorWrite);
@ -457,7 +694,7 @@ public:
} }
} }
void DhopInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, std::vector<CoarseMatrix> &a, void DhopInternal(CartesianStencil<siteVector,siteVector,int> &st, std::vector<CoarseMatrix> &a,
const CoarseVector &in, CoarseVector &out, int dag) { const CoarseVector &in, CoarseVector &out, int dag) {
SimpleCompressor<siteVector> compressor; SimpleCompressor<siteVector> compressor;
@ -547,9 +784,9 @@ public:
_cbgrid(new GridRedBlackCartesian(&CoarseGrid)), _cbgrid(new GridRedBlackCartesian(&CoarseGrid)),
geom(CoarseGrid._ndimension), geom(CoarseGrid._ndimension),
hermitian(hermitian_), hermitian(hermitian_),
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements), Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements), StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements,0),
StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements), StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
A(geom.npoint,&CoarseGrid), A(geom.npoint,&CoarseGrid),
Aeven(geom.npoint,_cbgrid), Aeven(geom.npoint,_cbgrid),
Aodd(geom.npoint,_cbgrid), Aodd(geom.npoint,_cbgrid),
@ -567,9 +804,9 @@ public:
_cbgrid(&CoarseRBGrid), _cbgrid(&CoarseRBGrid),
geom(CoarseGrid._ndimension), geom(CoarseGrid._ndimension),
hermitian(hermitian_), hermitian(hermitian_),
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements), Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements), StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements), StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
A(geom.npoint,&CoarseGrid), A(geom.npoint,&CoarseGrid),
Aeven(geom.npoint,&CoarseRBGrid), Aeven(geom.npoint,&CoarseRBGrid),
Aodd(geom.npoint,&CoarseRBGrid), Aodd(geom.npoint,&CoarseRBGrid),

View File

@ -526,7 +526,6 @@ public:
(*this)(Linop,in[k],out[k]); (*this)(Linop,in[k],out[k]);
} }
}; };
virtual ~OperatorFunction(){};
}; };
template<class Field> class LinearFunction { template<class Field> class LinearFunction {
@ -542,7 +541,6 @@ public:
(*this)(in[i], out[i]); (*this)(in[i], out[i]);
} }
} }
virtual ~LinearFunction(){};
}; };
template<class Field> class IdentityLinearFunction : public LinearFunction<Field> { template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {

View File

@ -90,8 +90,9 @@ public:
order=_order; order=_order;
if(order < 2) exit(-1); if(order < 2) exit(-1);
Coeffs.resize(order,0.0); Coeffs.resize(order);
Coeffs[order-1] = 1.0; Coeffs.assign(0.,order);
Coeffs[order-1] = 1.;
}; };
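// Hedged aside on the hunk above: std::vector::assign takes (count,value),
// so the old Coeffs.assign(0.,order) requests zero elements (the 0. converts
// to the count) and the subsequent Coeffs[order-1]=1. writes out of bounds.
// The replacement initialises in one step:
//   Coeffs.resize(order, 0.0); // 'order' zero-initialised coefficients
//   Coeffs[order-1] = 1.0;     // select the highest-order Chebyshev term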
// PB - more efficient low pass drops high modes above the low as 1/x uses all Chebyshev's. // PB - more efficient low pass drops high modes above the low as 1/x uses all Chebyshev's.
@ -257,12 +258,26 @@ public:
for(int n=2;n<order;n++){ for(int n=2;n<order;n++){
Linop.HermOp(*Tn,y); Linop.HermOp(*Tn,y);
#if 0
auto y_v = y.View();
auto Tn_v = Tn->View();
auto Tnp_v = Tnp->View();
auto Tnm_v = Tnm->View();
constexpr int Nsimd = vector_type::Nsimd();
accelerator_for(ss, in.Grid()->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
});
if ( Coeffs[n] != 0.0) {
axpy(out,Coeffs[n],*Tnp,out);
}
#else
axpby(y,xscale,mscale,y,(*Tn)); axpby(y,xscale,mscale,y,(*Tn));
axpby(*Tnp,2.0,-1.0,y,(*Tnm)); axpby(*Tnp,2.0,-1.0,y,(*Tnm));
if ( Coeffs[n] != 0.0) { if ( Coeffs[n] != 0.0) {
axpy(out,Coeffs[n],*Tnp,out); axpy(out,Coeffs[n],*Tnp,out);
} }
#endif
// Cycle pointers to avoid copies // Cycle pointers to avoid copies
Field *swizzle = Tnm; Field *swizzle = Tnm;
Tnm =Tn; Tnm =Tn;

View File

@ -33,254 +33,218 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
* Script A = SolverMatrix * Script A = SolverMatrix
* Script P = Preconditioner * Script P = Preconditioner
* *
* Deflation methods considered
* -- Solve P A x = P b [ like Luscher ]
* DEF-1 M P A x = M P b [i.e. left precon]
* DEF-2 P^T M A x = P^T M b
* ADEF-1 Preconditioner = M P + Q [ Q + M + M A Q]
* ADEF-2 Preconditioner = P^T M + Q
* BNN Preconditioner = P^T M P + Q
* BNN2 Preconditioner = M P + P^TM +Q - M P A M
*
* Implement ADEF-2 * Implement ADEF-2
* *
* Vstart = P^Tx + Qb * Vstart = P^Tx + Qb
* M1 = P^TM + Q * M1 = P^TM + Q
* M2=M3=1 * M2=M3=1
* Vout = x
*/ */
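// Hedged restatement of the ADEF-2 pieces in operator form, writing Q for
// the coarse-space correction and U for the subspace promote/project pair:
//   Q   = U A_ss^{-1} U^dag  (ProjectToSubspace -> coarse solve -> Promote)
//   P^T = 1 - Q A
//   M1  = P^T M + Q = M + Q (1 - A M)
// which is exactly the PcgM1 body below, out = Min + Q [ in - A Min ], and
// Vstart sets x0 = Q b so the initial coarse-space residual vanishes.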
NAMESPACE_BEGIN(Grid);
template<class Field> // abstract base
class TwoLevelCG : public LinearFunction<Field> template<class Field, class CoarseField>
class TwoLevelFlexiblePcg : public LinearFunction<Field>
{ {
public: public:
int verbose;
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
const int mmax = 5;
GridBase *grid; GridBase *grid;
GridBase *coarsegrid;
// Fine operator, Smoother, CoarseSolver LinearOperatorBase<Field> *_Linop
LinearOperatorBase<Field> &_FineLinop; OperatorFunction<Field> *_Smoother,
LinearFunction<Field> &_Smoother; LinearFunction<CoarseField> *_CoarseSolver;
// Need something that knows how to get from Coarse to fine and back again
// more most operator functions // more most operator functions
TwoLevelCG(RealD tol, TwoLevelFlexiblePcg(RealD tol,
Integer maxit, Integer maxit,
LinearOperatorBase<Field> &FineLinop, LinearOperatorBase<Field> *Linop,
LinearFunction<Field> &Smoother, LinearOperatorBase<Field> *SmootherLinop,
GridBase *fine) : OperatorFunction<Field> *Smoother,
OperatorFunction<CoarseField> CoarseLinop
) :
Tolerance(tol), Tolerance(tol),
MaxIterations(maxit), MaxIterations(maxit),
_FineLinop(FineLinop), _Linop(Linop),
_Smoother(Smoother) _PreconditionerLinop(PrecLinop),
{ _Preconditioner(Preconditioner)
grid = fine; {
verbose=0;
}; };
virtual void operator() (const Field &src, Field &x) // The Pcg routine is common to all, but the various matrices differ from derived
{ // implementation to derived implementation
Field resid(grid); void operator() (const Field &src, Field &psi){
void operator() (const Field &src, Field &psi){
psi.Checkerboard() = src.Checkerboard();
grid = src.Grid();
RealD f; RealD f;
RealD rtzp,rtz,a,d,b; RealD rtzp,rtz,a,d,b;
RealD rptzp; RealD rptzp;
RealD tn;
RealD guess = norm2(psi);
RealD ssq = norm2(src);
RealD rsq = ssq*Tolerance*Tolerance;
Field p(grid); /////////////////////////////
Field z(grid); // Set up history vectors
/////////////////////////////
std::vector<Field> p (mmax,grid);
std::vector<Field> mmp(mmax,grid);
std::vector<RealD> pAp(mmax);
Field x (grid); x = psi;
Field z (grid);
Field tmp(grid); Field tmp(grid);
Field mmp(grid);
Field r (grid); Field r (grid);
Field mu (grid); Field mu (grid);
Field rp (grid);
//Initial residual computation & set up
double tn;
GridStopWatch HDCGTimer;
HDCGTimer.Start();
////////////////////////// //////////////////////////
// x0 = Vstart -- possibly modify guess // x0 = Vstart -- possibly modify guess
////////////////////////// //////////////////////////
x=Zero(); x=src;
Vstart(x,src); Vstart(x,src);
// r0 = b -A x0 // r0 = b -A x0
_FineLinop.HermOp(x,mmp); HermOp(x,mmp); // Shouldn't this be something else?
axpy (r, -1.0,mmp[0], src); // Recomputes r=src-Ax0
axpy(r, -1.0, mmp, src); // Recomputes r=src-x0
rp=r;
////////////////////////////////// //////////////////////////////////
// Compute z = M1 x // Compute z = M1 x
////////////////////////////////// //////////////////////////////////
PcgM1(r,z); M1(r,z,tmp,mp,SmootherMirs);
rtzp =real(innerProduct(r,z)); rtzp =real(innerProduct(r,z));
/////////////////////////////////////// ///////////////////////////////////////
// Except Def2, M2 is trivial // Solve for Mss mu = P A z and set p = z-mu
// Def2: p = 1 - Q Az = Pright z
// Other algos M2 is trivial
/////////////////////////////////////// ///////////////////////////////////////
p=z; M2(z,p[0]);
RealD ssq = norm2(src); for (int k=0;k<=MaxIterations;k++){
RealD rsq = ssq*Tolerance*Tolerance;
std::cout<<GridLogMessage<<"HDCG: k=0 residual "<<rtzp<<" target rsq "<<rsq<<" ssq "<<ssq<<std::endl;
for (int k=1;k<=MaxIterations;k++){ int peri_k = k % mmax;
int peri_kp = (k+1) % mmax;
rtz=rtzp; rtz=rtzp;
d= PcgM3(p,mmp); d= M3(p[peri_k],mp,mmp[peri_k],tmp);
a = rtz/d; a = rtz/d;
// Memorise this
pAp[peri_k] = d;
axpy(x,a,p,x); axpy(x,a,p[peri_k],x);
RealD rn = axpy_norm(r,-a,mmp,r); RealD rn = axpy_norm(r,-a,mmp[peri_k],r);
PcgM1(r,z); // Compute z = M x
M1(r,z,tmp,mp);
rtzp =real(innerProduct(r,z)); rtzp =real(innerProduct(r,z));
int ipcg=1; // almost free inexact preconditioned CG M2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
if (ipcg) {
rptzp =real(innerProduct(rp,z)); p[peri_kp]=p[peri_k];
} else {
rptzp =0; // Standard search direction p -> z + b p ; b =
b = (rtzp)/rtz;
int northog;
// northog = (peri_kp==0)?1:peri_kp; // This is the fCG(mmax) algorithm
northog = (k>mmax-1)?(mmax-1):k; // This is the fCG-Tr(mmax-1) algorithm
for(int back=0; back < northog; back++){
int peri_back = (k-back)%mmax;
RealD pbApk= real(innerProduct(mmp[peri_back],p[peri_kp]));
RealD beta = -pbApk/pAp[peri_back];
axpy(p[peri_kp],beta,p[peri_back],p[peri_kp]);
} }
b = (rtzp-rptzp)/rtz;
PcgM2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
axpy(p,b,p,mu); // mu = A r
RealD rrn=sqrt(rn/ssq); RealD rrn=sqrt(rn/ssq);
RealD rtn=sqrt(rtz/ssq); std::cout<<GridLogMessage<<"TwoLevelfPcg: k= "<<k<<" residual = "<<rrn<<std::endl;
std::cout<<GridLogMessage<<"HDCG: Pcg k= "<<k<<" residual = "<<rrn<<std::endl;
if ( ipcg ) {
axpy(rp,0.0,r,r);
}
// Stopping condition // Stopping condition
if ( rn <= rsq ) { if ( rn <= rsq ) {
HDCGTimer.Stop(); HermOp(x,mmp); // Shouldn't this be something else?
std::cout<<GridLogMessage<<"HDCG: Pcg converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;; axpy(tmp,-1.0,src,mmp[0]);
_FineLinop.HermOp(x,mmp); RealD psinorm = sqrt(norm2(x));
axpy(tmp,-1.0,src,mmp); RealD srcnorm = sqrt(norm2(src));
RealD tmpnorm = sqrt(norm2(tmp));
RealD mmpnorm = sqrt(norm2(mmp)); RealD true_residual = tmpnorm/srcnorm;
RealD xnorm = sqrt(norm2(x)); std::cout<<GridLogMessage<<"TwoLevelfPcg: true residual is "<<true_residual<<std::endl;
RealD srcnorm = sqrt(norm2(src)); std::cout<<GridLogMessage<<"TwoLevelfPcg: target residual was"<<Tolerance<<std::endl;
RealD tmpnorm = sqrt(norm2(tmp)); return k;
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage
<<"HDCG: true residual is "<<true_residual
<<" solution "<<xnorm
<<" source "<<srcnorm
<<" mmp "<<mmpnorm
<<std::endl;
return;
} }
} }
std::cout<<GridLogMessage<<"HDCG: not converged"<<std::endl; // Non-convergence
RealD xnorm = sqrt(norm2(x)); assert(0);
RealD srcnorm = sqrt(norm2(src));
std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
return ;
} }
public: public:
virtual void PcgM1(Field & in, Field & out) =0; virtual void M(Field & in,Field & out,Field & tmp) {
virtual void Vstart(Field & x,const Field & src)=0;
virtual void PcgM2(const Field & in, Field & out) {
out=in;
} }
virtual RealD PcgM3(const Field & p, Field & mmp){ virtual void M1(Field & in, Field & out) {// the smoother
RealD dd;
_FineLinop.HermOp(p,mmp);
ComplexD dot = innerProduct(p,mmp);
dd=real(dot);
return dd;
}
/////////////////////////////////////////////////////////////////////
// Only Def1 has non-trivial Vout.
/////////////////////////////////////////////////////////////////////
};
template<class Field, class CoarseField, class Aggregation>
class TwoLevelADEF2 : public TwoLevelCG<Field>
{
public:
///////////////////////////////////////////////////////////////////////////////////
// Need something that knows how to get from Coarse to fine and back again
// void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
// void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
///////////////////////////////////////////////////////////////////////////////////
GridBase *coarsegrid;
Aggregation &_Aggregates;
LinearFunction<CoarseField> &_CoarseSolver;
LinearFunction<CoarseField> &_CoarseSolverPrecise;
///////////////////////////////////////////////////////////////////////////////////
// more most operator functions
TwoLevelADEF2(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
LinearFunction<CoarseField> &CoarseSolver,
LinearFunction<CoarseField> &CoarseSolverPrecise,
Aggregation &Aggregates
) :
TwoLevelCG<Field>(tol,maxit,FineLinop,Smoother,Aggregates.FineGrid),
_CoarseSolver(CoarseSolver),
_CoarseSolverPrecise(CoarseSolverPrecise),
_Aggregates(Aggregates)
{
coarsegrid = Aggregates.CoarseGrid;
};
virtual void PcgM1(Field & in, Field & out)
{
// [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min] // [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
Field tmp(grid);
Field Min(grid);
Field tmp(this->grid); PcgM(in,Min); // Smoother call
Field Min(this->grid);
CoarseField PleftProj(this->coarsegrid);
CoarseField PleftMss_proj(this->coarsegrid);
GridStopWatch SmootherTimer; HermOp(Min,out);
GridStopWatch MatrixTimer;
SmootherTimer.Start();
this->_Smoother(in,Min);
SmootherTimer.Stop();
MatrixTimer.Start();
this->_FineLinop.HermOp(Min,out);
MatrixTimer.Stop();
axpy(tmp,-1.0,out,in); // tmp = in - A Min axpy(tmp,-1.0,out,in); // tmp = in - A Min
GridStopWatch ProjTimer; ProjectToSubspace(tmp,PleftProj);
GridStopWatch CoarseTimer; ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
GridStopWatch PromTimer; PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]
ProjTimer.Start();
this->_Aggregates.ProjectToSubspace(PleftProj,tmp);
ProjTimer.Stop();
CoarseTimer.Start();
this->_CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
CoarseTimer.Stop();
PromTimer.Start();
this->_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]
PromTimer.Stop();
std::cout << GridLogPerformance << "PcgM1 breakdown "<<std::endl;
std::cout << GridLogPerformance << "\tSmoother " << SmootherTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tProj " << ProjTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tCoarse " << CoarseTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tProm " << PromTimer.Elapsed() <<std::endl;
axpy(out,1.0,Min,tmp); // Min+tmp axpy(out,1.0,Min,tmp); // Min+tmp
} }
virtual void Vstart(Field & x,const Field & src) virtual void M2(const Field & in, Field & out) {
{ out=in;
// Must override for Def2 only
// case PcgDef2:
// Pright(in,out);
// break;
}
virtual RealD M3(const Field & p, Field & mmp){
double d,dd;
HermOpAndNorm(p,mmp,d,dd);
return dd;
// Must override for Def1 only
// case PcgDef1:
// d=linop_d->Mprec(p,mmp,tmp,0,1);// Dag no
// linop_d->Mprec(mmp,mp,tmp,1);// Dag yes
// Pleft(mp,mmp);
// d=real(linop_d->inner(p,mmp));
}
virtual void VstartDef2(Field & x,const Field & src){
//case PcgDef2:
//case PcgAdef2:
//case PcgAdef2f:
//case PcgV11f:
/////////////////////////////////// ///////////////////////////////////
// Choose x_0 such that // Choose x_0 such that
// x_0 = guess + (A_ss^inv) r_s = guess + Ass_inv [src -Aguess] // x_0 = guess + (A_ss^inv) r_s = guess + Ass_inv [src -Aguess]
@ -292,73 +256,142 @@ class TwoLevelADEF2 : public TwoLevelCG<Field>
// = src_s - (A guess)_s - src_s + (A guess)_s // = src_s - (A guess)_s - src_s + (A guess)_s
// = 0 // = 0
/////////////////////////////////// ///////////////////////////////////
Field r(this->grid); Field r(grid);
Field mmp(this->grid); Field mmp(grid);
CoarseField PleftProj(this->coarsegrid);
CoarseField PleftMss_proj(this->coarsegrid); HermOp(x,mmp);
axpy (r, -1.0, mmp, src); // r_{-1} = src - A x
this->_Aggregates.ProjectToSubspace(PleftProj,src); ProjectToSubspace(r,PleftProj);
this->_CoarseSolverPrecise(PleftProj,PleftMss_proj); // Ass^{-1} r_s ApplyInverseCG(PleftProj,PleftMss_proj); // Ass^{-1} r_s
this->_Aggregates.PromoteFromSubspace(PleftMss_proj,x); PromoteFromSubspace(PleftMss_proj,mmp);
x=x+mmp;
} }
}; virtual void Vstart(Field & x,const Field & src){
return;
}
/////////////////////////////////////////////////////////////////////
// Only Def1 has non-trivial Vout. Override in Def1
/////////////////////////////////////////////////////////////////////
virtual void Vout (Field & in, Field & out,Field & src){
out = in;
//case PcgDef1:
// //Qb + PT x
// ProjectToSubspace(src,PleftProj);
// ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} r_s
// PromoteFromSubspace(PleftMss_proj,tmp);
//
// Pright(in,out);
//
// linop_d->axpy(out,tmp,out,1.0);
// break;
}
////////////////////////////////////////////////////////////////////////////////////////////////
// Pright and Pleft are common to all implementations
////////////////////////////////////////////////////////////////////////////////////////////////
virtual void Pright(Field & in,Field & out){
// P_R = [ 1 0 ]
// [ -Mss^-1 Msb 0 ]
Field in_sbar(grid);
ProjectToSubspace(in,PleftProj);
PromoteFromSubspace(PleftProj,out);
axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s
HermOp(in_sbar,out);
ProjectToSubspace(out,PleftProj); // Mssbar in_sbar (project)
ApplyInverse (PleftProj,PleftMss_proj); // Mss^{-1} Mssbar
PromoteFromSubspace(PleftMss_proj,out); //
axpy(out,-1.0,out,in_sbar); // in_sbar - Mss^{-1} Mssbar in_sbar
}
virtual void Pleft (Field & in,Field & out){
// P_L = [ 1 -Mbs Mss^-1]
// [ 0 0 ]
Field in_sbar(grid);
Field tmp2(grid);
Field Mtmp(grid);
ProjectToSubspace(in,PleftProj);
PromoteFromSubspace(PleftProj,out);
axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s
ApplyInverse(PleftProj,PleftMss_proj); // Mss^{-1} in_s
PromoteFromSubspace(PleftMss_proj,out);
HermOp(out,Mtmp);
ProjectToSubspace(Mtmp,PleftProj); // Msbar s Mss^{-1}
PromoteFromSubspace(PleftProj,tmp2);
axpy(out,-1.0,tmp2,Mtmp);
axpy(out,-1.0,out,in_sbar); // in_sbar - Msbars Mss^{-1} in_s
}
}
template<class Field> template<class Field>
class TwoLevelADEF1defl : public TwoLevelCG<Field> class TwoLevelFlexiblePcgADef2 : public TwoLevelFlexiblePcg<Field> {
{ public:
public: virtual void M(Field & in,Field & out,Field & tmp){
const std::vector<Field> &evec;
const std::vector<RealD> &eval;
TwoLevelADEF1defl(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
std::vector<Field> &_evec,
std::vector<RealD> &_eval) :
TwoLevelCG<Field>(tol,maxit,FineLinop,Smoother,_evec[0].Grid()),
evec(_evec),
eval(_eval)
{};
// Can just inherit existing M2 }
// Can just inherit existing M3 virtual void M1(Field & in, Field & out,Field & tmp,Field & mp){
// Simple vstart - do nothing
virtual void Vstart(Field & x,const Field & src){
x=src; // Could apply Q
};
// Override PcgM1
virtual void PcgM1(Field & in, Field & out)
{
int N=evec.size();
Field Pin(this->grid);
Field Qin(this->grid);
//MP + Q = M(1-AQ) + Q = M
// // If we are eigenvector deflating in coarse space
// // Q = Sum_i |phi_i> 1/lambda_i <phi_i|
// // A Q = Sum_i |phi_i> <phi_i|
// // M(1-AQ) = M(1-proj) + Q
Qin.Checkerboard()=in.Checkerboard();
Qin = Zero();
Pin = in;
for (int i=0;i<N;i++) {
const Field& tmp = evec[i];
auto ip = TensorRemove(innerProduct(tmp,in));
axpy(Qin, ip / eval[i],tmp,Qin);
axpy(Pin, -ip ,tmp,Pin);
}
this->_Smoother(Pin,out);
out = out + Qin;
} }
}; virtual void M2(Field & in, Field & out){
NAMESPACE_END(Grid); }
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp){
}
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp){
}
}
/*
template<class Field>
class TwoLevelFlexiblePcgAD : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgDef1 : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
virtual void Vout (Field & in, Field & out,Field & src,Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgDef2 : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgV11: public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
*/
#endif #endif
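
Background for the history vectors p[mmax]/mmp[mmax] kept in the right-hand
(flexible Pcg) version: with a window of m stored directions, the truncated
flexible-CG orthogonalisation is, schematically,

  p_{k+1} = z_{k+1} + sum_{j=k-m+1..k} beta_j p_j ,
  beta_j  = - <A p_j, z_{k+1}> / <p_j, A p_j>

so each new direction is A-orthogonalised against the last m directions
(pAp[] caches <p_j, A p_j>), which is what makes a variable preconditioner
M1 admissible in place of a fixed one. This is a hedged schematic of the
standard fCG-Tr(m) update, not a line-by-line transcription of the code.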

View File

@ -58,7 +58,6 @@ public:
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) { void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
GRID_TRACE("ConjugateGradient");
psi.Checkerboard() = src.Checkerboard(); psi.Checkerboard() = src.Checkerboard();
conformable(psi, src); conformable(psi, src);
@ -118,13 +117,9 @@ public:
GridStopWatch MatrixTimer; GridStopWatch MatrixTimer;
GridStopWatch SolverTimer; GridStopWatch SolverTimer;
RealD usecs = -usecond();
SolverTimer.Start(); SolverTimer.Start();
int k; int k;
for (k = 1; k <= MaxIterations; k++) { for (k = 1; k <= MaxIterations; k++) {
GridStopWatch IterationTimer;
IterationTimer.Start();
c = cp; c = cp;
MatrixTimer.Start(); MatrixTimer.Start();
@ -157,41 +152,31 @@ public:
LinearCombTimer.Stop(); LinearCombTimer.Stop();
LinalgTimer.Stop(); LinalgTimer.Stop();
IterationTimer.Stop(); std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
if ( (k % 500) == 0 ) {
std::cout << GridLogMessage << "ConjugateGradient: Iteration " << k
<< " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl; << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
} else {
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
<< " residual " << sqrt(cp/ssq) << " target " << Tolerance << " took " << IterationTimer.Elapsed() << std::endl;
}
// Stopping condition // Stopping condition
if (cp <= rsq) { if (cp <= rsq) {
usecs +=usecond();
SolverTimer.Stop(); SolverTimer.Stop();
Linop.HermOpAndNorm(psi, mmp, d, qq); Linop.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src; p = mmp - src;
GridBase *grid = src.Grid();
RealD DwfFlops = (1452. )*grid->gSites()*4*k
+ (8+4+8+4+4)*12*grid->gSites()*k; // CG linear algebra
RealD srcnorm = std::sqrt(norm2(src)); RealD srcnorm = std::sqrt(norm2(src));
RealD resnorm = std::sqrt(norm2(p)); RealD resnorm = std::sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm; RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k
<< "\tComputed residual " << std::sqrt(cp / ssq) << "\tComputed residual " << std::sqrt(cp / ssq)
<< "\tTrue residual " << true_residual << "\tTrue residual " << true_residual
<< "\tTarget " << Tolerance << std::endl; << "\tTarget " << Tolerance << std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl; std::cout << GridLogIterative << "Time breakdown "<<std::endl;
std::cout << GridLogPerformance << "Time breakdown "<<std::endl; std::cout << GridLogIterative << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl; std::cout << GridLogIterative << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl; std::cout << GridLogIterative << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tInner " << InnerTimer.Elapsed() <<std::endl; std::cout << GridLogIterative << "\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl; std::cout << GridLogIterative << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl; std::cout << GridLogIterative << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);

View File

@ -49,7 +49,6 @@ NAMESPACE_BEGIN(Grid);
Integer TotalInnerIterations; //Number of inner CG iterations Integer TotalInnerIterations; //Number of inner CG iterations
Integer TotalOuterIterations; //Number of restarts Integer TotalOuterIterations; //Number of restarts
Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
RealD TrueResidual;
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser; LinearFunction<FieldF> *guesser;
@ -69,7 +68,6 @@ NAMESPACE_BEGIN(Grid);
} }
void operator() (const FieldD &src_d_in, FieldD &sol_d){ void operator() (const FieldD &src_d_in, FieldD &sol_d){
std::cout << GridLogMessage << "MixedPrecisionConjugateGradient: Starting mixed precision CG with outer tolerance " << Tolerance << " and inner tolerance " << InnerTolerance << std::endl;
TotalInnerIterations = 0; TotalInnerIterations = 0;
GridStopWatch TotalTimer; GridStopWatch TotalTimer;
@ -99,7 +97,6 @@ NAMESPACE_BEGIN(Grid);
FieldF sol_f(SinglePrecGrid); FieldF sol_f(SinglePrecGrid);
sol_f.Checkerboard() = cb; sol_f.Checkerboard() = cb;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting initial inner CG with tolerance " << inner_tol << std::endl;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations); ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false; CG_f.ErrorOnNoConverge = false;
@ -108,10 +105,7 @@ NAMESPACE_BEGIN(Grid);
GridStopWatch PrecChangeTimer; GridStopWatch PrecChangeTimer;
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
precisionChangeWorkspace pc_wk_sp_to_dp(DoublePrecGrid, SinglePrecGrid);
precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, DoublePrecGrid);
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){ for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
//Compute double precision rsd and also new RHS vector. //Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d, tmp_d); Linop_d.HermOp(sol_d, tmp_d);
@ -126,7 +120,7 @@ NAMESPACE_BEGIN(Grid);
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ?? while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
PrecChangeTimer.Start(); PrecChangeTimer.Start();
precisionChange(src_f, src_d, pc_wk_dp_to_sp); precisionChange(src_f, src_d);
PrecChangeTimer.Stop(); PrecChangeTimer.Stop();
sol_f = Zero(); sol_f = Zero();
@ -136,7 +130,6 @@ NAMESPACE_BEGIN(Grid);
(*guesser)(src_f, sol_f); (*guesser)(src_f, sol_f);
//Inner CG //Inner CG
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " << outer_iter << " starting inner CG with tolerance " << inner_tol << std::endl;
CG_f.Tolerance = inner_tol; CG_f.Tolerance = inner_tol;
InnerCGtimer.Start(); InnerCGtimer.Start();
CG_f(Linop_f, src_f, sol_f); CG_f(Linop_f, src_f, sol_f);
@ -145,7 +138,7 @@ NAMESPACE_BEGIN(Grid);
//Convert sol back to double and add to double prec solution //Convert sol back to double and add to double prec solution
PrecChangeTimer.Start(); PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f, pc_wk_sp_to_dp); precisionChange(tmp_d, sol_f);
PrecChangeTimer.Stop(); PrecChangeTimer.Stop();
axpy(sol_d, 1.0, tmp_d, sol_d); axpy(sol_d, 1.0, tmp_d, sol_d);
@ -157,7 +150,6 @@ NAMESPACE_BEGIN(Grid);
ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations); ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
CG_d(Linop_d, src_d_in, sol_d); CG_d(Linop_d, src_d_in, sol_d);
TotalFinalStepIterations = CG_d.IterationsToComplete; TotalFinalStepIterations = CG_d.IterationsToComplete;
TrueResidual = CG_d.TrueResidual;
TotalTimer.Stop(); TotalTimer.Stop();
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl; std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;

View File

@ -1,213 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
Copyright (C) 2015
Author: Raoul Hodgson <raoul.hodgson@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
NAMESPACE_BEGIN(Grid);
//Mixed precision restarted defect correction CG
template<class FieldD,class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionConjugateGradientBatched : public LinearFunction<FieldD> {
public:
using LinearFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;
Integer MaxOuterIterations;
Integer MaxPatchupIterations;
GridBase* SinglePrecGrid; //Grid for single-precision fields
RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser;
bool updateResidual;
MixedPrecisionConjugateGradientBatched(RealD tol,
Integer maxinnerit,
Integer maxouterit,
Integer maxpatchit,
GridBase* _sp_grid,
LinearOperatorBase<FieldF> &_Linop_f,
LinearOperatorBase<FieldD> &_Linop_d,
bool _updateResidual=true) :
Linop_f(_Linop_f), Linop_d(_Linop_d),
Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid),
OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { };
void useGuesser(LinearFunction<FieldF> &g){
guesser = &g;
}
void operator() (const FieldD &src_d_in, FieldD &sol_d){
std::vector<FieldD> srcs_d_in{src_d_in};
std::vector<FieldD> sols_d{sol_d};
(*this)(srcs_d_in,sols_d);
sol_d = sols_d[0];
}
void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){
assert(src_d_in.size() == sol_d.size());
int NBatch = src_d_in.size();
std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl;
Integer TotalOuterIterations = 0; //Number of restarts
std::vector<Integer> TotalInnerIterations(NBatch,0); //Number of inner CG iterations
std::vector<Integer> TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step
GridStopWatch TotalTimer;
TotalTimer.Start();
GridStopWatch InnerCGtimer;
GridStopWatch PrecChangeTimer;
int cb = src_d_in[0].Checkerboard();
std::vector<RealD> src_norm;
std::vector<RealD> norm;
std::vector<RealD> stop;
GridBase* DoublePrecGrid = src_d_in[0].Grid();
FieldD tmp_d(DoublePrecGrid);
tmp_d.Checkerboard() = cb;
FieldD tmp2_d(DoublePrecGrid);
tmp2_d.Checkerboard() = cb;
std::vector<FieldD> src_d;
std::vector<FieldF> src_f;
std::vector<FieldF> sol_f;
for (int i=0; i<NBatch; i++) {
sol_d[i].Checkerboard() = cb;
src_norm.push_back(norm2(src_d_in[i]));
norm.push_back(0.);
stop.push_back(src_norm[i] * Tolerance*Tolerance);
src_d.push_back(src_d_in[i]); //source for next inner iteration, computed from residual during operation
src_f.push_back(SinglePrecGrid);
src_f[i].Checkerboard() = cb;
sol_f.push_back(SinglePrecGrid);
sol_f[i].Checkerboard() = cb;
}
RealD inner_tol = InnerTolerance;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false;
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
std::cout << GridLogMessage << std::endl;
std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl;
bool allConverged = true;
for (int i=0; i<NBatch; i++) {
//Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d[i], tmp_d);
norm[i] = axpy_norm(src_d[i], -1., tmp_d, src_d_in[i]); //src_d is residual vector
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Outer iteration " << outer_iter <<" solve " << i << " residual "<< norm[i] << " target "<< stop[i] <<std::endl;
PrecChangeTimer.Start();
precisionChange(src_f[i], src_d[i]);
PrecChangeTimer.Stop();
sol_f[i] = Zero();
if(norm[i] > OuterLoopNormMult * stop[i]) {
allConverged = false;
}
}
if (allConverged) break;
if (updateResidual) {
RealD normMax = *std::max_element(std::begin(norm), std::end(norm));
RealD stopMax = *std::max_element(std::begin(stop), std::end(stop));
while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
CG_f.Tolerance = inner_tol;
}
//Optionally improve inner solver guess (eg using known eigenvectors)
if(guesser != NULL) {
(*guesser)(src_f, sol_f);
}
for (int i=0; i<NBatch; i++) {
//Inner CG
InnerCGtimer.Start();
CG_f(Linop_f, src_f[i], sol_f[i]);
InnerCGtimer.Stop();
TotalInnerIterations[i] += CG_f.IterationsToComplete;
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f[i]);
PrecChangeTimer.Stop();
axpy(sol_d[i], 1.0, tmp_d, sol_d[i]);
}
}
//Final trial CG
std::cout << GridLogMessage << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Starting final patch-up double-precision solve"<<std::endl;
for (int i=0; i<NBatch; i++) {
ConjugateGradient<FieldD> CG_d(Tolerance, MaxPatchupIterations);
CG_d(Linop_d, src_d_in[i], sol_d[i]);
TotalFinalStepIterations[i] += CG_d.IterationsToComplete;
}
TotalTimer.Stop();
std::cout << GridLogMessage << std::endl;
for (int i=0; i<NBatch; i++) {
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: solve " << i << " Inner CG iterations " << TotalInnerIterations[i] << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations[i] << std::endl;
}
std::cout << GridLogMessage << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
}
};
NAMESPACE_END(Grid);
#endif
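
A hedged usage sketch of the batched solver above (the fermion field types,
grids and operator wrappers are illustrative placeholders):

  // HermOpD / HermOpF are assumed to wrap the same matrix in double / single
  MixedPrecisionConjugateGradientBatched<LatticeFermionD,LatticeFermionF>
    mCG(1.0e-8,  // outer tolerance
        30000,   // max inner (single-precision) iterations
        50,      // max outer restarts
        10000,   // max final patch-up iterations
        sp_grid, HermOpF, HermOpD);
  std::vector<LatticeFermionD> src(Nrhs, grid), sol(Nrhs, grid);
  for(auto &s : sol) s = Zero();  // zero initial guesses
  mCG(src, sol);                  // one restart schedule shared across the batch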

View File

@ -44,7 +44,7 @@ public:
using OperatorFunction<Field>::operator(); using OperatorFunction<Field>::operator();
// RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift std::vector<int> IterationsToCompleteShift; // Iterations for this shift
@ -52,7 +52,7 @@ public:
MultiShiftFunction shifts; MultiShiftFunction shifts;
std::vector<RealD> TrueResidualShift; std::vector<RealD> TrueResidualShift;
ConjugateGradientMultiShift(Integer maxit, const MultiShiftFunction &_shifts) : ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) :
MaxIterations(maxit), MaxIterations(maxit),
shifts(_shifts) shifts(_shifts)
{ {
@ -84,7 +84,6 @@ public:
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi) void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi)
{ {
GRID_TRACE("ConjugateGradientMultiShift");
GridBase *grid = src.Grid(); GridBase *grid = src.Grid();
@ -183,9 +182,6 @@ public:
for(int s=0;s<nshift;s++) { for(int s=0;s<nshift;s++) {
axpby(psi[s],0.,-bs[s]*alpha[s],src,src); axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
} }
std::cout << GridLogIterative << "ConjugateGradientMultiShift: initial rn (|src|^2) =" << rn << " qq (|MdagM src|^2) =" << qq << " d ( dot(src, [MdagM + m_0]src) ) =" << d << " c=" << c << std::endl;
/////////////////////////////////////// ///////////////////////////////////////
// Timers // Timers
@ -325,8 +321,8 @@ public:
std::cout << GridLogMessage << "Time Breakdown "<<std::endl; std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAXPY " << AXPYTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "\tMarix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tShift " << ShiftTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "\tShift " << ShiftTimer.Elapsed() <<std::endl;
IterationsToComplete = k; IterationsToComplete = k;

View File

@ -1,373 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christopher Kelly <ckelly@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
//CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision.
//The residual is stored in single precision, but the search directions and solution are stored in double precision.
//Every update_freq iterations the residual is corrected in double precision.
//For safety the a final regular CG is applied to clean up if necessary
//PB Pure single, then double fixup
template<class FieldD, class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientMultiShiftMixedPrecCleanup : public OperatorMultiFunction<FieldD>,
public OperatorFunction<FieldD>
{
public:
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
Integer MaxIterationsMshift;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
int verbose;
MultiShiftFunction shifts;
std::vector<RealD> TrueResidualShift;
int ReliableUpdateFreq; //number of iterations between reliable updates
GridBase* SinglePrecGrid; //Grid for single-precision fields
LinearOperatorBase<FieldF> &Linop_f; //single precision
ConjugateGradientMultiShiftMixedPrecCleanup(Integer maxit, const MultiShiftFunction &_shifts,
GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
int _ReliableUpdateFreq) :
MaxIterationsMshift(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
MaxIterations(20000)
{
verbose=1;
IterationsToCompleteShift.resize(_shifts.order);
TrueResidualShift.resize(_shifts.order);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
{
GridBase *grid = src.Grid();
int nshift = shifts.order;
std::vector<FieldD> results(nshift,grid);
(*this)(Linop,src,results,psi);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
{
int nshift = shifts.order;
(*this)(Linop,src,results);
psi = shifts.norm*src;
for(int i=0;i<nshift;i++){
psi = psi + shifts.residues[i]*results[i];
}
return;
}
void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
{
GRID_TRACE("ConjugateGradientMultiShiftMixedPrecCleanup");
GridBase *DoublePrecGrid = src_d.Grid();
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
int nshift = shifts.order;
std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
std::vector<RealD> &mresidual(shifts.tolerances);
std::vector<RealD> alpha(nshift,1.0);
//Double precision search directions
FieldD p_d(DoublePrecGrid);
std::vector<FieldF> ps_f (nshift, SinglePrecGrid);// Search directions (single precision)
std::vector<FieldF> psi_f(nshift, SinglePrecGrid);// solutions (single precision)
FieldD tmp_d(DoublePrecGrid);
FieldD r_d(DoublePrecGrid);
FieldF r_f(SinglePrecGrid);
FieldD mmp_d(DoublePrecGrid);
assert(psi_d.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift];
RealD rsq[nshift];
RealD rsqf[nshift];
RealD z[nshift][2];
int converged[nshift];
const int primary =0;
//Primary shift fields CG iteration
RealD a,b,c,d;
RealD cp,bp,qq; //prev
// Matrix mult fields
FieldF p_f(SinglePrecGrid);
FieldF mmp_f(SinglePrecGrid);
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
converged[s]=0;
}
// Wire guess to zero
// Residuals "r" are src
// First search direction "p" is also src
cp = norm2(src_d);
// Handle trivial case of zero src.
if( cp == 0. ){
for(int s=0;s<nshift;s++){
psi_d[s] = Zero();
psi_f[s] = Zero();
IterationsToCompleteShift[s] = 1;
TrueResidualShift[s] = 0.;
}
return;
}
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
rsqf[s] =rsq[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
// ps_d[s] = src_d;
precisionChange(ps_f[s],src_d);
}
// r and p for primary
p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
r_d = p_d;
//MdagM+m[0]
precisionChange(p_f,p_d);
Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
precisionChange(tmp_d,mmp_f);
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
// assert(norm2(tmp_d)< 1.0e-4);
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
d += rn*mass[0];
b = -cp /d;
// Set up the various shift variables
int iz=0;
z[0][1-iz] = 1.0;
z[0][iz] = 1.0;
bs[0] = b;
for(int s=1;s<nshift;s++){
z[s][1-iz] = 1.0;
z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0]));
bs[s] = b*z[s][iz];
}
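  // Bookkeeping sketch (the standard multi-shift CG construction, cf.
  // B. Jegerlehner, hep-lat/9612014): the residuals of all shifted systems
  // stay parallel to the primary residual, r_s = z_s * r, so a single matrix
  // application per iteration serves every shift.  z[s][*] holds the current
  // and previous values of that proportionality factor, and bs[s] is the
  // correspondingly rescaled step length b for shift s.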
// r += b[0] A.p[0]
// c= norm(r)
c=axpy_norm(r_d,b,mmp_d,r_d);
for(int s=0;s<nshift;s++) {
axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
precisionChange(psi_f[s],psi_d[s]);
}
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
SolverTimer.Start();
// Iteration loop
int k;
for (k=1;k<=MaxIterationsMshift;k++){
a = c /cp;
AXPYTimer.Start();
axpy(p_d,a,p_d,r_d);
AXPYTimer.Stop();
PrecChangeTimer.Start();
precisionChange(r_f, r_d);
PrecChangeTimer.Stop();
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
if ( ! converged[s] ) {
if (s==0){
axpy(ps_f[s],a,ps_f[s],r_f);
} else{
RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
axpby(ps_f[s],z[s][iz],as,r_f,ps_f[s]);
}
}
}
AXPYTimer.Stop();
cp=c;
PrecChangeTimer.Start();
precisionChange(p_f, p_d); //get back single prec search direction for linop
PrecChangeTimer.Stop();
MatrixTimer.Start();
Linop_f.HermOp(p_f,mmp_f);
MatrixTimer.Stop();
PrecChangeTimer.Start();
precisionChange(mmp_d, mmp_f); // From Float to Double
PrecChangeTimer.Stop();
d=real(innerProduct(p_d,mmp_d));
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
d += rn*mass[0];
bp=b;
b=-cp/d;
// Toggle the recurrence history
bs[0] = b;
iz = 1-iz;
ShiftTimer.Start();
for(int s=1;s<nshift;s++){
if((!converged[s])){
RealD z0 = z[s][1-iz];
RealD z1 = z[s][iz];
z[s][iz] = z0*z1*bp
/ (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
}
}
ShiftTimer.Stop();
//Update single precision solutions
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
int ss = s;
if( (!converged[s]) ) {
axpy(psi_f[ss],-bs[s]*alpha[s],ps_f[s],psi_f[ss]);
}
}
c = axpy_norm(r_d,b,mmp_d,r_d);
AXPYTimer.Stop();
// Convergence checks
int all_converged = 1;
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
IterationsToCompleteShift[s] = k;
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsqf[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
} else {
all_converged=0;
}
}
}
if ( all_converged || k == MaxIterationsMshift-1){
SolverTimer.Stop();
for(int s=0;s<nshift;s++){
precisionChange(psi_d[s],psi_f[s]);
}
if ( all_converged ){
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Checking solutions"<<std::endl;
} else {
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Not all shifts have converged iteration "<<k<<std::endl;
}
// Check answers
for(int s=0; s < nshift; s++) {
Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
axpy(tmp_d,mass[s],psi_d[s],mmp_d);
axpy(r_d,-alpha[s],src_d,tmp_d);
RealD rn = norm2(r_d);
RealD cn = norm2(src_d);
TrueResidualShift[s] = std::sqrt(rn/cn);
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
//If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
if(rn >= rsq[s]){
CleanupTimer.Start();
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: performing cleanup step for shift " << s << std::endl;
//Setup linear operators for final cleanup
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d);
cg(src_d, psi_d[s]);
TrueResidualShift[s] = cg.TrueResidual;
CleanupTimer.Stop();
}
}
std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrecCleanup: Time Breakdown for body"<<std::endl;
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tShift " << ShiftTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
IterationsToComplete = k;
return;
}
}
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
assert(0);
}
};
NAMESPACE_END(Grid);
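A minimal usage sketch for the class above (hypothetical operator and grid names; shifts is assumed to come from a Remez-generated MultiShiftFunction with poles, residues and per-pole tolerances already filled in):
// Usage sketch (hypothetical names: HermOpD/HermOpF wrap the same MdagM in
// double/single precision on DoublePrecGrid/SinglePrecGrid):
MultiShiftFunction shifts;   // poles/residues/tolerances filled from a Remez fit
ConjugateGradientMultiShiftMixedPrecCleanup<LatticeFermionD,LatticeFermionF>
  MSCG(10000, shifts, SinglePrecGrid, HermOpF, /*ReliableUpdateFreq*/ 50);
std::vector<LatticeFermionD> sols(shifts.order, DoublePrecGrid);
MSCG(HermOpD, src_d, sols);  // sols[s] ~= (MdagM + shifts.poles[s])^{-1} src_d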

View File

@@ -1,416 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christopher Kelly <ckelly@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
#define GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
NAMESPACE_BEGIN(Grid);
//CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision.
//The residual is stored in single precision, but the search directions and solution are stored in double precision.
//Every update_freq iterations the residual is corrected in double precision.
//For safety a final regular CG is applied to clean up if necessary
//Linop to add shift to input linop, used in cleanup CG
namespace ConjugateGradientMultiShiftMixedPrecSupport{
template<typename Field>
class ShiftedLinop: public LinearOperatorBase<Field>{
public:
LinearOperatorBase<Field> &linop_base;
RealD shift;
ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift): linop_base(_linop_base), shift(_shift){}
void OpDiag (const Field &in, Field &out){ assert(0); }
void OpDir (const Field &in, Field &out,int dir,int disp){ assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); }
void Op (const Field &in, Field &out){ assert(0); }
void AdjOp (const Field &in, Field &out){ assert(0); }
void HermOp(const Field &in, Field &out){
linop_base.HermOp(in, out);
axpy(out, shift, in, out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
HermOp(in,out);
ComplexD dot = innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
}
};
};
template<class FieldD, class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientMultiShiftMixedPrec : public OperatorMultiFunction<FieldD>,
public OperatorFunction<FieldD>
{
public:
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
Integer MaxIterationsMshift;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
int verbose;
MultiShiftFunction shifts;
std::vector<RealD> TrueResidualShift;
int ReliableUpdateFreq; //number of iterations between reliable updates
GridBase* SinglePrecGrid; //Grid for single-precision fields
LinearOperatorBase<FieldF> &Linop_f; //single precision
ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
int _ReliableUpdateFreq) :
MaxIterationsMshift(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
MaxIterations(20000)
{
verbose=1;
IterationsToCompleteShift.resize(_shifts.order);
TrueResidualShift.resize(_shifts.order);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
{
GridBase *grid = src.Grid();
int nshift = shifts.order;
std::vector<FieldD> results(nshift,grid);
(*this)(Linop,src,results,psi);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
{
int nshift = shifts.order;
(*this)(Linop,src,results);
psi = shifts.norm*src;
for(int i=0;i<nshift;i++){
psi = psi + shifts.residues[i]*results[i];
}
return;
}
void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
{
GRID_TRACE("ConjugateGradientMultiShiftMixedPrec");
GridBase *DoublePrecGrid = src_d.Grid();
precisionChangeWorkspace pc_wk_s_to_d(DoublePrecGrid,SinglePrecGrid);
precisionChangeWorkspace pc_wk_d_to_s(SinglePrecGrid,DoublePrecGrid);
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
int nshift = shifts.order;
std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
std::vector<RealD> &mresidual(shifts.tolerances);
std::vector<RealD> alpha(nshift,1.0);
//Double precision search directions
FieldD p_d(DoublePrecGrid);
std::vector<FieldD> ps_d(nshift, DoublePrecGrid);// Search directions (double precision)
FieldD tmp_d(DoublePrecGrid);
FieldD r_d(DoublePrecGrid);
FieldD mmp_d(DoublePrecGrid);
assert(psi_d.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift];
RealD rsq[nshift];
RealD rsqf[nshift];
RealD z[nshift][2];
int converged[nshift];
const int primary =0;
//Primary shift fields CG iteration
RealD a,b,c,d;
RealD cp,bp,qq; //prev
// Matrix mult fields
FieldF p_f(SinglePrecGrid);
FieldF mmp_f(SinglePrecGrid);
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
converged[s]=0;
}
// Wire guess to zero
// Residuals "r" are src
// First search direction "p" is also src
cp = norm2(src_d);
// Handle trivial case of zero src.
if( cp == 0. ){
for(int s=0;s<nshift;s++){
psi_d[s] = Zero();
IterationsToCompleteShift[s] = 1;
TrueResidualShift[s] = 0.;
}
return;
}
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
rsqf[s] =rsq[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
ps_d[s] = src_d;
}
// r and p for primary
p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
r_d = p_d;
//MdagM+m[0]
precisionChange(p_f, p_d, pc_wk_d_to_s);
Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
precisionChange(tmp_d, mmp_f, pc_wk_s_to_d);
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
assert(norm2(tmp_d)< 1.0);
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
d += rn*mass[0];
b = -cp /d;
// Set up the various shift variables
int iz=0;
z[0][1-iz] = 1.0;
z[0][iz] = 1.0;
bs[0] = b;
for(int s=1;s<nshift;s++){
z[s][1-iz] = 1.0;
z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0]));
bs[s] = b*z[s][iz];
}
// r += b[0] A.p[0]
// c= norm(r)
c=axpy_norm(r_d,b,mmp_d,r_d);
for(int s=0;s<nshift;s++) {
axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
}
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
SolverTimer.Start();
// Iteration loop
int k;
for (k=1;k<=MaxIterationsMshift;k++){
a = c /cp;
AXPYTimer.Start();
axpy(p_d,a,p_d,r_d);
for(int s=0;s<nshift;s++){
if ( ! converged[s] ) {
if (s==0){
axpy(ps_d[s],a,ps_d[s],r_d);
} else{
RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
axpby(ps_d[s],z[s][iz],as,r_d,ps_d[s]);
}
}
}
AXPYTimer.Stop();
PrecChangeTimer.Start();
precisionChange(p_f, p_d, pc_wk_d_to_s); //get back single prec search direction for linop
PrecChangeTimer.Stop();
cp=c;
MatrixTimer.Start();
Linop_f.HermOp(p_f,mmp_f);
MatrixTimer.Stop();
PrecChangeTimer.Start();
precisionChange(mmp_d, mmp_f, pc_wk_s_to_d); // From Float to Double
PrecChangeTimer.Stop();
AXPYTimer.Start();
d=real(innerProduct(p_d,mmp_d));
axpy(mmp_d,mass[0],p_d,mmp_d);
AXPYTimer.Stop();
RealD rn = norm2(p_d);
d += rn*mass[0];
bp=b;
b=-cp/d;
// Toggle the recurrence history
bs[0] = b;
iz = 1-iz;
ShiftTimer.Start();
for(int s=1;s<nshift;s++){
if((!converged[s])){
RealD z0 = z[s][1-iz];
RealD z1 = z[s][iz];
z[s][iz] = z0*z1*bp
/ (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
}
}
ShiftTimer.Stop();
//Update double precision solutions
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
int ss = s;
if( (!converged[s]) ) {
axpy(psi_d[ss],-bs[s]*alpha[s],ps_d[s],psi_d[ss]);
}
}
//Perform reliable update if necessary; otherwise update residual from single-prec mmp
c = axpy_norm(r_d,b,mmp_d,r_d);
AXPYTimer.Stop();
if(k % ReliableUpdateFreq == 0){
RealD c_old = c;
//Replace r with true residual
MatrixTimer.Start();
Linop_d.HermOp(psi_d[0],mmp_d);
MatrixTimer.Stop();
AXPYTimer.Start();
axpy(mmp_d,mass[0],psi_d[0],mmp_d);
c = axpy_norm(r_d, -1.0, mmp_d, src_d);
AXPYTimer.Stop();
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_old <<" with |r|^2 = "<<c<<std::endl;
}
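      // Why this block exists (summary): the cheap update of r_d above mixes in
      // single-precision rounding from mmp_f, so every ReliableUpdateFreq
      // iterations the primary-shift residual is recomputed exactly in double
      // precision as  r = src - (MdagM + mass[0]) psi[0],  resetting the
      // accumulated drift; the shifted residuals then follow via r_s = z_s r.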
// Convergence checks
int all_converged = 1;
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
IterationsToCompleteShift[s] = k;
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsqf[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
} else {
all_converged=0;
}
}
}
if ( all_converged || k == MaxIterationsMshift-1){
SolverTimer.Stop();
if ( all_converged ){
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
} else {
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Not all shifts have converged iteration "<<k<<std::endl;
}
// Check answers
for(int s=0; s < nshift; s++) {
Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
axpy(tmp_d,mass[s],psi_d[s],mmp_d);
axpy(r_d,-alpha[s],src_d,tmp_d);
RealD rn = norm2(r_d);
RealD cn = norm2(src_d);
TrueResidualShift[s] = std::sqrt(rn/cn);
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
//If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
if(rn >= rsq[s]){
CleanupTimer.Start();
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: performing cleanup step for shift " << s << std::endl;
//Setup linear operators for final cleanup
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d);
cg(src_d, psi_d[s]);
TrueResidualShift[s] = cg.TrueResidual;
CleanupTimer.Stop();
}
}
std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrec: Time Breakdown for body"<<std::endl;
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tShift " << ShiftTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
IterationsToComplete = k;
return;
}
}
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
assert(0);
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -48,7 +48,7 @@ public:
   LinearOperatorBase<FieldF> &Linop_f;
   LinearOperatorBase<FieldD> &Linop_d;
   GridBase* SinglePrecGrid;
-  RealD Delta; //reliable update parameter. A reliable update is performed when the residual drops by a factor of Delta relative to its value at the last update
+  RealD Delta; //reliable update parameter
   //Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
   LinearOperatorBase<FieldF> *Linop_fallback;
@@ -65,9 +65,7 @@ public:
     ErrorOnNoConverge(err_on_no_conv),
     DoFinalCleanup(true),
     Linop_fallback(NULL)
-  {
-    assert(Delta > 0. && Delta < 1. && "Expect 0 < Delta < 1");
-  };
+  {};
   void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
     Linop_fallback = &_Linop_fallback;
@@ -75,7 +73,6 @@ public:
   }
   void operator()(const FieldD &src, FieldD &psi) {
-    GRID_TRACE("ConjugateGradientReliableUpdate");
     LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
     bool using_fallback = false;
@@ -118,12 +115,9 @@ public:
     }
     //Single prec initialization
-    precisionChangeWorkspace pc_wk_sp_to_dp(src.Grid(), SinglePrecGrid);
-    precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, src.Grid());
     FieldF r_f(SinglePrecGrid);
     r_f.Checkerboard() = r.Checkerboard();
-    precisionChange(r_f, r, pc_wk_dp_to_sp);
+    precisionChange(r_f, r);
     FieldF psi_f(r_f);
     psi_f = Zero();
@@ -139,8 +133,7 @@ public:
     GridStopWatch LinalgTimer;
     GridStopWatch MatrixTimer;
     GridStopWatch SolverTimer;
-    GridStopWatch PrecChangeTimer;
     SolverTimer.Start();
     int k = 0;
     int l = 0;
@@ -179,9 +172,7 @@ public:
       // Stopping condition
       if (cp <= rsq) {
 	//Although not written in the paper, I assume that I have to add on the final solution
-	PrecChangeTimer.Start();
-	precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
-	PrecChangeTimer.Stop();
+	precisionChange(mmp, psi_f);
 	psi = psi + mmp;
@@ -202,10 +193,7 @@ public:
 	std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
 	std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
 	std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tPrecChange " << PrecChangeTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tPrecChange avg time " << PrecChangeTimer.Elapsed()/(2*l+1) <<std::endl;
 	IterationsToComplete = k;
 	ReliableUpdatesPerformed = l;
@@ -225,21 +213,14 @@ public:
       else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
 	std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
 		  << cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
-	PrecChangeTimer.Start();
-	precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
-	PrecChangeTimer.Stop();
+	precisionChange(mmp, psi_f);
 	psi = psi + mmp;
-	MatrixTimer.Start();
 	Linop_d.HermOpAndNorm(psi, mmp, d, qq);
-	MatrixTimer.Stop();
 	r = src - mmp;
 	psi_f = Zero();
-	PrecChangeTimer.Start();
-	precisionChange(r_f, r, pc_wk_dp_to_sp);
-	PrecChangeTimer.Stop();
+	precisionChange(r_f, r);
 	cp = norm2(r);
 	MaxResidSinceLastRelUp = cp;
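For orientation, the reliable-update branch visible in this hunk follows the usual single/double defect-correction pattern; a sketch in pseudo-C++ (variable names as in the hunk, with promote/demote standing in for precisionChange, so this is not the Grid API):
// Sketch only: flush accumulated single-precision error into the double fields
if (cp < Delta * MaxResidSinceLastRelUp) {   // single-prec residual fell by Delta
  psi   = psi + promote(psi_f);              // fold the single-prec correction into x
  r     = src - MdagM(psi);                  // recompute the true residual in double
  psi_f = 0;  r_f = demote(r);               // restart the single-prec inner iteration
  cp    = norm2(r);  MaxResidSinceLastRelUp = cp;
}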

File diff suppressed because it is too large

View File

@@ -419,15 +419,14 @@ until convergence
     }
   }
-  if ( Nconv < Nstop ) {
+  if ( Nconv < Nstop )
     std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl;
-    std::cout << GridLogIRL << "returning Nstop vectors, the last "<< Nstop-Nconv << "of which might meet convergence criterion only approximately" <<std::endl;
-  }
   eval=eval2;
   //Keep only converged
-  eval.resize(Nstop);// was Nconv
-  evec.resize(Nstop,grid);// was Nconv
+  eval.resize(Nconv);// Nstop?
+  evec.resize(Nconv,grid);// Nstop?
   basisSortInPlace(evec,eval,reverse);
 }
@@ -457,7 +456,7 @@ until convergence
			   std::vector<Field>& evec,
			   Field& w,int Nm,int k)
   {
-    std::cout<<GridLogDebug << "Lanczos step " <<k<<std::endl;
+    std::cout<<GridLogIRL << "Lanczos step " <<k<<std::endl;
     const RealD tiny = 1.0e-20;
     assert( k< Nm );
@@ -465,7 +464,7 @@ until convergence
     Field& evec_k = evec[k];
-    _PolyOp(evec_k,w); std::cout<<GridLogDebug << "PolyOp" <<std::endl;
+    _PolyOp(evec_k,w); std::cout<<GridLogIRL << "PolyOp" <<std::endl;
     if(k>0) w -= lme[k-1] * evec[k-1];
@@ -480,18 +479,18 @@ until convergence
     lme[k] = beta;
     if ( (k>0) && ( (k % orth_period) == 0 )) {
-      std::cout<<GridLogDebug << "Orthogonalising " <<k<<std::endl;
+      std::cout<<GridLogIRL << "Orthogonalising " <<k<<std::endl;
       orthogonalize(w,evec,k); // orthonormalise
-      std::cout<<GridLogDebug << "Orthogonalised " <<k<<std::endl;
+      std::cout<<GridLogIRL << "Orthogonalised " <<k<<std::endl;
     }
     if(k < Nm-1) evec[k+1] = w;
-    std::cout<<GridLogIRL << "Lanczos step alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
+    std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
     if ( beta < tiny )
       std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl;
-    std::cout<<GridLogDebug << "Lanczos step complete " <<k<<std::endl;
+    std::cout<<GridLogIRL << "Lanczos step complete " <<k<<std::endl;
   }
   void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme,
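For context, the Lanczos step routine touched by these hunks performs one three-term recurrence on the polynomial-preconditioned operator; in standard notation (with \alpha_k, \beta_k the tridiagonal entries stored in lmd, lme):

    w = P(A)\,v_k - \beta_{k-1} v_{k-1}, \quad \alpha_k = \langle v_k, w \rangle, \quad w \leftarrow w - \alpha_k v_k, \quad \beta_k = \lVert w \rVert, \quad v_{k+1} = w / \beta_k ,

with a full re-orthogonalisation of w against the stored basis every orth_period steps.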

View File

@@ -44,7 +44,6 @@ public:
				   int, MinRes);    // Must restart
 };
-//This class is the input parameter class for some testing programs
 struct LocalCoherenceLanczosParams : Serializable {
  public:
   GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
@@ -146,24 +145,16 @@ public:
   LinearOperatorBase<FineField> &_Linop;
   RealD                          _coarse_relax_tol;
   std::vector<FineField>        &_subspace;
-  int _largestEvalIdxForReport; //The convergence of the LCL is based on the evals of the coarse grid operator, not those of the underlying fine grid operator
-                                //As a result we do not know what the eval range of the fine operator is until the very end, making tuning the Cheby bounds very difficult
-                                //To work around this issue, every restart we separately reconstruct the fine operator eval for the lowest and highest evec and print these
-                                //out alongside the evals of the coarse operator. To do so we need to know the index of the largest eval (i.e. Nstop-1)
-                                //NOTE: If largestEvalIdxForReport=-1 (default) then this is not performed
   ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField>   &Poly,
					    OperatorFunction<FineField>   &smoother,
					    LinearOperatorBase<FineField> &Linop,
					    std::vector<FineField>        &subspace,
-					    RealD coarse_relax_tol=5.0e3,
-					    int largestEvalIdxForReport=-1)
+					    RealD coarse_relax_tol=5.0e3)
     : _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace),
-      _coarse_relax_tol(coarse_relax_tol), _largestEvalIdxForReport(largestEvalIdxForReport)
+      _coarse_relax_tol(coarse_relax_tol)
   {    };
-  //evalMaxApprox: approximation of largest eval of the fine Chebyshev operator (suitably wrapped by block projection)
   int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
   {
     CoarseField v(B);
@@ -186,26 +177,12 @@ public:
	      <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
	      <<std::endl;
-    if(_largestEvalIdxForReport != -1 && (j==0 || j==_largestEvalIdxForReport)){
-      std::cout<<GridLogIRL << "Estimating true eval of fine grid operator for eval idx " << j << std::endl;
-      RealD tmp_eval;
-      ReconstructEval(j,eresid,B,tmp_eval,1.0); //don't use evalMaxApprox of coarse operator! (cf below)
-    }
     int conv=0;
     if( (vv<eresid*eresid) ) conv = 1;
     return conv;
   }
-  //This function is called at the end of the coarse grid Lanczos. It promotes the coarse eigenvector 'B' to the fine grid,
-  //applies a smoother to the result then computes the computes the *fine grid* eigenvalue (output as 'eval').
-  //evalMaxApprox should be the approximation of the largest eval of the fine Hermop. However when this function is called by IRL it actually passes the largest eval of the *Chebyshev* operator (as this is the max approx used for the TestConvergence above)
-  //As the largest eval of the Chebyshev is typically several orders of magnitude larger this makes the convergence test pass even when it should not.
-  //We therefore ignore evalMaxApprox here and use a value of 1.0 (note this value is already used by TestCoarse)
   int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
   {
-    evalMaxApprox = 1.0; //cf above
     GridBase *FineGrid = _subspace[0].Grid();
     int checkerboard   = _subspace[0].Checkerboard();
     FineField fB(FineGrid);fB.Checkerboard() =checkerboard;
@@ -224,13 +201,13 @@ public:
     eval   = vnum/vden;
     fv -= eval*fB;
     RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);
-    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
     std::cout.precision(13);
     std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
-	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv << " target " << eresid*eresid
+	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
	     <<std::endl;
+    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
     if( (vv<eresid*eresid) ) return 1;
     return 0;
   }
@@ -308,10 +285,6 @@ public:
     evals_coarse.resize(0);
   };
-  //The block inner product is the inner product on the fine grid locally summed over the blocks
-  //to give a Lattice<Scalar> on the coarse grid. This function orthnormalizes the fine-grid subspace
-  //vectors under the block inner product. This step must be performed after computing the fine grid
-  //eigenvectors and before computing the coarse grid eigenvectors.
   void Orthogonalise(void ) {
     CoarseScalar InnerProd(_CoarseGrid);
     std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
@@ -355,8 +328,6 @@ public:
     }
   }
-  //While this method serves to check the coarse eigenvectors, it also recomputes the eigenvalues from the smoothed reconstructed eigenvectors
-  //hence the smoother can be tuned after running the coarse Lanczos by using a different smoother here
   void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax)
   {
     assert(evals_fine.size() == nbasis);
@@ -405,31 +376,25 @@ public:
     evals_fine.resize(nbasis);
     subspace.resize(nbasis,_FineGrid);
   }
-  //cheby_op: Parameters of the fine grid Chebyshev polynomial used for the Lanczos acceleration
-  //cheby_smooth: Parameters of a separate Chebyshev polynomial used after the Lanczos has completed to smooth out high frequency noise in the reconstructed fine grid eigenvectors prior to computing the eigenvalue
-  //relax: Reconstructed eigenvectors (post smoothing) are naturally not as precise as true eigenvectors. This factor acts as a multiplier on the stopping condition when determining whether the results satisfy the user provided stopping condition
   void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
		   int Nstop, int Nk, int Nm,RealD resid,
		   RealD MaxIt, RealD betastp, int MinRes)
   {
-    Chebyshev<FineField>                          Cheby(cheby_op); //Chebyshev of fine operator on fine grid
-    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace); //Fine operator on coarse grid with intermediate fine grid conversion
-    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace); //Chebyshev of fine operator on coarse grid with intermediate fine grid conversion
+    Chebyshev<FineField>                          Cheby(cheby_op);
+    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace);
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
     //////////////////////////////////////////////////////////////////////////////////////////////////
     // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
     //////////////////////////////////////////////////////////////////////////////////////////////////
-    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth); //lower order Chebyshev of fine operator on fine grid used to smooth regenerated eigenvectors
-    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax,Nstop-1);
+    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth);
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
     evals_coarse.resize(Nm);
     evec_coarse.resize(Nm,_CoarseGrid);
     CoarseField src(_CoarseGrid);     src=1.0;
-    //Note the "tester" here is also responsible for generating the fine grid eigenvalues which are output into the "evals_coarse" array
     ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
     int Nconv=0;
     IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
@@ -440,14 +405,6 @@ public:
       std::cout << i << " Coarse eval = " << evals_coarse[i]  << std::endl;
     }
   }
-  //Get the fine eigenvector 'i' by reconstruction
-  void getFineEvecEval(FineField &evec, RealD &eval, const int i) const{
-    blockPromote(evec_coarse[i],evec,subspace);
-    eval = evals_coarse[i];
-  }
 };
 NAMESPACE_END(Grid);

View File

@@ -33,7 +33,7 @@ NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // Take a matrix and form an NE solver calling a Herm solver
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
-template<class Field> class NormalEquations : public LinearFunction<Field>{
+template<class Field> class NormalEquations {
 private:
   SparseMatrixBase<Field> & _Matrix;
   OperatorFunction<Field> & _HermitianSolver;
@@ -60,7 +60,7 @@ public:
   }
 };
-template<class Field> class HPDSolver : public LinearFunction<Field> {
+template<class Field> class HPDSolver {
 private:
   LinearOperatorBase<Field> & _Matrix;
   OperatorFunction<Field> & _HermitianSolver;
@@ -78,13 +78,13 @@ public:
   void operator() (const Field &in, Field &out){
     _Guess(in,out);
-    _HermitianSolver(_Matrix,in,out);  //M out = in
+    _HermitianSolver(_Matrix,in,out);  // Mdag M out = Mdag in
   }
 };
-template<class Field> class MdagMSolver : public LinearFunction<Field> {
+template<class Field> class MdagMSolver {
 private:
   SparseMatrixBase<Field> & _Matrix;
   OperatorFunction<Field> & _HermitianSolver;
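As a reminder of what these wrappers compute (standard linear algebra, not affected by the LinearFunction inheritance change shown): NormalEquations and MdagMSolver hand the Hermitian solver the normal equations,

    M x = b \;\Longleftrightarrow\; (M^\dagger M)\, x = M^\dagger b ,

while HPDSolver assumes M itself is Hermitian positive-definite and solves M out = in directly, which is what one of the two comment variants in the last hunk states.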

View File

@@ -20,7 +20,7 @@ template<class Field> class PowerMethod
     RealD evalMaxApprox = 0.0;
     auto src_n = src;
     auto tmp = src;
-    const int _MAX_ITER_EST_ = 100;
+    const int _MAX_ITER_EST_ = 50;
     for (int i=0;i<_MAX_ITER_EST_;i++) {
@@ -29,8 +29,6 @@ template<class Field> class PowerMethod
       RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
       RealD vden = norm2(src_n);
       RealD na = vnum/vden;
-      std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
       if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) {
	 evalMaxApprox = na;
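For reference, a self-contained sketch of the estimator this loop implements (plain C++ on a 2x2 test matrix rather than Grid fields): apply the operator repeatedly and watch the Rayleigh quotient until it stabilises to 0.1%.
#include <array>
#include <cmath>
#include <cstdio>

int main() {
  // 2x2 symmetric test matrix with eigenvalues 3 and 1.
  const double A[2][2] = {{2, 1}, {1, 2}};
  std::array<double, 2> x{1.0, 0.7}, tmp{};
  double evalMax = 0.0;
  for (int i = 0; i < 100; i++) {               // _MAX_ITER_EST_ analogue
    tmp[0] = A[0][0]*x[0] + A[0][1]*x[1];       // tmp = A x
    tmp[1] = A[1][0]*x[0] + A[1][1]*x[1];
    double vnum = x[0]*tmp[0] + x[1]*tmp[1];    // <x, A x>
    double vden = x[0]*x[0] + x[1]*x[1];        // <x, x>
    double na = vnum / vden;                    // Rayleigh quotient
    if (std::fabs(evalMax/na - 1.0) < 0.001) { evalMax = na; break; }
    evalMax = na;
    double nrm = std::sqrt(tmp[0]*tmp[0] + tmp[1]*tmp[1]);
    x = {tmp[0]/nrm, tmp[1]/nrm};               // normalise for the next pass
  }
  std::printf("largest eigenvalue ~ %g\n", evalMax);
}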

View File

@@ -1,262 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/Aggregates.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
template<class Fobj,class CComplex,int nbasis>
class Aggregation {
public:
typedef iVector<CComplex,nbasis > siteVector;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
GridBase *CoarseGrid;
GridBase *FineGrid;
std::vector<Lattice<Fobj> > subspace;
int checkerboard;
int Checkerboard(void){return checkerboard;}
Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) :
CoarseGrid(_CoarseGrid),
FineGrid(_FineGrid),
subspace(nbasis,_FineGrid),
checkerboard(_checkerboard)
{
};
void Orthogonalise(void){
CoarseScalar InnerProd(CoarseGrid);
// std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
}
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
blockProject(CoarseVec,FineVec,subspace);
}
void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
FineVec.Checkerboard() = subspace[0].Checkerboard();
blockPromote(CoarseVec,FineVec,subspace);
}
virtual void CreateSubspaceRandom(GridParallelRNG &RNG) {
int nn=nbasis;
RealD scale;
FineField noise(FineGrid);
for(int b=0;b<nn;b++){
subspace[b] = Zero();
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
subspace[b] = noise;
}
}
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis)
{
RealD scale;
ConjugateGradient<FineField> CG(1.0e-2,100,false);
FineField noise(FineGrid);
FineField Mn(FineGrid);
for(int b=0;b<nn;b++){
subspace[b] = Zero();
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
for(int i=0;i<1;i++){
CG(hermop,noise,subspace[b]);
noise = subspace[b];
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
}
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
subspace[b] = noise;
}
}
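  // In effect this is one (configurable) pass of inexact inverse iteration per
  // vector: the loosely converged CG (tol 1e-2) applies an approximate
  // (MdagM)^{-1} to the noise, amplifying each eigencomponent by roughly
  // 1/lambda, so the subspace ends up dominated by the low modes that the
  // coarse grid needs to represent.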
////////////////////////////////////////////////////////////////////////////////////////////////
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
// and this is the best I found
////////////////////////////////////////////////////////////////////////////////////////////////
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo,
int orderfilter,
int ordermin,
int orderstep,
double filterlo
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
std::cout << GridLogMessage<<" Chebyshev subspace pass-1 : ord "<<orderfilter<<" ["<<lo<<","<<hi<<"]"<<std::endl;
std::cout << GridLogMessage<<" Chebyshev subspace pass-2 : nbasis"<<nn<<" min "
<<ordermin<<" step "<<orderstep
<<" lo"<<filterlo<<std::endl;
// Initial matrix element
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
{
// Filter
Chebyshev<FineField> Cheb(lo,hi,orderfilter);
Cheb(hermop,noise,Mn);
// normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
b++;
}
// Generate a full sequence of Chebyshevs
{
lo=filterlo;
noise=Mn;
FineField T0(FineGrid); T0 = noise;
FineField T1(FineGrid);
FineField T2(FineGrid);
FineField y(FineGrid);
FineField *Tnm = &T0;
FineField *Tn = &T1;
FineField *Tnp = &T2;
// Tn=T1 = (xscale M + mscale)in
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
hermop.HermOp(T0,y);
T1=y*xscale+noise*mscale;
for(int n=2;n<=ordermin+orderstep*(nn-2);n++){
hermop.HermOp(*Tn,y);
autoView( y_v , y, AcceleratorWrite);
autoView( Tn_v , (*Tn), AcceleratorWrite);
autoView( Tnp_v , (*Tnp), AcceleratorWrite);
autoView( Tnm_v , (*Tnm), AcceleratorWrite);
const int Nsimd = CComplex::Nsimd();
accelerator_for(ss, FineGrid->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
});
// Possibly more fine-grained control is needed than a linear sweep,
// but there is a huge productivity gain if this is a simple algorithm and not a tunable
int m =1;
if ( n>=ordermin ) m=n-ordermin;
if ( (m%orderstep)==0 ) {
Mn=*Tnp;
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
b++;
}
// Cycle pointers to avoid copies
FineField *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
}
assert(b==nn);
}
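  // Filter rationale (standard Chebyshev spectral filtering, stated here as a
  // summary): the map x -> (2x - (hi+lo)) / (hi - lo) applied via xscale/mscale
  // sends [lo,hi] to [-1,1], where |T_n| <= 1, while eigenvalues below lo land
  // outside [-1,1], where T_n grows exponentially with n.  Each sampled T_n(M)
  // applied to the noise is therefore increasingly dominated by the low modes,
  // yielding nbasis progressively better filtered subspace vectors.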
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo,
int orderfilter
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
std::cout << GridLogMessage<<" Chebyshev subspace pure noise : ord "<<orderfilter<<" ["<<lo<<","<<hi<<"]"<<std::endl;
std::cout << GridLogMessage<<" Chebyshev subspace pure noise : nbasis "<<nn<<std::endl;
for(int b =0;b<nbasis;b++)
{
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn);
if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
// Filter
Chebyshev<FineField> Cheb(lo,hi,orderfilter);
Cheb(hermop,noise,Mn);
// normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
}
}
};
NAMESPACE_END(Grid);

View File

@@ -1,449 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/GeneralCoarsenedMatrix.h
Copyright (C) 2015
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/qcd/QCD.h> // needed for Dagger(Yes|No), Inverse(Yes|No)
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>
NAMESPACE_BEGIN(Grid);
// Fine Object == (per site) type of fine field
// nbasis == number of deflation vectors
template<class Fobj,class CComplex,int nbasis>
class GeneralCoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
public:
typedef GeneralCoarsenedMatrix<Fobj,CComplex,nbasis> GeneralCoarseOp;
typedef iVector<CComplex,nbasis > siteVector;
typedef iMatrix<CComplex,nbasis > siteMatrix;
typedef Lattice<iScalar<CComplex> > CoarseComplexField;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef iMatrix<CComplex,nbasis > Cobj;
typedef iVector<CComplex,nbasis > Cvec;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
typedef CoarseVector Field;
////////////////////
// Data members
////////////////////
int hermitian;
GridBase * _FineGrid;
GridCartesian * _CoarseGrid;
NonLocalStencilGeometry &geom;
PaddedCell Cell;
GeneralLocalStencil Stencil;
std::vector<CoarseMatrix> _A;
std::vector<CoarseMatrix> _Adag;
std::vector<CoarseVector> MultTemporaries;
///////////////////////
// Interface
///////////////////////
GridBase * Grid(void) { return _FineGrid; }; // this is all the linalg routines need to know
GridBase * FineGrid(void) { return _FineGrid; }; // this is all the linalg routines need to know
GridCartesian * CoarseGrid(void) { return _CoarseGrid; }; // this is all the linalg routines need to know
void ProjectNearestNeighbour(RealD shift, GeneralCoarseOp &CopyMe)
{
int nfound=0;
std::cout << GridLogMessage <<"GeneralCoarsenedMatrix::ProjectNearestNeighbour "<< CopyMe._A[0].Grid()<<std::endl;
for(int p=0;p<geom.npoint;p++){
for(int pp=0;pp<CopyMe.geom.npoint;pp++){
// Search for the same relative shift
// Avoids brutal handling of Grid pointers
if ( CopyMe.geom.shifts[pp]==geom.shifts[p] ) {
_A[p] = CopyMe.Cell.Extract(CopyMe._A[pp]);
_Adag[p] = CopyMe.Cell.Extract(CopyMe._Adag[pp]);
nfound++;
}
}
}
assert(nfound==geom.npoint);
ExchangeCoarseLinks();
}
GeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom,GridBase *FineGrid, GridCartesian * CoarseGrid)
: geom(_geom),
_FineGrid(FineGrid),
_CoarseGrid(CoarseGrid),
hermitian(1),
Cell(_geom.Depth(),_CoarseGrid),
Stencil(Cell.grids.back(),geom.shifts)
{
{
int npoint = _geom.npoint;
autoView( Stencil_v , Stencil, AcceleratorRead);
int osites=Stencil.Grid()->oSites();
for(int ss=0;ss<osites;ss++){
for(int point=0;point<npoint;point++){
auto SE = Stencil_v.GetEntry(point,ss);
int o = SE->_offset;
assert( o< osites);
}
}
}
_A.resize(geom.npoint,CoarseGrid);
_Adag.resize(geom.npoint,CoarseGrid);
}
void M (const CoarseVector &in, CoarseVector &out)
{
Mult(_A,in,out);
}
void Mdag (const CoarseVector &in, CoarseVector &out)
{
if ( hermitian ) M(in,out);
else Mult(_Adag,in,out);
}
void Mult (std::vector<CoarseMatrix> &A,const CoarseVector &in, CoarseVector &out)
{
RealD tviews=0; RealD ttot=0; RealD tmult=0; RealD texch=0; RealD text=0; RealD ttemps=0; RealD tcopy=0;
ttot=-usecond();
conformable(CoarseGrid(),in.Grid());
conformable(in.Grid(),out.Grid());
out.Checkerboard() = in.Checkerboard();
CoarseVector tin=in;
texch-=usecond();
CoarseVector pin = Cell.ExchangePeriodic(tin);
texch+=usecond();
CoarseVector pout(pin.Grid());
int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview;
typedef LatticeView<Cvec> Vview;
const int Nsimd = CComplex::Nsimd();
int64_t osites=pin.Grid()->oSites();
RealD flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
RealD bytes = 1.0*osites*sizeof(siteMatrix)*npoint
+ 2.0*osites*sizeof(siteVector)*npoint;
{
tviews-=usecond();
autoView( in_v , pin, AcceleratorRead);
autoView( out_v , pout, AcceleratorWriteDiscard);
autoView( Stencil_v , Stencil, AcceleratorRead);
tviews+=usecond();
// Static and prereserve to keep UVM region live and not resized across multiple calls
ttemps-=usecond();
MultTemporaries.resize(npoint,pin.Grid());
ttemps+=usecond();
std::vector<Aview> AcceleratorViewContainer_h;
std::vector<Vview> AcceleratorVecViewContainer_h;
tviews-=usecond();
for(int p=0;p<npoint;p++) {
AcceleratorViewContainer_h.push_back( A[p].View(AcceleratorRead));
AcceleratorVecViewContainer_h.push_back(MultTemporaries[p].View(AcceleratorWrite));
}
tviews+=usecond();
static deviceVector<Aview> AcceleratorViewContainer; AcceleratorViewContainer.resize(npoint);
static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint);
auto Aview_p = &AcceleratorViewContainer[0];
auto Vview_p = &AcceleratorVecViewContainer[0];
tcopy-=usecond();
acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview));
acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview));
tcopy+=usecond();
tmult-=usecond();
accelerator_for(spb, osites*nbasis*npoint, Nsimd, {
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
int32_t ss = spb/(nbasis*npoint);
int32_t bp = spb%(nbasis*npoint);
int32_t b = bp/npoint;
int32_t point= bp%npoint;
auto SE = Stencil_v.GetEntry(point,ss);
auto nbr = coalescedReadGeneralPermute(in_v[SE->_offset],SE->_permute,Nd);
auto res = coalescedRead(Aview_p[point][ss](b,0))*nbr(0);
for(int bb=1;bb<nbasis;bb++) {
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
}
coalescedWrite(Vview_p[point][ss](b),res);
});
accelerator_for(sb, osites*nbasis, Nsimd, {
int ss = sb/nbasis;
int b = sb%nbasis;
auto res = coalescedRead(Vview_p[0][ss](b));
for(int point=1;point<npoint;point++){
res = res + coalescedRead(Vview_p[point][ss](b));
}
coalescedWrite(out_v[ss](b),res);
});
tmult+=usecond();
for(int p=0;p<npoint;p++) {
AcceleratorViewContainer_h[p].ViewClose();
AcceleratorVecViewContainer_h[p].ViewClose();
}
}
text-=usecond();
out = Cell.Extract(pout);
text+=usecond();
ttot+=usecond();
std::cout << GridLogPerformance<<"Coarse Mult Aviews "<<tviews<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult ext "<<text<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult temps "<<ttemps<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult copy "<<tcopy<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult tot "<<ttot<<" us"<<std::endl;
// std::cout << GridLogPerformance<<std::endl;
// std::cout << GridLogPerformance<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
// std::cout << GridLogPerformance<<"Coarse Kernel bytes/s"<< bytes/tmult<<" MB/s"<<std::endl;
// std::cout << GridLogPerformance<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
// std::cout << GridLogPerformance<<"Coarse total bytes "<< bytes/1e6<<" MB"<<std::endl;
};
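  // Summary of the two kernels above:
  //   out(x) = \sum_p A_p(x) * in(x + delta_p)
  // The first accelerator_for performs, for every (site, basis row, point),
  // the nbasis-wide row-times-vector product into MultTemporaries[point];
  // the second reduces the npoint partial results per site into out.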
void PopulateAdag(void)
{
for(int64_t bidx=0;bidx<CoarseGrid()->gSites() ;bidx++){
Coordinate bcoor;
CoarseGrid()->GlobalIndexToGlobalCoor(bidx,bcoor);
for(int p=0;p<geom.npoint;p++){
Coordinate scoor = bcoor;
for(int mu=0;mu<bcoor.size();mu++){
int L = CoarseGrid()->GlobalDimensions()[mu];
scoor[mu] = (bcoor[mu] - geom.shifts[p][mu] + L) % L; // Modulo arithmetic
}
// Flip to poke/peekLocalSite and not too bad
auto link = peekSite(_A[p],scoor);
int pp = geom.Reverse(p);
pokeSite(adj(link),_Adag[pp],bcoor);
}
}
}
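  // Identity enforced above (assuming geom.Reverse(p) indexes the opposite
  // shift -delta_p):  Adag_{-delta_p}(x) = [ A_{delta_p}(x - delta_p) ]^dagger,
  // i.e. the dagger stencil link pointing back along -delta_p is the adjoint
  // of the forward link that reaches x from x - delta_p, as required for Mdag
  // to be the true adjoint of M on the coarse grid.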
/////////////////////////////////////////////////////////////
//
// A) Only reduced flops option is to use a padded cell of depth 4
// and apply MpcDagMpc in the padded cell.
//
// Makes for ONE application of MpcDagMpc per vector instead of 30 or 80.
// With the effective cell size around (B+8)^4 perhaps 12^4/4^4 ratio
// Cost is 81x more, same as stencil size.
//
// But: can eliminate comms and do as local dirichlet.
//
// Local exchange gauge field once.
// Apply to all vectors, local only computation.
// Must exchange ghost subcells in reverse process of PaddedCell to take inner products
//
// B) Can reduce cost: pad by 1, apply Deo (4^4+6^4+8^4+8^4 )/ (4x 4^4)
// pad by 2, apply Doe
// pad by 3, apply Deo
// then break out 8x directions; cost is ~10x MpcDagMpc per vector
//
// => almost factor of 10 in setup cost, excluding data rearrangement
//
// Intermediates -- ignore the corner terms, leave approximate and force Hermitian
// Intermediates -- pad by 2 and apply 1+8+24 = 33 times.
/////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////
// BFM HDCG style approach: Solve a system of equations to get Aij
//////////////////////////////////////////////////////////
/*
* Here, k,l index which possible shift within the 3^Nd "ball" connected by MdagM.
*
* conj(phases[block]) proj[k][ block*Nvec+j ] = \sum_ball e^{i q_k . delta} < phi_{block,j} | MdagM | phi_{(block+delta),i} >
* = \sum_ball e^{iqk.delta} A_ji
*
* Must invert matrix M_k,l = e^[i q_k . delta_l]
*
* Where q_k = delta_k . (2*M_PI/global_nb[mu])
*/
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace)
{
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
GridBase *grid = FineGrid();
RealD tproj=0.0;
RealD teigen=0.0;
RealD tmat=0.0;
RealD tphase=0.0;
RealD tinv=0.0;
/////////////////////////////////////////////////////////////
// Orthogonalise the subblocks over the basis
/////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid());
blockOrthogonalise(InnerProd,Subspace.subspace);
const int npoint = geom.npoint;
Coordinate clatt = CoarseGrid()->GlobalDimensions();
int Nd = CoarseGrid()->Nd();
/*
* Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
* Matrix index i is mapped to this shift via
* geom.shifts[i]
*
* conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block]
* = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} >
* = \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
* = M_{kl} A_ji^{b.b+l}
*
* Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
*
* Where q_k = delta_k . (2*M_PI/global_nb[mu])
*
* Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
*/
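  /*
   * Editor's sketch of the same contraction on plain Eigen objects
   * (illustrative only; Mkl is symmetric here because shifts[k].shifts[l]
   * is symmetric in k and l, so invMkl(l,k) and invMkl(k,l) agree):
   *   Eigen::VectorXcd A(npoint);          // couplings for fixed i,j,block
   *   Eigen::VectorXcd proj = Mkl * A;     // what the phased projections measure
   *   Eigen::VectorXcd rec  = invMkl*proj; // recovers A to rounding error
   * which is the FT[k] += invMkl(l,k)*ComputeProj[l] loop further down.
   */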
teigen-=usecond();
Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint);
Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
ComplexD ci(0.0,1.0);
for(int k=0;k<npoint;k++){ // Loop over momenta
for(int l=0;l<npoint;l++){ // Loop over nbr relative
ComplexD phase(0.0,0.0);
for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
phase=phase+TwoPiL*geom.shifts[k][mu]*geom.shifts[l][mu];
}
phase=exp(phase*ci);
Mkl(k,l) = phase;
}
}
invMkl = Mkl.inverse();
teigen+=usecond();
///////////////////////////////////////////////////////////////////////
// Now compute the matrix elements of linop between the orthonormal
// set of vectors.
///////////////////////////////////////////////////////////////////////
FineField phaV(grid); // Phased block basis vector
FineField MphaV(grid);// Matrix applied
CoarseVector coarseInner(CoarseGrid());
std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid());
std::vector<CoarseVector> FT(npoint,CoarseGrid());
for(int i=0;i<nbasis;i++){// Loop over basis vectors
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
/////////////////////////////////////////////////////
// Stick a phase on every block
/////////////////////////////////////////////////////
tphase-=usecond();
CoarseComplexField coor(CoarseGrid());
CoarseComplexField pha(CoarseGrid()); pha=Zero();
for(int mu=0;mu<Nd;mu++){
LatticeCoordinate(coor,mu);
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
pha = pha + (TwoPiL * geom.shifts[p][mu]) * coor;
}
pha =exp(pha*ci);
phaV=Zero();
blockZAXPY(phaV,pha,Subspace.subspace[i],phaV);
tphase+=usecond();
/////////////////////////////////////////////////////////////////////
// Multiple phased subspace vector by matrix and project to subspace
// Remove local bulk phase to leave relative phases
/////////////////////////////////////////////////////////////////////
tmat-=usecond();
linop.Op(phaV,MphaV);
tmat+=usecond();
tproj-=usecond();
blockProject(coarseInner,MphaV,Subspace.subspace);
coarseInner = conjugate(pha) * coarseInner;
ComputeProj[p] = coarseInner;
tproj+=usecond();
}
tinv-=usecond();
for(int k=0;k<npoint;k++){
FT[k] = Zero();
for(int l=0;l<npoint;l++){
FT[k]= FT[k]+ invMkl(l,k)*ComputeProj[l];
}
int osites=CoarseGrid()->oSites();
autoView( A_v , _A[k], AcceleratorWrite);
autoView( FT_v , FT[k], AcceleratorRead);
accelerator_for(sss, osites, 1, {
for(int j=0;j<nbasis;j++){
A_v[sss](j,i) = FT_v[sss](j);
}
});
}
tinv+=usecond();
}
for(int p=0;p<geom.npoint;p++){
Coordinate coor({0,0,0,0,0});
auto sval = peekSite(_A[p],coor);
}
// Only needed if nonhermitian
if ( ! hermitian ) {
std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
PopulateAdag();
}
    // Adag is populated from A by PopulateAdag() above in the non-hermitian case
ExchangeCoarseLinks();
std::cout << GridLogMessage<<"CoarsenOperator eigen "<<teigen<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator phase "<<tphase<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator mat "<<tmat <<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator proj "<<tproj<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
}
void ExchangeCoarseLinks(void){
for(int p=0;p<geom.npoint;p++){
_A[p] = Cell.Exchange(_A[p]);
_Adag[p]= Cell.Exchange(_Adag[p]);
}
}
virtual void Mdiag (const Field &in, Field &out){ assert(0);};
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);};
virtual void MdirAll (const Field &in, std::vector<Field> &out){assert(0);};
};
NAMESPACE_END(Grid);
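// Standalone sketch (editor's illustration, not part of the header above) of
// the momentum-probe idea used in CoarsenOperator: build M_kl = exp(i q_k.delta_l),
// invert it, and recover the couplings from phased projections. Assumes Eigen
// is available, as it already is in this header; the 1d toy "shifts" stand in
// for geom.shifts.
#include <Eigen/Dense>
#include <cmath>
#include <complex>
#include <vector>

int main(void)
{
  const int npoint = 3;                    // 1d toy ball: shifts -1,0,+1
  const int L      = 8;                    // coarse extent
  std::vector<int> shifts = {-1,0,+1};
  std::complex<double> ci(0.0,1.0);

  // M_kl = e^{i q_k . delta_l} with q_k = shifts[k] * 2*pi/L
  Eigen::MatrixXcd Mkl(npoint,npoint);
  for(int k=0;k<npoint;k++)
    for(int l=0;l<npoint;l++)
      Mkl(k,l) = std::exp(ci * (2.0*M_PI/L) * double(shifts[k]*shifts[l]));

  // Fake couplings A_l; proj is what the phased block projection would see.
  Eigen::VectorXcd A(npoint); A << 0.1, 1.0, 0.2;
  Eigen::VectorXcd proj = Mkl * A;
  Eigen::VectorXcd Arec = Mkl.inverse() * proj;  // recovers A to rounding

  return ((A - Arec).norm() < 1e-10) ? 0 : 1;
}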


@@ -1,243 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/GeneralCoarsenedMatrix.h
Copyright (C) 2015
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////
// Geometry class in cartesian case
/////////////////////////////////////////////////////////////////
class Geometry {
public:
int npoint;
int base;
std::vector<int> directions ;
std::vector<int> displacements;
std::vector<int> points_dagger;
Geometry(int _d) {
base = (_d==5) ? 1:0;
// make coarse grid stencil for 4d , not 5d
if ( _d==5 ) _d=4;
npoint = 2*_d+1;
directions.resize(npoint);
displacements.resize(npoint);
points_dagger.resize(npoint);
for(int d=0;d<_d;d++){
directions[d ] = d+base;
directions[d+_d] = d+base;
displacements[d ] = +1;
displacements[d+_d]= -1;
points_dagger[d ] = d+_d;
points_dagger[d+_d] = d;
}
directions [2*_d]=0;
displacements[2*_d]=0;
points_dagger[2*_d]=2*_d;
}
int point(int dir, int disp) {
assert(disp == -1 || disp == 0 || disp == 1);
assert(base+0 <= dir && dir < base+4);
// directions faster index = new indexing
// 4d (base = 0):
// point 0 1 2 3 4 5 6 7 8
// dir 0 1 2 3 0 1 2 3 0
// disp +1 +1 +1 +1 -1 -1 -1 -1 0
// 5d (base = 1):
// point 0 1 2 3 4 5 6 7 8
// dir 1 2 3 4 1 2 3 4 0
// disp +1 +1 +1 +1 -1 -1 -1 -1 0
// displacements faster index = old indexing
// 4d (base = 0):
// point 0 1 2 3 4 5 6 7 8
// dir 0 0 1 1 2 2 3 3 0
// disp +1 -1 +1 -1 +1 -1 +1 -1 0
// 5d (base = 1):
// point 0 1 2 3 4 5 6 7 8
// dir 1 1 2 2 3 3 4 4 0
// disp +1 -1 +1 -1 +1 -1 +1 -1 0
if(dir == 0 and disp == 0)
return 8;
else // New indexing
return (1 - disp) / 2 * 4 + dir - base;
// else // Old indexing
// return (4 * (dir - base) + 1 - disp) / 2;
}
};
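// Quick check of the new indexing (editor's illustration, comments only):
//   Geometry g(4);      // 4d, base=0
//   g.point(2,+1);      // -> 2   (+mu group, dir is the faster index)
//   g.point(2,-1);      // -> 6   (-mu group)
//   g.point(0, 0);      // -> 8   (central/self point)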
/////////////////////////////////////////////////////////////////
// Less local equivalent of Geometry class in cartesian case
/////////////////////////////////////////////////////////////////
class NonLocalStencilGeometry {
public:
int depth;
int hops;
int npoint;
std::vector<Coordinate> shifts;
Coordinate stencil_size;
Coordinate stencil_lo;
Coordinate stencil_hi;
GridCartesian *grid;
GridCartesian *Grid() {return grid;};
int Depth(void){return 1;}; // Ghost zone depth
int Hops(void){return hops;}; // # of hops=> level of corner fill in in stencil
virtual int DimSkip(void) =0;
virtual ~NonLocalStencilGeometry() {};
int Reverse(int point)
{
int Nd = Grid()->Nd();
Coordinate shft = shifts[point];
Coordinate rev(Nd);
for(int mu=0;mu<Nd;mu++) rev[mu]= -shft[mu];
for(int p=0;p<npoint;p++){
if(rev==shifts[p]){
return p;
}
}
assert(0);
return -1;
}
void BuildShifts(void)
{
this->shifts.resize(0);
int Nd = this->grid->Nd();
int dd = this->DimSkip();
for(int s0=this->stencil_lo[dd+0];s0<=this->stencil_hi[dd+0];s0++){
for(int s1=this->stencil_lo[dd+1];s1<=this->stencil_hi[dd+1];s1++){
for(int s2=this->stencil_lo[dd+2];s2<=this->stencil_hi[dd+2];s2++){
for(int s3=this->stencil_lo[dd+3];s3<=this->stencil_hi[dd+3];s3++){
Coordinate sft(Nd,0);
sft[dd+0] = s0;
sft[dd+1] = s1;
sft[dd+2] = s2;
sft[dd+3] = s3;
int nhops = abs(s0)+abs(s1)+abs(s2)+abs(s3);
if(nhops<=this->hops) this->shifts.push_back(sft);
}}}}
this->npoint = this->shifts.size();
std::cout << GridLogMessage << "NonLocalStencilGeometry has "<< this->npoint << " terms in stencil "<<std::endl;
}
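  // Counting check (editor's note): with the stencil range [-1,0,1]^4 set up
  // in the constructor below,
  //   hops=1 keeps 1+8    = 9  shifts (centre plus faces),
  //   hops=2 keeps 1+8+24 = 33 shifts (adds the C(4,2)*2*2 edge terms),
  // matching the "1+8+24 = 33" applications quoted in
  // GeneralCoarsenedMatrix.h above.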
NonLocalStencilGeometry(GridCartesian *_coarse_grid,int _hops) : grid(_coarse_grid), hops(_hops)
{
Coordinate latt = grid->GlobalDimensions();
stencil_size.resize(grid->Nd());
stencil_lo.resize(grid->Nd());
stencil_hi.resize(grid->Nd());
for(int d=0;d<grid->Nd();d++){
if ( latt[d] == 1 ) {
stencil_lo[d] = 0;
stencil_hi[d] = 0;
stencil_size[d]= 1;
} else if ( latt[d] == 2 ) {
stencil_lo[d] = -1;
stencil_hi[d] = 0;
stencil_size[d]= 2;
} else if ( latt[d] > 2 ) {
stencil_lo[d] = -1;
stencil_hi[d] = 1;
stencil_size[d]= 3;
}
}
};
};
// Need to worry about red-black now
class NonLocalStencilGeometry4D : public NonLocalStencilGeometry {
public:
virtual int DimSkip(void) { return 0;};
NonLocalStencilGeometry4D(GridCartesian *Coarse,int _hops) : NonLocalStencilGeometry(Coarse,_hops) { };
virtual ~NonLocalStencilGeometry4D() {};
};
class NonLocalStencilGeometry5D : public NonLocalStencilGeometry {
public:
virtual int DimSkip(void) { return 1; };
NonLocalStencilGeometry5D(GridCartesian *Coarse,int _hops) : NonLocalStencilGeometry(Coarse,_hops) { };
virtual ~NonLocalStencilGeometry5D() {};
};
/*
* Bunch of different options classes
*/
class NextToNextToNextToNearestStencilGeometry4D : public NonLocalStencilGeometry4D {
public:
NextToNextToNextToNearestStencilGeometry4D(GridCartesian *Coarse) : NonLocalStencilGeometry4D(Coarse,4)
{
this->BuildShifts();
};
};
class NextToNextToNextToNearestStencilGeometry5D : public NonLocalStencilGeometry5D {
public:
NextToNextToNextToNearestStencilGeometry5D(GridCartesian *Coarse) : NonLocalStencilGeometry5D(Coarse,4)
{
this->BuildShifts();
};
};
class NextToNearestStencilGeometry4D : public NonLocalStencilGeometry4D {
public:
NextToNearestStencilGeometry4D(GridCartesian *Coarse) : NonLocalStencilGeometry4D(Coarse,2)
{
this->BuildShifts();
};
};
class NextToNearestStencilGeometry5D : public NonLocalStencilGeometry5D {
public:
NextToNearestStencilGeometry5D(GridCartesian *Coarse) : NonLocalStencilGeometry5D(Coarse,2)
{
this->BuildShifts();
};
};
class NearestStencilGeometry4D : public NonLocalStencilGeometry4D {
public:
NearestStencilGeometry4D(GridCartesian *Coarse) : NonLocalStencilGeometry4D(Coarse,1)
{
this->BuildShifts();
};
};
class NearestStencilGeometry5D : public NonLocalStencilGeometry5D {
public:
NearestStencilGeometry5D(GridCartesian *Coarse) : NonLocalStencilGeometry5D(Coarse,1)
{
this->BuildShifts();
};
};
NAMESPACE_END(Grid);
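// Construction sketch (editor's illustration, not part of this removed header;
// assumes the usual Grid grid factory):
//   GridCartesian *Coarse = SpaceTimeGrid::makeFourDimGrid(clatt,simd,mpi);
//   NextToNearestStencilGeometry4D geom(Coarse);  // hops=2 -> 33 shifts
//   int p    = 5;
//   int pdag = geom.Reverse(p);  // index of the opposite shift, used for Adag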


@@ -1,33 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Grid/algorithms/multigrid/MultiGrid.h
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/algorithms/multigrid/Aggregates.h>
#include <Grid/algorithms/multigrid/Geometry.h>
#include <Grid/algorithms/multigrid/CoarsenedMatrix.h>
#include <Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h>


@@ -4,14 +4,11 @@ NAMESPACE_BEGIN(Grid);
 /*Allocation types, saying which pointer cache should be used*/
 #define Cpu      (0)
-#define CpuHuge  (1)
-#define CpuSmall (2)
-#define Acc      (3)
-#define AccHuge  (4)
-#define AccSmall (5)
-#define Shared   (6)
-#define SharedHuge (7)
-#define SharedSmall (8)
+#define CpuSmall (1)
+#define Acc      (2)
+#define AccSmall (3)
+#define Shared   (4)
+#define SharedSmall (5)
 #undef GRID_MM_VERBOSE
 uint64_t total_shared;
 uint64_t total_device;
@@ -38,15 +35,12 @@ void MemoryManager::PrintBytes(void)
 }
-uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
-uint64_t MemoryManager::HostCacheBytes()   { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; }
 //////////////////////////////////////////////////////////////////////
 // Data tables for recently freed pooiniter caches
 //////////////////////////////////////////////////////////////////////
 MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
 int MemoryManager::Victim[MemoryManager::NallocType];
-int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 };
+int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 8, 16, 8, 16 };
 uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
 //////////////////////////////////////////////////////////////////////
 // Actual allocation and deallocation utils
@@ -176,16 +170,6 @@ void MemoryManager::Init(void)
     }
   }
-  str= getenv("GRID_ALLOC_NCACHE_HUGE");
-  if ( str ) {
-    Nc = atoi(str);
-    if ( (Nc>=0) && (Nc < NallocCacheMax)) {
-      Ncache[CpuHuge]=Nc;
-      Ncache[AccHuge]=Nc;
-      Ncache[SharedHuge]=Nc;
-    }
-  }
   str= getenv("GRID_ALLOC_NCACHE_SMALL");
   if ( str ) {
     Nc = atoi(str);
@@ -206,9 +190,7 @@ void MemoryManager::InitMessage(void) {
   std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
 #ifdef ALLOCATION_CACHE
-  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<" HUGE "<<Ncache[CpuHuge]<<std::endl;
-  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<" Huge "<<Ncache[AccHuge]<<std::endl;
-  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<" Huge "<<Ncache[SharedHuge]<<std::endl;
+  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
 #endif
 #ifdef GRID_UVM
@@ -240,11 +222,8 @@ void MemoryManager::InitMessage(void) {
 void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
 {
 #ifdef ALLOCATION_CACHE
-  int cache;
-  if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
-  else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
-  else cache = type;
+  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
+  int cache = type + small;
   return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);
 #else
   return ptr;
@@ -253,12 +232,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
 void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)
 {
+  assert(ncache>0);
 #ifdef GRID_OMP
   assert(omp_in_parallel()==0);
 #endif
-  if (ncache == 0) return ptr;
   void * ret = NULL;
   int v = -1;
@@ -293,11 +271,8 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
 void *MemoryManager::Lookup(size_t bytes,int type)
 {
 #ifdef ALLOCATION_CACHE
-  int cache;
-  if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
-  else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
-  else cache = type;
+  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
+  int cache = type+small;
   return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
 #else
   return NULL;
@@ -306,6 +281,7 @@ void *MemoryManager::Lookup(size_t bytes,int type)
 void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)
 {
+  assert(ncache>0);
 #ifdef GRID_OMP
   assert(omp_in_parallel()==0);
 #endif
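// Bucket-mapping example for the hunks above (editor's note, comments only):
// on the develop side a request is routed as
//   if      (bytes <  GRID_ALLOC_SMALL_LIMIT) cache = type + 2; // *Small pool
//   else if (bytes >= GRID_ALLOC_HUGE_LIMIT)  cache = type + 1; // *Huge  pool
//   else                                      cache = type;     // base  pool
// so with Acc==3 a 64kB device block lands in cache 3 (Acc) while a 4GB block
// lands in cache 4 (AccHuge); the older side collapses this to type+small.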


@@ -35,7 +35,6 @@ NAMESPACE_BEGIN(Grid);
 // Move control to configure.ac and Config.h?
 #define GRID_ALLOC_SMALL_LIMIT (4096)
-#define GRID_ALLOC_HUGE_LIMIT (2147483648)
 #define STRINGIFY(x) #x
 #define TOSTRING(x) STRINGIFY(x)
@@ -71,21 +70,6 @@ enum ViewMode {
   CpuWriteDiscard = 0x10 // same for now
 };
-struct MemoryStatus {
-  uint64_t DeviceBytes;
-  uint64_t DeviceLRUBytes;
-  uint64_t DeviceMaxBytes;
-  uint64_t HostToDeviceBytes;
-  uint64_t DeviceToHostBytes;
-  uint64_t HostToDeviceXfer;
-  uint64_t DeviceToHostXfer;
-  uint64_t DeviceEvictions;
-  uint64_t DeviceDestroy;
-  uint64_t DeviceAllocCacheBytes;
-  uint64_t HostAllocCacheBytes;
-};
 class MemoryManager {
 private:
@@ -99,7 +83,7 @@ private:
   } AllocationCacheEntry;
   static const int NallocCacheMax=128;
-  static const int NallocType=9;
+  static const int NallocType=6;
   static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
   static int Victim[NallocType];
   static int Ncache[NallocType];
@@ -137,26 +121,7 @@ private:
   static uint64_t DeviceToHostXfer;
   static uint64_t DeviceEvictions;
   static uint64_t DeviceDestroy;
-  static uint64_t DeviceCacheBytes();
-  static uint64_t HostCacheBytes();
-  static MemoryStatus GetFootprint(void) {
-    MemoryStatus stat;
-    stat.DeviceBytes = DeviceBytes;
-    stat.DeviceLRUBytes = DeviceLRUBytes;
-    stat.DeviceMaxBytes = DeviceMaxBytes;
-    stat.HostToDeviceBytes = HostToDeviceBytes;
-    stat.DeviceToHostBytes = DeviceToHostBytes;
-    stat.HostToDeviceXfer = HostToDeviceXfer;
-    stat.DeviceToHostXfer = DeviceToHostXfer;
-    stat.DeviceEvictions = DeviceEvictions;
-    stat.DeviceDestroy = DeviceDestroy;
-    stat.DeviceAllocCacheBytes = DeviceCacheBytes();
-    stat.HostAllocCacheBytes = HostCacheBytes();
-    return stat;
-  };
 private:
 #ifndef GRID_UVM
   //////////////////////////////////////////////////////////////////////
@@ -209,9 +174,9 @@ private:
   static void CpuViewClose(uint64_t Ptr);
   static uint64_t CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
 #endif
-  static void NotifyDeletion(void * CpuPtr);
 public:
+  static void NotifyDeletion(void * CpuPtr);
   static void Print(void);
   static void PrintAll(void);
   static void PrintState( void* CpuPtr);
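// Usage sketch for the develop-side footprint API removed here (editor's
// illustration, assuming GetFootprint() is reachable from client code):
//   MemoryStatus s = MemoryManager::GetFootprint();
//   std::cout << s.DeviceBytes           << " bytes resident on device, "
//             << s.DeviceAllocCacheBytes << " bytes parked in the free-list cache\n";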


@@ -8,7 +8,7 @@ NAMESPACE_BEGIN(Grid);
 static char print_buffer [ MAXLINE ];
 #define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
-#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer;
+#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
 //#define dprintf(...)
@@ -111,7 +111,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
   ///////////////////////////////////////////////////////////
   assert(AccCache.state!=Empty);
-  dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
+  mprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
   assert(AccCache.accLock==0);
   assert(AccCache.cpuLock==0);
   assert(AccCache.CpuPtr!=(uint64_t)NULL);
@@ -141,11 +141,11 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
   ///////////////////////////////////////////////////////////////////////////
   assert(AccCache.state!=Empty);
-  mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n",
+  mprintf("MemoryManager: Evict cpu %lx acc %lx cpuLock %ld accLock %ld\n",
          (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
          (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
-  if (AccCache.accLock!=0) return;
-  if (AccCache.cpuLock!=0) return;
+  assert(AccCache.accLock==0); // Cannot evict so logic bomb
+  assert(AccCache.CpuPtr!=(uint64_t)NULL);
   if(AccCache.state==AccDirty) {
     Flush(AccCache);
   }
@@ -155,7 +155,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
     AccCache.AccPtr=(uint64_t)NULL;
     AccCache.state=CpuDirty; // CPU primary now
     DeviceBytes   -=AccCache.bytes;
-    dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
+    dprintf("MemoryManager: Free(%lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
   }
   //  uint64_t CpuPtr = AccCache.CpuPtr;
   DeviceEvictions++;
@@ -169,7 +169,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
   assert(AccCache.AccPtr!=(uint64_t)NULL);
   assert(AccCache.CpuPtr!=(uint64_t)NULL);
   acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
-  mprintf("MemoryManager: acceleratorCopyFromDevice Flush AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: Flush %lx -> %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
   DeviceToHostBytes+=AccCache.bytes;
   DeviceToHostXfer++;
   AccCache.state=Consistent;
@@ -184,7 +184,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
     AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
     DeviceBytes+=AccCache.bytes;
   }
-  mprintf("MemoryManager: acceleratorCopyToDevice Clone AccPtr %lx <- CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: Clone %lx <- %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
   acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
   HostToDeviceBytes+=AccCache.bytes;
   HostToDeviceXfer++;
@@ -519,6 +519,7 @@ void MemoryManager::Audit(std::string s)
   uint64_t LruBytes1=0;
   uint64_t LruBytes2=0;
   uint64_t LruCnt=0;
+  uint64_t LockedBytes=0;
   std::cout << " Memory Manager::Audit() from "<<s<<std::endl;
   for(auto it=LRU.begin();it!=LRU.end();it++){
@@ -531,7 +532,6 @@ void MemoryManager::Audit(std::string s)
     assert(AccCache.LRU_entry==it);
   }
   std::cout << " Memory Manager::Audit() LRU queue matches table entries "<<std::endl;
-
   for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
     auto &AccCache = it->second;
@@ -548,7 +548,6 @@ void MemoryManager::Audit(std::string s)
     if ( AccCache.cpuLock || AccCache.accLock ) {
       assert(AccCache.LRU_valid==0);
-
       std::cout << GridLogError << s<< "\n\t 0x"<<std::hex<<AccCache.CpuPtr<<std::dec
                << "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
                << "\t cpuLock  " << AccCache.cpuLock
@@ -567,7 +566,6 @@ void MemoryManager::Audit(std::string s)
   std::cout << " Memory Manager::Audit() device bytes matches sum over table "<<std::endl;
   assert(LruCnt == LRU.size());
   std::cout << " Memory Manager::Audit() LRU entry count matches "<<std::endl;
-
 }
 void MemoryManager::PrintState(void* _CpuPtr)


@@ -70,8 +70,8 @@ public:
   Coordinate _istride; // Inner stride i.e. within simd lane
   int _osites;         // _isites*_osites = product(dimensions).
   int _isites;
-  int64_t _fsites;     // _isites*_osites = product(dimensions).
-  int64_t _gsites;
+  int _fsites;         // _isites*_osites = product(dimensions).
+  int _gsites;
   Coordinate _slice_block;// subslice information
   Coordinate _slice_stride;
   Coordinate _slice_nblock;
@@ -183,7 +183,7 @@ public:
   inline int Nsimd(void)  const { return _isites; };// Synonymous with iSites
   inline int oSites(void) const { return _osites; };
   inline int lSites(void) const { return _isites*_osites; };
-  inline int64_t gSites(void) const { return (int64_t)_isites*(int64_t)_osites*(int64_t)_Nprocessors; };
+  inline int gSites(void) const { return _isites*_osites*_Nprocessors; };
   inline int Nd    (void) const { return _ndimension;};
   inline const Coordinate LocalStarts(void)             { return _lstart;    };
@@ -214,7 +214,7 @@ public:
   ////////////////////////////////////////////////////////////////
   // Global addressing
   ////////////////////////////////////////////////////////////////
-  void GlobalIndexToGlobalCoor(int64_t gidx,Coordinate &gcoor){
+  void GlobalIndexToGlobalCoor(int gidx,Coordinate &gcoor){
     assert(gidx< gSites());
     Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
   }
@@ -222,7 +222,7 @@ public:
     assert(lidx<lSites());
     Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
   }
-  void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int64_t & gidx){
+  void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int & gidx){
     gidx=0;
     int mult=1;
     for(int mu=0;mu<_ndimension;mu++) {
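// Overflow check (editor's note, comments only): the int64_t promotion on the
// develop side of this hunk matters once product(dimensions) exceeds 2^31-1;
// a 128^4 global volume is 268,435,456 sites and still fits in 32 bits, but
// 256^4 = 4,294,967,296 already wraps a 32-bit int, so gSites() and the
// global index conversions need int64_t.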


@@ -53,11 +53,10 @@ public:
   // Communicator should know nothing of the physics grid, only processor grid.
   ////////////////////////////////////////////
   int _Nprocessors;              // How many in all
-  int _processor;                // linear processor rank
-  unsigned long _ndimension;
-  Coordinate _shm_processors;    // Which dimensions get relayed out over processors lanes.
   Coordinate _processors;        // Which dimensions get relayed out over processors lanes.
+  int _processor;                // linear processor rank
   Coordinate _processor_coor;    // linear processor coordinate
+  unsigned long _ndimension;
   static Grid_MPI_Comm communicator_world;
   Grid_MPI_Comm communicator;
   std::vector<Grid_MPI_Comm> communicator_halo;
@@ -98,16 +97,14 @@ public:
   int  BossRank(void) ;
   int  ThisRank(void) ;
   const Coordinate & ThisProcessorCoor(void) ;
-  const Coordinate & ShmGrid(void)  { return _shm_processors; } ;
   const Coordinate & ProcessorGrid(void) ;
   int  ProcessorCount(void) ;
   ////////////////////////////////////////////////////////////////////////////////
   // very VERY rarely (Log, serial RNG) we need world without a grid
   ////////////////////////////////////////////////////////////////////////////////
   static int  RankWorld(void) ;
   static void BroadcastWorld(int root,void* data, int bytes);
-  static void BarrierWorld(void);
   ////////////////////////////////////////
   // Reduction
@@ -131,21 +128,13 @@ public:
   template<class obj> void GlobalSum(obj &o){
     typedef typename obj::scalar_type scalar_type;
     int words = sizeof(obj)/sizeof(scalar_type);
-    scalar_type * ptr = (scalar_type *)& o; // Safe alias
+    scalar_type * ptr = (scalar_type *)& o;
     GlobalSumVector(ptr,words);
   }
   ////////////////////////////////////////////////////////////
   // Face exchange, buffer swap in translational invariant way
   ////////////////////////////////////////////////////////////
-  void CommsComplete(std::vector<CommsRequest_t> &list);
-  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-                           void *xmit,
-                           int dest,
-                           void *recv,
-                           int from,
-                           int bytes,int dir);
   void SendToRecvFrom(void *xmit,
                       int xmit_to_rank,
                       void *recv,
@@ -153,17 +142,17 @@ public:
                       int bytes);
   double StencilSendToRecvFrom(void *xmit,
-                               int xmit_to_rank,int do_xmit,
+                               int xmit_to_rank,
                               void *recv,
-                               int recv_from_rank,int do_recv,
+                               int recv_from_rank,
                               int bytes,int dir);
   double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                    void *xmit,
-                                    int xmit_to_rank,int do_xmit,
+                                    int xmit_to_rank,
                                    void *recv,
-                                    int recv_from_rank,int do_recv,
-                                    int xbytes,int rbytes,int dir);
+                                    int recv_from_rank,
+                                    int bytes,int dir);
   void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
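// GlobalSum sketch (editor's illustration, comments only): the template above
// reinterprets any object as a flat array of its scalar_type, so for a 3x3
// colour matrix of ComplexD it issues a single GlobalSumVector over the 9
// complex words:
//   ColourMatrix m = ...;        // this rank's local contribution
//   communicator.GlobalSum(m);   // element-wise all-reduce across ranks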


@@ -106,7 +106,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
   // Remap using the shared memory optimising routine
   // The remap creates a comm which must be freed
   ////////////////////////////////////////////////////
-  GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm,_shm_processors);
+  GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm);
   InitFromMPICommunicator(processors,optimal_comm);
   SetCommunicator(optimal_comm);
   ///////////////////////////////////////////////////
@@ -124,13 +124,12 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
   int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
   Coordinate parent_processor_coor(_ndimension,0);
   Coordinate parent_processors    (_ndimension,1);
-  Coordinate shm_processors       (_ndimension,1);
   // Can make 5d grid from 4d etc...
   int pad = _ndimension-parent_ndimension;
   for(int d=0;d<parent_ndimension;d++){
     parent_processor_coor[pad+d]=parent._processor_coor[d];
     parent_processors    [pad+d]=parent._processors[d];
-    shm_processors       [pad+d]=parent._shm_processors[d];
   }
   //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -155,7 +154,6 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
       ccoor[d] = parent_processor_coor[d] % processors[d];
       scoor[d] = parent_processor_coor[d] / processors[d];
       ssize[d] = parent_processors[d]     / processors[d];
-      if ( processors[d] < shm_processors[d] ) shm_processors[d] = processors[d]; // subnode splitting.
     }
     // rank within subcomm ; srank is rank of subcomm within blocks of subcomms
@@ -306,44 +304,6 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N)
   int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
   assert(ierr==0);
 }
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-                                                void *xmit,
-                                                int dest,
-                                                void *recv,
-                                                int from,
-                                                int bytes,int dir)
-{
-  MPI_Request xrq;
-  MPI_Request rrq;
-  assert(dest != _processor);
-  assert(from != _processor);
-  int tag;
-  tag= dir+from*32;
-  int ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator,&rrq);
-  assert(ierr==0);
-  list.push_back(rrq);
-  tag= dir+_processor*32;
-  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator,&xrq);
-  assert(ierr==0);
-  list.push_back(xrq);
-}
-void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list)
-{
-  int nreq=list.size();
-  if (nreq==0) return;
-  std::vector<MPI_Status> status(nreq);
-  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
-  assert(ierr==0);
-  list.resize(0);
-}
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFrom(void *xmit,
                                           int dest,
@@ -375,23 +335,23 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 }
 // Basic Halo comms primitive
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
-                                                     int dest, int dox,
+                                                     int dest,
                                                     void *recv,
-                                                     int from, int dor,
+                                                     int from,
                                                     int bytes,int dir)
 {
   std::vector<CommsRequest_t> list;
-  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
+  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
   StencilSendToRecvFromComplete(list,dir);
   return offbytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                          void *xmit,
-                                                         int dest,int dox,
+                                                         int dest,
                                                          void *recv,
-                                                         int from,int dor,
-                                                         int xbytes,int rbytes,int dir)
+                                                         int from,
+                                                         int bytes,int dir)
 {
   int ncomm  =communicator_halo.size();
   int commdir=dir%ncomm;
@@ -410,34 +370,39 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
   double off_node_bytes=0.0;
   int tag;
-  if ( dor ) {
-    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
-      tag= dir+from*32;
-      ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
-      assert(ierr==0);
-      list.push_back(rrq);
-      off_node_bytes+=rbytes;
-    }
-  }
-  if (dox) {
-    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
-      tag= dir+_processor*32;
-      ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
-      assert(ierr==0);
-      list.push_back(xrq);
-      off_node_bytes+=xbytes;
-    } else {
-      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
-      assert(shm!=NULL);
-      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
-    }
-  }
+  if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
+    tag= dir+from*32;
+    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+    assert(ierr==0);
+    list.push_back(rrq);
+    off_node_bytes+=bytes;
+  }
+  if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
+    tag= dir+_processor*32;
+    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+    assert(ierr==0);
+    list.push_back(xrq);
+    off_node_bytes+=bytes;
+  } else {
+    // TODO : make a OMP loop on CPU, call threaded bcopy
+    void *shm = (void *) this->ShmBufferTranslate(dest,recv);
+    assert(shm!=NULL);
+    // std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl;
+    acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
+  }
+  //  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
+  //    this->StencilSendToRecvFromComplete(list,dir);
+  //  }
   return off_node_bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
+  //  std::cout << "Copy Synchronised\n"<<std::endl;
+  acceleratorCopySynchronise();
   int nreq=list.size();
   if (nreq==0) return;
@@ -473,10 +438,6 @@ int CartesianCommunicator::RankWorld(void){
   MPI_Comm_rank(communicator_world,&r);
   return r;
 }
-void CartesianCommunicator::BarrierWorld(void){
-  int ierr = MPI_Barrier(communicator_world);
-  assert(ierr==0);
-}
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
   int ierr= MPI_Bcast(data,
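// Why the dox/dor flags dropped in this hunk (editor's note, comments only):
// on the develop side a rank may elide one leg of the halo exchange, e.g.
//   StencilSendToRecvFrom(xmit,dest,/*dox=*/0, recv,from,/*dor=*/1, bytes,dir);
// receives without posting a send, which the partitioned (Dirichlet) boundary
// support on develop appears to rely on; the older side always posts both legs.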


@@ -45,14 +45,12 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
 CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
   : CartesianCommunicator(processors)
 {
-  _shm_processors = Coordinate(processors.size(),1);
   srank=0;
   SetCommunicator(communicator_world);
 }
 CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
 {
-  _shm_processors = Coordinate(processors.size(),1);
   _processors = processors;
   _ndimension = processors.size();  assert(_ndimension>=1);
   _processor_coor.resize(_ndimension);
@@ -91,17 +89,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 {
   assert(0);
 }
-void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-                                                void *xmit,
-                                                int dest,
-                                                void *recv,
-                                                int from,
-                                                int bytes,int dir)
-{
-  assert(0);
-}
 void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes)
 {
   bcopy(in,out,bytes*words);
@@ -115,7 +102,6 @@ int  CartesianCommunicator::RankWorld(void){return 0;}
 void CartesianCommunicator::Barrier(void){}
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
-void CartesianCommunicator::BarrierWorld(void) { }
 int  CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) {  return 0;}
 void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){  coor = _processor_coor; }
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
@@ -125,21 +111,21 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
 }
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
-                                                     int xmit_to_rank,int dox,
+                                                     int xmit_to_rank,
                                                     void *recv,
-                                                     int recv_from_rank,int dor,
+                                                     int recv_from_rank,
                                                     int bytes, int dir)
 {
   return 2.0*bytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                          void *xmit,
-                                                         int xmit_to_rank,int dox,
+                                                         int xmit_to_rank,
                                                          void *recv,
-                                                         int recv_from_rank,int dor,
-                                                         int xbytes,int rbytes, int dir)
+                                                         int recv_from_rank,
+                                                         int bytes, int dir)
 {
-  return xbytes+rbytes;
+  return 2.0*bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {


@@ -91,59 +91,6 @@ void *SharedMemory::ShmBufferSelf(void)
   //std::cerr << "ShmBufferSelf "<<ShmRank<<" "<<std::hex<< ShmCommBufs[ShmRank] <<std::dec<<std::endl;
   return ShmCommBufs[ShmRank];
 }
-static inline int divides(int a,int b)
-{
-  return ( b == ( (b/a)*a ) );
-}
-void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
-{
-  ////////////////////////////////////////////////////////////////
-  // Allow user to configure through environment variable
-  ////////////////////////////////////////////////////////////////
-  char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
-  if ( str ) {
-    std::vector<int> IntShmDims;
-    GridCmdOptionIntVector(std::string(str),IntShmDims);
-    assert(IntShmDims.size() == WorldDims.size());
-    long ShmSize = 1;
-    for (int dim=0;dim<WorldDims.size();dim++) {
-      ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
-      assert(divides(ShmDims[dim],WorldDims[dim]));
-    }
-    assert(ShmSize == WorldShmSize);
-    return;
-  }
-  ////////////////////////////////////////////////////////////////
-  // Powers of 2,3,5 only in prime decomposition for now
-  ////////////////////////////////////////////////////////////////
-  int ndimension = WorldDims.size();
-  ShmDims=Coordinate(ndimension,1);
-  std::vector<int> primes({2,3,5});
-  int dim = 0;
-  int last_dim = ndimension - 1;
-  int AutoShmSize = 1;
-  while(AutoShmSize != WorldShmSize) {
-    int p;
-    for(p=0;p<primes.size();p++) {
-      int prime=primes[p];
-      if ( divides(prime,WorldDims[dim]/ShmDims[dim])
-           && divides(prime,WorldShmSize/AutoShmSize)  ) {
-        AutoShmSize*=prime;
-        ShmDims[dim]*=prime;
-        last_dim = dim;
-        break;
-      }
-    }
-    if (p == primes.size() && last_dim == dim) {
-      std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
-      exit(EXIT_FAILURE);
-    }
-    dim=(dim+1) %ndimension;
-  }
-}
 NAMESPACE_END(Grid);
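// Usage note (editor's sketch): the GRID_SHM_DIMS_<Nd> variable parsed above
// pins the intra-node decomposition by hand; assuming the same dotted
// integer-list format Grid uses elsewhere on the command line (e.g. --mpi),
// a 16-rank node on a 4d grid might set the hypothetical value
//   GRID_SHM_DIMS_4=1.2.2.4
// Each entry must divide the matching world dimension and the product must
// equal the ranks per node, or the asserts above abort the job.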


@@ -93,10 +93,9 @@ public:
   // Create an optimal reordered communicator that makes MPI_Cart_create get it right
   //////////////////////////////////////////////////////////////////////////////////////
   static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
-  // Turns MPI_COMM_WORLD into right layout for Cartesian
-  static void OptimalCommunicator            (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims);
-  static void OptimalCommunicatorHypercube   (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims);
-  static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims);
+  static void OptimalCommunicator            (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
+  static void OptimalCommunicatorHypercube   (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
+  static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
   static void GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims);
   ///////////////////////////////////////////////////
   // Provide shared memory facilities off comm world
View File

@ -27,8 +27,6 @@ Author: Christoph Lehner <christoph@lhnr.de>
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#define Mheader "SharedMemoryMpi: "
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <pwd.h> #include <pwd.h>
@ -38,120 +36,12 @@ Author: Christoph Lehner <christoph@lhnr.de>
#ifdef GRID_HIP #ifdef GRID_HIP
#include <hip/hip_runtime_api.h> #include <hip/hip_runtime_api.h>
#endif #endif
#ifdef GRID_SYCL #ifdef GRID_SYCl
#define GRID_SYCL_LEVEL_ZERO_IPC
#include <syscall.h>
#define SHM_SOCKETS
#endif
#include <sys/socket.h> #endif
#include <sys/un.h>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
#define header "SharedMemoryMpi: "
#ifdef SHM_SOCKETS
/*
* Barbaric extra intranode communication route in case we need sockets to pass FDs
* Forced by level_zero not being nicely designed
*/
static int sock;
static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d";
static char sock_path[256];
class UnixSockets {
public:
static void Open(int rank)
{
int errnum;
sock = socket(AF_UNIX, SOCK_DGRAM, 0); assert(sock>0);
struct sockaddr_un sa_un = { 0 };
sa_un.sun_family = AF_UNIX;
snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,rank);
unlink(sa_un.sun_path);
if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) {
perror("bind failure");
exit(EXIT_FAILURE);
}
}
static int RecvFileDescriptor(void)
{
int n;
int fd;
char buf[1];
struct iovec iov;
struct msghdr msg;
struct cmsghdr *cmsg;
char cms[CMSG_SPACE(sizeof(int))];
iov.iov_base = buf;
iov.iov_len = 1;
memset(&msg, 0, sizeof msg);
msg.msg_name = 0;
msg.msg_namelen = 0;
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_control = (caddr_t)cms;
msg.msg_controllen = sizeof cms;
if((n=recvmsg(sock, &msg, 0)) < 0) {
perror("recvmsg failed");
return -1;
}
if(n == 0){
perror("recvmsg returned 0");
return -1;
}
cmsg = CMSG_FIRSTHDR(&msg);
memmove(&fd, CMSG_DATA(cmsg), sizeof(int));
return fd;
}
static void SendFileDescriptor(int fildes,int xmit_to_rank)
{
struct msghdr msg;
struct iovec iov;
struct cmsghdr *cmsg = NULL;
char ctrl[CMSG_SPACE(sizeof(int))];
char data = ' ';
memset(&msg, 0, sizeof(struct msghdr));
memset(ctrl, 0, CMSG_SPACE(sizeof(int)));
iov.iov_base = &data;
iov.iov_len = sizeof(data);
sprintf(sock_path,sock_path_fmt,xmit_to_rank);
struct sockaddr_un sa_un = { 0 };
sa_un.sun_family = AF_UNIX;
snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,xmit_to_rank);
msg.msg_name = (void *)&sa_un;
msg.msg_namelen = sizeof(sa_un);
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_controllen = CMSG_SPACE(sizeof(int));
msg.msg_control = ctrl;
cmsg = CMSG_FIRSTHDR(&msg);
cmsg->cmsg_level = SOL_SOCKET;
cmsg->cmsg_type = SCM_RIGHTS;
cmsg->cmsg_len = CMSG_LEN(sizeof(int));
*((int *) CMSG_DATA(cmsg)) = fildes;
sendmsg(sock, &msg, 0);
};
};
#endif
/*Construct from an MPI communicator*/ /*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm) void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{ {
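// Flow sketch for the socket helper removed above (editor's note, comments
// only): each rank binds its own datagram socket with UnixSockets::Open(rank),
// giving /tmp/GridUnixSocket.<rank>; the rank owning a level-zero IPC handle
// then calls
//   UnixSockets::SendFileDescriptor(fd, peer_rank);
// and the peer recovers a process-local duplicate via
//   int fd = UnixSockets::RecvFileDescriptor();
// The SCM_RIGHTS control message asks the kernel to re-install the descriptor
// in the receiving process, something plain MPI byte transport cannot do.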
@@ -174,8 +64,8 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
   MPI_Comm_size(WorldShmComm     ,&WorldShmSize);
   if ( WorldRank == 0) {
-    std::cout << Mheader " World communicator of size " <<WorldSize << std::endl;
-    std::cout << Mheader " Node  communicator of size " <<WorldShmSize << std::endl;
+    std::cout << header " World communicator of size " <<WorldSize << std::endl;
+    std::cout << header " Node  communicator of size " <<WorldShmSize << std::endl;
   }
   // WorldShmComm, WorldShmSize, WorldShmRank
@@ -262,7 +152,7 @@ int Log2Size(int TwoToPower,int MAXLOG2)
   }
   return log2size;
 }
-void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
+void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
 {
   //////////////////////////////////////////////////////////////////////////////
   // Look and see if it looks like an HPE 8600 based on hostname conventions
@@ -275,11 +165,63 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
   gethostname(name,namelen);
   int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
-  if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM);
-  else                          OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM);
+  if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm);
+  else                          OptimalCommunicatorSharedMemory(processors,optimal_comm);
 }
+static inline int divides(int a,int b)
+{
+  return ( b == ( (b/a)*a ) );
+}
+void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
+{
+  ////////////////////////////////////////////////////////////////
+  // Allow user to configure through environment variable
+  ////////////////////////////////////////////////////////////////
+  char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
+  if ( str ) {
+    std::vector<int> IntShmDims;
+    GridCmdOptionIntVector(std::string(str),IntShmDims);
+    assert(IntShmDims.size() == WorldDims.size());
+    long ShmSize = 1;
+    for (int dim=0;dim<WorldDims.size();dim++) {
+      ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
+      assert(divides(ShmDims[dim],WorldDims[dim]));
+    }
+    assert(ShmSize == WorldShmSize);
+    return;
+  }
+  ////////////////////////////////////////////////////////////////
+  // Powers of 2,3,5 only in prime decomposition for now
+  ////////////////////////////////////////////////////////////////
+  int ndimension = WorldDims.size();
+  ShmDims=Coordinate(ndimension,1);
+  std::vector<int> primes({2,3,5});
+  int dim = 0;
+  int last_dim = ndimension - 1;
+  int AutoShmSize = 1;
+  while(AutoShmSize != WorldShmSize) {
+    int p;
+    for(p=0;p<primes.size();p++) {
+      int prime=primes[p];
+      if ( divides(prime,WorldDims[dim]/ShmDims[dim])
+           && divides(prime,WorldShmSize/AutoShmSize)  ) {
+        AutoShmSize*=prime;
+        ShmDims[dim]*=prime;
+        last_dim = dim;
+        break;
+      }
+    }
+    if (p == primes.size() && last_dim == dim) {
+      std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    dim=(dim+1) %ndimension;
+  }
+}
-void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
+void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
 {
   ////////////////////////////////////////////////////////////////
   // Assert power of two shm_size.
@@ -352,8 +294,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
   Coordinate HyperCoor(ndimension);
   GetShmDims(WorldDims,ShmDims);
-  SHM = ShmDims;
-
   ////////////////////////////////////////////////////////////////
   // Establish torus of processes and nodes with sub-blockings
   ////////////////////////////////////////////////////////////////
@ -400,7 +341,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm); int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0); assert(ierr==0);
} }
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM) void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{ {
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Identify subblock of ranks on node spreading across dims // Identify subblock of ranks on node spreading across dims
@ -412,8 +353,6 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
Coordinate ShmCoor(ndimension); Coordinate NodeCoor(ndimension); Coordinate WorldCoor(ndimension); Coordinate ShmCoor(ndimension); Coordinate NodeCoor(ndimension); Coordinate WorldCoor(ndimension);
GetShmDims(WorldDims,ShmDims); GetShmDims(WorldDims,ShmDims);
SHM=ShmDims;
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings // Establish torus of processes and nodes with sub-blockings
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
@ -452,7 +391,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
#ifdef GRID_MPI3_SHMGET
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
  std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
  std::cout << header "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
@ -537,7 +476,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    exit(EXIT_FAILURE);
  }
  std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
  std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
	    << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
  SharedMemoryZero(ShmCommBuf,bytes);
@ -580,21 +519,16 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    exit(EXIT_FAILURE);
  }
  if ( WorldRank == 0 ){
    std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
    std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
	      << "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
	      << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
  }
  SharedMemoryZero(ShmCommBuf,bytes);
  std::cout<< "Setting up IPC"<<std::endl;
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Loop over ranks/gpu's on our node
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef SHM_SOCKETS
UnixSockets::Open(WorldShmRank);
#endif
  for(int r=0;r<WorldShmSize;r++){
MPI_Barrier(WorldShmComm);
#ifndef GRID_MPI3_SHM_NONE
    //////////////////////////////////////////////////
    // If it is me, pass around the IPC access key
@ -602,32 +536,24 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void * thisBuf = ShmCommBuf; void * thisBuf = ShmCommBuf;
if(!Stencil_force_mpi) { if(!Stencil_force_mpi) {
#ifdef GRID_SYCL_LEVEL_ZERO_IPC #ifdef GRID_SYCL_LEVEL_ZERO_IPC
typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t; typedef struct { int fd; pid_t pid ; } clone_mem_t;
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device()); auto zeDevice = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
auto zeContext = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context()); auto zeContext = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
ze_ipc_mem_handle_t ihandle; ze_ipc_mem_handle_t ihandle;
clone_mem_t handle; clone_mem_t handle;
if ( r==WorldShmRank ) { if ( r==WorldShmRank ) {
auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle); auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle);
if ( err != ZE_RESULT_SUCCESS ) { if ( err != ZE_RESULT_SUCCESS ) {
std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} else { } else {
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
} }
memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int)); memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
handle.pid = getpid(); handle.pid = getpid();
memcpy((void *)&handle.ze,(void *)&ihandle,sizeof(ihandle));
#ifdef SHM_SOCKETS
for(int rr=0;rr<WorldShmSize;rr++){
if(rr!=r){
UnixSockets::SendFileDescriptor(handle.fd,rr);
}
}
#endif
    }
#endif
#ifdef GRID_CUDA
@ -655,7 +581,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    // Share this IPC handle across the Shm Comm
    //////////////////////////////////////////////////
    {
MPI_Barrier(WorldShmComm);
      int ierr=MPI_Bcast(&handle,
			 sizeof(handle),
			 MPI_BYTE,
@ -671,10 +596,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
      if ( r!=WorldShmRank ) {
	thisBuf = nullptr;
int myfd;
#ifdef SHM_SOCKETS
myfd=UnixSockets::RecvFileDescriptor();
#else
	std::cout<<"mapping seeking remote pid/fd "
		 <<handle.pid<<"/"
		 <<handle.fd<<std::endl;
@ -682,22 +603,16 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
	int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
	std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
	//      int myfd  = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
	myfd  = syscall(438,pidfd,handle.fd,0);
	int myfd = syscall(438,pidfd,handle.fd,0);
int err_t = errno;
	if (myfd < 0) {
	std::cout<<"Using IpcHandle myfd "<<myfd<<"\n";
fprintf(stderr,"pidfd_getfd returned %d errno was %d\n", myfd,err_t); fflush(stderr);
perror("pidfd_getfd failed ");
assert(0);
}
#endif
std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
	memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
	auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf);
	if ( err != ZE_RESULT_SUCCESS ) {
	  std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
	  std::cout << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
	  std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
	  std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
	  exit(EXIT_FAILURE);
	} else {
	  std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
@ -732,7 +647,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#else
    WorldShmCommBufs[r] = ShmCommBuf;
#endif
MPI_Barrier(WorldShmComm);
  }
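Note: the remote-map path above pulls a peer rank's device file descriptor with the pidfd_open/pidfd_getfd pairing. A hedged minimal sketch of that mechanism on its own (Linux >= 5.6; raw syscall number 438 as in the code above; `steal_fd` is an illustrative name, and real code needs ptrace permission and error handling):

    // Sketch: duplicate file descriptor `theirfd`, owned by process `pid`,
    // into the calling process -- the trick the IPC exchange above relies on.
    #include <sys/syscall.h>
    #include <unistd.h>
    int steal_fd(pid_t pid, int theirfd)
    {
      int pidfd = syscall(SYS_pidfd_open, pid, 0);
      if (pidfd < 0) return -1;
      int myfd = syscall(438 /* pidfd_getfd */, pidfd, theirfd, 0);
      close(pidfd);
      return myfd;   // local fd referring to the peer's open description
    }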
  _ShmAllocBytes=bytes;
@ -744,7 +658,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_MPI3_SHMMMAP
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
  std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
  std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -781,7 +695,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    assert(((uint64_t)ptr&0x3F)==0);
    close(fd);
    WorldShmCommBufs[r] =ptr;
    // std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
    // std::cout << header "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
  }
  _ShmAlloc=1;
  _ShmAllocBytes = bytes;
@ -791,7 +705,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_MPI3_SHM_NONE
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
  std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
  std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -838,7 +752,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  ////////////////////////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
  std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
  std::cout << header "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  MPI_Barrier(WorldShmComm);
@ -48,10 +48,9 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
  _ShmSetup=1;
}
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{
  optimal_comm = WorldComm;
  SHM = Coordinate(processors.size(),1);
}
////////////////////////////////////////////////////////////////////////////////////////////
@ -297,30 +297,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
    }
  }
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
template <typename T>
T iDivUp(T a, T b) // Round a / b to nearest higher integer value
{ return (a % b != 0) ? (a / b + 1) : (a / b); }
template <typename T>
__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx >= e1*e2) return;
int n, b, o;
n = idx / e2;
b = idx % e2;
o = n*stride + b;
vector[2*idx + 0] = lo + o;
vector[2*idx + 1] = ro + o;
}
#endif
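Note: the kernel above fills the same offset table that the scalar loops below build. A host-side sketch of what each entry holds (`make_table` is a hypothetical free function mirroring the kernel's index arithmetic):

    // Sketch: entry idx of the table holds the (lo,ro) plane offsets for
    // block (n,b), with n = idx/e2, b = idx%e2, o = n*stride + b.
    #include <utility>
    #include <vector>
    std::vector<std::pair<int,int>> make_table(int lo,int ro,int e1,int e2,int stride)
    {
      std::vector<std::pair<int,int>> tab(e1*e2);
      for(int idx=0; idx<e1*e2; idx++){
        int n = idx / e2, b = idx % e2;
        int o = n*stride + b;
        tab[idx] = { lo+o, ro+o };
      }
      return tab;
    }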
//////////////////////////////////////////////////////
// local to node block strided copies
//////////////////////////////////////////////////////
@ -345,20 +321,12 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
  int ent=0;
  if(cbmask == 0x3 ){
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
ent = e1*e2;
dim3 blockSize(acceleratorThreads());
dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
accelerator_barrier();
#else
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
	int o   =n*stride+b;
	Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
      }
    }
#endif
  } else {
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
@ -409,19 +377,11 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
  int ent=0;
  if ( cbmask == 0x3 ) {
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
ent = e1*e2;
dim3 blockSize(acceleratorThreads());
dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
accelerator_barrier();
#else
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
	int o  =n*stride;
	Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
    }}
#endif
  } else {
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
@ -47,4 +47,3 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_transfer.h>
#include <Grid/lattice/Lattice_basis.h>
#include <Grid/lattice/Lattice_crc.h>
#include <Grid/lattice/PaddedCell.h>
@ -63,7 +63,7 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate,
  typename std::remove_const<vobj>::type ret;
  typedef typename vobj::scalar_object scalar_object;
  //  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  const int Nsimd = vobj::vector_type::Nsimd();
@ -345,9 +345,7 @@ GridUnopClass(UnaryNot, Not(a));
GridUnopClass(UnaryTrace, trace(a));
GridUnopClass(UnaryTranspose, transpose(a));
GridUnopClass(UnaryTa, Ta(a));
GridUnopClass(UnarySpTa, SpTa(a));
GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
GridUnopClass(UnaryProjectOnSpGroup, ProjectOnSpGroup(a));
GridUnopClass(UnaryTimesI, timesI(a));
GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
GridUnopClass(UnaryAbs, abs(a));
@ -458,9 +456,7 @@ GRID_DEF_UNOP(operator!, UnaryNot);
GRID_DEF_UNOP(trace, UnaryTrace);
GRID_DEF_UNOP(transpose, UnaryTranspose);
GRID_DEF_UNOP(Ta, UnaryTa);
GRID_DEF_UNOP(SpTa, UnarySpTa);
GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
GRID_DEF_UNOP(ProjectOnSpGroup, UnaryProjectOnSpGroup);
GRID_DEF_UNOP(timesI, UnaryTimesI);
GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
GRID_DEF_UNOP(abs, UnaryAbs);  // abs overloaded in cmath C++98; DON'T do the
@ -36,7 +36,6 @@ NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("mult");
  ret.Checkerboard() = lhs.Checkerboard();
  autoView( ret_v , ret, AcceleratorWrite);
  autoView( lhs_v , lhs, AcceleratorRead);
@ -54,7 +53,6 @@ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("mac");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
@ -72,7 +70,6 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("sub");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
@ -89,7 +86,6 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
}
template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("add");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
@ -110,7 +106,6 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  GRID_TRACE("mult");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(lhs,ret);
  autoView( ret_v , ret, AcceleratorWrite);
@ -124,7 +119,6 @@ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  GRID_TRACE("mac");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,lhs);
  autoView( ret_v , ret, AcceleratorWrite);
@ -139,7 +133,6 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  GRID_TRACE("sub");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,lhs);
  autoView( ret_v , ret, AcceleratorWrite);
@ -153,7 +146,6 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
}
template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  GRID_TRACE("add");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(lhs,ret);
  autoView( ret_v , ret, AcceleratorWrite);
@ -171,7 +163,6 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("mult");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@ -186,7 +177,6 @@ void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("mac");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@ -201,7 +191,6 @@ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("sub");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@ -215,7 +204,6 @@ void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
}
template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("add");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@ -230,7 +218,6 @@ void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
template<class sobj,class vobj> inline
void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
  GRID_TRACE("axpy");
  ret.Checkerboard() = x.Checkerboard();
  conformable(ret,x);
  conformable(x,y);
@ -244,7 +231,6 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
}
template<class sobj,class vobj> inline
void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
  GRID_TRACE("axpby");
  ret.Checkerboard() = x.Checkerboard();
  conformable(ret,x);
  conformable(x,y);
@ -260,13 +246,11 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
template<class sobj,class vobj> inline
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
  GRID_TRACE("axpy_norm");
  return axpy_norm_fast(ret,a,x,y);
}
template<class sobj,class vobj> inline
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
  GRID_TRACE("axpby_norm");
  return axpby_norm_fast(ret,a,b,x,y);
}
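Note: the GRID_TRACE lines carried on one side of this diff mark scoped profiler regions around each arithmetic entry point. A sketch of the usual RAII shape such a macro expands to -- illustrative only, not Grid's actual definition:

    // Sketch: a scoped trace region -- enters a named range on construction
    // and leaves it on destruction, so one macro line covers every return path.
    #include <cstdio>
    struct TraceRegion {
      const char *name;
      TraceRegion(const char *n) : name(n) { printf("enter %s\n", name); }
      ~TraceRegion()                       { printf("leave %s\n", name); }
    };
    #define MY_TRACE(n) TraceRegion _trace_region_(n)
    void axpy_like(void){
      MY_TRACE("axpy");
      /* ... body; the region closes automatically ... */
    }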
@ -117,7 +117,6 @@ public:
  ////////////////////////////////////////////////////////////////////////////////
  template <typename Op, typename T1> inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
  {
    GRID_TRACE("ExpressionTemplateEval");
    GridBase *egrid(nullptr);
    GridFromExpression(egrid,expr);
    assert(egrid!=nullptr);
@ -130,7 +129,7 @@ public:
    auto exprCopy = expr;
    ExpressionViewOpen(exprCopy);
    auto me  = View(AcceleratorWriteDiscard);
    auto me  = View(AcceleratorWrite);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      auto tmp = eval(ss,exprCopy);
      coalescedWrite(me[ss],tmp);
@ -141,7 +140,6 @@ public:
  }
  template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
  {
    GRID_TRACE("ExpressionTemplateEval");
    GridBase *egrid(nullptr);
    GridFromExpression(egrid,expr);
    assert(egrid!=nullptr);
@ -154,7 +152,7 @@ public:
    auto exprCopy = expr;
    ExpressionViewOpen(exprCopy);
    auto me  = View(AcceleratorWriteDiscard);
    auto me  = View(AcceleratorWrite);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      auto tmp = eval(ss,exprCopy);
      coalescedWrite(me[ss],tmp);
@ -165,7 +163,6 @@ public:
  }
  template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
  {
    GRID_TRACE("ExpressionTemplateEval");
    GridBase *egrid(nullptr);
    GridFromExpression(egrid,expr);
    assert(egrid!=nullptr);
@ -177,7 +174,7 @@ public:
    this->checkerboard=cb;
    auto exprCopy = expr;
    ExpressionViewOpen(exprCopy);
    auto me  = View(AcceleratorWriteDiscard);
    auto me  = View(AcceleratorWrite);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      auto tmp = eval(ss,exprCopy);
      coalescedWrite(me[ss],tmp);
@ -248,7 +245,7 @@ public:
  ///////////////////////////////////////////
  // user defined constructor
  ///////////////////////////////////////////
  Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) {
  Lattice(GridBase *grid,ViewMode mode=AcceleratorWrite) {
    this->_grid = grid;
    resize(this->_grid->oSites());
    assert((((uint64_t)&this->_odata[0])&0xF) ==0);
@ -291,8 +288,8 @@ public:
    typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
    conformable(*this,r);
    this->checkerboard = r.Checkerboard();
    auto me =   View(AcceleratorWrite);
    auto him= r.View(AcceleratorRead);
    auto me =   View(AcceleratorWriteDiscard);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      coalescedWrite(me[ss],him(ss));
    });
@ -306,8 +303,8 @@ public:
  inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
    this->checkerboard = r.Checkerboard();
    conformable(*this,r);
    auto me =  View(AcceleratorWrite);
    auto him= r.View(AcceleratorRead);
    auto me =  View(AcceleratorWriteDiscard);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      coalescedWrite(me[ss],him(ss));
    });
@ -360,7 +357,7 @@ public:
template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
  typedef typename vobj::scalar_object sobj;
  for(int64_t g=0;g<o.Grid()->_gsites;g++){
  for(int g=0;g<o.Grid()->_gsites;g++){
    Coordinate gcoor;
    o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);
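Note: the operator= overloads above walk a lazily built expression tree once per site via eval(ss,expr). A minimal self-contained analogue of that pattern, with toy types that are not Grid's:

    // Sketch: a two-operand expression template. operator+ only builds a node;
    // assignment walks every site once, fusing the loop exactly as above.
    #include <cstdio>
    #include <vector>
    template<class L,class R> struct AddExpr {
      const L &l; const R &r;
      double eval(size_t ss) const { return l.eval(ss) + r.eval(ss); }
    };
    struct Field {
      std::vector<double> v;
      Field(size_t n,double x=0):v(n,x){}
      double eval(size_t ss) const { return v[ss]; }
      template<class E> Field &operator=(const E &e){
        for(size_t ss=0; ss<v.size(); ss++) v[ss]=e.eval(ss); // one fused pass
        return *this;
      }
    };
    template<class L,class R> AddExpr<L,R> operator+(const L&l,const R&r){ return {l,r}; }
    int main(void){
      Field a(4,1.0), b(4,2.0), c(4);
      c = a + b;              // no temporaries, a single loop
      printf("%g\n", c.v[0]); // 3
    }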
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
template<class vobj> void DumpSliceNorm(std::string s,const Lattice<vobj> &f,int mu=-1)
template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1)
{
  auto ff = localNorm2(f);
  if ( mu==-1 ) mu = f.Grid()->Nd()-1;
@ -32,6 +32,7 @@ template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
{
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@ -81,6 +82,7 @@ template<class vobj>
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
{
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@ -128,6 +130,7 @@ template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  GridBase *FullGrid = lhs.Grid();
@ -96,6 +96,9 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
  GridBase *grid=l.Grid();
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nsimd = grid->Nsimd();
  assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
@ -122,17 +125,14 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
//////////////////////////////////////////////////////////
// Peek a scalar object from the SIMD array
//////////////////////////////////////////////////////////
template<class vobj>
typename vobj::scalar_object peekSite(const Lattice<vobj> &l,const Coordinate &site){
  typename vobj::scalar_object s;
  peekSite(s,l,site);
  return s;
}
template<class vobj,class sobj>
void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
  GridBase *grid=l.Grid();
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nsimd = grid->Nsimd();
  assert( l.Checkerboard() == l.Grid()->CheckerBoard(site));
@ -173,11 +173,11 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
  idx= grid->iIndex(site);
  odx= grid->oIndex(site);
  const vector_type *vp = (const vector_type *) &l[odx];
  scalar_type * vp = (scalar_type *)&l[odx];
  scalar_type * pt = (scalar_type *)&s;
  for(int w=0;w<words;w++){
    pt[w] = getlane(vp[w],idx);
    pt[w] = vp[idx+w*Nsimd];
  }
  return;
@ -210,10 +210,10 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
  idx= grid->iIndex(site);
  odx= grid->oIndex(site);
  vector_type * vp = (vector_type *)&l[odx];
  scalar_type * vp = (scalar_type *)&l[odx];
  scalar_type * pt = (scalar_type *)&s;
  for(int w=0;w<words;w++){
    putlane(vp[w],pt[w],idx);
    vp[idx+w*Nsimd] = pt[w];
  }
  return;
};
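Note: both variants above pull out the same scalars; one hides the stride behind getlane/putlane, the other indexes the flattened lane array directly. A toy illustration of why `vp[idx + w*Nsimd]` selects lane `idx` of SIMD word `w` (plain doubles standing in for Grid's vector types):

    // Sketch: memory of one vector word is [lane0 lane1 ... laneN-1], so an
    // object made of `words` vector words stores scalar (w,lane) at lane + w*Nsimd.
    #include <cstdio>
    int main(void){
      const int Nsimd=4, words=2;
      double obj[words*Nsimd];
      for(int w=0;w<words;w++)
        for(int l=0;l<Nsimd;l++)
          obj[w*Nsimd+l] = 10*w + l;        // value encodes (word,lane)
      int idx=2;                            // the SIMD lane of this site
      for(int w=0;w<words;w++)
        printf("%g ", obj[idx + w*Nsimd]);  // prints: 2 12  (lane 2 of each word)
      printf("\n");
    }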
@ -94,7 +94,10 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
  for(int i=0;i<nthread;i++){
    ssum = ssum+sumarray[i];
  }
  return ssum;
  typedef typename vobj::scalar_object ssobj;
  ssobj ret = ssum;
  return ret;
}
/*
Threaded max, don't use for now
@ -153,44 +156,33 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
}
template<class vobj>
inline typename vobj::scalar_object rankSum(const Lattice<vobj> &arg)
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
  Integer osites = arg.Grid()->oSites();
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  typename vobj::scalar_object ssum;
  autoView( arg_v, arg, AcceleratorRead);
  return sum_gpu(&arg_v[0],osites);
  ssum= sum_gpu(&arg_v[0],osites);
#else
  autoView(arg_v, arg, CpuRead);
  return sum_cpu(&arg_v[0],osites);
  auto ssum= sum_cpu(&arg_v[0],osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
auto ssum = rankSum(arg);
  arg.Grid()->GlobalSum(ssum);
  return ssum;
}
template<class vobj>
inline typename vobj::scalar_object rankSumLarge(const Lattice<vobj> &arg)
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  autoView( arg_v, arg, AcceleratorRead);
  Integer osites = arg.Grid()->oSites();
  return sum_gpu_large(&arg_v[0],osites);
  auto ssum= sum_gpu_large(&arg_v[0],osites);
#else
  autoView(arg_v, arg, CpuRead);
  Integer osites = arg.Grid()->oSites();
  return sum_cpu(&arg_v[0],osites);
  auto ssum= sum_cpu(&arg_v[0],osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
{
auto ssum = rankSumLarge(arg);
  arg.Grid()->GlobalSum(ssum);
  return ssum;
}
@ -233,6 +225,7 @@ template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
template<class vobj>
inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
{
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_typeD vector_type;
  ComplexD  nrm;
@ -242,7 +235,6 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
  const uint64_t sites = grid->oSites();
  // Might make all code paths go this way.
#if 0
  typedef decltype(innerProductD(vobj(),vobj())) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
@ -251,31 +243,15 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
    autoView( left_v , left, AcceleratorRead);
    autoView( right_v,right, AcceleratorRead);
    // This code could read coalesce
    // GPU - SIMT lane compliance...
    accelerator_for( ss, sites, nsimd,{
    accelerator_for( ss, sites, 1,{
      auto x_l = left_v(ss);
      auto x_l = left_v[ss];
      auto y_l = right_v(ss);
      auto y_l = right_v[ss];
      coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l));
      inner_tmp_v[ss]=innerProductD(x_l,y_l);
    });
  }
#else
typedef decltype(innerProduct(vobj(),vobj())) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
{
autoView( left_v , left, AcceleratorRead);
autoView( right_v,right, AcceleratorRead);
// GPU - SIMT lane compliance...
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss);
auto y_l = right_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
});
}
#endif
// This is in single precision and fails some tests // This is in single precision and fails some tests
  auto anrm = sumD(inner_tmp_v,sites);
  auto anrm = sum(inner_tmp_v,sites);
  nrm = anrm;
  return nrm;
}
@ -308,7 +284,8 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
  conformable(z,x);
  conformable(x,y);
  //  typedef typename vobj::vector_typeD vector_type;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_typeD vector_type;
  RealD  nrm;
  GridBase *grid = x.Grid();
@ -320,29 +297,17 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
  autoView( x_v, x, AcceleratorRead);
  autoView( y_v, y, AcceleratorRead);
  autoView( z_v, z, AcceleratorWrite);
#if 0
  typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  accelerator_for( ss, sites, nsimd,{
  accelerator_for( ss, sites, 1,{
    auto tmp = a*x_v(ss)+b*y_v(ss);
    auto tmp = a*x_v[ss]+b*y_v[ss];
    coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp));
    inner_tmp_v[ss]=innerProductD(tmp,tmp);
    coalescedWrite(z_v[ss],tmp);
    z_v[ss]=tmp;
  });
  nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
#else
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
coalescedWrite(z_v[ss],tmp);
});
nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
#endif
  grid->GlobalSum(nrm);
  return nrm;
}
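Note: the two code paths in this hunk differ mainly in whether per-site inner products are accumulated in single or double precision before the global sum. A tiny self-contained demonstration of why the accumulator width matters:

    // Sketch: summing ten million O(1) terms. The float accumulator saturates
    // once it reaches 2^24, the double accumulator stays exact.
    #include <cstdio>
    int main(void){
      float  fsum=0.f;
      double dsum=0.0;
      for(int i=0;i<10000000;i++){ fsum += 1.0f; dsum += 1.0; }
      printf("float  sum = %.1f\n", fsum);  // 16777216.0 -- stuck at 2^24
      printf("double sum = %.1f\n", dsum);  // 10000000.0 -- exact
    }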
@ -352,6 +317,7 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
{
  conformable(left,right);
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_typeD vector_type;
  Vector<ComplexD> tmp(2);
@ -495,14 +461,6 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  int words = fd*sizeof(sobj)/sizeof(scalar_type);
  grid->GlobalSumVector(ptr, words);
}
template<class vobj> inline
std::vector<typename vobj::scalar_object>
sliceSum(const Lattice<vobj> &Data,int orthogdim)
{
std::vector<typename vobj::scalar_object> result;
sliceSum(Data,result,orthogdim);
return result;
}
template<class vobj>
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
@ -607,8 +565,7 @@ static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Ortho
template<class vobj>
static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
			    int orthogdim,RealD scale=1.0)
{
  // perhaps easier to just promote A to a field and use regular madd
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
@ -639,7 +596,8 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
    for(int l=0;l<Nsimd;l++){
      grid->iCoorFromIindex(icoor,l);
      int ldx =r+icoor[orthogdim]*rd;
      av.putlane(scalar_type(a[ldx])*zscale,l);
      scalar_type *as =(scalar_type *)&av;
      as[l] = scalar_type(a[ldx])*zscale;
    }
    tensor_reduced at; at=av;
@ -679,6 +637,7 @@ template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
{
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@ -732,6 +691,7 @@ template<class vobj>
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
{
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@ -785,6 +745,7 @@ template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  GridBase *FullGrid = lhs.Grid();
@ -211,25 +211,13 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
  assert(ok);
  Integer smemSize = numThreads * sizeof(sobj);
// Move out of UVM
// Turns out I had messed up the synchronise after move to compute stream
// as running this on the default stream fools the synchronise
#undef UVM_BLOCK_BUFFER
#ifndef UVM_BLOCK_BUFFER
commVector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0];
sobj result;
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
accelerator_barrier();
acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
#else
  Vector<sobj> buffer(numBlocks);
  sobj *buffer_v = &buffer[0];
  sobj result;
  reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
  accelerator_barrier();
  result = *buffer_v;
  auto result = buffer_v[0];
#endif
  return result;
}
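Note: both variants of sumD_gpu_small are two-stage reductions -- stage one produces one partial sum per thread block, stage two folds the partials. The same shape in plain C++ (illustrative sizes; a serial stand-in for the GPU blocks):

    // Sketch: per-"block" partial sums, then a final fold of the partials.
    #include <cstdio>
    #include <numeric>
    #include <vector>
    int main(void){
      const int size=1000, numBlocks=8;
      std::vector<double> lat(size,1.0), partial(numBlocks,0.0);
      for(int b=0;b<numBlocks;b++)             // stage 1: one sum per block
        for(int i=b;i<size;i+=numBlocks)
          partial[b]+=lat[i];
      double result=std::accumulate(partial.begin(),partial.end(),0.0); // stage 2
      printf("%g\n",result);                   // 1000
    }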
@ -262,6 +250,8 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
{
  typedef typename vobj::vector_type  vector;
  typedef typename vobj::scalar_typeD scalarD;
  typedef typename vobj::scalar_objectD sobj;
  sobj ret;
@ -361,14 +361,9 @@ public:
    _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
    _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
  }
  template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist)
  {
  template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){
if ( l.Grid()->_isCheckerBoarded ) {
Lattice<vobj> tmp(_grid);
fill(tmp,dist);
pickCheckerboard(l.Checkerboard(),l,tmp);
return;
}
    typedef typename vobj::scalar_object scalar_object;
    typedef typename vobj::scalar_type scalar_type;
    typedef typename vobj::vector_type vector_type;
@ -429,33 +424,9 @@ public:
    // MT implementation does not implement fast discard even though
    // in principle this is possible
    ////////////////////////////////////////////////
#if 1
thread_for( lidx, _grid->lSites(), {
int64_t gidx;
int o_idx;
int i_idx;
int rank;
Coordinate pcoor;
Coordinate lcoor;
Coordinate gcoor;
_grid->LocalIndexToLocalCoor(lidx,lcoor);
pcoor=_grid->ThisProcessorCoor();
_grid->ProcessorCoorLocalCoorToGlobalCoor(pcoor,lcoor,gcoor);
_grid->GlobalCoorToGlobalIndex(gcoor,gidx);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
assert(rank == _grid->ThisRank() );
int l_idx=generator_idx(o_idx,i_idx);
_generators[l_idx] = master_engine;
Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
});
#else
    // Everybody loops over global volume.
    thread_for( gidx, _grid->_gsites, {
      // Where is it?
      int rank;
      int o_idx;
@ -472,7 +443,6 @@ public:
	  Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
	}
      });
#endif
#else
    ////////////////////////////////////////////////////////////////
    // Machine and thread decomposition dependent seeding is efficient
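Note: both seeding variants above give every site the master engine advanced to a site-dependent offset, so all sites draw from one logical stream without overlap. A toy version of the idea with std::mt19937 (its discard is O(n); Grid's Skip is a true fast skip-ahead, so this is purely illustrative):

    // Sketch: generator for site i = copy of the master engine, skipped
    // ahead by i*stride draws -- the Skip(_generators[l_idx],gidx) analogue.
    #include <cstdio>
    #include <random>
    int main(void){
      const unsigned long stride = 16;   // draws reserved per site
      std::mt19937 master(42);
      for(int site=0; site<4; site++){
        std::mt19937 g = master;         // copy the master state
        g.discard(site*stride);          // advance to this site's sub-stream
        printf("site %d first draw %u\n", site, g());
      }
    }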
@ -0,0 +1,126 @@
NAMESPACE_BEGIN(Grid);
// If NOT CUDA or HIP -- we should provide
// -- atomicAdd(float *,float)
// -- atomicAdd(double *,double)
//
// Augment CUDA with complex atomics
#if !defined(GRID_HIP) && !defined(GRID_CUDA)
inline void atomicAdd(float *acc,float elem)
{
*acc += elem;
}
inline void atomicAdd(double *acc,double elem)
{
*acc += elem;
}
#endif
inline void atomicAdd(ComplexD *accum,ComplexD & elem)
{
double *a_p = (double *)accum;
double *e_p = (double *)&elem;
for(int w=0;w<2;w++){
atomicAdd(&a_p[w],e_p[w]);
}
}
inline void atomicAdd(ComplexF *accum,ComplexF & elem)
{
float *a_p = (float *)accum;
float *e_p = (float *)&elem;
for(int w=0;w<2;w++){
atomicAdd(&a_p[w],e_p[w]);
}
}
// Augment CUDA with vobj atomics
template<class vobj> accelerator_inline void atomicAdd(vobj *accum, vobj & elem)
{
  typedef typename vobj::scalar_type scalar_type;
  // One scalar atomic per underlying scalar element. The element count is
  // sizeof(vobj)/sizeof(scalar_type), not Nsimd(), once the object carries
  // more than one SIMD word (e.g. matrix-valued objects).
  const int words = sizeof(vobj)/sizeof(scalar_type);
  scalar_type *a_p= (scalar_type *)accum;
  scalar_type *e_p= (scalar_type *)& elem;
  for(int w=0;w<words;w++){
    atomicAdd(&a_p[w],e_p[w]);
  }
}
// Atomics based slice sum
template<class vobj> inline void sliceSumGpu(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_object::scalar_type scalar_type;
GridBase *grid = Data.Grid();
assert(grid!=NULL);
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
assert(orthogdim >= 0);
assert(orthogdim < Nd);
int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
// Move to device memory and copy in / out
Vector<vobj> lvSum(rd); // will locally sum vectors first
Vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node
for(int r=0;r<rd;r++){
lvSum[r]=Zero();
}
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
// sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction
autoView( Data_v, Data, AcceleratorRead);
auto lvSum_p=&lvSum[0];
int ostride = grid->_ostride[orthogdim];
accelerator_for( ree,rd*e1*e2,1, {
int b = ree%e2;
int re= ree/e2;
int n=re%e1;
int r=re/e1;
int so=r*ostride;
int ss=so+n*stride+b;
atomicAdd(&lvSum_p[r],Data_v[ss]);
});
// Sum across simd lanes in the plane, breaking out orthog dir.
Coordinate icoor(Nd);
for(int rt=0;rt<rd;rt++){
extract(lvSum[rt],extracted);
for(int idx=0;idx<Nsimd;idx++){
grid->iCoorFromIindex(icoor,idx);
int ldx =rt+icoor[orthogdim]*rd;
lsSum[ldx]=lsSum[ldx]+extracted[idx];
}
}
// sum over nodes.
for(int t=0;t<fd;t++){
int pt = t/ld; // processor plane
int lt = t%ld;
if ( pt == grid->_processor_coor[orthogdim] ) {
result[t]=lsSum[lt];
} else {
result[t]=Zero();
}
}
scalar_type * ptr = (scalar_type *) &result[0];
int words = fd*sizeof(sobj)/sizeof(scalar_type);
grid->GlobalSumVector(ptr, words);
}
NAMESPACE_END(Grid);
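Note: the essential trick in sliceSumGpu is one shared accumulator per reduced-dimension coordinate, updated with an atomic add from every site, instead of per-thread scratch plus a second reduction pass. A self-contained analogue of that pattern in plain C++ threads (illustrative sizes; std::atomic<double>::fetch_add needs C++20):

    // Sketch: every site adds its value into the accumulator of its slice.
    #include <atomic>
    #include <cstdio>
    #include <thread>
    #include <vector>
    int main(void){
      const int Lt=8, Lx=1024;
      std::vector<double> data(Lt*Lx, 1.0);
      std::vector<std::atomic<double>> slice(Lt);
      for(auto &s : slice) s.store(0.0);
      auto work = [&](int t0,int t1){
        for(int t=t0;t<t1;t++)
          for(int x=0;x<Lx;x++)
            slice[t].fetch_add(data[t*Lx+x]); // atomicAdd(&lvSum_p[r],...) analogue
      };
      std::thread a(work,0,Lt/2), b(work,Lt/2,Lt);
      a.join(); b.join();
      for(int t=0;t<Lt;t++) printf("slice %d = %g\n", t, slice[t].load());
    }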
@ -66,65 +66,6 @@ inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<
  return ret;
};
template<int N, class Vec>
Lattice<iScalar<iScalar<iScalar<Vec> > > > Determinant(const Lattice<iScalar<iScalar<iMatrix<Vec, N> > > > &Umu)
{
GridBase *grid=Umu.Grid();
auto lvol = grid->lSites();
Lattice<iScalar<iScalar<iScalar<Vec> > > > ret(grid);
typedef typename Vec::scalar_type scalar;
autoView(Umu_v,Umu,CpuRead);
autoView(ret_v,ret,CpuWrite);
thread_for(site,lvol,{
Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N);
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
iScalar<iScalar<iMatrix<scalar, N> > > Us;
peekLocalSite(Us, Umu_v, lcoor);
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
scalar tmp= Us()()(i,j);
ComplexD ztmp(real(tmp),imag(tmp));
EigenU(i,j)=ztmp;
}}
ComplexD detD = EigenU.determinant();
typename Vec::scalar_type det(detD.real(),detD.imag());
pokeLocalSite(det,ret_v,lcoor);
});
return ret;
}
template<int N>
Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > Inverse(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
{
GridBase *grid=Umu.Grid();
auto lvol = grid->lSites();
Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > ret(grid);
autoView(Umu_v,Umu,CpuRead);
autoView(ret_v,ret,CpuWrite);
thread_for(site,lvol,{
Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N);
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
iScalar<iScalar<iMatrix<ComplexD, N> > > Us;
iScalar<iScalar<iMatrix<ComplexD, N> > > Ui;
peekLocalSite(Us, Umu_v, lcoor);
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
EigenU(i,j) = Us()()(i,j);
}}
Eigen::MatrixXcd EigenUinv = EigenU.inverse();
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
Ui()()(i,j) = EigenUinv(i,j);
}}
pokeLocalSite(Ui,ret_v,lcoor);
});
return ret;
}
NAMESPACE_END(Grid);
#endif

View File

@ -194,11 +194,11 @@ accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
#endif #endif
accelerator_inline void convertType(vComplexF & out, const vComplexD2 & in) { accelerator_inline void convertType(vComplexF & out, const vComplexD2 & in) {
precisionChange(out,in); out.v = Optimization::PrecisionChange::DtoS(in._internal[0].v,in._internal[1].v);
} }
accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) { accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) {
precisionChange(out,in); Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v);
} }
template<typename T1,typename T2> template<typename T1,typename T2>
@ -288,36 +288,7 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed); blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);
} }
} }
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
const std::vector<Lattice<vobj>> &fineData,
const VLattice &Basis)
{
int NBatch = fineData.size();
assert(coarseData.size() == NBatch);
GridBase * fine = fineData[0].Grid();
GridBase * coarse= coarseData[0].Grid();
Lattice<iScalar<CComplex>> ip(coarse);
std::vector<Lattice<vobj>> fineDataCopy = fineData;
autoView(ip_, ip, AcceleratorWrite);
for(int v=0;v<nbasis;v++) {
for (int k=0; k<NBatch; k++) {
autoView( coarseData_ , coarseData[k], AcceleratorWrite);
blockInnerProductD(ip,Basis[v],fineDataCopy[k]); // ip = <basis|fine>
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
convertType(coarseData_[sc](v),ip_[sc]);
});
// improve numerical stability of projection
// |fine> = |fine> - <basis|fine> |basis>
ip=-ip;
blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]);
}
}
}
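batchBlockProject amortises the view setup and kernel launches over a whole batch of fine fields, and the working copy plus the ip=-ip subtraction gives modified-Gram-Schmidt-style stability, exactly as in the single-field blockProject. A sketch of the intended call pattern, assuming Grid is initialised and that vTComplex is the scalar-in-SIMD singlet type; the grids, fermion fields and nbasis value are illustrative:

#include <Grid/Grid.h>
using namespace Grid;

template<int nbasis>
void projectBatch(GridBase *CoarseGrid,
                  const std::vector<LatticeFermion> &fine,   // NBatch fine fields
                  const std::vector<LatticeFermion> &basis)  // nbasis subspace vectors
{
  typedef Lattice<iVector<vTComplex,nbasis>> CoarseVector;
  std::vector<CoarseVector> coarse(fine.size(), CoarseVector(CoarseGrid));
  batchBlockProject(coarse, fine, basis);   // one sweep over the whole batch
}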
template<class vobj,class vobj2,class CComplex> template<class vobj,class vobj2,class CComplex>
inline void blockZAXPY(Lattice<vobj> &fineZ, inline void blockZAXPY(Lattice<vobj> &fineZ,
@ -471,13 +442,13 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
vobj zz = Zero(); vobj zz = Zero();
accelerator_for(sc,coarse->oSites(),vobj::Nsimd(),{ accelerator_for(sc,coarse->oSites(),1,{
// One thread per sub block // One thread per sub block
Coordinate coor_c(_ndimension); Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate
auto cd = coalescedRead(zz); vobj cd = zz;
for(int sb=0;sb<blockVol;sb++){ for(int sb=0;sb<blockVol;sb++){
@ -488,10 +459,10 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d]; for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions); Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
cd=cd+coalescedRead(fineData_p[sf]); cd=cd+fineData_p[sf];
} }
coalescedWrite(coarseData_p[sc],cd); coarseData_p[sc] = cd;
}); });
return; return;
@ -619,26 +590,6 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
} }
#endif #endif
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
std::vector<Lattice<vobj>> &fineData,
const VLattice &Basis)
{
int NBatch = coarseData.size();
assert(fineData.size() == NBatch);
GridBase * fine = fineData[0].Grid();
GridBase * coarse = coarseData[0].Grid();
for (int k=0; k<NBatch; k++)
fineData[k]=Zero();
for (int i=0;i<nbasis;i++) {
for (int k=0; k<NBatch; k++) {
Lattice<iScalar<CComplex>> ip = PeekIndex<0>(coarseData[k],i);
blockZAXPY(fineData[k],ip,Basis[i],fineData[k]);
}
}
}
// Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars. // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
// Simd layouts need not match since we use peek/poke Local // Simd layouts need not match since we use peek/poke Local
template<class vobj,class vvobj> template<class vobj,class vvobj>
@ -697,68 +648,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
for(int d=0;d<nd;d++){ for(int d=0;d<nd;d++){
assert(Fg->_processors[d] == Tg->_processors[d]); assert(Fg->_processors[d] == Tg->_processors[d]);
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
#if 1
size_t nsite = 1;
for(int i=0;i<nd;i++) nsite *= RegionSize[i];
size_t tbytes = 4*nsite*sizeof(int);
int *table = (int*)malloc(tbytes);
thread_for(idx, nsite, {
Coordinate from_coor, to_coor;
size_t rem = idx;
for(int i=0;i<nd;i++){
size_t base_i = rem % RegionSize[i]; rem /= RegionSize[i];
from_coor[i] = base_i + FromLowerLeft[i];
to_coor[i] = base_i + ToLowerLeft[i];
}
int foidx = Fg->oIndex(from_coor);
int fiidx = Fg->iIndex(from_coor);
int toidx = Tg->oIndex(to_coor);
int tiidx = Tg->iIndex(to_coor);
int* tt = table + 4*idx;
tt[0] = foidx;
tt[1] = fiidx;
tt[2] = toidx;
tt[3] = tiidx;
});
int* table_d = (int*)acceleratorAllocDevice(tbytes);
acceleratorCopyToDevice(table,table_d,tbytes);
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
autoView(from_v,From,AcceleratorRead);
autoView(to_v,To,AcceleratorWrite);
accelerator_for(idx,nsite,1,{
static const int words=sizeof(vobj)/sizeof(vector_type);
int* tt = table_d + 4*idx;
int from_oidx = *tt++;
int from_lane = *tt++;
int to_oidx = *tt++;
int to_lane = *tt;
const vector_type* from = (const vector_type *)&from_v[from_oidx];
vector_type* to = (vector_type *)&to_v[to_oidx];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp = getlane(from[w], from_lane);
putlane(to[w], stmp, to_lane);
}
});
acceleratorFreeDevice(table_d);
free(table);
#else
Coordinate ldf = Fg->_ldimensions; Coordinate ldf = Fg->_ldimensions;
Coordinate rdf = Fg->_rdimensions; Coordinate rdf = Fg->_rdimensions;
Coordinate isf = Fg->_istride; Coordinate isf = Fg->_istride;
@ -767,9 +658,9 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
Coordinate ist = Tg->_istride; Coordinate ist = Tg->_istride;
Coordinate ost = Tg->_ostride; Coordinate ost = Tg->_ostride;
autoView( t_v , To, CpuWrite); autoView( t_v , To, AcceleratorWrite);
autoView( f_v , From, CpuRead); autoView( f_v , From, AcceleratorRead);
thread_for(idx,Fg->lSites(),{ accelerator_for(idx,Fg->lSites(),1,{
sobj s; sobj s;
Coordinate Fcoor(nd); Coordinate Fcoor(nd);
Coordinate Tcoor(nd); Coordinate Tcoor(nd);
@ -782,24 +673,17 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d]; Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d];
} }
if (in_region) { if (in_region) {
#if 0 Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]);
Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]); // inner index from Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]);
Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]); // inner index to Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]);
Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]); // outer index from Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]);
Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]); // outer index to
scalar_type * fp = (scalar_type *)&f_v[odx_f]; scalar_type * fp = (scalar_type *)&f_v[odx_f];
scalar_type * tp = (scalar_type *)&t_v[odx_t]; scalar_type * tp = (scalar_type *)&t_v[odx_t];
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
tp[w].putlane(fp[w].getlane(idx_f),idx_t); tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd]; // FIXME IF RRII layout, type pun no worke
} }
#else
peekLocalSite(s,f_v,Fcoor);
pokeLocalSite(s,t_v,Tcoor);
#endif
} }
}); });
#endif
} }
@ -892,8 +776,6 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
} }
//Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim
//The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same
template<class vobj> template<class vobj>
void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog) void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
{ {
@ -910,70 +792,11 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
for(int d=0;d<nh;d++){ for(int d=0;d<nh;d++){
if ( d!=orthog ) { if ( d!=orthog ) {
assert(lg->_processors[d] == hg->_processors[d]); assert(lg->_processors[d] == hg->_processors[d]);
assert(lg->_ldimensions[d] == hg->_ldimensions[d]); assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
} }
} }
#if 1
size_t nsite = lg->lSites()/lg->LocalDimensions()[orthog];
size_t tbytes = 4*nsite*sizeof(int);
int *table = (int*)malloc(tbytes);
thread_for(idx,nsite,{
Coordinate lcoor(nl);
Coordinate hcoor(nh);
lcoor[orthog] = slice_lo;
hcoor[orthog] = slice_hi;
size_t rem = idx;
for(int mu=0;mu<nl;mu++){
if(mu != orthog){
int xmu = rem % lg->LocalDimensions()[mu]; rem /= lg->LocalDimensions()[mu];
lcoor[mu] = hcoor[mu] = xmu;
}
}
int loidx = lg->oIndex(lcoor);
int liidx = lg->iIndex(lcoor);
int hoidx = hg->oIndex(hcoor);
int hiidx = hg->iIndex(hcoor);
int* tt = table + 4*idx;
tt[0] = loidx;
tt[1] = liidx;
tt[2] = hoidx;
tt[3] = hiidx;
});
int* table_d = (int*)acceleratorAllocDevice(tbytes);
acceleratorCopyToDevice(table,table_d,tbytes);
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
autoView(lowDim_v,lowDim,AcceleratorRead);
autoView(higherDim_v,higherDim,AcceleratorWrite);
accelerator_for(idx,nsite,1,{
static const int words=sizeof(vobj)/sizeof(vector_type);
int* tt = table_d + 4*idx;
int from_oidx = *tt++;
int from_lane = *tt++;
int to_oidx = *tt++;
int to_lane = *tt;
const vector_type* from = (const vector_type *)&lowDim_v[from_oidx];
vector_type* to = (vector_type *)&higherDim_v[to_oidx];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp = getlane(from[w], from_lane);
putlane(to[w], stmp, to_lane);
}
});
acceleratorFreeDevice(table_d);
free(table);
#else
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuRead); autoView(lowDimv,lowDim,CpuRead);
autoView(higherDimv,higherDim,CpuWrite); autoView(higherDimv,higherDim,CpuWrite);
@ -989,7 +812,6 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
pokeLocalSite(s,higherDimv,hcoor); pokeLocalSite(s,higherDimv,hcoor);
} }
}); });
#endif
} }
@ -1033,7 +855,7 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
template<class vobj> template<class vobj>
void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine) void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
@ -1054,7 +876,7 @@ void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
Coordinate fcoor(nd); Coordinate fcoor(nd);
Coordinate ccoor(nd); Coordinate ccoor(nd);
for(int64_t g=0;g<fg->gSites();g++){ for(int g=0;g<fg->gSites();g++){
fg->GlobalIndexToGlobalCoor(g,fcoor); fg->GlobalIndexToGlobalCoor(g,fcoor);
for(int d=0;d<nd;d++){ for(int d=0;d<nd;d++){
@ -1258,27 +1080,9 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
}); });
} }
//Very fast precision change. Requires in/out objects to reside on same Grid (e.g. by using double2 for the double-precision field) //Convert a Lattice from one precision to another
template<class VobjOut, class VobjIn> template<class VobjOut, class VobjIn>
void precisionChangeFast(Lattice<VobjOut> &out, const Lattice<VobjIn> &in) void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
{
typedef typename VobjOut::vector_type Vout;
typedef typename VobjIn::vector_type Vin;
const int N = sizeof(VobjOut)/sizeof(Vout);
conformable(out.Grid(),in.Grid());
out.Checkerboard() = in.Checkerboard();
int nsimd = out.Grid()->Nsimd();
autoView( out_v , out, AcceleratorWrite);
autoView( in_v , in, AcceleratorRead);
accelerator_for(idx,out.Grid()->oSites(),1,{
Vout *vout = (Vout *)&out_v[idx];
Vin *vin = (Vin *)&in_v[idx];
precisionChange(vout,vin,N);
});
}
//Convert a Lattice from one precision to another (original, slow implementation)
template<class VobjOut, class VobjIn>
void precisionChangeOrig(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
{ {
assert(out.Grid()->Nd() == in.Grid()->Nd()); assert(out.Grid()->Nd() == in.Grid()->Nd());
for(int d=0;d<out.Grid()->Nd();d++){ for(int d=0;d<out.Grid()->Nd();d++){
@ -1293,7 +1097,7 @@ void precisionChangeOrig(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
int ndim = out.Grid()->Nd(); int ndim = out.Grid()->Nd();
int out_nsimd = out_grid->Nsimd(); int out_nsimd = out_grid->Nsimd();
int in_nsimd = in_grid->Nsimd();
std::vector<Coordinate > out_icoor(out_nsimd); std::vector<Coordinate > out_icoor(out_nsimd);
for(int lane=0; lane < out_nsimd; lane++){ for(int lane=0; lane < out_nsimd; lane++){
@ -1324,128 +1128,6 @@ void precisionChangeOrig(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
}); });
} }
//The workspace for a precision change operation, allowing reuse of the site/lane mapping to save time on subsequent calls
class precisionChangeWorkspace{
std::pair<Integer,Integer>* fmap_device; //device pointer
//maintain grids for checking
GridBase* _out_grid;
GridBase* _in_grid;
public:
precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid): _out_grid(out_grid), _in_grid(in_grid){
//Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
assert(out_grid->Nd() == in_grid->Nd());
for(int d=0;d<out_grid->Nd();d++){
assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
}
int Nsimd_out = out_grid->Nsimd();
std::vector<Coordinate> out_icorrs(out_grid->Nsimd()); //reuse these
for(int lane=0; lane < out_grid->Nsimd(); lane++)
out_grid->iCoorFromIindex(out_icorrs[lane], lane);
std::vector<std::pair<Integer,Integer> > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd
thread_for(out_oidx,out_grid->oSites(),{
Coordinate out_ocorr;
out_grid->oCoorFromOindex(out_ocorr, out_oidx);
Coordinate lcorr; //the local coordinate (common to both in and out as full coordinate)
for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr);
//int in_oidx = in_grid->oIndex(lcorr), in_lane = in_grid->iIndex(lcorr);
//Note oIndex and OcorrFromOindex (and likewise iIndex) are not inverses of each other for a checkerboarded lattice: the former coordinates are defined on the full lattice and the latter on the reduced lattice
//Until this is fixed we need to circumvent the problem locally. Here I use the coordinates defined on the reduced lattice for simplicity
int in_oidx = 0, in_lane = 0;
for(int d=0;d<in_grid->_ndimension;d++){
in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] );
in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] );
}
fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair<Integer,Integer>( in_oidx, in_lane );
}
});
//Copy the map to the device (if we had a way to tell if an accelerator is in use we could avoid this copy for CPU-only machines)
size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair<Integer,Integer>);
fmap_device = (std::pair<Integer,Integer>*)acceleratorAllocDevice(fmap_bytes);
acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes);
}
//Prevent moving or copying
precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete;
precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete;
precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete;
precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete;
std::pair<Integer,Integer> const* getMap() const{ return fmap_device; }
void checkGrids(GridBase* out, GridBase* in) const{
conformable(out, _out_grid);
conformable(in, _in_grid);
}
~precisionChangeWorkspace(){
acceleratorFreeDevice(fmap_device);
}
};
//We would like to use precisionChangeFast when possible. However usage of this requires the Grids to be the same (runtime check)
//*and* the precisionChange(VobjOut::vector_type, VobjIn, int) function to be defined for the types; this requires an extra compile-time check which we do using some SFINAE trickery
template<class VobjOut, class VobjIn>
auto _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, int dummy)->decltype( precisionChange( ((typename VobjOut::vector_type*)0), ((typename VobjIn::vector_type*)0), 1), int()){
if(out.Grid() == in.Grid()){
precisionChangeFast(out,in);
return 1;
}else{
return 0;
}
}
template<class VobjOut, class VobjIn>
int _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, long dummy){ //note long here is intentional; it means the above is preferred if available
return 0;
}
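The int/long dummy argument is the classic overload-ranking trick: a literal 0 prefers the int overload, but the expression SFINAE in the trailing return type removes that overload whenever the fast-path precisionChange for the raw vector types does not exist, so the call silently falls back. The same mechanism in isolation (standalone sketch; Fast, Slow and fastPath are illustrative stand-ins):

#include <cstdio>
struct Fast {}; struct Slow {};
void fastPath(Fast){}
template<class T>
auto tryFast(T t, int) -> decltype(fastPath(t), int()) { fastPath(t); return 1; } // (A)
template<class T>
int tryFast(T, long) { return 0; }                                                // (B)
// A literal 0 is int, so (A) wins when fastPath(T) compiles; otherwise SFINAE
// deletes (A) and (B) is the only candidate.
int main(){ printf("%d %d\n", tryFast(Fast{},0), tryFast(Slow{},0)); return 0; }  // prints: 1 0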
//Convert a lattice of one precision to another. Much faster than original implementation but requires a pregenerated workspace
//which contains the mapping data.
template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
if(_precisionChangeFastWrap(out,in,0)) return;
static_assert( std::is_same<typename VobjOut::scalar_typeD, typename VobjIn::scalar_typeD>::value == 1, "precisionChange: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
out.Checkerboard() = in.Checkerboard();
constexpr int Nsimd_out = VobjOut::Nsimd();
workspace.checkGrids(out.Grid(),in.Grid());
std::pair<Integer,Integer> const* fmap_device = workspace.getMap();
//Do the copy/precision change
autoView( out_v , out, AcceleratorWrite);
autoView( in_v , in, AcceleratorRead);
accelerator_for(out_oidx, out.Grid()->oSites(), 1,{
std::pair<Integer,Integer> const* fmap_osite = fmap_device + out_oidx*Nsimd_out;
for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
int in_oidx = fmap_osite[out_lane].first;
int in_lane = fmap_osite[out_lane].second;
copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane);
}
});
}
//Convert a Lattice from one precision to another. Much faster than original implementation but slower than precisionChangeFast
//or precisionChange called with pregenerated workspace, as it needs to internally generate the workspace on the host and copy to device
template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
if(_precisionChangeFastWrap(out,in,0)) return;
precisionChangeWorkspace workspace(out.Grid(), in.Grid());
precisionChange(out, in, workspace);
}
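Since the map depends only on the pair of grids, it pays to hoist the workspace out of any loop that converts many fields between the same grids. A minimal sketch, assuming double- and single-precision fermion fields on grids of the same global volume; the function and grid names are illustrative:

#include <Grid/Grid.h>
using namespace Grid;

void convertBatch(GridBase *FGrid_f, GridBase *FGrid_d,
                  std::vector<LatticeFermionF> &out,
                  const std::vector<LatticeFermionD> &in)
{
  // Build the site/lane map once: (out_grid, in_grid), matching the ctor above
  precisionChangeWorkspace wk(FGrid_f, FGrid_d);
  for(size_t i=0;i<in.size();i++)
    precisionChange(out[i], in[i], wk);   // reuses the device-resident map
}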
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Communicate between grids // Communicate between grids
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////

View File

@ -1,565 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/PaddedCell.h
Copyright (C) 2019
Author: Peter Boyle pboyle@bnl.gov
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include<Grid/cshift/Cshift.h>
NAMESPACE_BEGIN(Grid);
//Allow the user to specify how the C-shift is performed, e.g. to respect the appropriate boundary conditions
template<typename vobj>
struct CshiftImplBase{
virtual Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const = 0;
virtual ~CshiftImplBase(){}
};
template<typename vobj>
struct CshiftImplDefault: public CshiftImplBase<vobj>{
Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const override{ return Grid::Cshift(in,dir,shift); }
};
template<typename Gimpl>
struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::vector_object>{
typename Gimpl::GaugeLinkField Cshift(const typename Gimpl::GaugeLinkField &in, int dir, int shift) const override{ return Gimpl::CshiftLink(in,dir,shift); }
};
/*
*
* TODO:
* -- address elements of vobj via thread block in Scatter/Gather
* -- overlap comms with motion in Face_exchange
*
*/
template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
Lattice<vobj> &lat,
int x,
int dim,
int offset=0)
{
const int Nsimd=vobj::Nsimd();
typedef typename vobj::scalar_object sobj;
GridBase *grid = lat.Grid();
Coordinate simd = grid->_simd_layout;
int Nd = grid->Nd();
int block = grid->_slice_block[dim];
int stride = grid->_slice_stride[dim];
int nblock = grid->_slice_nblock[dim];
int rd = grid->_rdimensions[dim];
int ox = x%rd;
int ix = x/rd;
int isites = 1; for(int d=0;d<Nd;d++) if( d!=dim) isites*=simd[d];
Coordinate rsimd= simd; rsimd[dim]=1; // maybe reduce Nsimd
int rNsimd = 1; for(int d=0;d<Nd;d++) rNsimd*=rsimd[d];
int rNsimda= Nsimd/simd[dim]; // should equal rNsimd (checked below)
assert(rNsimda==rNsimd);
int face_ovol=block*nblock;
// assert(buf.size()==face_ovol*rNsimd);
/*This will work on GPU ONLY unless rNsimd is put in the lexicographic index*/
//Let's make it work on GPU first, then write a special accelerator_for that
//doesn't hide the SIMD direction and keeps it explicit in the threadIdx,
//for cross-platform use
// FIXME -- can put internal indices into thread loop
auto buf_p = & buf[0];
autoView(lat_v, lat, AcceleratorRead);
accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
// scalar layout won't coalesce
int blane=acceleratorSIMTlane(Nsimd); // buffer lane
int olane=blane%rNsimd; // reduced lattice lane
int obit =blane/rNsimd;
///////////////////////////////////////////////////////////////
// osite -- potentially one bit from simd in the buffer: (ss<<1)|obit
///////////////////////////////////////////////////////////////
int ssp = ss*simd[dim]+obit;
int b = ssp%block;
int n = ssp/block;
int osite= b+n*stride + ox*block;
////////////////////////////////////////////
// isite -- map lane within buffer to lane within lattice
////////////////////////////////////////////
Coordinate icoor;
int lane;
Lexicographic::CoorFromIndex(icoor,olane,rsimd);
icoor[dim]=ix;
Lexicographic::IndexFromCoor(icoor,lane,simd);
///////////////////////////////////////////
// Transfer into lattice - will coalesce
///////////////////////////////////////////
sobj obj = extractLane(blane,buf_p[ss+offset]);
insertLane(lane,lat_v[osite],obj);
});
}
template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
const Lattice<vobj> &lat,
int x,
int dim,
int offset=0)
{
const int Nsimd=vobj::Nsimd();
typedef typename vobj::scalar_object sobj;
autoView(lat_v, lat, AcceleratorRead);
GridBase *grid = lat.Grid();
Coordinate simd = grid->_simd_layout;
int Nd = grid->Nd();
int block = grid->_slice_block[dim];
int stride = grid->_slice_stride[dim];
int nblock = grid->_slice_nblock[dim];
int rd = grid->_rdimensions[dim];
int ox = x%rd;
int ix = x/rd;
int isites = 1; for(int d=0;d<Nd;d++) if( d!=dim) isites*=simd[d];
Coordinate rsimd= simd; rsimd[dim]=1; // maybe reduce Nsimd
int rNsimd = 1; for(int d=0;d<Nd;d++) rNsimd*=rsimd[d];
int face_ovol=block*nblock;
// assert(buf.size()==face_ovol*rNsimd);
/*This will work on GPU ONLY unless rNsimd is put in the lexicographic index*/
//Let's make it work on GPU first, then write a special accelerator_for that
//doesn't hide the SIMD direction and keeps it explicit in the threadIdx,
//for cross-platform use
//For CPU, perhaps just run a loop over Nsimd
auto buf_p = & buf[0];
accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
// scalar layout won't coalesce
int blane=acceleratorSIMTlane(Nsimd); // buffer lane
int olane=blane%rNsimd; // reduced lattice lane
int obit =blane/rNsimd;
////////////////////////////////////////////
// osite
////////////////////////////////////////////
int ssp = ss*simd[dim]+obit;
int b = ssp%block;
int n = ssp/block;
int osite= b+n*stride + ox*block;
////////////////////////////////////////////
// isite -- map lane within buffer to lane within lattice
////////////////////////////////////////////
Coordinate icoor;
int lane;
Lexicographic::CoorFromIndex(icoor,olane,rsimd);
icoor[dim]=ix;
Lexicographic::IndexFromCoor(icoor,lane,simd);
///////////////////////////////////////////
// Take out of lattice
///////////////////////////////////////////
sobj obj = extractLane(lane,lat_v[osite]);
insertLane(blane,buf_p[ss+offset],obj);
});
/*
int words =block*nblock/simd[dim];
std::vector<vobj> tbuf(words);
acceleratorCopyFromDevice((void *)&buf[offset],(void *)&tbuf[0],words*sizeof(vobj));
typedef typename vobj::scalar_type scalar;
scalar *sbuf = (scalar *)&tbuf[0];
scalar tmp=0.0;
for(int w=0;w<words*sizeof(vobj)/sizeof(scalar);w++){
tmp=tmp+conjugate(sbuf[w])*sbuf[w];
}
std::cout << " Gathered buffer norm "<<tmp<<std::endl;
*/
}
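The buffer-lane decomposition above is the subtle part: a buffer lane blane splits into olane (the position within the reduced SIMD grid, with the sliced dimension removed) and obit (which of the simd[dim] sub-slices it came from), and obit is folded back into the outer-site index via ssp = ss*simd[dim]+obit. The arithmetic in isolation (standalone sketch with illustrative sizes):

#include <cstdio>
int main(void)
{
  const int Nsimd=8, simd_dim=2;        // e.g. simd layout {2,2,2,1}, sliced dim has simd 2
  const int rNsimd=Nsimd/simd_dim;      // lanes of the reduced SIMD grid
  for(int blane=0; blane<Nsimd; blane++){
    int olane=blane%rNsimd;             // reduced-lattice lane
    int obit =blane/rNsimd;             // which sub-slice of the split dimension
    printf("blane %d -> olane %d obit %d\n", blane, olane, obit);
  }
  return 0;
}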
class PaddedCell {
public:
GridCartesian * unpadded_grid;
int dims;
int depth;
std::vector<GridCartesian *> grids;
~PaddedCell()
{
DeleteGrids();
}
PaddedCell(int _depth,GridCartesian *_grid)
{
unpadded_grid = _grid;
depth=_depth;
dims=_grid->Nd();
AllocateGrids();
Coordinate local =unpadded_grid->LocalDimensions();
Coordinate procs =unpadded_grid->ProcessorGrid();
for(int d=0;d<dims;d++){
if ( procs[d] > 1 ) assert(local[d]>=depth);
}
}
void DeleteGrids(void)
{
for(int d=0;d<grids.size();d++){
delete grids[d];
}
grids.resize(0);
};
void AllocateGrids(void)
{
Coordinate local =unpadded_grid->LocalDimensions();
Coordinate simd =unpadded_grid->_simd_layout;
Coordinate processors=unpadded_grid->_processors;
Coordinate plocal =unpadded_grid->LocalDimensions();
Coordinate global(dims);
GridCartesian *old_grid = unpadded_grid;
// expand up one dim at a time
for(int d=0;d<dims;d++){
if ( processors[d] > 1 ) {
plocal[d] += 2*depth;
for(int d=0;d<dims;d++){
global[d] = plocal[d]*processors[d];
}
old_grid = new GridCartesian(global,simd,processors);
}
grids.push_back(old_grid);
}
};
template<class vobj>
inline Lattice<vobj> Extract(const Lattice<vobj> &in) const
{
Coordinate processors=unpadded_grid->_processors;
Lattice<vobj> out(unpadded_grid);
Coordinate local =unpadded_grid->LocalDimensions();
// depends on the MPI spread
Coordinate fll(dims,depth);
Coordinate tll(dims,0); // depends on the MPI spread
for(int d=0;d<dims;d++){
if( processors[d]==1 ) fll[d]=0;
}
localCopyRegion(in,out,fll,tll,local);
return out;
}
template<class vobj>
inline Lattice<vobj> Exchange(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
{
GridBase *old_grid = in.Grid();
int dims = old_grid->Nd();
Lattice<vobj> tmp = in;
for(int d=0;d<dims;d++){
tmp = Expand(d,tmp,cshift); // rvalue && assignment
}
return tmp;
}
template<class vobj>
inline Lattice<vobj> ExchangePeriodic(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
{
GridBase *old_grid = in.Grid();
int dims = old_grid->Nd();
Lattice<vobj> tmp = in;
for(int d=0;d<dims;d++){
tmp = ExpandPeriodic(d,tmp,cshift); // rvalue && assignment
}
return tmp;
}
// expand up one dim at a time
template<class vobj>
inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
{
Coordinate processors=unpadded_grid->_processors;
GridBase *old_grid = in.Grid();
GridCartesian *new_grid = grids[dim];//These are new grids
Lattice<vobj> padded(new_grid);
Lattice<vobj> shifted(old_grid);
Coordinate local =old_grid->LocalDimensions();
Coordinate plocal =new_grid->LocalDimensions();
if(dim==0) conformable(old_grid,unpadded_grid);
else conformable(old_grid,grids[dim-1]);
// std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
double tins=0, tshift=0;
int islocal = 0 ;
if ( processors[dim] == 1 ) islocal = 1;
if ( islocal ) {
// replace with a copy and maybe grid swizzle
double t = usecond();
padded = in;
tins += usecond() - t;
} else {
//////////////////////////////////////////////
// Replace sequence with
// ---------------------
// (i) Gather high face(s); start comms
// (ii) Gather low face(s); start comms
// (iii) Copy middle bit with localCopyRegion
// (iv) Complete high face(s), insert slice(s)
// (v)  Complete low face(s), insert slice(s)
//////////////////////////////////////////////
// Middle bit
double t = usecond();
for(int x=0;x<local[dim];x++){
InsertSliceLocal(in,padded,x,depth+x,dim);
}
tins += usecond() - t;
// High bit
t = usecond();
shifted = cshift.Cshift(in,dim,depth);
tshift += usecond() - t;
t=usecond();
for(int x=0;x<depth;x++){
InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
}
tins += usecond() - t;
// Low bit
t = usecond();
shifted = cshift.Cshift(in,dim,-depth);
tshift += usecond() - t;
t = usecond();
for(int x=0;x<depth;x++){
InsertSliceLocal(shifted,padded,x,x,dim);
}
tins += usecond() - t;
}
std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
return padded;
}
template<class vobj>
inline Lattice<vobj> ExpandPeriodic(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
{
Coordinate processors=unpadded_grid->_processors;
GridBase *old_grid = in.Grid();
GridCartesian *new_grid = grids[dim];//These are new grids
Lattice<vobj> padded(new_grid);
Lattice<vobj> shifted(old_grid);
Coordinate local =old_grid->LocalDimensions();
Coordinate plocal =new_grid->LocalDimensions();
if(dim==0) conformable(old_grid,unpadded_grid);
else conformable(old_grid,grids[dim-1]);
// std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
double tins=0, tshift=0;
int islocal = 0 ;
if ( processors[dim] == 1 ) islocal = 1;
if ( islocal ) {
// replace with a copy and maybe grid swizzle
double t = usecond();
padded = in;
tins += usecond() - t;
} else {
//////////////////////////////////////////////
// Replace sequence with
// ---------------------
// (i) Gather high face(s); start comms
// (ii) Gather low face(s); start comms
// (iii) Copy middle bit with localCopyRegion
// (iv) Complete high face(s), insert slice(s)
// (v)  Complete low face(s), insert slice(s)
//////////////////////////////////////////////
Face_exchange(in,padded,dim,depth);
}
return padded;
}
template<class vobj>
void Face_exchange(const Lattice<vobj> &from,
Lattice<vobj> &to,
int dimension,int depth) const
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::scalar_object sobj;
RealD t_gather=0.0;
RealD t_scatter=0.0;
RealD t_comms=0.0;
RealD t_copy=0.0;
// std::cout << GridLogMessage << "dimension " <<dimension<<std::endl;
// DumpSliceNorm(std::string("Face_exchange from"),from,dimension);
GridBase *grid=from.Grid();
GridBase *new_grid=to.Grid();
Coordinate lds = from.Grid()->_ldimensions;
Coordinate nlds= to.Grid()->_ldimensions;
Coordinate simd= from.Grid()->_simd_layout;
int ld = lds[dimension];
int nld = to.Grid()->_ldimensions[dimension];
const int Nsimd = vobj::Nsimd();
assert(depth<=lds[dimension]); // the halo must fit within the neighbouring node's local extent
assert(depth>0); // a depth of zero is a caller bug
assert(ld+2*depth==nld);
////////////////////////////////////////////////////////////////////////////
// Face size and byte calculations
////////////////////////////////////////////////////////////////////////////
int buffer_size = 1;
for(int d=0;d<lds.size();d++){
if ( d!= dimension) buffer_size=buffer_size*lds[d];
}
buffer_size = buffer_size / Nsimd;
int rNsimd = Nsimd / simd[dimension];
assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
static cshiftVector<vobj> send_buf;
static cshiftVector<vobj> recv_buf;
send_buf.resize(buffer_size*2*depth);
recv_buf.resize(buffer_size*2*depth);
std::vector<CommsRequest_t> fwd_req;
std::vector<CommsRequest_t> bwd_req;
int words = buffer_size;
int bytes = words * sizeof(vobj);
////////////////////////////////////////////////////////////////////////////
// Communication coords
////////////////////////////////////////////////////////////////////////////
int comm_proc = 1;
int xmit_to_rank;
int recv_from_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
////////////////////////////////////////////////////////////////////////////
// Gather all surface terms up to depth "d"
////////////////////////////////////////////////////////////////////////////
RealD t;
int plane=0;
for ( int d=0;d < depth ; d ++ ) {
int tag = d*1024 + dimension*2+0;
t=usecond();
GatherSlice(send_buf,from,d,dimension,plane*buffer_size); plane++;
t_gather+=usecond()-t;
t=usecond();
grid->SendToRecvFromBegin(fwd_req,
(void *)&send_buf[d*buffer_size], xmit_to_rank,
(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
t_comms+=usecond()-t;
}
for ( int d=0;d < depth ; d ++ ) {
int tag = d*1024 + dimension*2+1;
t=usecond();
GatherSlice(send_buf,from,ld-depth+d,dimension,plane*buffer_size); plane++;
t_gather+= usecond() - t;
t=usecond();
grid->SendToRecvFromBegin(bwd_req,
(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
t_comms+=usecond()-t;
}
////////////////////////////////////////////////////////////////////////////
// Copy interior -- overlap this with comms
////////////////////////////////////////////////////////////////////////////
int Nd = new_grid->Nd();
Coordinate LL(Nd,0);
Coordinate sz = grid->_ldimensions;
Coordinate toLL(Nd,0);
toLL[dimension]=depth;
t=usecond();
localCopyRegion(from,to,LL,toLL,sz);
t_copy= usecond() - t;
////////////////////////////////////////////////////////////////////////////
// Scatter all faces
////////////////////////////////////////////////////////////////////////////
// DumpSliceNorm(std::string("Face_exchange to before scatter"),to,dimension);
plane=0;
t=usecond();
grid->CommsComplete(fwd_req);
t_comms+= usecond() - t;
t=usecond();
for ( int d=0;d < depth ; d ++ ) {
ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++;
}
t_scatter= usecond() - t;
t=usecond();
grid->CommsComplete(bwd_req);
t_comms+= usecond() - t;
t=usecond();
for ( int d=0;d < depth ; d ++ ) {
ScatterSlice(recv_buf,to,d,dimension,plane*buffer_size); plane++;
}
t_scatter+= usecond() - t;
// DumpSliceNorm(std::string("Face_exchange to scatter 1st "),to,dimension);
//DumpSliceNorm(std::string("Face_exchange to done"),to,dimension);
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000 << "ms"<<std::endl;
// std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << 2.0*bytes/t_gather << "MB/s"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << t_scatter/1000 << "ms"<<std::endl;
// std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << 2.0*bytes/t_scatter<< "MB/s"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: copy :" << t_copy/1000 << "ms"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << t_comms/1000 << "ms"<<std::endl;
// std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << (RealD)4.0*bytes/t_comms << "MB/s"<<std::endl;
}
};
NAMESPACE_END(Grid);
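Putting the class to use is a three-step pattern: build the padded grids once, exchange halos, operate locally, then extract. A minimal sketch assuming a build where the class above is available and a periodic gauge-link field; the grid and field names are illustrative:

#include <Grid/Grid.h>
using namespace Grid;

void stencilWithHalo(GridCartesian *UGrid, const LatticeColourMatrix &U)
{
  const int depth = 1;                      // one-site halo in every dimension
  PaddedCell cell(depth, UGrid);            // allocates the enlarged grids
  LatticeColourMatrix Upad = cell.ExchangePeriodic(U);  // fill halos via comms
  // ... apply purely local stencils of range <= depth on Upad ...
  LatticeColourMatrix Uout = cell.Extract(Upad);        // drop the halo, back on UGrid
}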

View File

@ -165,7 +165,7 @@ class BinaryIO {
* FIXME -- 128^3 x 256 x 16 will overflow. * FIXME -- 128^3 x 256 x 16 will overflow.
*/ */
int64_t global_site; int global_site;
Lexicographic::CoorFromIndex(coor,local_site,local_vol); Lexicographic::CoorFromIndex(coor,local_site,local_vol);
@ -175,8 +175,8 @@ class BinaryIO {
Lexicographic::IndexFromCoor(coor,global_site,global_vol); Lexicographic::IndexFromCoor(coor,global_site,global_vol);
uint64_t gsite29 = global_site%29; uint32_t gsite29 = global_site%29;
uint64_t gsite31 = global_site%31; uint32_t gsite31 = global_site%31;
site_crc = crc32(0,(unsigned char *)site_buf,sizeof(fobj)); site_crc = crc32(0,(unsigned char *)site_buf,sizeof(fobj));
// std::cout << "Site "<<local_site << " crc "<<std::hex<<site_crc<<std::dec<<std::endl; // std::cout << "Site "<<local_site << " crc "<<std::hex<<site_crc<<std::dec<<std::endl;
@ -545,9 +545,7 @@ class BinaryIO {
const std::string &format, const std::string &format,
uint32_t &nersc_csum, uint32_t &nersc_csum,
uint32_t &scidac_csuma, uint32_t &scidac_csuma,
uint32_t &scidac_csumb, uint32_t &scidac_csumb)
int control=BINARYIO_LEXICOGRAPHIC
)
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0; typedef typename vobj::Realified::scalar_type word; word w=0;
@ -558,7 +556,7 @@ class BinaryIO {
std::vector<sobj> scalardata(lsites); std::vector<sobj> scalardata(lsites);
std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here
IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|control, IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
GridStopWatch timer; GridStopWatch timer;
@ -584,8 +582,7 @@ class BinaryIO {
const std::string &format, const std::string &format,
uint32_t &nersc_csum, uint32_t &nersc_csum,
uint32_t &scidac_csuma, uint32_t &scidac_csuma,
uint32_t &scidac_csumb, uint32_t &scidac_csumb)
int control=BINARYIO_LEXICOGRAPHIC)
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0; typedef typename vobj::Realified::scalar_type word; word w=0;
@ -610,7 +607,7 @@ class BinaryIO {
while (attemptsLeft >= 0) while (attemptsLeft >= 0)
{ {
grid->Barrier(); grid->Barrier();
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|control, IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
if (checkWrite) if (checkWrite)
{ {
@ -620,7 +617,7 @@ class BinaryIO {
std::cout << GridLogMessage << "writeLatticeObject: read back object" << std::endl; std::cout << GridLogMessage << "writeLatticeObject: read back object" << std::endl;
grid->Barrier(); grid->Barrier();
IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|control, IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
cknersc_csum,ckscidac_csuma,ckscidac_csumb); cknersc_csum,ckscidac_csuma,ckscidac_csumb);
if ((cknersc_csum != nersc_csum) or (ckscidac_csuma != scidac_csuma) or (ckscidac_csumb != scidac_csumb)) if ((cknersc_csum != nersc_csum) or (ckscidac_csuma != scidac_csuma) or (ckscidac_csumb != scidac_csumb))
{ {

View File

@ -206,7 +206,7 @@ class GridLimeReader : public BinaryIO {
// Read a generic lattice field and verify checksum // Read a generic lattice field and verify checksum
//////////////////////////////////////////// ////////////////////////////////////////////
template<class vobj> template<class vobj>
void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name,int control=BINARYIO_LEXICOGRAPHIC) void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
scidacChecksum scidacChecksum_; scidacChecksum scidacChecksum_;
@ -238,7 +238,7 @@ class GridLimeReader : public BinaryIO {
uint64_t offset= ftello(File); uint64_t offset= ftello(File);
// std::cout << " ReadLatticeObject from offset "<<offset << std::endl; // std::cout << " ReadLatticeObject from offset "<<offset << std::endl;
BinarySimpleMunger<sobj,sobj> munge; BinarySimpleMunger<sobj,sobj> munge;
BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb,control); BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
std::cout << GridLogMessage << "SciDAC checksum A " << std::hex << scidac_csuma << std::dec << std::endl; std::cout << GridLogMessage << "SciDAC checksum A " << std::hex << scidac_csuma << std::dec << std::endl;
std::cout << GridLogMessage << "SciDAC checksum B " << std::hex << scidac_csumb << std::dec << std::endl; std::cout << GridLogMessage << "SciDAC checksum B " << std::hex << scidac_csumb << std::dec << std::endl;
///////////////////////////////////////////// /////////////////////////////////////////////
@ -408,7 +408,7 @@ class GridLimeWriter : public BinaryIO
// in communicator used by the field.Grid() // in communicator used by the field.Grid()
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
template<class vobj> template<class vobj>
void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name,int control=BINARYIO_LEXICOGRAPHIC) void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
{ {
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
// NB: FILE and iostream are jointly writing disjoint sequences in the // NB: FILE and iostream are jointly writing disjoint sequences in the
@ -459,7 +459,7 @@ class GridLimeWriter : public BinaryIO
/////////////////////////////////////////// ///////////////////////////////////////////
std::string format = getFormatString<vobj>(); std::string format = getFormatString<vobj>();
BinarySimpleMunger<sobj,sobj> munge; BinarySimpleMunger<sobj,sobj> munge;
BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb,control); BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb);
/////////////////////////////////////////// ///////////////////////////////////////////
// Wind forward and close the record // Wind forward and close the record
@ -512,8 +512,7 @@ class ScidacWriter : public GridLimeWriter {
//////////////////////////////////////////////// ////////////////////////////////////////////////
template <class vobj, class userRecord> template <class vobj, class userRecord>
void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord, void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord,
const unsigned int recordScientificPrec = 0, const unsigned int recordScientificPrec = 0)
int control=BINARYIO_LEXICOGRAPHIC)
{ {
GridBase * grid = field.Grid(); GridBase * grid = field.Grid();
@ -535,7 +534,7 @@ class ScidacWriter : public GridLimeWriter {
writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML)); writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
} }
// Collective call // Collective call
writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA),control); // Closes message with checksum writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA)); // Closes message with checksum
} }
}; };
@ -554,8 +553,7 @@ class ScidacReader : public GridLimeReader {
// Write generic lattice field in scidac format // Write generic lattice field in scidac format
//////////////////////////////////////////////// ////////////////////////////////////////////////
template <class vobj, class userRecord> template <class vobj, class userRecord>
void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord, void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord)
int control=BINARYIO_LEXICOGRAPHIC)
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
GridBase * grid = field.Grid(); GridBase * grid = field.Grid();
@ -573,7 +571,7 @@ class ScidacReader : public GridLimeReader {
readLimeObject(header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message readLimeObject(header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message
readLimeObject(_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML)); readLimeObject(_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
readLimeObject(_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML)); readLimeObject(_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA),control); readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));
} }
void skipPastBinaryRecord(void) { void skipPastBinaryRecord(void) {
std::string rec_name(ILDG_BINARY_DATA); std::string rec_name(ILDG_BINARY_DATA);

View File

@ -42,10 +42,8 @@ using namespace Grid;
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
class NerscIO : public BinaryIO { class NerscIO : public BinaryIO {
public: public:
typedef Lattice<vLorentzColourMatrixD> GaugeField; typedef Lattice<vLorentzColourMatrixD> GaugeField;
// Enable/disable exiting if the plaquette in the header does not match the value computed (default true)
static bool & exitOnReadPlaquetteMismatch(){ static bool v=true; return v; }
static inline void truncate(std::string file){ static inline void truncate(std::string file){
std::ofstream fout(file,std::ios::out); std::ofstream fout(file,std::ios::out);
@ -205,7 +203,7 @@ public:
std::cerr << " nersc_csum " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl; std::cerr << " nersc_csum " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
exit(0); exit(0);
} }
if(exitOnReadPlaquetteMismatch()) assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 ); assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 ); assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
assert(nersc_csum == header.checksum ); assert(nersc_csum == header.checksum );

View File

@ -16,7 +16,7 @@
#ifdef __NVCC__ #ifdef __NVCC__
#pragma push #pragma push
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ #if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
#pragma nv_diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning" #pragma nv_diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
#else #else
#pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning" #pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"

View File

@ -63,7 +63,6 @@ static constexpr int Ngp=2; // gparity index range
#define ColourIndex (2) #define ColourIndex (2)
#define SpinIndex (1) #define SpinIndex (1)
#define LorentzIndex (0) #define LorentzIndex (0)
#define GparityFlavourIndex (0)
// Also should make these a named enum type // Also should make these a named enum type
static constexpr int DaggerNo=0; static constexpr int DaggerNo=0;
@ -88,8 +87,6 @@ template<typename T> struct isCoarsened {
template <typename T> using IfCoarsened = Invoke<std::enable_if< isCoarsened<T>::value,int> > ; template <typename T> using IfCoarsened = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ; template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;
const int GparityFlavourTensorIndex = 3; //TensorLevel counts from the bottom!
// ChrisK very keen to add extra space for Gparity doubling. // ChrisK very keen to add extra space for Gparity doubling.
// //
// Also add domain wall index, in a way where Wilson operator // Also add domain wall index, in a way where Wilson operator
@ -104,7 +101,6 @@ template<typename vtype> using iSpinMatrix = iScalar<iMatrix<iSca
template<typename vtype> using iColourMatrix = iScalar<iScalar<iMatrix<vtype, Nc> > > ; template<typename vtype> using iColourMatrix = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
template<typename vtype> using iSpinColourMatrix = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >; template<typename vtype> using iSpinColourMatrix = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
template<typename vtype> using iLorentzColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ; template<typename vtype> using iLorentzColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
template<typename vtype> using iLorentzComplex = iVector<iScalar<iScalar<vtype> >, Nd > ;
template<typename vtype> using iDoubleStoredColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ; template<typename vtype> using iDoubleStoredColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
template<typename vtype> using iSpinVector = iScalar<iVector<iScalar<vtype>, Ns> >; template<typename vtype> using iSpinVector = iScalar<iVector<iScalar<vtype>, Ns> >;
template<typename vtype> using iColourVector = iScalar<iScalar<iVector<vtype, Nc> > >; template<typename vtype> using iColourVector = iScalar<iScalar<iVector<vtype, Nc> > >;
@ -114,10 +110,8 @@ template<typename vtype> using iHalfSpinColourVector = iScalar<iVector<iVec
template<typename vtype> using iSpinColourSpinColourMatrix = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >; template<typename vtype> using iSpinColourSpinColourMatrix = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;
template<typename vtype> using iGparityFlavourVector = iVector<iScalar<iScalar<vtype> >, Ngp>;
template<typename vtype> using iGparitySpinColourVector = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >; template<typename vtype> using iGparitySpinColourVector = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
template<typename vtype> using iGparityHalfSpinColourVector = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >; template<typename vtype> using iGparityHalfSpinColourVector = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
template<typename vtype> using iGparityFlavourMatrix = iMatrix<iScalar<iScalar<vtype> >, Ngp>;
// Spin matrix // Spin matrix
typedef iSpinMatrix<Complex > SpinMatrix; typedef iSpinMatrix<Complex > SpinMatrix;
@ -127,7 +121,6 @@ typedef iSpinMatrix<ComplexD > SpinMatrixD;
typedef iSpinMatrix<vComplex > vSpinMatrix; typedef iSpinMatrix<vComplex > vSpinMatrix;
typedef iSpinMatrix<vComplexF> vSpinMatrixF; typedef iSpinMatrix<vComplexF> vSpinMatrixF;
typedef iSpinMatrix<vComplexD> vSpinMatrixD; typedef iSpinMatrix<vComplexD> vSpinMatrixD;
typedef iSpinMatrix<vComplexD2> vSpinMatrixD2;
// Colour Matrix // Colour Matrix
typedef iColourMatrix<Complex > ColourMatrix; typedef iColourMatrix<Complex > ColourMatrix;
@ -137,7 +130,6 @@ typedef iColourMatrix<ComplexD > ColourMatrixD;
typedef iColourMatrix<vComplex > vColourMatrix; typedef iColourMatrix<vComplex > vColourMatrix;
typedef iColourMatrix<vComplexF> vColourMatrixF; typedef iColourMatrix<vComplexF> vColourMatrixF;
typedef iColourMatrix<vComplexD> vColourMatrixD; typedef iColourMatrix<vComplexD> vColourMatrixD;
typedef iColourMatrix<vComplexD2> vColourMatrixD2;
// SpinColour matrix // SpinColour matrix
typedef iSpinColourMatrix<Complex > SpinColourMatrix; typedef iSpinColourMatrix<Complex > SpinColourMatrix;
@ -147,7 +139,6 @@ typedef iSpinColourMatrix<ComplexD > SpinColourMatrixD;
typedef iSpinColourMatrix<vComplex > vSpinColourMatrix; typedef iSpinColourMatrix<vComplex > vSpinColourMatrix;
typedef iSpinColourMatrix<vComplexF> vSpinColourMatrixF; typedef iSpinColourMatrix<vComplexF> vSpinColourMatrixF;
typedef iSpinColourMatrix<vComplexD> vSpinColourMatrixD; typedef iSpinColourMatrix<vComplexD> vSpinColourMatrixD;
typedef iSpinColourMatrix<vComplexD2> vSpinColourMatrixD2;
// SpinColourSpinColour matrix // SpinColourSpinColour matrix
typedef iSpinColourSpinColourMatrix<Complex > SpinColourSpinColourMatrix; typedef iSpinColourSpinColourMatrix<Complex > SpinColourSpinColourMatrix;
@ -157,7 +148,6 @@ typedef iSpinColourSpinColourMatrix<ComplexD > SpinColourSpinColourMatrixD;
typedef iSpinColourSpinColourMatrix<vComplex > vSpinColourSpinColourMatrix; typedef iSpinColourSpinColourMatrix<vComplex > vSpinColourSpinColourMatrix;
typedef iSpinColourSpinColourMatrix<vComplexF> vSpinColourSpinColourMatrixF; typedef iSpinColourSpinColourMatrix<vComplexF> vSpinColourSpinColourMatrixF;
typedef iSpinColourSpinColourMatrix<vComplexD> vSpinColourSpinColourMatrixD; typedef iSpinColourSpinColourMatrix<vComplexD> vSpinColourSpinColourMatrixD;
typedef iSpinColourSpinColourMatrix<vComplexD2> vSpinColourSpinColourMatrixD2;
// SpinColourSpinColour matrix // SpinColourSpinColour matrix
typedef iSpinColourSpinColourMatrix<Complex > SpinColourSpinColourMatrix; typedef iSpinColourSpinColourMatrix<Complex > SpinColourSpinColourMatrix;
@ -167,47 +157,24 @@ typedef iSpinColourSpinColourMatrix<ComplexD > SpinColourSpinColourMatrixD;
typedef iSpinColourSpinColourMatrix<vComplex > vSpinColourSpinColourMatrix; typedef iSpinColourSpinColourMatrix<vComplex > vSpinColourSpinColourMatrix;
typedef iSpinColourSpinColourMatrix<vComplexF> vSpinColourSpinColourMatrixF; typedef iSpinColourSpinColourMatrix<vComplexF> vSpinColourSpinColourMatrixF;
typedef iSpinColourSpinColourMatrix<vComplexD> vSpinColourSpinColourMatrixD; typedef iSpinColourSpinColourMatrix<vComplexD> vSpinColourSpinColourMatrixD;
typedef iSpinColourSpinColourMatrix<vComplexD2> vSpinColourSpinColourMatrixD2;
// LorentzColour // LorentzColour
typedef iLorentzColourMatrix<Complex > LorentzColourMatrix; typedef iLorentzColourMatrix<Complex > LorentzColourMatrix;
typedef iLorentzColourMatrix<ComplexF > LorentzColourMatrixF; typedef iLorentzColourMatrix<ComplexF > LorentzColourMatrixF;
typedef iLorentzColourMatrix<ComplexD > LorentzColourMatrixD; typedef iLorentzColourMatrix<ComplexD > LorentzColourMatrixD;
typedef iLorentzColourMatrix<vComplex > vLorentzColourMatrix; typedef iLorentzColourMatrix<vComplex > vLorentzColourMatrix;
typedef iLorentzColourMatrix<vComplexF> vLorentzColourMatrixF; typedef iLorentzColourMatrix<vComplexF> vLorentzColourMatrixF;
typedef iLorentzColourMatrix<vComplexD> vLorentzColourMatrixD; typedef iLorentzColourMatrix<vComplexD> vLorentzColourMatrixD;
typedef iLorentzColourMatrix<vComplexD2> vLorentzColourMatrixD2;
// LorentzComplex
typedef iLorentzComplex<Complex > LorentzComplex;
typedef iLorentzComplex<ComplexF > LorentzComplexF;
typedef iLorentzComplex<ComplexD > LorentzComplexD;
typedef iLorentzComplex<vComplex > vLorentzComplex;
typedef iLorentzComplex<vComplexF> vLorentzComplexF;
typedef iLorentzComplex<vComplexD> vLorentzComplexD;
// DoubleStored gauge field // DoubleStored gauge field
typedef iDoubleStoredColourMatrix<Complex > DoubleStoredColourMatrix; typedef iDoubleStoredColourMatrix<Complex > DoubleStoredColourMatrix;
typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF; typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF;
typedef iDoubleStoredColourMatrix<ComplexD > DoubleStoredColourMatrixD; typedef iDoubleStoredColourMatrix<ComplexD > DoubleStoredColourMatrixD;
typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix; typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF; typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD; typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;
typedef iDoubleStoredColourMatrix<vComplexD2> vDoubleStoredColourMatrixD2;
//G-parity flavour matrix
typedef iGparityFlavourMatrix<Complex> GparityFlavourMatrix;
typedef iGparityFlavourMatrix<ComplexF> GparityFlavourMatrixF;
typedef iGparityFlavourMatrix<ComplexD> GparityFlavourMatrixD;
typedef iGparityFlavourMatrix<vComplex> vGparityFlavourMatrix;
typedef iGparityFlavourMatrix<vComplexF> vGparityFlavourMatrixF;
typedef iGparityFlavourMatrix<vComplexD> vGparityFlavourMatrixD;
typedef iGparityFlavourMatrix<vComplexD2> vGparityFlavourMatrixD2;
// Spin vector // Spin vector
typedef iSpinVector<Complex > SpinVector; typedef iSpinVector<Complex > SpinVector;
@ -217,7 +184,6 @@ typedef iSpinVector<ComplexD> SpinVectorD;
typedef iSpinVector<vComplex > vSpinVector; typedef iSpinVector<vComplex > vSpinVector;
typedef iSpinVector<vComplexF> vSpinVectorF; typedef iSpinVector<vComplexF> vSpinVectorF;
typedef iSpinVector<vComplexD> vSpinVectorD; typedef iSpinVector<vComplexD> vSpinVectorD;
typedef iSpinVector<vComplexD2> vSpinVectorD2;
// Colour vector // Colour vector
typedef iColourVector<Complex > ColourVector; typedef iColourVector<Complex > ColourVector;
@ -227,7 +193,6 @@ typedef iColourVector<ComplexD> ColourVectorD;
typedef iColourVector<vComplex > vColourVector; typedef iColourVector<vComplex > vColourVector;
typedef iColourVector<vComplexF> vColourVectorF; typedef iColourVector<vComplexF> vColourVectorF;
typedef iColourVector<vComplexD> vColourVectorD; typedef iColourVector<vComplexD> vColourVectorD;
typedef iColourVector<vComplexD2> vColourVectorD2;
// SpinColourVector // SpinColourVector
typedef iSpinColourVector<Complex > SpinColourVector; typedef iSpinColourVector<Complex > SpinColourVector;
@ -237,7 +202,6 @@ typedef iSpinColourVector<ComplexD> SpinColourVectorD;
typedef iSpinColourVector<vComplex > vSpinColourVector; typedef iSpinColourVector<vComplex > vSpinColourVector;
typedef iSpinColourVector<vComplexF> vSpinColourVectorF; typedef iSpinColourVector<vComplexF> vSpinColourVectorF;
typedef iSpinColourVector<vComplexD> vSpinColourVectorD; typedef iSpinColourVector<vComplexD> vSpinColourVectorD;
typedef iSpinColourVector<vComplexD2> vSpinColourVectorD2;
// HalfSpin vector // HalfSpin vector
typedef iHalfSpinVector<Complex > HalfSpinVector; typedef iHalfSpinVector<Complex > HalfSpinVector;
@ -247,27 +211,15 @@ typedef iHalfSpinVector<ComplexD> HalfSpinVectorD;
typedef iHalfSpinVector<vComplex > vHalfSpinVector; typedef iHalfSpinVector<vComplex > vHalfSpinVector;
typedef iHalfSpinVector<vComplexF> vHalfSpinVectorF; typedef iHalfSpinVector<vComplexF> vHalfSpinVectorF;
typedef iHalfSpinVector<vComplexD> vHalfSpinVectorD; typedef iHalfSpinVector<vComplexD> vHalfSpinVectorD;
typedef iHalfSpinVector<vComplexD2> vHalfSpinVectorD2;
// HalfSpinColour vector // HalfSpinColour vector
typedef iHalfSpinColourVector<Complex > HalfSpinColourVector; typedef iHalfSpinColourVector<Complex > HalfSpinColourVector;
typedef iHalfSpinColourVector<ComplexF> HalfSpinColourVectorF; typedef iHalfSpinColourVector<ComplexF> HalfSpinColourVectorF;
typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD; typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector; typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF; typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD; typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
typedef iHalfSpinColourVector<vComplexD2> vHalfSpinColourVectorD2;
//G-parity flavour vector
typedef iGparityFlavourVector<Complex > GparityFlavourVector;
typedef iGparityFlavourVector<ComplexF> GparityFlavourVectorF;
typedef iGparityFlavourVector<ComplexD> GparityFlavourVectorD;
typedef iGparityFlavourVector<vComplex > vGparityFlavourVector;
typedef iGparityFlavourVector<vComplexF> vGparityFlavourVectorF;
typedef iGparityFlavourVector<vComplexD> vGparityFlavourVectorD;
typedef iGparityFlavourVector<vComplexD2> vGparityFlavourVectorD2;
// singlets // singlets
typedef iSinglet<Complex > TComplex; // FIXME This is painful. Tensor singlet complex type. typedef iSinglet<Complex > TComplex; // FIXME This is painful. Tensor singlet complex type.
@ -277,7 +229,6 @@ typedef iSinglet<ComplexD> TComplexD; // FIXME This is painful. Tenso
typedef iSinglet<vComplex > vTComplex ; // what if we don't know the tensor structure typedef iSinglet<vComplex > vTComplex ; // what if we don't know the tensor structure
typedef iSinglet<vComplexF> vTComplexF; // what if we don't know the tensor structure typedef iSinglet<vComplexF> vTComplexF; // what if we don't know the tensor structure
typedef iSinglet<vComplexD> vTComplexD; // what if we don't know the tensor structure typedef iSinglet<vComplexD> vTComplexD; // what if we don't know the tensor structure
typedef iSinglet<vComplexD2> vTComplexD2; // what if we don't know the tensor structure
typedef iSinglet<Real > TReal; // Shouldn't need these; can I make it work without? typedef iSinglet<Real > TReal; // Shouldn't need these; can I make it work without?
typedef iSinglet<RealF> TRealF; // Shouldn't need these; can I make it work without? typedef iSinglet<RealF> TRealF; // Shouldn't need these; can I make it work without?
@ -295,62 +246,47 @@ typedef iSinglet<Integer > TInteger;
typedef Lattice<vColourMatrix> LatticeColourMatrix; typedef Lattice<vColourMatrix> LatticeColourMatrix;
typedef Lattice<vColourMatrixF> LatticeColourMatrixF; typedef Lattice<vColourMatrixF> LatticeColourMatrixF;
typedef Lattice<vColourMatrixD> LatticeColourMatrixD; typedef Lattice<vColourMatrixD> LatticeColourMatrixD;
typedef Lattice<vColourMatrixD2> LatticeColourMatrixD2;
typedef Lattice<vSpinMatrix> LatticeSpinMatrix; typedef Lattice<vSpinMatrix> LatticeSpinMatrix;
typedef Lattice<vSpinMatrixF> LatticeSpinMatrixF; typedef Lattice<vSpinMatrixF> LatticeSpinMatrixF;
typedef Lattice<vSpinMatrixD> LatticeSpinMatrixD; typedef Lattice<vSpinMatrixD> LatticeSpinMatrixD;
typedef Lattice<vSpinMatrixD2> LatticeSpinMatrixD2;
typedef Lattice<vSpinColourMatrix> LatticeSpinColourMatrix; typedef Lattice<vSpinColourMatrix> LatticeSpinColourMatrix;
typedef Lattice<vSpinColourMatrixF> LatticeSpinColourMatrixF; typedef Lattice<vSpinColourMatrixF> LatticeSpinColourMatrixF;
typedef Lattice<vSpinColourMatrixD> LatticeSpinColourMatrixD; typedef Lattice<vSpinColourMatrixD> LatticeSpinColourMatrixD;
typedef Lattice<vSpinColourMatrixD2> LatticeSpinColourMatrixD2;
typedef Lattice<vSpinColourSpinColourMatrix> LatticeSpinColourSpinColourMatrix; typedef Lattice<vSpinColourSpinColourMatrix> LatticeSpinColourSpinColourMatrix;
typedef Lattice<vSpinColourSpinColourMatrixF> LatticeSpinColourSpinColourMatrixF; typedef Lattice<vSpinColourSpinColourMatrixF> LatticeSpinColourSpinColourMatrixF;
typedef Lattice<vSpinColourSpinColourMatrixD> LatticeSpinColourSpinColourMatrixD; typedef Lattice<vSpinColourSpinColourMatrixD> LatticeSpinColourSpinColourMatrixD;
typedef Lattice<vSpinColourSpinColourMatrixD2> LatticeSpinColourSpinColourMatrixD2;
typedef Lattice<vLorentzColourMatrix> LatticeLorentzColourMatrix; typedef Lattice<vLorentzColourMatrix> LatticeLorentzColourMatrix;
typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF; typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD; typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD;
typedef Lattice<vLorentzColourMatrixD2> LatticeLorentzColourMatrixD2;
typedef Lattice<vLorentzComplex> LatticeLorentzComplex;
typedef Lattice<vLorentzComplexF> LatticeLorentzComplexF;
typedef Lattice<vLorentzComplexD> LatticeLorentzComplexD;
// DoubleStored gauge field // DoubleStored gauge field
typedef Lattice<vDoubleStoredColourMatrix> LatticeDoubleStoredColourMatrix; typedef Lattice<vDoubleStoredColourMatrix> LatticeDoubleStoredColourMatrix;
typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF; typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF;
typedef Lattice<vDoubleStoredColourMatrixD> LatticeDoubleStoredColourMatrixD; typedef Lattice<vDoubleStoredColourMatrixD> LatticeDoubleStoredColourMatrixD;
typedef Lattice<vDoubleStoredColourMatrixD2> LatticeDoubleStoredColourMatrixD2;
typedef Lattice<vSpinVector> LatticeSpinVector; typedef Lattice<vSpinVector> LatticeSpinVector;
typedef Lattice<vSpinVectorF> LatticeSpinVectorF; typedef Lattice<vSpinVectorF> LatticeSpinVectorF;
typedef Lattice<vSpinVectorD> LatticeSpinVectorD; typedef Lattice<vSpinVectorD> LatticeSpinVectorD;
typedef Lattice<vSpinVectorD2> LatticeSpinVectorD2;
typedef Lattice<vColourVector> LatticeColourVector; typedef Lattice<vColourVector> LatticeColourVector;
typedef Lattice<vColourVectorF> LatticeColourVectorF; typedef Lattice<vColourVectorF> LatticeColourVectorF;
typedef Lattice<vColourVectorD> LatticeColourVectorD; typedef Lattice<vColourVectorD> LatticeColourVectorD;
typedef Lattice<vColourVectorD2> LatticeColourVectorD2;
typedef Lattice<vSpinColourVector> LatticeSpinColourVector; typedef Lattice<vSpinColourVector> LatticeSpinColourVector;
typedef Lattice<vSpinColourVectorF> LatticeSpinColourVectorF; typedef Lattice<vSpinColourVectorF> LatticeSpinColourVectorF;
typedef Lattice<vSpinColourVectorD> LatticeSpinColourVectorD; typedef Lattice<vSpinColourVectorD> LatticeSpinColourVectorD;
typedef Lattice<vSpinColourVectorD2> LatticeSpinColourVectorD2;
typedef Lattice<vHalfSpinVector> LatticeHalfSpinVector; typedef Lattice<vHalfSpinVector> LatticeHalfSpinVector;
typedef Lattice<vHalfSpinVectorF> LatticeHalfSpinVectorF; typedef Lattice<vHalfSpinVectorF> LatticeHalfSpinVectorF;
typedef Lattice<vHalfSpinVectorD> LatticeHalfSpinVectorD; typedef Lattice<vHalfSpinVectorD> LatticeHalfSpinVectorD;
typedef Lattice<vHalfSpinVectorD2> LatticeHalfSpinVectorD2;
typedef Lattice<vHalfSpinColourVector> LatticeHalfSpinColourVector; typedef Lattice<vHalfSpinColourVector> LatticeHalfSpinColourVector;
typedef Lattice<vHalfSpinColourVectorF> LatticeHalfSpinColourVectorF; typedef Lattice<vHalfSpinColourVectorF> LatticeHalfSpinColourVectorF;
typedef Lattice<vHalfSpinColourVectorD> LatticeHalfSpinColourVectorD; typedef Lattice<vHalfSpinColourVectorD> LatticeHalfSpinColourVectorD;
typedef Lattice<vHalfSpinColourVectorD2> LatticeHalfSpinColourVectorD2;
typedef Lattice<vTReal> LatticeReal; typedef Lattice<vTReal> LatticeReal;
typedef Lattice<vTRealF> LatticeRealF; typedef Lattice<vTRealF> LatticeRealF;
@ -359,7 +295,6 @@ typedef Lattice<vTRealD> LatticeRealD;
typedef Lattice<vTComplex> LatticeComplex; typedef Lattice<vTComplex> LatticeComplex;
typedef Lattice<vTComplexF> LatticeComplexF; typedef Lattice<vTComplexF> LatticeComplexF;
typedef Lattice<vTComplexD> LatticeComplexD; typedef Lattice<vTComplexD> LatticeComplexD;
typedef Lattice<vTComplexD2> LatticeComplexD2;
typedef Lattice<vTInteger> LatticeInteger; // Predicates for "where" typedef Lattice<vTInteger> LatticeInteger; // Predicates for "where"
@ -367,42 +302,37 @@ typedef Lattice<vTInteger> LatticeInteger; // Predicates for "where"
/////////////////////////////////////////// ///////////////////////////////////////////
// Physical names for things // Physical names for things
/////////////////////////////////////////// ///////////////////////////////////////////
typedef LatticeHalfSpinColourVector LatticeHalfFermion; typedef LatticeHalfSpinColourVector LatticeHalfFermion;
typedef LatticeHalfSpinColourVectorF LatticeHalfFermionF; typedef LatticeHalfSpinColourVectorF LatticeHalfFermionF;
typedef LatticeHalfSpinColourVectorD LatticeHalfFermionD; typedef LatticeHalfSpinColourVectorF LatticeHalfFermionD;
typedef LatticeHalfSpinColourVectorD2 LatticeHalfFermionD2;
typedef LatticeSpinColourVector LatticeFermion; typedef LatticeSpinColourVector LatticeFermion;
typedef LatticeSpinColourVectorF LatticeFermionF; typedef LatticeSpinColourVectorF LatticeFermionF;
typedef LatticeSpinColourVectorD LatticeFermionD; typedef LatticeSpinColourVectorD LatticeFermionD;
typedef LatticeSpinColourVectorD2 LatticeFermionD2;
typedef LatticeSpinColourMatrix LatticePropagator; typedef LatticeSpinColourMatrix LatticePropagator;
typedef LatticeSpinColourMatrixF LatticePropagatorF; typedef LatticeSpinColourMatrixF LatticePropagatorF;
typedef LatticeSpinColourMatrixD LatticePropagatorD; typedef LatticeSpinColourMatrixD LatticePropagatorD;
typedef LatticeSpinColourMatrixD2 LatticePropagatorD2;
typedef LatticeLorentzColourMatrix LatticeGaugeField; typedef LatticeLorentzColourMatrix LatticeGaugeField;
typedef LatticeLorentzColourMatrixF LatticeGaugeFieldF; typedef LatticeLorentzColourMatrixF LatticeGaugeFieldF;
typedef LatticeLorentzColourMatrixD LatticeGaugeFieldD; typedef LatticeLorentzColourMatrixD LatticeGaugeFieldD;
typedef LatticeLorentzColourMatrixD2 LatticeGaugeFieldD2;
typedef LatticeDoubleStoredColourMatrix LatticeDoubledGaugeField; typedef LatticeDoubleStoredColourMatrix LatticeDoubledGaugeField;
typedef LatticeDoubleStoredColourMatrixF LatticeDoubledGaugeFieldF; typedef LatticeDoubleStoredColourMatrixF LatticeDoubledGaugeFieldF;
typedef LatticeDoubleStoredColourMatrixD LatticeDoubledGaugeFieldD; typedef LatticeDoubleStoredColourMatrixD LatticeDoubledGaugeFieldD;
typedef LatticeDoubleStoredColourMatrixD2 LatticeDoubledGaugeFieldD2;
template<class GF> using LorentzScalar = Lattice<iScalar<typename GF::vector_object::element> >; template<class GF> using LorentzScalar = Lattice<iScalar<typename GF::vector_object::element> >;
// Uhgg... typing this hurt ;)
// (my keyboard got burning hot when I typed this, must be the anti-Fermion)
typedef Lattice<vColourVector> LatticeStaggeredFermion; typedef Lattice<vColourVector> LatticeStaggeredFermion;
typedef Lattice<vColourVectorF> LatticeStaggeredFermionF; typedef Lattice<vColourVectorF> LatticeStaggeredFermionF;
typedef Lattice<vColourVectorD> LatticeStaggeredFermionD; typedef Lattice<vColourVectorD> LatticeStaggeredFermionD;
typedef Lattice<vColourVectorD2> LatticeStaggeredFermionD2;
typedef Lattice<vColourMatrix> LatticeStaggeredPropagator; typedef Lattice<vColourMatrix> LatticeStaggeredPropagator;
typedef Lattice<vColourMatrixF> LatticeStaggeredPropagatorF; typedef Lattice<vColourMatrixF> LatticeStaggeredPropagatorF;
typedef Lattice<vColourMatrixD> LatticeStaggeredPropagatorD; typedef Lattice<vColourMatrixD> LatticeStaggeredPropagatorD;
typedef Lattice<vColourMatrixD2> LatticeStaggeredPropagatorD2;
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// Peek and Poke named after physics attributes // Peek and Poke named after physics attributes

@ -34,96 +34,16 @@ directory
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
///////////////////////////////////
// Smart configuration base class
///////////////////////////////////
template< class Field >
class ConfigurationBase
{
public:
ConfigurationBase() {}
virtual ~ConfigurationBase() {}
virtual void set_Field(Field& U) =0;
virtual void smeared_force(Field&) = 0;
virtual Field& get_SmearedU() =0;
virtual Field &get_U(bool smeared = false) = 0;
};
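// A minimal sketch of a concrete container satisfying the interface above,
// assuming no smearing is wanted; PlainConfiguration is a hypothetical name.
template <class Field>
class PlainConfiguration : public ConfigurationBase<Field> {
  Field Uthin;
public:
  PlainConfiguration(GridBase *grid) : Uthin(grid) {}
  virtual void set_Field(Field &U) { Uthin = U; }
  virtual void smeared_force(Field &) {}            // nothing to chain-rule
  virtual Field &get_SmearedU() { return Uthin; }   // smeared == thin here
  virtual Field &get_U(bool smeared = false) { return Uthin; }
};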
template <class GaugeField > template <class GaugeField >
class Action class Action
{ {
public: public:
bool is_smeared = false; bool is_smeared = false;
RealD deriv_norm_sum;
RealD deriv_max_sum;
RealD Fdt_norm_sum;
RealD Fdt_max_sum;
int deriv_num;
RealD deriv_us;
RealD S_us;
RealD refresh_us;
void reset_timer(void) {
deriv_us = S_us = refresh_us = 0.0;
deriv_norm_sum = deriv_max_sum=0.0;
Fdt_max_sum = Fdt_norm_sum = 0.0;
deriv_num=0;
}
void deriv_log(RealD nrm, RealD max,RealD Fdt_nrm,RealD Fdt_max) {
if ( max > deriv_max_sum ) {
deriv_max_sum=max;
}
deriv_norm_sum+=nrm;
if ( Fdt_max > Fdt_max_sum ) {
Fdt_max_sum=Fdt_max;
}
Fdt_norm_sum+=Fdt_nrm; deriv_num++;
}
RealD deriv_max_average(void) { return deriv_max_sum; };
RealD deriv_norm_average(void) { return deriv_norm_sum/deriv_num; };
RealD Fdt_max_average(void) { return Fdt_max_sum; };
RealD Fdt_norm_average(void) { return Fdt_norm_sum/deriv_num; };
RealD deriv_timer(void) { return deriv_us; };
RealD S_timer(void) { return S_us; };
RealD refresh_timer(void) { return refresh_us; };
void deriv_timer_start(void) { deriv_us-=usecond(); }
void deriv_timer_stop(void) { deriv_us+=usecond(); }
void refresh_timer_start(void) { refresh_us-=usecond(); }
void refresh_timer_stop(void) { refresh_us+=usecond(); }
void S_timer_start(void) { S_us-=usecond(); }
void S_timer_stop(void) { S_us+=usecond(); }
/////////////////////////////
// Heatbath? // Heatbath?
/////////////////////////////
virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
virtual RealD S(const GaugeField& U) = 0; // evaluate the action virtual RealD S(const GaugeField& U) = 0; // evaluate the action
virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ; // if the refresh computes the action, can cache it. Alternately refreshAndAction() ?
virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative
/////////////////////////////////////////////////////////////
// virtual smeared interface through configuration container
/////////////////////////////////////////////////////////////
virtual void refresh(ConfigurationBase<GaugeField> & U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
{
refresh(U.get_U(is_smeared),sRNG,pRNG);
}
virtual RealD S(ConfigurationBase<GaugeField>& U)
{
return S(U.get_U(is_smeared));
}
virtual RealD Sinitial(ConfigurationBase<GaugeField>& U)
{
return Sinitial(U.get_U(is_smeared));
}
virtual void deriv(ConfigurationBase<GaugeField>& U, GaugeField& dSdU)
{
deriv(U.get_U(is_smeared),dSdU);
if ( is_smeared ) {
U.smeared_force(dSdU);
}
}
///////////////////////////////
// Logging
///////////////////////////////
virtual std::string action_name() = 0; // return the action name virtual std::string action_name() = 0; // return the action name
virtual std::string LogParameters() = 0; // prints action parameters virtual std::string LogParameters() = 0; // prints action parameters
virtual ~Action(){} virtual ~Action(){}
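// A hedged sketch of driving the accounting hooks above from an integrator
// step; report_force is a hypothetical free function, not code from this
// source, and the force norms are supplied by the caller.
void report_force(Action<LatticeGaugeFieldD> &act,
                  LatticeGaugeFieldD &U, LatticeGaugeFieldD &dSdU,
                  RealD f_nrm, RealD f_max, RealD Fdt_nrm, RealD Fdt_max) {
  act.deriv_timer_start();
  act.deriv(U, dSdU);                       // timed force evaluation
  act.deriv_timer_stop();
  act.deriv_log(f_nrm, f_max, Fdt_nrm, Fdt_max);
  std::cout << act.action_name()
            << " <|dSdU|> "  << act.deriv_norm_average()
            << " max|dSdU| " << act.deriv_max_average() << std::endl;
}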

@ -30,8 +30,6 @@ directory
#ifndef QCD_ACTION_CORE #ifndef QCD_ACTION_CORE
#define QCD_ACTION_CORE #define QCD_ACTION_CORE
#include <Grid/qcd/action/gauge/GaugeImplementations.h>
#include <Grid/qcd/action/ActionBase.h> #include <Grid/qcd/action/ActionBase.h>
NAMESPACE_CHECK(ActionBase); NAMESPACE_CHECK(ActionBase);
#include <Grid/qcd/action/ActionSet.h> #include <Grid/qcd/action/ActionSet.h>
@ -39,10 +37,6 @@ NAMESPACE_CHECK(ActionSet);
#include <Grid/qcd/action/ActionParams.h> #include <Grid/qcd/action/ActionParams.h>
NAMESPACE_CHECK(ActionParams); NAMESPACE_CHECK(ActionParams);
#include <Grid/qcd/action/filters/MomentumFilter.h>
#include <Grid/qcd/action/filters/DirichletFilter.h>
#include <Grid/qcd/action/filters/DDHMCFilter.h>
//////////////////////////////////////////// ////////////////////////////////////////////
// Gauge Actions // Gauge Actions
//////////////////////////////////////////// ////////////////////////////////////////////

@ -34,45 +34,27 @@ directory
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
// These can move into a params header and be given MacroMagic serialisation
struct GparityWilsonImplParams { struct GparityWilsonImplParams {
Coordinate twists; Coordinate twists;
//mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs GparityWilsonImplParams() : twists(Nd, 0) {};
Coordinate dirichlet; // Blocksize of dirichlet BCs
int partialDirichlet;
GparityWilsonImplParams() : twists(Nd, 0) {
dirichlet.resize(0);
partialDirichlet=0;
};
}; };
struct WilsonImplParams { struct WilsonImplParams {
bool overlapCommsCompute; bool overlapCommsCompute;
Coordinate dirichlet; // Blocksize of dirichlet BCs
int partialDirichlet;
AcceleratorVector<Real,Nd> twist_n_2pi_L; AcceleratorVector<Real,Nd> twist_n_2pi_L;
AcceleratorVector<Complex,Nd> boundary_phases; AcceleratorVector<Complex,Nd> boundary_phases;
WilsonImplParams() { WilsonImplParams() {
dirichlet.resize(0);
partialDirichlet=0;
boundary_phases.resize(Nd, 1.0); boundary_phases.resize(Nd, 1.0);
twist_n_2pi_L.resize(Nd, 0.0); twist_n_2pi_L.resize(Nd, 0.0);
}; };
WilsonImplParams(const AcceleratorVector<Complex,Nd> phi) : boundary_phases(phi), overlapCommsCompute(false) { WilsonImplParams(const AcceleratorVector<Complex,Nd> phi) : boundary_phases(phi), overlapCommsCompute(false) {
twist_n_2pi_L.resize(Nd, 0.0); twist_n_2pi_L.resize(Nd, 0.0);
partialDirichlet=0;
dirichlet.resize(0);
} }
}; };
struct StaggeredImplParams { struct StaggeredImplParams {
Coordinate dirichlet; // Blocksize of dirichlet BCs StaggeredImplParams() {};
int partialDirichlet;
StaggeredImplParams()
{
partialDirichlet=0;
dirichlet.resize(0);
};
}; };
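// A hedged example of filling these parameter structs; makeAntiperiodicParams
// is a hypothetical helper and the Dirichlet values are illustrative.
WilsonImplParams makeAntiperiodicParams(void) {
  WilsonImplParams wp;
  for (int mu = 0; mu < Nd; mu++) wp.boundary_phases[mu] = 1.0;
  wp.boundary_phases[Nd-1] = -1.0;          // antiperiodic in time
  wp.dirichlet = Coordinate({0, 0, 0, 16}); // block extent 16 in time (assumed semantics)
  wp.partialDirichlet = 0;                  // 0 disables the partial variant (assumed)
  return wp;
}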
struct OneFlavourRationalParams : Serializable { struct OneFlavourRationalParams : Serializable {
@ -81,11 +63,9 @@ struct StaggeredImplParams {
RealD, hi, RealD, hi,
int, MaxIter, int, MaxIter,
RealD, tolerance, RealD, tolerance,
RealD, mdtolerance,
int, degree, int, degree,
int, precision, int, precision,
int, BoundsCheckFreq, int, BoundsCheckFreq);
RealD, BoundsCheckTol);
// MaxIter and tolerance, vectors?? // MaxIter and tolerance, vectors??
@ -96,62 +76,16 @@ struct StaggeredImplParams {
RealD tol = 1.0e-8, RealD tol = 1.0e-8,
int _degree = 10, int _degree = 10,
int _precision = 64, int _precision = 64,
int _BoundsCheckFreq=20, int _BoundsCheckFreq=20)
RealD mdtol = 1.0e-6,
double _BoundsCheckTol=1e-6)
: lo(_lo), : lo(_lo),
hi(_hi), hi(_hi),
MaxIter(_maxit), MaxIter(_maxit),
tolerance(tol), tolerance(tol),
mdtolerance(mdtol),
degree(_degree), degree(_degree),
precision(_precision), precision(_precision),
BoundsCheckFreq(_BoundsCheckFreq), BoundsCheckFreq(_BoundsCheckFreq){};
BoundsCheckTol(_BoundsCheckTol){};
}; };
/*Action parameters for the generalized rational action
The approximation is for (M^dag M)^{1/inv_pow}
where inv_pow is the denominator of the fractional power.
Default inv_pow=2 for square root, making this equivalent to
the OneFlavourRational action
*/
struct RationalActionParams : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(RationalActionParams,
int, inv_pow,
RealD, lo, //low eigenvalue bound of rational approx
RealD, hi, //high eigenvalue bound of rational approx
int, MaxIter, //maximum iterations in msCG
RealD, action_tolerance, //msCG tolerance in action evaluation
int, action_degree, //rational approx tolerance in action evaluation
RealD, md_tolerance, //msCG tolerance in MD integration
int, md_degree, //rational approx tolerance in MD integration
int, precision, //precision of floating point arithmetic
int, BoundsCheckFreq); //frequency the approximation is tested (with Metropolis degree/tolerance); 0 disables the check
// constructor
RationalActionParams(int _inv_pow = 2,
RealD _lo = 0.0,
RealD _hi = 1.0,
int _maxit = 1000,
RealD _action_tolerance = 1.0e-8,
int _action_degree = 10,
RealD _md_tolerance = 1.0e-8,
int _md_degree = 10,
int _precision = 64,
int _BoundsCheckFreq=20)
: inv_pow(_inv_pow),
lo(_lo),
hi(_hi),
MaxIter(_maxit),
action_tolerance(_action_tolerance),
action_degree(_action_degree),
md_tolerance(_md_tolerance),
md_degree(_md_degree),
precision(_precision),
BoundsCheckFreq(_BoundsCheckFreq){};
};
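// A hedged example: request an approximation to (M^dag M)^{1/4} by setting
// inv_pow = 4; the bounds, degrees and tolerances here are illustrative.
RationalActionParams quarterRoot(/*inv_pow*/ 4,
                                 /*lo*/ 1.0e-4, /*hi*/ 64.0,
                                 /*MaxIter*/ 10000,
                                 /*action_tolerance*/ 1.0e-10, /*action_degree*/ 14,
                                 /*md_tolerance*/ 1.0e-8, /*md_degree*/ 12);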
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif #endif

@ -71,7 +71,6 @@ public:
RealD Mass(void) { return (mass_plus + mass_minus) / 2.0; }; RealD Mass(void) { return (mass_plus + mass_minus) / 2.0; };
RealD MassPlus(void) { return mass_plus; }; RealD MassPlus(void) { return mass_plus; };
RealD MassMinus(void) { return mass_minus; }; RealD MassMinus(void) { return mass_minus; };
void SetMass(RealD _mass) { void SetMass(RealD _mass) {
mass_plus=mass_minus=_mass; mass_plus=mass_minus=_mass;
SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c); // Reset coeffs SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c); // Reset coeffs
@ -183,6 +182,16 @@ public:
GridRedBlackCartesian &FourDimRedBlackGrid, GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _mass,RealD _M5,const ImplParams &p= ImplParams()); RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
void CayleyReport(void);
void CayleyZeroCounters(void);
double M5Dflops;
double M5Dcalls;
double M5Dtime;
double MooeeInvFlops;
double MooeeInvCalls;
double MooeeInvTime;
protected: protected:
virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c); virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);

@ -140,7 +140,6 @@ public:
return NMAX; return NMAX;
} }
static int getNMAX(Lattice<iImplClover<vComplexD2>> &t, RealD R) {return getNMAX(1e-12,R);}
static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);} static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);} static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
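// A hedged reading of the overloads above: getNMAX(prec, R) presumably
// returns the lowest Taylor order N whose truncation error on the clover
// exponential, of order R^(N+1)/(N+1)!, falls below prec, so single
// precision (1e-6) truncates earlier than double precision (1e-12).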

@ -1,291 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/DWFSlow.h
Copyright (C) 2022
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
template <class Impl>
class DWFSlowFermion : public FermionOperator<Impl>
{
public:
INHERIT_IMPL_TYPES(Impl);
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////
GridBase *GaugeGrid(void) { return _grid4; }
GridBase *GaugeRedBlackGrid(void) { return _cbgrid4; }
GridBase *FermionGrid(void) { return _grid; }
GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
//////////////////////////////////////////////////////////////////
// override multiply; passing a dagger argument cuts the number of routines
// and also makes the interface more uniformly consistent
//////////////////////////////////////////////////////////////////
virtual void M(const FermionField &in, FermionField &out)
{
FermionField tmp(_grid);
out = (5.0 - M5) * in;
Dhop(in,tmp,DaggerNo);
out = out + tmp;
}
virtual void Mdag(const FermionField &in, FermionField &out)
{
FermionField tmp(_grid);
out = (5.0 - M5) * in;
Dhop(in,tmp,DaggerYes);
out = out + tmp;
};
/////////////////////////////////////////////////////////
// half checkerboard operations; 5D redblack so just site identity
/////////////////////////////////////////////////////////
void Meooe(const FermionField &in, FermionField &out)
{
if ( in.Checkerboard() == Odd ) {
this->DhopEO(in,out,DaggerNo);
} else {
this->DhopOE(in,out,DaggerNo);
}
}
void MeooeDag(const FermionField &in, FermionField &out)
{
if ( in.Checkerboard() == Odd ) {
this->DhopEO(in,out,DaggerYes);
} else {
this->DhopOE(in,out,DaggerYes);
}
};
// allow override for twisted mass and clover
virtual void Mooee(const FermionField &in, FermionField &out)
{
out = (5.0 - M5) * in;
}
virtual void MooeeDag(const FermionField &in, FermionField &out)
{
out = (5.0 - M5) * in;
}
virtual void MooeeInv(const FermionField &in, FermionField &out)
{
out = (1.0/(5.0 - M5)) * in;
};
virtual void MooeeInvDag(const FermionField &in, FermionField &out)
{
out = (1.0/(5.0 - M5)) * in;
};
virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _mass,std::vector<double> twist) {} ;
////////////////////////
// Derivative interface
////////////////////////
// Interface calls an internal routine
void DhopDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag) { assert(0);};
void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){ assert(0);};
void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){ assert(0);};
///////////////////////////////////////////////////////////////
// non-hermitian hopping term; half cb or both
///////////////////////////////////////////////////////////////
void Dhop(const FermionField &in, FermionField &out, int dag)
{
FermionField tmp(in.Grid());
Dhop5(in,out,MassField,MassField,dag );
for(int mu=0;mu<4;mu++){
DhopDirU(in,Umu[mu],Umu[mu],tmp,mu,dag ); out = out + tmp;
}
};
void DhopOE(const FermionField &in, FermionField &out, int dag)
{
FermionField tmp(in.Grid());
assert(in.Checkerboard()==Even);
Dhop5(in,out,MassFieldOdd,MassFieldEven,dag);
for(int mu=0;mu<4;mu++){
DhopDirU(in,UmuOdd[mu],UmuEven[mu],tmp,mu,dag ); out = out + tmp;
}
};
void DhopEO(const FermionField &in, FermionField &out, int dag)
{
FermionField tmp(in.Grid());
assert(in.Checkerboard()==Odd);
Dhop5(in,out, MassFieldEven,MassFieldOdd ,dag );
for(int mu=0;mu<4;mu++){
DhopDirU(in,UmuEven[mu],UmuOdd[mu],tmp,mu,dag ); out = out + tmp;
}
};
///////////////////////////////////////////////////////////////
// Multigrid assistance; force term uses too
///////////////////////////////////////////////////////////////
void Mdir(const FermionField &in, FermionField &out, int dir, int disp){ assert(0);};
void MdirAll(const FermionField &in, std::vector<FermionField> &out) { assert(0);};
void DhopDir(const FermionField &in, FermionField &out, int dir, int disp) { assert(0);};
void DhopDirAll(const FermionField &in, std::vector<FermionField> &out) { assert(0);};
void DhopDirCalc(const FermionField &in, FermionField &out, int dirdisp,int gamma, int dag) { assert(0);};
void DhopDirU(const FermionField &in, const GaugeLinkField &U5e, const GaugeLinkField &U5o, FermionField &out, int mu, int dag)
{
RealD sgn= 1.0;
if (dag ) sgn=-1.0;
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
// mass is 1,1,1,1,-m and has to multiply the around-the-world term
FermionField tmp (in.Grid());
tmp = U5e * Cshift(in,mu+1,1);
out = tmp - Gamma(Gmu[mu])*tmp*sgn;
tmp = Cshift(adj(U5o)*in,mu+1,-1);
out = out + tmp + Gamma(Gmu[mu])*tmp*sgn;
out = -0.5*out;
};
void Dhop5(const FermionField &in, FermionField &out, ComplexField &massE, ComplexField &massO, int dag)
{
// Mass term.... must multiply the around-the-world hop with mass = 1,1,1,1, -m
RealD sgn= 1.0;
if (dag ) sgn=-1.0;
Gamma G5(Gamma::Algebra::Gamma5);
FermionField tmp (in.Grid());
tmp = massE*Cshift(in,0,1);
out = tmp - G5*tmp*sgn;
tmp = Cshift(massO*in,0,-1);
out = out + tmp + G5*tmp*sgn;
out = -0.5*out;
};
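// In formulae (a reading of the loop above, with sgn = +1 for DaggerNo,
// sgn = -1 for DaggerYes, and P(-/+) = (1 -/+ gamma5)/2):
//
//   Dhop5 psi(s) = - [ massE(s) P- psi(s+1) + massO(s-1) P+ psi(s-1) ]
//
// with the projectors swapped under the dagger; massE/massO are 1 on bulk
// s-slices and -mass on the hop that wraps the s = Ls-1 boundary.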
// Constructor
DWFSlowFermion(GaugeField &_Umu, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass, RealD _M5)
:
_grid(&Fgrid),
_cbgrid(&Hgrid),
_grid4(_Umu.Grid()),
Umu(Nd,&Fgrid),
UmuEven(Nd,&Hgrid),
UmuOdd(Nd,&Hgrid),
MassField(&Fgrid),
MassFieldEven(&Hgrid),
MassFieldOdd(&Hgrid),
M5(_M5),
mass(_mass),
_tmp(&Hgrid)
{
Ls=Fgrid._fdimensions[0];
ImportGauge(_Umu);
typedef typename FermionField::scalar_type scalar;
Lattice<iScalar<vInteger> > coor(&Fgrid);
LatticeCoordinate(coor, 0); // Scoor
ComplexField one(&Fgrid);
MassField =scalar(-mass);
one =scalar(1.0);
MassField =where(coor==Integer(Ls-1),MassField,one);
for(int mu=0;mu<Nd;mu++){
pickCheckerboard(Even,UmuEven[mu],Umu[mu]);
pickCheckerboard(Odd ,UmuOdd[mu],Umu[mu]);
}
pickCheckerboard(Even,MassFieldEven,MassField);
pickCheckerboard(Odd ,MassFieldOdd,MassField);
}
// DoubleStore impl dependent
void ImportGauge(const GaugeField &_Umu4)
{
GaugeLinkField U4(_grid4);
for(int mu=0;mu<Nd;mu++){
U4 = PeekIndex<LorentzIndex>(_Umu4, mu);
for(int s=0;s<this->Ls;s++){
InsertSlice(U4,Umu[mu],s,0);
}
}
}
///////////////////////////////////////////////////////////////
// Data members require to support the functionality
///////////////////////////////////////////////////////////////
public:
virtual RealD Mass(void) { return mass; }
virtual int isTrivialEE(void) { return 1; };
RealD mass;
RealD M5;
int Ls;
GridBase *_grid4;
GridBase *_grid;
GridBase *_cbgrid4;
GridBase *_cbgrid;
// Copy of the gauge field , with even and odd subsets
std::vector<GaugeLinkField> Umu;
std::vector<GaugeLinkField> UmuEven;
std::vector<GaugeLinkField> UmuOdd;
ComplexField MassField;
ComplexField MassFieldEven;
ComplexField MassFieldOdd;
///////////////////////////////////////////////////////////////
// Conserved current utilities
///////////////////////////////////////////////////////////////
void ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
PropagatorField &phys_src,
Current curr_type,
unsigned int mu){}
void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
PropagatorField &phys_src,
Current curr_type,
unsigned int mu,
unsigned int tmin,
unsigned int tmax,
ComplexField &lattice_cmplx){}
};
typedef DWFSlowFermion<WilsonImplF> DWFSlowFermionF;
typedef DWFSlowFermion<WilsonImplD> DWFSlowFermionD;
NAMESPACE_END(Grid);
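// A hedged, self-contained driver sketch for the reference operator above;
// demo_dwf_slow is a hypothetical helper and all numbers are illustrative.
void demo_dwf_slow(void) {
  const int Ls = 8;
  GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
      Coordinate({8,8,8,8}), GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi());
  GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
  GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);

  GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(std::vector<int>{1,2,3,4});
  GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(std::vector<int>{5,6,7,8});

  LatticeGaugeFieldD Umu(UGrid);
  SU<Nc>::HotConfiguration(RNG4, Umu);               // random thin links

  DWFSlowFermionD Ddwf(Umu, *FGrid, *FrbGrid, 0.01 /*mass*/, 1.8 /*M5*/);
  LatticeFermionD src(FGrid), res(FGrid);
  gaussian(RNG5, src);
  Ddwf.M(src, res);                                  // apply the full operator
}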

@ -47,7 +47,6 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
//////////////////////////////////////////// ////////////////////////////////////////////
// Fermion operators / actions // Fermion operators / actions
//////////////////////////////////////////// ////////////////////////////////////////////
#include <Grid/qcd/action/fermion/DWFSlow.h> // Slow DWF
#include <Grid/qcd/action/fermion/WilsonFermion.h> // 4d wilson like #include <Grid/qcd/action/fermion/WilsonFermion.h> // 4d wilson like
NAMESPACE_CHECK(Wilson); NAMESPACE_CHECK(Wilson);
@ -113,31 +112,28 @@ NAMESPACE_CHECK(DWFutils);
// Cayley 5d // Cayley 5d
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
typedef WilsonFermion<WilsonImplD2> WilsonFermionD2; typedef WilsonFermion<WilsonImplR> WilsonFermionR;
typedef WilsonFermion<WilsonImplF> WilsonFermionF; typedef WilsonFermion<WilsonImplF> WilsonFermionF;
typedef WilsonFermion<WilsonImplD> WilsonFermionD; typedef WilsonFermion<WilsonImplD> WilsonFermionD;
//typedef WilsonFermion<WilsonImplRL> WilsonFermionRL;
//typedef WilsonFermion<WilsonImplFH> WilsonFermionFH;
//typedef WilsonFermion<WilsonImplDF> WilsonFermionDF;
typedef WilsonFermion<WilsonAdjImplR> WilsonAdjFermionR;
typedef WilsonFermion<WilsonAdjImplF> WilsonAdjFermionF; typedef WilsonFermion<WilsonAdjImplF> WilsonAdjFermionF;
typedef WilsonFermion<WilsonAdjImplD> WilsonAdjFermionD; typedef WilsonFermion<WilsonAdjImplD> WilsonAdjFermionD;
typedef WilsonFermion<WilsonTwoIndexSymmetricImplR> WilsonTwoIndexSymmetricFermionR;
typedef WilsonFermion<WilsonTwoIndexSymmetricImplF> WilsonTwoIndexSymmetricFermionF; typedef WilsonFermion<WilsonTwoIndexSymmetricImplF> WilsonTwoIndexSymmetricFermionF;
typedef WilsonFermion<WilsonTwoIndexSymmetricImplD> WilsonTwoIndexSymmetricFermionD; typedef WilsonFermion<WilsonTwoIndexSymmetricImplD> WilsonTwoIndexSymmetricFermionD;
typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonTwoIndexAntiSymmetricFermionR;
typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonTwoIndexAntiSymmetricFermionF; typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonTwoIndexAntiSymmetricFermionF;
typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonTwoIndexAntiSymmetricFermionD; typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonTwoIndexAntiSymmetricFermionD;
// Sp(2n)
typedef WilsonFermion<SpWilsonImplF> SpWilsonFermionF;
typedef WilsonFermion<SpWilsonImplD> SpWilsonFermionD;
typedef WilsonFermion<SpWilsonTwoIndexAntiSymmetricImplF> SpWilsonTwoIndexAntiSymmetricFermionF;
typedef WilsonFermion<SpWilsonTwoIndexAntiSymmetricImplD> SpWilsonTwoIndexAntiSymmetricFermionD;
typedef WilsonFermion<SpWilsonTwoIndexSymmetricImplF> SpWilsonTwoIndexSymmetricFermionF;
typedef WilsonFermion<SpWilsonTwoIndexSymmetricImplD> SpWilsonTwoIndexSymmetricFermionD;
// Twisted mass fermion // Twisted mass fermion
typedef WilsonTMFermion<WilsonImplD2> WilsonTMFermionD2; typedef WilsonTMFermion<WilsonImplR> WilsonTMFermionR;
typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF; typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD; typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
@ -145,20 +141,23 @@ typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
template <typename WImpl> using WilsonClover = WilsonCloverFermion<WImpl, CloverHelpers<WImpl>>; template <typename WImpl> using WilsonClover = WilsonCloverFermion<WImpl, CloverHelpers<WImpl>>;
template <typename WImpl> using WilsonExpClover = WilsonCloverFermion<WImpl, ExpCloverHelpers<WImpl>>; template <typename WImpl> using WilsonExpClover = WilsonCloverFermion<WImpl, ExpCloverHelpers<WImpl>>;
typedef WilsonClover<WilsonImplD2> WilsonCloverFermionD2; typedef WilsonClover<WilsonImplR> WilsonCloverFermionR;
typedef WilsonClover<WilsonImplF> WilsonCloverFermionF; typedef WilsonClover<WilsonImplF> WilsonCloverFermionF;
typedef WilsonClover<WilsonImplD> WilsonCloverFermionD; typedef WilsonClover<WilsonImplD> WilsonCloverFermionD;
typedef WilsonExpClover<WilsonImplD2> WilsonExpCloverFermionD2; typedef WilsonExpClover<WilsonImplR> WilsonExpCloverFermionR;
typedef WilsonExpClover<WilsonImplF> WilsonExpCloverFermionF; typedef WilsonExpClover<WilsonImplF> WilsonExpCloverFermionF;
typedef WilsonExpClover<WilsonImplD> WilsonExpCloverFermionD; typedef WilsonExpClover<WilsonImplD> WilsonExpCloverFermionD;
typedef WilsonClover<WilsonAdjImplR> WilsonCloverAdjFermionR;
typedef WilsonClover<WilsonAdjImplF> WilsonCloverAdjFermionF; typedef WilsonClover<WilsonAdjImplF> WilsonCloverAdjFermionF;
typedef WilsonClover<WilsonAdjImplD> WilsonCloverAdjFermionD; typedef WilsonClover<WilsonAdjImplD> WilsonCloverAdjFermionD;
typedef WilsonClover<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
typedef WilsonClover<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF; typedef WilsonClover<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
typedef WilsonClover<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD; typedef WilsonClover<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF; typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD; typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
@ -166,108 +165,161 @@ typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiS
template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>; template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>; template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
typedef CompactWilsonClover<WilsonImplD2> CompactWilsonCloverFermionD2; typedef CompactWilsonClover<WilsonImplR> CompactWilsonCloverFermionR;
typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF; typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD; typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
typedef CompactWilsonExpClover<WilsonImplD2> CompactWilsonExpCloverFermionD2; typedef CompactWilsonExpClover<WilsonImplR> CompactWilsonExpCloverFermionR;
typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF; typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD; typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
typedef CompactWilsonClover<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
typedef CompactWilsonClover<WilsonAdjImplF> CompactWilsonCloverAdjFermionF; typedef CompactWilsonClover<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
typedef CompactWilsonClover<WilsonAdjImplD> CompactWilsonCloverAdjFermionD; typedef CompactWilsonClover<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF; typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD; typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF; typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD; typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
// Domain Wall fermions // Domain Wall fermions
typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
typedef DomainWallFermion<WilsonImplF> DomainWallFermionF; typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
typedef DomainWallFermion<WilsonImplD> DomainWallFermionD; typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
typedef DomainWallFermion<WilsonImplD2> DomainWallFermionD2;
typedef DomainWallEOFAFermion<WilsonImplD2> DomainWallEOFAFermionD2; //typedef DomainWallFermion<WilsonImplRL> DomainWallFermionRL;
//typedef DomainWallFermion<WilsonImplFH> DomainWallFermionFH;
//typedef DomainWallFermion<WilsonImplDF> DomainWallFermionDF;
typedef DomainWallEOFAFermion<WilsonImplR> DomainWallEOFAFermionR;
typedef DomainWallEOFAFermion<WilsonImplF> DomainWallEOFAFermionF; typedef DomainWallEOFAFermion<WilsonImplF> DomainWallEOFAFermionF;
typedef DomainWallEOFAFermion<WilsonImplD> DomainWallEOFAFermionD; typedef DomainWallEOFAFermion<WilsonImplD> DomainWallEOFAFermionD;
typedef MobiusFermion<WilsonImplD2> MobiusFermionD2; //typedef DomainWallEOFAFermion<WilsonImplRL> DomainWallEOFAFermionRL;
//typedef DomainWallEOFAFermion<WilsonImplFH> DomainWallEOFAFermionFH;
//typedef DomainWallEOFAFermion<WilsonImplDF> DomainWallEOFAFermionDF;
typedef MobiusFermion<WilsonImplR> MobiusFermionR;
typedef MobiusFermion<WilsonImplF> MobiusFermionF; typedef MobiusFermion<WilsonImplF> MobiusFermionF;
typedef MobiusFermion<WilsonImplD> MobiusFermionD; typedef MobiusFermion<WilsonImplD> MobiusFermionD;
typedef MobiusEOFAFermion<WilsonImplD2> MobiusEOFAFermionD2; //typedef MobiusFermion<WilsonImplRL> MobiusFermionRL;
//typedef MobiusFermion<WilsonImplFH> MobiusFermionFH;
//typedef MobiusFermion<WilsonImplDF> MobiusFermionDF;
typedef MobiusEOFAFermion<WilsonImplR> MobiusEOFAFermionR;
typedef MobiusEOFAFermion<WilsonImplF> MobiusEOFAFermionF; typedef MobiusEOFAFermion<WilsonImplF> MobiusEOFAFermionF;
typedef MobiusEOFAFermion<WilsonImplD> MobiusEOFAFermionD; typedef MobiusEOFAFermion<WilsonImplD> MobiusEOFAFermionD;
typedef ZMobiusFermion<ZWilsonImplD2> ZMobiusFermionD2; //typedef MobiusEOFAFermion<WilsonImplRL> MobiusEOFAFermionRL;
//typedef MobiusEOFAFermion<WilsonImplFH> MobiusEOFAFermionFH;
//typedef MobiusEOFAFermion<WilsonImplDF> MobiusEOFAFermionDF;
typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF; typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD; typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
typedef ScaledShamirFermion<WilsonImplD2> ScaledShamirFermionD2; //typedef ZMobiusFermion<ZWilsonImplRL> ZMobiusFermionRL;
//typedef ZMobiusFermion<ZWilsonImplFH> ZMobiusFermionFH;
//typedef ZMobiusFermion<ZWilsonImplDF> ZMobiusFermionDF;
// Ls vectorised
typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF; typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD; typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
typedef MobiusZolotarevFermion<WilsonImplD2> MobiusZolotarevFermionD2; typedef MobiusZolotarevFermion<WilsonImplR> MobiusZolotarevFermionR;
typedef MobiusZolotarevFermion<WilsonImplF> MobiusZolotarevFermionF; typedef MobiusZolotarevFermion<WilsonImplF> MobiusZolotarevFermionF;
typedef MobiusZolotarevFermion<WilsonImplD> MobiusZolotarevFermionD; typedef MobiusZolotarevFermion<WilsonImplD> MobiusZolotarevFermionD;
typedef ShamirZolotarevFermion<WilsonImplD2> ShamirZolotarevFermionD2; typedef ShamirZolotarevFermion<WilsonImplR> ShamirZolotarevFermionR;
typedef ShamirZolotarevFermion<WilsonImplF> ShamirZolotarevFermionF; typedef ShamirZolotarevFermion<WilsonImplF> ShamirZolotarevFermionF;
typedef ShamirZolotarevFermion<WilsonImplD> ShamirZolotarevFermionD; typedef ShamirZolotarevFermion<WilsonImplD> ShamirZolotarevFermionD;
typedef OverlapWilsonCayleyTanhFermion<WilsonImplD2> OverlapWilsonCayleyTanhFermionD2; typedef OverlapWilsonCayleyTanhFermion<WilsonImplR> OverlapWilsonCayleyTanhFermionR;
typedef OverlapWilsonCayleyTanhFermion<WilsonImplF> OverlapWilsonCayleyTanhFermionF; typedef OverlapWilsonCayleyTanhFermion<WilsonImplF> OverlapWilsonCayleyTanhFermionF;
typedef OverlapWilsonCayleyTanhFermion<WilsonImplD> OverlapWilsonCayleyTanhFermionD; typedef OverlapWilsonCayleyTanhFermion<WilsonImplD> OverlapWilsonCayleyTanhFermionD;
typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplD2> OverlapWilsonCayleyZolotarevFermionD2; typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplR> OverlapWilsonCayleyZolotarevFermionR;
typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplF> OverlapWilsonCayleyZolotarevFermionF; typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplF> OverlapWilsonCayleyZolotarevFermionF;
typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplD> OverlapWilsonCayleyZolotarevFermionD; typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplD> OverlapWilsonCayleyZolotarevFermionD;
// Continued fraction // Continued fraction
typedef OverlapWilsonContFracTanhFermion<WilsonImplD2> OverlapWilsonContFracTanhFermionD2; typedef OverlapWilsonContFracTanhFermion<WilsonImplR> OverlapWilsonContFracTanhFermionR;
typedef OverlapWilsonContFracTanhFermion<WilsonImplF> OverlapWilsonContFracTanhFermionF; typedef OverlapWilsonContFracTanhFermion<WilsonImplF> OverlapWilsonContFracTanhFermionF;
typedef OverlapWilsonContFracTanhFermion<WilsonImplD> OverlapWilsonContFracTanhFermionD; typedef OverlapWilsonContFracTanhFermion<WilsonImplD> OverlapWilsonContFracTanhFermionD;
typedef OverlapWilsonContFracZolotarevFermion<WilsonImplD2> OverlapWilsonContFracZolotarevFermionD2; typedef OverlapWilsonContFracZolotarevFermion<WilsonImplR> OverlapWilsonContFracZolotarevFermionR;
typedef OverlapWilsonContFracZolotarevFermion<WilsonImplF> OverlapWilsonContFracZolotarevFermionF; typedef OverlapWilsonContFracZolotarevFermion<WilsonImplF> OverlapWilsonContFracZolotarevFermionF;
typedef OverlapWilsonContFracZolotarevFermion<WilsonImplD> OverlapWilsonContFracZolotarevFermionD; typedef OverlapWilsonContFracZolotarevFermion<WilsonImplD> OverlapWilsonContFracZolotarevFermionD;
// Partial fraction // Partial fraction
typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplD2> OverlapWilsonPartialFractionTanhFermionD2; typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplR> OverlapWilsonPartialFractionTanhFermionR;
typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplF> OverlapWilsonPartialFractionTanhFermionF; typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplF> OverlapWilsonPartialFractionTanhFermionF;
typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplD> OverlapWilsonPartialFractionTanhFermionD; typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplD> OverlapWilsonPartialFractionTanhFermionD;
typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplD2> OverlapWilsonPartialFractionZolotarevFermionD2; typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplR> OverlapWilsonPartialFractionZolotarevFermionR;
typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplF> OverlapWilsonPartialFractionZolotarevFermionF; typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplF> OverlapWilsonPartialFractionZolotarevFermionF;
typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplD> OverlapWilsonPartialFractionZolotarevFermionD; typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplD> OverlapWilsonPartialFractionZolotarevFermionD;
// Gparity cases; partial list until tested // Gparity cases; partial list until tested
typedef WilsonFermion<GparityWilsonImplR> GparityWilsonFermionR;
typedef WilsonFermion<GparityWilsonImplF> GparityWilsonFermionF; typedef WilsonFermion<GparityWilsonImplF> GparityWilsonFermionF;
typedef WilsonFermion<GparityWilsonImplD> GparityWilsonFermionD; typedef WilsonFermion<GparityWilsonImplD> GparityWilsonFermionD;
//typedef WilsonFermion<GparityWilsonImplRL> GparityWilsonFermionRL;
//typedef WilsonFermion<GparityWilsonImplFH> GparityWilsonFermionFH;
//typedef WilsonFermion<GparityWilsonImplDF> GparityWilsonFermionDF;
typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF; typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD; typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;
typedef DomainWallEOFAFermion<GparityWilsonImplR> GparityDomainWallEOFAFermionD2; //typedef DomainWallFermion<GparityWilsonImplRL> GparityDomainWallFermionRL;
//typedef DomainWallFermion<GparityWilsonImplFH> GparityDomainWallFermionFH;
//typedef DomainWallFermion<GparityWilsonImplDF> GparityDomainWallFermionDF;
typedef DomainWallEOFAFermion<GparityWilsonImplR> GparityDomainWallEOFAFermionR;
typedef DomainWallEOFAFermion<GparityWilsonImplF> GparityDomainWallEOFAFermionF; typedef DomainWallEOFAFermion<GparityWilsonImplF> GparityDomainWallEOFAFermionF;
typedef DomainWallEOFAFermion<GparityWilsonImplD> GparityDomainWallEOFAFermionD; typedef DomainWallEOFAFermion<GparityWilsonImplD> GparityDomainWallEOFAFermionD;
typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionD2; //typedef DomainWallEOFAFermion<GparityWilsonImplRL> GparityDomainWallEOFAFermionRL;
//typedef DomainWallEOFAFermion<GparityWilsonImplFH> GparityDomainWallEOFAFermionFH;
//typedef DomainWallEOFAFermion<GparityWilsonImplDF> GparityDomainWallEOFAFermionDF;
typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF; typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD; typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;
typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionD2; //typedef WilsonTMFermion<GparityWilsonImplRL> GparityWilsonTMFermionRL;
//typedef WilsonTMFermion<GparityWilsonImplFH> GparityWilsonTMFermionFH;
//typedef WilsonTMFermion<GparityWilsonImplDF> GparityWilsonTMFermionDF;
typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionR;
typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF; typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD; typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
typedef MobiusEOFAFermion<GparityWilsonImplR> GparityMobiusEOFAFermionD2; //typedef MobiusFermion<GparityWilsonImplRL> GparityMobiusFermionRL;
//typedef MobiusFermion<GparityWilsonImplFH> GparityMobiusFermionFH;
//typedef MobiusFermion<GparityWilsonImplDF> GparityMobiusFermionDF;
typedef MobiusEOFAFermion<GparityWilsonImplR> GparityMobiusEOFAFermionR;
typedef MobiusEOFAFermion<GparityWilsonImplF> GparityMobiusEOFAFermionF; typedef MobiusEOFAFermion<GparityWilsonImplF> GparityMobiusEOFAFermionF;
typedef MobiusEOFAFermion<GparityWilsonImplD> GparityMobiusEOFAFermionD; typedef MobiusEOFAFermion<GparityWilsonImplD> GparityMobiusEOFAFermionD;
//typedef MobiusEOFAFermion<GparityWilsonImplRL> GparityMobiusEOFAFermionRL;
//typedef MobiusEOFAFermion<GparityWilsonImplFH> GparityMobiusEOFAFermionFH;
//typedef MobiusEOFAFermion<GparityWilsonImplDF> GparityMobiusEOFAFermionDF;
typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF; typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD; typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;
typedef NaiveStaggeredFermion<StaggeredImplR> NaiveStaggeredFermionR;
typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF; typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD; typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF; typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD; typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;

@ -49,8 +49,6 @@ public:
virtual FermionField &tmp(void) = 0; virtual FermionField &tmp(void) = 0;
virtual void DirichletBlock(const Coordinate & _Block) { assert(0); };
GridBase * Grid(void) { return FermionGrid(); }; // this is all the linalg routines need to know GridBase * Grid(void) { return FermionGrid(); }; // this is all the linalg routines need to know
GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); }; GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };

@ -30,18 +30,6 @@ directory
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
/*
Policy implementation for G-parity boundary conditions
Rather than treating the gauge field as a flavored field, the Grid implementation of G-parity treats the gauge field as a regular
field with complex conjugate boundary conditions. In order to ensure the second flavor interacts with the conjugate links and the first
with the regular links we overload the functionality of doubleStore, whose purpose is to store the gauge field and the barrel-shifted gauge field
to avoid communicating links when applying the Dirac operator, such that the double-stored field contains also a flavor index which maps to
either the link or the conjugate link. This flavored field is then used by multLink to apply the correct link to a spinor.
Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction.
mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
*/
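/* In the usual conventions (stated for orientation; assumed, not taken from
   this source), crossing a twisted boundary of extent L_mu acts as
       psi(x + L_mu) = C psibar^T(x),   U_mu(x + L_mu) = U_mu^*(x),
   which is why the second flavour of the doubled field must be multiplied
   by conjugated links. */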
template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal> template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > { class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
public: public:
@ -125,7 +113,7 @@ public:
|| ((distance== 1)&&(icoor[direction]==1)) || ((distance== 1)&&(icoor[direction]==1))
|| ((distance==-1)&&(icoor[direction]==0)); || ((distance==-1)&&(icoor[direction]==0));
permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu] && mmu < Nd-1; //only if we are going around the world in a spatial direction permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world
//Apply the links //Apply the links
int f_upper = permute_lane ? 1 : 0; int f_upper = permute_lane ? 1 : 0;
@ -151,10 +139,10 @@ public:
assert((distance == 1) || (distance == -1)); // nearest neighbour stencil hard code assert((distance == 1) || (distance == -1)); // nearest neighbour stencil hard code
assert((sl == 1) || (sl == 2)); assert((sl == 1) || (sl == 2));
//If this site is a global boundary site, perform the G-parity flavor twist if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
if ( mmu < Nd-1 && SE->_around_the_world && St.parameters.twists[mmu] ) {
if ( sl == 2 ) { if ( sl == 2 ) {
//Only do the twist for lanes on the edge of the physical node
ExtractBuffer<sobj> vals(Nsimd); ExtractBuffer<sobj> vals(Nsimd);
extract(chi,vals); extract(chi,vals);
@ -209,19 +197,6 @@ public:
reg = memory; reg = memory;
} }
//Poke 'poke_f0' onto flavor 0 and 'poke_f1' onto flavor 1 in direction mu of the doubled gauge field Uds
inline void pokeGparityDoubledGaugeField(DoubledGaugeField &Uds, const GaugeLinkField &poke_f0, const GaugeLinkField &poke_f1, const int mu){
autoView(poke_f0_v, poke_f0, CpuRead);
autoView(poke_f1_v, poke_f1, CpuRead);
autoView(Uds_v, Uds, CpuWrite);
thread_foreach(ss,poke_f0_v,{
Uds_v[ss](0)(mu) = poke_f0_v[ss]();
Uds_v[ss](1)(mu) = poke_f1_v[ss]();
});
}
inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
{ {
conformable(Uds.Grid(),GaugeGrid); conformable(Uds.Grid(),GaugeGrid);
@ -232,19 +207,14 @@ public:
GaugeLinkField Uconj(GaugeGrid); GaugeLinkField Uconj(GaugeGrid);
Lattice<iScalar<vInteger> > coor(GaugeGrid); Lattice<iScalar<vInteger> > coor(GaugeGrid);
//Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. for(int mu=0;mu<Nd;mu++){
//mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
for(int mu=0;mu<Nd-1;mu++){ LatticeCoordinate(coor,mu);
if( Params.twists[mu] ){
LatticeCoordinate(coor,mu);
}
U = PeekIndex<LorentzIndex>(Umu,mu); U = PeekIndex<LorentzIndex>(Umu,mu);
Uconj = conjugate(U); Uconj = conjugate(U);
// Implement the isospin rotation sign on the boundary between f=1 and f=0
// This phase could come from a simple bc 1,1,-1,1 .. // This phase could come from a simple bc 1,1,-1,1 ..
int neglink = GaugeGrid->GlobalDimensions()[mu]-1; int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
if ( Params.twists[mu] ) { if ( Params.twists[mu] ) {
@ -259,7 +229,7 @@ public:
thread_foreach(ss,U_v,{ thread_foreach(ss,U_v,{
Uds_v[ss](0)(mu) = U_v[ss](); Uds_v[ss](0)(mu) = U_v[ss]();
Uds_v[ss](1)(mu) = Uconj_v[ss](); Uds_v[ss](1)(mu) = Uconj_v[ss]();
}); });
} }
U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary
@ -290,38 +260,6 @@ public:
}); });
} }
} }
{ //periodic / antiperiodic temporal BCs
int mu = Nd-1;
int L = GaugeGrid->GlobalDimensions()[mu];
int Lmu = L - 1;
LatticeCoordinate(coor, mu);
U = PeekIndex<LorentzIndex>(Umu, mu); //Get t-directed links
GaugeLinkField *Upoke = &U;
if(Params.twists[mu]){ //antiperiodic
Utmp = where(coor == Lmu, -U, U);
Upoke = &Utmp;
}
Uconj = conjugate(*Upoke); //second flavor interacts with conjugate links
pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu);
//Get the barrel-shifted field
Utmp = adj(Cshift(U, mu, -1)); //is a forward shift!
Upoke = &Utmp;
if(Params.twists[mu]){
U = where(coor == 0, -Utmp, Utmp); //boundary phase
Upoke = &U;
}
Uconj = conjugate(*Upoke);
pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4);
}
} }
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) { inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
@ -360,48 +298,28 @@ public:
inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){ inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
assert(0); assert(0);
} }
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
int Ls=Btilde.Grid()->_fdimensions[0];
{
GridBase *GaugeGrid = mat.Grid();
Lattice<iScalar<vInteger> > coor(GaugeGrid);
if( Params.twists[mu] ){
LatticeCoordinate(coor,mu);
}
autoView( mat_v , mat, AcceleratorWrite);
autoView( Btilde_v , Btilde, AcceleratorRead);
autoView( Atilde_v , Atilde, AcceleratorRead);
accelerator_for(sss,mat.Grid()->oSites(), FermionField::vector_type::Nsimd(),{
int sU=sss;
typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
ColorMatrixType sum;
zeroit(sum);
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
for(int spn=0;spn<Ns;spn++){ //sum over spin
//Flavor 0
auto bb = coalescedRead(Btilde_v[sF](0)(spn) ); //color vector
auto aa = coalescedRead(Atilde_v[sF](0)(spn) );
sum = sum + outerProduct(bb,aa);
//Flavor 1
bb = coalescedRead(Btilde_v[sF](1)(spn) );
aa = coalescedRead(Atilde_v[sF](1)(spn) );
sum = sum + conjugate(outerProduct(bb,aa));
}
}
coalescedWrite(mat_v[sU](mu)(), sum);
});
}
}
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
int Ls = Btilde.Grid()->_fdimensions[0];
GaugeLinkField tmp(mat.Grid());
tmp = Zero();
{
autoView( tmp_v , tmp, CpuWrite);
autoView( Atilde_v , Atilde, CpuRead);
autoView( Btilde_v , Btilde, CpuRead);
thread_for(ss,tmp.Grid()->oSites(),{
for (int s = 0; s < Ls; s++) {
int sF = s + Ls * ss;
auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
}
});
}
PokeIndex<LorentzIndex>(mat, tmp, mu);
return;
}
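  // In both variants above the per-link force accumulates, schematically,
  //   sum_{s,spin} [ outer(Btilde_f0, Atilde_f0) + conj( outer(Btilde_f1, Atilde_f1) ) ]
  // i.e. the second flavor contributes the complex conjugate, consistent
  // with it propagating on the conjugated links.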
}; };

View File

@ -47,6 +47,18 @@ public:
FermionField _tmp; FermionField _tmp;
FermionField &tmp(void) { return _tmp; } FermionField &tmp(void) { return _tmp; }
////////////////////////////////////////
// Performance monitoring
////////////////////////////////////////
void Report(void);
void ZeroCounters(void);
double DhopTotalTime;
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Implement the abstract base // Implement the abstract base
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////

View File

@ -52,6 +52,18 @@ public:
FermionField _tmp; FermionField _tmp;
FermionField &tmp(void) { return _tmp; } FermionField &tmp(void) { return _tmp; }
////////////////////////////////////////
// Performance monitoring
////////////////////////////////////////
void Report(void);
void ZeroCounters(void);
double DhopTotalTime;
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Implement the abstract base // Implement the abstract base
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////

View File

@ -47,6 +47,18 @@ public:
FermionField _tmp; FermionField _tmp;
FermionField &tmp(void) { return _tmp; } FermionField &tmp(void) { return _tmp; }
////////////////////////////////////////
// Performance monitoring
////////////////////////////////////////
void Report(void);
void ZeroCounters(void);
double DhopTotalTime;
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Implement the abstract base // Implement the abstract base
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////

View File

@ -32,218 +32,17 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////
// Wilson compressor will need FaceGather policies for:
// Periodic, Dirichlet, and partial Dirichlet for DWF
///////////////////////////////////////////////////////////////
const int dwf_compressor_depth=2;
#define DWF_COMPRESS
class FaceGatherPartialDWF
{
public:
#ifdef DWF_COMPRESS
static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
#else
static int PartialCompressionFactor(GridBase *grid) { return 1;}
#endif
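  // Worked example of the factor above, assuming Ls=16 and the default
  // depth of 2: only the outer 2*depth = 4 s-slices of each face are
  // communicated, so PartialCompressionFactor = Ls/(2*depth) = 16/4 = 4,
  // i.e. the off-node comms buffer shrinks by a factor of 4.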
template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
cobj *buffer,
compressor &compress,
int off,int so,int partial)
{
//DWF-only hack: if a direction is off-node we use partial Dirichlet
// Shrinks local and remote comms buffers
GridBase *Grid = rhs.Grid();
int Ls = Grid->_rdimensions[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth=Ls/2;
#endif
std::pair<int,int> *table_v = & table[0];
auto rhs_v = rhs.View(AcceleratorRead);
int vol=table.size()/Ls;
accelerator_forNB( idx,table.size(), vobj::Nsimd(), {
Integer i=idx/Ls;
Integer s=idx%Ls;
Integer sc=depth+s-(Ls-depth);
if(s<depth) compress.Compress(buffer[off+i+s*vol],rhs_v[so+table_v[idx].second]);
if(s>=Ls-depth) compress.Compress(buffer[off+i+sc*vol],rhs_v[so+table_v[idx].second]);
});
rhs_v.ViewClose();
}
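  // Index-mapping sketch for the gather above, assuming Ls=8, depth=2:
  // only s in {0,1,6,7} is gathered; s<depth keeps its slot while the
  // upper boundary is repacked via sc = depth + s - (Ls-depth), giving
  //   s=0 -> slot 0, s=1 -> slot 1, s=6 -> slot 2, s=7 -> slot 3.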
template<class decompressor,class Decompression>
static void DecompressFace(decompressor decompress,Decompression &dd)
{
auto Ls = dd.dims[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth=Ls/2;
#endif
// Just pass in the Grid
auto kp = dd.kernel_p;
auto mp = dd.mpi_p;
int size= dd.buffer_size;
int vol= size/Ls;
accelerator_forNB(o,size,1,{
int idx=o/Ls;
int s=o%Ls;
if ( s < depth ) {
int oo=s*vol+idx;
kp[o]=mp[oo];
} else if ( s >= Ls-depth ) {
int sc = depth + s - (Ls-depth);
int oo=sc*vol+idx;
kp[o]=mp[oo];
} else {
kp[o] = Zero();//fill rest with zero if partial dirichlet
}
});
}
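  // Receive-side counterpart of the mapping above (Ls=8, depth=2): slots
  // 0..1 of the compressed buffer fill s=0,1, slots 2..3 fill s=6,7, and
  // the interior slices s=2..5 are zero-filled -- this zero padding is
  // what makes the boundary partially Dirichlet.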
////////////////////////////////////////////////////////////////////////////////////////////
// Need to gather *interior portions* for ALL s-slices in SIMD directions.
// The gather is done here because SIMD lanes must be treated differently, with zeroes inserted on the receive side.
// Reorder the fifth dim to be s=Ls-1, s=0, s=1, ..., Ls-2.
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
GridBase *Grid = rhs.Grid();
int Ls = Grid->_rdimensions[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth = Ls/2;
#endif
// insertion of zeroes...
assert( (table.size()&0x1)==0);
int num=table.size()/2;
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
auto rhs_v = rhs.View(AcceleratorRead);
auto p0=&pointers[0][0];
auto p1=&pointers[1][0];
auto tp=&table[0];
int nnum=num/Ls;
accelerator_forNB(j, num, vobj::Nsimd(), {
// Reorders both local and remote comms buffers
//
int s = j % Ls;
int sp1 = (s+depth)%Ls; // peri incremented s slice
int hxyz= j/Ls;
int xyz0= hxyz*2; // xyzt part of coor
int xyz1= hxyz*2+1;
int jj= hxyz + sp1*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
int kk0= xyz0*Ls + s ; // s=0 goes to s=1
int kk1= xyz1*Ls + s ; // s=Ls-1 -> s=0
compress.CompressExchange(p0[jj],p1[jj],
rhs_v[so+tp[kk0 ].second], // Same s, consecutive xyz sites
rhs_v[so+tp[kk1 ].second],
type);
});
rhs_v.ViewClose();
}
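  // Reordering sketch for the exchange gather above, assuming Ls=8, depth=2:
  // slice s is written to slot sp1 = (s+depth)%Ls, so the stored order is
  //   s=6, s=7, s=0, s=1, ..., s=5
  // with the upper-boundary slices first, matching the layout expected by
  // MergeFace below.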
// Merge routine is for SIMD faces
template<class decompressor,class Merger>
static void MergeFace(decompressor decompress,Merger &mm)
{
auto Ls = mm.dims[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth = Ls/2;
#endif
int num= mm.buffer_size/2; // relate vol and Ls to buffer size
auto mp = &mm.mpointer[0];
auto vp0= &mm.vpointers[0][0]; // First arg is exchange first
auto vp1= &mm.vpointers[1][0];
auto type= mm.type;
int nnum = num/Ls;
accelerator_forNB(o,num,Merger::Nsimd,{
int s=o%Ls;
int hxyz=o/Ls; // xyzt related component
int xyz0=hxyz*2;
int xyz1=hxyz*2+1;
int sp = (s+depth)%Ls;
int jj= hxyz + sp*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
int oo0= s+xyz0*Ls;
int oo1= s+xyz1*Ls;
// same ss0, ss1 pair goes to new layout
decompress.Exchange(mp[oo0],mp[oo1],vp0[jj],vp1[jj],type);
});
}
};
class FaceGatherDWFMixedBCs
{
public:
#ifdef DWF_COMPRESS
static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
#else
static int PartialCompressionFactor(GridBase *grid) {return 1;}
#endif
template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
cobj *buffer,
compressor &compress,
int off,int so,int partial)
{
// std::cout << " face gather simple DWF partial "<<partial <<std::endl;
if(partial) FaceGatherPartialDWF::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
}
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
// std::cout << " face gather exch DWF partial "<<partial <<std::endl;
if(partial) FaceGatherPartialDWF::Gather_plane_exchange(table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
else FaceGatherSimple::Gather_plane_exchange (table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
}
template<class decompressor,class Merger>
static void MergeFace(decompressor decompress,Merger &mm)
{
int partial = mm.partial;
// std::cout << " merge DWF partial "<<partial <<std::endl;
if ( partial ) FaceGatherPartialDWF::MergeFace(decompress,mm);
else FaceGatherSimple::MergeFace(decompress,mm);
}
template<class decompressor,class Decompression>
static void DecompressFace(decompressor decompress,Decompression &dd)
{
int partial = dd.partial;
// std::cout << " decompress DWF partial "<<partial <<std::endl;
if ( partial ) FaceGatherPartialDWF::DecompressFace(decompress,dd);
else FaceGatherSimple::DecompressFace(decompress,dd);
}
};
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
// optimised versions supporting half precision too??? Deprecate // optimised versions supporting half precision too
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
template<class _HCspinor,class _Hspinor,class _Spinor, class projector,typename SFINAE = void >
class WilsonCompressorTemplate;
//Could make FaceGather a template param, but then behaviour is runtime not compile time
template<class _HCspinor,class _Hspinor,class _Spinor, class projector> template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
class WilsonCompressorTemplate : public FaceGatherDWFMixedBCs class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
// : public FaceGatherSimple typename std::enable_if<std::is_same<_HCspinor,_Hspinor>::value>::type >
{ {
public: public:
@ -280,81 +79,172 @@ public:
/*****************************************************/ /*****************************************************/
/* Exchange includes precision change if mpi data is not same */ /* Exchange includes precision change if mpi data is not same */
/*****************************************************/ /*****************************************************/
accelerator_inline void Exchange(SiteHalfSpinor &mp0, accelerator_inline void Exchange(SiteHalfSpinor *mp,
SiteHalfSpinor &mp1, const SiteHalfSpinor * __restrict__ vp0,
const SiteHalfSpinor & vp0, const SiteHalfSpinor * __restrict__ vp1,
const SiteHalfSpinor & vp1, Integer type,Integer o) const {
Integer type) const {
#ifdef GRID_SIMT #ifdef GRID_SIMT
exchangeSIMT(mp0,mp1,vp0,vp1,type); exchangeSIMT(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type);
#else #else
SiteHalfSpinor tmp1; SiteHalfSpinor tmp1;
SiteHalfSpinor tmp2; SiteHalfSpinor tmp2;
exchange(tmp1,tmp2,vp0,vp1,type); exchange(tmp1,tmp2,vp0[o],vp1[o],type);
vstream(mp0,tmp1); vstream(mp[2*o ],tmp1);
vstream(mp1,tmp2); vstream(mp[2*o+1],tmp2);
#endif #endif
} }
/*****************************************************/ /*****************************************************/
/* Have a decompression step if mpi data is not same */ /* Have a decompression step if mpi data is not same */
/*****************************************************/ /*****************************************************/
accelerator_inline void Decompress(SiteHalfSpinor &out, accelerator_inline void Decompress(SiteHalfSpinor * __restrict__ out,
SiteHalfSpinor &in) const { SiteHalfSpinor * __restrict__ in, Integer o) const {
out = in; assert(0);
} }
/*****************************************************/ /*****************************************************/
/* Compress Exchange */ /* Compress Exchange */
/*****************************************************/ /*****************************************************/
accelerator_inline void CompressExchange(SiteHalfSpinor &out0, accelerator_inline void CompressExchange(SiteHalfSpinor * __restrict__ out0,
SiteHalfSpinor &out1, SiteHalfSpinor * __restrict__ out1,
const SiteSpinor &in0, const SiteSpinor * __restrict__ in,
const SiteSpinor &in1, Integer j,Integer k, Integer m,Integer type) const
Integer type) const
{ {
#ifdef GRID_SIMT #ifdef GRID_SIMT
typedef SiteSpinor vobj; typedef SiteSpinor vobj;
typedef SiteHalfSpinor hvobj; typedef SiteHalfSpinor hvobj;
typedef decltype(coalescedRead(in0)) sobj; typedef decltype(coalescedRead(*in)) sobj;
typedef decltype(coalescedRead(out0)) hsobj; typedef decltype(coalescedRead(*out0)) hsobj;
constexpr unsigned int Nsimd = vobj::Nsimd(); constexpr unsigned int Nsimd = vobj::Nsimd();
unsigned int mask = Nsimd >> (type + 1); unsigned int mask = Nsimd >> (type + 1);
int lane = acceleratorSIMTlane(Nsimd); int lane = acceleratorSIMTlane(Nsimd);
int j0 = lane &(~mask); // inner coor zero int j0 = lane &(~mask); // inner coor zero
int j1 = lane |(mask) ; // inner coor one int j1 = lane |(mask) ; // inner coor one
const vobj *vp0 = &in0; const vobj *vp0 = &in[k]; // out0[j] = merge low bit of type from in[k] and in[m]
const vobj *vp1 = &in1; const vobj *vp1 = &in[m]; // out1[j] = merge hi bit of type from in[k] and in[m]
const vobj *vp = (lane&mask) ? vp1:vp0; const vobj *vp = (lane&mask) ? vp1:vp0;// if my lane has high bit take vp1, low bit take vp0
auto sa = coalescedRead(*vp,j0); auto sa = coalescedRead(*vp,j0); // lane to read for out 0, NB 50% read coalescing
auto sb = coalescedRead(*vp,j1); auto sb = coalescedRead(*vp,j1); // lane to read for out 1
hsobj psa, psb; hsobj psa, psb;
projector::Proj(psa,sa,mu,dag); projector::Proj(psa,sa,mu,dag); // spin project the result0
projector::Proj(psb,sb,mu,dag); projector::Proj(psb,sb,mu,dag); // spin project the result1
coalescedWrite(out0,psa); coalescedWrite(out0[j],psa);
coalescedWrite(out1,psb); coalescedWrite(out1[j],psb);
#else #else
SiteHalfSpinor temp1, temp2; SiteHalfSpinor temp1, temp2;
SiteHalfSpinor temp3, temp4; SiteHalfSpinor temp3, temp4;
projector::Proj(temp1,in0,mu,dag); projector::Proj(temp1,in[k],mu,dag);
projector::Proj(temp2,in1,mu,dag); projector::Proj(temp2,in[m],mu,dag);
exchange(temp3,temp4,temp1,temp2,type); exchange(temp3,temp4,temp1,temp2,type);
vstream(out0,temp3); vstream(out0[j],temp3);
vstream(out1,temp4); vstream(out1[j],temp4);
#endif #endif
} }
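  /* Lane-selection sketch for the SIMT branch above, assuming Nsimd=8 and
     type=0: mask = 8>>1 = 4, so lane 3 computes j0 = 3 & ~4 = 3 and
     j1 = 3 | 4 = 7; each lane therefore pairs with the partner lane four
     apart, the exchange granularity of the coarsest SIMD direction. */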
/*****************************************************/ /*****************************************************/
/* Pass the info to the stencil */ /* Pass the info to the stencil */
/*****************************************************/ /*****************************************************/
accelerator_inline bool DecompressionStep(void) const { accelerator_inline bool DecompressionStep(void) const { return false; }
return false;
}
}; };
#if 0
template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
typename std::enable_if<!std::is_same<_HCspinor,_Hspinor>::value>::type >
{
public:
int mu,dag;
void Point(int p) { mu=p; };
WilsonCompressorTemplate(int _dag=0){
dag = _dag;
}
typedef _Spinor SiteSpinor;
typedef _Hspinor SiteHalfSpinor;
typedef _HCspinor SiteHalfCommSpinor;
typedef typename SiteHalfCommSpinor::vector_type vComplexLow;
typedef typename SiteHalfSpinor::vector_type vComplexHigh;
constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);
accelerator_inline int CommDatumSize(void) const {
return sizeof(SiteHalfCommSpinor);
}
/*****************************************************/
/* Compress includes precision change if mpi data is not same */
/*****************************************************/
accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const {
SiteHalfSpinor hsp;
SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf;
projector::Proj(hsp,in,mu,dag);
precisionChange((vComplexLow *)&hbuf[o],(vComplexHigh *)&hsp,Nw);
}
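  // NB: the Compress overload above is stale -- it references an undefined
  // 'o' and duplicates the signature below -- but the whole template is
  // compiled out by the enclosing #if 0.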
accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const {
#ifdef GRID_SIMT
typedef decltype(coalescedRead(buf)) sobj;
sobj sp;
auto sin = coalescedRead(in);
projector::Proj(sp,sin,mu,dag);
coalescedWrite(buf,sp);
#else
projector::Proj(buf,in,mu,dag);
#endif
}
/*****************************************************/
/* Exchange includes precision change if mpi data is not same */
/*****************************************************/
accelerator_inline void Exchange(SiteHalfSpinor *mp,
SiteHalfSpinor *vp0,
SiteHalfSpinor *vp1,
Integer type,Integer o) const {
SiteHalfSpinor vt0,vt1;
SiteHalfCommSpinor *vpp0 = (SiteHalfCommSpinor *)vp0;
SiteHalfCommSpinor *vpp1 = (SiteHalfCommSpinor *)vp1;
precisionChange((vComplexHigh *)&vt0,(vComplexLow *)&vpp0[o],Nw);
precisionChange((vComplexHigh *)&vt1,(vComplexLow *)&vpp1[o],Nw);
exchange(mp[2*o],mp[2*o+1],vt0,vt1,type);
}
/*****************************************************/
/* Have a decompression step if mpi data is not same */
/*****************************************************/
accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o) const {
SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in;
precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw);
}
/*****************************************************/
/* Compress Exchange */
/*****************************************************/
accelerator_inline void CompressExchange(SiteHalfSpinor *out0,
SiteHalfSpinor *out1,
const SiteSpinor *in,
Integer j,Integer k, Integer m,Integer type) const {
SiteHalfSpinor temp1, temp2,temp3,temp4;
SiteHalfCommSpinor *hout0 = (SiteHalfCommSpinor *)out0;
SiteHalfCommSpinor *hout1 = (SiteHalfCommSpinor *)out1;
projector::Proj(temp1,in[k],mu,dag);
projector::Proj(temp2,in[m],mu,dag);
exchange(temp3,temp4,temp1,temp2,type);
precisionChange((vComplexLow *)&hout0[j],(vComplexHigh *)&temp3,Nw);
precisionChange((vComplexLow *)&hout1[j],(vComplexHigh *)&temp4,Nw);
}
/*****************************************************/
/* Pass the info to the stencil */
/*****************************************************/
accelerator_inline bool DecompressionStep(void) const { return true; }
};
#endif
#define DECLARE_PROJ(Projector,Compressor,spProj) \ #define DECLARE_PROJ(Projector,Compressor,spProj) \
class Projector { \ class Projector { \
public: \ public: \
@ -404,7 +294,11 @@ public:
typedef typename Base::View_type View_type; typedef typename Base::View_type View_type;
typedef typename Base::StencilVector StencilVector; typedef typename Base::StencilVector StencilVector;
// Vector<int> surface_list; void ZeroCountersi(void) { }
void Reporti(int calls) { }
std::vector<int> surface_list;
WilsonStencil(GridBase *grid, WilsonStencil(GridBase *grid,
int npoints, int npoints,
int checkerboard, int checkerboard,
@ -412,11 +306,11 @@ public:
const std::vector<int> &distances,Parameters p) const std::vector<int> &distances,Parameters p)
: CartesianStencil<vobj,cobj,Parameters> (grid,npoints,checkerboard,directions,distances,p) : CartesianStencil<vobj,cobj,Parameters> (grid,npoints,checkerboard,directions,distances,p)
{ {
// surface_list.resize(0); ZeroCountersi();
surface_list.resize(0);
this->same_node.resize(npoints); this->same_node.resize(npoints);
}; };
/*
void BuildSurfaceList(int Ls,int vol4){ void BuildSurfaceList(int Ls,int vol4){
// find same node for SHM // find same node for SHM
@ -437,8 +331,7 @@ public:
} }
} }
} }
*/
template < class compressor> template < class compressor>
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
{ {
@ -484,26 +377,24 @@ public:
int dag = compress.dag; int dag = compress.dag;
int face_idx=0; int face_idx=0;
#define vet_same_node(a,b) \
{ auto tmp = b; }
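      // The gather still has to be issued for its side effects, so the macro
      // above evaluates and discards its argument instead of asserting, as the
      // old code did, that the same-node classification is unchanged.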
if ( dag ) { if ( dag ) {
vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XpCompress,Xp,face_idx)); assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YpCompress,Yp,face_idx)); assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TpCompress,Tp,face_idx)); assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx));
vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XmCompress,Xm,face_idx)); assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx));
vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YmCompress,Ym,face_idx)); assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx));
vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZmCompress,Zm,face_idx)); assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TmCompress,Tm,face_idx)); assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx));
} else { } else {
vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XmCompress,Xp,face_idx)); assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx));
vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YmCompress,Yp,face_idx)); assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx));
vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZmCompress,Zp,face_idx)); assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TmCompress,Tp,face_idx)); assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx));
vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XpCompress,Xm,face_idx)); assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx));
vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YpCompress,Ym,face_idx)); assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx));
vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZpCompress,Zm,face_idx)); assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TpCompress,Tm,face_idx)); assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx));
} }
this->face_table_computed=1; this->face_table_computed=1;
assert(this->u_comm_offset==this->_unified_buffer_size); assert(this->u_comm_offset==this->_unified_buffer_size);

View File

@ -74,6 +74,20 @@ public:
FermionField _tmp; FermionField _tmp;
FermionField &tmp(void) { return _tmp; } FermionField &tmp(void) { return _tmp; }
void Report(void);
void ZeroCounters(void);
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
double DhopTotalTime;
double DerivCalls;
double DerivCommTime;
double DerivComputeTime;
double DerivDhopComputeTime;
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
// override multiply; cut number routines if pass dagger argument // override multiply; cut number routines if pass dagger argument
// and also make interface more uniformly consistent // and also make interface more uniformly consistent

View File

@ -75,8 +75,19 @@ public:
FermionField _tmp; FermionField _tmp;
FermionField &tmp(void) { return _tmp; } FermionField &tmp(void) { return _tmp; }
int Dirichlet; void Report(void);
Coordinate Block; void ZeroCounters(void);
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
double DhopTotalTime;
double DerivCalls;
double DerivCommTime;
double DerivComputeTime;
double DerivDhopComputeTime;
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Implement the abstract base // Implement the abstract base
@ -162,10 +173,7 @@ public:
GridCartesian &FourDimGrid, GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid, GridRedBlackCartesian &FourDimRedBlackGrid,
double _M5,const ImplParams &p= ImplParams()); double _M5,const ImplParams &p= ImplParams());
virtual void DirichletBlock(const Coordinate & block)
{
}
// Constructors // Constructors
/* /*
WilsonFermion5D(int simd, WilsonFermion5D(int simd,

View File

@ -37,7 +37,7 @@ NAMESPACE_BEGIN(Grid);
template <class S, class Representation = FundamentalRepresentation,class Options = CoeffReal > template <class S, class Representation = FundamentalRepresentation,class Options = CoeffReal >
class WilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > { class WilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
public: public:
static const int Dimension = Representation::Dimension; static const int Dimension = Representation::Dimension;
static const bool isFundamental = Representation::isFundamental; static const bool isFundamental = Representation::isFundamental;
static const bool LsVectorised=false; static const bool LsVectorised=false;
@ -242,13 +242,19 @@ public:
typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffReal > WilsonImplR; // Real.. whichever prec typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffReal > WilsonImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffReal > WilsonImplF; // Float typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffReal > WilsonImplF; // Float
typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffReal > WilsonImplD; // Double typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffReal > WilsonImplD; // Double
typedef WilsonImpl<vComplexD2, FundamentalRepresentation, CoeffReal > WilsonImplD2; // Double
//typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplRL; // Real.. whichever prec
//typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplFH; // Float
//typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplDF; // Double
typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffComplex > ZWilsonImplR; // Real.. whichever prec typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffComplex > ZWilsonImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplex > ZWilsonImplF; // Float typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplex > ZWilsonImplF; // Float
typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplex > ZWilsonImplD; // Double typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplex > ZWilsonImplD; // Double
typedef WilsonImpl<vComplexD2, FundamentalRepresentation, CoeffComplex > ZWilsonImplD2; // Double
//typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplRL; // Real.. whichever prec
//typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplFH; // Float
//typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplDF; // Double
typedef WilsonImpl<vComplex, AdjointRepresentation, CoeffReal > WilsonAdjImplR; // Real.. whichever prec typedef WilsonImpl<vComplex, AdjointRepresentation, CoeffReal > WilsonAdjImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, AdjointRepresentation, CoeffReal > WilsonAdjImplF; // Float typedef WilsonImpl<vComplexF, AdjointRepresentation, CoeffReal > WilsonAdjImplF; // Float
typedef WilsonImpl<vComplexD, AdjointRepresentation, CoeffReal > WilsonAdjImplD; // Double typedef WilsonImpl<vComplexD, AdjointRepresentation, CoeffReal > WilsonAdjImplD; // Double
@ -261,22 +267,6 @@ typedef WilsonImpl<vComplex, TwoIndexAntiSymmetricRepresentation, CoeffReal > W
typedef WilsonImpl<vComplexF, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplF; // Float typedef WilsonImpl<vComplexF, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplF; // Float
typedef WilsonImpl<vComplexD, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplD; // Double typedef WilsonImpl<vComplexD, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplD; // Double
//sp 2n
typedef WilsonImpl<vComplex, SpFundamentalRepresentation, CoeffReal > SpWilsonImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, SpFundamentalRepresentation, CoeffReal > SpWilsonImplF; // Float
typedef WilsonImpl<vComplexD, SpFundamentalRepresentation, CoeffReal > SpWilsonImplD; // Double
typedef WilsonImpl<vComplex, SpTwoIndexAntiSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexAntiSymmetricImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, SpTwoIndexAntiSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexAntiSymmetricImplF; // Float
typedef WilsonImpl<vComplexD, SpTwoIndexAntiSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexAntiSymmetricImplD; // Double
typedef WilsonImpl<vComplex, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexSymmetricImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexSymmetricImplF; // Float
typedef WilsonImpl<vComplexD, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexSymmetricImplD; // Double
typedef WilsonImpl<vComplex, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonAdjImplR; // Real.. whichever prec // adj = 2indx symmetric for Sp(2N)
typedef WilsonImpl<vComplexF, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonAdjImplF; // Float // adj = 2indx symmetric for Sp(2N)
typedef WilsonImpl<vComplexD, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonAdjImplD; // Double // adj = 2indx symmetric for Sp(2N)
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -52,6 +52,13 @@ public:
typedef AcceleratorVector<int,STENCIL_MAX> StencilVector; typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;
public: public:
#ifdef GRID_SYCL
#define SYCL_HACK
#endif
#ifdef SYCL_HACK
static void HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor *buf,
int ss,int sU,const SiteSpinor *in, SiteSpinor *out);
#endif
static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out, int Ls, int Nsite, const FermionField &in, FermionField &out,

View File

@ -152,6 +152,58 @@ void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi
} }
} }
template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
{
this->Report();
Coordinate latt = GridDefaultLatt();
RealD volume = this->Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
RealD NP = this->_FourDimGrid->_Nprocessors;
if ( M5Dcalls > 0 ) {
std::cout << GridLogMessage << "#### M5D calls report " << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls : " << M5Dcalls << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << M5Dtime / M5Dcalls << " us" << std::endl;
// Flops = 10.0*(Nc*Ns) *Ls*vol
RealD mflops = 10.0*(Nc*Ns)*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
// Bytes = sizeof(Real) * (Nc*Ns*Nreim) * Ls * vol * (read+write) (/2 for red black counting)
// read = 2 ( psi[ss+s+1] and psi[ss+s-1] count as 1 )
// write = 1
RealD Gbytes = sizeof(Real) * (Nc*Ns*2) * volume * 3 /2. * 1.e-9;
std::cout << GridLogMessage << "Average bandwidth (GB/s) : " << Gbytes/M5Dtime*M5Dcalls*1.e6 << std::endl;
}
if ( MooeeInvCalls > 0 ) {
std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
#ifdef GRID_CUDA
RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
#else
// Flops = MADD * Ls *Ls *4dvol * spin/colour/complex
RealD mflops = 2.0*24*this->Ls*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
#endif
}
}
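// Worked example for the M5D report above, assuming Nc=3, Ns=4, Ls=12 and a
// 16^4 lattice: volume = Ls*16^4 = 786432, so flops/call =
// 10*(Nc*Ns)*volume/2 = 120*786432/2 ~ 47.2 Mflop, while the bandwidth
// figure counts 2*Nc*Ns = 24 reals per site moved 3 times (2 reads + 1
// write), again halved for red-black.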
template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void)
{
this->ZeroCounters();
M5Dflops=0;
M5Dcalls=0;
M5Dtime=0;
MooeeInvFlops=0;
MooeeInvCalls=0;
MooeeInvTime=0;
}
template<class Impl> template<class Impl>
void CayleyFermion5D<Impl>::M5D (const FermionField &psi, FermionField &chi) void CayleyFermion5D<Impl>::M5D (const FermionField &psi, FermionField &chi)
{ {
@ -594,6 +646,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
assert(mass_plus == mass_minus); assert(mass_plus == mass_minus);
RealD mass = mass_plus; RealD mass = mass_plus;
#if (!defined(GRID_HIP))
Gamma::Algebra Gmu [] = { Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX, Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY, Gamma::Algebra::GammaY,
@ -712,7 +765,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
else q_out += C; else q_out += C;
} }
#endif
} }
template <class Impl> template <class Impl>
@ -779,6 +832,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
} }
#endif #endif
#if (!defined(GRID_HIP))
int tshift = (mu == Nd-1) ? 1 : 0; int tshift = (mu == Nd-1) ? 1 : 0;
unsigned int LLt = GridDefaultLatt()[Tp]; unsigned int LLt = GridDefaultLatt()[Tp];
//////////////////////////////////////////////// ////////////////////////////////////////////////
@ -898,6 +952,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
InsertSlice(L_Q, q_out, s , 0); InsertSlice(L_Q, q_out, s , 0);
} }
#endif
} }
#undef Pp #undef Pp
#undef Pm #undef Pm
@ -905,6 +960,88 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
#undef TopRowWithSource #undef TopRowWithSource
#if 0
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv,
Vector<iSinglet<Simd> > & Matp,
Vector<iSinglet<Simd> > & Matm)
{
int Ls=this->Ls;
GridBase *grid = this->FermionRedBlackGrid();
int LLs = grid->_rdimensions[0];
if ( LLs == Ls ) {
return; // Not vectorised in 5th direction
}
Eigen::MatrixXcd Pplus = Eigen::MatrixXcd::Zero(Ls,Ls);
Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
for(int s=0;s<Ls;s++){
Pplus(s,s) = bee[s];
Pminus(s,s)= bee[s];
}
for(int s=0;s<Ls-1;s++){
Pminus(s,s+1) = -cee[s];
}
for(int s=0;s<Ls-1;s++){
Pplus(s+1,s) = -cee[s+1];
}
Pplus (0,Ls-1) = mass*cee[0];
Pminus(Ls-1,0) = mass*cee[Ls-1];
Eigen::MatrixXcd PplusMat ;
Eigen::MatrixXcd PminusMat;
if ( inv ) {
PplusMat =Pplus.inverse();
PminusMat=Pminus.inverse();
} else {
PplusMat =Pplus;
PminusMat=Pminus;
}
if(dag){
PplusMat.adjointInPlace();
PminusMat.adjointInPlace();
}
typedef typename SiteHalfSpinor::scalar_type scalar_type;
const int Nsimd=Simd::Nsimd();
Matp.resize(Ls*LLs);
Matm.resize(Ls*LLs);
for(int s2=0;s2<Ls;s2++){
for(int s1=0;s1<LLs;s1++){
int istride = LLs;
int ostride = 1;
Simd Vp;
Simd Vm;
scalar_type *sp = (scalar_type *)&Vp;
scalar_type *sm = (scalar_type *)&Vm;
for(int l=0;l<Nsimd;l++){
if ( switcheroo<Coeff_t>::iscomplex() ) {
sp[l] = PplusMat (l*istride+s1*ostride,s2);
sm[l] = PminusMat(l*istride+s1*ostride,s2);
} else {
// if real
scalar_type tmp;
tmp = PplusMat (l*istride+s1*ostride,s2);
sp[l] = scalar_type(tmp.real(),tmp.real());
tmp = PminusMat(l*istride+s1*ostride,s2);
sm[l] = scalar_type(tmp.real(),tmp.real());
}
}
Matp[LLs*s2+s1] = Vp;
Matm[LLs*s2+s1] = Vm;
}}
}
#endif
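/* Structure of the matrices built in the disabled routine above, written out
   for Ls=4 (b=bee, c=cee, m=mass):

     P+ = [  b0    0    0   m*c0 ]      P- = [  b0  -c0    0    0  ]
          [ -c1   b1    0    0   ]           [   0   b1  -c1    0  ]
          [   0  -c2   b2    0   ]           [   0    0   b2  -c2  ]
          [   0    0  -c3   b3   ]           [ m*c3   0    0   b3  ]

   i.e. bidiagonal in the fifth dimension with a mass term closing the
   boundary; the inv/dag flags select the inverse and/or adjoint. */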
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -63,18 +63,23 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
// 10 = 3 complex mult + 2 complex add // 10 = 3 complex mult + 2 complex add
// Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting) // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
uint64_t nloop = grid->oSites(); M5Dcalls++;
M5Dtime-=usecond();
uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t s = sss%Ls; uint64_t ss= sss*Ls;
uint64_t ss= sss-s;
typedef decltype(coalescedRead(psi[0])) spinor; typedef decltype(coalescedRead(psi[0])) spinor;
spinor tmp1, tmp2; spinor tmp1, tmp2;
uint64_t idx_u = ss+((s+1)%Ls); for(int s=0;s<Ls;s++){
uint64_t idx_l = ss+((s+Ls-1)%Ls); uint64_t idx_u = ss+((s+1)%Ls);
spProj5m(tmp1,psi(idx_u)); uint64_t idx_l = ss+((s+Ls-1)%Ls);
spProj5p(tmp2,psi(idx_l)); spProj5m(tmp1,psi(idx_u));
coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2); spProj5p(tmp2,psi(idx_l));
coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
}
}); });
M5Dtime+=usecond();
} }
template<class Impl> template<class Impl>
@ -100,18 +105,23 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
int Ls=this->Ls; int Ls=this->Ls;
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
uint64_t nloop = grid->oSites(); M5Dcalls++;
M5Dtime-=usecond();
uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t s = sss%Ls; uint64_t ss=sss*Ls;
uint64_t ss= sss-s;
typedef decltype(coalescedRead(psi[0])) spinor; typedef decltype(coalescedRead(psi[0])) spinor;
spinor tmp1,tmp2; spinor tmp1,tmp2;
uint64_t idx_u = ss+((s+1)%Ls); for(int s=0;s<Ls;s++){
uint64_t idx_l = ss+((s+Ls-1)%Ls); uint64_t idx_u = ss+((s+1)%Ls);
spProj5p(tmp1,psi(idx_u)); uint64_t idx_l = ss+((s+Ls-1)%Ls);
spProj5m(tmp2,psi(idx_l)); spProj5p(tmp1,psi(idx_u));
coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2); spProj5m(tmp2,psi(idx_l));
coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
}
}); });
M5Dtime+=usecond();
} }
template<class Impl> template<class Impl>
@ -132,6 +142,8 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
auto pleem = & leem[0]; auto pleem = & leem[0];
auto pueem = & ueem[0]; auto pueem = & ueem[0];
MooeeInvCalls++;
MooeeInvTime-=usecond();
uint64_t nloop = grid->oSites()/Ls; uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls; uint64_t ss=sss*Ls;
@ -168,6 +180,8 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
coalescedWrite(chi[ss+s],res); coalescedWrite(chi[ss+s],res);
} }
}); });
MooeeInvTime+=usecond();
} }
@ -190,6 +204,10 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
assert(psi.Checkerboard() == psi.Checkerboard()); assert(psi.Checkerboard() == psi.Checkerboard());
MooeeInvCalls++;
MooeeInvTime-=usecond();
uint64_t nloop = grid->oSites()/Ls; uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls; uint64_t ss=sss*Ls;
@ -226,6 +244,7 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
coalescedWrite(chi[ss+s],res); coalescedWrite(chi[ss+s],res);
} }
}); });
MooeeInvTime+=usecond();
} }

View File

@ -94,6 +94,10 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
d_p[ss] = diag[s]; d_p[ss] = diag[s];
}} }}
M5Dcalls++;
M5Dtime-=usecond();
assert(Nc==3); assert(Nc==3);
thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
@ -194,6 +198,7 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
} }
#endif #endif
}); });
M5Dtime+=usecond();
} }
template<class Impl> template<class Impl>
@ -237,6 +242,8 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
d_p[ss] = diag[s]; d_p[ss] = diag[s];
}} }}
M5Dcalls++;
M5Dtime-=usecond();
thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
#if 0 #if 0
alignas(64) SiteHalfSpinor hp; alignas(64) SiteHalfSpinor hp;
@ -332,6 +339,7 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
} }
#endif #endif
}); });
M5Dtime+=usecond();
} }
@ -805,6 +813,9 @@ CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,
} }
assert(_Matp->size()==Ls*LLs); assert(_Matp->size()==Ls*LLs);
MooeeInvCalls++;
MooeeInvTime-=usecond();
if ( switcheroo<Coeff_t>::iscomplex() ) { if ( switcheroo<Coeff_t>::iscomplex() ) {
thread_loop( (auto site=0;site<vol;site++),{ thread_loop( (auto site=0;site<vol;site++),{
MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm); MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm);
@ -814,7 +825,7 @@ CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,
MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm); MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm);
}); });
} }
MooeeInvTime+=usecond();
} }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -54,6 +54,8 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
auto pupper = &upper[0]; auto pupper = &upper[0];
auto plower = &lower[0]; auto plower = &lower[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++;
this->M5Dtime -= usecond();
auto nloop=grid->oSites()/Ls; auto nloop=grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -69,6 +71,7 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
} }
}); });
this->M5Dtime += usecond();
} }
template<class Impl> template<class Impl>
@ -88,6 +91,8 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
auto plower = &lower[0]; auto plower = &lower[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++;
this->M5Dtime -= usecond();
auto nloop=grid->oSites()/Ls; auto nloop=grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -103,6 +108,7 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
} }
}); });
this->M5Dtime += usecond();
} }
template<class Impl> template<class Impl>
@ -121,6 +127,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
auto pleem = & this->leem[0]; auto pleem = & this->leem[0];
auto pueem = & this->ueem[0]; auto pueem = & this->ueem[0];
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
uint64_t nloop=grid->oSites()/Ls; uint64_t nloop=grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls; uint64_t ss=sss*Ls;
@ -156,6 +164,7 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
coalescedWrite(chi[ss+s],res); coalescedWrite(chi[ss+s],res);
} }
}); });
this->MooeeInvTime += usecond();
} }
template<class Impl> template<class Impl>
@ -176,6 +185,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
assert(psi.Checkerboard() == psi.Checkerboard()); assert(psi.Checkerboard() == psi.Checkerboard());
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
auto nloop = grid->oSites()/Ls; auto nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls; uint64_t ss=sss*Ls;
@ -212,6 +223,7 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
} }
}); });
this->MooeeInvTime += usecond();
} }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -298,33 +298,45 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
int LLs = in.Grid()->_rdimensions[0]; int LLs = in.Grid()->_rdimensions[0];
int len = U.Grid()->oSites(); int len = U.Grid()->oSites();
DhopFaceTime-=usecond();
st.Prepare(); st.Prepare();
st.HaloGather(in,compressor); st.HaloGather(in,compressor);
DhopFaceTime+=usecond();
DhopCommTime -=usecond();
std::vector<std::vector<CommsRequest_t> > requests; std::vector<std::vector<CommsRequest_t> > requests;
st.CommunicateBegin(requests); st.CommunicateBegin(requests);
// st.HaloExchangeOptGather(in,compressor); // Wilson compressor // st.HaloExchangeOptGather(in,compressor); // Wilson compressor
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
DhopFaceTime+=usecond();
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
// Remove explicit thread mapping introduced for OPA reasons. // Remove explicit thread mapping introduced for OPA reasons.
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
DhopComputeTime-=usecond();
{ {
int interior=1; int interior=1;
int exterior=0; int exterior=0;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
} }
DhopComputeTime+=usecond();
DhopFaceTime-=usecond();
st.CommsMerge(compressor); st.CommsMerge(compressor);
DhopFaceTime+=usecond();
st.CommunicateComplete(requests); st.CommunicateComplete(requests);
DhopCommTime +=usecond();
DhopComputeTime2-=usecond();
{ {
int interior=0; int interior=0;
int exterior=1; int exterior=1;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
} }
DhopComputeTime2+=usecond();
} }
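// Comms/compute overlap pattern realised above (sketch):
//   gather faces -> start MPI -> merge shared-memory faces
//   -> compute interior -> merge received faces -> complete MPI
//   -> compute exterior
// so the bulk of the stencil work hides the communication latency.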
template<class Impl> template<class Impl>
@ -335,14 +347,22 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
Compressor compressor; Compressor compressor;
int LLs = in.Grid()->_rdimensions[0]; int LLs = in.Grid()->_rdimensions[0];
//double t1=usecond();
DhopTotalTime -= usecond();
DhopCommTime -= usecond();
st.HaloExchange(in,compressor); st.HaloExchange(in,compressor);
DhopCommTime += usecond();
DhopComputeTime -= usecond();
   // Dhop takes the 4d grid from U, and makes a 5d index for fermion
   {
     int interior=1;
     int exterior=1;
     Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
   }
+  DhopComputeTime += usecond();
+  DhopTotalTime   += usecond();
 }
 /*CHANGE END*/

@@ -351,6 +371,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls+=1;
   conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
   conformable(in.Grid(),out.Grid()); // drops the cb check

@@ -362,6 +383,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls+=1;
   conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
   conformable(in.Grid(),out.Grid()); // drops the cb check

@@ -373,6 +395,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls+=2;
   conformable(in.Grid(),FermionGrid()); // verifies full grid
   conformable(in.Grid(),out.Grid());

@@ -381,6 +404,58 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
   DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
 }

+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::Report(void)
+{
+  Coordinate latt = GridDefaultLatt();
+  RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
+  RealD NP = _FourDimGrid->_Nprocessors;
+  RealD NN = _FourDimGrid->NodeCount();
+
+  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls   : "
+            << DhopCalls << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime   /Calls       : "
+            << DhopTotalTime   / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime    /Calls       : "
+            << DhopCommTime    / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls        : "
+            << DhopComputeTime / DhopCalls << " us" << std::endl;
+
+  // Average the compute time
+  _FourDimGrid->GlobalSum(DhopComputeTime);
+  DhopComputeTime/=NP;
+
+  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
+  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
+
+  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
+  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil"     << std::endl; Stencil.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven" << std::endl; StencilEven.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd"  << std::endl; StencilOdd.Report();
+}
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
+{
+  DhopCalls       = 0;
+  DhopTotalTime   = 0;
+  DhopCommTime    = 0;
+  DhopComputeTime = 0;
+  DhopFaceTime    = 0;
+
+  Stencil.ZeroCounters();
+  StencilEven.ZeroCounters();
+  StencilOdd.ZeroCounters();
+}
+
 /////////////////////////////////////////////////////////////////////////
 // Implement the general interface. Here we use SAME mass on all slices
 /////////////////////////////////////////////////////////////////////////
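A gloss on the counter conventions just added (an editorial note, not part of the commit): the full-grid `Dhop` bumps `DhopCalls` by 2 while the checkerboarded `DhopOE`/`DhopEO` bump it by 1, so one unit of `DhopCalls` always means one half-grid application; the trailing `/2` in the rate formulas (`// 2 for red black counting`) compensates for exactly this. Because the timers accumulate microseconds from `usecond()`, the ratio lands directly in Mflop/s with no extra unit factor:

$$
\text{mflops} \;=\; \frac{N_{\text{flops/site}} \times V \times \text{DhopCalls}}{2 \times \text{DhopComputeTime}\,[\mu\mathrm{s}]},
$$

where $V$ is the global five-dimensional volume and $N_{\text{flops/site}}$ is the per-site flop count the code asserts for each action (1154 above for improved staggered); flops per microsecond are $10^6$ flops per second, i.e. Mflop/s.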


@@ -334,6 +334,7 @@ void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionF
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
 {
+  DhopCalls+=2;
   conformable(in.Grid(), _grid);  // verifies full grid
   conformable(in.Grid(), out.Grid());

@@ -345,6 +346,7 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
 {
+  DhopCalls+=1;
   conformable(in.Grid(), _cbgrid);     // verifies half grid
   conformable(in.Grid(), out.Grid());  // drops the cb check

@@ -357,6 +359,7 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag)
 {
+  DhopCalls+=1;
   conformable(in.Grid(), _cbgrid);     // verifies half grid
   conformable(in.Grid(), out.Grid());  // drops the cb check

@@ -415,33 +418,47 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
   Compressor compressor;
   int len =  U.Grid()->oSites();

+  DhopTotalTime -= usecond();
+
+  DhopFaceTime -= usecond();
   st.Prepare();
   st.HaloGather(in,compressor);
+  DhopFaceTime += usecond();

+  DhopCommTime -=usecond();
   std::vector<std::vector<CommsRequest_t> > requests;
   st.CommunicateBegin(requests);

+  DhopFaceTime-=usecond();
   st.CommsMergeSHM(compressor);
+  DhopFaceTime+= usecond();

   //////////////////////////////////////////////////////////////////////////////////////////////////////
   // Removed explicit thread comms
   //////////////////////////////////////////////////////////////////////////////////////////////////////
+  DhopComputeTime -= usecond();
   {
     int interior=1;
     int exterior=0;
     Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
   }
+  DhopComputeTime += usecond();

   st.CommunicateComplete(requests);
+  DhopCommTime +=usecond();

   // First to enter, last to leave timing
+  DhopFaceTime -= usecond();
   st.CommsMerge(compressor);
+  DhopFaceTime -= usecond();

+  DhopComputeTime2 -= usecond();
   {
     int interior=0;
     int exterior=1;
     Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
   }
+  DhopComputeTime2 += usecond();
 }
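The overlapped-comms path above follows a fixed choreography: gather the faces, start non-blocking comms, compute the interior while messages are in flight, complete the comms, merge the received halos, then compute the exterior. A minimal stand-alone sketch of that schedule (the `Stencil` methods and the two `compute*` functions below are hypothetical stand-ins, not the Grid API):

```cpp
#include <vector>

struct Request {};

// Hypothetical stand-ins for the stencil/halo machinery used above.
struct Stencil {
  void Prepare() {}
  void HaloGather() {}                                // pack face buffers
  void CommunicateBegin(std::vector<Request> &) {}    // post non-blocking sends/recvs
  void CommsMergeSHM() {}                             // intra-node (shared-memory) merge
  void CommunicateComplete(std::vector<Request> &) {} // wait on inter-node messages
  void CommsMerge() {}                                // unpack received halos
};

void computeInterior() {} // sites with no off-node neighbours
void computeExterior() {} // boundary sites, need the merged halos

void dhopOverlapped(Stencil &st) {
  std::vector<Request> requests;
  st.Prepare();
  st.HaloGather();
  st.CommunicateBegin(requests); // comms now in flight
  st.CommsMergeSHM();            // same-node neighbours need no network
  computeInterior();             // overlapped with the network transfers
  st.CommunicateComplete(requests);
  st.CommsMerge();
  computeExterior();             // finally touch the halo-dependent sites
}
```

The point of the instrumentation added in this commit is to time each of these stages separately (`DhopFaceTime`, `DhopCommTime`, `DhopComputeTime`, `DhopComputeTime2`), so the overlap's effectiveness can be read off per call.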
@@ -454,16 +471,78 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
 {
   assert((dag == DaggerNo) || (dag == DaggerYes));

+  DhopTotalTime -= usecond();
+
+  DhopCommTime -= usecond();
   Compressor compressor;
   st.HaloExchange(in, compressor);
+  DhopCommTime += usecond();

+  DhopComputeTime -= usecond();
   {
     int interior=1;
     int exterior=1;
     Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
   }
+  DhopComputeTime += usecond();
+  DhopTotalTime   += usecond();
 };

+////////////////////////////////////////////////////////////////
+// Reporting
+////////////////////////////////////////////////////////////////
+template<class Impl>
+void ImprovedStaggeredFermion<Impl>::Report(void)
+{
+  Coordinate latt = _grid->GlobalDimensions();
+  RealD volume = 1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
+  RealD NP = _grid->_Nprocessors;
+  RealD NN = _grid->NodeCount();
+
+  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion Number of DhopEO Calls   : "
+            << DhopCalls << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion TotalTime   /Calls       : "
+            << DhopTotalTime   / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion CommTime    /Calls       : "
+            << DhopCommTime    / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion ComputeTime/Calls        : "
+            << DhopComputeTime / DhopCalls << " us" << std::endl;
+
+  // Average the compute time
+  _grid->GlobalSum(DhopComputeTime);
+  DhopComputeTime/=NP;
+
+  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
+  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
+
+  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
+  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil"     << std::endl; Stencil.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilEven" << std::endl; StencilEven.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilOdd"  << std::endl; StencilOdd.Report();
+}
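The counters printed here are filled with the subtract-before/add-after idiom used throughout this comparison (`T -= usecond();` ... `T += usecond();`), which sums elapsed intervals into a running total without a separate start variable. A small self-contained illustration of the idiom (the `usecond` stand-in below is an assumption for this sketch, not Grid's own header):

```cpp
#include <chrono>
#include <cstdio>

// Stand-in for Grid's usecond(): wall clock in microseconds.
static double usecond() {
  using namespace std::chrono;
  return duration_cast<microseconds>(steady_clock::now().time_since_epoch()).count();
}

int main() {
  double computeTime = 0; // running total, like DhopComputeTime
  int calls = 0;
  for (int i = 0; i < 3; i++) {
    calls++;
    computeTime -= usecond(); // entry: total -= t_in
    // ... kernel would run here ...
    computeTime += usecond(); // exit:  total += t_out, net gain is (t_out - t_in)
  }
  std::printf("time/call = %f us\n", computeTime / calls);
  return 0;
}
```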
+template<class Impl>
+void ImprovedStaggeredFermion<Impl>::ZeroCounters(void)
+{
+  DhopCalls       = 0;
+  DhopTotalTime   = 0;
+  DhopCommTime    = 0;
+  DhopComputeTime = 0;
+  DhopFaceTime    = 0;
+
+  Stencil.ZeroCounters();
+  StencilEven.ZeroCounters();
+  StencilOdd.ZeroCounters();
+}
+
 ////////////////////////////////////////////////////////
 // Conserved current - not yet implemented.
 ////////////////////////////////////////////////////////


@@ -55,6 +55,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
   auto plower = &lower[0];

   // Flops = 6.0*(Nc*Ns) *Ls*vol
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
   int nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{
     uint64_t ss = sss*Ls;

@@ -70,6 +73,7 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
     }
   });
+  this->M5Dtime += usecond();
 }

 template<class Impl>
@@ -95,6 +99,9 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
   auto pshift_coeffs = &shift_coeffs[0];

   // Flops = 6.0*(Nc*Ns) *Ls*vol
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
   int nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{
     uint64_t ss = sss*Ls;

@@ -115,6 +122,7 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
     }
   });
+  this->M5Dtime += usecond();
 }

 template<class Impl>
@@ -135,6 +143,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
   auto plower = &lower[0];

   // Flops = 6.0*(Nc*Ns) *Ls*vol
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
   int nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(), {
     uint64_t ss = sss*Ls;

@@ -150,6 +161,8 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
       coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
     }
   });
+
+  this->M5Dtime += usecond();
 }

 template<class Impl>
@@ -173,6 +186,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
   auto pshift_coeffs = &shift_coeffs[0];

   // Flops = 6.0*(Nc*Ns) *Ls*vol
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
   auto pm = this->pm;

   int nloop = grid->oSites()/Ls;
@@ -201,6 +217,7 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
     }
   });
+  this->M5Dtime += usecond();
 }

 template<class Impl>
@@ -220,6 +237,9 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &

   if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }

+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();
+
   int nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{
     uint64_t ss=sss*Ls;

@@ -257,6 +277,7 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
     }
   });
+  this->MooeeInvTime += usecond();
 }

 template<class Impl>
@@ -276,6 +297,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
   auto pueem= & this->ueem[0];
   auto pMooeeInv_shift_lc   = &MooeeInv_shift_lc[0];
   auto pMooeeInv_shift_norm = &MooeeInv_shift_norm[0];
+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();

   int nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{

@@ -320,6 +343,7 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
     }
   });
+  this->MooeeInvTime += usecond();
 }

 template<class Impl>
@@ -339,6 +363,9 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
   auto pleem= & this->leem[0];
   auto pueem= & this->ueem[0];

+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();
+
   int nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{
     uint64_t ss=sss*Ls;

@@ -375,6 +402,7 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
       coalescedWrite(chi[ss+s],res);
     }
   });
+  this->MooeeInvTime += usecond();
 }

 template<class Impl>
@@ -395,6 +423,9 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
   auto pMooeeInvDag_shift_lc   = &MooeeInvDag_shift_lc[0];
   auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];

+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();
+
   int nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{
     uint64_t ss=sss*Ls;

@@ -438,6 +469,7 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
     }
   });
+  this->MooeeInvTime += usecond();
 }

 NAMESPACE_END(Grid);


@@ -263,6 +263,7 @@ void NaiveStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionFiel
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
 {
+  DhopCalls+=2;
   conformable(in.Grid(), _grid);  // verifies full grid
   conformable(in.Grid(), out.Grid());

@@ -274,6 +275,7 @@ void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
 {
+  DhopCalls+=1;
   conformable(in.Grid(), _cbgrid);     // verifies half grid
   conformable(in.Grid(), out.Grid());  // drops the cb check

@@ -286,6 +288,7 @@ void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &o
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag)
 {
+  DhopCalls+=1;
   conformable(in.Grid(), _cbgrid);     // verifies half grid
   conformable(in.Grid(), out.Grid());  // drops the cb check

@@ -342,33 +345,47 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, L
   Compressor compressor;
   int len =  U.Grid()->oSites();

+  DhopTotalTime -= usecond();
+
+  DhopFaceTime -= usecond();
   st.Prepare();
   st.HaloGather(in,compressor);
+  DhopFaceTime += usecond();

+  DhopCommTime -=usecond();
   std::vector<std::vector<CommsRequest_t> > requests;
   st.CommunicateBegin(requests);

+  DhopFaceTime-=usecond();
   st.CommsMergeSHM(compressor);
+  DhopFaceTime+= usecond();

   //////////////////////////////////////////////////////////////////////////////////////////////////////
   // Removed explicit thread comms
   //////////////////////////////////////////////////////////////////////////////////////////////////////
+  DhopComputeTime -= usecond();
   {
     int interior=1;
     int exterior=0;
     Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
   }
+  DhopComputeTime += usecond();

   st.CommunicateComplete(requests);
+  DhopCommTime +=usecond();

   // First to enter, last to leave timing
+  DhopFaceTime -= usecond();
   st.CommsMerge(compressor);
+  DhopFaceTime -= usecond();

+  DhopComputeTime2 -= usecond();
   {
     int interior=0;
     int exterior=1;
     Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
   }
+  DhopComputeTime2 += usecond();
 }

 template <class Impl>
@@ -379,16 +396,78 @@ void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Lebes
 {
   assert((dag == DaggerNo) || (dag == DaggerYes));

+  DhopTotalTime -= usecond();
+
+  DhopCommTime -= usecond();
   Compressor compressor;
   st.HaloExchange(in, compressor);
+  DhopCommTime += usecond();

+  DhopComputeTime -= usecond();
   {
     int interior=1;
     int exterior=1;
     Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
   }
+  DhopComputeTime += usecond();
+  DhopTotalTime   += usecond();
 };

+////////////////////////////////////////////////////////////////
+// Reporting
+////////////////////////////////////////////////////////////////
+template<class Impl>
+void NaiveStaggeredFermion<Impl>::Report(void)
+{
+  Coordinate latt = _grid->GlobalDimensions();
+  RealD volume = 1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
+  RealD NP = _grid->_Nprocessors;
+  RealD NN = _grid->NodeCount();
+
+  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
+
+  std::cout << GridLogMessage << "NaiveStaggeredFermion Number of DhopEO Calls   : "
+            << DhopCalls << std::endl;
+  std::cout << GridLogMessage << "NaiveStaggeredFermion TotalTime   /Calls       : "
+            << DhopTotalTime   / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "NaiveStaggeredFermion CommTime    /Calls       : "
+            << DhopCommTime    / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "NaiveStaggeredFermion ComputeTime/Calls        : "
+            << DhopComputeTime / DhopCalls << " us" << std::endl;
+
+  // Average the compute time
+  _grid->GlobalSum(DhopComputeTime);
+  DhopComputeTime/=NP;
+
+  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
+  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
+
+  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
+  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
+
+  std::cout << GridLogMessage << "NaiveStaggeredFermion Stencil"     << std::endl; Stencil.Report();
+  std::cout << GridLogMessage << "NaiveStaggeredFermion StencilEven" << std::endl; StencilEven.Report();
+  std::cout << GridLogMessage << "NaiveStaggeredFermion StencilOdd"  << std::endl; StencilOdd.Report();
+}
+template<class Impl>
+void NaiveStaggeredFermion<Impl>::ZeroCounters(void)
+{
+  DhopCalls       = 0;
+  DhopTotalTime   = 0;
+  DhopCommTime    = 0;
+  DhopComputeTime = 0;
+  DhopFaceTime    = 0;
+
+  Stencil.ZeroCounters();
+  StencilEven.ZeroCounters();
+  StencilOdd.ZeroCounters();
+}
+
 ////////////////////////////////////////////////////////
 // Conserved current - not yet implemented.
 ////////////////////////////////////////////////////////


@@ -60,13 +60,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
   UmuOdd (_FourDimRedBlackGrid),
   Lebesgue(_FourDimGrid),
   LebesgueEvenOdd(_FourDimRedBlackGrid),
-  _tmp(&FiveDimRedBlackGrid),
-  Dirichlet(0)
+  _tmp(&FiveDimRedBlackGrid)
 {
-  Stencil.lo = &Lebesgue;
-  StencilEven.lo = &LebesgueEvenOdd;
-  StencilOdd.lo = &LebesgueEvenOdd;
   // some assertions
   assert(FiveDimGrid._ndimension==5);
   assert(FourDimGrid._ndimension==4);

@@ -96,19 +91,6 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
     assert(FourDimRedBlackGrid._simd_layout[d]  ==FourDimGrid._simd_layout[d]);
   }

-  if ( p.dirichlet.size() == Nd+1) {
-    Coordinate block = p.dirichlet;
-    if ( block[0] || block[1] || block[2] || block[3] || block[4] ){
-      Dirichlet = 1;
-      std::cout << GridLogMessage << " WilsonFermion: non-trivial Dirichlet condition "<< block << std::endl;
-      std::cout << GridLogMessage << " WilsonFermion: partial Dirichlet "<< p.partialDirichlet << std::endl;
-      Block = block;
-    }
-  } else {
-    Coordinate block(Nd+1,0);
-    Block = block;
-  }
-
   if (Impl::LsVectorised) {

     int nsimd = Simd::Nsimd();

@@ -143,38 +125,99 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
      StencilEven.BuildSurfaceList(LLs,vol4);
      StencilOdd.BuildSurfaceList(LLs,vol4);

-  //  std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
-  //                        <<" " << StencilEven.surface_list.size()<<std::endl;
 }

+template<class Impl>
+void WilsonFermion5D<Impl>::Report(void)
+{
+  RealD NP     = _FourDimGrid->_Nprocessors;
+  RealD NN     = _FourDimGrid->NodeCount();
+  RealD volume = Ls;
+  Coordinate latt = _FourDimGrid->GlobalDimensions();
+  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
+
+  if ( DhopCalls > 0 ) {
+    std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D Number of DhopEO Calls   : " << DhopCalls << std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D TotalTime   /Calls       : " << DhopTotalTime   / DhopCalls << " us" << std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D CommTime    /Calls       : " << DhopCommTime    / DhopCalls << " us" << std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D FaceTime    /Calls       : " << DhopFaceTime    / DhopCalls << " us" << std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D ComputeTime1/Calls       : " << DhopComputeTime / DhopCalls << " us" << std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D ComputeTime2/Calls       : " << DhopComputeTime2/ DhopCalls << " us" << std::endl;
+
+    // Average the compute time
+    _FourDimGrid->GlobalSum(DhopComputeTime);
+    DhopComputeTime/=NP;
+    RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
+    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
+
+    RealD Fullmflops = 1344*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
+    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
+  }
+
+  if ( DerivCalls > 0 ) {
+    std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D Number of Deriv Calls    : " <<DerivCalls <<std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls           : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls        : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D Dhop ComputeTime/Calls   : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
+
+    RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
+    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NP << std::endl;
+
+    RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
+    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NP << std::endl; }
+
+  if (DerivCalls > 0 || DhopCalls > 0){
+    std::cout << GridLogMessage << "WilsonFermion5D Stencil"     << std::endl; Stencil.Report();
+    std::cout << GridLogMessage << "WilsonFermion5D StencilEven" << std::endl; StencilEven.Report();
+    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd"  << std::endl; StencilOdd.Report();
+  }
+  if ( DhopCalls > 0){
+    std::cout << GridLogMessage << "WilsonFermion5D Stencil     Reporti()" << std::endl; Stencil.Reporti(DhopCalls);
+    std::cout << GridLogMessage << "WilsonFermion5D StencilEven Reporti()" << std::endl; StencilEven.Reporti(DhopCalls);
+    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd  Reporti()" << std::endl; StencilOdd.Reporti(DhopCalls);
+  }
+}
+
+template<class Impl>
+void WilsonFermion5D<Impl>::ZeroCounters(void) {
+  DhopCalls       = 0;
+  DhopCommTime    = 0;
+  DhopComputeTime = 0;
+  DhopComputeTime2= 0;
+  DhopFaceTime    = 0;
+  DhopTotalTime   = 0;
+
+  DerivCalls       = 0;
+  DerivCommTime    = 0;
+  DerivComputeTime = 0;
+  DerivDhopComputeTime = 0;
+
+  Stencil.ZeroCounters();
+  StencilEven.ZeroCounters();
+  StencilOdd.ZeroCounters();
+  Stencil.ZeroCountersi();
+  StencilEven.ZeroCountersi();
+  StencilOdd.ZeroCountersi();
+}
+
 template<class Impl>
 void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
 {
   GaugeField HUmu(_Umu.Grid());
   HUmu = _Umu*(-0.5);
-  if ( Dirichlet ) {
-
-    if ( this->Params.partialDirichlet ) {
-      std::cout << GridLogMessage << " partialDirichlet BCs " <<Block<<std::endl;
-    } else {
-      std::cout << GridLogMessage << " FULL Dirichlet BCs " <<Block<<std::endl;
-    }
-
-    std:: cout << GridLogMessage << "Checking block size multiple of rank boundaries for Dirichlet"<<std::endl;
-    for(int d=0;d<Nd;d++) {
-      int GaugeBlock = Block[d+1];
-      int ldim=GaugeGrid()->LocalDimensions()[d];
-      if (GaugeBlock) assert( (GaugeBlock%ldim)==0);
-    }
-
-    if (!this->Params.partialDirichlet) {
-      std::cout << GridLogMessage << " Dirichlet filtering gauge field BCs block " <<Block<<std::endl;
-      Coordinate GaugeBlock(Nd);
-      for(int d=0;d<Nd;d++) GaugeBlock[d] = Block[d+1];
-      DirichletFilter<GaugeField> Filter(GaugeBlock);
-      Filter.applyFilter(HUmu);
-    } else {
-      std::cout << GridLogMessage << " Dirichlet "<< Dirichlet << " NOT filtered gauge field" <<std::endl;
-    }
-  }
   Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
   pickCheckerboard(Even,UmuEven,Umu);
   pickCheckerboard(Odd ,UmuOdd,Umu);

@@ -216,6 +259,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
                                           const FermionField &B,
                                           int dag)
 {
+  DerivCalls++;
   assert((dag==DaggerNo) ||(dag==DaggerYes));

   conformable(st.Grid(),A.Grid());

@@ -226,12 +270,15 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
   FermionField Btilde(B.Grid());
   FermionField Atilde(B.Grid());

+  DerivCommTime-=usecond();
   st.HaloExchange(B,compressor);
+  DerivCommTime+=usecond();

   Atilde=A;
   int LLs = B.Grid()->_rdimensions[0];

+  DerivComputeTime-=usecond();
   for (int mu = 0; mu < Nd; mu++) {
     ////////////////////////////////////////////////////////////////////////
     // Flip gamma if dag
@@ -243,6 +290,8 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
     // Call the single hop
     ////////////////////////

+    DerivDhopComputeTime -= usecond();
+
     int Usites = U.Grid()->oSites();

     Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, Usites, B, Btilde, mu,gamma);
@@ -250,8 +299,10 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
     ////////////////////////////
     // spin trace outer product
     ////////////////////////////
+    DerivDhopComputeTime += usecond();
     Impl::InsertForce5D(mat, Btilde, Atilde, mu);
   }
+  DerivComputeTime += usecond();
 }

 template<class Impl>
@@ -309,10 +360,12 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
                                          DoubledGaugeField & U,
                                          const FermionField &in, FermionField &out,int dag)
 {
+  DhopTotalTime-=usecond();
   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
     DhopInternalOverlappedComms(st,lo,U,in,out,dag);
   else
     DhopInternalSerialComms(st,lo,U,in,out,dag);
+  DhopTotalTime+=usecond();
 }

@@ -321,7 +374,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
                                                         DoubledGaugeField & U,
                                                         const FermionField &in, FermionField &out,int dag)
 {
-  GRID_TRACE("DhopInternalOverlappedComms");
   Compressor compressor(dag);

   int LLs = in.Grid()->_rdimensions[0];

@@ -330,57 +382,53 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
   /////////////////////////////
   // Start comms  // Gather intranode and extra node differentiated??
   /////////////////////////////
-  {
-    GRID_TRACE("Gather");
-    st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
-  }
+  DhopFaceTime-=usecond();
+  st.HaloExchangeOptGather(in,compressor);
+  DhopFaceTime+=usecond();

+  DhopCommTime -=usecond();
   std::vector<std::vector<CommsRequest_t> > requests;
-  auto id=traceStart("Communicate overlapped");
   st.CommunicateBegin(requests);

   /////////////////////////////
   // Overlap with comms
   /////////////////////////////
-  {
-    GRID_TRACE("MergeSHM");
-    st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
-  }
+  DhopFaceTime-=usecond();
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
+  DhopFaceTime+=usecond();

   /////////////////////////////
   // do the compute interior
   /////////////////////////////
   int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
+  DhopComputeTime-=usecond();
   if (dag == DaggerYes) {
-    GRID_TRACE("DhopDagInterior");
     Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
   } else {
-    GRID_TRACE("DhopInterior");
     Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
   }
+  DhopComputeTime+=usecond();

   /////////////////////////////
   // Complete comms
   /////////////////////////////
   st.CommunicateComplete(requests);
-  traceStop(id);
+  DhopCommTime +=usecond();

   /////////////////////////////
   // do the compute exterior
   /////////////////////////////
-  {
-    GRID_TRACE("Merge");
-    st.CommsMerge(compressor);
-  }
+  DhopFaceTime-=usecond();
+  st.CommsMerge(compressor);
+  DhopFaceTime+=usecond();

+  DhopComputeTime2-=usecond();
   if (dag == DaggerYes) {
-    GRID_TRACE("DhopDagExterior");
     Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
   } else {
-    GRID_TRACE("DhopExterior");
     Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
   }
+  DhopComputeTime2+=usecond();
 }

@@ -390,30 +438,29 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
                                                     const FermionField &in,
                                                     FermionField &out,int dag)
 {
-  GRID_TRACE("DhopInternalSerialComms");
   Compressor compressor(dag);

   int LLs = in.Grid()->_rdimensions[0];

-  {
-    GRID_TRACE("HaloExchange");
-    st.HaloExchangeOpt(in,compressor);
-  }
+  DhopCommTime-=usecond();
+  st.HaloExchangeOpt(in,compressor);
+  DhopCommTime+=usecond();

+  DhopComputeTime-=usecond();
   int Opt = WilsonKernelsStatic::Opt;
   if (dag == DaggerYes) {
-    GRID_TRACE("DhopDag");
     Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
   } else {
-    GRID_TRACE("Dhop");
     Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
   }
+  DhopComputeTime+=usecond();
 }

 template<class Impl>
 void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls++;
   conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
   conformable(in.Grid(),out.Grid()); // drops the cb check

@@ -425,6 +472,7 @@ void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls++;
   conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
   conformable(in.Grid(),out.Grid()); // drops the cb check

@@ -436,6 +484,7 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int
 template<class Impl>
 void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls+=2;
   conformable(in.Grid(),FermionGrid()); // verifies full grid
   conformable(in.Grid(),out.Grid());

@@ -490,17 +539,12 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
   LatComplex sk(_grid);  sk = Zero();
   LatComplex sk2(_grid); sk2= Zero();
   LatComplex W(_grid);   W= Zero();
+  LatComplex a(_grid);   a= Zero();
   LatComplex one  (_grid); one = ScalComplex(1.0,0.0);
   LatComplex cosha(_grid);
   LatComplex kmu(_grid);
   LatComplex Wea(_grid);
   LatComplex Wema(_grid);
-  LatComplex ea(_grid);
-  LatComplex ema(_grid);
-  LatComplex eaLs(_grid);
-  LatComplex emaLs(_grid);
-  LatComplex ea2Ls(_grid);
-  LatComplex ema2Ls(_grid);
   LatComplex sinha(_grid);
   LatComplex sinhaLs(_grid);
   LatComplex coshaLs(_grid);

@@ -535,29 +579,39 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
   ////////////////////////////////////////////
   cosha = (one + W*W + sk) / (abs(W)*2.0);

-  ea = (cosha + sqrt(cosha*cosha-one));
-  ema= (cosha - sqrt(cosha*cosha-one));
-  eaLs = pow(ea,Ls);
-  emaLs= pow(ema,Ls);
-  ea2Ls = pow(ea,2.0*Ls);
-  ema2Ls= pow(ema,2.0*Ls);
-  Wea= abs(W) * ea;
-  Wema= abs(W) * ema;
-  //  a=log(ea);
-
-  sinha = 0.5*(ea - ema);
-  sinhaLs = 0.5*(eaLs-emaLs);
-  coshaLs = 0.5*(eaLs+emaLs);
+  // FIXME Need a Lattice acosh
+  {
+    autoView(cosha_v,cosha,CpuRead);
+    autoView(a_v,a,CpuWrite);
+    for(int idx=0;idx<_grid->lSites();idx++){
+      Coordinate lcoor(Nd);
+      Tcomplex cc;
+      //    RealD sgn;
+      _grid->LocalIndexToLocalCoor(idx,lcoor);
+      peekLocalSite(cc,cosha_v,lcoor);
+      assert((double)real(cc)>=1.0);
+      assert(fabs((double)imag(cc))<=1.0e-15);
+      cc = ScalComplex(::acosh(real(cc)),0.0);
+      pokeLocalSite(cc,a_v,lcoor);
+    }
+  }
+
+  Wea = ( exp( a) * abs(W)  );
+  Wema= ( exp(-a) * abs(W)  );
+  sinha = 0.5*(exp( a) - exp(-a));
+  sinhaLs = 0.5*(exp( a*Ls) - exp(-a*Ls));
+  coshaLs = 0.5*(exp( a*Ls) + exp(-a*Ls));

   A = one / (abs(W) * sinha * 2.0) * one / (sinhaLs * 2.0);
-  F = eaLs * (one - Wea + (Wema - one) * mass*mass);
-  F = F + emaLs * (Wema - one + (one - Wea) * mass*mass);
+  F = exp( a*Ls) * (one - Wea + (Wema - one) * mass*mass);
+  F = F + exp(-a*Ls) * (Wema - one + (one - Wea) * mass*mass);
   F = F - abs(W) * sinha * 4.0 * mass;

-  Bpp =  (A/F) * (ema2Ls - one) * (one - Wema) * (one - mass*mass * one);
-  Bmm =  (A/F) * (one - ea2Ls) * (one - Wea) * (one - mass*mass * one);
-  App =  (A/F) * (ema2Ls - one) * ema * (ema - abs(W)) * (one - mass*mass * one);
-  Amm =  (A/F) * (one - ea2Ls) * ea * (ea - abs(W)) * (one - mass*mass * one);
+  Bpp =  (A/F) * (exp(-a*Ls*2.0) - one) * (one - Wema) * (one - mass*mass * one);
+  Bmm =  (A/F) * (one - exp(a*Ls*2.0)) * (one - Wea) * (one - mass*mass * one);
+  App =  (A/F) * (exp(-a*Ls*2.0) - one) * exp(-a) * (exp(-a) - abs(W)) * (one - mass*mass * one);
+  Amm =  (A/F) * (one - exp(a*Ls*2.0)) * exp(a) * (exp(a) - abs(W)) * (one - mass*mass * one);
   ABpm = (A/F) * abs(W) * sinha * 2.0  * (one + mass * coshaLs * 2.0 + mass*mass * one);

   //P+ source, P- source
@@ -580,29 +634,29 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
       buf1_4d = Zero();
       ExtractSlice(buf1_4d, PRsource, (tt-1), 0);
       //G(s,t)
-      bufR_4d = bufR_4d + A * eaLs * pow(ema,f) * signW * buf1_4d + A * emaLs * pow(ea,f) * signW * buf1_4d;
+      bufR_4d = bufR_4d + A * exp(a*Ls) * exp(-a*f) * signW * buf1_4d + A * exp(-a*Ls) * exp(a*f) * signW * buf1_4d;
       //A++*exp(a(s+t))
-      bufR_4d = bufR_4d + App * pow(ea,ss) * pow(ea,tt) * signW * buf1_4d ;
+      bufR_4d = bufR_4d + App * exp(a*ss) * exp(a*tt) * signW * buf1_4d ;
       //A+-*exp(a(s-t))
-      bufR_4d = bufR_4d + ABpm * pow(ea,ss) * pow(ema,tt) * signW * buf1_4d ;
+      bufR_4d = bufR_4d + ABpm * exp(a*ss) * exp(-a*tt) * signW * buf1_4d ;
      //A-+*exp(a(-s+t))
-      bufR_4d = bufR_4d + ABpm * pow(ema,ss) * pow(ea,tt) * signW * buf1_4d ;
+      bufR_4d = bufR_4d + ABpm * exp(-a*ss) * exp(a*tt) * signW * buf1_4d ;
      //A--*exp(a(-s-t))
-      bufR_4d = bufR_4d + Amm * pow(ema,ss) * pow(ema,tt) * signW * buf1_4d ;
+      bufR_4d = bufR_4d + Amm * exp(-a*ss) * exp(-a*tt) * signW * buf1_4d ;

       //GL
       buf2_4d = Zero();
       ExtractSlice(buf2_4d, PLsource, (tt-1), 0);
       //G(s,t)
-      bufL_4d = bufL_4d + A * eaLs * pow(ema,f) * signW * buf2_4d + A * emaLs * pow(ea,f) * signW * buf2_4d;
+      bufL_4d = bufL_4d + A * exp(a*Ls) * exp(-a*f) * signW * buf2_4d + A * exp(-a*Ls) * exp(a*f) * signW * buf2_4d;
       //B++*exp(a(s+t))
-      bufL_4d = bufL_4d + Bpp * pow(ea,ss) * pow(ea,tt) * signW * buf2_4d ;
+      bufL_4d = bufL_4d + Bpp * exp(a*ss) * exp(a*tt) * signW * buf2_4d ;
       //B+-*exp(a(s-t))
-      bufL_4d = bufL_4d + ABpm * pow(ea,ss) * pow(ema,tt) * signW * buf2_4d ;
+      bufL_4d = bufL_4d + ABpm * exp(a*ss) * exp(-a*tt) * signW * buf2_4d ;
       //B-+*exp(a(-s+t))
-      bufL_4d = bufL_4d + ABpm * pow(ema,ss) * pow(ea,tt) * signW * buf2_4d ;
+      bufL_4d = bufL_4d + ABpm * exp(-a*ss) * exp(a*tt) * signW * buf2_4d ;
       //B--*exp(a(-s-t))
-      bufL_4d = bufL_4d + Bmm * pow(ema,ss) * pow(ema,tt) * signW * buf2_4d ;
+      bufL_4d = bufL_4d + Bmm * exp(-a*ss) * exp(-a*tt) * signW * buf2_4d ;
     }
     InsertSlice(bufR_4d, GR, (ss-1), 0);
     InsertSlice(bufL_4d, GL, (ss-1), 0);

@@ -721,12 +775,28 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
   W = one - M5 + sk2;

   ////////////////////////////////////////////
-  // Cosh alpha -> exp(+/- alpha)
+  // Cosh alpha -> alpha
   ////////////////////////////////////////////
   cosha = (one + W*W + sk) / (abs(W)*2.0);

-  Wea = abs(W)*(cosha + sqrt(cosha*cosha-one));
-  Wema= abs(W)*(cosha - sqrt(cosha*cosha-one));
+  // FIXME Need a Lattice acosh
+  {
+    autoView(cosha_v,cosha,CpuRead);
+    autoView(a_v,a,CpuWrite);
+    for(int idx=0;idx<_grid->lSites();idx++){
+      Coordinate lcoor(Nd);
+      Tcomplex cc;
+      //    RealD sgn;
+      _grid->LocalIndexToLocalCoor(idx,lcoor);
+      peekLocalSite(cc,cosha_v,lcoor);
+      assert((double)real(cc)>=1.0);
+      assert(fabs((double)imag(cc))<=1.0e-15);
+      cc = ScalComplex(::acosh(real(cc)),0.0);
+      pokeLocalSite(cc,a_v,lcoor);
+    }}
+
+  Wea = ( exp( a) * abs(W)  );
+  Wema= ( exp(-a) * abs(W)  );

   num   = num + ( one - Wema ) * mass * in;
   denom= ( Wea - one ) + mass*mass * (one - Wema);
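The per-site loops introduced on the new side of this file replace the closed forms `cosha ± sqrt(cosha*cosha-one)` with an explicit `::acosh` because, as the `// FIXME Need a Lattice acosh` comment says, no lattice-wide `acosh` exists. The two formulations agree through the standard identity

$$
a = \operatorname{acosh}(\cosh a) = \log\!\left(\cosh a + \sqrt{\cosh^2 a - 1}\right)
\quad\Longrightarrow\quad
e^{\pm a} = \cosh a \pm \sqrt{\cosh^2 a - 1},
$$

so `Wea = exp(a)*abs(W)` is the same quantity as `abs(W)*(cosha + sqrt(cosha*cosha-one))`, and likewise for `Wema`. The asserts inside the loop (`real(cc) >= 1` and a vanishing imaginary part) are exactly the domain conditions under which this identity is real-valued.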


@ -60,9 +60,6 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
_tmp(&Hgrid), _tmp(&Hgrid),
anisotropyCoeff(anis) anisotropyCoeff(anis)
{ {
Stencil.lo = &Lebesgue;
StencilEven.lo = &LebesgueEvenOdd;
StencilOdd.lo = &LebesgueEvenOdd;
// Allocate the required comms buffer // Allocate the required comms buffer
ImportGauge(_Umu); ImportGauge(_Umu);
if (anisotropyCoeff.isAnisotropic){ if (anisotropyCoeff.isAnisotropic){
@ -79,6 +76,91 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
StencilOdd.BuildSurfaceList(1,vol4); StencilOdd.BuildSurfaceList(1,vol4);
} }
template<class Impl>
void WilsonFermion<Impl>::Report(void)
{
RealD NP = _grid->_Nprocessors;
RealD NN = _grid->NodeCount();
RealD volume = 1;
Coordinate latt = _grid->GlobalDimensions();
for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
if ( DhopCalls > 0 ) {
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
std::cout << GridLogMessage << "WilsonFermion Number of DhopEO Calls : " << DhopCalls << std::endl;
std::cout << GridLogMessage << "WilsonFermion TotalTime /Calls : " << DhopTotalTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion CommTime /Calls : " << DhopCommTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion FaceTime /Calls : " << DhopFaceTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion ComputeTime1/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion ComputeTime2/Calls : " << DhopComputeTime2/ DhopCalls << " us" << std::endl;
// Average the compute time
_grid->GlobalSum(DhopComputeTime);
DhopComputeTime/=NP;
RealD mflops = 1320*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
RealD Fullmflops = 1320*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
}
if ( DerivCalls > 0 ) {
std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
std::cout << GridLogMessage << "WilsonFermion Number of Deriv Calls : " <<DerivCalls <<std::endl;
std::cout << GridLogMessage << "WilsonFermion CommTime/Calls : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
std::cout << GridLogMessage << "WilsonFermion ComputeTime/Calls : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
std::cout << GridLogMessage << "WilsonFermion Dhop ComputeTime/Calls : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
// how to count flops here?
RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
std::cout << GridLogMessage << "Average mflops/s per call ? : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node ? : " << mflops/NP << std::endl;
// how to count flops here?
RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) ? : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full) ? : " << Fullmflops/NP << std::endl; }
if (DerivCalls > 0 || DhopCalls > 0){
std::cout << GridLogMessage << "WilsonFermion Stencil" <<std::endl; Stencil.Report();
std::cout << GridLogMessage << "WilsonFermion StencilEven"<<std::endl; StencilEven.Report();
std::cout << GridLogMessage << "WilsonFermion StencilOdd" <<std::endl; StencilOdd.Report();
}
if ( DhopCalls > 0){
std::cout << GridLogMessage << "WilsonFermion Stencil Reporti()" <<std::endl; Stencil.Reporti(DhopCalls);
std::cout << GridLogMessage << "WilsonFermion StencilEven Reporti()"<<std::endl; StencilEven.Reporti(DhopCalls);
std::cout << GridLogMessage << "WilsonFermion StencilOdd Reporti()" <<std::endl; StencilOdd.Reporti(DhopCalls);
}
}
template<class Impl>
void WilsonFermion<Impl>::ZeroCounters(void) {
DhopCalls = 0; // ok
DhopCommTime = 0;
DhopComputeTime = 0;
DhopComputeTime2= 0;
DhopFaceTime = 0;
DhopTotalTime = 0;
DerivCalls = 0; // ok
DerivCommTime = 0;
DerivComputeTime = 0;
DerivDhopComputeTime = 0;
Stencil.ZeroCounters();
StencilEven.ZeroCounters();
StencilOdd.ZeroCounters();
Stencil.ZeroCountersi();
StencilEven.ZeroCountersi();
StencilOdd.ZeroCountersi();
}
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
{ {
@ -238,6 +320,7 @@ template <class Impl>
void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U, void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
GaugeField &mat, const FermionField &A, GaugeField &mat, const FermionField &A,
const FermionField &B, int dag) { const FermionField &B, int dag) {
DerivCalls++;
assert((dag == DaggerNo) || (dag == DaggerYes)); assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor(dag); Compressor compressor(dag);
@ -246,8 +329,11 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
FermionField Atilde(B.Grid()); FermionField Atilde(B.Grid());
Atilde = A; Atilde = A;
DerivCommTime-=usecond();
st.HaloExchange(B, compressor); st.HaloExchange(B, compressor);
DerivCommTime+=usecond();
DerivComputeTime-=usecond();
for (int mu = 0; mu < Nd; mu++) { for (int mu = 0; mu < Nd; mu++) {
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Flip gamma (1+g)<->(1-g) if dag // Flip gamma (1+g)<->(1-g) if dag
@ -255,6 +341,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
int gamma = mu; int gamma = mu;
if (!dag) gamma += Nd; if (!dag) gamma += Nd;
DerivDhopComputeTime -= usecond();
int Ls=1; int Ls=1;
Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma); Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma);
@ -262,7 +349,9 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
// spin trace outer product // spin trace outer product
////////////////////////////////////////////////// //////////////////////////////////////////////////
Impl::InsertForce4D(mat, Btilde, Atilde, mu); Impl::InsertForce4D(mat, Btilde, Atilde, mu);
DerivDhopComputeTime += usecond();
} }
DerivComputeTime += usecond();
} }
template <class Impl> template <class Impl>
@ -309,6 +398,7 @@ void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, co
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
{ {
DhopCalls+=2;
conformable(in.Grid(), _grid); // verifies full grid conformable(in.Grid(), _grid); // verifies full grid
conformable(in.Grid(), out.Grid()); conformable(in.Grid(), out.Grid());
@ -320,6 +410,7 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
{ {
DhopCalls++;
conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check conformable(in.Grid(), out.Grid()); // drops the cb check
@ -332,6 +423,7 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
{ {
DhopCalls++;
conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check conformable(in.Grid(), out.Grid()); // drops the cb check
@ -396,12 +488,14 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
const FermionField &in, const FermionField &in,
FermionField &out, int dag) FermionField &out, int dag)
{ {
DhopTotalTime-=usecond();
#ifdef GRID_OMP #ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,in,out,dag); DhopInternalOverlappedComms(st,lo,U,in,out,dag);
else else
#endif #endif
DhopInternalSerial(st,lo,U,in,out,dag); DhopInternalSerial(st,lo,U,in,out,dag);
DhopTotalTime+=usecond();
} }
template <class Impl> template <class Impl>
@ -410,7 +504,6 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
const FermionField &in, const FermionField &in,
FermionField &out, int dag) FermionField &out, int dag)
{ {
GRID_TRACE("DhopOverlapped");
assert((dag == DaggerNo) || (dag == DaggerYes)); assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor(dag); Compressor compressor(dag);
@ -421,55 +514,53 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
///////////////////////////// /////////////////////////////
std::vector<std::vector<CommsRequest_t> > requests; std::vector<std::vector<CommsRequest_t> > requests;
st.Prepare(); st.Prepare();
{ DhopFaceTime-=usecond();
GRID_TRACE("Gather"); st.HaloGather(in,compressor);
st.HaloGather(in,compressor); DhopFaceTime+=usecond();
}
tracePush("Communication"); DhopCommTime -=usecond();
st.CommunicateBegin(requests); st.CommunicateBegin(requests);
///////////////////////////// /////////////////////////////
// Overlap with comms // Overlap with comms
///////////////////////////// /////////////////////////////
{ DhopFaceTime-=usecond();
GRID_TRACE("MergeSHM"); st.CommsMergeSHM(compressor);
st.CommsMergeSHM(compressor); DhopFaceTime+=usecond();
}
///////////////////////////// /////////////////////////////
// do the compute interior // do the compute interior
///////////////////////////// /////////////////////////////
int Opt = WilsonKernelsStatic::Opt; int Opt = WilsonKernelsStatic::Opt;
DhopComputeTime-=usecond();
if (dag == DaggerYes) { if (dag == DaggerYes) {
GRID_TRACE("DhopDagInterior");
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0); Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
} else { } else {
GRID_TRACE("DhopInterior");
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0); Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
} }
DhopComputeTime+=usecond();
///////////////////////////// /////////////////////////////
// Complete comms // Complete comms
///////////////////////////// /////////////////////////////
st.CommunicateComplete(requests); st.CommunicateComplete(requests);
tracePop("Communication"); DhopCommTime +=usecond();
DhopFaceTime-=usecond();
st.CommsMerge(compressor);
DhopFaceTime+=usecond();
{
GRID_TRACE("Merge");
st.CommsMerge(compressor);
}
///////////////////////////// /////////////////////////////
// do the compute exterior // do the compute exterior
///////////////////////////// /////////////////////////////
DhopComputeTime2-=usecond();
if (dag == DaggerYes) { if (dag == DaggerYes) {
GRID_TRACE("DhopDagExterior");
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1); Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
} else { } else {
GRID_TRACE("DhopExterior");
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1); Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
} }
DhopComputeTime2+=usecond();
}; };
@@ -479,22 +570,20 @@ void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
                                              const FermionField &in,
                                              FermionField &out, int dag)
 {
-  GRID_TRACE("DhopSerial");
   assert((dag == DaggerNo) || (dag == DaggerYes));
   Compressor compressor(dag);
-  {
-    GRID_TRACE("HaloExchange");
-    st.HaloExchange(in, compressor);
-  }
+  DhopCommTime-=usecond();
+  st.HaloExchange(in, compressor);
+  DhopCommTime+=usecond();
+  DhopComputeTime-=usecond();
   int Opt = WilsonKernelsStatic::Opt;
   if (dag == DaggerYes) {
-    GRID_TRACE("DhopDag");
     Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
   } else {
-    GRID_TRACE("Dhop");
     Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
   }
+  DhopComputeTime+=usecond();
 };
 /*Change ends */
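A note on the timer idiom above: `X-=usecond(); ...; X+=usecond();` accumulates the elapsed microseconds of the bracketed work into the counter X. An equivalent RAII helper — a sketch, not something Grid provides under this name — would be:

#include <chrono>

// Sketch: RAII form of the "-=usecond(); work; +=usecond();" bracket.
// Adds the elapsed microseconds to a caller-owned accumulator on scope exit.
struct ScopedUsecTimer {
  double &acc;
  std::chrono::steady_clock::time_point t0;
  explicit ScopedUsecTimer(double &a)
    : acc(a), t0(std::chrono::steady_clock::now()) {}
  ~ScopedUsecTimer() {
    auto dt = std::chrono::steady_clock::now() - t0;
    acc += std::chrono::duration<double, std::micro>(dt).count();
  }
};

// Usage, mirroring the hunk above (DhopCommTime as a double accumulator):
//   { ScopedUsecTimer t(DhopCommTime); st.HaloExchange(in, compressor); }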
View File
@@ -72,15 +72,20 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
   if (SE->_is_local) {                                          \
     int perm= SE->_permute;                                     \
     auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \
     spProj(chi,tmp);                                            \
-    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);              \
-    Recon(result, Uchi);                                        \
+  } else if ( st.same_node[Dir] ) {                             \
+    chi = coalescedRead(buf[SE->_offset],lane);                 \
   }                                                             \
+  acceleratorSynchronise();                                     \
+  if (SE->_is_local || st.same_node[Dir] ) {                    \
+    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);              \
+    Recon(result, Uchi);                                        \
+  }                                                             \
   acceleratorSynchronise();
 
 #define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon)               \
   SE = st.GetEntry(ptype, Dir, sF);                             \
-  if (!SE->_is_local ) {                                        \
+  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {              \
     auto chi = coalescedRead(buf[SE->_offset],lane);            \
     Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);              \
     Recon(result, Uchi);                                        \
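The '+' version of the interior leg macro splits each stencil leg into two passes so SIMT lanes stay converged: first every lane obtains `chi` (a local permuted read, or a read from the intra-node comms buffer when `same_node[Dir]` holds), then, after a synchronise, the lanes that have data apply the gauge link and reconstruct; truly remote neighbours are left to the exterior macro. A scalar sketch of that control flow, with illustrative (non-Grid) names:

// Sketch: per-leg control flow of the interior kernel (scalar stand-in
// for the SIMT macro above; Leg, the loaders, and apply are illustrative).
enum class Leg { Local, SameNode, Remote };

template <class HalfSpinor, class LoadLocal, class LoadBuf, class Apply>
void stencilLegInterior(Leg leg, LoadLocal loadLocal, LoadBuf loadBuf,
                        Apply apply)
{
  HalfSpinor chi{};
  bool have = false;
  if (leg == Leg::Local)         { chi = loadLocal(); have = true; }
  else if (leg == Leg::SameNode) { chi = loadBuf();   have = true; }
  // (the real SIMT code synchronises lanes here)
  if (have) apply(chi);          // multLink + Recon
  // Leg::Remote is deferred to the exterior kernel, which runs after
  // CommsMerge has landed the inter-node faces.
}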
@@ -411,6 +416,19 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
 #undef LoopBody
 }
 
+#define KERNEL_CALL_TMP(A)                                      \
+  const uint64_t NN = Nsite*Ls;                                 \
+  auto U_p = & U_v[0];                                          \
+  auto in_p = & in_v[0];                                        \
+  auto out_p = & out_v[0];                                      \
+  auto st_p = st_v._entries_p;                                  \
+  auto st_perm = st_v._permute_type;                            \
+  accelerator_forNB( ss, NN, Simd::Nsimd(), {                   \
+      int sF = ss;                                              \
+      int sU = ss/Ls;                                           \
+      WilsonKernels<Impl>::A(st_perm,st_p,U_p,buf,sF,sU,in_p,out_p); \
+    });                                                         \
+  accelerator_barrier();
 
 #define KERNEL_CALLNB(A)                                        \
   const uint64_t NN = Nsite*Ls;                                 \
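The added KERNEL_CALL_TMP differs from KERNEL_CALLNB in that it hoists raw pointers (`U_p`, `in_p`, `out_p`, and the stencil tables) out of the view objects before entering the device lambda — a common workaround when the offload toolchain (here the SYCL path) cannot capture the views themselves. A sketch of the idiom, with a hypothetical launcher standing in for `accelerator_forNB`:

#include <cstdint>

// Hypothetical device launcher standing in for accelerator_forNB.
template <class F> void parallelForDevice(uint64_t n, F f);

// Sketch: capture only trivially-copyable raw pointers in the device
// lambda, never the view objects that own them.
template <class View>
void launchSketch(View &in_v, View &out_v, uint64_t NN)
{
  auto in_p  = &in_v[0];   // raw device pointers extracted on the host
  auto out_p = &out_v[0];
  parallelForDevice(NN, [=](uint64_t ss) {
    out_p[ss] = in_p[ss];  // placeholder for the real stencil body
  });
}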
@@ -422,34 +440,12 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
 #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
 
-#define KERNEL_CALL_EXT(A)                                      \
-  const uint64_t sz = st.surface_list.size();                   \
-  auto ptr = &st.surface_list[0];                               \
-  accelerator_forNB( ss, sz, Simd::Nsimd(), {                   \
-      int sF = ptr[ss];                                         \
-      int sU = sF/Ls;                                           \
-      WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);    \
-    });                                                         \
-  accelerator_barrier();
-
 #define ASM_CALL(A)                                             \
-  thread_for( sss, Nsite, {                                     \
-    int ss = st.lo->Reorder(sss);                               \
+  thread_for( ss, Nsite, {                                      \
     int sU = ss;                                                \
     int sF = ss*Ls;                                             \
     WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
   });
 
-#define ASM_CALL_SLICE(A)                                       \
-  auto grid = in.Grid() ;                                       \
-  int nt = grid->LocalDimensions()[4];                          \
-  int nxyz = Nsite/nt ;                                         \
-  for(int t=0;t<nt;t++){                                        \
-    thread_for( sss, nxyz, {                                    \
-      int ss = t*nxyz+sss;                                      \
-      int sU = ss;                                              \
-      int sF = ss*Ls;                                           \
-      WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
-    });}
-
 template <class Impl>
 void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
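The deleted `KERNEL_CALL_EXT` launches the exterior kernel only over `st.surface_list` — the precomputed indices of sites whose stencil crosses a rank boundary — so the exterior pass costs O(surface) rather than O(volume). The idea, as a plain-loop sketch (the real version is an `accelerator_forNB` launch; these helper names are illustrative):

#include <cstdint>
#include <vector>

// Sketch: exterior pass driven by a precomputed surface list.
// surface[] holds the 5d site indices sF whose stencil touches data
// owned by another rank; interior sites were handled already.
template <class Kernel>
void runExteriorSketch(const std::vector<uint64_t> &surface, int Ls,
                       Kernel kernel)
{
  for (uint64_t ss = 0; ss < surface.size(); ++ss) {
    uint64_t sF = surface[ss]; // fifth-dimension-resolved site index
    uint64_t sU = sF / Ls;     // corresponding 4d gauge-field site
    kernel(sF, sU);            // apply only the exterior stencil legs
  }
}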
@@ -463,7 +459,11 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
   if( interior && exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
+#ifdef SYCL_HACK
+    if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_TMP(HandDhopSiteSycl); return; }
+#else
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
+#endif
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;}
 #endif
@@ -474,10 +474,8 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
 #endif
   } else if( exterior ) {
-    // dependent on result of merge
-    acceleratorFenceComputeStream();
-    if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
-    if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt); return;}
+    if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
+    if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;}
 #endif
@@ -500,20 +498,21 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
 #endif
+    acceleratorFenceComputeStream();
   } else if( interior ) {
-    if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteDagInt); return;}
-    if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteDagInt); return;}
+    if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
+    if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
 #endif
   } else if( exterior ) {
-    // Dependent on result of merge
     acceleratorFenceComputeStream();
-    if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteDagExt); return;}
-    if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteDagExt); return;}
+    if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
+    if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
 #endif
+    acceleratorFenceComputeStream();
   }
   assert(0 && " Kernel optimisation case not covered ");
 }
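On the fences above: kernels are enqueued asynchronously on the compute stream, and the exterior kernels read halo buffers filled by `CommsMerge`, so a fence must order the merge before the exterior launch (the '+' side adds a second fence to order the exterior results before whatever follows). Schematically, with hypothetical stand-in helpers:

// Hypothetical stand-ins for Grid's CommsMerge /
// acceleratorFenceComputeStream / exterior kernel launch.
void commsMerge();            // fills the halo buffers (may be async)
void fenceComputeStream();    // orders prior work on the compute stream
void launchExteriorKernel();  // reads the halo buffers

// Sketch: why a fence sits between the merge and the exterior launch.
void exteriorPhaseSketch()
{
  commsMerge();
  fenceComputeStream();   // merge must be visible before the kernel reads
  launchExteriorKernel();
}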
View File

@@ -1 +0,0 @@
-../WilsonCloverFermionInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../WilsonFermionInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../WilsonTMFermionInstantiation.cc.master

View File

@@ -1 +0,0 @@
-#define IMPLEMENTATION SpWilsonImplD

View File

@@ -1 +0,0 @@
-../WilsonCloverFermionInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../WilsonFermionInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../WilsonTMFermionInstantiation.cc.master

View File

@@ -1 +0,0 @@
-#define IMPLEMENTATION SpWilsonImplF
Some files were not shown because too many files have changed in this diff.