Merge branch 'feature/ddhmc' of https://github.com/paboyle/Grid into feature/ddhmc

Several updates
Correct mass
2025-10-29 19:14:33 +00:00 · 2022-02-14 17:33:17 +01:00 · 2022-02-14 17:29:41 +01:00 · 2021-11-17 21:40:04 +00:00 · 2021-10-07 20:06:55 +01:00 · 2021-10-07 20:06:17 +01:00
569 changed files with 23408 additions and 56416 deletions
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@@ -1,54 +0,0 @@
-name: Bug report
-description: Report a bug.
-title: "<insert title>"
-labels: [bug]
-
-body:
-  - type: markdown
-    attributes:
-      value: >
-        Thank you for taking the time to file a bug report.
-        Please check that the code is pointing to the HEAD of develop
-        or any commit in master which is tagged with a version number.
-
-  - type: textarea
-    attributes:
-      label: "Describe the issue:"
-      description: >
-        Describe the issue and any previous attempt to solve it.
-    validations:
-      required: true
-
-  - type: textarea
-    attributes:
-      label: "Code example:"
-      description: >
-        If relevant, show how to reproduce the issue using a minimal working
-        example.
-      placeholder: |
-        << your code here >>
-      render: shell
-    validations:
-      required: false
-
-  - type: textarea
-    attributes:
-      label: "Target platform:"
-      description: >
-        Give a description of the target platform (CPU, network, compiler).
-        Please give the full CPU part description, using for example
-        `cat /proc/cpuinfo | grep 'model name' | uniq` (Linux)
-        or `sysctl machdep.cpu.brand_string` (macOS) and the full output
-        the `--version` option of your compiler.
-    validations:
-      required: true
-
-  - type: textarea
-    attributes:
-      label: "Configure options:"
-      description: >
-        Please give the exact configure command used and attach
-        `config.log`, `grid.config.summary` and the output of `make V=1`.
-      render: shell
-    validations:
-      required: true
--- a/Grid/DisableWarnings.h
+++ b/Grid/DisableWarnings.h
@@ -34,6 +34,9 @@ directory

 #if defined __GNUC__ && __GNUC__>=6
 #pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
+#if defined __GNUC__ && __GNUC__>=6
+#pragma GCC diagnostic ignored "-Wpsabi"
 #endif

 //disables and intel compiler specific warning (in json.hpp)
@@ -44,22 +47,14 @@ directory
 #ifdef __NVCC__
 //disables nvcc specific warning in json.hpp
 #pragma clang diagnostic ignored "-Wdeprecated-register"
-
-#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
- //disables nvcc specific warning in json.hpp
-#pragma nv_diag_suppress unsigned_compare_with_zero
-#pragma nv_diag_suppress cast_to_qualified_type
- //disables nvcc specific warning in many files
-#pragma nv_diag_suppress esa_on_defaulted_function_ignored
-#pragma nv_diag_suppress extra_semicolon
-#else
- //disables nvcc specific warning in json.hpp
 #pragma diag_suppress unsigned_compare_with_zero
 #pragma diag_suppress cast_to_qualified_type
+
 //disables nvcc specific warning in many files
 #pragma diag_suppress esa_on_defaulted_function_ignored
 #pragma diag_suppress extra_semicolon
-#endif
+
+//Eigen only
 #endif

 // Disable vectorisation in Eigen on the Power8/9 and PowerPC
--- a/Grid/GridCore.h
+++ b/Grid/GridCore.h
@@ -44,10 +44,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridStd.h>
 #include <Grid/threads/Pragmas.h>
 #include <Grid/perfmon/Timer.h>
-//#include <Grid/perfmon/PerfCount.h>
+#include <Grid/perfmon/PerfCount.h>
 #include <Grid/util/Util.h>
 #include <Grid/log/Log.h>
-#include <Grid/perfmon/Tracing.h>
 #include <Grid/allocator/Allocator.h>
 #include <Grid/simd/Simd.h>
 #include <Grid/threads/ThreadReduction.h>
--- a/Grid/GridStd.h
+++ b/Grid/GridStd.h
@@ -16,7 +16,6 @@
 #include <functional>
 #include <stdio.h>
 #include <stdlib.h>
-#include <strings.h>
 #include <stdio.h>
 #include <signal.h>
 #include <ctime>
--- a/Grid/Grid_Eigen_Dense.h
+++ b/Grid/Grid_Eigen_Dense.h
@@ -14,11 +14,7 @@
 /* NVCC save and restore compile environment*/
 #ifdef __NVCC__
 #pragma push
-#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
-#pragma nv_diag_suppress code_is_unreachable
-#else
 #pragma diag_suppress code_is_unreachable
-#endif
 #pragma push_macro("__CUDA_ARCH__")
 #pragma push_macro("__NVCC__")
 #pragma push_macro("__CUDACC__")
--- a/Grid/Makefile.am
+++ b/Grid/Makefile.am
@@ -66,10 +66,6 @@ if BUILD_FERMION_REPS
  extra_sources+=$(ADJ_FERMION_FILES)
  extra_sources+=$(TWOIND_FERMION_FILES)
 endif
-if BUILD_SP
-    extra_sources+=$(SP_FERMION_FILES)
-    extra_sources+=$(SP_TWOIND_FERMION_FILES)
-endif

 lib_LIBRARIES = libGrid.a

--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@@ -55,7 +55,6 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
-#include <Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h>
 #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
@@ -69,8 +68,7 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/PowerMethod.h>

 NAMESPACE_CHECK(PowerMethod);
-#include <Grid/algorithms/multigrid/MultiGrid.h>
-
+#include <Grid/algorithms/CoarsenedMatrix.h>
 NAMESPACE_CHECK(CoarsendMatrix);
 #include <Grid/algorithms/FFT.h>

--- a/Grid/algorithms/multigrid/CoarsenedMatrix.h
+++ b/Grid/algorithms/multigrid/CoarsenedMatrix.h
@@ -56,6 +56,243 @@ inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner,
  blockSum(CoarseInner,fine_inner_msk);
 }

+
+class Geometry {
+public:
+  int npoint;
+  int base;
+  std::vector<int> directions   ;
+  std::vector<int> displacements;
+  std::vector<int> points_dagger;
+
+  Geometry(int _d)  {
+    
+    base = (_d==5) ? 1:0;
+
+    // make coarse grid stencil for 4d , not 5d
+    if ( _d==5 ) _d=4;
+
+    npoint = 2*_d+1;
+    directions.resize(npoint);
+    displacements.resize(npoint);
+    points_dagger.resize(npoint);
+    for(int d=0;d<_d;d++){
+      directions[d   ] = d+base;
+      directions[d+_d] = d+base;
+      displacements[d  ] = +1;
+      displacements[d+_d]= -1;
+      points_dagger[d   ] = d+_d;
+      points_dagger[d+_d] = d;
+    }
+    directions   [2*_d]=0;
+    displacements[2*_d]=0;
+    points_dagger[2*_d]=2*_d;
+  }
+
+  int point(int dir, int disp) {
+    assert(disp == -1 || disp == 0 || disp == 1);
+    assert(base+0 <= dir && dir < base+4);
+
+    // directions faster index = new indexing
+    // 4d (base = 0):
+    // point 0  1  2  3  4  5  6  7  8
+    // dir   0  1  2  3  0  1  2  3  0
+    // disp +1 +1 +1 +1 -1 -1 -1 -1  0
+    // 5d (base = 1):
+    // point 0  1  2  3  4  5  6  7  8
+    // dir   1  2  3  4  1  2  3  4  0
+    // disp +1 +1 +1 +1 -1 -1 -1 -1  0
+
+    // displacements faster index = old indexing
+    // 4d (base = 0):
+    // point 0  1  2  3  4  5  6  7  8
+    // dir   0  0  1  1  2  2  3  3  0
+    // disp +1 -1 +1 -1 +1 -1 +1 -1  0
+    // 5d (base = 1):
+    // point 0  1  2  3  4  5  6  7  8
+    // dir   1  1  2  2  3  3  4  4  0
+    // disp +1 -1 +1 -1 +1 -1 +1 -1  0
+
+    if(dir == 0 and disp == 0)
+      return 8;
+    else // New indexing
+      return (1 - disp) / 2 * 4 + dir - base;
+    // else // Old indexing
+    //   return (4 * (dir - base) + 1 - disp) / 2;
+  }
+};
+  
+template<class Fobj,class CComplex,int nbasis>
+class Aggregation   {
+public:
+  typedef iVector<CComplex,nbasis >             siteVector;
+  typedef Lattice<siteVector>                 CoarseVector;
+  typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
+
+  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
+  typedef Lattice<Fobj >        FineField;
+
+  GridBase *CoarseGrid;
+  GridBase *FineGrid;
+  std::vector<Lattice<Fobj> > subspace;
+  int checkerboard;
+  int Checkerboard(void){return checkerboard;}
+  Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) : 
+    CoarseGrid(_CoarseGrid),
+    FineGrid(_FineGrid),
+    subspace(nbasis,_FineGrid),
+    checkerboard(_checkerboard)
+  {
+  };
+  
+  void Orthogonalise(void){
+    CoarseScalar InnerProd(CoarseGrid); 
+    std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
+    blockOrthogonalise(InnerProd,subspace);
+  } 
+  void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
+    blockProject(CoarseVec,FineVec,subspace);
+  }
+  void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
+    FineVec.Checkerboard() = subspace[0].Checkerboard();
+    blockPromote(CoarseVec,FineVec,subspace);
+  }
+
+  virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
+
+    RealD scale;
+
+    ConjugateGradient<FineField> CG(1.0e-2,100,false);
+    FineField noise(FineGrid);
+    FineField Mn(FineGrid);
+
+    for(int b=0;b<nn;b++){
+      
+      subspace[b] = Zero();
+      gaussian(RNG,noise);
+      scale = std::pow(norm2(noise),-0.5); 
+      noise=noise*scale;
+      
+      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+
+      for(int i=0;i<1;i++){
+
+	CG(hermop,noise,subspace[b]);
+
+	noise = subspace[b];
+	scale = std::pow(norm2(noise),-0.5); 
+	noise=noise*scale;
+
+      }
+
+      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
+      subspace[b]   = noise;
+
+    }
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+  // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
+  // and this is the best I found
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+
+  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
+				       int nn,
+				       double hi,
+				       double lo,
+				       int orderfilter,
+				       int ordermin,
+				       int orderstep,
+				       double filterlo
+				       ) {
+
+    RealD scale;
+
+    FineField noise(FineGrid);
+    FineField Mn(FineGrid);
+    FineField tmp(FineGrid);
+
+    // New normalised noise
+    gaussian(RNG,noise);
+    scale = std::pow(norm2(noise),-0.5); 
+    noise=noise*scale;
+
+    // Initial matrix element
+    hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+
+    int b =0;
+    {
+      // Filter
+      Chebyshev<FineField> Cheb(lo,hi,orderfilter);
+      Cheb(hermop,noise,Mn);
+      // normalise
+      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
+      subspace[b]   = Mn;
+      hermop.Op(Mn,tmp); 
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+      b++;
+    }
+
+    // Generate a full sequence of Chebyshevs
+    {
+      lo=filterlo;
+      noise=Mn;
+
+      FineField T0(FineGrid); T0 = noise;  
+      FineField T1(FineGrid); 
+      FineField T2(FineGrid);
+      FineField y(FineGrid);
+      
+      FineField *Tnm = &T0;
+      FineField *Tn  = &T1;
+      FineField *Tnp = &T2;
+
+      // Tn=T1 = (xscale M + mscale)in
+      RealD xscale = 2.0/(hi-lo);
+      RealD mscale = -(hi+lo)/(hi-lo);
+      hermop.HermOp(T0,y);
+      T1=y*xscale+noise*mscale;
+
+      for(int n=2;n<=ordermin+orderstep*(nn-2);n++){
+	
+	hermop.HermOp(*Tn,y);
+
+	autoView( y_v , y, AcceleratorWrite);
+	autoView( Tn_v , (*Tn), AcceleratorWrite);
+	autoView( Tnp_v , (*Tnp), AcceleratorWrite);
+	autoView( Tnm_v , (*Tnm), AcceleratorWrite);
+	const int Nsimd = CComplex::Nsimd();
+	accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
+	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
+	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
+        });
+
+	// Possible more fine grained control is needed than a linear sweep,
+	// but huge productivity gain if this is simple algorithm and not a tunable
+	int m =1;
+	if ( n>=ordermin ) m=n-ordermin;
+	if ( (m%orderstep)==0 ) { 
+	  Mn=*Tnp;
+	  scale = std::pow(norm2(Mn),-0.5);         Mn=Mn*scale;
+	  subspace[b] = Mn;
+	  hermop.Op(Mn,tmp); 
+	  std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+	  b++;
+	}
+
+	// Cycle pointers to avoid copies
+	FineField *swizzle = Tnm;
+	Tnm    =Tn;
+	Tn     =Tnp;
+	Tnp    =swizzle;
+	  
+      }
+    }
+    assert(b==nn);
+  }
+
+};
+
 // Fine Object == (per site) type of fine field
 // nbasis      == number of deflation vectors
 template<class Fobj,class CComplex,int nbasis>
@@ -87,9 +324,9 @@ public:
  GridBase*        _cbgrid;
  int hermitian;

-  CartesianStencil<siteVector,siteVector,DefaultImplParams> Stencil; 
-  CartesianStencil<siteVector,siteVector,DefaultImplParams> StencilEven;
-  CartesianStencil<siteVector,siteVector,DefaultImplParams> StencilOdd;
+  CartesianStencil<siteVector,siteVector,int> Stencil; 
+  CartesianStencil<siteVector,siteVector,int> StencilEven;
+  CartesianStencil<siteVector,siteVector,int> StencilOdd;

  std::vector<CoarseMatrix> A;
  std::vector<CoarseMatrix> Aeven;
@@ -121,7 +358,7 @@ public:
    autoView( in_v , in, AcceleratorRead);
    autoView( out_v , out, AcceleratorWrite);
    autoView( Stencil_v  , Stencil, AcceleratorRead);
-    int npoint = geom.npoint;
+    auto& geom_v = geom;
    typedef LatticeView<Cobj> Aview;
      
    Vector<Aview> AcceleratorViewContainer;
@@ -143,7 +380,7 @@ public:
      int ptype;
      StencilEntry *SE;

-      for(int point=0;point<npoint;point++){
+      for(int point=0;point<geom_v.npoint;point++){

 	SE=Stencil_v.GetEntry(ptype,point,ss);
 	  
@@ -187,7 +424,7 @@ public:
    autoView( in_v , in, AcceleratorRead);
    autoView( out_v , out, AcceleratorWrite);
    autoView( Stencil_v  , Stencil, AcceleratorRead);
-    int npoint = geom.npoint;
+    auto& geom_v = geom;
    typedef LatticeView<Cobj> Aview;

    Vector<Aview> AcceleratorViewContainer;
@@ -217,7 +454,7 @@ public:
      int ptype;
      StencilEntry *SE;

-      for(int p=0;p<npoint;p++){
+      for(int p=0;p<geom_v.npoint;p++){
        int point = points_p[p];

 	SE=Stencil_v.GetEntry(ptype,point,ss);
@@ -394,7 +631,7 @@ public:
    assert(Aself != nullptr);
  }

-  void DselfInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, CoarseMatrix &a,
+  void DselfInternal(CartesianStencil<siteVector,siteVector,int> &st, CoarseMatrix &a,
                       const CoarseVector &in, CoarseVector &out, int dag) {
    int point = geom.npoint-1;
    autoView( out_v, out, AcceleratorWrite);
@@ -457,7 +694,7 @@ public:
    }
  }

-  void DhopInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, std::vector<CoarseMatrix> &a,
+  void DhopInternal(CartesianStencil<siteVector,siteVector,int> &st, std::vector<CoarseMatrix> &a,
                    const CoarseVector &in, CoarseVector &out, int dag) {
    SimpleCompressor<siteVector> compressor;

@@ -547,9 +784,9 @@ public:
    _cbgrid(new GridRedBlackCartesian(&CoarseGrid)),
    geom(CoarseGrid._ndimension),
    hermitian(hermitian_),
-    Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
-    StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements),
-    StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements),
+    Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
+    StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements,0),
+    StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
    A(geom.npoint,&CoarseGrid),
    Aeven(geom.npoint,_cbgrid),
    Aodd(geom.npoint,_cbgrid),
@@ -567,9 +804,9 @@ public:
    _cbgrid(&CoarseRBGrid),
    geom(CoarseGrid._ndimension),
    hermitian(hermitian_),
-    Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
-    StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements),
-    StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements),
+    Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
+    StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
+    StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
    A(geom.npoint,&CoarseGrid),
    Aeven(geom.npoint,&CoarseRBGrid),
    Aodd(geom.npoint,&CoarseRBGrid),
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@@ -52,7 +52,6 @@ public:
  virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
  virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
  virtual void HermOp(const Field &in, Field &out)=0;
-  virtual ~LinearOperatorBase(){};
 };


@@ -145,44 +144,6 @@ public:
  }
 };

-////////////////////////////////////////////////////////////////////
-// Create a shifted HermOp
-////////////////////////////////////////////////////////////////////
-template<class Field>
-class ShiftedHermOpLinearOperator : public LinearOperatorBase<Field> {
-  LinearOperatorBase<Field> &_Mat;
-  RealD _shift;
-public:
-  ShiftedHermOpLinearOperator(LinearOperatorBase<Field> &Mat,RealD shift): _Mat(Mat), _shift(shift){};
-  // Support for coarsening to a multigrid
-  void OpDiag (const Field &in, Field &out) {
-    assert(0);
-  }
-  void OpDir  (const Field &in, Field &out,int dir,int disp) {
-    assert(0);
-  }
-  void OpDirAll  (const Field &in, std::vector<Field> &out){
-    assert(0);
-  };
-  void Op     (const Field &in, Field &out){
-    assert(0);
-  }
-  void AdjOp     (const Field &in, Field &out){
-    assert(0);
-  }
-  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-    HermOp(in,out);
-    ComplexD dot = innerProduct(in,out);
-    n1=real(dot);
-    n2=norm2(out);
-  }
-  void HermOp(const Field &in, Field &out){
-    _Mat.HermOp(in,out);
-    out = out + _shift*in;
-  }
-};
-
-
 ////////////////////////////////////////////////////////////////////
 // Wrap an already herm matrix
 ////////////////////////////////////////////////////////////////////
@@ -262,9 +223,14 @@ class SchurOperatorBase :  public LinearOperatorBase<Field> {
    Mpc(in,tmp);
    MpcDag(tmp,out);
  }
+  virtual  void MpcMpcDag(const Field &in, Field &out) {
+    Field tmp(in.Grid());
+    tmp.Checkerboard() = in.Checkerboard();
+    MpcDag(in,tmp);
+    Mpc(tmp,out);
+  }
  virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-    out.Checkerboard() = in.Checkerboard();
-    MpcDagMpc(in,out);
+    HermOp(in,out);
    ComplexD dot= innerProduct(in,out); 
    n1=real(dot);
    n2=norm2(out);
@@ -315,6 +281,16 @@ template<class Matrix,class Field>
      axpy(out,-1.0,tmp,out);
    }
 };
+// Mpc MpcDag system presented as the HermOp
+template<class Matrix,class Field>
+class SchurDiagMooeeDagOperator :  public SchurDiagMooeeOperator<Matrix,Field> {
+ public:
+  virtual void HermOp(const Field &in, Field &out){
+    out.Checkerboard() = in.Checkerboard();
+    this->MpcMpcDag(in,out);
+  }
+  SchurDiagMooeeDagOperator (Matrix &Mat): SchurDiagMooeeOperator<Matrix,Field>(Mat){};
+};
 template<class Matrix,class Field>
  class SchurDiagOneOperator :  public SchurOperatorBase<Field> {
 protected:
@@ -546,7 +522,7 @@ class SchurStaggeredOperator :  public SchurOperatorBase<Field> {
  virtual  void MpcDag   (const Field &in, Field &out){
    Mpc(in,out);
  }
-  virtual void MpcDagMpc(const Field &in, Field &out) {
+  virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
    assert(0);// Never need with staggered
  }
 };
@@ -564,7 +540,6 @@ public:
      (*this)(Linop,in[k],out[k]);
    }
  };
-  virtual ~OperatorFunction(){};
 };

 template<class Field> class LinearFunction {
@@ -580,7 +555,6 @@ public:
      (*this)(in[i], out[i]);
    }
  }
-  virtual ~LinearFunction(){};
 };

 template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {
@@ -626,7 +600,6 @@ class HermOpOperatorFunction : public OperatorFunction<Field> {
 template<typename Field>
 class PlainHermOp : public LinearFunction<Field> {
 public:
-  using LinearFunction<Field>::operator();
  LinearOperatorBase<Field> &_Linop;
      
  PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop) 
@@ -640,7 +613,6 @@ public:
 template<typename Field>
 class FunctionHermOp : public LinearFunction<Field> {
 public:
-  using LinearFunction<Field>::operator(); 
  OperatorFunction<Field>   & _poly;
  LinearOperatorBase<Field> &_Linop;
      
--- a/Grid/algorithms/Preconditioner.h
+++ b/Grid/algorithms/Preconditioner.h
@@ -30,19 +30,13 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-template<class Field> using Preconditioner =  LinearFunction<Field> ;
-
-/*
-template<class Field> class Preconditioner :  public LinearFunction<Field> {
-  using LinearFunction<Field>::operator();
+template<class Field> class Preconditioner :  public LinearFunction<Field> { 
  virtual void operator()(const Field &src, Field & psi)=0;
 };
-*/

 template<class Field> class TrivialPrecon :  public Preconditioner<Field> { 
 public:
-  using Preconditioner<Field>::operator();
-  virtual void operator()(const Field &src, Field & psi){
+  void operator()(const Field &src, Field & psi){
    psi = src;
  }
  TrivialPrecon(void){};
--- a/Grid/algorithms/SparseMatrix.h
+++ b/Grid/algorithms/SparseMatrix.h
@@ -48,7 +48,6 @@ public:
  virtual  void Mdiag    (const Field &in, Field &out)=0;
  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0;
  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)=0;
-  virtual ~SparseMatrixBase() {};
 };

 /////////////////////////////////////////////////////////////////////////////////////////////
@@ -73,7 +72,7 @@ public:
  virtual  void MeooeDag    (const Field &in, Field &out)=0;
  virtual  void MooeeDag    (const Field &in, Field &out)=0;
  virtual  void MooeeInvDag (const Field &in, Field &out)=0;
-  virtual ~CheckerBoardedSparseMatrixBase() {};
+
 };

 NAMESPACE_END(Grid);
--- a/Grid/algorithms/approx/Chebyshev.h
+++ b/Grid/algorithms/approx/Chebyshev.h
@@ -90,8 +90,9 @@ public:
    order=_order;
      
    if(order < 2) exit(-1);
-    Coeffs.resize(order,0.0);
-    Coeffs[order-1] = 1.0;
+    Coeffs.resize(order);
+    Coeffs.assign(0.,order);
+    Coeffs[order-1] = 1.;
  };
  
  // PB - more efficient low pass drops high modes above the low as 1/x uses all Chebyshev's.
@@ -257,12 +258,26 @@ public:
    for(int n=2;n<order;n++){

      Linop.HermOp(*Tn,y);
+#if 0
+      auto y_v = y.View();
+      auto Tn_v = Tn->View();
+      auto Tnp_v = Tnp->View();
+      auto Tnm_v = Tnm->View();
+      constexpr int Nsimd = vector_type::Nsimd();
+      accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
+	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
+	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
+      });
+      if ( Coeffs[n] != 0.0) {
+	axpy(out,Coeffs[n],*Tnp,out);
+      }
+#else
      axpby(y,xscale,mscale,y,(*Tn));
      axpby(*Tnp,2.0,-1.0,y,(*Tnm));
      if ( Coeffs[n] != 0.0) {
 	axpy(out,Coeffs[n],*Tnp,out);
      }
-
+#endif
      // Cycle pointers to avoid copies
      Field *swizzle = Tnm;
      Tnm    =Tn;
@@ -277,6 +292,7 @@ public:
 template<class Field>
 class ChebyshevLanczos : public Chebyshev<Field> {
 private:
+
  std::vector<RealD> Coeffs;
  int order;
  RealD alpha;
--- a/Grid/algorithms/approx/MultiShiftFunction.h
+++ b/Grid/algorithms/approx/MultiShiftFunction.h
@@ -40,7 +40,7 @@ public:
  RealD norm;
  RealD lo,hi;

-  MultiShiftFunction(int n,RealD _lo,RealD _hi): poles(n), residues(n), tolerances(n), lo(_lo), hi(_hi) {;};
+  MultiShiftFunction(int n,RealD _lo,RealD _hi): poles(n), residues(n), lo(_lo), hi(_hi) {;};
  RealD approx(RealD x);
  void csv(std::ostream &out);
  void gnuplot(std::ostream &out);
--- a/Grid/algorithms/iterative/AdefGeneric.h
+++ b/Grid/algorithms/iterative/AdefGeneric.h
@@ -33,110 +33,109 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
   * Script A = SolverMatrix 
   * Script P = Preconditioner
   *
+   * Deflation methods considered
+   *      -- Solve P A x = P b        [ like Luscher ]
+   * DEF-1        M P A x = M P b     [i.e. left precon]
+   * DEF-2        P^T M A x = P^T M b
+   * ADEF-1       Preconditioner = M P + Q      [ Q + M + M A Q]
+   * ADEF-2       Preconditioner = P^T M + Q
+   * BNN          Preconditioner = P^T M P + Q
+   * BNN2         Preconditioner = M P + P^TM +Q - M P A M 
+   * 
   * Implement ADEF-2
   *
   * Vstart = P^Tx + Qb
   * M1 = P^TM + Q
   * M2=M3=1
+   * Vout = x
   */
-NAMESPACE_BEGIN(Grid);

-template<class Field>
-class TwoLevelCG : public LinearFunction<Field>
+// abstract base
+template<class Field, class CoarseField>
+class TwoLevelFlexiblePcg : public LinearFunction<Field>
 {
 public:
+  int verbose;
  RealD   Tolerance;
  Integer MaxIterations;
+  const int mmax = 5;
  GridBase *grid;
+  GridBase *coarsegrid;

-  // Fine operator, Smoother, CoarseSolver
-  LinearOperatorBase<Field>   &_FineLinop;
-  LinearFunction<Field>   &_Smoother;
+  LinearOperatorBase<Field>   *_Linop
+  OperatorFunction<Field>     *_Smoother,
+  LinearFunction<CoarseField> *_CoarseSolver;
+
+  // Need somthing that knows how to get from Coarse to fine and back again
  
  // more most opertor functions
-  TwoLevelCG(RealD tol,
-	     Integer maxit,
-	     LinearOperatorBase<Field>   &FineLinop,
-	     LinearFunction<Field>       &Smoother,
-	     GridBase *fine) : 
+  TwoLevelFlexiblePcg(RealD tol,
+		     Integer maxit,
+		     LinearOperatorBase<Field> *Linop,
+		     LinearOperatorBase<Field> *SmootherLinop,
+		     OperatorFunction<Field>   *Smoother,
+		     OperatorFunction<CoarseField>  CoarseLinop
+		     ) : 
      Tolerance(tol), 
      MaxIterations(maxit),
-      _FineLinop(FineLinop),
-      _Smoother(Smoother)
-  {
-    grid       = fine;
+      _Linop(Linop),
+      _PreconditionerLinop(PrecLinop),
+      _Preconditioner(Preconditioner)
+  { 
+    verbose=0;
  };
-  
-  virtual void operator() (const Field &src, Field &x)
-  {
-    std::cout << GridLogMessage<<"HDCG: fPcg starting"<<std::endl;
+
+  // The Pcg routine is common to all, but the various matrices differ from derived 
+  // implementation to derived implmentation
+  void operator() (const Field &src, Field &psi){
+  void operator() (const Field &src, Field &psi){
+
+    psi.Checkerboard() = src.Checkerboard();
+    grid             = src.Grid();
+
    RealD f;
    RealD rtzp,rtz,a,d,b;
    RealD rptzp;
-
+    RealD tn;
+    RealD guess = norm2(psi);
+    RealD ssq   = norm2(src);
+    RealD rsq   = ssq*Tolerance*Tolerance;
+    
    /////////////////////////////
    // Set up history vectors
    /////////////////////////////
-    int mmax = 5;
-    std::cout << GridLogMessage<<"HDCG: fPcg allocating"<<std::endl;
-    std::vector<Field> p(mmax,grid);
+    std::vector<Field> p  (mmax,grid);
    std::vector<Field> mmp(mmax,grid);
    std::vector<RealD> pAp(mmax);
-    Field z(grid);
+
+    Field x  (grid); x = psi;
+    Field z  (grid);
    Field tmp(grid);
-    Field  mp (grid);
-    Field  r  (grid);
-    Field  mu (grid);
-    
-    std::cout << GridLogMessage<<"HDCG: fPcg allocated"<<std::endl;
-    //Initial residual computation & set up
-    RealD guess   = norm2(x);
-    std::cout << GridLogMessage<<"HDCG: fPcg guess nrm "<<guess<<std::endl;
-    RealD src_nrm = norm2(src);
-    std::cout << GridLogMessage<<"HDCG: fPcg src nrm "<<src_nrm<<std::endl;
-    
-    if ( src_nrm == 0.0 ) {
-      std::cout << GridLogMessage<<"HDCG: fPcg given trivial source norm "<<src_nrm<<std::endl;
-      x=Zero();
-    }
-    RealD tn;
-    
-    GridStopWatch HDCGTimer;
-    HDCGTimer.Start();
+    Field r  (grid);
+    Field mu (grid);
+  
    //////////////////////////
    // x0 = Vstart -- possibly modify guess
    //////////////////////////
+    x=src;
    Vstart(x,src);
-    
+
    // r0 = b -A x0
-    _FineLinop.HermOp(x,mmp[0]);
+    HermOp(x,mmp); // Shouldn't this be something else?
    axpy (r, -1.0,mmp[0], src);    // Recomputes r=src-Ax0
-    {
-      double n1 = norm2(x);
-      double n2 = norm2(mmp[0]);
-      double n3 = norm2(r);
-      std::cout<<GridLogMessage<<"x,vstart,r = "<<n1<<" "<<n2<<" "<<n3<<std::endl;
-    }

    //////////////////////////////////
    // Compute z = M1 x
    //////////////////////////////////
-    PcgM1(r,z);
+    M1(r,z,tmp,mp,SmootherMirs);
    rtzp =real(innerProduct(r,z));
-    
+
    ///////////////////////////////////////
    // Solve for Mss mu = P A z and set p = z-mu
-    // Def2 p = 1 - Q Az = Pright z
+    // Def2: p = 1 - Q Az = Pright z 
    // Other algos M2 is trivial
    ///////////////////////////////////////
-    PcgM2(z,p[0]);
-
-    RealD ssq =  norm2(src);
-    RealD rsq =  ssq*Tolerance*Tolerance;
-
-    std::cout << GridLogMessage<<"HDCG: k=0 residual "<<rtzp<<" rsq "<<rsq<<"\n";
-
-    Field pp(grid);
+    M2(z,p[0]);

    for (int k=0;k<=MaxIterations;k++){
    
@@ -144,46 +143,31 @@ class TwoLevelCG : public LinearFunction<Field>
      int peri_kp = (k+1) % mmax;

      rtz=rtzp;
-      d= PcgM3(p[peri_k],mmp[peri_k]);
+      d= M3(p[peri_k],mp,mmp[peri_k],tmp);
      a = rtz/d;
    
      // Memorise this
      pAp[peri_k] = d;
-      
+
      axpy(x,a,p[peri_k],x);
      RealD rn = axpy_norm(r,-a,mmp[peri_k],r);

      // Compute z = M x
-      PcgM1(r,z);
-      
-      {
-	RealD n1,n2;
-	n1=norm2(r);
-	n2=norm2(z);
-	std::cout << GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : vector r,z "<<n1<<" "<<n2<<"\n";
-      }
+      M1(r,z,tmp,mp);
+
      rtzp =real(innerProduct(r,z));
-      std::cout << GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : inner rtzp "<<rtzp<<"\n";

-      //    PcgM2(z,p[0]);
-      PcgM2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
-      
-      p[peri_kp]=mu;
+      M2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate

-      // Standard search direction  p -> z + b p    
+      p[peri_kp]=p[peri_k];
+
+      // Standard search direction  p -> z + b p    ; b = 
      b = (rtzp)/rtz;
-      
-      int northog;
-      // k=zero  <=> peri_kp=1;        northog = 1
-      // k=1     <=> peri_kp=2;        northog = 2
-      // ...               ...                  ...
-      // k=mmax-2<=> peri_kp=mmax-1;   northog = mmax-1
-      // k=mmax-1<=> peri_kp=0;        northog = 1

+      int northog;
      //    northog     = (peri_kp==0)?1:peri_kp; // This is the fCG(mmax) algorithm
      northog     = (k>mmax-1)?(mmax-1):k;        // This is the fCG-Tr(mmax-1) algorithm
    
-      std::cout<<GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : orthogonalising to last "<<northog<<" vectors\n";
      for(int back=0; back < northog; back++){
 	int peri_back = (k-back)%mmax;
 	RealD pbApk= real(innerProduct(mmp[peri_back],p[peri_kp]));
@@ -192,315 +176,75 @@ class TwoLevelCG : public LinearFunction<Field>
      }

      RealD rrn=sqrt(rn/ssq);
-      RealD rtn=sqrt(rtz/ssq);
-      RealD rtnp=sqrt(rtzp/ssq);
-
-      std::cout<<GridLogMessage<<"HDCG: fPcg k= "<<k<<" residual = "<<rrn<<"\n";
+      std::cout<<GridLogMessage<<"TwoLevelfPcg: k= "<<k<<" residual = "<<rrn<<std::endl;

      // Stopping condition
      if ( rn <= rsq ) { 

-	HDCGTimer.Stop();
-	std::cout<<GridLogMessage<<"HDCG: fPcg converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
-	
-	_FineLinop.HermOp(x,mmp[0]);			  
+	HermOp(x,mmp); // Shouldn't this be something else?
 	axpy(tmp,-1.0,src,mmp[0]);
 	
-	RealD  mmpnorm = sqrt(norm2(mmp[0]));
-	RealD  xnorm   = sqrt(norm2(x));
-	RealD  srcnorm = sqrt(norm2(src));
-	RealD  tmpnorm = sqrt(norm2(tmp));
-	RealD  true_residual = tmpnorm/srcnorm;
-	std::cout<<GridLogMessage
-	       <<"HDCG: true residual is "<<true_residual
-	       <<" solution "<<xnorm
-	       <<" source "<<srcnorm
-	       <<" mmp "<<mmpnorm	  
-	       <<std::endl;
-      
-	return;
+	RealD psinorm = sqrt(norm2(x));
+	RealD srcnorm = sqrt(norm2(src));
+	RealD tmpnorm = sqrt(norm2(tmp));
+	RealD true_residual = tmpnorm/srcnorm;
+	std::cout<<GridLogMessage<<"TwoLevelfPcg:   true residual is "<<true_residual<<std::endl;
+	std::cout<<GridLogMessage<<"TwoLevelfPcg: target residual was"<<Tolerance<<std::endl;
+	return k;
      }
-
    }
-    HDCGTimer.Stop();
-    std::cout<<GridLogMessage<<"HDCG: not converged "<<HDCGTimer.Elapsed()<<std::endl;
-    RealD  xnorm   = sqrt(norm2(x));
-    RealD  srcnorm = sqrt(norm2(src));
-    std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
+    // Non-convergence
+    assert(0);
  }

-
-
-  virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
-  {
-    std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl;
-    src[0].Grid()->Barrier();
-    int nrhs = src.size();
-    std::vector<RealD> f(nrhs);
-    std::vector<RealD> rtzp(nrhs);
-    std::vector<RealD> rtz(nrhs);
-    std::vector<RealD> a(nrhs);
-    std::vector<RealD> d(nrhs);
-    std::vector<RealD> b(nrhs);
-    std::vector<RealD> rptzp(nrhs);
-    /////////////////////////////
-    // Set up history vectors
-    /////////////////////////////
-    int mmax = 2;
-    std::cout << GridLogMessage<<"HDCG: fPcg allocating"<<std::endl;
-    src[0].Grid()->Barrier();
-    std::vector<std::vector<Field> > p(nrhs);   for(int r=0;r<nrhs;r++)  p[r].resize(mmax,grid);
-    std::cout << GridLogMessage<<"HDCG: fPcg allocated p"<<std::endl;
-    src[0].Grid()->Barrier();
-    std::vector<std::vector<Field> > mmp(nrhs); for(int r=0;r<nrhs;r++) mmp[r].resize(mmax,grid);
-    std::cout << GridLogMessage<<"HDCG: fPcg allocated mmp"<<std::endl;
-    src[0].Grid()->Barrier();
-    std::vector<std::vector<RealD> > pAp(nrhs); for(int r=0;r<nrhs;r++) pAp[r].resize(mmax);
-    std::cout << GridLogMessage<<"HDCG: fPcg allocated pAp"<<std::endl;
-    src[0].Grid()->Barrier();
-    std::vector<Field> z(nrhs,grid);
-    std::vector<Field>  mp (nrhs,grid);
-    std::vector<Field>  r  (nrhs,grid);
-    std::vector<Field>  mu (nrhs,grid);
-    std::cout << GridLogMessage<<"HDCG: fPcg allocated z,mp,r,mu"<<std::endl;
-    src[0].Grid()->Barrier();
-
-    //Initial residual computation & set up
-    std::vector<RealD> src_nrm(nrhs);
-    for(int rhs=0;rhs<nrhs;rhs++) {
-      src_nrm[rhs]=norm2(src[rhs]);
-      assert(src_nrm[rhs]!=0.0);
-    }
-    std::vector<RealD> tn(nrhs);
-
-    GridStopWatch HDCGTimer;
-    HDCGTimer.Start();
-    //////////////////////////
-    // x0 = Vstart -- possibly modify guess
-    //////////////////////////
-    for(int rhs=0;rhs<nrhs;rhs++){
-      Vstart(x[rhs],src[rhs]);
-
-      // r0 = b -A x0
-      _FineLinop.HermOp(x[rhs],mmp[rhs][0]);
-      axpy (r[rhs], -1.0,mmp[rhs][0], src[rhs]);    // Recomputes r=src-Ax0
-    }
-
-    //////////////////////////////////
-    // Compute z = M1 x
-    //////////////////////////////////
-    // This needs a multiRHS version for acceleration
-    PcgM1(r,z);
-
-    std::vector<RealD> ssq(nrhs);
-    std::vector<RealD> rsq(nrhs);
-    std::vector<Field> pp(nrhs,grid);
-
-    for(int rhs=0;rhs<nrhs;rhs++){
-      rtzp[rhs] =real(innerProduct(r[rhs],z[rhs]));
-      p[rhs][0]=z[rhs];
-      ssq[rhs]=norm2(src[rhs]);
-      rsq[rhs]=  ssq[rhs]*Tolerance*Tolerance;
-      std::cout << GridLogMessage<<"mrhs HDCG: "<<rhs<<" k=0 residual "<<rtzp[rhs]<<" rsq "<<rsq[rhs]<<"\n";
-    }
-
-    std::vector<RealD> rn(nrhs);
-    for (int k=0;k<=MaxIterations;k++){
-    
-      int peri_k  = k % mmax;
-      int peri_kp = (k+1) % mmax;
-
-      for(int rhs=0;rhs<nrhs;rhs++){
-	rtz[rhs]=rtzp[rhs];
-	d[rhs]= PcgM3(p[rhs][peri_k],mmp[rhs][peri_k]);
-	a[rhs] = rtz[rhs]/d[rhs];
-    
-	// Memorise this
-	pAp[rhs][peri_k] = d[rhs];
-
-	axpy(x[rhs],a[rhs],p[rhs][peri_k],x[rhs]);
-	rn[rhs] = axpy_norm(r[rhs],-a[rhs],mmp[rhs][peri_k],r[rhs]);
-      }
-
-      // Compute z = M x (for *all* RHS)
-      PcgM1(r,z);
-
-      RealD max_rn=0.0;
-      for(int rhs=0;rhs<nrhs;rhs++){
-
-	rtzp[rhs] =real(innerProduct(r[rhs],z[rhs]));
-
-	std::cout << GridLogMessage<<"HDCG::fPcg rhs"<<rhs<<" iteration "<<k<<" : inner rtzp "<<rtzp[rhs]<<"\n";
-	
-	mu[rhs]=z[rhs];
-
-	p[rhs][peri_kp]=mu[rhs];
-
-	// Standard search direction p == z + b p 
-	b[rhs] = (rtzp[rhs])/rtz[rhs];
-
-	int northog = (k>mmax-1)?(mmax-1):k;        // This is the fCG-Tr(mmax-1) algorithm
-	std::cout<<GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : orthogonalising to last "<<northog<<" vectors\n";
-	for(int back=0; back < northog; back++){
-	  int peri_back = (k-back)%mmax;
-	  RealD pbApk= real(innerProduct(mmp[rhs][peri_back],p[rhs][peri_kp]));
-	  RealD beta = -pbApk/pAp[rhs][peri_back];
-	  axpy(p[rhs][peri_kp],beta,p[rhs][peri_back],p[rhs][peri_kp]);
-	}
-
-	RealD rrn=sqrt(rn[rhs]/ssq[rhs]);
-	RealD rtn=sqrt(rtz[rhs]/ssq[rhs]);
-	RealD rtnp=sqrt(rtzp[rhs]/ssq[rhs]);
-	
-	std::cout<<GridLogMessage<<"HDCG: rhs "<<rhs<<"fPcg k= "<<k<<" residual = "<<rrn<<"\n";
-	if ( rrn > max_rn ) max_rn = rrn;
-      }
-
-      // Stopping condition based on worst case
-      if ( max_rn <= Tolerance ) { 
-
-	HDCGTimer.Stop();
-	std::cout<<GridLogMessage<<"HDCG: mrhs fPcg converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
-
-	for(int rhs=0;rhs<nrhs;rhs++){
-	  _FineLinop.HermOp(x[rhs],mmp[rhs][0]);			  
-	  Field tmp(grid);
-	  axpy(tmp,-1.0,src[rhs],mmp[rhs][0]);
-      
-	  RealD  mmpnorm = sqrt(norm2(mmp[rhs][0]));
-	  RealD  xnorm   = sqrt(norm2(x[rhs]));
-	  RealD  srcnorm = sqrt(norm2(src[rhs]));
-	  RealD  tmpnorm = sqrt(norm2(tmp));
-	  RealD  true_residual = tmpnorm/srcnorm;
-	  std::cout<<GridLogMessage
-		   <<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
-		   <<" solution "<<xnorm
-		   <<" source "<<srcnorm
-		   <<" mmp "<<mmpnorm	  
-		   <<std::endl;
-	}
-	return;
-      }
-      
-    }
-    HDCGTimer.Stop();
-    std::cout<<GridLogMessage<<"HDCG: not converged "<<HDCGTimer.Elapsed()<<std::endl;
-    for(int rhs=0;rhs<nrhs;rhs++){
-      RealD  xnorm   = sqrt(norm2(x[rhs]));
-      RealD  srcnorm = sqrt(norm2(src[rhs]));
-      std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
-    }
-  }
-  
-
 public:

-  virtual void PcgM1(std::vector<Field> & in,std::vector<Field> & out)
-  {
-    std::cout << "PcgM1 default (cheat) mrhs versoin"<<std::endl;
-    for(int rhs=0;rhs<in.size();rhs++){
-      this->PcgM1(in[rhs],out[rhs]);
-    }
-  }
-  virtual void PcgM1(Field & in, Field & out)     =0;
-  virtual void Vstart(Field & x,const Field & src)=0;
+  virtual void M(Field & in,Field & out,Field & tmp) {

-  virtual void PcgM2(const Field & in, Field & out) {
-    out=in;
  }

-  virtual RealD PcgM3(const Field & p, Field & mmp){
-    RealD dd;
-    _FineLinop.HermOp(p,mmp);
-    ComplexD dot = innerProduct(p,mmp);
-    dd=real(dot);
-    return dd;
-  }
+  virtual void M1(Field & in, Field & out) {// the smoother

-  /////////////////////////////////////////////////////////////////////
-  // Only Def1 has non-trivial Vout.
-  /////////////////////////////////////////////////////////////////////
-
-};
-  
-template<class Field, class CoarseField, class Aggregation>
-class TwoLevelADEF2 : public TwoLevelCG<Field>
-{
- public:
-  ///////////////////////////////////////////////////////////////////////////////////
-  // Need something that knows how to get from Coarse to fine and back again
-  //  void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
-  //  void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
-  ///////////////////////////////////////////////////////////////////////////////////
-  GridBase *coarsegrid;
-  Aggregation &_Aggregates;                    
-  LinearFunction<CoarseField> &_CoarseSolver;
-  LinearFunction<CoarseField> &_CoarseSolverPrecise;
-  ///////////////////////////////////////////////////////////////////////////////////
-  
-  // more most opertor functions
-  TwoLevelADEF2(RealD tol,
-		Integer maxit,
-		LinearOperatorBase<Field>    &FineLinop,
-		LinearFunction<Field>        &Smoother,
-		LinearFunction<CoarseField>  &CoarseSolver,
-		LinearFunction<CoarseField>  &CoarseSolverPrecise,
-		Aggregation &Aggregates
-		) :
-      TwoLevelCG<Field>(tol,maxit,FineLinop,Smoother,Aggregates.FineGrid),
-      _CoarseSolver(CoarseSolver),
-      _CoarseSolverPrecise(CoarseSolverPrecise),
-      _Aggregates(Aggregates)
-  {
-    coarsegrid = Aggregates.CoarseGrid;
-  };
-
-  virtual void PcgM1(Field & in, Field & out)
-  {
-    GRID_TRACE("MultiGridPreconditioner ");
    // [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
+    Field tmp(grid);
+    Field Min(grid);

-    Field tmp(this->grid);
-    Field Min(this->grid);
-    CoarseField PleftProj(this->coarsegrid);
-    CoarseField PleftMss_proj(this->coarsegrid);
+    PcgM(in,Min); // Smoother call

-    GridStopWatch SmootherTimer;
-    GridStopWatch MatrixTimer;
-    SmootherTimer.Start();
-    this->_Smoother(in,Min);
-    SmootherTimer.Stop();
-
-    MatrixTimer.Start();
-    this->_FineLinop.HermOp(Min,out);
-    MatrixTimer.Stop();
+    HermOp(Min,out);
    axpy(tmp,-1.0,out,in);          // tmp  = in - A Min

-    GridStopWatch ProjTimer;
-    GridStopWatch CoarseTimer;
-    GridStopWatch PromTimer;
-    ProjTimer.Start();
-    this->_Aggregates.ProjectToSubspace(PleftProj,tmp);     
-    ProjTimer.Stop();
-    CoarseTimer.Start();
-    this->_CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
-    CoarseTimer.Stop();
-    PromTimer.Start();
-    this->_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]  
-    PromTimer.Stop();
-    std::cout << GridLogPerformance << "PcgM1 breakdown "<<std::endl;
-    std::cout << GridLogPerformance << "\tSmoother   " << SmootherTimer.Elapsed() <<std::endl;
-    std::cout << GridLogPerformance << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
-    std::cout << GridLogPerformance << "\tProj       " << ProjTimer.Elapsed() <<std::endl;
-    std::cout << GridLogPerformance << "\tCoarse     " << CoarseTimer.Elapsed() <<std::endl;
-    std::cout << GridLogPerformance << "\tProm       " << PromTimer.Elapsed() <<std::endl;
-
+    ProjectToSubspace(tmp,PleftProj);     
+    ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
+    PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]  
    axpy(out,1.0,Min,tmp); // Min+tmp
  }

-  virtual void Vstart(Field & x,const Field & src)
-  {
-    std::cout << GridLogMessage<<"HDCG: fPcg Vstart "<<std::endl;
+  virtual void M2(const Field & in, Field & out) {
+    out=in;
+    // Must override for Def2 only
+    //  case PcgDef2:
+    //    Pright(in,out);
+    //    break;
+  }
+
+  virtual RealD M3(const Field & p, Field & mmp){
+    double d,dd;
+    HermOpAndNorm(p,mmp,d,dd);
+    return dd;
+    // Must override for Def1 only
+    //  case PcgDef1:
+    //    d=linop_d->Mprec(p,mmp,tmp,0,1);// Dag no
+    //      linop_d->Mprec(mmp,mp,tmp,1);// Dag yes
+    //    Pleft(mp,mmp);
+    //    d=real(linop_d->inner(p,mmp));
+  }
+
+  virtual void VstartDef2(Field & xconst Field & src){
+    //case PcgDef2:
+    //case PcgAdef2: 
+    //case PcgAdef2f:
+    //case PcgV11f:
    ///////////////////////////////////
    // Choose x_0 such that 
    // x_0 = guess +  (A_ss^inv) r_s = guess + Ass_inv [src -Aguess]
@@ -512,157 +256,142 @@ class TwoLevelADEF2 : public TwoLevelCG<Field>
    //                   = src_s - (A guess)_s - src_s  + (A guess)_s 
    //                   = 0 
    ///////////////////////////////////
-    Field r(this->grid);
-    Field mmp(this->grid);
-    CoarseField PleftProj(this->coarsegrid);
-    CoarseField PleftMss_proj(this->coarsegrid);
-
-    std::cout << GridLogMessage<<"HDCG: fPcg Vstart projecting "<<std::endl;
-    this->_Aggregates.ProjectToSubspace(PleftProj,src);     
-    std::cout << GridLogMessage<<"HDCG: fPcg Vstart coarse solve "<<std::endl;
-    this->_CoarseSolverPrecise(PleftProj,PleftMss_proj); // Ass^{-1} r_s
-    std::cout << GridLogMessage<<"HDCG: fPcg Vstart promote "<<std::endl;
-    this->_Aggregates.PromoteFromSubspace(PleftMss_proj,x);  
+    Field r(grid);
+    Field mmp(grid);
+    
+    HermOp(x,mmp);
+    axpy (r, -1.0, mmp, src);        // r_{-1} = src - A x
+    ProjectToSubspace(r,PleftProj);     
+    ApplyInverseCG(PleftProj,PleftMss_proj); // Ass^{-1} r_s
+    PromoteFromSubspace(PleftMss_proj,mmp);  
+    x=x+mmp;

  }

-};
-
-template<class Field, class CoarseField, class Aggregation>
-class TwoLevelADEF2mrhs : public TwoLevelADEF2<Field,CoarseField,Aggregation>
-{
-public:
-  GridBase *coarsegridmrhs;
-  LinearFunction<CoarseField> &_CoarseSolverMrhs;
-  LinearFunction<CoarseField> &_CoarseGuesser;
-  TwoLevelADEF2mrhs(RealD tol,
-		    Integer maxit,
-		    LinearOperatorBase<Field>    &FineLinop,
-		    LinearFunction<Field>        &Smoother,
-		    LinearFunction<CoarseField>  &CoarseSolver,
-		    LinearFunction<CoarseField>  &CoarseSolverPrecise,
-		    LinearFunction<CoarseField>  &CoarseSolverMrhs,
-		    LinearFunction<CoarseField>  &CoarseGuesser,
-		    GridBase *rhsgrid,
-		    Aggregation &Aggregates) :
-    TwoLevelADEF2<Field,CoarseField,Aggregation>(tol, maxit,FineLinop,Smoother,CoarseSolver,CoarseSolverPrecise,Aggregates),
-    _CoarseSolverMrhs(CoarseSolverMrhs),
-    _CoarseGuesser(CoarseGuesser)
-  {
-    coarsegridmrhs = rhsgrid;
-  };
-  
-  virtual void PcgM1(std::vector<Field> & in,std::vector<Field> & out){
-
-    int nrhs=in.size();
-    std::cout << " mrhs PcgM1 for "<<nrhs<<" right hand sides"<<std::endl;
-    // [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
-    Field tmp(this->grid);
-    std::vector<Field> Min(nrhs,this->grid);
-    CoarseField PleftProj(this->coarsegrid);
-    CoarseField PleftMss_proj(this->coarsegrid);
-
-    CoarseField PleftProjMrhs(this->coarsegridmrhs);
-    CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
-
-    for(int rhs=0;rhs<nrhs;rhs++) {
-      this->grid->Barrier();
-      std::cout << " Calling smoother for "<<rhs<<std::endl;
-      this->grid->Barrier();
-      this->_Smoother(in[rhs],Min[rhs]);
-      this->grid->Barrier();
-      std::cout << " smoother done "<<rhs<<std::endl;
-      this->grid->Barrier();
-      this->_FineLinop.HermOp(Min[rhs],out[rhs]);
-      this->grid->Barrier();
-      std::cout << " Hermop for "<<rhs<<std::endl;
-      this->grid->Barrier();
-      axpy(tmp,-1.0,out[rhs],in[rhs]);          // tmp  = in - A Min
-      this->grid->Barrier();
-      std::cout << " axpy "<<rhs<<std::endl;
-      this->grid->Barrier();
-      this->_Aggregates.ProjectToSubspace(PleftProj,tmp);     // can optimise later
-      this->grid->Barrier();
-      std::cout << " project "<<rhs<<std::endl;
-      this->grid->Barrier();
-      InsertSlice(PleftProj,PleftProjMrhs,rhs,0);
-      this->grid->Barrier();
-      std::cout << " insert rhs "<<rhs<<std::endl;
-      this->grid->Barrier();
-      this->_CoarseGuesser(PleftProj,PleftMss_proj);
-      this->grid->Barrier();
-      std::cout << " insert guess "<<rhs<<std::endl;
-      this->grid->Barrier();
-      InsertSlice(PleftMss_proj,PleftMss_projMrhs,rhs,0);
-    }
-
-    std::cout << " Coarse solve "<<std::endl;
-    this->_CoarseSolverMrhs(PleftProjMrhs,PleftMss_projMrhs); // Ass^{-1} [in - A Min]_s
-
-    for(int rhs=0;rhs<nrhs;rhs++) {
-      ExtractSlice(PleftMss_proj,PleftMss_projMrhs,rhs,0);
-      this->_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]  
-      axpy(out[rhs],1.0,Min[rhs],tmp); // Min+tmp
-    }
-    std::cout << " Extracted "<<std::endl;
-  }
-};
-  
-template<class Field>
-class TwoLevelADEF1defl : public TwoLevelCG<Field>
-{
-public:
-  const std::vector<Field> &evec;
-  const std::vector<RealD> &eval;
-  
-  TwoLevelADEF1defl(RealD tol,
-		   Integer maxit,
-		   LinearOperatorBase<Field>   &FineLinop,
-		   LinearFunction<Field>   &Smoother,
-		   std::vector<Field> &_evec,
-		   std::vector<RealD> &_eval) : 
-    TwoLevelCG<Field>(tol,maxit,FineLinop,Smoother,_evec[0].Grid()),
-    evec(_evec),
-    eval(_eval)
-  {};
-
-  // Can just inherit existing M2
-  // Can just inherit existing M3
-
-  // Simple vstart - do nothing
  virtual void Vstart(Field & x,const Field & src){
-    x=src; // Could apply Q
-  };
-
-  // Override PcgM1
-  virtual void PcgM1(Field & in, Field & out)
-  {
-    GRID_TRACE("EvecPreconditioner ");
-    int N=evec.size();
-    Field Pin(this->grid);
-    Field Qin(this->grid);
-
-    //MP  + Q = M(1-AQ) + Q = M
-    // // If we are eigenvector deflating in coarse space
-    // // Q   = Sum_i |phi_i> 1/lambda_i <phi_i|
-    // // A Q = Sum_i |phi_i> <phi_i|
-    // // M(1-AQ) = M(1-proj) + Q
-    Qin.Checkerboard()=in.Checkerboard();
-    Qin = Zero();
-    Pin = in;
-    for (int i=0;i<N;i++) {
-      const Field& tmp = evec[i];
-      auto ip = TensorRemove(innerProduct(tmp,in));
-      axpy(Qin, ip / eval[i],tmp,Qin);
-      axpy(Pin, -ip ,tmp,Pin);
-    }
-
-    this->_Smoother(Pin,out);
-
-    out = out + Qin;
+    return;
  }
-};

-NAMESPACE_END(Grid);
+  /////////////////////////////////////////////////////////////////////
+  // Only Def1 has non-trivial Vout. Override in Def1
+  /////////////////////////////////////////////////////////////////////
+  virtual void   Vout  (Field & in, Field & out,Field & src){
+    out = in;
+    //case PcgDef1:
+    //    //Qb + PT x
+    //    ProjectToSubspace(src,PleftProj);     
+    //    ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} r_s
+    //    PromoteFromSubspace(PleftMss_proj,tmp);  
+    //    
+    //    Pright(in,out);
+    //    
+    //    linop_d->axpy(out,tmp,out,1.0);
+    //    break;
+  }

+  ////////////////////////////////////////////////////////////////////////////////////////////////
+  // Pright and Pleft are common to all implementations
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+  virtual void Pright(Field & in,Field & out){
+    // P_R  = [ 1              0 ] 
+    //        [ -Mss^-1 Msb    0 ] 
+    Field in_sbar(grid);
+
+    ProjectToSubspace(in,PleftProj);     
+    PromoteFromSubspace(PleftProj,out);  
+    axpy(in_sbar,-1.0,out,in);       // in_sbar = in - in_s 
+
+    HermOp(in_sbar,out);
+    ProjectToSubspace(out,PleftProj);           // Mssbar in_sbar  (project)
+
+    ApplyInverse     (PleftProj,PleftMss_proj); // Mss^{-1} Mssbar 
+    PromoteFromSubspace(PleftMss_proj,out);     // 
+
+    axpy(out,-1.0,out,in_sbar);     // in_sbar - Mss^{-1} Mssbar in_sbar
+  }
+  virtual void Pleft (Field & in,Field & out){
+    // P_L  = [ 1  -Mbs Mss^-1] 
+    //        [ 0   0         ] 
+    Field in_sbar(grid);
+    Field    tmp2(grid);
+    Field    Mtmp(grid);
+
+    ProjectToSubspace(in,PleftProj);     
+    PromoteFromSubspace(PleftProj,out);  
+    axpy(in_sbar,-1.0,out,in);      // in_sbar = in - in_s
+
+    ApplyInverse(PleftProj,PleftMss_proj); // Mss^{-1} in_s
+    PromoteFromSubspace(PleftMss_proj,out);
+
+    HermOp(out,Mtmp);
+
+    ProjectToSubspace(Mtmp,PleftProj);      // Msbar s Mss^{-1}
+    PromoteFromSubspace(PleftProj,tmp2);
+
+    axpy(out,-1.0,tmp2,Mtmp);
+    axpy(out,-1.0,out,in_sbar);     // in_sbar - Msbars Mss^{-1} in_s
+  }
+}
+
+template<class Field>
+class TwoLevelFlexiblePcgADef2 : public TwoLevelFlexiblePcg<Field> {
+ public:
+  virtual void M(Field & in,Field & out,Field & tmp){
+
+  } 
+  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp){
+
+  }
+  virtual void M2(Field & in, Field & out){
+
+  }
+  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp){
+
+  }
+  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp){
+
+  }
+}
+/*
+template<class Field>
+class TwoLevelFlexiblePcgAD : public TwoLevelFlexiblePcg<Field> {
+ public:
+  virtual void M(Field & in,Field & out,Field & tmp); 
+  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
+  virtual void M2(Field & in, Field & out);
+  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
+  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
+}
+
+template<class Field>
+class TwoLevelFlexiblePcgDef1 : public TwoLevelFlexiblePcg<Field> {
+ public:
+  virtual void M(Field & in,Field & out,Field & tmp); 
+  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
+  virtual void M2(Field & in, Field & out);
+  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
+  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
+  virtual void   Vout  (Field & in, Field & out,Field & src,Field & tmp);
+}
+
+template<class Field>
+class TwoLevelFlexiblePcgDef2 : public TwoLevelFlexiblePcg<Field> {
+ public:
+  virtual void M(Field & in,Field & out,Field & tmp); 
+  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
+  virtual void M2(Field & in, Field & out);
+  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
+  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
+}
+
+template<class Field>
+class TwoLevelFlexiblePcgV11: public TwoLevelFlexiblePcg<Field> {
+ public:
+  virtual void M(Field & in,Field & out,Field & tmp); 
+  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
+  virtual void M2(Field & in, Field & out);
+  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
+  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
+}
+*/
 #endif
--- a/Grid/algorithms/iterative/BiCGSTABMixedPrec.h
+++ b/Grid/algorithms/iterative/BiCGSTABMixedPrec.h
@@ -36,8 +36,7 @@ NAMESPACE_BEGIN(Grid);
 template<class FieldD, class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
 class MixedPrecisionBiCGSTAB : public LinearFunction<FieldD> 
 {
-  public:
-    using LinearFunction<FieldD>::operator();
+  public:                                                
    RealD   Tolerance;
    RealD   InnerTolerance; // Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
--- a/Grid/algorithms/iterative/ConjugateGradient.h
+++ b/Grid/algorithms/iterative/ConjugateGradient.h
@@ -58,7 +58,6 @@ public:

  void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {

-    GRID_TRACE("ConjugateGradient");
    psi.Checkerboard() = src.Checkerboard();

    conformable(psi, src);
@@ -103,7 +102,7 @@ public:
    // Check if guess is really REALLY good :)
    if (cp <= rsq) {
      TrueResidual = std::sqrt(a/ssq);
-      std::cout << GridLogMessage << "ConjugateGradient guess is converged already " << std::endl;
+      std::cout << GridLogMessage << "ConjugateGradient guess is converged already "<<TrueResidual<< " tol "<< Tolerance<< std::endl;
      IterationsToComplete = 0;	
      return;
    }
@@ -118,13 +117,9 @@ public:
    GridStopWatch MatrixTimer;
    GridStopWatch SolverTimer;

-    RealD usecs = -usecond();
    SolverTimer.Start();
    int k;
    for (k = 1; k <= MaxIterations; k++) {
-
-      GridStopWatch IterationTimer;
-      IterationTimer.Start();
      c = cp;

      MatrixTimer.Start();
@@ -157,41 +152,31 @@ public:
      LinearCombTimer.Stop();
      LinalgTimer.Stop();

-      IterationTimer.Stop();
-      if ( (k % 500) == 0 ) {
-	std::cout << GridLogMessage << "ConjugateGradient: Iteration " << k
+      std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
                << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
-      } else { 
-	std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
-		  << " residual " << sqrt(cp/ssq) << " target " << Tolerance << " took " << IterationTimer.Elapsed() << std::endl;
-      }

      // Stopping condition
      if (cp <= rsq) {
-	usecs +=usecond();
        SolverTimer.Stop();
        Linop.HermOpAndNorm(psi, mmp, d, qq);
        p = mmp - src;
-	GridBase *grid = src.Grid();
-	RealD DwfFlops = (1452. )*grid->gSites()*4*k
-   	               + (8+4+8+4+4)*12*grid->gSites()*k; // CG linear algebra
+
        RealD srcnorm = std::sqrt(norm2(src));
        RealD resnorm = std::sqrt(norm2(p));
        RealD true_residual = resnorm / srcnorm;
+
        std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k 
 		  << "\tComputed residual " << std::sqrt(cp / ssq)
 		  << "\tTrue residual " << true_residual
 		  << "\tTarget " << Tolerance << std::endl;

-	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
-        std::cout << GridLogPerformance << "Time breakdown "<<std::endl;
-	std::cout << GridLogPerformance << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
-	std::cout << GridLogPerformance << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
-	std::cout << GridLogPerformance << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
-	std::cout << GridLogPerformance << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
-	std::cout << GridLogPerformance << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
-
-	std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
+        std::cout << GridLogIterative << "Time breakdown "<<std::endl;
+	std::cout << GridLogIterative << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;

        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);

@@ -207,8 +192,7 @@ public:

    TrueResidual = sqrt(norm2(p)/ssq);

-    std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations
-	      <<" residual "<< TrueResidual<< std::endl;
+    std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations<< std::endl;

    if (ErrorOnNoConverge) assert(0);
    IterationsToComplete = k;
--- a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
@@ -35,8 +35,7 @@ NAMESPACE_BEGIN(Grid);
    typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
    typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
  class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
-  public:
-    using LinearFunction<FieldD>::operator();
+  public:                                                
    RealD   Tolerance;
    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
@@ -53,23 +52,31 @@ NAMESPACE_BEGIN(Grid);

    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
    LinearFunction<FieldF> *guesser;
-    
-    MixedPrecisionConjugateGradient(RealD tol, 
+
+    MixedPrecisionConjugateGradient(RealD Tol,
+				    Integer maxinnerit, 
+				    Integer maxouterit, 
+				    GridBase* _sp_grid, 
+				    LinearOperatorBase<FieldF> &_Linop_f, 
+				    LinearOperatorBase<FieldD> &_Linop_d) :
+      MixedPrecisionConjugateGradient(Tol, Tol, maxinnerit, maxouterit, _sp_grid, _Linop_f, _Linop_d) {};
+
+    MixedPrecisionConjugateGradient(RealD Tol,
+				    RealD InnerTol,
 				    Integer maxinnerit, 
 				    Integer maxouterit, 
 				    GridBase* _sp_grid, 
 				    LinearOperatorBase<FieldF> &_Linop_f, 
 				    LinearOperatorBase<FieldD> &_Linop_d) :
      Linop_f(_Linop_f), Linop_d(_Linop_d),
-      Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
-      OuterLoopNormMult(100.), guesser(NULL){ };
+      Tolerance(Tol), InnerTolerance(InnerTol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
+      OuterLoopNormMult(100.), guesser(NULL){ assert(InnerTol < 1.0e-1);};

    void useGuesser(LinearFunction<FieldF> &g){
      guesser = &g;
    }
  
  void operator() (const FieldD &src_d_in, FieldD &sol_d){
-    std::cout << GridLogMessage << "MixedPrecisionConjugateGradient: Starting mixed precision CG with outer tolerance " << Tolerance << " and inner tolerance " << InnerTolerance << std::endl;
    TotalInnerIterations = 0;
 	
    GridStopWatch TotalTimer;
@@ -82,6 +89,11 @@ NAMESPACE_BEGIN(Grid);
    RealD stop = src_norm * Tolerance*Tolerance;

    GridBase* DoublePrecGrid = src_d_in.Grid();
+
+    //Generate precision change workspaces
+    precisionChangeWorkspace wk_dp_from_sp(DoublePrecGrid, SinglePrecGrid);
+    precisionChangeWorkspace wk_sp_from_dp(SinglePrecGrid, DoublePrecGrid);
+
    FieldD tmp_d(DoublePrecGrid);
    tmp_d.Checkerboard() = cb;
    
@@ -99,7 +111,6 @@ NAMESPACE_BEGIN(Grid);
    FieldF sol_f(SinglePrecGrid);
    sol_f.Checkerboard() = cb;
    
-    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting initial inner CG with tolerance " << inner_tol << std::endl;
    ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
    CG_f.ErrorOnNoConverge = false;

@@ -108,10 +119,7 @@ NAMESPACE_BEGIN(Grid);
    GridStopWatch PrecChangeTimer;
    
    Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
-
-    precisionChangeWorkspace pc_wk_sp_to_dp(DoublePrecGrid, SinglePrecGrid);
-    precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, DoublePrecGrid);
-    
+      
    for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
      //Compute double precision rsd and also new RHS vector.
      Linop_d.HermOp(sol_d, tmp_d);
@@ -126,7 +134,7 @@ NAMESPACE_BEGIN(Grid);
      while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??

      PrecChangeTimer.Start();
-      precisionChange(src_f, src_d, pc_wk_dp_to_sp);
+      precisionChange(src_f, src_d, wk_sp_from_dp);
      PrecChangeTimer.Stop();
      
      sol_f = Zero();
@@ -136,7 +144,6 @@ NAMESPACE_BEGIN(Grid);
 	(*guesser)(src_f, sol_f);

      //Inner CG
-      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " << outer_iter << " starting inner CG with tolerance " << inner_tol << std::endl;
      CG_f.Tolerance = inner_tol;
      InnerCGtimer.Start();
      CG_f(Linop_f, src_f, sol_f);
@@ -145,7 +152,7 @@ NAMESPACE_BEGIN(Grid);
      
      //Convert sol back to double and add to double prec solution
      PrecChangeTimer.Start();
-      precisionChange(tmp_d, sol_f, pc_wk_sp_to_dp);
+      precisionChange(tmp_d, sol_f, wk_dp_from_sp);
      PrecChangeTimer.Stop();
      
      axpy(sol_d, 1.0, tmp_d, sol_d);
--- a/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
@@ -1,213 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
-
-    Copyright (C) 2015
-
-    Author: Raoul Hodgson <raoul.hodgson@ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
-#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
-
-NAMESPACE_BEGIN(Grid);
-
-//Mixed precision restarted defect correction CG
-template<class FieldD,class FieldF, 
-  typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
-  typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
-class MixedPrecisionConjugateGradientBatched : public LinearFunction<FieldD> {
-public:
-  using LinearFunction<FieldD>::operator();
-  RealD   Tolerance;
-  RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
-  Integer MaxInnerIterations;
-  Integer MaxOuterIterations;
-  Integer MaxPatchupIterations;
-  GridBase* SinglePrecGrid; //Grid for single-precision fields
-  RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
-  LinearOperatorBase<FieldF> &Linop_f;
-  LinearOperatorBase<FieldD> &Linop_d;
-
-  //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
-  LinearFunction<FieldF> *guesser;
-  bool updateResidual;
-  
-  MixedPrecisionConjugateGradientBatched(RealD tol, 
-          Integer maxinnerit, 
-          Integer maxouterit, 
-          Integer maxpatchit,
-          GridBase* _sp_grid, 
-          LinearOperatorBase<FieldF> &_Linop_f, 
-          LinearOperatorBase<FieldD> &_Linop_d,
-          bool _updateResidual=true) :
-    Linop_f(_Linop_f), Linop_d(_Linop_d),
-    Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid),
-    OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { };
-
-  void useGuesser(LinearFunction<FieldF> &g){
-    guesser = &g;
-  }
-  
-  void operator() (const FieldD &src_d_in, FieldD &sol_d){
-    std::vector<FieldD> srcs_d_in{src_d_in};
-    std::vector<FieldD> sols_d{sol_d};
-
-    (*this)(srcs_d_in,sols_d);
-
-    sol_d = sols_d[0];
-  }
-
-  void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){
-    assert(src_d_in.size() == sol_d.size());
-    int NBatch = src_d_in.size();
-
-    std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl;
-
-    Integer TotalOuterIterations = 0; //Number of restarts
-    std::vector<Integer> TotalInnerIterations(NBatch,0);     //Number of inner CG iterations
-    std::vector<Integer> TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step
-  
-    GridStopWatch TotalTimer;
-    TotalTimer.Start();
-
-    GridStopWatch InnerCGtimer;
-    GridStopWatch PrecChangeTimer;
-    
-    int cb = src_d_in[0].Checkerboard();
-    
-    std::vector<RealD> src_norm;
-    std::vector<RealD> norm;
-    std::vector<RealD> stop;
-    
-    GridBase* DoublePrecGrid = src_d_in[0].Grid();
-    FieldD tmp_d(DoublePrecGrid);
-    tmp_d.Checkerboard() = cb;
-    
-    FieldD tmp2_d(DoublePrecGrid);
-    tmp2_d.Checkerboard() = cb;
-
-    std::vector<FieldD> src_d;
-    std::vector<FieldF> src_f;
-    std::vector<FieldF> sol_f;
-
-    for (int i=0; i<NBatch; i++) {
-      sol_d[i].Checkerboard() = cb;
-
-      src_norm.push_back(norm2(src_d_in[i]));
-      norm.push_back(0.);
-      stop.push_back(src_norm[i] * Tolerance*Tolerance);
-
-      src_d.push_back(src_d_in[i]); //source for next inner iteration, computed from residual during operation
-
-      src_f.push_back(SinglePrecGrid);
-      src_f[i].Checkerboard() = cb;
-
-      sol_f.push_back(SinglePrecGrid);
-      sol_f[i].Checkerboard() = cb;
-    }
-    
-    RealD inner_tol = InnerTolerance;
-    
-    ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
-    CG_f.ErrorOnNoConverge = false;
-    
-    Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
-      
-    for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
-      std::cout << GridLogMessage << std::endl;
-      std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl;
-      
-      bool allConverged = true;
-      
-      for (int i=0; i<NBatch; i++) {
-        //Compute double precision rsd and also new RHS vector.
-        Linop_d.HermOp(sol_d[i], tmp_d);
-        norm[i] = axpy_norm(src_d[i], -1., tmp_d, src_d_in[i]); //src_d is residual vector
-        
-        std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Outer iteration " << outer_iter <<" solve " << i << " residual "<< norm[i] << " target "<< stop[i] <<std::endl;
-
-        PrecChangeTimer.Start();
-        precisionChange(src_f[i], src_d[i]);
-        PrecChangeTimer.Stop();
-        
-        sol_f[i] = Zero();
-      
-        if(norm[i] > OuterLoopNormMult * stop[i]) {
-          allConverged = false;
-        }
-      }
-      if (allConverged) break;
-
-      if (updateResidual) {
-        RealD normMax = *std::max_element(std::begin(norm), std::end(norm));
-        RealD stopMax = *std::max_element(std::begin(stop), std::end(stop));
-        while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
-        CG_f.Tolerance = inner_tol;
-      }
-
-      //Optionally improve inner solver guess (eg using known eigenvectors)
-      if(guesser != NULL) {
-        (*guesser)(src_f, sol_f);
-      }
-
-      for (int i=0; i<NBatch; i++) {
-        //Inner CG
-        InnerCGtimer.Start();
-        CG_f(Linop_f, src_f[i], sol_f[i]);
-        InnerCGtimer.Stop();
-        TotalInnerIterations[i] += CG_f.IterationsToComplete;
-        
-        //Convert sol back to double and add to double prec solution
-        PrecChangeTimer.Start();
-        precisionChange(tmp_d, sol_f[i]);
-        PrecChangeTimer.Stop();
-        
-        axpy(sol_d[i], 1.0, tmp_d, sol_d[i]);
-      }
-
-    }
-    
-    //Final trial CG
-    std::cout << GridLogMessage << std::endl;
-    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Starting final patch-up double-precision solve"<<std::endl;
-    
-    for (int i=0; i<NBatch; i++) {
-      ConjugateGradient<FieldD> CG_d(Tolerance, MaxPatchupIterations);
-      CG_d(Linop_d, src_d_in[i], sol_d[i]);
-      TotalFinalStepIterations[i] += CG_d.IterationsToComplete;
-    }
-
-    TotalTimer.Stop();
-
-    std::cout << GridLogMessage << std::endl;
-    for (int i=0; i<NBatch; i++) {
-      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: solve " << i << " Inner CG iterations " << TotalInnerIterations[i] << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations[i] << std::endl;
-    }
-    std::cout << GridLogMessage << std::endl;
-    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
-    
-  }
-};
-
-NAMESPACE_END(Grid);
-
-#endif
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
@@ -44,7 +44,7 @@ public:

  using OperatorFunction<Field>::operator();

-  //  RealD   Tolerance;
+  RealD   Tolerance;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
@@ -84,7 +84,6 @@ public:

  void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi)
  {
-    GRID_TRACE("ConjugateGradientMultiShift");
  
    GridBase *grid = src.Grid();
  
@@ -144,7 +143,7 @@ public:
    for(int s=0;s<nshift;s++){
      rsq[s] = cp * mresidual[s] * mresidual[s];
      std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s
-	       <<" target resid^2 "<<rsq[s]<<std::endl;
+	       <<" target resid "<<rsq[s]<<std::endl;
      ps[s] = src;
    }
    // r and p for primary
@@ -325,8 +324,8 @@ public:

      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
-      std::cout << GridLogMessage << "\tAXPY     " << AXPYTimer.Elapsed()     <<std::endl;
-      std::cout << GridLogMessage << "\tMatrix   " << MatrixTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tMarix    " << MatrixTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tShift    " << ShiftTimer.Elapsed()     <<std::endl;

      IterationsToComplete = k;	
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h
@@ -1,373 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
-
-    Copyright (C) 2015
-
-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Christopher Kelly <ckelly@bnl.gov>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#pragma once
-
-NAMESPACE_BEGIN(Grid);
-
-//CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision. 
-//The residual is stored in single precision, but the search directions and solution are stored in double precision. 
-//Every update_freq iterations the residual is corrected in double precision. 
-//For safety the a final regular CG is applied to clean up if necessary
-
-//PB Pure single, then double fixup
-
-template<class FieldD, class FieldF,
-	 typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
-	 typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
-class ConjugateGradientMultiShiftMixedPrecCleanup : public OperatorMultiFunction<FieldD>,
-					     public OperatorFunction<FieldD>
-{
-public:                                                
-
-  using OperatorFunction<FieldD>::operator();
-
-  RealD   Tolerance;
-  Integer MaxIterationsMshift;
-  Integer MaxIterations;
-  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
-  std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
-  int verbose;
-  MultiShiftFunction shifts;
-  std::vector<RealD> TrueResidualShift;
-
-  int ReliableUpdateFreq; //number of iterations between reliable updates
-
-  GridBase* SinglePrecGrid; //Grid for single-precision fields
-  LinearOperatorBase<FieldF> &Linop_f; //single precision
-
-  ConjugateGradientMultiShiftMixedPrecCleanup(Integer maxit, const MultiShiftFunction &_shifts,
-				       GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
-				       int _ReliableUpdateFreq) : 
-    MaxIterationsMshift(maxit),  shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
-    MaxIterations(20000)
-  { 
-    verbose=1;
-    IterationsToCompleteShift.resize(_shifts.order);
-    TrueResidualShift.resize(_shifts.order);
-  }
-
-  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
-  {
-    GridBase *grid = src.Grid();
-    int nshift = shifts.order;
-    std::vector<FieldD> results(nshift,grid);
-    (*this)(Linop,src,results,psi);
-  }
-  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
-  {
-    int nshift = shifts.order;
-
-    (*this)(Linop,src,results);
-  
-    psi = shifts.norm*src;
-    for(int i=0;i<nshift;i++){
-      psi = psi + shifts.residues[i]*results[i];
-    }
-
-    return;
-  }
-
-  void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
-  { 
-    GRID_TRACE("ConjugateGradientMultiShiftMixedPrecCleanup");
-    GridBase *DoublePrecGrid = src_d.Grid();
-
-    ////////////////////////////////////////////////////////////////////////
-    // Convenience references to the info stored in "MultiShiftFunction"
-    ////////////////////////////////////////////////////////////////////////
-    int nshift = shifts.order;
-
-    std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
-    std::vector<RealD> &mresidual(shifts.tolerances);
-    std::vector<RealD> alpha(nshift,1.0);
-
-    //Double precision search directions
-    FieldD p_d(DoublePrecGrid);
-    std::vector<FieldF> ps_f (nshift, SinglePrecGrid);// Search directions (single precision)
-    std::vector<FieldF> psi_f(nshift, SinglePrecGrid);// solutions (single precision)
-
-    FieldD tmp_d(DoublePrecGrid);
-    FieldD r_d(DoublePrecGrid);
-    FieldF r_f(SinglePrecGrid);
-    FieldD mmp_d(DoublePrecGrid);
-
-    assert(psi_d.size()==nshift);
-    assert(mass.size()==nshift);
-    assert(mresidual.size()==nshift);
-  
-    // dynamic sized arrays on stack; 2d is a pain with vector
-    RealD  bs[nshift];
-    RealD  rsq[nshift];
-    RealD  rsqf[nshift];
-    RealD  z[nshift][2];
-    int     converged[nshift];
-  
-    const int       primary =0;
-  
-    //Primary shift fields CG iteration
-    RealD a,b,c,d;
-    RealD cp,bp,qq; //prev
-  
-    // Matrix mult fields
-    FieldF p_f(SinglePrecGrid);
-    FieldF mmp_f(SinglePrecGrid);
-
-    // Check lightest mass
-    for(int s=0;s<nshift;s++){
-      assert( mass[s]>= mass[primary] );
-      converged[s]=0;
-    }
-  
-    // Wire guess to zero
-    // Residuals "r" are src
-    // First search direction "p" is also src
-    cp = norm2(src_d);
-
-    // Handle trivial case of zero src.
-    if( cp == 0. ){
-      for(int s=0;s<nshift;s++){
-	psi_d[s] = Zero();
-	psi_f[s] = Zero();
-	IterationsToCompleteShift[s] = 1;
-	TrueResidualShift[s] = 0.;
-      }
-      return;
-    }
-
-    for(int s=0;s<nshift;s++){
-      rsq[s] = cp * mresidual[s] * mresidual[s];
-      rsqf[s] =rsq[s];
-      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
-      //      ps_d[s] = src_d;
-      precisionChange(ps_f[s],src_d);
-    }
-    // r and p for primary
-    p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
-    r_d = p_d;
-    
-    //MdagM+m[0]
-    precisionChange(p_f,p_d);
-    Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
-    precisionChange(tmp_d,mmp_f);
-    Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
-    tmp_d = tmp_d - mmp_d;
-    std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
-    //    assert(norm2(tmp_d)< 1.0e-4);
-
-    axpy(mmp_d,mass[0],p_d,mmp_d);
-    RealD rn = norm2(p_d);
-    d += rn*mass[0];
-
-    b = -cp /d;
-  
-    // Set up the various shift variables
-    int       iz=0;
-    z[0][1-iz] = 1.0;
-    z[0][iz]   = 1.0;
-    bs[0]      = b;
-    for(int s=1;s<nshift;s++){
-      z[s][1-iz] = 1.0;
-      z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0]));
-      bs[s]      = b*z[s][iz]; 
-    }
-  
-    // r += b[0] A.p[0]
-    // c= norm(r)
-    c=axpy_norm(r_d,b,mmp_d,r_d);
-  
-    for(int s=0;s<nshift;s++) {
-      axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
-      precisionChange(psi_f[s],psi_d[s]);
-    }
-  
-    ///////////////////////////////////////
-    // Timers
-    ///////////////////////////////////////
-    GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
-
-    SolverTimer.Start();
-  
-    // Iteration loop
-    int k;
-  
-    for (k=1;k<=MaxIterationsMshift;k++){    
-
-      a = c /cp;
-      AXPYTimer.Start();
-      axpy(p_d,a,p_d,r_d); 
-      AXPYTimer.Stop();
-
-      PrecChangeTimer.Start();
-      precisionChange(r_f, r_d);
-      PrecChangeTimer.Stop();
-
-      AXPYTimer.Start();
-      for(int s=0;s<nshift;s++){
-	if ( ! converged[s] ) { 
-	  if (s==0){
-	    axpy(ps_f[s],a,ps_f[s],r_f);
-	  } else{
-	    RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
-	    axpby(ps_f[s],z[s][iz],as,r_f,ps_f[s]);
-	  }
-	}
-      }
-      AXPYTimer.Stop();
-
-      cp=c;
-      PrecChangeTimer.Start();
-      precisionChange(p_f, p_d); //get back single prec search direction for linop
-      PrecChangeTimer.Stop();
-      MatrixTimer.Start();  
-      Linop_f.HermOp(p_f,mmp_f);
-      MatrixTimer.Stop();  
-      PrecChangeTimer.Start();
-      precisionChange(mmp_d, mmp_f); // From Float to Double
-      PrecChangeTimer.Stop();
-
-      d=real(innerProduct(p_d,mmp_d));    
-      axpy(mmp_d,mass[0],p_d,mmp_d);
-      RealD rn = norm2(p_d);
-      d += rn*mass[0];
-    
-      bp=b;
-      b=-cp/d;
-
-      // Toggle the recurrence history
-      bs[0] = b;
-      iz = 1-iz;
-      ShiftTimer.Start();
-      for(int s=1;s<nshift;s++){
-	if((!converged[s])){
-	  RealD z0 = z[s][1-iz];
-	  RealD z1 = z[s][iz];
-	  z[s][iz] = z0*z1*bp
-	    / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b)); 
-	  bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
-	}
-      }
-      ShiftTimer.Stop();
-
-      //Update single precision solutions
-      AXPYTimer.Start();
-      for(int s=0;s<nshift;s++){
-	int ss = s;
-	if( (!converged[s]) ) { 
-	  axpy(psi_f[ss],-bs[s]*alpha[s],ps_f[s],psi_f[ss]);
-	}
-      }
-      c = axpy_norm(r_d,b,mmp_d,r_d);
-      AXPYTimer.Stop();
-    
-      // Convergence checks
-      int all_converged = 1;
-      for(int s=0;s<nshift;s++){
-      
-	if ( (!converged[s]) ){
-	  IterationsToCompleteShift[s] = k;
-	
-	  RealD css  = c * z[s][iz]* z[s][iz];
-	
-	  if(css<rsqf[s]){
-	    if ( ! converged[s] )
-	      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
-	    converged[s]=1;
-	  } else {
-	    all_converged=0;
-	  }
-
-	}
-      }
-
-      if ( all_converged || k == MaxIterationsMshift-1){
-
-	SolverTimer.Stop();
-
-	for(int s=0;s<nshift;s++){
-	  precisionChange(psi_d[s],psi_f[s]);
-	}
-
-	
-	if ( all_converged ){
-	  std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: All shifts have converged iteration "<<k<<std::endl;
-	  std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Checking solutions"<<std::endl;
-	} else {
-	  std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Not all shifts have converged iteration "<<k<<std::endl;
-	}
-	
-	// Check answers 
-	for(int s=0; s < nshift; s++) { 
-	  Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
-	  axpy(tmp_d,mass[s],psi_d[s],mmp_d);
-	  axpy(r_d,-alpha[s],src_d,tmp_d);
-	  RealD rn = norm2(r_d);
-	  RealD cn = norm2(src_d);
-	  TrueResidualShift[s] = std::sqrt(rn/cn);
-	  std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
-
-	  //If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
-	  if(rn >= rsq[s]){
-	    CleanupTimer.Start();
-	    std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: performing cleanup step for shift " << s << std::endl;
-
-	    //Setup linear operators for final cleanup
-	    ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
-	    ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
-					       
-	    MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d); 
-	    cg(src_d, psi_d[s]);
-	    
-	    TrueResidualShift[s] = cg.TrueResidual;
-	    CleanupTimer.Stop();
-	  }
-	}
-
-	std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrecCleanup: Time Breakdown for body"<<std::endl;
-	std::cout << GridLogMessage << "\tSolver    " << SolverTimer.Elapsed()     <<std::endl;
-	std::cout << GridLogMessage << "\t\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
-	std::cout << GridLogMessage << "\t\tMatrix    " << MatrixTimer.Elapsed()     <<std::endl;
-	std::cout << GridLogMessage << "\t\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
-	std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed()     <<std::endl;
-	std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed()     <<std::endl;
-	std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
-
-	IterationsToComplete = k;	
-
-	return;
-      }
-   
-    }
-    std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
-    assert(0);
-  }
-
-};
-NAMESPACE_END(Grid);
-
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
@@ -81,7 +81,6 @@ public:
  using OperatorFunction<FieldD>::operator();

  RealD   Tolerance;
-  Integer MaxIterationsMshift;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
@@ -96,9 +95,9 @@ public:

  ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
 				       GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
-				       int _ReliableUpdateFreq) : 
-    MaxIterationsMshift(maxit),  shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
-    MaxIterations(20000)
+				       int _ReliableUpdateFreq
+				       ) : 
+    MaxIterations(maxit),  shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq)
  { 
    verbose=1;
    IterationsToCompleteShift.resize(_shifts.order);
@@ -128,12 +127,10 @@ public:

  void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
  { 
-    GRID_TRACE("ConjugateGradientMultiShiftMixedPrec");
    GridBase *DoublePrecGrid = src_d.Grid();
+    precisionChangeWorkspace wk_f_from_d(SinglePrecGrid, DoublePrecGrid);
+    precisionChangeWorkspace wk_d_from_f(DoublePrecGrid, SinglePrecGrid);

-    precisionChangeWorkspace pc_wk_s_to_d(DoublePrecGrid,SinglePrecGrid);
-    precisionChangeWorkspace pc_wk_d_to_s(SinglePrecGrid,DoublePrecGrid);
-    
    ////////////////////////////////////////////////////////////////////////
    // Convenience references to the info stored in "MultiShiftFunction"
    ////////////////////////////////////////////////////////////////////////
@@ -158,7 +155,6 @@ public:
    // dynamic sized arrays on stack; 2d is a pain with vector
    RealD  bs[nshift];
    RealD  rsq[nshift];
-    RealD  rsqf[nshift];
    RealD  z[nshift][2];
    int     converged[nshift];
  
@@ -169,8 +165,12 @@ public:
    RealD cp,bp,qq; //prev
  
    // Matrix mult fields
+    FieldF r_f(SinglePrecGrid);
    FieldF p_f(SinglePrecGrid);
+    FieldF tmp_f(SinglePrecGrid);
    FieldF mmp_f(SinglePrecGrid);
+    FieldF src_f(SinglePrecGrid);
+    precisionChange(src_f, src_d, wk_f_from_d);

    // Check lightest mass
    for(int s=0;s<nshift;s++){
@@ -195,26 +195,18 @@ public:

    for(int s=0;s<nshift;s++){
      rsq[s] = cp * mresidual[s] * mresidual[s];
-      rsqf[s] =rsq[s];
      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
      ps_d[s] = src_d;
    }
    // r and p for primary
+    r_f=src_f; //residual maintained in single
+    p_f=src_f;
    p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
-    r_d = p_d;
-    
+  
    //MdagM+m[0]
-    precisionChange(p_f, p_d, pc_wk_d_to_s);
-
    Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
-    precisionChange(tmp_d, mmp_f, pc_wk_s_to_d);
-    Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
-    tmp_d = tmp_d - mmp_d;
-    std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
-    assert(norm2(tmp_d)< 1.0);
-
-    axpy(mmp_d,mass[0],p_d,mmp_d);
-    RealD rn = norm2(p_d);
+    axpy(mmp_f,mass[0],p_f,mmp_f);
+    RealD rn = norm2(p_f);
    d += rn*mass[0];

    b = -cp /d;
@@ -232,7 +224,7 @@ public:
  
    // r += b[0] A.p[0]
    // c= norm(r)
-    c=axpy_norm(r_d,b,mmp_d,r_d);
+    c=axpy_norm(r_f,b,mmp_f,r_f);
  
    for(int s=0;s<nshift;s++) {
      axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
@@ -248,9 +240,14 @@ public:
    // Iteration loop
    int k;
  
-    for (k=1;k<=MaxIterationsMshift;k++){    
-
+    for (k=1;k<=MaxIterations;k++){    
      a = c /cp;
+
+      //Update double precision search direction by residual
+      PrecChangeTimer.Start();
+      precisionChange(r_d, r_f, wk_d_from_f);
+      PrecChangeTimer.Stop();
+
      AXPYTimer.Start();
      axpy(p_d,a,p_d,r_d); 

@@ -267,28 +264,24 @@ public:
      AXPYTimer.Stop();

      PrecChangeTimer.Start();
-      precisionChange(p_f, p_d, pc_wk_d_to_s); //get back single prec search direction for linop
+      precisionChange(p_f, p_d, wk_f_from_d); //get back single prec search direction for linop
      PrecChangeTimer.Stop();

      cp=c;
      MatrixTimer.Start();  
-      Linop_f.HermOp(p_f,mmp_f);
+      Linop_f.HermOp(p_f,mmp_f); 
+      d=real(innerProduct(p_f,mmp_f));    
      MatrixTimer.Stop();  

-      PrecChangeTimer.Start();
-      precisionChange(mmp_d, mmp_f, pc_wk_s_to_d); // From Float to Double
-      PrecChangeTimer.Stop();
-
      AXPYTimer.Start();
-      d=real(innerProduct(p_d,mmp_d));    
-      axpy(mmp_d,mass[0],p_d,mmp_d);
+      axpy(mmp_f,mass[0],p_f,mmp_f);
      AXPYTimer.Stop();
-      RealD rn = norm2(p_d);
+      RealD rn = norm2(p_f);
      d += rn*mass[0];
    
      bp=b;
      b=-cp/d;
-
+    
      // Toggle the recurrence history
      bs[0] = b;
      iz = 1-iz;
@@ -314,12 +307,12 @@ public:
      }

      //Perform reliable update if necessary; otherwise update residual from single-prec mmp
-      c = axpy_norm(r_d,b,mmp_d,r_d);
-
+      RealD c_f = axpy_norm(r_f,b,mmp_f,r_f);
      AXPYTimer.Stop();

+      c = c_f;
+
      if(k % ReliableUpdateFreq == 0){
-	RealD c_old = c;
 	//Replace r with true residual
 	MatrixTimer.Start();  
 	Linop_d.HermOp(psi_d[0],mmp_d); 
@@ -328,10 +321,15 @@ public:
 	AXPYTimer.Start();
 	axpy(mmp_d,mass[0],psi_d[0],mmp_d);

-	c = axpy_norm(r_d, -1.0, mmp_d, src_d);
+	RealD c_d = axpy_norm(r_d, -1.0, mmp_d, src_d);
 	AXPYTimer.Stop();

-	std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_old <<" with |r|^2 = "<<c<<std::endl;
+	std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_f <<" with |r|^2 = "<<c_d<<std::endl;
+	
+	PrecChangeTimer.Start();
+	precisionChange(r_f, r_d, wk_f_from_d);
+	PrecChangeTimer.Stop();
+	c = c_d;
      }
    
      // Convergence checks
@@ -343,7 +341,7 @@ public:
 	
 	  RealD css  = c * z[s][iz]* z[s][iz];
 	
-	  if(css<rsqf[s]){
+	  if(css<rsq[s]){
 	    if ( ! converged[s] )
 	      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
 	    converged[s]=1;
@@ -354,17 +352,12 @@ public:
 	}
      }

-      if ( all_converged || k == MaxIterationsMshift-1){
+      if ( all_converged ){

 	SolverTimer.Stop();
-
-	if ( all_converged ){
-	  std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
-	  std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
-	} else {
-	  std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Not all shifts have converged iteration "<<k<<std::endl;
-	}
-	
+	std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
+	std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
+      
 	// Check answers 
 	for(int s=0; s < nshift; s++) { 
 	  Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
@@ -405,10 +398,12 @@ public:

 	return;
      }
+
   
    }
+    // ugly hack
    std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
-    assert(0);
+    //  assert(0);
  }

 };
--- a/Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h
+++ b/Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h
@@ -48,7 +48,7 @@ public:
  LinearOperatorBase<FieldF> &Linop_f;
  LinearOperatorBase<FieldD> &Linop_d;
  GridBase* SinglePrecGrid;
-  RealD Delta; //reliable update parameter. A reliable update is performed when the residual drops by a factor of Delta relative to its value at the last update
+  RealD Delta; //reliable update parameter

  //Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
  LinearOperatorBase<FieldF> *Linop_fallback;
@@ -65,9 +65,7 @@ public:
      ErrorOnNoConverge(err_on_no_conv),
      DoFinalCleanup(true),
      Linop_fallback(NULL)
-  {
-    assert(Delta > 0. && Delta < 1. && "Expect  0 < Delta < 1");
-  };
+  {};

  void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
    Linop_fallback = &_Linop_fallback;
@@ -75,7 +73,6 @@ public:
  }
    
  void operator()(const FieldD &src, FieldD &psi) {
-    GRID_TRACE("ConjugateGradientReliableUpdate");
    LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
    bool using_fallback = false;
      
@@ -118,12 +115,9 @@ public:
    }

    //Single prec initialization
-    precisionChangeWorkspace pc_wk_sp_to_dp(src.Grid(), SinglePrecGrid);
-    precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, src.Grid());
-    
    FieldF r_f(SinglePrecGrid);
    r_f.Checkerboard() = r.Checkerboard();
-    precisionChange(r_f, r, pc_wk_dp_to_sp);
+    precisionChange(r_f, r);

    FieldF psi_f(r_f);
    psi_f = Zero();
@@ -139,8 +133,7 @@ public:
    GridStopWatch LinalgTimer;
    GridStopWatch MatrixTimer;
    GridStopWatch SolverTimer;
-    GridStopWatch PrecChangeTimer;
-    
+
    SolverTimer.Start();
    int k = 0;
    int l = 0;
@@ -179,9 +172,7 @@ public:
      // Stopping condition
      if (cp <= rsq) {
 	//Although not written in the paper, I assume that I have to add on the final solution
-	PrecChangeTimer.Start();
-	precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
-	PrecChangeTimer.Stop();
+	precisionChange(mmp, psi_f);
 	psi = psi + mmp;
 	
 	
@@ -202,10 +193,7 @@ public:
 	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
 	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
 	std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tPrecChange " << PrecChangeTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tPrecChange avg time " << PrecChangeTimer.Elapsed()/(2*l+1) <<std::endl;

-	
 	IterationsToComplete = k;	
 	ReliableUpdatesPerformed = l;
 	  
@@ -225,21 +213,14 @@ public:
      else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
 	std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
 		  << cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
-	PrecChangeTimer.Start();
-	precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
-	PrecChangeTimer.Stop();
+	precisionChange(mmp, psi_f);
 	psi = psi + mmp;

-	MatrixTimer.Start();
 	Linop_d.HermOpAndNorm(psi, mmp, d, qq);
-	MatrixTimer.Stop();
-	
 	r = src - mmp;

 	psi_f = Zero();
-	PrecChangeTimer.Start();
-	precisionChange(r_f, r, pc_wk_dp_to_sp);
-	PrecChangeTimer.Stop();
+	precisionChange(r_f, r);
 	cp = norm2(r);
 	MaxResidSinceLastRelUp = cp;

--- a/Grid/algorithms/iterative/Deflation.h
+++ b/Grid/algorithms/iterative/Deflation.h
@@ -33,19 +33,16 @@ namespace Grid {
 template<class Field>
 class ZeroGuesser: public LinearFunction<Field> {
 public:
-  using LinearFunction<Field>::operator();
    virtual void operator()(const Field &src, Field &guess) { guess = Zero(); };
 };
 template<class Field>
 class DoNothingGuesser: public LinearFunction<Field> {
 public:
-  using LinearFunction<Field>::operator();
  virtual void operator()(const Field &src, Field &guess) {  };
 };
 template<class Field>
 class SourceGuesser: public LinearFunction<Field> {
 public:
-  using LinearFunction<Field>::operator();
  virtual void operator()(const Field &src, Field &guess) { guess = src; };
 };

@@ -60,7 +57,6 @@ private:
  const unsigned int       N;

 public:
-  using LinearFunction<Field>::operator();

  DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval)
  : DeflatedGuesser(_evec, _eval, _evec.size())
@@ -91,7 +87,6 @@ private:
  const std::vector<RealD>       &eval_coarse;
 public:
  
-  using LinearFunction<FineField>::operator();
  LocalCoherenceDeflatedGuesser(const std::vector<FineField>   &_subspace,
 				const std::vector<CoarseField> &_evec_coarse,
 				const std::vector<RealD>       &_eval_coarse)
@@ -113,43 +108,7 @@ public:
    blockPromote(guess_coarse,guess,subspace);
    guess.Checkerboard() = src.Checkerboard();
  };
-
-  void operator()(const std::vector<FineField> &src,std::vector<FineField> &guess) {
-    int Nevec = (int)evec_coarse.size();
-    int Nsrc = (int)src.size();
-    // make temp variables
-    std::vector<CoarseField> src_coarse(Nsrc,evec_coarse[0].Grid());
-    std::vector<CoarseField> guess_coarse(Nsrc,evec_coarse[0].Grid());    
-    //Preporcessing
-    std::cout << GridLogMessage << "Start BlockProject for loop" << std::endl;
-    for (int j=0;j<Nsrc;j++)
-    {
-    guess_coarse[j] = Zero();
-    std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
-    blockProject(src_coarse[j],src[j],subspace);
-    }
-    //deflation set up for eigen vector batchsize 1 and source batch size equal number of sources
-    std::cout << GridLogMessage << "Start ProjectAccum for loop" << std::endl;
-    for (int i=0;i<Nevec;i++)
-    {
-      std::cout << GridLogMessage << "ProjectAccum Nvec: " << i << std::endl;
-      const CoarseField & tmp = evec_coarse[i];
-      for (int j=0;j<Nsrc;j++)
-      {
-        axpy(guess_coarse[j],TensorRemove(innerProduct(tmp,src_coarse[j])) / eval_coarse[i],tmp,guess_coarse[j]);
-      }
-    }
-    //postprocessing
-    std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl;
-    for (int j=0;j<Nsrc;j++)
-    {
-    std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
-    blockPromote(guess_coarse[j],guess[j],subspace);
-    guess[j].Checkerboard() = src[j].Checkerboard();
-    }
-  };
-
-  };
+};



--- a/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h
--- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -79,16 +79,14 @@ template<class Field> class ImplicitlyRestartedLanczosHermOpTester  : public Imp
    RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);

    std::cout.precision(13);
-
-    int conv=0;
-    if( (vv<eresid*eresid) ) conv = 1;
-
    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
 	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
 	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
-	     <<" target " << eresid*eresid << " conv " <<conv
 	     <<std::endl;

+    int conv=0;
+    if( (vv<eresid*eresid) ) conv = 1;
+
    return conv;
  }
 };
@@ -421,15 +419,14 @@ until convergence
 	}
      }

-      if ( Nconv < Nstop ) {
+      if ( Nconv < Nstop )
 	std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl;
-	std::cout << GridLogIRL << "returning Nstop vectors, the last "<< Nstop-Nconv << "of which might meet convergence criterion only approximately" <<std::endl;
-      }
+
      eval=eval2;
      
      //Keep only converged
-      eval.resize(Nstop);// was Nconv
-      evec.resize(Nstop,grid);// was Nconv
+      eval.resize(Nconv);// Nstop?
+      evec.resize(Nconv,grid);// Nstop?
      basisSortInPlace(evec,eval,reverse);
      
    }
@@ -459,7 +456,7 @@ until convergence
 	    std::vector<Field>& evec,
 	    Field& w,int Nm,int k)
  {
-    std::cout<<GridLogDebug << "Lanczos step " <<k<<std::endl;
+    std::cout<<GridLogIRL << "Lanczos step " <<k<<std::endl;
    const RealD tiny = 1.0e-20;
    assert( k< Nm );

@@ -467,7 +464,7 @@ until convergence

    Field& evec_k = evec[k];

-    _PolyOp(evec_k,w);    std::cout<<GridLogDebug << "PolyOp" <<std::endl;
+    _PolyOp(evec_k,w);    std::cout<<GridLogIRL << "PolyOp" <<std::endl;

    if(k>0) w -= lme[k-1] * evec[k-1];

@@ -482,18 +479,18 @@ until convergence
    lme[k] = beta;

    if ( (k>0) && ( (k % orth_period) == 0 )) {
-      std::cout<<GridLogDebug << "Orthogonalising " <<k<<std::endl;
+      std::cout<<GridLogIRL << "Orthogonalising " <<k<<std::endl;
      orthogonalize(w,evec,k); // orthonormalise
-      std::cout<<GridLogDebug << "Orthogonalised " <<k<<std::endl;
+      std::cout<<GridLogIRL << "Orthogonalised " <<k<<std::endl;
    }

    if(k < Nm-1) evec[k+1] = w;

-    std::cout<<GridLogIRL << "Lanczos step alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
+    std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
    if ( beta < tiny ) 
      std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl;

-    std::cout<<GridLogDebug << "Lanczos step complete " <<k<<std::endl;
+    std::cout<<GridLogIRL << "Lanczos step complete " <<k<<std::endl;
  }

  void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
--- a/Grid/algorithms/iterative/LocalCoherenceLanczos.h
+++ b/Grid/algorithms/iterative/LocalCoherenceLanczos.h
@@ -44,7 +44,6 @@ public:
 				  int, MinRes);    // Must restart
 };

-//This class is the input parameter class for some testing programs
 struct LocalCoherenceLanczosParams : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
@@ -68,7 +67,6 @@ public:
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
-  using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
@@ -99,7 +97,6 @@ public:
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
-  using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
@@ -146,24 +143,16 @@ public:
  LinearOperatorBase<FineField> &_Linop;
  RealD                             _coarse_relax_tol;
  std::vector<FineField>        &_subspace;
-
-  int _largestEvalIdxForReport; //The convergence of the LCL is based on the evals of the coarse grid operator, not those of the underlying fine grid operator
-                                //As a result we do not know what the eval range of the fine operator is until the very end, making tuning the Cheby bounds very difficult
-                                //To work around this issue, every restart we separately reconstruct the fine operator eval for the lowest and highest evec and print these
-                                //out alongside the evals of the coarse operator. To do so we need to know the index of the largest eval (i.e. Nstop-1)
-                                //NOTE: If largestEvalIdxForReport=-1 (default) then this is not performed
  
  ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField>   &Poly,
 					   OperatorFunction<FineField>   &smoother,
 					   LinearOperatorBase<FineField> &Linop,
 					   std::vector<FineField>        &subspace,
-					   RealD coarse_relax_tol=5.0e3,
-					   int largestEvalIdxForReport=-1) 
+					   RealD coarse_relax_tol=5.0e3) 
    : _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace),
-      _coarse_relax_tol(coarse_relax_tol), _largestEvalIdxForReport(largestEvalIdxForReport)
+      _coarse_relax_tol(coarse_relax_tol)  
  {    };

-  //evalMaxApprox: approximation of largest eval of the fine Chebyshev operator (suitably wrapped by block projection)
  int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  {
    CoarseField v(B);
@@ -186,26 +175,12 @@ public:
 	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
 	     <<std::endl;

-    if(_largestEvalIdxForReport != -1 && (j==0 || j==_largestEvalIdxForReport)){
-      std::cout<<GridLogIRL << "Estimating true eval of fine grid operator for eval idx " << j << std::endl;
-      RealD tmp_eval;
-      ReconstructEval(j,eresid,B,tmp_eval,1.0); //don't use evalMaxApprox of coarse operator! (cf below)
-    }
-    
    int conv=0;
    if( (vv<eresid*eresid) ) conv = 1;
    return conv;
  }
-
-  //This function is called at the end of the coarse grid Lanczos. It promotes the coarse eigenvector 'B' to the fine grid,
-  //applies a smoother to the result then computes the computes the *fine grid* eigenvalue (output as 'eval').
-
-  //evalMaxApprox should be the approximation of the largest eval of the fine Hermop. However when this function is called by IRL it actually passes the largest eval of the *Chebyshev* operator (as this is the max approx used for the TestConvergence above)
-  //As the largest eval of the Chebyshev is typically several orders of magnitude larger this makes the convergence test pass even when it should not.
-  //We therefore ignore evalMaxApprox here and use a value of 1.0 (note this value is already used by TestCoarse)
-  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)  
+  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  {
-    evalMaxApprox = 1.0; //cf above
    GridBase *FineGrid = _subspace[0].Grid();    
    int checkerboard   = _subspace[0].Checkerboard();
    FineField fB(FineGrid);fB.Checkerboard() =checkerboard;
@@ -224,13 +199,13 @@ public:
    eval   = vnum/vden;
    fv -= eval*fB;
    RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);
-    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
-    
+
    std::cout.precision(13);
    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
 	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
-	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv << " target " << eresid*eresid
+	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
 	     <<std::endl;
+    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
    if( (vv<eresid*eresid) ) return 1;
    return 0;
  }
@@ -308,10 +283,6 @@ public:
    evals_coarse.resize(0);
  };

-  //The block inner product is the inner product on the fine grid locally summed over the blocks
-  //to give a Lattice<Scalar> on the coarse grid. This function orthnormalizes the fine-grid subspace
-  //vectors under the block inner product. This step must be performed after computing the fine grid
-  //eigenvectors and before computing the coarse grid eigenvectors.    
  void Orthogonalise(void ) {
    CoarseScalar InnerProd(_CoarseGrid);
    std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
@@ -355,8 +326,6 @@ public:
    }
  }

-  //While this method serves to check the coarse eigenvectors, it also recomputes the eigenvalues from the smoothed reconstructed eigenvectors
-  //hence the smoother can be tuned after running the coarse Lanczos by using a different smoother here
  void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) 
  {
    assert(evals_fine.size() == nbasis);
@@ -405,31 +374,25 @@ public:
    evals_fine.resize(nbasis);
    subspace.resize(nbasis,_FineGrid);
  }
-
-
-  //cheby_op: Parameters of the fine grid Chebyshev polynomial used for the Lanczos acceleration
-  //cheby_smooth: Parameters of a separate Chebyshev polynomial used after the Lanczos has completed to smooth out high frequency noise in the reconstructed fine grid eigenvectors prior to computing the eigenvalue
-  //relax: Reconstructed eigenvectors (post smoothing) are naturally not as precise as true eigenvectors. This factor acts as a multiplier on the stopping condition when determining whether the results satisfy the user provided stopping condition
  void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
 		  int Nstop, int Nk, int Nm,RealD resid, 
 		  RealD MaxIt, RealD betastp, int MinRes)
  {
-    Chebyshev<FineField>                          Cheby(cheby_op); //Chebyshev of fine operator on fine grid
-    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace); //Fine operator on coarse grid with intermediate fine grid conversion
-    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace); //Chebyshev of fine operator on coarse grid with intermediate fine grid conversion
+    Chebyshev<FineField>                          Cheby(cheby_op);
+    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace);
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
    //////////////////////////////////////////////////////////////////////////////////////////////////

-    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth); //lower order Chebyshev of fine operator on fine grid used to smooth regenerated eigenvectors
-    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax,Nstop-1); 
+    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth);
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);

    evals_coarse.resize(Nm);
    evec_coarse.resize(Nm,_CoarseGrid);

    CoarseField src(_CoarseGrid);     src=1.0; 

-    //Note the "tester" here is also responsible for generating the fine grid eigenvalues which are output into the "evals_coarse" array
    ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
    int Nconv=0;
    IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
@@ -440,14 +403,6 @@ public:
      std::cout << i << " Coarse eval = " << evals_coarse[i]  << std::endl;
    }
  }
-
-  //Get the fine eigenvector 'i' by reconstruction
-  void getFineEvecEval(FineField &evec, RealD &eval, const int i) const{
-    blockPromote(evec_coarse[i],evec,subspace);  
-    eval = evals_coarse[i];
-  }
-    
-    
 };

 NAMESPACE_END(Grid);
--- a/Grid/algorithms/iterative/NormalEquations.h
+++ b/Grid/algorithms/iterative/NormalEquations.h
@@ -33,7 +33,7 @@ NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // Take a matrix and form an NE solver calling a Herm solver
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
-template<class Field> class NormalEquations : public LinearFunction<Field>{
+template<class Field> class NormalEquations {
 private:
  SparseMatrixBase<Field> & _Matrix;
  OperatorFunction<Field> & _HermitianSolver;
@@ -60,7 +60,7 @@ public:
  }     
 };

-template<class Field> class HPDSolver : public LinearFunction<Field> {
+template<class Field> class HPDSolver {
 private:
  LinearOperatorBase<Field> & _Matrix;
  OperatorFunction<Field> & _HermitianSolver;
@@ -78,13 +78,13 @@ public:
  void operator() (const Field &in, Field &out){
 
    _Guess(in,out);
-    _HermitianSolver(_Matrix,in,out);  //M out = in
+    _HermitianSolver(_Matrix,in,out);  // Mdag M out = Mdag in

  }     
 };


-template<class Field> class MdagMSolver : public LinearFunction<Field> {
+template<class Field> class MdagMSolver {
 private:
  SparseMatrixBase<Field> & _Matrix;
  OperatorFunction<Field> & _HermitianSolver;
--- a/Grid/algorithms/iterative/PowerMethod.h
+++ b/Grid/algorithms/iterative/PowerMethod.h
@@ -20,7 +20,7 @@ template<class Field> class PowerMethod
    RealD evalMaxApprox = 0.0; 
    auto src_n = src; 
    auto tmp = src; 
-    const int _MAX_ITER_EST_ = 100; 
+    const int _MAX_ITER_EST_ = 50; 

    for (int i=0;i<_MAX_ITER_EST_;i++) { 
      
@@ -29,8 +29,6 @@ template<class Field> class PowerMethod
      RealD vnum = real(innerProduct(src_n,tmp)); // HermOp. 
      RealD vden = norm2(src_n); 
      RealD na = vnum/vden; 
-
-      std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
      
      if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) { 
 	evalMaxApprox = na; 
--- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
+++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
@@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
 template<class Field>
 class PrecGeneralisedConjugateResidual : public LinearFunction<Field> {
 public:                                                
-  using LinearFunction<Field>::operator();
+
  RealD   Tolerance;
  Integer MaxIterations;
  int verbose;
--- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h
+++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h
@@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
 template<class Field>
 class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
 public:                                                
-  using LinearFunction<Field>::operator();
+
  RealD   Tolerance;
  Integer MaxIterations;
  int verbose;
@@ -119,8 +119,7 @@ public:
  RealD GCRnStep(const Field &src, Field &psi,RealD rsq){

    RealD cp;
-    ComplexD a, b;
-    //    ComplexD zAz;
+    ComplexD a, b, zAz;
    RealD zAAz;
    ComplexD rq;

@@ -147,7 +146,7 @@ public:
    //////////////////////////////////
    MatTimer.Start();
    Linop.Op(psi,Az);
-    //    zAz = innerProduct(Az,psi);
+    zAz = innerProduct(Az,psi);
    zAAz= norm2(Az);
    MatTimer.Stop();
    
@@ -171,7 +170,7 @@ public:

    LinalgTimer.Start();

-    //    zAz = innerProduct(Az,psi);
+    zAz = innerProduct(Az,psi);
    zAAz= norm2(Az);

    //p[0],q[0],qq[0] 
@@ -213,7 +212,7 @@ public:
      MatTimer.Start();
      Linop.Op(z,Az);
      MatTimer.Stop();
-      //      zAz = innerProduct(Az,psi);
+      zAz = innerProduct(Az,psi);
      zAAz= norm2(Az);

      LinalgTimer.Start();
--- a/Grid/algorithms/iterative/SchurRedBlack.h
+++ b/Grid/algorithms/iterative/SchurRedBlack.h
@@ -40,7 +40,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
   *        (-MoeMee^{-1}   1 )   
   * L^{dag} = ( 1       Mee^{-dag} Moe^{dag} )
   *           ( 0       1                    )
-   * L^{-d}  = ( 1      -Mee^{-dag} Moe^{dag} )
+   * L^{-dag}= ( 1      -Mee^{-dag} Moe^{dag} )
   *           ( 0       1                    )
   *
   * U^-1 = (1   -Mee^{-1} Meo)
@@ -82,7 +82,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
   * c) M_oo^-dag Doo^{dag} Doo Moo^-1 phi_0 = M_oo^-dag (D_oo)^dag L^{-1}  eta_o
   *                              eta_o'     = M_oo^-dag (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
   *                              psi_o = M_oo^-1 phi_o
-   * TODO: Deflation 
+   *
+   *
   */
 namespace Grid {

@@ -97,6 +98,7 @@ namespace Grid {
  protected:
    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
    OperatorFunction<Field> & _HermitianRBSolver;
+    
    int CBfactorise;
    bool subGuess;
    bool useSolnAsInitGuess; // if true user-supplied solution vector is used as initial guess for solver
@@ -219,13 +221,20 @@ namespace Grid {
 	/////////////////////////////////////////////////
 	// Check unprec residual if possible
 	/////////////////////////////////////////////////
-	if ( ! subGuess ) {
-	  _Matrix.M(out[b],resid); 
+	if ( ! subGuess ) {	  
+
+	  if ( this->adjoint() ) _Matrix.Mdag(out[b],resid); 
+	  else                   _Matrix.M(out[b],resid); 
+
 	  resid = resid-in[b];
 	  RealD ns = norm2(in[b]);
 	  RealD nr = norm2(resid);
 	
-	  std::cout<<GridLogMessage<< "SchurRedBlackBase solver true unprec resid["<<b<<"] "<<std::sqrt(nr/ns) << std::endl;
+	  std::cout<<GridLogMessage<< "SchurRedBlackBase adjoint "<< this->adjoint() << std::endl;
+	  if ( this->adjoint() ) 
+	    std::cout<<GridLogMessage<< "SchurRedBlackBase adjoint solver true unprec resid["<<b<<"] "<<std::sqrt(nr/ns) << std::endl;
+	  else                   
+	    std::cout<<GridLogMessage<< "SchurRedBlackBase solver true unprec resid["<<b<<"] "<<std::sqrt(nr/ns) << std::endl;
 	} else {
 	  std::cout<<GridLogMessage<< "SchurRedBlackBase Guess subtracted after solve["<<b<<"] " << std::endl;
 	}
@@ -279,12 +288,21 @@ namespace Grid {

      // Verify the unprec residual
      if ( ! subGuess ) {
-        _Matrix.M(out,resid); 
+
+	std::cout<<GridLogMessage<< "SchurRedBlackBase adjoint "<< this->adjoint() << std::endl;
+	
+	if ( this->adjoint() ) _Matrix.Mdag(out,resid); 
+	else                   _Matrix.M(out,resid); 
+
        resid = resid-in;
        RealD ns = norm2(in);
        RealD nr = norm2(resid);

-        std::cout<<GridLogMessage << "SchurRedBlackBase solver true unprec resid "<< std::sqrt(nr/ns) << std::endl;
+	  if ( this->adjoint() ) 
+	    std::cout<<GridLogMessage<< "SchurRedBlackBase adjoint solver true unprec resid "<<std::sqrt(nr/ns) << std::endl;
+	  else                   
+	    std::cout<<GridLogMessage<< "SchurRedBlackBase solver true unprec resid "<<std::sqrt(nr/ns) << std::endl;
+
      } else {
        std::cout << GridLogMessage << "SchurRedBlackBase Guess subtracted after solve." << std::endl;
      }
@@ -293,6 +311,7 @@ namespace Grid {
    /////////////////////////////////////////////////////////////
    // Override in derived. 
    /////////////////////////////////////////////////////////////
+    virtual bool adjoint(void) { return false; }
    virtual void RedBlackSource  (Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)                =0;
    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)          =0;
    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)                           =0;
@@ -646,6 +665,127 @@ namespace Grid {
        this->_HermitianRBSolver(_OpEO, src_o, sol_o); 
      }
  };
+
+  /*
+   * Red black Schur decomposition
+   *
+   *  M = (Mee Meo) =  (1             0 )   (Mee   0               )  (1 Mee^{-1} Meo)
+   *      (Moe Moo)    (Moe Mee^-1    1 )   (0   Moo-Moe Mee^-1 Meo)  (0   1         )
+   *                =         L                     D                     U
+   *
+   * L^-1 = (1              0 )
+   *        (-MoeMee^{-1}   1 )   
+   * L^{dag} = ( 1       Mee^{-dag} Moe^{dag} )
+   *           ( 0       1                    )
+   *
+   * U^-1 = (1   -Mee^{-1} Meo)
+   *        (0    1           )
+   * U^{dag} = ( 1                 0)
+   *           (Meo^dag Mee^{-dag} 1)
+   * U^{-dag} = (  1                 0)
+   *            (-Meo^dag Mee^{-dag} 1)
+   *
+   *
+   ***********************
+   *     M^dag psi = eta
+   ***********************
+   *
+   * Really for Mobius: (Wilson - easier to just use gamma 5 hermiticity)
+   *
+   *    Mdag psi     =         Udag  Ddag  Ldag psi = eta
+   *
+   * U^{-dag} = (  1                 0)
+   *            (-Meo^dag Mee^{-dag} 1)
+   *
+   *
+   * i)                D^dag phi =  (U^{-dag}  eta)
+   *                        eta'_e = eta_e
+   *                        eta'_o = (eta_o - Meo^dag Mee^{-dag} eta_e)
+   * 
+   *      phi_o = D_oo^-dag eta'_o = D_oo^-dag (eta_o - Meo^dag Mee^{-dag} eta_e)
+   *
+   *      phi_e = D_ee^-dag eta'_e = D_ee^-dag eta_e
+   * 
+   * Solve: 
+   *
+   *      D_oo D_oo^dag phi_o = D_oo (eta_o - Meo^dag Mee^{-dag} eta_e)
+   *
+   * ii) 
+   *      phi = L^dag psi => psi = L^-dag phi. 
+   *
+   * L^{-dag} = ( 1      -Mee^{-dag} Moe^{dag} )
+   *            ( 0       1                    )
+   *
+   *   => sol_e = M_ee^-dag * ( src_e - Moe^dag phi_o )...
+   *   => sol_o = phi_o
+   */
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Site diagonal has Mooee on it, but solve the Adjoint system
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class Field> class SchurRedBlackDiagMooeeDagSolve : public SchurRedBlackBase<Field> {
+  public:
+    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+
+    virtual bool adjoint(void) { return true; }
+    SchurRedBlackDiagMooeeDagSolve(OperatorFunction<Field> &HermitianRBSolver,
+				   const bool initSubGuess = false,
+				   const bool _solnAsInitGuess = false)  
+      : SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};
+
+    //////////////////////////////////////////////////////
+    // Override RedBlack specialisation
+    //////////////////////////////////////////////////////
+    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+
+      Field   tmp(grid);
+      Field  Mtmp(grid);
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd ,src_o,src);
+      /////////////////////////////////////////////////////
+      // src_o = (source_o - Moe^dag MeeInvDag source_e)
+      /////////////////////////////////////////////////////
+      _Matrix.MooeeInvDag(src_e,tmp);  assert(  tmp.Checkerboard() ==Even);
+      _Matrix.MeooeDag   (tmp,Mtmp);   assert( Mtmp.Checkerboard() ==Odd);     
+      tmp=src_o-Mtmp;                  assert(  tmp.Checkerboard() ==Odd);     
+
+      // get the right Mpc
+      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
+      _HermOpEO.Mpc(tmp,src_o);     assert(src_o.Checkerboard() ==Odd);
+    }
+    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)
+    {
+      SchurDiagMooeeDagOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
+    };
+    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)
+    {
+      SchurDiagMooeeDagOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
+    }
+    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+
+      Field  sol_e(grid);
+      Field  tmp(grid);
+      ///////////////////////////////////////////////////
+      // sol_e = M_ee^-dag * ( src_e - Moe^dag phi_o )...
+      // sol_o = phi_o
+      ///////////////////////////////////////////////////
+      _Matrix.MeooeDag(sol_o,tmp);      assert(tmp.Checkerboard()==Even);
+      tmp = src_e-tmp;                  assert(tmp.Checkerboard()==Even);
+      _Matrix.MooeeInvDag(tmp,sol_e);   assert(sol_e.Checkerboard()==Even);
+      
+      setCheckerboard(sol,sol_e); assert(  sol_e.Checkerboard() ==Even);
+      setCheckerboard(sol,sol_o); assert(  sol_o.Checkerboard() ==Odd );
+    }
+  };
+
 }

 #endif
--- a/Grid/algorithms/multigrid/Aggregates.h
+++ b/Grid/algorithms/multigrid/Aggregates.h
@@ -1,381 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/Aggregates.h
-
-    Copyright (C) 2015
-
-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#pragma once
-
-NAMESPACE_BEGIN(Grid);
-
-inline RealD AggregatePowerLaw(RealD x)
-{
-  //  return std::pow(x,-4);
-  //  return std::pow(x,-3);
-  return std::pow(x,-5);
-}
-
-template<class Fobj,class CComplex,int nbasis>
-class Aggregation {
-public:
-  typedef iVector<CComplex,nbasis >             siteVector;
-  typedef Lattice<siteVector>                 CoarseVector;
-  typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
-
-  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
-  typedef Lattice<Fobj >        FineField;
-
-  GridBase *CoarseGrid;
-  GridBase *FineGrid;
-  std::vector<Lattice<Fobj> > subspace;
-  int checkerboard;
-  int Checkerboard(void){return checkerboard;}
-  Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) : 
-    CoarseGrid(_CoarseGrid),
-    FineGrid(_FineGrid),
-    subspace(nbasis,_FineGrid),
-    checkerboard(_checkerboard)
-  {
-  };
-  
-  
-  void Orthogonalise(void){
-    CoarseScalar InnerProd(CoarseGrid); 
-    //    std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
-    blockOrthogonalise(InnerProd,subspace);
-  } 
-  void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
-    blockProject(CoarseVec,FineVec,subspace);
-  }
-  void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
-    FineVec.Checkerboard() = subspace[0].Checkerboard();
-    blockPromote(CoarseVec,FineVec,subspace);
-  }
-
-  virtual void CreateSubspaceRandom(GridParallelRNG  &RNG) {
-    int nn=nbasis;
-    RealD scale;
-    FineField noise(FineGrid);
-    for(int b=0;b<nn;b++){
-      subspace[b] = Zero();
-      gaussian(RNG,noise);
-      scale = std::pow(norm2(noise),-0.5); 
-      noise=noise*scale;
-      subspace[b] = noise;
-    }
-  }
-  virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis)
-  {
-
-    RealD scale;
-
-    ConjugateGradient<FineField> CG(1.0e-2,100,false);
-    FineField noise(FineGrid);
-    FineField Mn(FineGrid);
-
-    for(int b=0;b<nn;b++){
-      
-      subspace[b] = Zero();
-      gaussian(RNG,noise);
-      scale = std::pow(norm2(noise),-0.5); 
-      noise=noise*scale;
-      
-      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
-
-      for(int i=0;i<1;i++){
-
-	CG(hermop,noise,subspace[b]);
-
-	noise = subspace[b];
-	scale = std::pow(norm2(noise),-0.5); 
-	noise=noise*scale;
-
-      }
-
-      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
-      subspace[b]   = noise;
-
-    }
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
-  // and this is the best I found
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-
-  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
-				       int nn,
-				       double hi,
-				       double lo,
-				       int orderfilter,
-				       int ordermin,
-				       int orderstep,
-				       double filterlo
-				       ) {
-
-    RealD scale;
-
-    FineField noise(FineGrid);
-    FineField Mn(FineGrid);
-    FineField tmp(FineGrid);
-
-    // New normalised noise
-    gaussian(RNG,noise);
-    scale = std::pow(norm2(noise),-0.5); 
-    noise=noise*scale;
-
-    std::cout << GridLogMessage<<" Chebyshev subspace pass-1 : ord "<<orderfilter<<" ["<<lo<<","<<hi<<"]"<<std::endl;
-    std::cout << GridLogMessage<<" Chebyshev subspace pass-2 : nbasis"<<nn<<" min "
-	      <<ordermin<<" step "<<orderstep
-	      <<" lo"<<filterlo<<std::endl;
-
-    // Initial matrix element
-    hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
-
-    int b =0;
-    {
-      // Filter
-      Chebyshev<FineField> Cheb(lo,hi,orderfilter);
-      Cheb(hermop,noise,Mn);
-      // normalise
-      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
-      subspace[b]   = Mn;
-      hermop.Op(Mn,tmp); 
-      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
-      b++;
-    }
-
-    // Generate a full sequence of Chebyshevs
-    {
-      lo=filterlo;
-      noise=Mn;
-
-      FineField T0(FineGrid); T0 = noise;  
-      FineField T1(FineGrid); 
-      FineField T2(FineGrid);
-      FineField y(FineGrid);
-      
-      FineField *Tnm = &T0;
-      FineField *Tn  = &T1;
-      FineField *Tnp = &T2;
-
-      // Tn=T1 = (xscale M + mscale)in
-      RealD xscale = 2.0/(hi-lo);
-      RealD mscale = -(hi+lo)/(hi-lo);
-      hermop.HermOp(T0,y);
-      T1=y*xscale+noise*mscale;
-
-      for(int n=2;n<=ordermin+orderstep*(nn-2);n++){
-	
-	hermop.HermOp(*Tn,y);
-
-	autoView( y_v , y, AcceleratorWrite);
-	autoView( Tn_v , (*Tn), AcceleratorWrite);
-	autoView( Tnp_v , (*Tnp), AcceleratorWrite);
-	autoView( Tnm_v , (*Tnm), AcceleratorWrite);
-	const int Nsimd = CComplex::Nsimd();
-	accelerator_for(ss, FineGrid->oSites(), Nsimd, {
-	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
-	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
-        });
-
-	// Possible more fine grained control is needed than a linear sweep,
-	// but huge productivity gain if this is simple algorithm and not a tunable
-	int m =1;
-	if ( n>=ordermin ) m=n-ordermin;
-	if ( (m%orderstep)==0 ) { 
-	  Mn=*Tnp;
-	  scale = std::pow(norm2(Mn),-0.5);         Mn=Mn*scale;
-	  subspace[b] = Mn;
-	  hermop.Op(Mn,tmp); 
-	  std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
-	  b++;
-	}
-
-	// Cycle pointers to avoid copies
-	FineField *swizzle = Tnm;
-	Tnm    =Tn;
-	Tn     =Tnp;
-	Tnp    =swizzle;
-	  
-      }
-    }
-    assert(b==nn);
-  }
-  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
-				       int nn,
-				       double hi,
-				       double lo,
-				       int orderfilter
-				       ) {
-
-    RealD scale;
-
-    FineField noise(FineGrid);
-    FineField Mn(FineGrid);
-    FineField tmp(FineGrid);
-
-    // New normalised noise
-    std::cout << GridLogMessage<<" Chebyshev subspace pure noise : ord "<<orderfilter<<" ["<<lo<<","<<hi<<"]"<<std::endl;
-    std::cout << GridLogMessage<<" Chebyshev subspace pure noise  : nbasis "<<nn<<std::endl;
-
-
-    for(int b =0;b<nbasis;b++)
-    {
-      gaussian(RNG,noise);
-      scale = std::pow(norm2(noise),-0.5); 
-      noise=noise*scale;
-
-      // Initial matrix element
-      hermop.Op(noise,Mn);
-      if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
-
-      // Filter
-      Chebyshev<FineField> Cheb(lo,hi,orderfilter);
-      Cheb(hermop,noise,Mn);
-      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
-
-      // Refine
-      Chebyshev<FineField> PowerLaw(lo,hi,1000,AggregatePowerLaw);
-      noise = Mn;
-      PowerLaw(hermop,noise,Mn);
-      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
-
-      // normalise
-      subspace[b]   = Mn;
-      hermop.Op(Mn,tmp); 
-      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
-    }
-
-  }
-
-  virtual void CreateSubspaceChebyshevPowerLaw(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
-					       int nn,
-					       double hi,
-					       int orderfilter
-					       ) {
-
-    RealD scale;
-
-    FineField noise(FineGrid);
-    FineField Mn(FineGrid);
-    FineField tmp(FineGrid);
-
-    // New normalised noise
-    std::cout << GridLogMessage<<" Chebyshev subspace pure noise : ord "<<orderfilter<<" [0,"<<hi<<"]"<<std::endl;
-    std::cout << GridLogMessage<<" Chebyshev subspace pure noise  : nbasis "<<nn<<std::endl;
-
-    for(int b =0;b<nbasis;b++)
-    {
-      gaussian(RNG,noise);
-      scale = std::pow(norm2(noise),-0.5); 
-      noise=noise*scale;
-
-      // Initial matrix element
-      hermop.Op(noise,Mn);
-      if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
-      // Filter
-      Chebyshev<FineField> Cheb(0.0,hi,orderfilter,AggregatePowerLaw);
-      Cheb(hermop,noise,Mn);
-      // normalise
-      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
-      subspace[b]   = Mn;
-      hermop.Op(Mn,tmp); 
-      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
-    }
-
-  }
-
-  virtual void CreateSubspaceMultishift(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
-					double Lo,double tol,int maxit)
-  {
-
-    RealD scale;
-
-    FineField noise(FineGrid);
-    FineField Mn(FineGrid);
-    FineField tmp(FineGrid);
-
-    // New normalised noise
-    std::cout << GridLogMessage<<" Multishift subspace : Lo "<<Lo<<std::endl;
-
-    // Filter
-    // [ 1/6(x+Lo)  - 1/2(x+2Lo) + 1/2(x+3Lo)  -1/6(x+4Lo) = Lo^3 /[ (x+1Lo)(x+2Lo)(x+3Lo)(x+4Lo) ]
-    //
-    // 1/(x+Lo)  - 1/(x+2 Lo)
-    double epsilon      = Lo/3;
-    std::vector<RealD> alpha({1.0/6.0,-1.0/2.0,1.0/2.0,-1.0/6.0});
-    std::vector<RealD> shifts({Lo,Lo+epsilon,Lo+2*epsilon,Lo+3*epsilon});
-    std::vector<RealD> tols({tol,tol,tol,tol});
-    std::cout << "sizes "<<alpha.size()<<" "<<shifts.size()<<" "<<tols.size()<<std::endl;
-
-    MultiShiftFunction msf(4,0.0,95.0);
-    std::cout << "msf constructed "<<std::endl;
-    msf.poles=shifts;
-    msf.residues=alpha;
-    msf.tolerances=tols;
-    msf.norm=0.0;
-    msf.order=alpha.size();
-    ConjugateGradientMultiShift<FineField> MSCG(maxit,msf);
-    
-    for(int b =0;b<nbasis;b++)
-    {
-      gaussian(RNG,noise);
-      scale = std::pow(norm2(noise),-0.5); 
-      noise=noise*scale;
-
-      // Initial matrix element
-      hermop.Op(noise,Mn);
-      if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
-
-      MSCG(hermop,noise,Mn);
-      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
-      subspace[b]   = Mn;
-      hermop.Op(Mn,tmp); 
-      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
-
-    }
-
-  }
-  virtual void RefineSubspace(LinearOperatorBase<FineField> &hermop,
-			      double Lo,double tol,int maxit)
-  {
-    FineField tmp(FineGrid);
-    for(int b =0;b<nbasis;b++)
-    {
-      RealD MirsShift = Lo;
-      ConjugateGradient<FineField>  CGsloppy(tol,maxit,false);
-      ShiftedHermOpLinearOperator<FineField> ShiftedFineHermOp(hermop,MirsShift);
-      CGsloppy(hermop,subspace[b],tmp);
-      subspace[b]=tmp;
-    }
-  }
-
-  
-  
-};
-NAMESPACE_END(Grid);
--- a/Grid/algorithms/multigrid/BatchedBlas.h
+++ b/Grid/algorithms/multigrid/BatchedBlas.h
@@ -1,537 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: BatchedBlas.h
-
-    Copyright (C) 2023
-
-Author: Peter Boyle <pboyle@bnl.gov>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#pragma once
-
-#ifdef GRID_HIP
-#include <hipblas/hipblas.h>
-#endif
-#ifdef GRID_CUDA
-#include <hipblas/hipblas.h>
-#endif
-#ifdef GRID_SYCL
-#error // need oneMKL version
-#endif
-
-///////////////////////////////////////////////////////////////////////	  
-// Need to rearrange lattice data to be in the right format for a
-// batched multiply. Might as well make these static, dense packed
-///////////////////////////////////////////////////////////////////////
-NAMESPACE_BEGIN(Grid);
-#ifdef GRID_HIP
-  typedef hipblasHandle_t gridblasHandle_t;
-#endif
-#ifdef GRID_CUDA
-  typedef cudablasHandle_t gridblasHandle_t;
-#endif
-#ifdef GRID_SYCL
-  typedef int32_t gridblasHandle_t;
-#endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-  typedef int32_t gridblasHandle_t;
-#endif
-
-class GridBLAS {
-public:
-
-  static gridblasHandle_t gridblasHandle;
-  static int            gridblasInit;
-  
-  static void Init(void)
-  {
-    if ( ! gridblasInit ) {
-#ifdef GRID_CUDA
-      std::cout << "cublasCreate"<<std::endl;
-      cublasCreate(&gridblasHandle);
-#endif
-#ifdef GRID_HIP
-      std::cout << "hipblasCreate"<<std::endl;
-      hipblasCreate(&gridblasHandle);
-#endif
-#ifdef GRID_SYCL
-#endif
-    }
-  }
-  
-  // Force construct once
-  GridBLAS() { Init(); };
-  ~GridBLAS() { };
-  
-  /////////////////////////////////////////////////////////////////////////////////////
-  // BLAS GEMM conventions:
-  /////////////////////////////////////////////////////////////////////////////////////
-  // - C = alpha A * B + beta C
-  // Dimensions:
-  // - C_m.n
-  // - A_m.k
-  // - B_k.n
-  // - Flops = 8 M N K
-  // - Bytes = 2*sizeof(word) * (MN+MK+KN)
-  // M=60, N=12
-  // Flop/Byte = 8 . 60.60.12 / (60.12+60.60+60.12)/16 = 4 so expect about 4 TF/s on a GCD
-  /////////////////////////////////////////////////////////////////////////////////////
-  void synchronise(void)
-  {
-#ifdef GRID_HIP
-    auto err = hipDeviceSynchronize();
-    assert(err==hipSuccess);
-#endif
-#ifdef GRID_CUDA
-    auto err = cudaDeviceSynchronize();
-    assert(err==cudaSuccess);
-#endif
-#ifdef GRID_SYCL
-    accelerator_barrier();
-#endif
-  }
-  void benchmark(int nbasis, int nrhs, int coarseVol, int nstencil)
-  {
-    int32_t N_A = nbasis*nbasis*coarseVol*nstencil;
-    int32_t N_B = nbasis*nrhs*coarseVol*nstencil; // One leg of stencil at a time
-    int32_t N_C = nbasis*nrhs*coarseVol*nstencil; 
-    deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
-    deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
-    deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
-    ComplexD alpha(1.0);
-    ComplexD beta (1.0);
-    for(int i=0;i<10;i++){
-      RealD t0 = usecond();
-      for(int s=0;s<nstencil;s++){
-	gemmStridedBatched(nbasis,nrhs,nbasis,
-			   alpha,
-			   &A[0], // m x k 
-			   &B[0], // k x n
-			   beta, 
-			   &C[0], // m x n
-			   coarseVol);
-      }
-      synchronise();
-      RealD t1 = usecond();
-      RealD flops = 8.0*nbasis*nbasis*nrhs*coarseVol*nstencil;
-      RealD bytes = 1.0*sizeof(ComplexD)*(nbasis*nbasis+nbasis*nrhs*3)*coarseVol*nstencil;
-      std::cout << " batched Blas call "<<i<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
-      std::cout << " batched Blas call "<<i<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
-    }
-  }
-
-  void gemmBatched(int m,int n, int k,
-		   ComplexD alpha,
-		   deviceVector<ComplexD*> &Amk,  // pointer list to matrices
-		   deviceVector<ComplexD*> &Bkn,
-		   ComplexD beta,
-		   deviceVector<ComplexD*> &Cmn)
-  {
-    RealD t2=usecond();
-    int32_t batchCount = Amk.size();
-    // Use C-row major storage, so transpose calls
-    int lda = m; // m x k column major
-    int ldb = k; // k x n column major
-    int ldc = m; // m x b column major
-    static deviceVector<ComplexD> alpha_p(1);
-    static deviceVector<ComplexD> beta_p(1);
-    // can prestore the 1 and the zero on device
-    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
-    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
-    RealD t0=usecond();
-    //       std::cout << "hipblasZgemmBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
-    assert(Bkn.size()==batchCount);
-    assert(Cmn.size()==batchCount);
-#ifdef GRID_HIP
-    auto err = hipblasZgemmBatched(gridblasHandle,
-				   HIPBLAS_OP_N,
-				   HIPBLAS_OP_N,
-				   m,n,k,
-				   (hipblasDoubleComplex *) &alpha_p[0],
-				   (hipblasDoubleComplex **)&Amk[0], lda,
-				   (hipblasDoubleComplex **)&Bkn[0], ldb,
-				   (hipblasDoubleComplex *) &beta_p[0],
-				   (hipblasDoubleComplex **)&Cmn[0], ldc,
-				   batchCount);
-    //	 std::cout << " hipblas return code " <<(int)err<<std::endl;
-    assert(err==HIPBLAS_STATUS_SUCCESS);
-#endif
-#ifdef GRID_CUDA
-    auto err = cublasZgemmBatched(gridblasHandle,
-				  CUBLAS_OP_N,
-				  CUBLAS_OP_N,
-				  m,n,k,
-				  (cuDoubleComplex *) &alpha_p[0],
-				  (cuDoubleComplex **)&Amk[0], lda,
-				  (cuDoubleComplex **)&Bkn[0], ldb,
-				  (cuDoubleComplex *) &beta_p[0],
-				  (cuDoubleComplex **)&Cmn[0], ldc,
-				  batchCount);
-    assert(err==CUBLAS_STATUS_SUCCESS);
-#endif
-#ifdef GRID_SYCL
-    //MKL’s cblas_<T>gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
-#endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    // Need a default/reference implementation
-    for (int p = 0; p < batchCount; ++p) {
-      for (int mm = 0; mm < m; ++mm) {
-	for (int nn = 0; nn < n; ++nn) {
-	  ComplexD c_mn(0.0);
-	  for (int kk = 0; kk < k, ++kk)
-	    c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
-	  Cmn[mm + nn*ldc + p*sdc] =  (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc];
-	}
-      }
-    }
-#endif
-     RealD t1=usecond();
-     RealD flops = 8.0*m*n*k*batchCount;
-     RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
-     //     std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
-     //     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
-     //     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
-  }
-
-  void gemmBatched(int m,int n, int k,
-		   ComplexF alpha,
-		   deviceVector<ComplexF*> &Amk,  // pointer list to matrices
-		   deviceVector<ComplexF*> &Bkn,
-		   ComplexF beta,
-		   deviceVector<ComplexF*> &Cmn)
-  {
-    RealD t2=usecond();
-    int32_t batchCount = Amk.size();
-    // Use C-row major storage, so transpose calls
-    int lda = m; // m x k column major
-    int ldb = k; // k x n column major
-    int ldc = m; // m x b column major
-    static deviceVector<ComplexF> alpha_p(1);
-    static deviceVector<ComplexF> beta_p(1);
-    // can prestore the 1 and the zero on device
-    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF));
-    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
-    RealD t0=usecond();
-    //       std::cout << "hipblasZgemmBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
-    assert(Bkn.size()==batchCount);
-    assert(Cmn.size()==batchCount);
-#ifdef GRID_HIP
-    auto err = hipblasCgemmBatched(gridblasHandle,
-				   HIPBLAS_OP_N,
-				   HIPBLAS_OP_N,
-				   m,n,k,
-				   (hipblasComplex *) &alpha_p[0],
-				   (hipblasComplex **)&Amk[0], lda,
-				   (hipblasComplex **)&Bkn[0], ldb,
-				   (hipblasComplex *) &beta_p[0],
-				   (hipblasComplex **)&Cmn[0], ldc,
-				   batchCount);
-    //	 std::cout << " hipblas return code " <<(int)err<<std::endl;
-    assert(err==HIPBLAS_STATUS_SUCCESS);
-#endif
-#ifdef GRID_CUDA
-    auto err = cublasCgemmBatched(gridblasHandle,
-				  CUBLAS_OP_N,
-				  CUBLAS_OP_N,
-				  m,n,k,
-				  (cuComplex *) &alpha_p[0],
-				  (cuComplex **)&Amk[0], lda,
-				  (cuComplex **)&Bkn[0], ldb,
-				  (cuComplex *) &beta_p[0],
-				  (cuComplex **)&Cmn[0], ldc,
-				  batchCount);
-    assert(err==CUBLAS_STATUS_SUCCESS);
-#endif
-#ifdef GRID_SYCL
-    //MKL’s cblas_<T>gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
-#endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    // Need a default/reference implementation
-    for (int p = 0; p < batchCount; ++p) {
-      for (int mm = 0; mm < m; ++mm) {
-	for (int nn = 0; nn < n; ++nn) {
-	  ComplexD c_mn(0.0);
-	  for (int kk = 0; kk < k, ++kk)
-	    c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
-	  Cmn[mm + nn*ldc + p*sdc] =  (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc];
-	}
-      }
-    }
-#endif
-     RealD t1=usecond();
-     RealD flops = 8.0*m*n*k*batchCount;
-     RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
-     //     std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
-     //     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
-     //     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
-  }
-  
-  ///////////////////////////////////////////////////////////////////////////
-  // Single precision real GEMM
-  ///////////////////////////////////////////////////////////////////////////
-
-  void gemmBatched(int m,int n, int k,
-		   RealF alpha,
-		   deviceVector<RealF*> &Amk,  // pointer list to matrices
-		   deviceVector<RealF*> &Bkn,
-		   RealF beta,
-		   deviceVector<RealF*> &Cmn)
-  {
-    RealD t2=usecond();
-    int32_t batchCount = Amk.size();
-    // Use C-row major storage, so transpose calls
-    int lda = m; // m x k column major
-    int ldb = k; // k x n column major
-    int ldc = m; // m x b column major
-    static deviceVector<RealF> alpha_p(1);
-    static deviceVector<RealF> beta_p(1);
-    // can prestore the 1 and the zero on device
-    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF));
-    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
-    RealD t0=usecond();
-    //       std::cout << "hipblasZgemmBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
-    assert(Bkn.size()==batchCount);
-    assert(Cmn.size()==batchCount);
-#ifdef GRID_HIP
-    auto err = hipblasSgemmBatched(gridblasHandle,
-				   HIPBLAS_OP_N,
-				   HIPBLAS_OP_N,
-				   m,n,k,
-				   (float *) &alpha_p[0],
-				   (float **)&Amk[0], lda,
-				   (float **)&Bkn[0], ldb,
-				   (float *) &beta_p[0],
-				   (float **)&Cmn[0], ldc,
-				   batchCount);
-    assert(err==HIPBLAS_STATUS_SUCCESS);
-#endif
-#ifdef GRID_CUDA
-    auto err = cublasSgemmBatched(gridblasHandle,
-				  CUBLAS_OP_N,
-				  CUBLAS_OP_N,
-				  m,n,k,
-				  (float *) &alpha_p[0],
-				  (float **)&Amk[0], lda,
-				  (float **)&Bkn[0], ldb,
-				  (float *) &beta_p[0],
-				  (float **)&Cmn[0], ldc,
-				  batchCount);
-    assert(err==CUBLAS_STATUS_SUCCESS);
-#endif
-#ifdef GRID_SYCL
-    //MKL’s cblas_<T>gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
-#endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    // Need a default/reference implementation
-    for (int p = 0; p < batchCount; ++p) {
-      for (int mm = 0; mm < m; ++mm) {
-	for (int nn = 0; nn < n; ++nn) {
-	  RealD c_mn(0.0);
-	  for (int kk = 0; kk < k, ++kk)
-	    c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
-	  Cmn[mm + nn*ldc + p*sdc] =  (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc];
-	}
-      }
-    }
-#endif
-     RealD t1=usecond();
-     RealD flops = 2.0*m*n*k*batchCount;
-     RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount;
-     //     std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
-     //     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
-     //     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
-  }
-  
-  
-  ///////////////////////////////////////////////////////////////////////////
-  // Double precision real GEMM
-  ///////////////////////////////////////////////////////////////////////////
-
-  void gemmBatched(int m,int n, int k,
-		   RealD alpha,
-		   deviceVector<RealD*> &Amk,  // pointer list to matrices
-		   deviceVector<RealD*> &Bkn,
-		   RealD beta,
-		   deviceVector<RealD*> &Cmn)
-  {
-    RealD t2=usecond();
-    int32_t batchCount = Amk.size();
-    // Use C-row major storage, so transpose calls
-    int lda = m; // m x k column major
-    int ldb = k; // k x n column major
-    int ldc = m; // m x b column major
-    static deviceVector<RealD> alpha_p(1);
-    static deviceVector<RealD> beta_p(1);
-    // can prestore the 1 and the zero on device
-    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD));
-    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
-    RealD t0=usecond();
-    //       std::cout << "hipblasZgemmBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
-    assert(Bkn.size()==batchCount);
-    assert(Cmn.size()==batchCount);
-#ifdef GRID_HIP
-    auto err = hipblasDgemmBatched(gridblasHandle,
-				   HIPBLAS_OP_N,
-				   HIPBLAS_OP_N,
-				   m,n,k,
-				   (double *) &alpha_p[0],
-				   (double **)&Amk[0], lda,
-				   (double **)&Bkn[0], ldb,
-				   (double *) &beta_p[0],
-				   (double **)&Cmn[0], ldc,
-				   batchCount);
-    assert(err==HIPBLAS_STATUS_SUCCESS);
-#endif
-#ifdef GRID_CUDA
-    auto err = cublasDgemmBatched(gridblasHandle,
-				  CUBLAS_OP_N,
-				  CUBLAS_OP_N,
-				  m,n,k,
-				  (double *) &alpha_p[0],
-				  (double **)&Amk[0], lda,
-				  (double **)&Bkn[0], ldb,
-				  (double *) &beta_p[0],
-				  (double **)&Cmn[0], ldc,
-				  batchCount);
-    assert(err==CUBLAS_STATUS_SUCCESS);
-#endif
-#ifdef GRID_SYCL
-    /*
-      int64_t m64=m;
-      int64_t n64=n;
-      int64_t k64=k;
-      int64_t batchCount64=batchCount;
-      oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator,
-      onemkl::transpose::N,
-      onemkl::transpose::N,
-      &m64,&n64,&k64,
-      (double *) &alpha_p[0],
-      (double **)&Amk[0], lda,
-      (double **)&Bkn[0], ldb,
-      (double *) &beta_p[0],
-      (double **)&Cmn[0], ldc,
-      1,&batchCount64);
-     */
-    //MKL’s cblas_<T>gemm_batch & OneAPI
-#warning "oneMKL implementation not built "
-#endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    // Need a default/reference implementation
-    for (int p = 0; p < batchCount; ++p) {
-      for (int mm = 0; mm < m; ++mm) {
-	for (int nn = 0; nn < n; ++nn) {
-	  RealD c_mn(0.0);
-	  for (int kk = 0; kk < k, ++kk)
-	    c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
-	  Cmn[mm + nn*ldc + p*sdc] =  (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc];
-	}
-      }
-    }
-#endif
-     RealD t1=usecond();
-     RealD flops = 2.0*m*n*k*batchCount;
-     RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
-     //     std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
-     //     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
-     //     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
-  }
-  
-
-  
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  // Strided case used by benchmark, but generally unused in Grid
-  // Keep a code example in double complex, but don't generate the single and real variants for now
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  
-  void gemmStridedBatched(int m,int n, int k,
-			  ComplexD alpha,
-			  ComplexD* Amk,  // pointer list to matrices
-			  ComplexD* Bkn,
-			  ComplexD beta,
-			  ComplexD* Cmn,
-			  int batchCount)
-  {
-    // Use C-row major storage, so transpose calls
-    int lda = m; // m x k column major
-    int ldb = k; // k x n column major
-    int ldc = m; // m x b column major
-    int sda = m*k;
-    int sdb = k*n;
-    int sdc = m*n;
-    deviceVector<ComplexD> alpha_p(1);
-    deviceVector<ComplexD> beta_p(1);
-    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
-    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
-    std::cout << "blasZgemmStridedBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
-    std::cout << "blasZgemmStridedBatched ld   "<<lda<<","<<ldb<<","<<ldc<<std::endl;
-    std::cout << "blasZgemmStridedBatched sd   "<<sda<<","<<sdb<<","<<sdc<<std::endl;
-#ifdef GRID_HIP
-    auto err = hipblasZgemmStridedBatched(gridblasHandle,
-					  HIPBLAS_OP_N,
-					  HIPBLAS_OP_N,
-					  m,n,k,
-					  (hipblasDoubleComplex *) &alpha_p[0],
-					  (hipblasDoubleComplex *) Amk, lda, sda,
-					  (hipblasDoubleComplex *) Bkn, ldb, sdb,
-					  (hipblasDoubleComplex *) &beta_p[0],
-					  (hipblasDoubleComplex *) Cmn, ldc, sdc,
-					  batchCount);
-    assert(err==HIPBLAS_STATUS_SUCCESS);
-#endif
-#ifdef GRID_CUDA
-    cublasZgemmStridedBatched(gridblasHandle,
-			      CUBLAS_OP_N,
-			      CUBLAS_OP_N,
-			      m,n,k,
-			      (cuDoubleComplex *) &alpha_p[0],
-			      (cuDoubleComplex *) Amk, lda, sda,
-			      (cuDoubleComplex *) Bkn, ldb, sdb,
-			      (cuDoubleComplex *) &beta_p[0],
-			      (cuDoubleComplex *) Cmn, ldc, sdc,
-			      batchCount);
-#endif
-#ifdef GRID_SYCL
-     #warning "oneMKL implementation not made "
-#endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-     // Need a default/reference implementation
-     for (int p = 0; p < batchCount; ++p) {
-       for (int mm = 0; mm < m; ++mm) {
-	 for (int nn = 0; nn < n; ++nn) {
-	   ComplexD c_mn(0.0);
-	   for (int kk = 0; kk < k, ++kk)
-	     c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
-	   Cmn[mm + nn*ldc + p*sdc] =  (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc];
-	 }
-       }
-     }
-#endif
-  }
-
-
-
-
-};
-
-NAMESPACE_END(Grid);
--- a/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
+++ b/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
@@ -1,467 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/GeneralCoarsenedMatrix.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pboyle@bnl.gov>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#pragma once
-
-#include <Grid/qcd/QCD.h> // needed for Dagger(Yes|No), Inverse(Yes|No)
-
-#include <Grid/lattice/PaddedCell.h>
-#include <Grid/stencil/GeneralLocalStencil.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// Fine Object == (per site) type of fine field
-// nbasis      == number of deflation vectors
-template<class Fobj,class CComplex,int nbasis>
-class GeneralCoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > >  {
-public:
-
-  typedef GeneralCoarsenedMatrix<Fobj,CComplex,nbasis> GeneralCoarseOp;
-  typedef iVector<CComplex,nbasis >           siteVector;
-  typedef iMatrix<CComplex,nbasis >           siteMatrix;
-  typedef Lattice<iScalar<CComplex> >         CoarseComplexField;
-  typedef Lattice<siteVector>                 CoarseVector;
-  typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
-  typedef iMatrix<CComplex,nbasis >  Cobj;
-  typedef iVector<CComplex,nbasis >  Cvec;
-  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
-  typedef Lattice<Fobj >        FineField;
-  typedef Lattice<CComplex >    FineComplexField;
-  typedef CoarseVector Field;
-  ////////////////////
-  // Data members
-  ////////////////////
-  int hermitian;
-  GridBase      *       _FineGrid; 
-  GridCartesian *       _CoarseGrid; 
-  NonLocalStencilGeometry &geom;
-  PaddedCell Cell;
-  GeneralLocalStencil Stencil;
-  
-  std::vector<CoarseMatrix> _A;
-  std::vector<CoarseMatrix> _Adag;
-  std::vector<CoarseVector> MultTemporaries;
-
-  ///////////////////////
-  // Interface
-  ///////////////////////
-  GridBase      * Grid(void)           { return _CoarseGrid; };   // this is all the linalg routines need to know
-  GridBase      * FineGrid(void)       { return _FineGrid; };   // this is all the linalg routines need to know
-  GridCartesian * CoarseGrid(void)     { return _CoarseGrid; };   // this is all the linalg routines need to know
-
-  void ShiftMatrix(RealD shift)
-  {
-    int Nd=_FineGrid->Nd(); 
-    Coordinate zero_shift(Nd,0);
-    for(int p=0;p<geom.npoint;p++){
-      if ( zero_shift==geom.shifts[p] ) {
-	_A[p] = _A[p]+shift;
-	_Adag[p] = _Adag[p]+shift;
-      }
-    }    
-  }
-  void ProjectNearestNeighbour(RealD shift, GeneralCoarseOp &CopyMe)
-  {
-    int nfound=0;
-    std::cout << GridLogMessage <<"GeneralCoarsenedMatrix::ProjectNearestNeighbour "<< CopyMe._A[0].Grid()<<std::endl;
-    for(int p=0;p<geom.npoint;p++){
-      for(int pp=0;pp<CopyMe.geom.npoint;pp++){
- 	// Search for the same relative shift
-	// Avoids brutal handling of Grid pointers
-	if ( CopyMe.geom.shifts[pp]==geom.shifts[p] ) {
-	  _A[p] = CopyMe.Cell.Extract(CopyMe._A[pp]);
-	  _Adag[p] = CopyMe.Cell.Extract(CopyMe._Adag[pp]);
-	  nfound++;
-	}
-      }
-    }
-    assert(nfound==geom.npoint);
-    ExchangeCoarseLinks();
-  }
-  
-  GeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom,GridBase *FineGrid, GridCartesian * CoarseGrid)
-    : geom(_geom),
-      _FineGrid(FineGrid),
-      _CoarseGrid(CoarseGrid),
-      hermitian(1),
-      Cell(_geom.Depth(),_CoarseGrid),
-      Stencil(Cell.grids.back(),geom.shifts)
-  {
-    {
-      int npoint = _geom.npoint;
-    }
-    _A.resize(geom.npoint,CoarseGrid);
-    _Adag.resize(geom.npoint,CoarseGrid);
-  }
-  void M (const CoarseVector &in, CoarseVector &out)
-  {
-    Mult(_A,in,out);
-  }
-  void Mdag (const CoarseVector &in, CoarseVector &out)
-  {
-    if ( hermitian ) M(in,out);
-    else Mult(_Adag,in,out);
-  }
-  void Mult (std::vector<CoarseMatrix> &A,const CoarseVector &in, CoarseVector &out)
-  {
-    RealD tviews=0;    RealD ttot=0;    RealD tmult=0;   RealD texch=0;    RealD text=0; RealD ttemps=0; RealD tcopy=0;
-    RealD tmult2=0;
-
-    ttot=-usecond();
-    conformable(CoarseGrid(),in.Grid());
-    conformable(in.Grid(),out.Grid());
-    out.Checkerboard() = in.Checkerboard();
-    CoarseVector tin=in;
-
-    texch-=usecond();
-    CoarseVector pin = Cell.ExchangePeriodic(tin);
-    texch+=usecond();
-
-    CoarseVector pout(pin.Grid());
-
-    int npoint = geom.npoint;
-    typedef LatticeView<Cobj> Aview;
-    typedef LatticeView<Cvec> Vview;
-      
-    const int Nsimd = CComplex::Nsimd();
-    
-    int64_t osites=pin.Grid()->oSites();
-
-    RealD flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
-    RealD bytes = 1.0*osites*sizeof(siteMatrix)*npoint
-                + 2.0*osites*sizeof(siteVector)*npoint;
-      
-    {
-      tviews-=usecond();
-      autoView( in_v , pin, AcceleratorRead);
-      autoView( out_v , pout, AcceleratorWriteDiscard);
-      autoView( Stencil_v  , Stencil, AcceleratorRead);
-      tviews+=usecond();
-
-      // Static and prereserve to keep UVM region live and not resized across multiple calls
-      ttemps-=usecond();
-      MultTemporaries.resize(npoint,pin.Grid());       
-      ttemps+=usecond();
-      std::vector<Aview> AcceleratorViewContainer_h;
-      std::vector<Vview> AcceleratorVecViewContainer_h; 
-
-      tviews-=usecond();
-      for(int p=0;p<npoint;p++) {
-	AcceleratorViewContainer_h.push_back(      A[p].View(AcceleratorRead));
-	AcceleratorVecViewContainer_h.push_back(MultTemporaries[p].View(AcceleratorWrite));
-      }
-      tviews+=usecond();
-
-      static deviceVector<Aview> AcceleratorViewContainer; AcceleratorViewContainer.resize(npoint);
-      static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint); 
-      
-      auto Aview_p = &AcceleratorViewContainer[0];
-      auto Vview_p = &AcceleratorVecViewContainer[0];
-      tcopy-=usecond();
-      acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview));
-      acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview));
-      tcopy+=usecond();
-
-      tmult-=usecond();
-      accelerator_for(spb, osites*nbasis*npoint, Nsimd, {
-	  typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
-	  int32_t ss   = spb/(nbasis*npoint);
-	  int32_t bp   = spb%(nbasis*npoint);
-	  int32_t point= bp/nbasis;
-	  int32_t b    = bp%nbasis;
-	  auto SE  = Stencil_v.GetEntry(point,ss);
-	  auto nbr = coalescedReadGeneralPermute(in_v[SE->_offset],SE->_permute,Nd);
-	  auto res = coalescedRead(Aview_p[point][ss](0,b))*nbr(0);
-	  for(int bb=1;bb<nbasis;bb++) {
-	    res = res + coalescedRead(Aview_p[point][ss](bb,b))*nbr(bb);
-	  }
-	  coalescedWrite(Vview_p[point][ss](b),res);
-      });
-      tmult2-=usecond();
-      accelerator_for(sb, osites*nbasis, Nsimd, {
-	  int ss = sb/nbasis;
-	  int b  = sb%nbasis;
-	  auto res = coalescedRead(Vview_p[0][ss](b));
-	  for(int point=1;point<npoint;point++){
-	    res = res + coalescedRead(Vview_p[point][ss](b));
-	  }
-	  coalescedWrite(out_v[ss](b),res);
-      });
-      tmult2+=usecond();
-      tmult+=usecond();
-      for(int p=0;p<npoint;p++) {
-	AcceleratorViewContainer_h[p].ViewClose();
-	AcceleratorVecViewContainer_h[p].ViewClose();
-      }
-    }
-
-    text-=usecond();
-    out = Cell.Extract(pout);
-    text+=usecond();
-    ttot+=usecond();
-    
-    std::cout << GridLogPerformance<<"Coarse 1rhs Mult Aviews "<<tviews<<" us"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
-    std::cout << GridLogPerformance<<" of which mult2  "<<tmult2<<" us"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Mult ext  "<<text<<" us"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Mult temps "<<ttemps<<" us"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Mult copy  "<<tcopy<<" us"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Mult tot  "<<ttot<<" us"<<std::endl;
-    //    std::cout << GridLogPerformance<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Kernel flops "<< flops<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Kernel bytes/s "<< bytes/tmult<<" MB/s"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse total bytes   "<< bytes/1e6<<" MB"<<std::endl;
-
-  };
-  
-  void PopulateAdag(void)
-  {
-    for(int64_t bidx=0;bidx<CoarseGrid()->gSites() ;bidx++){
-      Coordinate bcoor;
-      CoarseGrid()->GlobalIndexToGlobalCoor(bidx,bcoor);
-      
-      for(int p=0;p<geom.npoint;p++){
-	Coordinate scoor = bcoor;
-	for(int mu=0;mu<bcoor.size();mu++){
-	  int L = CoarseGrid()->GlobalDimensions()[mu];
-	  scoor[mu] = (bcoor[mu] - geom.shifts[p][mu] + L) % L; // Modulo arithmetic
-	}
-	// Flip to poke/peekLocalSite and not too bad
-	auto link = peekSite(_A[p],scoor);
-	int pp = geom.Reverse(p);
-	pokeSite(adj(link),_Adag[pp],bcoor);
-      }
-    }
-  }
-  /////////////////////////////////////////////////////////////
-  // 
-  // A) Only reduced flops option is to use a padded cell of depth 4
-  // and apply MpcDagMpc in the padded cell.
-  //
-  // Makes for ONE application of MpcDagMpc per vector instead of 30 or 80.
-  // With the effective cell size around (B+8)^4 perhaps 12^4/4^4 ratio
-  // Cost is 81x more, same as stencil size.
-  //
-  // But: can eliminate comms and do as local dirichlet.
-  //
-  // Local exchange gauge field once.
-  // Apply to all vectors, local only computation.
-  // Must exchange ghost subcells in reverse process of PaddedCell to take inner products
-  //
-  // B) Can reduce cost: pad by 1, apply Deo      (4^4+6^4+8^4+8^4 )/ (4x 4^4)
-  //                     pad by 2, apply Doe
-  //                     pad by 3, apply Deo
-  //                     then break out 8x directions; cost is ~10x MpcDagMpc per vector
-  //
-  // => almost factor of 10 in setup cost, excluding data rearrangement
-  //
-  // Intermediates -- ignore the corner terms, leave approximate and force Hermitian
-  // Intermediates -- pad by 2 and apply 1+8+24 = 33 times.
-  /////////////////////////////////////////////////////////////
-
-    //////////////////////////////////////////////////////////
-    // BFM HDCG style approach: Solve a system of equations to get Aij
-    //////////////////////////////////////////////////////////
-    /*
-     *     Here, k,l index which possible shift within the 3^Nd "ball" connected by MdagM.
-     *
-     *     conj(phases[block]) proj[k][ block*Nvec+j ] =  \sum_ball  e^{i q_k . delta} < phi_{block,j} | MdagM | phi_{(block+delta),i} > 
-     *                                                 =  \sum_ball e^{iqk.delta} A_ji
-     *
-     *     Must invert matrix M_k,l = e^[i q_k . delta_l]
-     *
-     *     Where q_k = delta_k . (2*M_PI/global_nb[mu])
-     */
-  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
-		       Aggregation<Fobj,CComplex,nbasis> & Subspace)
-  {
-    std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
-    GridBase *grid = FineGrid();
-
-    RealD tproj=0.0;
-    RealD teigen=0.0;
-    RealD tmat=0.0;
-    RealD tphase=0.0;
-    RealD tphaseBZ=0.0;
-    RealD tinv=0.0;
-
-    /////////////////////////////////////////////////////////////
-    // Orthogonalise the subblocks over the basis
-    /////////////////////////////////////////////////////////////
-    CoarseScalar InnerProd(CoarseGrid()); 
-    blockOrthogonalise(InnerProd,Subspace.subspace);
-
-    const int npoint = geom.npoint;
-      
-    Coordinate clatt = CoarseGrid()->GlobalDimensions();
-    int Nd = CoarseGrid()->Nd();
-
-      /*
-       *     Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
-       *     Matrix index i is mapped to this shift via 
-       *               geom.shifts[i]
-       *
-       *     conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block] 
-       *       =  \sum_{l in ball}  e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} > 
-       *       =  \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
-       *       = M_{kl} A_ji^{b.b+l}
-       *
-       *     Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
-       *  
-       *     Where q_k = delta_k . (2*M_PI/global_nb[mu])
-       *
-       *     Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
-       */
-    teigen-=usecond();
-    Eigen::MatrixXcd Mkl    = Eigen::MatrixXcd::Zero(npoint,npoint);
-    Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
-    ComplexD ci(0.0,1.0);
-    for(int k=0;k<npoint;k++){ // Loop over momenta
-
-      for(int l=0;l<npoint;l++){ // Loop over nbr relative
-	ComplexD phase(0.0,0.0);
-	for(int mu=0;mu<Nd;mu++){
-	  RealD TwoPiL =  M_PI * 2.0/ clatt[mu];
-	  phase=phase+TwoPiL*geom.shifts[k][mu]*geom.shifts[l][mu];
-	}
-	phase=exp(phase*ci);
-	Mkl(k,l) = phase;
-      }
-    }
-    invMkl = Mkl.inverse();
-    teigen+=usecond();
-
-    ///////////////////////////////////////////////////////////////////////
-    // Now compute the matrix elements of linop between the orthonormal
-    // set of vectors.
-    ///////////////////////////////////////////////////////////////////////
-    FineField phaV(grid); // Phased block basis vector
-    FineField MphaV(grid);// Matrix applied
-    std::vector<FineComplexField> phaF(npoint,grid);
-    std::vector<CoarseComplexField> pha(npoint,CoarseGrid());
-    
-    CoarseVector coarseInner(CoarseGrid());
-    
-    typedef typename CComplex::scalar_type SComplex;
-    FineComplexField one(grid); one=SComplex(1.0);
-    FineComplexField zz(grid); zz = Zero();
-    tphase=-usecond();
-    for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
-      /////////////////////////////////////////////////////
-      // Stick a phase on every block
-      /////////////////////////////////////////////////////
-      CoarseComplexField coor(CoarseGrid());
-      pha[p]=Zero();
-      for(int mu=0;mu<Nd;mu++){
-	LatticeCoordinate(coor,mu);
-	RealD TwoPiL =  M_PI * 2.0/ clatt[mu];
-	pha[p] = pha[p] + (TwoPiL * geom.shifts[p][mu]) * coor;
-      }
-      pha[p]  =exp(pha[p]*ci);
-
-      blockZAXPY(phaF[p],pha[p],one,zz);
-      
-    }
-    tphase+=usecond();
-    
-    std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid());
-    std::vector<CoarseVector>          FT(npoint,CoarseGrid());
-    for(int i=0;i<nbasis;i++){// Loop over basis vectors
-      std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
-      for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
-	tphaseBZ-=usecond();
-	phaV = phaF[p]*Subspace.subspace[i];
-	tphaseBZ+=usecond();
-
-	/////////////////////////////////////////////////////////////////////
-	// Multiple phased subspace vector by matrix and project to subspace
-	// Remove local bulk phase to leave relative phases
-	/////////////////////////////////////////////////////////////////////
-	tmat-=usecond();
-	linop.Op(phaV,MphaV);
-	tmat+=usecond();
-
-	tproj-=usecond();
-	blockProjectFast(coarseInner,MphaV,Subspace.subspace);
-	coarseInner = conjugate(pha[p]) * coarseInner;
-
-	ComputeProj[p] = coarseInner;
-	tproj+=usecond();
-
-      }
-
-      tinv-=usecond();
-      for(int k=0;k<npoint;k++){
-	FT[k] = Zero();
-	for(int l=0;l<npoint;l++){
-	  FT[k]= FT[k]+ invMkl(l,k)*ComputeProj[l];
-	}
-      
-	int osites=CoarseGrid()->oSites();
-	autoView( A_v  , _A[k], AcceleratorWrite);
-	autoView( FT_v  , FT[k], AcceleratorRead);
-	accelerator_for(sss, osites, 1, {
-	    for(int j=0;j<nbasis;j++){
-	      A_v[sss](i,j) = FT_v[sss](j);
-	    }
-        });
-      }
-      tinv+=usecond();
-    }
-
-    // Only needed if nonhermitian
-    if ( ! hermitian ) {
-      std::cout << GridLogMessage<<"PopulateAdag  "<<std::endl;
-      PopulateAdag();
-    }
-
-    // Need to write something to populate Adag from A
-    ExchangeCoarseLinks();
-    std::cout << GridLogMessage<<"CoarsenOperator eigen  "<<teigen<<" us"<<std::endl;
-    std::cout << GridLogMessage<<"CoarsenOperator phase  "<<tphase<<" us"<<std::endl;
-    std::cout << GridLogMessage<<"CoarsenOperator phaseBZ "<<tphaseBZ<<" us"<<std::endl;
-    std::cout << GridLogMessage<<"CoarsenOperator mat    "<<tmat <<" us"<<std::endl;
-    std::cout << GridLogMessage<<"CoarsenOperator proj   "<<tproj<<" us"<<std::endl;
-    std::cout << GridLogMessage<<"CoarsenOperator inv    "<<tinv<<" us"<<std::endl;
-  }
-  void ExchangeCoarseLinks(void){
-    for(int p=0;p<geom.npoint;p++){
-      _A[p] = Cell.ExchangePeriodic(_A[p]);
-      _Adag[p]= Cell.ExchangePeriodic(_Adag[p]);
-    }
-  }
-  virtual  void Mdiag    (const Field &in, Field &out){ assert(0);};
-  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp){assert(0);};
-  virtual  void MdirAll  (const Field &in, std::vector<Field> &out){assert(0);};
-};
-
-
-  
-NAMESPACE_END(Grid);
--- a/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h
+++ b/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h
@@ -1,402 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/GeneralCoarsenedMatrixMultiRHS.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pboyle@bnl.gov>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#pragma once
-
-#include <Grid/algorithms/multigrid/BatchedBlas.h>
-
-NAMESPACE_BEGIN(Grid);
-
-
-// Move this to accelerator.h
-// Also give a copy device.
-// Rename acceleratorPut
-// Rename acceleratorGet
-template<class T> void deviceSet(T& dev,T&host)
-{
-  acceleratorCopyToDevice(&host,&dev,sizeof(T));
-}
-template<class T> T deviceGet(T& dev)
-{
-  T host;
-  acceleratorCopyFromDevice(&dev,&host,sizeof(T));
-  return host;
-}
-
-// Fine Object == (per site) type of fine field
-// nbasis      == number of deflation vectors
-template<class Fobj,class CComplex,int nbasis>
-class MultiGeneralCoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > >  {
-public:
-  typedef typename CComplex::scalar_object SComplex;
-  typedef GeneralCoarsenedMatrix<Fobj,CComplex,nbasis> GeneralCoarseOp;
-  typedef MultiGeneralCoarsenedMatrix<Fobj,CComplex,nbasis> MultiGeneralCoarseOp;
-
-  typedef iVector<CComplex,nbasis >           siteVector;
-  typedef iMatrix<CComplex,nbasis >           siteMatrix;
-  typedef iVector<SComplex,nbasis >           calcVector;
-  typedef iMatrix<SComplex,nbasis >           calcMatrix;
-  typedef Lattice<iScalar<CComplex> >         CoarseComplexField;
-  typedef Lattice<siteVector>                 CoarseVector;
-  typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
-  typedef iMatrix<CComplex,nbasis >  Cobj;
-  typedef iVector<CComplex,nbasis >  Cvec;
-  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
-  typedef Lattice<Fobj >        FineField;
-  typedef CoarseVector Field;
-
-  ////////////////////
-  // Data members
-  ////////////////////
-  GridCartesian *       _CoarseGridMulti; 
-  GridCartesian *       _CoarseGrid;
-  GeneralCoarseOp &     _Op;
-  NonLocalStencilGeometry geom;
-  PaddedCell Cell;
-  GeneralLocalStencil Stencil;
-
-  deviceVector<calcVector> BLAS_B;
-  deviceVector<calcVector> BLAS_C;
-  std::vector<deviceVector<calcMatrix> > BLAS_A;
-
-  std::vector<deviceVector<ComplexD *> > BLAS_AP;
-  std::vector<deviceVector<ComplexD *> > BLAS_BP;
-  deviceVector<ComplexD *>               BLAS_CP;
-
-  ///////////////////////
-  // Interface
-  ///////////////////////
-  GridBase      * Grid(void)           { return _CoarseGridMulti; };   // this is all the linalg routines need to know
-  GridCartesian * CoarseGrid(void)     { return _CoarseGridMulti; };   // this is all the linalg routines need to know
-
-  MultiGeneralCoarsenedMatrix(GeneralCoarseOp & Op,GridCartesian *CoarseGridMulti) :
-    _Op(Op),
-    _CoarseGrid(Op.CoarseGrid()),
-    _CoarseGridMulti(CoarseGridMulti),
-    geom(_CoarseGridMulti,Op.geom.hops,Op.geom.skip+1),
-    Cell(Op.geom.Depth(),_CoarseGridMulti),
-    Stencil(Cell.grids.back(),geom.shifts) // padded cell stencil
-  {
-    int32_t padded_sites   = _Op._A[0].Grid()->lSites();
-    int32_t unpadded_sites = _CoarseGrid->lSites();
-    
-    int32_t nrhs  = CoarseGridMulti->FullDimensions()[0];  // # RHS
-    int32_t orhs  = nrhs/CComplex::Nsimd();
-
-    /////////////////////////////////////////////////
-    // Device data vector storage
-    /////////////////////////////////////////////////
-    BLAS_A.resize(geom.npoint);
-    for(int p=0;p<geom.npoint;p++){
-      BLAS_A[p].resize (unpadded_sites); // no ghost zone, npoint elements
-    }
-    BLAS_B.resize(nrhs *padded_sites);   // includes ghost zone
-    BLAS_C.resize(nrhs *unpadded_sites); // no ghost zone
-
-    BLAS_AP.resize(geom.npoint);
-    BLAS_BP.resize(geom.npoint);
-    for(int p=0;p<geom.npoint;p++){
-      BLAS_AP[p].resize(unpadded_sites);
-      BLAS_BP[p].resize(unpadded_sites);
-    }
-    BLAS_CP.resize(unpadded_sites);
-
-    /////////////////////////////////////////////////
-    // Pointers to data
-    /////////////////////////////////////////////////
-
-    // Site identity mapping for A, C
-    for(int p=0;p<geom.npoint;p++){
-      for(int ss=0;ss<unpadded_sites;ss++){
-	ComplexD *ptr = (ComplexD *)&BLAS_A[p][ss];
-	//ComplexD *ptr = (ComplexD *)&BLAS_A[p][0]; std::cout << " A ptr "<<std::hex<<ptr<<std::dec<<" "<<ss<<"/"<<BLAS_A[p].size()<<std::endl;
-	deviceSet(BLAS_AP[p][ss],ptr);
-      }
-    }
-    for(int ss=0;ss<unpadded_sites;ss++){
-      ComplexD *ptr = (ComplexD *)&BLAS_C[ss*nrhs];
-      //ComplexD *ptr = (ComplexD *)&BLAS_C[0];  std::cout << " C ptr "<<std::hex<<ptr<<std::dec<<" "<<ss<<"/"<<BLAS_C.size()<<std::endl;
-      deviceSet(BLAS_CP[ss],ptr);
-    }
-
-    /////////////////////////////////////////////////
-    // Neighbour table is more complicated
-    /////////////////////////////////////////////////
-    int32_t j=0; // Interior point counter (unpadded)
-    for(int32_t s=0;s<padded_sites;s++){ // 4 volume, padded
-      int ghost_zone=0;
-      for(int32_t point = 0 ; point < geom.npoint; point++){
-	int i=s*orhs*geom.npoint+point;
-	if( Stencil._entries[i]._wrap ) { // stencil is indexed by the oSite of the CoarseGridMulti, hence orhs factor
-	  ghost_zone=1; // If general stencil wrapped in any direction, wrap=1
-	}
-      }
-      //      GeneralStencilEntryReordered tmp;
-      if( ghost_zone==0) {
-	for(int32_t point = 0 ; point < geom.npoint; point++){
-	  int i=s*orhs*geom.npoint+point;
- 	  int32_t nbr = Stencil._entries[i]._offset*CComplex::Nsimd(); // oSite -> lSite
-	  //	  std::cout << " B ptr "<< nbr<<"/"<<BLAS_B.size()<<std::endl;
-	  assert(nbr<BLAS_B.size());
-	  ComplexD * ptr = (ComplexD *)&BLAS_B[nbr];
-	  //	  ComplexD * ptr = (ComplexD *)&BLAS_B[0];
-	  //	  std::cout << " B ptr unpadded "<<std::hex<<ptr<<std::dec<<" "<<s<<"/"<<padded_sites<<std::endl;
-	  //	  std::cout << " B ptr   padded "<<std::hex<<ptr<<std::dec<<" "<<j<<"/"<<unpadded_sites<<std::endl;
-	  deviceSet(BLAS_BP[point][j],ptr); // neighbour indexing in ghost zone volume
-	  //	  auto tmp = deviceGet(*BLAS_BP[point][j]);  // debug trigger SEGV if bad ptr
-	}
-	j++;
-      }
-    }
-    assert(j==unpadded_sites);
-    CopyMatrix();
-  }
-  template<class vobj> void GridtoBLAS(const Lattice<vobj> &from,deviceVector<typename vobj::scalar_object> &to)
-  {
-#if 0
-    std::vector<typename vobj::scalar_object> tmp;
-    unvectorizeToLexOrdArray(tmp,from);
-    assert(tmp.size()==from.Grid()->lSites());
-    assert(tmp.size()==to.size());
-    to.resize(tmp.size());
-    acceleratorCopyToDevice(&tmp[0],&to[0],sizeof(typename vobj::scalar_object)*tmp.size());
-#else
-  typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-
-  GridBase *Fg = from.Grid();
-  assert(!Fg->_isCheckerBoarded);
-  int nd = Fg->_ndimension;
-
-  to.resize(Fg->lSites());
-
-  Coordinate LocalLatt = Fg->LocalDimensions();
-  size_t nsite = 1;
-  for(int i=0;i<nd;i++) nsite *= LocalLatt[i];
-
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  // do the index calc on the GPU
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  Coordinate f_ostride = Fg->_ostride;
-  Coordinate f_istride = Fg->_istride;
-  Coordinate f_rdimensions = Fg->_rdimensions;
-
-  autoView(from_v,from,AcceleratorRead);
-  auto to_v = &to[0];
-
-  const int words=sizeof(vobj)/sizeof(vector_type);
-  accelerator_for(idx,nsite,1,{
-      
-      Coordinate from_coor, base;
-      Lexicographic::CoorFromIndex(base,idx,LocalLatt);
-      for(int i=0;i<nd;i++){
-	from_coor[i] = base[i];
-      }
-      int from_oidx = 0; for(int d=0;d<nd;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
-      int from_lane = 0; for(int d=0;d<nd;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
-
-      const vector_type* from = (const vector_type *)&from_v[from_oidx];
-      scalar_type* to = (scalar_type *)&to_v[idx];
-      
-      scalar_type stmp;
-      for(int w=0;w<words;w++){
-	stmp = getlane(from[w], from_lane);
-	to[w] = stmp;
-      }
-    });
-#endif
-  }    
-  template<class vobj> void BLAStoGrid(Lattice<vobj> &grid,deviceVector<typename vobj::scalar_object> &in)
-  {
-#if 0
-    std::vector<typename vobj::scalar_object> tmp;
-    tmp.resize(in.size());
-    //    std::cout << "BLAStoGrid volume " <<tmp.size()<<" "<< grid.Grid()->lSites()<<std::endl;
-    assert(in.size()==grid.Grid()->lSites());
-    acceleratorCopyFromDevice(&in[0],&tmp[0],sizeof(typename vobj::scalar_object)*in.size());
-    vectorizeFromLexOrdArray(tmp,grid);
-#else
-  typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-
-  GridBase *Tg = grid.Grid();
-  assert(!Tg->_isCheckerBoarded);
-  int nd = Tg->_ndimension;
-  
-  assert(in.size()==Tg->lSites());
-
-  Coordinate LocalLatt = Tg->LocalDimensions();
-  size_t nsite = 1;
-  for(int i=0;i<nd;i++) nsite *= LocalLatt[i];
-
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  // do the index calc on the GPU
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  Coordinate t_ostride = Tg->_ostride;
-  Coordinate t_istride = Tg->_istride;
-  Coordinate t_rdimensions = Tg->_rdimensions;
-
-  autoView(to_v,grid,AcceleratorWrite);
-  auto from_v = &in[0];
-
-  const int words=sizeof(vobj)/sizeof(vector_type);
-  accelerator_for(idx,nsite,1,{
-      
-      Coordinate to_coor, base;
-      Lexicographic::CoorFromIndex(base,idx,LocalLatt);
-      for(int i=0;i<nd;i++){
-	to_coor[i] = base[i];
-      }
-      int to_oidx = 0; for(int d=0;d<nd;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
-      int to_lane = 0; for(int d=0;d<nd;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
-
-      vector_type* to = (vector_type *)&to_v[to_oidx];
-      scalar_type* from = (scalar_type *)&from_v[idx];
-      
-      scalar_type stmp;
-      for(int w=0;w<words;w++){
-	stmp=from[w];
-	putlane(to[w], stmp, to_lane);
-      }
-    });
-#endif
-  }
-  void CopyMatrix (void)
-  {
-    // Clone "A" to be lexicographic in the physics coords
-    // Use unvectorisetolexordarray
-    // Copy to device
-    for(int p=0;p<geom.npoint;p++){
-      //Unpadded
-      auto Aup = _Op.Cell.Extract(_Op._A[p]);
-      //      Coordinate coor({0,0,0,0,0});
-      //      auto sval = peekSite(Aup,coor);
-      //      std::cout << "CopyMatrix: p "<<p<<" Aup[0] :"<<sval<<std::endl;
-      //      sval = peekSite(_Op._A[p],coor);
-      //      std::cout << "CopyMatrix: p "<<p<<" _Op._Ap[0] :"<<sval<<std::endl;
-      GridtoBLAS(Aup,BLAS_A[p]);
-      //      std::cout << "Copy Matrix p "<<p<<" "<< deviceGet(BLAS_A[p][0])<<std::endl;
-    }
-  }
-  void Mdag(const CoarseVector &in, CoarseVector &out)
-  {
-    this->M(in,out);
-  }
-  void M (const CoarseVector &in, CoarseVector &out)
-  {
-    std::cout << GridLogMessage << "New Mrhs coarse"<<std::endl;
-    conformable(CoarseGrid(),in.Grid());
-    conformable(in.Grid(),out.Grid());
-    out.Checkerboard() = in.Checkerboard();
-
-    RealD t_tot;
-    RealD t_exch;
-    RealD t_GtoB;
-    RealD t_BtoG;
-    RealD t_mult;
-
-    t_tot=-usecond();
-    CoarseVector tin=in;
-    t_exch=-usecond();
-    CoarseVector pin = Cell.ExchangePeriodic(tin); //padded input
-    t_exch+=usecond();
-
-    CoarseVector pout(pin.Grid());
-
-    int npoint = geom.npoint;
-    typedef calcMatrix* Aview;
-    typedef LatticeView<Cvec> Vview;
-      
-    const int Nsimd = CComplex::Nsimd();
-
-    RealD flops,bytes;
-    int64_t osites=in.Grid()->oSites(); // unpadded
-    int64_t unpadded_vol = _CoarseGrid->lSites();
-    
-    flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
-    bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
-          + 2.0*osites*sizeof(siteVector)*npoint;
-    
-    int64_t nrhs  =pin.Grid()->GlobalDimensions()[0];
-    assert(nrhs>=1);
-
-    std::cout << GridLogMessage << "New Mrhs GridtoBLAS in sizes "<<in.Grid()->lSites()<<" "<<pin.Grid()->lSites()<<std::endl;
-    t_GtoB=-usecond();
-    GridtoBLAS(pin,BLAS_B);
-    //    out = Zero();
-    //    GridtoBLAS(out,BLAS_C);
-    t_GtoB+=usecond();
-
-    GridBLAS BLAS;
-
-    t_mult=-usecond();
-    for(int p=0;p<geom.npoint;p++){
-      RealD c = 1.0;
-      if (p==0) c = 0.0;
-      ComplexD beta(c);
-      //      std::cout << GridLogMessage << "New Mrhs coarse gemmBatched "<<p<<std::endl;
-      BLAS.gemmBatched(nbasis,nrhs,nbasis,
-		       ComplexD(1.0),
-		       BLAS_AP[p], 
-		       BLAS_BP[p], 
-		       ComplexD(c), 
-		       BLAS_CP);
-    }
-    BLAS.synchronise();
-    t_mult+=usecond();
-    //    std::cout << GridLogMessage << "New Mrhs coarse BLAStoGrid "<<std::endl;
-    t_BtoG=-usecond();
-    BLAStoGrid(out,BLAS_C);
-    t_BtoG+=usecond();
-    t_tot+=usecond();
-    //    auto check =deviceGet(BLAS_C[0]);
-    //      std::cout << "C[0] "<<check<<std::endl;
-    //    Coordinate coor({0,0,0,0,0,0});
-    //    peekLocalSite(check,out,coor);
-    //    std::cout << "C[0] "<< check<<std::endl;
-    std::cout << GridLogMessage << "New Mrhs coarse DONE "<<std::endl;
-    std::cout << GridLogMessage<<"Coarse Mult exch "<<t_exch<<" us"<<std::endl;
-    std::cout << GridLogMessage<<"Coarse Mult mult "<<t_mult<<" us"<<std::endl;
-    std::cout << GridLogMessage<<"Coarse Mult GtoB  "<<t_GtoB<<" us"<<std::endl;
-    std::cout << GridLogMessage<<"Coarse Mult BtoG  "<<t_BtoG<<" us"<<std::endl;
-    std::cout << GridLogMessage<<"Coarse Mult tot  "<<t_tot<<" us"<<std::endl;
-    std::cout << GridLogMessage<<std::endl;
-    std::cout << GridLogMessage<<"Coarse Kernel flops "<< flops<<std::endl;
-    std::cout << GridLogMessage<<"Coarse Kernel flop/s "<< flops/t_mult<<" mflop/s"<<std::endl;
-    std::cout << GridLogMessage<<"Coarse Kernel bytes/s "<< bytes/t_mult/1000<<" GB/s"<<std::endl;
-    std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/t_tot<<" mflop/s"<<std::endl;
-    std::cout << GridLogMessage<<"Coarse total bytes   "<< bytes/1e6<<" MB"<<std::endl;
-  };
-  virtual  void Mdiag    (const Field &in, Field &out){ assert(0);};
-  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp){assert(0);};
-  virtual  void MdirAll  (const Field &in, std::vector<Field> &out){assert(0);};
-
-};
-  
-NAMESPACE_END(Grid);
--- a/Grid/algorithms/multigrid/Geometry.h
+++ b/Grid/algorithms/multigrid/Geometry.h
@@ -1,238 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/GeneralCoarsenedMatrix.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pboyle@bnl.gov>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#pragma once
-
-NAMESPACE_BEGIN(Grid);
-
-
-/////////////////////////////////////////////////////////////////
-// Geometry class in cartesian case
-/////////////////////////////////////////////////////////////////
-
-class Geometry {
-public:
-  int npoint;
-  int base;
-  std::vector<int> directions   ;
-  std::vector<int> displacements;
-  std::vector<int> points_dagger;
-
-  Geometry(int _d)  {
-    
-    base = (_d==5) ? 1:0;
-
-    // make coarse grid stencil for 4d , not 5d
-    if ( _d==5 ) _d=4;
-
-    npoint = 2*_d+1;
-    directions.resize(npoint);
-    displacements.resize(npoint);
-    points_dagger.resize(npoint);
-    for(int d=0;d<_d;d++){
-      directions[d   ] = d+base;
-      directions[d+_d] = d+base;
-      displacements[d  ] = +1;
-      displacements[d+_d]= -1;
-      points_dagger[d   ] = d+_d;
-      points_dagger[d+_d] = d;
-    }
-    directions   [2*_d]=0;
-    displacements[2*_d]=0;
-    points_dagger[2*_d]=2*_d;
-  }
-
-  int point(int dir, int disp) {
-    assert(disp == -1 || disp == 0 || disp == 1);
-    assert(base+0 <= dir && dir < base+4);
-
-    // directions faster index = new indexing
-    // 4d (base = 0):
-    // point 0  1  2  3  4  5  6  7  8
-    // dir   0  1  2  3  0  1  2  3  0
-    // disp +1 +1 +1 +1 -1 -1 -1 -1  0
-    // 5d (base = 1):
-    // point 0  1  2  3  4  5  6  7  8
-    // dir   1  2  3  4  1  2  3  4  0
-    // disp +1 +1 +1 +1 -1 -1 -1 -1  0
-
-    // displacements faster index = old indexing
-    // 4d (base = 0):
-    // point 0  1  2  3  4  5  6  7  8
-    // dir   0  0  1  1  2  2  3  3  0
-    // disp +1 -1 +1 -1 +1 -1 +1 -1  0
-    // 5d (base = 1):
-    // point 0  1  2  3  4  5  6  7  8
-    // dir   1  1  2  2  3  3  4  4  0
-    // disp +1 -1 +1 -1 +1 -1 +1 -1  0
-
-    if(dir == 0 and disp == 0)
-      return 8;
-    else // New indexing
-      return (1 - disp) / 2 * 4 + dir - base;
-    // else // Old indexing
-    //   return (4 * (dir - base) + 1 - disp) / 2;
-  }
-};
-
-/////////////////////////////////////////////////////////////////
-// Less local equivalent of Geometry class in cartesian case
-/////////////////////////////////////////////////////////////////
-class NonLocalStencilGeometry {
-public:
-  //  int depth;
-  int skip;
-  int hops;
-  int npoint;
-  std::vector<Coordinate> shifts;
-  Coordinate stencil_size;
-  Coordinate stencil_lo;
-  Coordinate stencil_hi;
-  GridCartesian *grid;
-  GridCartesian *Grid() {return grid;};
-  int Depth(void){return 1;};   // Ghost zone depth
-  int Hops(void){return hops;}; // # of hops=> level of corner fill in in stencil
-  int DimSkip(void){return skip;};
-
-  virtual ~NonLocalStencilGeometry() {};
-
-  int  Reverse(int point)
-  {
-    int Nd = Grid()->Nd();
-    Coordinate shft = shifts[point];
-    Coordinate rev(Nd);
-    for(int mu=0;mu<Nd;mu++) rev[mu]= -shft[mu];
-    for(int p=0;p<npoint;p++){
-      if(rev==shifts[p]){
-	return p;
-      }
-    }
-    assert(0);
-    return -1;
-  }
-  void BuildShifts(void)
-  {
-    this->shifts.resize(0);
-    int Nd = this->grid->Nd();
-
-    int dd = this->DimSkip();
-    for(int s0=this->stencil_lo[dd+0];s0<=this->stencil_hi[dd+0];s0++){
-    for(int s1=this->stencil_lo[dd+1];s1<=this->stencil_hi[dd+1];s1++){
-    for(int s2=this->stencil_lo[dd+2];s2<=this->stencil_hi[dd+2];s2++){
-    for(int s3=this->stencil_lo[dd+3];s3<=this->stencil_hi[dd+3];s3++){
-      Coordinate sft(Nd,0);
-      sft[dd+0] = s0;
-      sft[dd+1] = s1;
-      sft[dd+2] = s2;
-      sft[dd+3] = s3;
-      int nhops = abs(s0)+abs(s1)+abs(s2)+abs(s3);
-      if(nhops<=this->hops) this->shifts.push_back(sft);
-    }}}}
-    this->npoint = this->shifts.size();
-    std::cout << GridLogMessage << "NonLocalStencilGeometry has "<< this->npoint << " terms in stencil "<<std::endl;
-  }
-  
-  NonLocalStencilGeometry(GridCartesian *_coarse_grid,int _hops,int _skip) : grid(_coarse_grid), hops(_hops), skip(_skip)
-  {
-    Coordinate latt = grid->GlobalDimensions();
-    stencil_size.resize(grid->Nd());
-    stencil_lo.resize(grid->Nd());
-    stencil_hi.resize(grid->Nd());
-    for(int d=0;d<grid->Nd();d++){
-     if ( latt[d] == 1 ) {
-      stencil_lo[d] = 0;
-      stencil_hi[d] = 0;
-      stencil_size[d]= 1;
-     } else if ( latt[d] == 2 ) {
-      stencil_lo[d] = -1;
-      stencil_hi[d] = 0;
-      stencil_size[d]= 2;
-     } else if ( latt[d] > 2 ) {
-       stencil_lo[d] = -1;
-       stencil_hi[d] =  1;
-       stencil_size[d]= 3;
-     }
-    }
-    this->BuildShifts();
-  };
-
-};
-
-// Need to worry about red-black now
-class NonLocalStencilGeometry4D : public NonLocalStencilGeometry {
-public:
-  virtual int DerivedDimSkip(void) { return 0;};
-  NonLocalStencilGeometry4D(GridCartesian *Coarse,int _hops) : NonLocalStencilGeometry(Coarse,_hops,0) { };
-  virtual ~NonLocalStencilGeometry4D() {};
-};
-class NonLocalStencilGeometry5D : public NonLocalStencilGeometry {
-public:
-  virtual int DerivedDimSkip(void) { return 1; }; 
-  NonLocalStencilGeometry5D(GridCartesian *Coarse,int _hops) : NonLocalStencilGeometry(Coarse,_hops,1)  { };
-  virtual ~NonLocalStencilGeometry5D() {};
-};
-/*
- * Bunch of different options classes
- */
-class NextToNextToNextToNearestStencilGeometry4D : public NonLocalStencilGeometry4D {
-public:
-  NextToNextToNextToNearestStencilGeometry4D(GridCartesian *Coarse) :  NonLocalStencilGeometry4D(Coarse,4)
-  {
-  };
-};
-class NextToNextToNextToNearestStencilGeometry5D : public  NonLocalStencilGeometry5D {
-public:
-  NextToNextToNextToNearestStencilGeometry5D(GridCartesian *Coarse) :  NonLocalStencilGeometry5D(Coarse,4)
-  {
-  };
-};
-class NextToNearestStencilGeometry4D : public  NonLocalStencilGeometry4D {
-public:
-  NextToNearestStencilGeometry4D(GridCartesian *Coarse) :  NonLocalStencilGeometry4D(Coarse,2)
-  {
-  };
-};
-class NextToNearestStencilGeometry5D : public  NonLocalStencilGeometry5D {
-public:
-  NextToNearestStencilGeometry5D(GridCartesian *Coarse) :  NonLocalStencilGeometry5D(Coarse,2)
-  {
-  };
-};
-class NearestStencilGeometry4D : public  NonLocalStencilGeometry4D {
-public:
-  NearestStencilGeometry4D(GridCartesian *Coarse) :  NonLocalStencilGeometry4D(Coarse,1)
-  {
-  };
-};
-class NearestStencilGeometry5D : public  NonLocalStencilGeometry5D {
-public:
-  NearestStencilGeometry5D(GridCartesian *Coarse) :  NonLocalStencilGeometry5D(Coarse,1)
-  {
-  };
-};
-
-NAMESPACE_END(Grid);
--- a/Grid/algorithms/multigrid/MultiGrid.h
+++ b/Grid/algorithms/multigrid/MultiGrid.h
@@ -1,35 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: Grid/algorithms/multigrid/MultiGrid.h
-
-    Copyright (C) 2023
-
-Author: Peter Boyle <pboyle@bnl.gov>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#pragma once
-
-#include <Grid/algorithms/multigrid/Aggregates.h>
-#include <Grid/algorithms/multigrid/Geometry.h>
-#include <Grid/algorithms/multigrid/BatchedBlas.h>
-#include <Grid/algorithms/multigrid/CoarsenedMatrix.h>
-#include <Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h>
-#include <Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h>
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -175,56 +175,8 @@ template<class T> using cshiftAllocator = std::allocator<T>;

 template<class T> using Vector        = std::vector<T,uvmAllocator<T> >;           
 template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;           
-template<class T> using commVector    = std::vector<T,devAllocator<T> >;
-template<class T> using deviceVector  = std::vector<T,devAllocator<T> >;
-template<class T> using cshiftVector  = std::vector<T,cshiftAllocator<T> >;
-
-/*
-template<class T> class vecView
-{
- protected:
-  T * data;
-  uint64_t size;
-  ViewMode mode;
-  void * cpu_ptr;
- public:
-  accelerator_inline T & operator[](size_t i) const { return this->data[i]; };
-  vecView(std::vector<T> &refer_to_me,ViewMode _mode)
-  {
-    cpu_ptr = &refer_to_me[0];
-    size = refer_to_me.size();
-    mode = _mode;
-    data =(T *) MemoryManager::ViewOpen(cpu_ptr,
-					size*sizeof(T),
-					mode,
-					AdviseDefault);
-  }
-  void ViewClose(void)
-  { // Inform the manager
-    MemoryManager::ViewClose(this->cpu_ptr,this->mode);    
-  }
-};
-
-template<class T> vecView<T> VectorView(std::vector<T> &vec,ViewMode _mode)
-{
-  vecView<T> ret(vec,_mode); // does the open
-  return ret;                // must be closed
-}
-
-// Little autoscope assister
-template<class View> 
-class VectorViewCloser
-{
-  View v;  // Take a copy of view and call view close when I go out of scope automatically
- public:
-  VectorViewCloser(View &_v) : v(_v) {};
-  ~VectorViewCloser() { auto ptr = v.cpu_ptr; v.ViewClose();  MemoryManager::NotifyDeletion(ptr);}
-};
-
-#define autoVecView(v_v,v,mode)					\
-  auto v_v = VectorView(v,mode);				\
-  ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
-*/
+template<class T> using commVector = std::vector<T,devAllocator<T> >;
+template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;

 NAMESPACE_END(Grid);

--- a/Grid/allocator/MemoryManager.cc
+++ b/Grid/allocator/MemoryManager.cc
@@ -4,14 +4,11 @@ NAMESPACE_BEGIN(Grid);

 /*Allocation types, saying which pointer cache should be used*/
 #define Cpu      (0)
-#define CpuHuge  (1)
-#define CpuSmall (2)
-#define Acc      (3)
-#define AccHuge  (4)
-#define AccSmall (5)
-#define Shared   (6)
-#define SharedHuge  (7)
-#define SharedSmall (8)
+#define CpuSmall (1)
+#define Acc      (2)
+#define AccSmall (3)
+#define Shared   (4)
+#define SharedSmall (5)
 #undef GRID_MM_VERBOSE 
 uint64_t total_shared;
 uint64_t total_device;
@@ -38,15 +35,12 @@ void MemoryManager::PrintBytes(void)
  
 }

-uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
-uint64_t MemoryManager::HostCacheBytes()   { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; }
-
 //////////////////////////////////////////////////////////////////////
 // Data tables for recently freed pooiniter caches
 //////////////////////////////////////////////////////////////////////
 MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
 int MemoryManager::Victim[MemoryManager::NallocType];
-int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 };
+int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 };
 uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
 //////////////////////////////////////////////////////////////////////
 // Actual allocation and deallocation utils
@@ -165,6 +159,7 @@ void MemoryManager::Init(void)

  char * str;
  int Nc;
+  int NcS;
  
  str= getenv("GRID_ALLOC_NCACHE_LARGE");
  if ( str ) {
@@ -176,16 +171,6 @@ void MemoryManager::Init(void)
    }
  }

-  str= getenv("GRID_ALLOC_NCACHE_HUGE");
-  if ( str ) {
-    Nc = atoi(str);
-    if ( (Nc>=0) && (Nc < NallocCacheMax)) {
-      Ncache[CpuHuge]=Nc;
-      Ncache[AccHuge]=Nc;
-      Ncache[SharedHuge]=Nc;
-    }
-  }
-
  str= getenv("GRID_ALLOC_NCACHE_SMALL");
  if ( str ) {
    Nc = atoi(str);
@@ -206,9 +191,7 @@ void MemoryManager::InitMessage(void) {
  
  std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
 #ifdef ALLOCATION_CACHE
-  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host   allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<" HUGE "<<Ncache[CpuHuge]<<std::endl;
-  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<" Huge "<<Ncache[AccHuge]<<std::endl;
-  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<" Huge "<<Ncache[SharedHuge]<<std::endl;
+  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
 #endif
  
 #ifdef GRID_UVM
@@ -240,11 +223,8 @@ void MemoryManager::InitMessage(void) {
 void *MemoryManager::Insert(void *ptr,size_t bytes,int type) 
 {
 #ifdef ALLOCATION_CACHE
-  int cache;
-  if      (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
-  else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
-  else                                     cache = type;
-
+  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
+  int cache = type + small;
  return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);  
 #else
  return ptr;
@@ -253,12 +233,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)

 void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes) 
 {
+  assert(ncache>0);
 #ifdef GRID_OMP
  assert(omp_in_parallel()==0);
 #endif 

-  if (ncache == 0) return ptr;
-
  void * ret = NULL;
  int v = -1;

@@ -293,11 +272,8 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
 void *MemoryManager::Lookup(size_t bytes,int type)
 {
 #ifdef ALLOCATION_CACHE
-  int cache;
-  if      (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
-  else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
-  else                                     cache = type;
-
+  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
+  int cache = type+small;
  return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
 #else
  return NULL;
@@ -306,6 +282,7 @@ void *MemoryManager::Lookup(size_t bytes,int type)

 void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) 
 {
+  assert(ncache>0);
 #ifdef GRID_OMP
  assert(omp_in_parallel()==0);
 #endif 
--- a/Grid/allocator/MemoryManager.h
+++ b/Grid/allocator/MemoryManager.h
@@ -35,12 +35,6 @@ NAMESPACE_BEGIN(Grid);
 // Move control to configure.ac and Config.h?

 #define GRID_ALLOC_SMALL_LIMIT (4096)
-#define GRID_ALLOC_HUGE_LIMIT  (2147483648)
-
-#define STRINGIFY(x) #x
-#define TOSTRING(x) STRINGIFY(x)
-#define FILE_LINE __FILE__ ":" TOSTRING(__LINE__)
-#define AUDIT(a) MemoryManager::Audit(FILE_LINE)

 /*Pinning pages is costly*/
 ////////////////////////////////////////////////////////////////////////////
@@ -71,21 +65,6 @@ enum ViewMode {
  CpuWriteDiscard = 0x10 // same for now
 };

-struct MemoryStatus {
-  uint64_t     DeviceBytes;
-  uint64_t     DeviceLRUBytes;
-  uint64_t     DeviceMaxBytes;
-  uint64_t     HostToDeviceBytes;
-  uint64_t     DeviceToHostBytes;
-  uint64_t     HostToDeviceXfer;
-  uint64_t     DeviceToHostXfer;
-  uint64_t     DeviceEvictions;
-  uint64_t     DeviceDestroy;
-  uint64_t     DeviceAllocCacheBytes;
-  uint64_t     HostAllocCacheBytes;
-};
-
-
 class MemoryManager {
 private:

@@ -99,7 +78,7 @@ private:
  } AllocationCacheEntry;

  static const int NallocCacheMax=128; 
-  static const int NallocType=9;
+  static const int NallocType=6;
  static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
  static int Victim[NallocType];
  static int Ncache[NallocType];
@@ -113,9 +92,8 @@ private:
  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;

- public:
  static void PrintBytes(void);
-  static void Audit(std::string s);
+ public:
  static void Init(void);
  static void InitMessage(void);
  static void *AcceleratorAllocate(size_t bytes);
@@ -135,28 +113,7 @@ private:
  static uint64_t     DeviceToHostBytes;
  static uint64_t     HostToDeviceXfer;
  static uint64_t     DeviceToHostXfer;
-  static uint64_t     DeviceEvictions;
-  static uint64_t     DeviceDestroy;
-  
-  static uint64_t     DeviceCacheBytes();
-  static uint64_t     HostCacheBytes();
-
-  static MemoryStatus GetFootprint(void) {
-    MemoryStatus stat;
-    stat.DeviceBytes       = DeviceBytes;
-    stat.DeviceLRUBytes    = DeviceLRUBytes;
-    stat.DeviceMaxBytes    = DeviceMaxBytes;
-    stat.HostToDeviceBytes = HostToDeviceBytes;
-    stat.DeviceToHostBytes = DeviceToHostBytes;
-    stat.HostToDeviceXfer  = HostToDeviceXfer;
-    stat.DeviceToHostXfer  = DeviceToHostXfer;
-    stat.DeviceEvictions   = DeviceEvictions;
-    stat.DeviceDestroy     = DeviceDestroy;
-    stat.DeviceAllocCacheBytes = DeviceCacheBytes();
-    stat.HostAllocCacheBytes   = HostCacheBytes();
-    return stat;
-  };
-  
+ 
 private:
 #ifndef GRID_UVM
  //////////////////////////////////////////////////////////////////////
@@ -209,12 +166,10 @@ private:
  static void     CpuViewClose(uint64_t Ptr);
  static uint64_t CpuViewOpen(uint64_t  CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
 #endif
+  static void NotifyDeletion(void * CpuPtr);

 public:
-  static void NotifyDeletion(void * CpuPtr);
  static void Print(void);
-  static void PrintAll(void);
-  static void PrintState( void* CpuPtr);
  static int   isOpen   (void* CpuPtr);
  static void  ViewClose(void* CpuPtr,ViewMode mode);
  static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
--- a/Grid/allocator/MemoryManagerCache.cc
+++ b/Grid/allocator/MemoryManagerCache.cc
@@ -3,13 +3,8 @@

 #warning "Using explicit device memory copies"
 NAMESPACE_BEGIN(Grid);
-
-#define MAXLINE 512
-static char print_buffer [ MAXLINE ];
-
-#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
-#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer;
-//#define dprintf(...) 
+//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
+#define dprintf(...)


 ////////////////////////////////////////////////////////////
@@ -28,8 +23,6 @@ uint64_t  MemoryManager::HostToDeviceBytes;
 uint64_t  MemoryManager::DeviceToHostBytes;
 uint64_t  MemoryManager::HostToDeviceXfer;
 uint64_t  MemoryManager::DeviceToHostXfer;
-uint64_t  MemoryManager::DeviceEvictions;
-uint64_t  MemoryManager::DeviceDestroy;

 ////////////////////////////////////
 // Priority ordering for unlocked entries
@@ -111,17 +104,15 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
  
-  dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
+   dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
  if(AccCache.AccPtr) {
    AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
-    DeviceDestroy++;
    DeviceBytes   -=AccCache.bytes;
    LRUremove(AccCache);
-    AccCache.AccPtr=(uint64_t) NULL;
-    dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
+    dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
  }
  uint64_t CpuPtr = AccCache.CpuPtr;
  EntryErase(CpuPtr);
@@ -130,36 +121,26 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
 void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
 {
  ///////////////////////////////////////////////////////////////////////////
-  // Make CPU consistent, remove from Accelerator, remove from LRU, LEAVE CPU only entry
-  // Cannot be acclocked. If allocated must be in LRU pool.
-  //
-  // Nov 2022... Felix issue: Allocating two CpuPtrs, can have an entry in LRU-q with CPUlock.
-  //                          and require to evict the AccPtr copy. Eviction was a mistake in CpuViewOpen
-  //                          but there is a weakness where CpuLock entries are attempted for erase
-  //                          Take these OUT LRU queue when CPU locked?
-  //                          Cannot take out the table as cpuLock data is important.
+  // Make CPU consistent, remove from Accelerator, remove entry
+  // Cannot be locked. If allocated must be in LRU pool.
  ///////////////////////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
  
-  mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n",
-	  (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
-	  (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock); 
-  if (AccCache.accLock!=0) return;
-  if (AccCache.cpuLock!=0) return;
+  dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
+  assert(AccCache.accLock==0);
+  assert(AccCache.cpuLock==0);
  if(AccCache.state==AccDirty) {
    Flush(AccCache);
  }
+  assert(AccCache.CpuPtr!=(uint64_t)NULL);
  if(AccCache.AccPtr) {
    AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
-    LRUremove(AccCache);
-    AccCache.AccPtr=(uint64_t)NULL;
-    AccCache.state=CpuDirty; // CPU primary now
    DeviceBytes   -=AccCache.bytes;
-    dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
+    LRUremove(AccCache);
+    dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
  }
-  //  uint64_t CpuPtr = AccCache.CpuPtr;
-  DeviceEvictions++;
-  //  EntryErase(CpuPtr);
+  uint64_t CpuPtr = AccCache.CpuPtr;
+  EntryErase(CpuPtr);
 }
 void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
 {
@@ -169,7 +150,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
  assert(AccCache.AccPtr!=(uint64_t)NULL);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
  acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
-  mprintf("MemoryManager: acceleratorCopyFromDevice Flush AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  dprintf("MemoryManager: Flush  %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  DeviceToHostBytes+=AccCache.bytes;
  DeviceToHostXfer++;
  AccCache.state=Consistent;
@@ -184,7 +165,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
    AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
    DeviceBytes+=AccCache.bytes;
  }
-  mprintf("MemoryManager: acceleratorCopyToDevice   Clone AccPtr %lx <- CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
  HostToDeviceBytes+=AccCache.bytes;
  HostToDeviceXfer++;
@@ -210,7 +191,6 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
 void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
 {
  if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
-    dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr);
    AcceleratorViewClose((uint64_t)Ptr);
  } else if( (mode==CpuRead)||(mode==CpuWrite)){
    CpuViewClose((uint64_t)Ptr);
@@ -222,7 +202,6 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 {
  uint64_t CpuPtr = (uint64_t)_CpuPtr;
  if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
-    dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
    return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
  } else if( (mode==CpuRead)||(mode==CpuWrite)){
    return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
@@ -233,16 +212,13 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 }
 void  MemoryManager::EvictVictims(uint64_t bytes)
 {
-  assert(bytes<DeviceMaxBytes);
  while(bytes+DeviceLRUBytes > DeviceMaxBytes){
    if ( DeviceLRUBytes > 0){
      assert(LRU.size()>0);
-      uint64_t victim = LRU.back(); // From the LRU
+      uint64_t victim = LRU.back();
      auto AccCacheIterator = EntryLookup(victim);
      auto & AccCache = AccCacheIterator->second;
      Evict(AccCache);
-    } else {
-      return;
    }
  }
 }
@@ -265,12 +241,11 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
  assert(AccCache.cpuLock==0);  // Programming error

  if(AccCache.state!=Empty) {
-    dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n",
+    dprintf("ViewOpen found entry %llx %llx : %lld %lld\n",
 		    (uint64_t)AccCache.CpuPtr,
 		    (uint64_t)CpuPtr,
 		    (uint64_t)AccCache.bytes,
-	            (uint64_t)bytes,
-		    (uint64_t)AccCache.accLock);
+		    (uint64_t)bytes);
    assert(AccCache.CpuPtr == CpuPtr);
    assert(AccCache.bytes  ==bytes);
  }
@@ -305,7 +280,6 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
      AccCache.state  = Consistent; // Empty + AccRead => Consistent
    }
    AccCache.accLock= 1;
-    dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock);
  } else if(AccCache.state==CpuDirty ){
    if(mode==AcceleratorWriteDiscard) {
      CpuDiscard(AccCache);
@@ -318,30 +292,28 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
      AccCache.state  = Consistent; // CpuDirty + AccRead => Consistent
    }
    AccCache.accLock++;
-    dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock);
+    dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
  } else if(AccCache.state==Consistent) {
    if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
      AccCache.state  = AccDirty;   // Consistent + AcceleratorWrite=> AccDirty
    else
      AccCache.state  = Consistent; // Consistent + AccRead => Consistent
    AccCache.accLock++;
-    dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock);
+    dprintf("Consistent entry into device accLock %d\n",AccCache.accLock);
  } else if(AccCache.state==AccDirty) {
    if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
      AccCache.state  = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
    else
      AccCache.state  = AccDirty; // AccDirty + AccRead => AccDirty
    AccCache.accLock++;
-    dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock);
+    dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock);
  } else {
    assert(0);
  }

-  assert(AccCache.accLock>0);
-  // If view is opened on device must remove from LRU
+  // If view is opened on device remove from LRU
  if(AccCache.LRU_valid==1){
    // must possibly remove from LRU as now locked on GPU
-    dprintf("AccCache entry removed from LRU \n");
    LRUremove(AccCache);
  }

@@ -362,12 +334,10 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
  assert(AccCache.accLock>0);

  AccCache.accLock--;
+
  // Move to LRU queue if not locked and close on device
  if(AccCache.accLock==0) {
-    dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
    LRUinsert(AccCache);
-  } else {
-    dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
  }
 }
 void MemoryManager::CpuViewClose(uint64_t CpuPtr)
@@ -404,10 +374,9 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
  auto AccCacheIterator = EntryLookup(CpuPtr);
  auto & AccCache = AccCacheIterator->second;

-  // CPU doesn't need to free space
-  //  if (!AccCache.AccPtr) {
-  //    EvictVictims(bytes);
-  //  }
+  if (!AccCache.AccPtr) {
+     EvictVictims(bytes);
+  }

  assert((mode==CpuRead)||(mode==CpuWrite));
  assert(AccCache.accLock==0);  // Programming error
@@ -461,28 +430,20 @@ void  MemoryManager::NotifyDeletion(void *_ptr)
 void  MemoryManager::Print(void)
 {
  PrintBytes();
-  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
-  std::cout << GridLogMessage << "Memory Manager                             " << std::endl;
-  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
-  std::cout << GridLogMessage << DeviceBytes   << " bytes allocated on device " << std::endl;
-  std::cout << GridLogMessage << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
-  std::cout << GridLogMessage << DeviceMaxBytes<< " bytes max on device       " << std::endl;
-  std::cout << GridLogMessage << HostToDeviceXfer << " transfers        to   device " << std::endl;
-  std::cout << GridLogMessage << DeviceToHostXfer << " transfers        from device " << std::endl;
-  std::cout << GridLogMessage << HostToDeviceBytes<< " bytes transfered to   device " << std::endl;
-  std::cout << GridLogMessage << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
-  std::cout << GridLogMessage << DeviceEvictions  << " Evictions from device " << std::endl;
-  std::cout << GridLogMessage << DeviceDestroy    << " Destroyed vectors on device " << std::endl;
-  std::cout << GridLogMessage << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
-  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
-}
-void  MemoryManager::PrintAll(void)
-{
-  Print();
-  std::cout << GridLogMessage << std::endl;
-  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
-  std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
-  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
+  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
+  std::cout << GridLogDebug << "Memory Manager                             " << std::endl;
+  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
+  std::cout << GridLogDebug << DeviceBytes   << " bytes allocated on device " << std::endl;
+  std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
+  std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device       " << std::endl;
+  std::cout << GridLogDebug << HostToDeviceXfer << " transfers        to   device " << std::endl;
+  std::cout << GridLogDebug << DeviceToHostXfer << " transfers        from device " << std::endl;
+  std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to   device " << std::endl;
+  std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
+  std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
+  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
+  std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
+  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
  for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
    auto &AccCache = it->second;
    
@@ -492,13 +453,13 @@ void  MemoryManager::PrintAll(void)
    if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
    if ( AccCache.state==Consistent)str = std::string("Consistent");

-    std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
+    std::cout << GridLogDebug << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
 	      << "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
 	      << "\t" << AccCache.cpuLock
 	      << "\t" << AccCache.accLock
 	      << "\t" << AccCache.LRU_valid<<std::endl;
  }
-  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
+  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;

 };
 int   MemoryManager::isOpen   (void* _CpuPtr) 
@@ -512,89 +473,6 @@ int   MemoryManager::isOpen   (void* _CpuPtr)
    return 0;
  }
 }
-void MemoryManager::Audit(std::string s)
-{
-  uint64_t CpuBytes=0;
-  uint64_t AccBytes=0;
-  uint64_t LruBytes1=0;
-  uint64_t LruBytes2=0;
-  uint64_t LruCnt=0;
-  
-  std::cout << " Memory Manager::Audit() from "<<s<<std::endl;
-  for(auto it=LRU.begin();it!=LRU.end();it++){
-    uint64_t cpuPtr = *it;
-    assert(EntryPresent(cpuPtr));
-    auto AccCacheIterator = EntryLookup(cpuPtr);
-    auto & AccCache = AccCacheIterator->second;
-    LruBytes2+=AccCache.bytes;
-    assert(AccCache.LRU_valid==1);
-    assert(AccCache.LRU_entry==it);
-  }
-  std::cout << " Memory Manager::Audit() LRU queue matches table entries "<<std::endl;
-
-  for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
-    auto &AccCache = it->second;
-    
-    std::string str;
-    if ( AccCache.state==Empty    ) str = std::string("Empty");
-    if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
-    if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
-    if ( AccCache.state==Consistent)str = std::string("Consistent");
-
-    CpuBytes+=AccCache.bytes;
-    if( AccCache.AccPtr )    AccBytes+=AccCache.bytes;
-    if( AccCache.LRU_valid ) LruBytes1+=AccCache.bytes;
-    if( AccCache.LRU_valid ) LruCnt++;
-    
-    if ( AccCache.cpuLock || AccCache.accLock ) {
-      assert(AccCache.LRU_valid==0);
-
-      std::cout << GridLogError << s<< "\n\t 0x"<<std::hex<<AccCache.CpuPtr<<std::dec
-		<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
-		<< "\t cpuLock  " << AccCache.cpuLock
-		<< "\t accLock  " << AccCache.accLock
-		<< "\t LRUvalid " << AccCache.LRU_valid<<std::endl;
-    }
-
-    assert( AccCache.cpuLock== 0 ) ;
-    assert( AccCache.accLock== 0 ) ;
-  }
-  std::cout << " Memory Manager::Audit() no locked table entries "<<std::endl;
-  assert(LruBytes1==LruBytes2);
-  assert(LruBytes1==DeviceLRUBytes);
-  std::cout << " Memory Manager::Audit() evictable bytes matches sum over table "<<std::endl;
-  assert(AccBytes==DeviceBytes);
-  std::cout << " Memory Manager::Audit() device bytes matches sum over table "<<std::endl;
-  assert(LruCnt == LRU.size());
-  std::cout << " Memory Manager::Audit() LRU entry count matches "<<std::endl;
-
-}
-
-void MemoryManager::PrintState(void* _CpuPtr)
-{
-  uint64_t CpuPtr = (uint64_t)_CpuPtr;
-
-  if ( EntryPresent(CpuPtr) ){
-    auto AccCacheIterator = EntryLookup(CpuPtr);
-    auto & AccCache = AccCacheIterator->second;
-    std::string str;
-    if ( AccCache.state==Empty    ) str = std::string("Empty");
-    if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
-    if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
-    if ( AccCache.state==Consistent)str = std::string("Consistent");
-    if ( AccCache.state==EvictNext) str = std::string("EvictNext");
-
-    std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
-    std::cout << GridLogMessage << "\tx"<<std::hex<<AccCache.CpuPtr<<std::dec
-    << "\tx"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
-    << "\t" << AccCache.cpuLock
-    << "\t" << AccCache.accLock
-    << "\t" << AccCache.LRU_valid<<std::endl;
-
-  } else {
-    std::cout << GridLogMessage << "No Entry in AccCache table." << std::endl; 
-  }
-}

 NAMESPACE_END(Grid);

--- a/Grid/allocator/MemoryManagerShared.cc
+++ b/Grid/allocator/MemoryManagerShared.cc
@@ -12,19 +12,11 @@ uint64_t  MemoryManager::HostToDeviceBytes;
 uint64_t  MemoryManager::DeviceToHostBytes;
 uint64_t  MemoryManager::HostToDeviceXfer;
 uint64_t  MemoryManager::DeviceToHostXfer;
-uint64_t  MemoryManager::DeviceEvictions;
-uint64_t  MemoryManager::DeviceDestroy;

-void  MemoryManager::Audit(std::string s){};
 void  MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
 void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
 int   MemoryManager::isOpen   (void* CpuPtr) { return 0;}
-void  MemoryManager::PrintState(void* CpuPtr)
-{
-std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl;
-};
 void  MemoryManager::Print(void){};
-void  MemoryManager::PrintAll(void){};
 void  MemoryManager::NotifyDeletion(void *ptr){};

 NAMESPACE_END(Grid);
--- a/Grid/cartesian/Cartesian_base.h
+++ b/Grid/cartesian/Cartesian_base.h
@@ -70,8 +70,8 @@ public:
  Coordinate _istride;    // Inner stride i.e. within simd lane
  int _osites;                  // _isites*_osites = product(dimensions).
  int _isites;
-  int64_t _fsites;                  // _isites*_osites = product(dimensions).
-  int64_t _gsites;
+  int _fsites;                  // _isites*_osites = product(dimensions).
+  int _gsites;
  Coordinate _slice_block;// subslice information
  Coordinate _slice_stride;
  Coordinate _slice_nblock;
@@ -183,7 +183,7 @@ public:
  inline int Nsimd(void)  const { return _isites; };// Synonymous with iSites
  inline int oSites(void) const { return _osites; };
  inline int lSites(void) const { return _isites*_osites; }; 
-  inline int64_t gSites(void) const { return (int64_t)_isites*(int64_t)_osites*(int64_t)_Nprocessors; }; 
+  inline int gSites(void) const { return _isites*_osites*_Nprocessors; }; 
  inline int Nd    (void) const { return _ndimension;};

  inline const Coordinate LocalStarts(void)             { return _lstart;    };
@@ -214,7 +214,7 @@ public:
  ////////////////////////////////////////////////////////////////
  // Global addressing
  ////////////////////////////////////////////////////////////////
-  void GlobalIndexToGlobalCoor(int64_t gidx,Coordinate &gcoor){
+  void GlobalIndexToGlobalCoor(int gidx,Coordinate &gcoor){
    assert(gidx< gSites());
    Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
  }
@@ -222,7 +222,7 @@ public:
    assert(lidx<lSites());
    Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
  }
-  void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int64_t & gidx){
+  void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int & gidx){
    gidx=0;
    int mult=1;
    for(int mu=0;mu<_ndimension;mu++) {
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@@ -53,11 +53,10 @@ public:
  // Communicator should know nothing of the physics grid, only processor grid.
  ////////////////////////////////////////////
  int              _Nprocessors;     // How many in all
-  int              _processor;       // linear processor rank
-  unsigned long    _ndimension;
-  Coordinate _shm_processors;  // Which dimensions get relayed out over processors lanes.
  Coordinate _processors;      // Which dimensions get relayed out over processors lanes.
+  int              _processor;       // linear processor rank
  Coordinate _processor_coor;  // linear processor coordinate
+  unsigned long    _ndimension;
  static Grid_MPI_Comm      communicator_world;
  Grid_MPI_Comm             communicator;
  std::vector<Grid_MPI_Comm> communicator_halo;
@@ -98,16 +97,14 @@ public:
  int                      BossRank(void)          ;
  int                      ThisRank(void)          ;
  const Coordinate & ThisProcessorCoor(void) ;
-  const Coordinate & ShmGrid(void)  { return _shm_processors; }  ;
  const Coordinate & ProcessorGrid(void)     ;
-  int                ProcessorCount(void)    ;
+  int                      ProcessorCount(void)    ;

  ////////////////////////////////////////////////////////////////////////////////
  // very VERY rarely (Log, serial RNG) we need world without a grid
  ////////////////////////////////////////////////////////////////////////////////
  static int  RankWorld(void) ;
  static void BroadcastWorld(int root,void* data, int bytes);
-  static void BarrierWorld(void);
  
  ////////////////////////////////////////////////////////////
  // Reduction
@@ -131,21 +128,13 @@ public:
  template<class obj> void GlobalSum(obj &o){
    typedef typename obj::scalar_type scalar_type;
    int words = sizeof(obj)/sizeof(scalar_type);
-    scalar_type * ptr = (scalar_type *)& o; // Safe alias 
+    scalar_type * ptr = (scalar_type *)& o;
    GlobalSumVector(ptr,words);
  }
  
  ////////////////////////////////////////////////////////////
  // Face exchange, buffer swap in translational invariant way
  ////////////////////////////////////////////////////////////
-  void CommsComplete(std::vector<CommsRequest_t> &list);
-  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-			   void *xmit,
-			   int dest,
-			   void *recv,
-			   int from,
-			   int bytes,int dir);
-  
  void SendToRecvFrom(void *xmit,
 		      int xmit_to_rank,
 		      void *recv,
@@ -153,17 +142,17 @@ public:
 		      int bytes);
  
  double StencilSendToRecvFrom(void *xmit,
-			       int xmit_to_rank,int do_xmit,
+			       int xmit_to_rank,
 			       void *recv,
-			       int recv_from_rank,int do_recv,
+			       int recv_from_rank,
 			       int bytes,int dir);

  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 				    void *xmit,
-				    int xmit_to_rank,int do_xmit,
+				    int xmit_to_rank,
 				    void *recv,
-				    int recv_from_rank,int do_recv,
-				    int xbytes,int rbytes,int dir);
+				    int recv_from_rank,
+				    int bytes,int dir);
  
  
  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -106,7 +106,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
  // Remap using the shared memory optimising routine
  // The remap creates a comm which must be freed
  ////////////////////////////////////////////////////
-  GlobalSharedMemory::OptimalCommunicator    (processors,optimal_comm,_shm_processors);
+  GlobalSharedMemory::OptimalCommunicator    (processors,optimal_comm);
  InitFromMPICommunicator(processors,optimal_comm);
  SetCommunicator(optimal_comm);
  ///////////////////////////////////////////////////
@@ -124,13 +124,12 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
  int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
  Coordinate parent_processor_coor(_ndimension,0);
  Coordinate parent_processors    (_ndimension,1);
-  Coordinate shm_processors       (_ndimension,1);
+
  // Can make 5d grid from 4d etc...
  int pad = _ndimension-parent_ndimension;
  for(int d=0;d<parent_ndimension;d++){
    parent_processor_coor[pad+d]=parent._processor_coor[d];
    parent_processors    [pad+d]=parent._processors[d];
-    shm_processors       [pad+d]=parent._shm_processors[d];
  }

  //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -155,7 +154,6 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
    ccoor[d] = parent_processor_coor[d] % processors[d];
    scoor[d] = parent_processor_coor[d] / processors[d];
    ssize[d] = parent_processors[d]     / processors[d];
-    if ( processors[d] < shm_processors[d] ) shm_processors[d] = processors[d]; // subnode splitting.
  }

  // rank within subcomm ; srank is rank of subcomm within blocks of subcomms
@@ -306,44 +304,6 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N)
  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
 }
-
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-						void *xmit,
-						int dest,
-						void *recv,
-						int from,
-						int bytes,int dir)
-{
-  MPI_Request xrq;
-  MPI_Request rrq;
-
-  assert(dest != _processor);
-  assert(from != _processor);
-
-  int tag;
-
-  tag= dir+from*32;
-  int ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator,&rrq);
-  assert(ierr==0);
-  list.push_back(rrq);
-  
-  tag= dir+_processor*32;
-  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator,&xrq);
-  assert(ierr==0);
-  list.push_back(xrq);
-}
-void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list)
-{
-  int nreq=list.size();
-
-  if (nreq==0) return;
-
-  std::vector<MPI_Status> status(nreq);
-  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
-  assert(ierr==0);
-  list.resize(0);
-}
-
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFrom(void *xmit,
 					   int dest,
@@ -375,23 +335,23 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 }
 // Basic Halo comms primitive
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
-						     int dest, int dox,
+						     int dest,
 						     void *recv,
-						     int from, int dor,
+						     int from,
 						     int bytes,int dir)
 {
  std::vector<CommsRequest_t> list;
-  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
+  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
  StencilSendToRecvFromComplete(list,dir);
  return offbytes;
 }

 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
-							 int dest,int dox,
+							 int dest,
 							 void *recv,
-							 int from,int dor,
-							 int xbytes,int rbytes,int dir)
+							 int from,
+							 int bytes,int dir)
 {
  int ncomm  =communicator_halo.size();
  int commdir=dir%ncomm;
@@ -410,28 +370,30 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  double off_node_bytes=0.0;
  int tag;

-  if ( dor ) {
-    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
-      tag= dir+from*32;
-      ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
-      assert(ierr==0);
-      list.push_back(rrq);
-      off_node_bytes+=rbytes;
-    }
+  if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
+    tag= dir+from*32;
+    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+    assert(ierr==0);
+    list.push_back(rrq);
+    off_node_bytes+=bytes;
  }
-  
-  if (dox) {
-    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
-      tag= dir+_processor*32;
-      ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
-      assert(ierr==0);
-      list.push_back(xrq);
-      off_node_bytes+=xbytes;
-    } else {
-      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
-      assert(shm!=NULL);
-      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
-    }
+
+  if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
+    tag= dir+_processor*32;
+    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+    assert(ierr==0);
+    list.push_back(xrq);
+    off_node_bytes+=bytes;
+  } else {
+    // TODO : make a OMP loop on CPU, call threaded bcopy
+    void *shm = (void *) this->ShmBufferTranslate(dest,recv);
+    assert(shm!=NULL);
+    acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
+    acceleratorCopySynchronise(); // MPI prob slower
+  }
+
+  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
+    this->StencilSendToRecvFromComplete(list,dir);
  }

  return off_node_bytes;
@@ -473,10 +435,6 @@ int CartesianCommunicator::RankWorld(void){
  MPI_Comm_rank(communicator_world,&r);
  return r;
 }
-void CartesianCommunicator::BarrierWorld(void){
-  int ierr = MPI_Barrier(communicator_world);
-  assert(ierr==0);
-}
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
  int ierr= MPI_Bcast(data,
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@@ -45,14 +45,12 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
 CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank) 
  : CartesianCommunicator(processors) 
 {
-  _shm_processors = Coordinate(processors.size(),1);
  srank=0;
  SetCommunicator(communicator_world);
 }

 CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
 {
-  _shm_processors = Coordinate(processors.size(),1);
  _processors = processors;
  _ndimension = processors.size();  assert(_ndimension>=1);
  _processor_coor.resize(_ndimension);
@@ -91,17 +89,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 {
  assert(0);
 }
-void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-						void *xmit,
-						int dest,
-						void *recv,
-						int from,
-						int bytes,int dir)
-{
-  assert(0);
-}
-
 void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes)
 {
  bcopy(in,out,bytes*words);
@@ -115,7 +102,6 @@ int  CartesianCommunicator::RankWorld(void){return 0;}
 void CartesianCommunicator::Barrier(void){}
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
-void CartesianCommunicator::BarrierWorld(void) { }
 int  CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) {  return 0;}
 void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){  coor = _processor_coor; }
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
@@ -125,21 +111,21 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
 }

 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
-						     int xmit_to_rank,int dox,
+						     int xmit_to_rank,
 						     void *recv,
-						     int recv_from_rank,int dor,
+						     int recv_from_rank,
 						     int bytes, int dir)
 {
  return 2.0*bytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
-							 int xmit_to_rank,int dox,
+							 int xmit_to_rank,
 							 void *recv,
-							 int recv_from_rank,int dor,
-							 int xbytes,int rbytes, int dir)
+							 int recv_from_rank,
+							 int bytes, int dir)
 {
-  return xbytes+rbytes;
+  return 2.0*bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
--- a/Grid/communicator/SharedMemory.cc
+++ b/Grid/communicator/SharedMemory.cc
@@ -91,59 +91,6 @@ void *SharedMemory::ShmBufferSelf(void)
  //std::cerr << "ShmBufferSelf "<<ShmRank<<" "<<std::hex<< ShmCommBufs[ShmRank] <<std::dec<<std::endl;
  return ShmCommBufs[ShmRank];
 }
-static inline int divides(int a,int b)
-{
-  return ( b == ( (b/a)*a ) );
-}
-void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
-{
-  ////////////////////////////////////////////////////////////////
-  // Allow user to configure through environment variable
-  ////////////////////////////////////////////////////////////////
-  char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
-  if ( str ) {
-    std::vector<int> IntShmDims;
-    GridCmdOptionIntVector(std::string(str),IntShmDims);
-    assert(IntShmDims.size() == WorldDims.size());
-    long ShmSize = 1;
-    for (int dim=0;dim<WorldDims.size();dim++) {
-      ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
-      assert(divides(ShmDims[dim],WorldDims[dim]));
-    }
-    assert(ShmSize == WorldShmSize);
-    return;
-  }
-  
-  ////////////////////////////////////////////////////////////////
-  // Powers of 2,3,5 only in prime decomposition for now
-  ////////////////////////////////////////////////////////////////
-  int ndimension = WorldDims.size();
-  ShmDims=Coordinate(ndimension,1);
-
-  std::vector<int> primes({2,3,5});
-
-  int dim = 0;
-  int last_dim = ndimension - 1;
-  int AutoShmSize = 1;
-  while(AutoShmSize != WorldShmSize) {
-    int p;
-    for(p=0;p<primes.size();p++) {
-      int prime=primes[p];
-      if ( divides(prime,WorldDims[dim]/ShmDims[dim])
-        && divides(prime,WorldShmSize/AutoShmSize)  ) {
-  AutoShmSize*=prime;
-  ShmDims[dim]*=prime;
-  last_dim = dim;
-  break;
-      }
-    }
-    if (p == primes.size() && last_dim == dim) {
-      std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
-      exit(EXIT_FAILURE);
-    }
-    dim=(dim+1) %ndimension;
-  }
-}

 NAMESPACE_END(Grid); 

--- a/Grid/communicator/SharedMemory.h
+++ b/Grid/communicator/SharedMemory.h
@@ -93,10 +93,9 @@ public:
  // Create an optimal reordered communicator that makes MPI_Cart_create get it right
  //////////////////////////////////////////////////////////////////////////////////////
  static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
-  // Turns MPI_COMM_WORLD into right layout for Cartesian
-  static void OptimalCommunicator            (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims); 
-  static void OptimalCommunicatorHypercube   (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims); 
-  static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims); 
+  static void OptimalCommunicator            (const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
+  static void OptimalCommunicatorHypercube   (const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
+  static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
  static void GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims);
  ///////////////////////////////////////////////////
  // Provide shared memory facilities off comm world
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -27,8 +27,6 @@ Author: Christoph Lehner <christoph@lhnr.de>
 *************************************************************************************/
 /*  END LEGAL */

-#define Mheader "SharedMemoryMpi: "
-
 #include <Grid/GridCore.h>
 #include <pwd.h>

@@ -38,120 +36,12 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #ifdef GRID_HIP
 #include <hip/hip_runtime_api.h>
 #endif
-#ifdef GRID_SYCL
-#define GRID_SYCL_LEVEL_ZERO_IPC
-#include <syscall.h>
-#define SHM_SOCKETS 
-#endif
+#ifdef GRID_SYCl

-#include <sys/socket.h>
-#include <sys/un.h>
+#endif

 NAMESPACE_BEGIN(Grid); 
-
-#ifdef SHM_SOCKETS
-
-/*
- * Barbaric extra intranode communication route in case we need sockets to pass FDs
- * Forced by level_zero not being nicely designed
- */
-static int sock;
-static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d";
-static char sock_path[256];
-class UnixSockets {
-public:
-  static void Open(int rank)
-  {
-    int errnum;
-
-    sock = socket(AF_UNIX, SOCK_DGRAM, 0);  assert(sock>0);
-
-    struct sockaddr_un sa_un = { 0 };
-    sa_un.sun_family = AF_UNIX;
-    snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,rank);
-    unlink(sa_un.sun_path);
-    if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) {
-      perror("bind failure");
-      exit(EXIT_FAILURE);
-    }
-  }
-
-  static int RecvFileDescriptor(void)
-  {
-    int n;
-    int fd;
-    char buf[1];
-    struct iovec iov;
-    struct msghdr msg;
-    struct cmsghdr *cmsg;
-    char cms[CMSG_SPACE(sizeof(int))];
-
-    iov.iov_base = buf;
-    iov.iov_len = 1;
-
-    memset(&msg, 0, sizeof msg);
-    msg.msg_name = 0;
-    msg.msg_namelen = 0;
-    msg.msg_iov = &iov;
-    msg.msg_iovlen = 1;
-
-    msg.msg_control = (caddr_t)cms;
-    msg.msg_controllen = sizeof cms;
-
-    if((n=recvmsg(sock, &msg, 0)) < 0) {
-      perror("recvmsg failed");
-      return -1;
-    }
-    if(n == 0){
-      perror("recvmsg returned 0");
-      return -1;
-    }
-    cmsg = CMSG_FIRSTHDR(&msg);
-
-    memmove(&fd, CMSG_DATA(cmsg), sizeof(int));
-
-    return fd;
-  }
-
-  static void SendFileDescriptor(int fildes,int xmit_to_rank)
-  {
-    struct msghdr msg;
-    struct iovec iov;
-    struct cmsghdr *cmsg = NULL;
-    char ctrl[CMSG_SPACE(sizeof(int))];
-    char data = ' ';
-
-    memset(&msg, 0, sizeof(struct msghdr));
-    memset(ctrl, 0, CMSG_SPACE(sizeof(int)));
-    iov.iov_base = &data;
-    iov.iov_len = sizeof(data);
-    
-    sprintf(sock_path,sock_path_fmt,xmit_to_rank);
-    
-    struct sockaddr_un sa_un = { 0 };
-    sa_un.sun_family = AF_UNIX;
-    snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,xmit_to_rank);
-
-    msg.msg_name = (void *)&sa_un;
-    msg.msg_namelen = sizeof(sa_un);
-    msg.msg_iov = &iov;
-    msg.msg_iovlen = 1;
-    msg.msg_controllen =  CMSG_SPACE(sizeof(int));
-    msg.msg_control = ctrl;
-
-    cmsg = CMSG_FIRSTHDR(&msg);
-    cmsg->cmsg_level = SOL_SOCKET;
-    cmsg->cmsg_type = SCM_RIGHTS;
-    cmsg->cmsg_len = CMSG_LEN(sizeof(int));
-
-    *((int *) CMSG_DATA(cmsg)) = fildes;
-
-    sendmsg(sock, &msg, 0);
-  };
-};
-#endif
-
-
+#define header "SharedMemoryMpi: "
 /*Construct from an MPI communicator*/
 void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
 {
@@ -174,8 +64,8 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
  MPI_Comm_size(WorldShmComm     ,&WorldShmSize);

  if ( WorldRank == 0) {
-    std::cout << Mheader " World communicator of size " <<WorldSize << std::endl;  
-    std::cout << Mheader " Node  communicator of size " <<WorldShmSize << std::endl;
+    std::cout << header " World communicator of size " <<WorldSize << std::endl;  
+    std::cout << header " Node  communicator of size " <<WorldShmSize << std::endl;
  }
  // WorldShmComm, WorldShmSize, WorldShmRank

@@ -262,7 +152,7 @@ int Log2Size(int TwoToPower,int MAXLOG2)
  }
  return log2size;
 }
-void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
+void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
 {
  //////////////////////////////////////////////////////////////////////////////
  // Look and see if it looks like an HPE 8600 based on hostname conventions
@@ -275,11 +165,63 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
  gethostname(name,namelen);
  int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;

-  if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM);
-  else                          OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM);
+  if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm);
+  else                          OptimalCommunicatorSharedMemory(processors,optimal_comm);
 }
+static inline int divides(int a,int b)
+{
+  return ( b == ( (b/a)*a ) );
+}
+void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
+{
+  ////////////////////////////////////////////////////////////////
+  // Allow user to configure through environment variable
+  ////////////////////////////////////////////////////////////////
+  char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
+  if ( str ) {
+    std::vector<int> IntShmDims;
+    GridCmdOptionIntVector(std::string(str),IntShmDims);
+    assert(IntShmDims.size() == WorldDims.size());
+    long ShmSize = 1;
+    for (int dim=0;dim<WorldDims.size();dim++) {
+      ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
+      assert(divides(ShmDims[dim],WorldDims[dim]));
+    }
+    assert(ShmSize == WorldShmSize);
+    return;
+  }
+  
+  ////////////////////////////////////////////////////////////////
+  // Powers of 2,3,5 only in prime decomposition for now
+  ////////////////////////////////////////////////////////////////
+  int ndimension = WorldDims.size();
+  ShmDims=Coordinate(ndimension,1);

-void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
+  std::vector<int> primes({2,3,5});
+
+  int dim = 0;
+  int last_dim = ndimension - 1;
+  int AutoShmSize = 1;
+  while(AutoShmSize != WorldShmSize) {
+    int p;
+    for(p=0;p<primes.size();p++) {
+      int prime=primes[p];
+      if ( divides(prime,WorldDims[dim]/ShmDims[dim])
+        && divides(prime,WorldShmSize/AutoShmSize)  ) {
+	AutoShmSize*=prime;
+	ShmDims[dim]*=prime;
+	last_dim = dim;
+	break;
+      }
+    }
+    if (p == primes.size() && last_dim == dim) {
+      std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    dim=(dim+1) %ndimension;
+  }
+}
+void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
 {
  ////////////////////////////////////////////////////////////////
  // Assert power of two shm_size.
@@ -352,8 +294,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
  Coordinate HyperCoor(ndimension);

  GetShmDims(WorldDims,ShmDims);
-  SHM = ShmDims;
-  
+
  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
  ////////////////////////////////////////////////////////////////
@@ -400,7 +341,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
  int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
  assert(ierr==0);
 }
-void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
+void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
 {
  ////////////////////////////////////////////////////////////////
  // Identify subblock of ranks on node spreading across dims
@@ -412,8 +353,6 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
  Coordinate ShmCoor(ndimension);    Coordinate NodeCoor(ndimension);   Coordinate WorldCoor(ndimension);

  GetShmDims(WorldDims,ShmDims);
-  SHM=ShmDims;
-
  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
  ////////////////////////////////////////////////////////////////
@@ -452,7 +391,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
 #ifdef GRID_MPI3_SHMGET
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
-  std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
+  std::cout << header "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);

@@ -537,7 +476,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    exit(EXIT_FAILURE);  
  }

-  std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
+  std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
 	    << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;

  SharedMemoryZero(ShmCommBuf,bytes);
@@ -580,21 +519,16 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    exit(EXIT_FAILURE);  
  }
  if ( WorldRank == 0 ){
-    std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
-	      << "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
+    std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
+	      << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
  }
  SharedMemoryZero(ShmCommBuf,bytes);
  std::cout<< "Setting up IPC"<<std::endl;
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Loop over ranks/gpu's on our node
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-#ifdef SHM_SOCKETS
-  UnixSockets::Open(WorldShmRank);
-#endif
  for(int r=0;r<WorldShmSize;r++){

-    MPI_Barrier(WorldShmComm);
-
 #ifndef GRID_MPI3_SHM_NONE
    //////////////////////////////////////////////////
    // If it is me, pass around the IPC access key
@@ -602,32 +536,24 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    void * thisBuf = ShmCommBuf;
    if(!Stencil_force_mpi) {
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
-    typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
+    typedef struct { int fd; pid_t pid ; } clone_mem_t;

-    auto zeDevice    = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
-    auto zeContext   = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
+    auto zeDevice    = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
+    auto zeContext   = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
      
    ze_ipc_mem_handle_t ihandle;
    clone_mem_t handle;
-    
+
    if ( r==WorldShmRank ) { 
      auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle);
      if ( err != ZE_RESULT_SUCCESS ) {
-	std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
+	std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
 	exit(EXIT_FAILURE);
      } else {
 	std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
      }
      memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
      handle.pid = getpid();
-      memcpy((void *)&handle.ze,(void *)&ihandle,sizeof(ihandle));
-#ifdef SHM_SOCKETS
-      for(int rr=0;rr<WorldShmSize;rr++){
-	if(rr!=r){
-	  UnixSockets::SendFileDescriptor(handle.fd,rr);
-	}
-      }
-#endif
    }
 #endif
 #ifdef GRID_CUDA
@@ -655,7 +581,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    // Share this IPC handle across the Shm Comm
    //////////////////////////////////////////////////
    { 
-      MPI_Barrier(WorldShmComm);
      int ierr=MPI_Bcast(&handle,
 			 sizeof(handle),
 			 MPI_BYTE,
@@ -671,10 +596,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
    if ( r!=WorldShmRank ) {
      thisBuf = nullptr;
-      int myfd;
-#ifdef SHM_SOCKETS
-      myfd=UnixSockets::RecvFileDescriptor();
-#else
      std::cout<<"mapping seeking remote pid/fd "
 	       <<handle.pid<<"/"
 	       <<handle.fd<<std::endl;
@@ -682,22 +603,16 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
      int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
      std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
      //      int myfd  = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
-      myfd  = syscall(438,pidfd,handle.fd,0);
-      int err_t = errno;
-      if (myfd < 0) {
-        fprintf(stderr,"pidfd_getfd returned %d errno was %d\n", myfd,err_t); fflush(stderr);
-	perror("pidfd_getfd failed ");
-	assert(0);
-      }
-#endif
-      std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
-      memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
+      int myfd  = syscall(438,pidfd,handle.fd,0);
+
+      std::cout<<"Using IpcHandle myfd "<<myfd<<"\n";
+      
      memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));

      auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf);
      if ( err != ZE_RESULT_SUCCESS ) {
-	std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
-	std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; 
+	std::cout << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
+	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; 
 	exit(EXIT_FAILURE);
      } else {
 	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
@@ -732,7 +647,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #else
    WorldShmCommBufs[r] = ShmCommBuf;
 #endif
-    MPI_Barrier(WorldShmComm);
  }

  _ShmAllocBytes=bytes;
@@ -744,7 +658,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef GRID_MPI3_SHMMMAP
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
-  std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
+  std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -781,7 +695,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    assert(((uint64_t)ptr&0x3F)==0);
    close(fd);
    WorldShmCommBufs[r] =ptr;
-    //    std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
+    //    std::cout << header "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
  }
  _ShmAlloc=1;
  _ShmAllocBytes  = bytes;
@@ -791,7 +705,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef GRID_MPI3_SHM_NONE
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
-  std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
+  std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -838,7 +752,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ////////////////////////////////////////////////////////////////////////////////////////////
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 { 
-  std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
+  std::cout << header "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0); 
  MPI_Barrier(WorldShmComm);
--- a/Grid/communicator/SharedMemoryNone.cc
+++ b/Grid/communicator/SharedMemoryNone.cc
@@ -48,10 +48,9 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
  _ShmSetup=1;
 }

-void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
+void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
 {
  optimal_comm = WorldComm;
-  SHM = Coordinate(processors.size(),1);
 }

 ////////////////////////////////////////////////////////////////////////////////////////////
--- a/Grid/cshift/Cshift_common.h
+++ b/Grid/cshift/Cshift_common.h
@@ -297,30 +297,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
  }
 }

-#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
-
-template <typename T>
-T iDivUp(T a, T b) // Round a / b to nearest higher integer value
-{ return (a % b != 0) ? (a / b + 1) : (a / b); }
-
-template <typename T>
-__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride)
-{
-    int idx = blockIdx.x*blockDim.x + threadIdx.x;
-    if (idx >= e1*e2) return;
-
-    int n, b, o;
-
-    n = idx / e2;
-    b = idx % e2;
-    o = n*stride + b;
-
-    vector[2*idx + 0] = lo + o;
-    vector[2*idx + 1] = ro + o;
-}
-
-#endif
-
 //////////////////////////////////////////////////////
 // local to node block strided copies
 //////////////////////////////////////////////////////
@@ -345,20 +321,12 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
  int ent=0;

  if(cbmask == 0x3 ){
-#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
-    ent = e1*e2;
-    dim3 blockSize(acceleratorThreads());
-    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
-    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
-    accelerator_barrier();
-#else
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int o =n*stride+b;
 	Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
      }
    }
-#endif
  } else { 
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
@@ -409,19 +377,11 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
  int ent=0;

  if ( cbmask == 0x3 ) {
-#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
-    ent = e1*e2;
-    dim3 blockSize(acceleratorThreads());
-    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
-    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
-    accelerator_barrier();
-#else
    for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
      int o  =n*stride;
      Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
    }}
-#endif
  } else {
    for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
--- a/Grid/json/json.hpp
+++ b/Grid/json/json.hpp
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@@ -47,4 +47,3 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
 #include <Grid/lattice/Lattice_crc.h>
-#include <Grid/lattice/PaddedCell.h>
--- a/Grid/lattice/Lattice_ET.h
+++ b/Grid/lattice/Lattice_ET.h
@@ -63,7 +63,7 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate,
  typename std::remove_const<vobj>::type ret;

  typedef typename vobj::scalar_object scalar_object;
-  //  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;

  const int Nsimd = vobj::vector_type::Nsimd();
@@ -345,9 +345,7 @@ GridUnopClass(UnaryNot, Not(a));
 GridUnopClass(UnaryTrace, trace(a));
 GridUnopClass(UnaryTranspose, transpose(a));
 GridUnopClass(UnaryTa, Ta(a));
-GridUnopClass(UnarySpTa, SpTa(a));
 GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
-GridUnopClass(UnaryProjectOnSpGroup, ProjectOnSpGroup(a));
 GridUnopClass(UnaryTimesI, timesI(a));
 GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
 GridUnopClass(UnaryAbs, abs(a));
@@ -458,9 +456,7 @@ GRID_DEF_UNOP(operator!, UnaryNot);
 GRID_DEF_UNOP(trace, UnaryTrace);
 GRID_DEF_UNOP(transpose, UnaryTranspose);
 GRID_DEF_UNOP(Ta, UnaryTa);
-GRID_DEF_UNOP(SpTa, UnarySpTa);
 GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
-GRID_DEF_UNOP(ProjectOnSpGroup, UnaryProjectOnSpGroup);
 GRID_DEF_UNOP(timesI, UnaryTimesI);
 GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
 GRID_DEF_UNOP(abs, UnaryAbs);  // abs overloaded in cmath C++98; DON'T do the
--- a/Grid/lattice/Lattice_arith.h
+++ b/Grid/lattice/Lattice_arith.h
@@ -36,7 +36,6 @@ NAMESPACE_BEGIN(Grid);
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
-  GRID_TRACE("mult");
  ret.Checkerboard() = lhs.Checkerboard();
  autoView( ret_v , ret, AcceleratorWrite);
  autoView( lhs_v , lhs, AcceleratorRead);
@@ -54,7 +53,6 @@ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  
 template<class obj1,class obj2,class obj3> inline
 void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
-  GRID_TRACE("mac");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
@@ -72,7 +70,6 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  
 template<class obj1,class obj2,class obj3> inline
 void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
-  GRID_TRACE("sub");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
@@ -89,7 +86,6 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
 }
 template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
-  GRID_TRACE("add");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
@@ -110,7 +106,6 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
-  GRID_TRACE("mult");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(lhs,ret);
  autoView( ret_v , ret, AcceleratorWrite);
@@ -124,7 +119,6 @@ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  
 template<class obj1,class obj2,class obj3> inline
 void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
-  GRID_TRACE("mac");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,lhs);
  autoView( ret_v , ret, AcceleratorWrite);
@@ -139,7 +133,6 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  
 template<class obj1,class obj2,class obj3> inline
 void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
-  GRID_TRACE("sub");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,lhs);
  autoView( ret_v , ret, AcceleratorWrite);
@@ -153,7 +146,6 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
 }
 template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
-  GRID_TRACE("add");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(lhs,ret);
  autoView( ret_v , ret, AcceleratorWrite);
@@ -171,7 +163,6 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
-  GRID_TRACE("mult");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@@ -186,7 +177,6 @@ void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  
 template<class obj1,class obj2,class obj3> inline
 void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
-  GRID_TRACE("mac");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@@ -201,7 +191,6 @@ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  
 template<class obj1,class obj2,class obj3> inline
 void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
-  GRID_TRACE("sub");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@@ -215,7 +204,6 @@ void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
 }
 template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
-  GRID_TRACE("add");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@@ -230,7 +218,6 @@ void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  
 template<class sobj,class vobj> inline
 void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
-  GRID_TRACE("axpy");
  ret.Checkerboard() = x.Checkerboard();
  conformable(ret,x);
  conformable(x,y);
@@ -244,7 +231,6 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
 }
 template<class sobj,class vobj> inline
 void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
-  GRID_TRACE("axpby");
  ret.Checkerboard() = x.Checkerboard();
  conformable(ret,x);
  conformable(x,y);
@@ -260,13 +246,11 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
 template<class sobj,class vobj> inline
 RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
 {
-  GRID_TRACE("axpy_norm");
    return axpy_norm_fast(ret,a,x,y);
 }
 template<class sobj,class vobj> inline
 RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
 {
-  GRID_TRACE("axpby_norm");
    return axpby_norm_fast(ret,a,b,x,y);
 }

--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -88,13 +88,6 @@ public:
    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
    accessor.ViewClose();
  }
-
-  // Helper function to print the state of this object in the AccCache
-  void PrintCacheState(void)
-  {
-    MemoryManager::PrintState(this->_odata);
-  }
-
  /////////////////////////////////////////////////////////////////////////////////
  // Return a view object that may be dereferenced in site loops.
  // The view is trivially copy constructible and may be copied to an accelerator device
@@ -117,7 +110,6 @@ public:
  ////////////////////////////////////////////////////////////////////////////////
  template <typename Op, typename T1> inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
  {
-    GRID_TRACE("ExpressionTemplateEval");
    GridBase *egrid(nullptr);
    GridFromExpression(egrid,expr);
    assert(egrid!=nullptr);
@@ -141,7 +133,6 @@ public:
  }
  template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
  {
-    GRID_TRACE("ExpressionTemplateEval");
    GridBase *egrid(nullptr);
    GridFromExpression(egrid,expr);
    assert(egrid!=nullptr);
@@ -165,7 +156,6 @@ public:
  }
  template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
  {
-    GRID_TRACE("ExpressionTemplateEval");
    GridBase *egrid(nullptr);
    GridFromExpression(egrid,expr);
    assert(egrid!=nullptr);
@@ -291,8 +281,8 @@ public:
    typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
    conformable(*this,r);
    this->checkerboard = r.Checkerboard();
-    auto him= r.View(AcceleratorRead);
    auto me =   View(AcceleratorWriteDiscard);
+    auto him= r.View(AcceleratorRead);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      coalescedWrite(me[ss],him(ss));
    });
@@ -306,8 +296,8 @@ public:
  inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
    this->checkerboard = r.Checkerboard();
    conformable(*this,r);
-    auto him= r.View(AcceleratorRead);
    auto me =   View(AcceleratorWriteDiscard);
+    auto him= r.View(AcceleratorRead);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      coalescedWrite(me[ss],him(ss));
    });
@@ -360,7 +350,7 @@ public:

 template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
  typedef typename vobj::scalar_object sobj;
-  for(int64_t g=0;g<o.Grid()->_gsites;g++){
+  for(int g=0;g<o.Grid()->_gsites;g++){

    Coordinate gcoor;
    o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);
--- a/Grid/lattice/Lattice_crc.h
+++ b/Grid/lattice/Lattice_crc.h
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-template<class vobj> void DumpSliceNorm(std::string s,const Lattice<vobj> &f,int mu=-1)
+template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1)
 {
  auto ff = localNorm2(f);
  if ( mu==-1 ) mu = f.Grid()->Nd()-1;
--- a/Grid/lattice/Lattice_matrix_reduction.h
+++ b/Grid/lattice/Lattice_matrix_reduction.h
@@ -32,6 +32,7 @@ template<class vobj>
 static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
 {    
  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;

  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@@ -81,6 +82,7 @@ template<class vobj>
 static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0) 
 {    
  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;

  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@@ -128,6 +130,7 @@ template<class vobj>
 static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
 {
  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  
  GridBase *FullGrid  = lhs.Grid();
--- a/Grid/lattice/Lattice_peekpoke.h
+++ b/Grid/lattice/Lattice_peekpoke.h
@@ -96,6 +96,9 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){

  GridBase *grid=l.Grid();

+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
  int Nsimd = grid->Nsimd();

  assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
@@ -122,17 +125,14 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
 //////////////////////////////////////////////////////////
 // Peek a scalar object from the SIMD array
 //////////////////////////////////////////////////////////
-template<class vobj>
-typename vobj::scalar_object peekSite(const Lattice<vobj> &l,const Coordinate &site){
-  typename vobj::scalar_object s;
-  peekSite(s,l,site);
-  return s;
-}        
 template<class vobj,class sobj>
 void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
        
  GridBase *grid=l.Grid();

+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
  int Nsimd = grid->Nsimd();

  assert( l.Checkerboard() == l.Grid()->CheckerBoard(site));
@@ -173,11 +173,11 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
  idx= grid->iIndex(site);
  odx= grid->oIndex(site);
  
-  const vector_type *vp = (const vector_type *) &l[odx];
+  scalar_type * vp = (scalar_type *)&l[odx];
  scalar_type * pt = (scalar_type *)&s;
      
  for(int w=0;w<words;w++){
-    pt[w] = getlane(vp[w],idx);
+    pt[w] = vp[idx+w*Nsimd];
  }
      
  return;
@@ -210,10 +210,10 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
  idx= grid->iIndex(site);
  odx= grid->oIndex(site);

-  vector_type * vp = (vector_type *)&l[odx];
+  scalar_type * vp = (scalar_type *)&l[odx];
  scalar_type * pt = (scalar_type *)&s;
  for(int w=0;w<words;w++){
-    putlane(vp[w],pt[w],idx);
+    vp[idx+w*Nsimd] = pt[w];
  }
  return;
 };
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -28,9 +28,6 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #if defined(GRID_CUDA)||defined(GRID_HIP)
 #include <Grid/lattice/Lattice_reduction_gpu.h>
 #endif
-#if defined(GRID_SYCL)
-#include <Grid/lattice/Lattice_reduction_sycl.h>
-#endif

 NAMESPACE_BEGIN(Grid);

@@ -94,7 +91,10 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
  for(int i=0;i<nthread;i++){
    ssum = ssum+sumarray[i];
  } 
-  return ssum;
+  
+  typedef typename vobj::scalar_object ssobj;
+  ssobj ret = ssum;
+  return ret;
 }
 /*
 Threaded max, don't use for now
@@ -127,7 +127,7 @@ inline Double max(const Double *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
+#if defined(GRID_CUDA)||defined(GRID_HIP)
  return sum_gpu(arg,osites);
 #else
  return sum_cpu(arg,osites);
@@ -136,61 +136,25 @@ inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
+#if defined(GRID_CUDA)||defined(GRID_HIP)
  return sumD_gpu(arg,osites);
 #else
  return sumD_cpu(arg,osites);
 #endif  
 }
-template<class vobj>
-inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
-{
-#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
-  return sumD_gpu_large(arg,osites);
-#else
-  return sumD_cpu(arg,osites);
-#endif  
-}
-
-template<class vobj>
-inline typename vobj::scalar_object rankSum(const Lattice<vobj> &arg)
-{
-  Integer osites = arg.Grid()->oSites();
-#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
-  autoView( arg_v, arg, AcceleratorRead);
-  return sum_gpu(&arg_v[0],osites);
-#else
-  autoView(arg_v, arg, CpuRead);
-  return sum_cpu(&arg_v[0],osites);
-#endif  
-}

 template<class vobj>
 inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 {
-  auto ssum = rankSum(arg);
-  arg.Grid()->GlobalSum(ssum);
-  return ssum;
-}
-
-template<class vobj>
-inline typename vobj::scalar_object rankSumLarge(const Lattice<vobj> &arg)
-{
-#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
+#if defined(GRID_CUDA)||defined(GRID_HIP)
  autoView( arg_v, arg, AcceleratorRead);
  Integer osites = arg.Grid()->oSites();
-  return sum_gpu_large(&arg_v[0],osites);
+  auto ssum= sum_gpu(&arg_v[0],osites);
 #else
  autoView(arg_v, arg, CpuRead);
  Integer osites = arg.Grid()->oSites();
-  return sum_cpu(&arg_v[0],osites);
-#endif
-}
-
-template<class vobj>
-inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
-{
-  auto ssum = rankSumLarge(arg);
+  auto ssum= sum_cpu(&arg_v[0],osites);
+#endif  
  arg.Grid()->GlobalSum(ssum);
  return ssum;
 }
@@ -203,27 +167,6 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
  return real(nrm); 
 }

-
-template<class Op,class T1>
-inline auto norm2(const LatticeUnaryExpression<Op,T1> & expr)  ->RealD
-{
-  return norm2(closure(expr));
-}
-
-template<class Op,class T1,class T2>
-inline auto norm2(const LatticeBinaryExpression<Op,T1,T2> & expr)      ->RealD
-{
-  return norm2(closure(expr));
-}
-
-
-template<class Op,class T1,class T2,class T3>
-inline auto norm2(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)      ->RealD
-{
-  return norm2(closure(expr));
-}
-
-
 //The global maximum of the site norm2
 template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
 {
@@ -254,6 +197,7 @@ template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
 template<class vobj>
 inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
 {
+  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_typeD vector_type;
  ComplexD  nrm;
  
@@ -263,40 +207,24 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
  const uint64_t sites = grid->oSites();
  
  // Might make all code paths go this way.
-#if 0
  typedef decltype(innerProductD(vobj(),vobj())) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
-  {
-    autoView( left_v , left, AcceleratorRead);
-    autoView( right_v,right, AcceleratorRead);
-    // This code could read coalesce
-    // GPU - SIMT lane compliance...
-    accelerator_for( ss, sites, nsimd,{
-	auto x_l = left_v(ss);
-	auto y_l = right_v(ss);
-	coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l));
-    });
-  }
-#else
-  typedef decltype(innerProduct(vobj(),vobj())) inner_t;
-  Vector<inner_t> inner_tmp(sites);
-  auto inner_tmp_v = &inner_tmp[0];
    
  {
    autoView( left_v , left, AcceleratorRead);
    autoView( right_v,right, AcceleratorRead);

    // GPU - SIMT lane compliance...
-    accelerator_for( ss, sites, nsimd,{
-	auto x_l = left_v(ss);
-	auto y_l = right_v(ss);
-	coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
+    accelerator_for( ss, sites, 1,{
+	auto x_l = left_v[ss];
+	auto y_l = right_v[ss];
+	inner_tmp_v[ss]=innerProductD(x_l,y_l);
    });
  }
-#endif
+
  // This is in single precision and fails some tests
-  auto anrm = sumD(inner_tmp_v,sites);  
+  auto anrm = sum(inner_tmp_v,sites);  
  nrm = anrm;
  return nrm;
 }
@@ -329,7 +257,8 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
  conformable(z,x);
  conformable(x,y);

-  //  typedef typename vobj::vector_typeD vector_type;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_typeD vector_type;
  RealD  nrm;
  
  GridBase *grid = x.Grid();
@@ -341,29 +270,17 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
  autoView( x_v, x, AcceleratorRead);
  autoView( y_v, y, AcceleratorRead);
  autoView( z_v, z, AcceleratorWrite);
-#if 0
+
  typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];

-  accelerator_for( ss, sites, nsimd,{
-      auto tmp = a*x_v(ss)+b*y_v(ss);
-      coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp));
-      coalescedWrite(z_v[ss],tmp);
+  accelerator_for( ss, sites, 1,{
+      auto tmp = a*x_v[ss]+b*y_v[ss];
+      inner_tmp_v[ss]=innerProductD(tmp,tmp);
+      z_v[ss]=tmp;
  });
  nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
-#else
-  typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
-  Vector<inner_t> inner_tmp(sites);
-  auto inner_tmp_v = &inner_tmp[0];
-
-  accelerator_for( ss, sites, nsimd,{
-      auto tmp = a*x_v(ss)+b*y_v(ss);
-      coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
-      coalescedWrite(z_v[ss],tmp);
-  });
-  nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
-#endif
  grid->GlobalSum(nrm);
  return nrm; 
 }
@@ -373,6 +290,7 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
 {
  conformable(left,right);

+  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_typeD vector_type;
  Vector<ComplexD> tmp(2);

@@ -516,14 +434,6 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  int words = fd*sizeof(sobj)/sizeof(scalar_type);
  grid->GlobalSumVector(ptr, words);
 }
-template<class vobj> inline
-std::vector<typename vobj::scalar_object> 
-sliceSum(const Lattice<vobj> &Data,int orthogdim)
-{
-  std::vector<typename vobj::scalar_object> result;
-  sliceSum(Data,result,orthogdim);
-  return result;
-}

 template<class vobj>
 static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) 
@@ -628,8 +538,7 @@ static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Ortho
 template<class vobj>
 static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
 			    int orthogdim,RealD scale=1.0) 
-{
-  // perhaps easier to just promote A to a field and use regular madd
+{    
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
@@ -660,7 +569,8 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
    for(int l=0;l<Nsimd;l++){
      grid->iCoorFromIindex(icoor,l);
      int ldx =r+icoor[orthogdim]*rd;
-      av.putlane(scalar_type(a[ldx])*zscale,l);
+      scalar_type *as =(scalar_type *)&av;
+      as[l] = scalar_type(a[ldx])*zscale;
    }

    tensor_reduced at; at=av;
@@ -700,6 +610,7 @@ template<class vobj>
 static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
 {    
  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;

  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@@ -753,6 +664,7 @@ template<class vobj>
 static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0) 
 {    
  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;

  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@@ -806,6 +718,7 @@ template<class vobj>
 static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
 {
  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  
  GridBase *FullGrid  = lhs.Grid();
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -23,27 +23,28 @@ unsigned int nextPow2(Iterator x) {
 }

 template <class Iterator>
-int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
+void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
  
  int device;
 #ifdef GRID_CUDA
  cudaGetDevice(&device);
 #endif
 #ifdef GRID_HIP
-  auto discard=hipGetDevice(&device);
+  hipGetDevice(&device);
 #endif
  
  Iterator warpSize            = gpu_props[device].warpSize;
  Iterator sharedMemPerBlock   = gpu_props[device].sharedMemPerBlock;
  Iterator maxThreadsPerBlock  = gpu_props[device].maxThreadsPerBlock;
  Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;
-  /*  
+  
  std::cout << GridLogDebug << "GPU has:" << std::endl;
  std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl;
  std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl;
  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl;
+  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << warpSize << std::endl;
  std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
-  */  
+  
  if (warpSize != WARP_SIZE) {
    std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
    exit(EXIT_FAILURE);
@@ -51,14 +52,10 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &
  
  // let the number of threads in a block be a multiple of 2, starting from warpSize
  threads = warpSize;
-  if ( threads*sizeofsobj > sharedMemPerBlock ) {
-    std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
-    return 0;
-  }
  while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
  // keep all the streaming multiprocessors busy
  blocks = nextPow2(multiProcessorCount);
-  return 1;
+  
 }

 template <class sobj, class Iterator>
@@ -198,7 +195,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
 // Possibly promote to double and sum
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites) 
+inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites) 
 {
  typedef typename vobj::scalar_objectD sobj;
  typedef decltype(lat) Iterator;
@@ -207,77 +204,17 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
  Integer size = osites*nsimd;

  Integer numThreads, numBlocks;
-  int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
-  assert(ok);
-
+  getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
  Integer smemSize = numThreads * sizeof(sobj);
-  // Move out of UVM
-  // Turns out I had messed up the synchronise after move to compute stream
-  // as running this on the default stream fools the synchronise
-#undef UVM_BLOCK_BUFFER  
-#ifndef UVM_BLOCK_BUFFER  
-  commVector<sobj> buffer(numBlocks);
-  sobj *buffer_v = &buffer[0];
-  sobj result;
-  reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
-  accelerator_barrier();
-  acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
-#else
+
  Vector<sobj> buffer(numBlocks);
  sobj *buffer_v = &buffer[0];
-  sobj result;
-  reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
+  
+  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
  accelerator_barrier();
-  result = *buffer_v;
-#endif
+  auto result = buffer_v[0];
  return result;
 }
-
-template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
-{
-  typedef typename vobj::vector_type  vector;
-  typedef typename vobj::scalar_typeD scalarD;
-  typedef typename vobj::scalar_objectD sobj;
-  sobj ret;
-  scalarD *ret_p = (scalarD *)&ret;
-  
-  const int words = sizeof(vobj)/sizeof(vector);
-
-  Vector<vector> buffer(osites);
-  vector *dat = (vector *)lat;
-  vector *buf = &buffer[0];
-  iScalar<vector> *tbuf =(iScalar<vector> *)  &buffer[0];
-  for(int w=0;w<words;w++) {
-
-    accelerator_for(ss,osites,1,{
-	buf[ss] = dat[ss*words+w];
-      });
-      
-    ret_p[w] = sumD_gpu_small(tbuf,osites);
-  }
-  return ret;
-}
-
-template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
-{
-  typedef typename vobj::scalar_objectD sobj;
-  sobj ret;
-  
-  Integer nsimd= vobj::Nsimd();
-  Integer size = osites*nsimd;
-  Integer numThreads, numBlocks;
-  int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
-  
-  if ( ok ) {
-    ret = sumD_gpu_small(lat,osites);
-  } else {
-    ret = sumD_gpu_large(lat,osites);
-  }
-  return ret;
-}
-
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Return as same precision as input performing reduction in double precision though
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -290,13 +227,6 @@ inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
  return result;
 }

-template <class vobj>
-inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
-{
-  typedef typename vobj::scalar_object sobj;
-  sobj result;
-  result = sumD_gpu_large(lat,osites);
-  return result;
-}
+

 NAMESPACE_END(Grid);
--- a/Grid/lattice/Lattice_reduction_sycl.h
+++ b/Grid/lattice/Lattice_reduction_sycl.h
@@ -1,125 +0,0 @@
-NAMESPACE_BEGIN(Grid);
-
-/////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Possibly promote to double and sum
-/////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites) 
-{
-  typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_objectD sobjD;
-  sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator);
-  sobj identity; zeroit(identity);
-  sobj ret ; 
-
-  Integer nsimd= vobj::Nsimd();
-  
-  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-     auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>());
-     cgh.parallel_for(cl::sycl::range<1>{osites},
-		      Reduction,
-		      [=] (cl::sycl::id<1> item, auto &sum) {
-      auto osite   = item[0];
-      sum +=Reduce(lat[osite]);
-     });
-   });
-  theGridAccelerator->wait();
-  ret = mysum[0];
-  free(mysum,*theGridAccelerator);
-  sobjD dret; convertType(dret,ret);
-  return dret;
-}
-
-template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
-{
-  return sumD_gpu_tensor(lat,osites);
-}
-template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
-{
-  return sumD_gpu_large(lat,osites);
-}
-
-template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
-{
-  return sumD_gpu_large(lat,osites);
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Return as same precision as input performing reduction in double precision though
-/////////////////////////////////////////////////////////////////////////////////////////////////////////
-template <class vobj>
-inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites) 
-{
-  typedef typename vobj::scalar_object sobj;
-  sobj result;
-  result = sumD_gpu(lat,osites);
-  return result;
-}
-
-template <class vobj>
-inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
-{
-  typedef typename vobj::scalar_object sobj;
-  sobj result;
-  result = sumD_gpu_large(lat,osites);
-  return result;
-}
-
-NAMESPACE_END(Grid);
-
-/*
-template<class Double> Double svm_reduce(Double *vec,uint64_t L)
-{
-  Double sumResult; zeroit(sumResult);
-  Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator);
-  Double identity;  zeroit(identity);
-  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-     auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>());
-     cgh.parallel_for(cl::sycl::range<1>{L},
-		      Reduction,
-		      [=] (cl::sycl::id<1> index, auto &sum) {
-	 sum +=vec[index];
-     });
-   });
-  theGridAccelerator->wait();
-  Double ret = d_sum[0];
-  free(d_sum,*theGridAccelerator);
-  std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl;
-  return ret;
-}
-
-template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
-{
-  typedef typename vobj::vector_type  vector;
-  typedef typename vobj::scalar_type  scalar;
-
-  typedef typename vobj::scalar_typeD scalarD;
-  typedef typename vobj::scalar_objectD sobjD;
-
-  sobjD ret;
-  scalarD *ret_p = (scalarD *)&ret;
-  
-  const int nsimd = vobj::Nsimd();
-  const int words = sizeof(vobj)/sizeof(vector);
-
-  Vector<scalar> buffer(osites*nsimd);
-  scalar *buf = &buffer[0];
-  vector *dat = (vector *)lat;
-
-  for(int w=0;w<words;w++) {
-
-    accelerator_for(ss,osites,nsimd,{
-	int lane = acceleratorSIMTlane(nsimd);
-	buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
-    });
-    //Precision change at this point is to late to gain precision
-    ret_p[w] = svm_reduce(buf,nsimd*osites);
-  }
-  return ret;
-}
-*/
--- a/Grid/lattice/Lattice_rng.h
+++ b/Grid/lattice/Lattice_rng.h
@@ -32,8 +32,9 @@
 #include <random>

 #ifdef RNG_SITMO
-#include <Grid/sitmo_rng/sitmo_prng_engine.hpp>
+#include <Grid/random/sitmo_prng_engine.hpp>
 #endif 
+#include <Grid/random/gaussian.h>

 #if defined(RNG_SITMO)
 #define RNG_FAST_DISCARD
@@ -142,7 +143,7 @@ public:

  std::vector<RngEngine>                             _generators;
  std::vector<std::uniform_real_distribution<RealD> > _uniform;
-  std::vector<std::normal_distribution<RealD> >       _gaussian;
+  std::vector<Grid::gaussian_distribution<RealD> >    _gaussian;
  std::vector<std::discrete_distribution<int32_t> >   _bernoulli;
  std::vector<std::uniform_int_distribution<uint32_t> > _uid;

@@ -243,7 +244,7 @@ public:
  GridSerialRNG() : GridRNGbase() {
    _generators.resize(1);
    _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
-    _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
+    _gaussian.resize(1,gaussian_distribution<RealD>(0.0,1.0) );
    _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
    _uid.resize(1,std::uniform_int_distribution<uint32_t>() );
  }
@@ -357,18 +358,13 @@ public:

    _generators.resize(_vol);
    _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
-    _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
+    _gaussian.resize(_vol,gaussian_distribution<RealD>(0.0,1.0) );
    _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
    _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
  }
-  template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist)
-  {
-    if ( l.Grid()->_isCheckerBoarded ) {
-      Lattice<vobj> tmp(_grid);
-      fill(tmp,dist);
-      pickCheckerboard(l.Checkerboard(),l,tmp);
-      return;
-    }
+
+  template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){
+
    typedef typename vobj::scalar_object scalar_object;
    typedef typename vobj::scalar_type scalar_type;
    typedef typename vobj::vector_type vector_type;
@@ -429,33 +425,9 @@ public:
    // MT implementation does not implement fast discard even though
    // in principle this is possible
    ////////////////////////////////////////////////
-#if 1
-    thread_for( lidx, _grid->lSites(), {

-	int64_t gidx;
-	int o_idx;
-	int i_idx;
-	int rank;
-	Coordinate pcoor;
-	Coordinate lcoor;
-	Coordinate gcoor;
-	_grid->LocalIndexToLocalCoor(lidx,lcoor);
-	pcoor=_grid->ThisProcessorCoor();
-	_grid->ProcessorCoorLocalCoorToGlobalCoor(pcoor,lcoor,gcoor);
-	_grid->GlobalCoorToGlobalIndex(gcoor,gidx);
-
-	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
-
-	assert(rank == _grid->ThisRank() );
-	
-	int l_idx=generator_idx(o_idx,i_idx);
-	_generators[l_idx] = master_engine;
-	Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
-    });
-#else
    // Everybody loops over global volume.
    thread_for( gidx, _grid->_gsites, {
-
 	// Where is it?
 	int rank;
 	int o_idx;
@@ -472,7 +444,6 @@ public:
 	  Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
 	}
    });
-#endif
 #else 
    ////////////////////////////////////////////////////////////////
    // Machine and thread decomposition dependent seeding is efficient
--- a/Grid/lattice/Lattice_trace.h
+++ b/Grid/lattice/Lattice_trace.h
@@ -66,65 +66,6 @@ inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<
  return ret;
 };

-template<int N, class Vec>
-Lattice<iScalar<iScalar<iScalar<Vec> > > > Determinant(const Lattice<iScalar<iScalar<iMatrix<Vec, N> > > > &Umu)
-{
-  GridBase *grid=Umu.Grid();
-  auto lvol = grid->lSites();
-  Lattice<iScalar<iScalar<iScalar<Vec> > > > ret(grid);
-  typedef typename Vec::scalar_type scalar;
-  autoView(Umu_v,Umu,CpuRead);
-  autoView(ret_v,ret,CpuWrite);
-  thread_for(site,lvol,{
-    Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N);
-    Coordinate lcoor;
-    grid->LocalIndexToLocalCoor(site, lcoor);
-    iScalar<iScalar<iMatrix<scalar, N> > > Us;
-    peekLocalSite(Us, Umu_v, lcoor);
-    for(int i=0;i<N;i++){
-      for(int j=0;j<N;j++){
-	scalar tmp= Us()()(i,j);
-	ComplexD ztmp(real(tmp),imag(tmp));
-	EigenU(i,j)=ztmp;
-      }}
-    ComplexD detD  = EigenU.determinant();
-    typename Vec::scalar_type det(detD.real(),detD.imag());
-    pokeLocalSite(det,ret_v,lcoor);
-  });
-  return ret;
-}
-
-template<int N>
-Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > Inverse(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
-{
-  GridBase *grid=Umu.Grid();
-  auto lvol = grid->lSites();
-  Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > ret(grid);
-  
-  autoView(Umu_v,Umu,CpuRead);
-  autoView(ret_v,ret,CpuWrite);
-  thread_for(site,lvol,{
-    Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N);
-    Coordinate lcoor;
-    grid->LocalIndexToLocalCoor(site, lcoor);
-    iScalar<iScalar<iMatrix<ComplexD, N> > > Us;
-    iScalar<iScalar<iMatrix<ComplexD, N> > > Ui;
-    peekLocalSite(Us, Umu_v, lcoor);
-    for(int i=0;i<N;i++){
-      for(int j=0;j<N;j++){
-	EigenU(i,j) = Us()()(i,j);
-      }}
-    Eigen::MatrixXcd EigenUinv = EigenU.inverse();
-    for(int i=0;i<N;i++){
-      for(int j=0;j<N;j++){
-	Ui()()(i,j) = EigenUinv(i,j);
-      }}
-    pokeLocalSite(Ui,ret_v,lcoor);
-  });
-  return ret;
-}
-
-
 NAMESPACE_END(Grid);
 #endif

--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -85,76 +85,6 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
  });
 }

-template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0)
-{
-  half.Checkerboard() = cb;
-  autoView(half_v, half, AcceleratorWrite);
-  autoView(full_v, full, AcceleratorRead);
-  Coordinate rdim_full             = full.Grid()->_rdimensions;
-  Coordinate rdim_half             = half.Grid()->_rdimensions;
-  unsigned long ndim_half          = half.Grid()->_ndimension;
-  Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
-  Coordinate ostride_half          = half.Grid()->_ostride;
-  accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
-    
-    Coordinate coor;
-    int cbos;
-    int linear=0;
-
-    Lexicographic::CoorFromIndex(coor,ss,rdim_full);
-    assert(coor.size()==ndim_half);
-
-    for(int d=0;d<ndim_half;d++){ 
-      if(checker_dim_mask_half[d]) linear += coor[d];
-    }
-    cbos = (linear&0x1);
-
-    if (cbos==cb) {
-      int ssh=0;
-      for(int d=0;d<ndim_half;d++) {
-        if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
-        else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
-      }
-      coalescedWrite(half_v[ssh],full_v(ss));
-    }
-  });
-}
-template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0)
-{
-  int cb = half.Checkerboard();
-  autoView(half_v , half, AcceleratorRead);
-  autoView(full_v , full, AcceleratorWrite);
-  Coordinate rdim_full             = full.Grid()->_rdimensions;
-  Coordinate rdim_half             = half.Grid()->_rdimensions;
-  unsigned long ndim_half          = half.Grid()->_ndimension;
-  Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
-  Coordinate ostride_half          = half.Grid()->_ostride;
-  accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
-
-    Coordinate coor;
-    int cbos;
-    int linear=0;
-  
-    Lexicographic::CoorFromIndex(coor,ss,rdim_full);
-    assert(coor.size()==ndim_half);
-
-    for(int d=0;d<ndim_half;d++){ 
-      if(checker_dim_mask_half[d]) linear += coor[d];
-    }
-    cbos = (linear&0x1);
-
-    if (cbos==cb) {
-      int ssh=0;
-      for(int d=0;d<ndim_half;d++){
-        if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
-        else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
-      }
-      coalescedWrite(full_v[ss],half_v(ssh));
-    }
-
-  });
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Flexible Type Conversion for internal promotion to double as well as graceful
 // treatment of scalar-compatible types
@@ -194,11 +124,11 @@ accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
 #endif

 accelerator_inline void convertType(vComplexF & out, const vComplexD2 & in) {
-  precisionChange(out,in);
+  out.v = Optimization::PrecisionChange::DtoS(in._internal[0].v,in._internal[1].v);
 }

 accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) {
-  precisionChange(out,in);
+  Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v);
 }

 template<typename T1,typename T2>
@@ -265,8 +195,8 @@ inline auto localInnerProductD(const Lattice<vobj> &lhs,const Lattice<vobj> &rhs
 ////////////////////////////////////////////////////////////////////////////////////////////
 template<class vobj,class CComplex,int nbasis,class VLattice>
 inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
-			 const             Lattice<vobj>   &fineData,
-			 const VLattice &Basis)
+			   const             Lattice<vobj>   &fineData,
+			   const VLattice &Basis)
 {
  GridBase * fine  = fineData.Grid();
  GridBase * coarse= coarseData.Grid();
@@ -276,96 +206,20 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,

  autoView( coarseData_ , coarseData, AcceleratorWrite);
  autoView( ip_         , ip,         AcceleratorWrite);
-  RealD t_IP=0;
-  RealD t_co=0;
-  RealD t_za=0;
  for(int v=0;v<nbasis;v++) {
-    t_IP-=usecond();
    blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine>
-    t_IP+=usecond();
-    t_co-=usecond();
    accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
 	convertType(coarseData_[sc](v),ip_[sc]);
    });
-    t_co+=usecond();

    // improve numerical stability of projection
    // |fine> = |fine> - <basis|fine> |basis>
    ip=-ip;
-    t_za-=usecond();
    blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed); 
-    t_za+=usecond();
-  }
-  //  std::cout << GridLogPerformance << " blockProject : blockInnerProduct :  "<<t_IP<<" us"<<std::endl;
-  //  std::cout << GridLogPerformance << " blockProject : conv              :  "<<t_co<<" us"<<std::endl;
-  //  std::cout << GridLogPerformance << " blockProject : blockZaxpy        :  "<<t_za<<" us"<<std::endl;
-}
-
-
-template<class vobj,class CComplex,int nbasis,class VLattice>
-inline void blockProjectFast(Lattice<iVector<CComplex,nbasis > > &coarseData,
-			     const             Lattice<vobj>   &fineData,
-			     const VLattice &Basis)
-{
-  GridBase * fine  = fineData.Grid();
-  GridBase * coarse= coarseData.Grid();
-
-  Lattice<iScalar<CComplex>> ip(coarse);
-  Lattice<vobj>     fineDataRed = fineData;
-
-  autoView( coarseData_ , coarseData, AcceleratorWrite);
-  autoView( ip_         , ip,         AcceleratorWrite);
-  RealD t_IP=0;
-  RealD t_co=0;
-  for(int v=0;v<nbasis;v++) {
-    t_IP-=usecond();
-    blockInnerProductD(ip,Basis[v],fineData); // ip = <basis|fine>
-    t_IP+=usecond();
-    t_co-=usecond();
-    accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
-	convertType(coarseData_[sc](v),ip_[sc]);
-    });
-    t_co+=usecond();
-  }
-  //  std::cout << GridLogPerformance << " blockProjectFast : blockInnerProduct :  "<<t_IP<<" us"<<std::endl;
-  //  std::cout << GridLogPerformance << " blockProjectFast : conv              :  "<<t_co<<" us"<<std::endl;
-}
-
-
-// This only minimises data motion from CPU to GPU
-// there is chance of better implementation that does a vxk loop of inner products to data share
-// at the GPU thread level
-template<class vobj,class CComplex,int nbasis,class VLattice>
-inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
-                               const std::vector<Lattice<vobj>> &fineData,
-                               const VLattice &Basis)
-{
-  int NBatch = fineData.size();
-  assert(coarseData.size() == NBatch);
-
-  GridBase * fine  = fineData[0].Grid();
-  GridBase * coarse= coarseData[0].Grid();
-
-  Lattice<iScalar<CComplex>> ip(coarse);
-  std::vector<Lattice<vobj>> fineDataCopy = fineData;
-
-  autoView(ip_, ip, AcceleratorWrite);
-  for(int v=0;v<nbasis;v++) {
-    for (int k=0; k<NBatch; k++) {
-      autoView( coarseData_ , coarseData[k], AcceleratorWrite);
-      blockInnerProductD(ip,Basis[v],fineDataCopy[k]); // ip = <basis|fine>
-      accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
-        convertType(coarseData_[sc](v),ip_[sc]);
-      });
-
-      // improve numerical stability of projection
-      // |fine> = |fine> - <basis|fine> |basis>
-      ip=-ip;
-      blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]); 
-    }
  }
 }

+
 template<class vobj,class vobj2,class CComplex>
  inline void blockZAXPY(Lattice<vobj> &fineZ,
 			 const Lattice<CComplex> &coarseA,
@@ -440,15 +294,8 @@ template<class vobj,class CComplex>
  Lattice<dotp> coarse_inner(coarse);

  // Precision promotion
-  RealD t;
-  t=-usecond();
  fine_inner = localInnerProductD<vobj>(fineX,fineY);
-  //  t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : localInnerProductD "<<t<<" us"<<std::endl;
-  
-  t=-usecond();
  blockSum(coarse_inner,fine_inner);
-  //  t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : blockSum "<<t<<" us"<<std::endl;
-  t=-usecond();
  {
    autoView( CoarseInner_  , CoarseInner,AcceleratorWrite);
    autoView( coarse_inner_ , coarse_inner,AcceleratorRead);
@@ -456,7 +303,6 @@ template<class vobj,class CComplex>
      convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
    });
  }
-  //  t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : convertType "<<t<<" us"<<std::endl;
 
 }

@@ -499,9 +345,6 @@ inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
 template<class vobj>
 inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData) 
 {
-  const int maxsubsec=256;
-  typedef iVector<vobj,maxsubsec> vSubsec;
-
  GridBase * fine  = fineData.Grid();
  GridBase * coarse= coarseData.Grid();

@@ -521,62 +364,37 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
  autoView( coarseData_ , coarseData, AcceleratorWrite);
  autoView( fineData_   , fineData, AcceleratorRead);

-  auto coarseData_p  = &coarseData_[0];
-  auto fineData_p    = &fineData_[0];
+  auto coarseData_p = &coarseData_[0];
+  auto fineData_p = &fineData_[0];
  
  Coordinate fine_rdimensions = fine->_rdimensions;
  Coordinate coarse_rdimensions = coarse->_rdimensions;

  vobj zz = Zero();
-
-  // Somewhat lazy calculation
-  // Find the biggest power of two subsection divisor less than or equal to maxsubsec
-  int subsec=maxsubsec;
-  int subvol;
-  subvol=blockVol/subsec;
-  while(subvol*subsec!=blockVol){
-    subsec = subsec/2;
-    subvol=blockVol/subsec;
-  };
-
-  Lattice<vSubsec> coarseTmp(coarse);
-  autoView( coarseTmp_, coarseTmp, AcceleratorWriteDiscard);
-  auto coarseTmp_p= &coarseTmp_[0];
  
-  // Sum within subsecs in a first kernel
-  accelerator_for(sce,subsec*coarse->oSites(),vobj::Nsimd(),{
+  accelerator_for(sc,coarse->oSites(),1,{

-      int sc=sce/subsec;
-      int e=sce%subsec;
-      
      // One thread per sub block
      Coordinate coor_c(_ndimension);
      Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions);  // Block coordinate

-      auto cd = coalescedRead(zz);
-      for(int sb=e*subvol;sb<MIN((e+1)*subvol,blockVol);sb++){
+      vobj cd = zz;
+      
+      for(int sb=0;sb<blockVol;sb++){
+
 	int sf;
 	Coordinate coor_b(_ndimension);
 	Coordinate coor_f(_ndimension);
 	Lexicographic::CoorFromIndex(coor_b,sb,block_r);               // Block sub coordinate
 	for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
 	Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
-	
-	cd=cd+coalescedRead(fineData_p[sf]);
+
+	cd=cd+fineData_p[sf];
      }

-      coalescedWrite(coarseTmp_[sc](e),cd);
+      coarseData_p[sc] = cd;

    });
-   // Sum across subsecs in a second kernel
-   accelerator_for(sc,coarse->oSites(),vobj::Nsimd(),{
-      auto cd = coalescedRead(coarseTmp_p[sc](0));
-      for(int e=1;e<subsec;e++){
-	cd=cd+coalescedRead(coarseTmp_p[sc](e));
-      }
-      coalescedWrite(coarseData_p[sc],cd);
-   });
-
  return;
 }

@@ -633,7 +451,7 @@ inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> >
  blockOrthonormalize(ip,Basis);
 }

-#ifdef GRID_ACCELERATED
+#if 0
 // TODO: CPU optimized version here
 template<class vobj,class CComplex,int nbasis>
 inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
@@ -659,37 +477,26 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
  autoView( fineData_   , fineData, AcceleratorWrite);
  autoView( coarseData_ , coarseData, AcceleratorRead);

-  typedef LatticeView<vobj> Vview;
-  std::vector<Vview> AcceleratorVecViewContainer_h; 
-  for(int v=0;v<nbasis;v++) {
-    AcceleratorVecViewContainer_h.push_back(Basis[v].View(AcceleratorRead));
-  }
-  static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(nbasis); 
-  acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],nbasis *sizeof(Vview));
-  auto Basis_p = &AcceleratorVecViewContainer[0];
  // Loop with a cache friendly loop ordering
-  Coordinate frdimensions=fine->_rdimensions;
-  Coordinate crdimensions=coarse->_rdimensions;
-  accelerator_for(sf,fine->oSites(),vobj::Nsimd(),{
+  accelerator_for(sf,fine->oSites(),1,{
    int sc;
    Coordinate coor_c(_ndimension);
    Coordinate coor_f(_ndimension);

-    Lexicographic::CoorFromIndex(coor_f,sf,frdimensions);
+    Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
-    Lexicographic::IndexFromCoor(coor_c,sc,crdimensions);
+    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);

-    auto sum= coarseData_(sc)(0) *Basis_p[0](sf);
-    for(int i=1;i<nbasis;i++) sum = sum + coarseData_(sc)(i)*Basis_p[i](sf);
-    coalescedWrite(fineData_[sf],sum);
+    for(int i=0;i<nbasis;i++) {
+      /*      auto basis_ = Basis[i],  );*/
+      if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]);
+      else     fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
+    }
  });
-  for(int v=0;v<nbasis;v++) {
-    AcceleratorVecViewContainer_h[v].ViewClose();
-  }
  return;
+  
 }
 #else
-// CPU version
 template<class vobj,class CComplex,int nbasis,class VLattice>
 inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 			 Lattice<vobj>   &fineData,
@@ -713,26 +520,6 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 }
 #endif

-template<class vobj,class CComplex,int nbasis,class VLattice>
-inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
-                               std::vector<Lattice<vobj>> &fineData,
-                               const VLattice &Basis)
-{
-  int NBatch = coarseData.size();
-  assert(fineData.size() == NBatch);
-
-  GridBase * fine   = fineData[0].Grid();
-  GridBase * coarse = coarseData[0].Grid();
-  for (int k=0; k<NBatch; k++)
-    fineData[k]=Zero();
-  for (int i=0;i<nbasis;i++) {
-    for (int k=0; k<NBatch; k++) {
-      Lattice<iScalar<CComplex>> ip = PeekIndex<0>(coarseData[k],i);
-      blockZAXPY(fineData[k],ip,Basis[i],fineData[k]);
-    }
-  }
-}
-
 // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
 // Simd layouts need not match since we use peek/poke Local
 template<class vobj,class vvobj>
@@ -776,9 +563,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;

-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  // the checks should guarantee that the operations are local
-  ////////////////////////////////////////////////////////////////////////////////////////////////
+  static const int words=sizeof(vobj)/sizeof(vector_type);
+
  GridBase *Fg = From.Grid();
  GridBase *Tg = To.Grid();
  assert(!Fg->_isCheckerBoarded);
@@ -792,48 +578,42 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
  for(int d=0;d<nd;d++){
    assert(Fg->_processors[d]  == Tg->_processors[d]);
  }
-  size_t nsite = 1;
-  for(int i=0;i<nd;i++) nsite *= RegionSize[i];

-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  // do the index calc on the GPU
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  Coordinate f_ostride = Fg->_ostride;
-  Coordinate f_istride = Fg->_istride;
-  Coordinate f_rdimensions = Fg->_rdimensions;
-  Coordinate t_ostride = Tg->_ostride;
-  Coordinate t_istride = Tg->_istride;
-  Coordinate t_rdimensions = Tg->_rdimensions;
+  // the above should guarantee that the operations are local
+  Coordinate ldf = Fg->_ldimensions;
+  Coordinate rdf = Fg->_rdimensions;
+  Coordinate isf = Fg->_istride;
+  Coordinate osf = Fg->_ostride;
+  Coordinate rdt = Tg->_rdimensions;
+  Coordinate ist = Tg->_istride;
+  Coordinate ost = Tg->_ostride;

-  typedef typename vobj::vector_type vector_type;
-  typedef typename vobj::scalar_type scalar_type;
-
-  autoView(from_v,From,AcceleratorRead);
-  autoView(to_v,To,AcceleratorWrite);
-
-  const int words=sizeof(vobj)/sizeof(vector_type);
-  accelerator_for(idx,nsite,1,{
-      
-      Coordinate from_coor, to_coor, base;
-      Lexicographic::CoorFromIndex(base,idx,RegionSize);
-      for(int i=0;i<nd;i++){
-	from_coor[i] = base[i] + FromLowerLeft[i];
-	to_coor[i] = base[i] + ToLowerLeft[i];
+  autoView( t_v , To, AcceleratorWrite);
+  autoView( f_v , From, AcceleratorRead);
+  accelerator_for(idx,Fg->lSites(),1,{
+    sobj s;
+    Coordinate Fcoor(nd);
+    Coordinate Tcoor(nd);
+    Lexicographic::CoorFromIndex(Fcoor,idx,ldf);
+    int in_region=1;
+    for(int d=0;d<nd;d++){
+      if ( (Fcoor[d] < FromLowerLeft[d]) || (Fcoor[d]>=FromLowerLeft[d]+RegionSize[d]) ){ 
+	in_region=0;
      }
-      int from_oidx = 0; for(int d=0;d<nd;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
-      int from_lane = 0; for(int d=0;d<nd;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
-      int to_oidx   = 0; for(int d=0;d<nd;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
-      int to_lane   = 0; for(int d=0;d<nd;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
-
-      const vector_type* from = (const vector_type *)&from_v[from_oidx];
-      vector_type* to = (vector_type *)&to_v[to_oidx];
-      
-      scalar_type stmp;
+      Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d];
+    }
+    if (in_region) {
+      Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]);
+      Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]);
+      Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]);
+      Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]);
+      scalar_type * fp = (scalar_type *)&f_v[odx_f];
+      scalar_type * tp = (scalar_type *)&t_v[odx_t];
      for(int w=0;w<words;w++){
-	stmp = getlane(from[w], from_lane);
-	putlane(to[w], stmp, to_lane);
+	tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd];  // FIXME IF RRII layout, type pun no worke
      }
-    });
+    }
+  });
 }


@@ -925,9 +705,7 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic

 }

-//FIXME: make this run entirely on GPU
-//Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim
-//The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same
+
 template<class vobj>
 void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
 {
@@ -944,70 +722,11 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int

  for(int d=0;d<nh;d++){
    if ( d!=orthog ) {
-      assert(lg->_processors[d]  == hg->_processors[d]);
-      assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
-    }
+    assert(lg->_processors[d]  == hg->_processors[d]);
+    assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
+  }
  }

-#if 1
-  size_t nsite = lg->lSites()/lg->LocalDimensions()[orthog];
-  size_t tbytes = 4*nsite*sizeof(int);
-  int *table = (int*)malloc(tbytes);
-  
-  thread_for(idx,nsite,{
-    Coordinate lcoor(nl);
-    Coordinate hcoor(nh);
-    lcoor[orthog] = slice_lo;
-    hcoor[orthog] = slice_hi;
-    size_t rem = idx;
-    for(int mu=0;mu<nl;mu++){
-      if(mu != orthog){
-	int xmu = rem % lg->LocalDimensions()[mu];  rem /= lg->LocalDimensions()[mu];
-	lcoor[mu] = hcoor[mu] = xmu;
-      }
-    }
-    int loidx = lg->oIndex(lcoor);
-    int liidx = lg->iIndex(lcoor);
-    int hoidx = hg->oIndex(hcoor);
-    int hiidx = hg->iIndex(hcoor);
-    int* tt = table + 4*idx;
-    tt[0] = loidx;
-    tt[1] = liidx;
-    tt[2] = hoidx;
-    tt[3] = hiidx;
-    });
-   
-  int* table_d = (int*)acceleratorAllocDevice(tbytes);
-  acceleratorCopyToDevice(table,table_d,tbytes);
-
-  typedef typename vobj::vector_type vector_type;
-  typedef typename vobj::scalar_type scalar_type;
-
-  autoView(lowDim_v,lowDim,AcceleratorRead);
-  autoView(higherDim_v,higherDim,AcceleratorWrite);
-  
-  accelerator_for(idx,nsite,1,{
-      static const int words=sizeof(vobj)/sizeof(vector_type);
-      int* tt = table_d + 4*idx;
-      int from_oidx = *tt++;
-      int from_lane = *tt++;
-      int to_oidx = *tt++;
-      int to_lane = *tt;
-
-      const vector_type* from = (const vector_type *)&lowDim_v[from_oidx];
-      vector_type* to = (vector_type *)&higherDim_v[to_oidx];
-      
-      scalar_type stmp;
-      for(int w=0;w<words;w++){
-	stmp = getlane(from[w], from_lane);
-	putlane(to[w], stmp, to_lane);
-      }
-    });
-  
-  acceleratorFreeDevice(table_d);    
-  free(table);
-  
-#else
  // the above should guarantee that the operations are local
  autoView(lowDimv,lowDim,CpuRead);
  autoView(higherDimv,higherDim,CpuWrite);
@@ -1023,7 +742,6 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
      pokeLocalSite(s,higherDimv,hcoor);
    }
  });
-#endif
 }


@@ -1088,7 +806,7 @@ void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)

  Coordinate fcoor(nd);
  Coordinate ccoor(nd);
-  for(int64_t g=0;g<fg->gSites();g++){
+  for(int g=0;g<fg->gSites();g++){

    fg->GlobalIndexToGlobalCoor(g,fcoor);
    for(int d=0;d<nd;d++){
@@ -1292,80 +1010,11 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
  });
 }

-//Very fast precision change. Requires in/out objects to reside on same Grid (e.g. by using double2 for the double-precision field)
-template<class VobjOut, class VobjIn>
-void precisionChangeFast(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
-{
-  typedef typename VobjOut::vector_type Vout;
-  typedef typename VobjIn::vector_type Vin;
-  const int N = sizeof(VobjOut)/sizeof(Vout);
-  conformable(out.Grid(),in.Grid());
-  out.Checkerboard() = in.Checkerboard();
-  int nsimd = out.Grid()->Nsimd();
-  autoView( out_v  , out, AcceleratorWrite);
-  autoView(  in_v ,   in, AcceleratorRead);
-  accelerator_for(idx,out.Grid()->oSites(),1,{
-      Vout *vout = (Vout *)&out_v[idx];
-      Vin  *vin  = (Vin  *)&in_v[idx];
-      precisionChange(vout,vin,N);
-  });
-}
-//Convert a Lattice from one precision to another (original, slow implementation)
-template<class VobjOut, class VobjIn>
-void precisionChangeOrig(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
-{
-  assert(out.Grid()->Nd() == in.Grid()->Nd());
-  for(int d=0;d<out.Grid()->Nd();d++){
-    assert(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
-  }
-  out.Checkerboard() = in.Checkerboard();
-  GridBase *in_grid=in.Grid();
-  GridBase *out_grid = out.Grid();
-
-  typedef typename VobjOut::scalar_object SobjOut;
-  typedef typename VobjIn::scalar_object SobjIn;
-
-  int ndim = out.Grid()->Nd();
-  int out_nsimd = out_grid->Nsimd();
-  int in_nsimd = in_grid->Nsimd();
-  std::vector<Coordinate > out_icoor(out_nsimd);
-      
-  for(int lane=0; lane < out_nsimd; lane++){
-    out_icoor[lane].resize(ndim);
-    out_grid->iCoorFromIindex(out_icoor[lane], lane);
-  }
-        
-  std::vector<SobjOut> in_slex_conv(in_grid->lSites());
-  unvectorizeToLexOrdArray(in_slex_conv, in);
-    
-  autoView( out_v , out, CpuWrite);
-  thread_for(out_oidx,out_grid->oSites(),{
-    Coordinate out_ocoor(ndim);
-    out_grid->oCoorFromOindex(out_ocoor, out_oidx);
-
-    ExtractPointerArray<SobjOut> ptrs(out_nsimd);      
-
-    Coordinate lcoor(out_grid->Nd());
-      
-    for(int lane=0; lane < out_nsimd; lane++){
-      for(int mu=0;mu<ndim;mu++)
-	lcoor[mu] = out_ocoor[mu] + out_grid->_rdimensions[mu]*out_icoor[lane][mu];
-	
-      int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions);
-      ptrs[lane] = &in_slex_conv[llex];
-    }
-    merge(out_v[out_oidx], ptrs, 0);
-  });
-}
-
 //The workspace for a precision change operation allowing for the reuse of the mapping to save time on subsequent calls
 class precisionChangeWorkspace{
  std::pair<Integer,Integer>* fmap_device; //device pointer
-  //maintain grids for checking
-  GridBase* _out_grid;
-  GridBase* _in_grid;
 public:
-  precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid): _out_grid(out_grid), _in_grid(in_grid){
+  precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid){
    //Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
    assert(out_grid->Nd() == in_grid->Nd());
    for(int d=0;d<out_grid->Nd();d++){
@@ -1412,46 +1061,20 @@ public:
  
  std::pair<Integer,Integer> const* getMap() const{ return fmap_device; }

-  void checkGrids(GridBase* out, GridBase* in) const{
-    conformable(out, _out_grid);
-    conformable(in, _in_grid);
-  }
-  
  ~precisionChangeWorkspace(){
    acceleratorFreeDevice(fmap_device);
  }
 };


-//We would like to use precisionChangeFast when possible. However usage of this requires the Grids to be the same (runtime check)
-//*and* the precisionChange(VobjOut::vector_type, VobjIn, int) function to be defined for the types; this requires an extra compile-time check which we do using some SFINAE trickery
-template<class VobjOut, class VobjIn>
-auto _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, int dummy)->decltype( precisionChange( ((typename VobjOut::vector_type*)0), ((typename VobjIn::vector_type*)0), 1), int()){
-  if(out.Grid() == in.Grid()){
-    precisionChangeFast(out,in);
-    return 1;
-  }else{
-    return 0;
-  }
-}
-template<class VobjOut, class VobjIn>
-int _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, long dummy){ //note long here is intentional; it means the above is preferred if available
-  return 0;
-}
-
-
-//Convert a lattice of one precision to another. Much faster than original implementation but requires a pregenerated workspace
-//which contains the mapping data.
+//Convert a lattice of one precision to another. The input workspace contains the mapping data.
 template<class VobjOut, class VobjIn>
 void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
-  if(_precisionChangeFastWrap(out,in,0)) return;
-  
-  static_assert( std::is_same<typename VobjOut::scalar_typeD, typename VobjIn::scalar_typeD>::value == 1, "precisionChange: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
+  static_assert( std::is_same<typename VobjOut::DoublePrecision, typename VobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same

  out.Checkerboard() = in.Checkerboard();
  constexpr int Nsimd_out = VobjOut::Nsimd();

-  workspace.checkGrids(out.Grid(),in.Grid());
  std::pair<Integer,Integer> const* fmap_device = workspace.getMap();

  //Do the copy/precision change
@@ -1468,18 +1091,15 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const pre
    });
 }

-//Convert a Lattice from one precision to another. Much faster than original implementation but slower than precisionChangeFast
-//or precisionChange called with pregenerated workspace, as it needs to internally generate the workspace on the host and copy to device
+//Convert a Lattice from one precision to another
+//Generate the workspace in place; if multiple calls with the same mapping are performed, consider pregenerating the workspace and reusing
 template<class VobjOut, class VobjIn>
 void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
-  if(_precisionChangeFastWrap(out,in,0)) return;   
  precisionChangeWorkspace workspace(out.Grid(), in.Grid());
  precisionChange(out, in, workspace);
 }


-
-
 ////////////////////////////////////////////////////////////////////////////////
 // Communicate between grids
 ////////////////////////////////////////////////////////////////////////////////
@@ -1774,32 +1394,5 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj>   & split)
  }
 }

-//////////////////////////////////////////////////////
-// MultiRHS interface support for coarse space
-// -- Simplest possible implementation to begin with
-//////////////////////////////////////////////////////
-template<class vobj,class CComplex,int nbasis,class VLattice>
-inline void blockProjectMany(Lattice<iVector<CComplex,nbasis > > &coarseIP,
-			     Lattice<iVector<CComplex,nbasis > > &coarseTMP,
-			     const VLattice &fineData, // Basis and fineData necessarily same type
-			     const VLattice &Basis)
-{
-  for(int r=0;r<fineData.size();r++){
-    blockProject(coarseTMP,fineData[r],Basis);
-    InsertSliceLocal(coarseTMP, coarseIP,r,r,0);
-  }
-}
-template<class vobj,class CComplex,int nbasis,class VLattice>
-inline void blockPromoteMany(Lattice<iVector<CComplex,nbasis > > &coarseIP,
-			     Lattice<iVector<CComplex,nbasis > > &coarseTMP,
-			     const VLattice &fineData, // Basis and fineData necessarily same type
-			     const VLattice &Basis)
-{
-  for(int r=0;r<fineData.size();r++){
-    ExtractSliceLocal(coarseTMP, coarseIP,r,r,0);
-    blockPromote(coarseTMP,fineData[r],Basis);
-  }
-}
-
 NAMESPACE_END(Grid);

--- a/Grid/lattice/PaddedCell.h
+++ b/Grid/lattice/PaddedCell.h
@@ -1,571 +0,0 @@
-/*************************************************************************************
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/lattice/PaddedCell.h
-
-    Copyright (C) 2019
-
-Author: Peter Boyle pboyle@bnl.gov
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#pragma once
-
-#include<Grid/cshift/Cshift.h>
-
-NAMESPACE_BEGIN(Grid);
-
-//Allow the user to specify how the C-shift is performed, e.g. to respect the appropriate boundary conditions
-template<typename vobj>
-struct CshiftImplBase{
-  virtual Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const = 0;
-  virtual ~CshiftImplBase(){}
-};
-template<typename vobj>
-struct CshiftImplDefault: public CshiftImplBase<vobj>{
-  Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const override{ return Grid::Cshift(in,dir,shift); }
-};
-template<typename Gimpl>
-struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::vector_object>{
-  typename Gimpl::GaugeLinkField Cshift(const typename Gimpl::GaugeLinkField &in, int dir, int shift) const override{ return Gimpl::CshiftLink(in,dir,shift); }
-};  
-
-
-/*
- *
- * TODO: 
- *  -- address elementsof vobj via thread block in Scatter/Gather
- *  -- overlap comms with motion in Face_exchange
- *
- */
-
-template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
-					      Lattice<vobj> &lat,
-					      int x,
-					      int dim,
-					      int offset=0)
-{
-  const int Nsimd=vobj::Nsimd();
-  typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-
-  GridBase *grid = lat.Grid();
-  Coordinate simd = grid->_simd_layout;
-  int Nd          = grid->Nd();
-  int block       = grid->_slice_block[dim];
-  int stride      = grid->_slice_stride[dim];
-  int nblock      = grid->_slice_nblock[dim];
-  int rd          = grid->_rdimensions[dim];
-
-  int ox = x%rd;
-  int ix = x/rd;
-
-  int isites = 1; for(int d=0;d<Nd;d++) if( d!=dim) isites*=simd[d];
-
-  Coordinate rsimd= simd;  rsimd[dim]=1; // maybe reduce Nsimd
-
-  int rNsimd = 1; for(int d=0;d<Nd;d++) rNsimd*=rsimd[d];
-  int rNsimda= Nsimd/simd[dim]; // should be equal
-  assert(rNsimda==rNsimd);
-  int face_ovol=block*nblock;
-
-  //  assert(buf.size()==face_ovol*rNsimd);
-
-  /*This will work GPU ONLY unless rNsimd is put in the lexico index*/
-  //Let's make it work on GPU and then make a special accelerator_for that
-  //doesn't hide the SIMD direction and keeps explicit in the threadIdx
-  //for cross platform
-  // FIXME -- can put internal indices into thread loop
-  auto buf_p = & buf[0];
-  autoView(lat_v, lat, AcceleratorWrite);
-  accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
-
-    // scalar layout won't coalesce
-#ifdef GRID_SIMT
-      {
-	int blane=acceleratorSIMTlane(Nsimd); // buffer lane
-#else
-      for(int blane=0;blane<Nsimd;blane++) {
-#endif
-	int olane=blane%rNsimd;               // reduced lattice lane
-	int obit =blane/rNsimd;
-
-	///////////////////////////////////////////////////////////////
-	// osite -- potentially one bit from simd in the buffer: (ss<<1)|obit
-	///////////////////////////////////////////////////////////////
-	int ssp = ss*simd[dim]+obit;
-	int b    = ssp%block;
-	int n    = ssp/block;
-	int osite= b+n*stride + ox*block;
-	
-	////////////////////////////////////////////
-	// isite -- map lane within buffer to lane within lattice
-	////////////////////////////////////////////
-	Coordinate icoor;
-	int lane;
-	Lexicographic::CoorFromIndex(icoor,olane,rsimd);
-	icoor[dim]=ix;
-	Lexicographic::IndexFromCoor(icoor,lane,simd);
-	
-	///////////////////////////////////////////
-	// Transfer into lattice - will coalesce
-	///////////////////////////////////////////
-	//	sobj obj = extractLane(blane,buf_p[ss+offset]);
-	//	insertLane(lane,lat_v[osite],obj);
-	const int words=sizeof(vobj)/sizeof(vector_type);
-	vector_type * from = (vector_type *)&buf_p[ss+offset];
-	vector_type * to   = (vector_type *)&lat_v[osite];
-	scalar_type stmp;
-	for(int w=0;w<words;w++){
-	  stmp = getlane(from[w], blane);
-	  putlane(to[w], stmp, lane);
-	}
-      }
-  });
-}
-
-template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
-					     const Lattice<vobj> &lat,
-					     int x,
-					     int dim,
-					     int offset=0)
-{
-  const int Nsimd=vobj::Nsimd();
-  typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-
-  autoView(lat_v, lat, AcceleratorRead);
-
-  GridBase *grid = lat.Grid();
-  Coordinate simd = grid->_simd_layout;
-  int Nd          = grid->Nd();
-  int block       = grid->_slice_block[dim];
-  int stride      = grid->_slice_stride[dim];
-  int nblock      = grid->_slice_nblock[dim];
-  int rd          = grid->_rdimensions[dim];
-
-  int ox = x%rd;
-  int ix = x/rd;
-
-  int isites = 1; for(int d=0;d<Nd;d++) if( d!=dim) isites*=simd[d];
-
-  Coordinate rsimd= simd;  rsimd[dim]=1; // maybe reduce Nsimd
-
-  int rNsimd = 1; for(int d=0;d<Nd;d++) rNsimd*=rsimd[d];
-  
-  int face_ovol=block*nblock;
-
-  //  assert(buf.size()==face_ovol*rNsimd);
-
-  /*This will work GPU ONLY unless rNsimd is put in the lexico index*/
-  //Let's make it work on GPU and then make a special accelerator_for that
-  //doesn't hide the SIMD direction and keeps explicit in the threadIdx
-  //for cross platform
-  //For CPU perhaps just run a loop over Nsimd
-  auto buf_p = & buf[0];
-  accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
-
-    // scalar layout won't coalesce
-#ifdef GRID_SIMT
-      {
-	int blane=acceleratorSIMTlane(Nsimd); // buffer lane
-#else
-      for(int blane=0;blane<Nsimd;blane++) {
-#endif
-	int olane=blane%rNsimd;               // reduced lattice lane
-	int obit =blane/rNsimd;
-	
-	////////////////////////////////////////////
-	// osite
-	////////////////////////////////////////////
-	int ssp = ss*simd[dim]+obit;
-	int b    = ssp%block;
-	int n    = ssp/block;
-	int osite= b+n*stride + ox*block;
-
-	////////////////////////////////////////////
-	// isite -- map lane within buffer to lane within lattice
-	////////////////////////////////////////////
-	Coordinate icoor;
-	int lane;
-	Lexicographic::CoorFromIndex(icoor,olane,rsimd);
-	icoor[dim]=ix;
-	Lexicographic::IndexFromCoor(icoor,lane,simd);
-	
-	///////////////////////////////////////////
-	// Take out of lattice
-	///////////////////////////////////////////
-	//	sobj obj = extractLane(lane,lat_v[osite]);
-	//	insertLane(blane,buf_p[ss+offset],obj);
-	const int words=sizeof(vobj)/sizeof(vector_type);
-	vector_type * to    = (vector_type *)&buf_p[ss+offset];
-	vector_type * from  = (vector_type *)&lat_v[osite];
-	scalar_type stmp;
-	for(int w=0;w<words;w++){
-	  stmp = getlane(from[w], lane);
-	  putlane(to[w], stmp, blane);
-	}
-      }
-  });
-}
-
-
-class PaddedCell {
-public:
-  GridCartesian * unpadded_grid;
-  int dims;
-  int depth;
-  std::vector<GridCartesian *> grids;
-
-  ~PaddedCell()
-  {
-    DeleteGrids();
-  }
-  PaddedCell(int _depth,GridCartesian *_grid)
-  {
-    unpadded_grid = _grid;
-    depth=_depth;
-    dims=_grid->Nd();
-    AllocateGrids();
-    Coordinate local     =unpadded_grid->LocalDimensions();
-    Coordinate procs     =unpadded_grid->ProcessorGrid();
-    for(int d=0;d<dims;d++){
-      if ( procs[d] > 1 ) assert(local[d]>=depth);
-    }
-  }
-  void DeleteGrids(void)
-  {
-    Coordinate processors=unpadded_grid->_processors;
-    for(int d=0;d<grids.size();d++){
-      if ( processors[d] > 1 ) { 
-	delete grids[d];
-      }
-    }
-    grids.resize(0);
-  };
-  void AllocateGrids(void)
-  {
-    Coordinate local     =unpadded_grid->LocalDimensions();
-    Coordinate simd      =unpadded_grid->_simd_layout;
-    Coordinate processors=unpadded_grid->_processors;
-    Coordinate plocal    =unpadded_grid->LocalDimensions();
-    Coordinate global(dims);
-    GridCartesian *old_grid = unpadded_grid;
-    // expand up one dim at a time
-    for(int d=0;d<dims;d++){
-
-      if ( processors[d] > 1 ) { 
-	plocal[d] += 2*depth; 
-      
-	for(int d=0;d<dims;d++){
-	  global[d] = plocal[d]*processors[d];
-	}
-
-	old_grid = new GridCartesian(global,simd,processors);
-      }
-      grids.push_back(old_grid);
-    }
-  };
-  template<class vobj>
-  inline Lattice<vobj> Extract(const Lattice<vobj> &in) const
-  {
-    Coordinate processors=unpadded_grid->_processors;
-
-    Lattice<vobj> out(unpadded_grid);
-
-    Coordinate local     =unpadded_grid->LocalDimensions();
-    // depends on the MPI spread      
-    Coordinate fll(dims,depth);
-    Coordinate tll(dims,0); // depends on the MPI spread
-    for(int d=0;d<dims;d++){
-      if( processors[d]==1 ) fll[d]=0;
-    }
-    localCopyRegion(in,out,fll,tll,local);
-    return out;
-  }
-  template<class vobj>
-  inline Lattice<vobj> Exchange(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
-  {
-    GridBase *old_grid = in.Grid();
-    int dims = old_grid->Nd();
-    Lattice<vobj> tmp = in;
-    for(int d=0;d<dims;d++){
-      tmp = Expand(d,tmp,cshift); // rvalue && assignment
-    }
-    return tmp;
-  }
-  template<class vobj>
-  inline Lattice<vobj> ExchangePeriodic(const Lattice<vobj> &in) const
-  {
-    GridBase *old_grid = in.Grid();
-    int dims = old_grid->Nd();
-    Lattice<vobj> tmp = in;
-    for(int d=0;d<dims;d++){
-      tmp = ExpandPeriodic(d,tmp); // rvalue && assignment
-    }
-    return tmp;
-  }
-  // expand up one dim at a time
-  template<class vobj>
-  inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
-  {
-    Coordinate processors=unpadded_grid->_processors;
-    GridBase *old_grid = in.Grid();
-    GridCartesian *new_grid = grids[dim];//These are new grids
-    Lattice<vobj>  padded(new_grid);
-    Lattice<vobj> shifted(old_grid);    
-    Coordinate local     =old_grid->LocalDimensions();
-    Coordinate plocal    =new_grid->LocalDimensions();
-    if(dim==0) conformable(old_grid,unpadded_grid);
-    else       conformable(old_grid,grids[dim-1]);
-
-    double tins=0, tshift=0;
-
-    int islocal = 0 ;
-    if ( processors[dim] == 1 ) islocal = 1;
-
-    if ( islocal ) {
-
-      // replace with a copy and maybe grid swizzle
-      // return in;??
-      double t = usecond();
-      padded = in;
-      tins += usecond() - t;
-      
-    } else {
-
-      //////////////////////////////////////////////
-      // Replace sequence with
-      // ---------------------
-      // (i) Gather high face(s); start comms
-      // (ii) Gather low  face(s); start comms
-      // (iii) Copy middle bit with localCopyRegion
-      // (iv) Complete high face(s), insert slice(s)
-      // (iv) Complete low  face(s), insert slice(s)
-      //////////////////////////////////////////////
-      // Middle bit
-      double t = usecond();
-      for(int x=0;x<local[dim];x++){
-	InsertSliceLocal(in,padded,x,depth+x,dim);
-      }
-      tins += usecond() - t;
-    
-      // High bit
-      t = usecond();
-      shifted = cshift.Cshift(in,dim,depth);
-      tshift += usecond() - t;
-
-      t=usecond();
-      for(int x=0;x<depth;x++){
-	InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
-      }
-      tins += usecond() - t;
-    
-      // Low bit
-      t = usecond();
-      shifted = cshift.Cshift(in,dim,-depth);
-      tshift += usecond() - t;
-    
-      t = usecond();
-      for(int x=0;x<depth;x++){
-	InsertSliceLocal(shifted,padded,x,x,dim);
-      }
-      tins += usecond() - t;
-
-    }
-    std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
-    
-    return padded;
-  }
-
-  template<class vobj>
-  inline Lattice<vobj> ExpandPeriodic(int dim, const Lattice<vobj> &in) const
-  {
-    Coordinate processors=unpadded_grid->_processors;
-    GridBase *old_grid = in.Grid();
-    GridCartesian *new_grid = grids[dim];//These are new grids
-    Lattice<vobj>  padded(new_grid);
-    //    Lattice<vobj> shifted(old_grid);    
-    Coordinate local     =old_grid->LocalDimensions();
-    Coordinate plocal    =new_grid->LocalDimensions();
-    if(dim==0) conformable(old_grid,unpadded_grid);
-    else       conformable(old_grid,grids[dim-1]);
-
-    //    std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
-    double tins=0, tshift=0;
-
-    int islocal = 0 ;
-    if ( processors[dim] == 1 ) islocal = 1;
-
-    if ( islocal ) {
-      padded=in; // slightly different interface could avoid a copy operation
-    } else {
-      Face_exchange(in,padded,dim,depth);
-      return padded;
-    }
-    return padded;
-  }
-  template<class vobj>
-  void Face_exchange(const Lattice<vobj> &from,
-		     Lattice<vobj> &to,
-		     int dimension,int depth) const
-  {
-    typedef typename vobj::vector_type vector_type;
-    typedef typename vobj::scalar_type scalar_type;
-    typedef typename vobj::scalar_object sobj;
-
-    RealD t_gather=0.0;
-    RealD t_scatter=0.0;
-    RealD t_comms=0.0;
-    RealD t_copy=0.0;
-    
-    //    std::cout << GridLogMessage << "dimension " <<dimension<<std::endl;
-    //    DumpSliceNorm(std::string("Face_exchange from"),from,dimension);
-    GridBase *grid=from.Grid();
-    GridBase *new_grid=to.Grid();
-
-    Coordinate lds = from.Grid()->_ldimensions;
-    Coordinate nlds=   to.Grid()->_ldimensions;
-    Coordinate simd= from.Grid()->_simd_layout;
-    int ld    = lds[dimension];
-    int nld   = to.Grid()->_ldimensions[dimension];
-    const int Nsimd = vobj::Nsimd();
-
-    assert(depth<=lds[dimension]); // A must be on neighbouring node
-    assert(depth>0);   // A caller bug if zero
-    assert(ld+2*depth==nld);
-    ////////////////////////////////////////////////////////////////////////////
-    // Face size and byte calculations
-    ////////////////////////////////////////////////////////////////////////////
-    int buffer_size = 1;
-    for(int d=0;d<lds.size();d++){
-      if ( d!= dimension) buffer_size=buffer_size*lds[d];
-    }
-    buffer_size = buffer_size  / Nsimd;
-    int rNsimd = Nsimd / simd[dimension];
-    assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
-
-    static cshiftVector<vobj> send_buf; 
-    static cshiftVector<vobj> recv_buf;
-    send_buf.resize(buffer_size*2*depth);    
-    recv_buf.resize(buffer_size*2*depth);
-
-    std::vector<CommsRequest_t> fwd_req;   
-    std::vector<CommsRequest_t> bwd_req;   
-
-    int words = buffer_size;
-    int bytes = words * sizeof(vobj);
-
-    ////////////////////////////////////////////////////////////////////////////
-    // Communication coords
-    ////////////////////////////////////////////////////////////////////////////
-    int comm_proc = 1;
-    int xmit_to_rank;
-    int recv_from_rank;
-    grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
-
-    ////////////////////////////////////////////////////////////////////////////
-    // Gather all surface terms up to depth "d"
-    ////////////////////////////////////////////////////////////////////////////
-    RealD t;
-    RealD t_tot=-usecond();
-    int plane=0;
-    for ( int d=0;d < depth ; d ++ ) {
-      int tag = d*1024 + dimension*2+0;
-
-      t=usecond();
-      GatherSlice(send_buf,from,d,dimension,plane*buffer_size); plane++;
-      t_gather+=usecond()-t;
-
-      t=usecond();
-      grid->SendToRecvFromBegin(fwd_req,
-				(void *)&send_buf[d*buffer_size], xmit_to_rank,
-				(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
-      t_comms+=usecond()-t;
-     }
-    for ( int d=0;d < depth ; d ++ ) {
-      int tag = d*1024 + dimension*2+1;
-
-      t=usecond();
-      GatherSlice(send_buf,from,ld-depth+d,dimension,plane*buffer_size); plane++;
-      t_gather+= usecond() - t;
-
-      t=usecond();
-      grid->SendToRecvFromBegin(bwd_req,
-				(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
-				(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
-      t_comms+=usecond()-t;
-    }
-
-    ////////////////////////////////////////////////////////////////////////////
-    // Copy interior -- overlap this with comms
-    ////////////////////////////////////////////////////////////////////////////
-    int Nd = new_grid->Nd();
-    Coordinate LL(Nd,0);
-    Coordinate sz = grid->_ldimensions;
-    Coordinate toLL(Nd,0);
-    toLL[dimension]=depth;
-    t=usecond();
-    localCopyRegion(from,to,LL,toLL,sz);
-    t_copy= usecond() - t;
-    
-    ////////////////////////////////////////////////////////////////////////////
-    // Scatter all faces
-    ////////////////////////////////////////////////////////////////////////////
-    plane=0;
-
-    t=usecond();
-    grid->CommsComplete(fwd_req);
-    t_comms+= usecond() - t;
-
-    t=usecond();
-    for ( int d=0;d < depth ; d ++ ) {
-      ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++;
-    }
-    t_scatter= usecond() - t;
-
-    t=usecond();
-    grid->CommsComplete(bwd_req);
-    t_comms+= usecond() - t;
-    
-    t=usecond();
-    for ( int d=0;d < depth ; d ++ ) {
-      ScatterSlice(recv_buf,to,d,dimension,plane*buffer_size); plane++;
-    }
-    t_scatter+= usecond() - t;
-    t_tot+=usecond();
-
-    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000  << "ms"<<std::endl;
-    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << t_scatter/1000   << "ms"<<std::endl;
-    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: copy   :" << t_copy/1000      << "ms"<<std::endl;
-    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms  :" << t_comms/1000     << "ms"<<std::endl;
-    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: total  :" << t_tot/1000     << "ms"<<std::endl;
-    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << depth*4.0*bytes/t_gather << "MB/s"<<std::endl;
-    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << depth*4.0*bytes/t_scatter<< "MB/s"<<std::endl;
-    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms  :" << (RealD)4.0*bytes/t_comms   << "MB/s"<<std::endl;
-    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: face bytes  :" << depth*bytes/1e6 << "MB"<<std::endl;
-  }
-  
-};
- 
-
-NAMESPACE_END(Grid);
-
-
--- a/Grid/log/Log.cc
+++ b/Grid/log/Log.cc
@@ -65,40 +65,32 @@ GridLogger GridLogSolver (1, "Solver", GridLogColours, "NORMAL");
 GridLogger GridLogError  (1, "Error" , GridLogColours, "RED");
 GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
 GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
-GridLogger GridLogMemory (1, "Memory", GridLogColours, "NORMAL");
-GridLogger GridLogTracing(1, "Tracing", GridLogColours, "NORMAL");
 GridLogger GridLogDebug  (1, "Debug", GridLogColours, "PURPLE");
 GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
-GridLogger GridLogDslash     (1, "Dslash", GridLogColours, "BLUE");
 GridLogger GridLogIterative  (1, "Iterative", GridLogColours, "BLUE");
 GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
 GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE");

 void GridLogConfigure(std::vector<std::string> &logstreams) {
-  GridLogError.Active(1);
+  GridLogError.Active(0);
  GridLogWarning.Active(0);
  GridLogMessage.Active(1); // at least the messages should be always on
-  GridLogMemory.Active(0); 
-  GridLogTracing.Active(0); 
  GridLogIterative.Active(0);
  GridLogDebug.Active(0);
  GridLogPerformance.Active(0);
-  GridLogDslash.Active(0);
  GridLogIntegrator.Active(1);
  GridLogColours.Active(0);
  GridLogHMC.Active(1);

  for (int i = 0; i < logstreams.size(); i++) {
-    if (logstreams[i] == std::string("Tracing"))     GridLogTracing.Active(1);
-    if (logstreams[i] == std::string("Memory"))      GridLogMemory.Active(1);
+    if (logstreams[i] == std::string("Error"))       GridLogError.Active(1);
    if (logstreams[i] == std::string("Warning"))     GridLogWarning.Active(1);
    if (logstreams[i] == std::string("NoMessage"))   GridLogMessage.Active(0);
    if (logstreams[i] == std::string("Iterative"))   GridLogIterative.Active(1);
    if (logstreams[i] == std::string("Debug"))       GridLogDebug.Active(1);
    if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
-    if (logstreams[i] == std::string("Dslash"))      GridLogDslash.Active(1);
-    if (logstreams[i] == std::string("NoIntegrator"))GridLogIntegrator.Active(0);
-    if (logstreams[i] == std::string("NoHMC"))       GridLogHMC.Active(0);
+    if (logstreams[i] == std::string("NoIntegrator"))  GridLogIntegrator.Active(0);
+    if (logstreams[i] == std::string("NoHMC"))         GridLogHMC.Active(0);
    if (logstreams[i] == std::string("Colours"))     GridLogColours.Active(1);
  }
 }
--- a/Grid/log/Log.h
+++ b/Grid/log/Log.h
@@ -138,8 +138,7 @@ public:
        stream << std::setw(log.topWidth);
      }
      stream << log.topName << log.background()<< " : ";
-      //      stream << log.colour() <<  std::left;
-      stream <<  std::left;
+      stream << log.colour() <<  std::left;
      if (log.chanWidth > 0)
      {
        stream << std::setw(log.chanWidth);
@@ -154,9 +153,9 @@ public:
 	stream << log.evidence()
 	       << now	       << log.background() << " : " ;
      }
-      //      stream << log.colour();
-      stream <<  std::right;
+      stream << log.colour();
      stream.flags(f);
+
      return stream;
    } else { 
      return devnull;
@@ -181,12 +180,9 @@ extern GridLogger GridLogWarning;
 extern GridLogger GridLogMessage;
 extern GridLogger GridLogDebug  ;
 extern GridLogger GridLogPerformance;
-extern GridLogger GridLogDslash;
 extern GridLogger GridLogIterative  ;
 extern GridLogger GridLogIntegrator  ;
 extern GridLogger GridLogHMC;
-extern GridLogger GridLogMemory;
-extern GridLogger GridLogTracing;
 extern Colours    GridLogColours;

 std::string demangle(const char* name) ;
--- a/Grid/parallelIO/BinaryIO.h
+++ b/Grid/parallelIO/BinaryIO.h
@@ -165,7 +165,7 @@ class BinaryIO {
 	 * FIXME -- 128^3 x 256 x 16 will overflow.
 	 */
 	
-	int64_t global_site;
+	int global_site;

 	Lexicographic::CoorFromIndex(coor,local_site,local_vol);

@@ -175,8 +175,8 @@ class BinaryIO {

 	Lexicographic::IndexFromCoor(coor,global_site,global_vol);

-	uint64_t gsite29   = global_site%29;
-	uint64_t gsite31   = global_site%31;
+	uint32_t gsite29   = global_site%29;
+	uint32_t gsite31   = global_site%31;
 	
 	site_crc = crc32(0,(unsigned char *)site_buf,sizeof(fobj));
 	//	std::cout << "Site "<<local_site << " crc "<<std::hex<<site_crc<<std::dec<<std::endl;
@@ -545,9 +545,7 @@ class BinaryIO {
 				       const std::string &format,
 				       uint32_t &nersc_csum,
 				       uint32_t &scidac_csuma,
-				       uint32_t &scidac_csumb,
-				       int control=BINARYIO_LEXICOGRAPHIC
-				       )
+				       uint32_t &scidac_csumb)
  {
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::Realified::scalar_type word;    word w=0;
@@ -558,7 +556,7 @@ class BinaryIO {
    std::vector<sobj> scalardata(lsites); 
    std::vector<fobj>     iodata(lsites); // Munge, checksum, byte order in here
    
-    IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|control,
+    IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
 	     nersc_csum,scidac_csuma,scidac_csumb);

    GridStopWatch timer; 
@@ -584,8 +582,7 @@ class BinaryIO {
 					  const std::string &format,
 					  uint32_t &nersc_csum,
 					  uint32_t &scidac_csuma,
-					  uint32_t &scidac_csumb,
-					  int control=BINARYIO_LEXICOGRAPHIC)
+					  uint32_t &scidac_csumb)
  {
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::Realified::scalar_type word;    word w=0;
@@ -610,7 +607,7 @@ class BinaryIO {
    while (attemptsLeft >= 0)
    {
      grid->Barrier();
-      IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|control,
+      IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
 	             nersc_csum,scidac_csuma,scidac_csumb);
      if (checkWrite)
      {
@@ -620,7 +617,7 @@ class BinaryIO {

        std::cout << GridLogMessage << "writeLatticeObject: read back object" << std::endl;
        grid->Barrier();
-        IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|control,
+        IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
 	               cknersc_csum,ckscidac_csuma,ckscidac_csumb);
        if ((cknersc_csum != nersc_csum) or (ckscidac_csuma != scidac_csuma) or (ckscidac_csumb != scidac_csumb))
        {
--- a/Grid/parallelIO/IldgIO.h
+++ b/Grid/parallelIO/IldgIO.h
@@ -31,7 +31,6 @@ directory
 #include <fstream>
 #include <iomanip>
 #include <iostream>
-#include <string>
 #include <map>

 #include <pwd.h>
@@ -206,7 +205,7 @@ class GridLimeReader : public BinaryIO {
  // Read a generic lattice field and verify checksum
  ////////////////////////////////////////////
  template<class vobj>
-  void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name,int control=BINARYIO_LEXICOGRAPHIC)
+  void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
  {
    typedef typename vobj::scalar_object sobj;
    scidacChecksum scidacChecksum_;
@@ -238,7 +237,7 @@ class GridLimeReader : public BinaryIO {
 	uint64_t offset= ftello(File);
 	//	std::cout << " ReadLatticeObject from offset "<<offset << std::endl;
 	BinarySimpleMunger<sobj,sobj> munge;
-	BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb,control);
+	BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
 	std::cout << GridLogMessage << "SciDAC checksum A " << std::hex << scidac_csuma << std::dec << std::endl;
 	std::cout << GridLogMessage << "SciDAC checksum B " << std::hex << scidac_csumb << std::dec << std::endl;
 	/////////////////////////////////////////////
@@ -408,7 +407,7 @@ class GridLimeWriter : public BinaryIO
  // in communicator used by the field.Grid()
  ////////////////////////////////////////////////////
  template<class vobj>
-  void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name,int control=BINARYIO_LEXICOGRAPHIC)
+  void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
  {
    ////////////////////////////////////////////////////////////////////
    // NB: FILE and iostream are jointly writing disjoint sequences in the
@@ -459,7 +458,7 @@ class GridLimeWriter : public BinaryIO
    ///////////////////////////////////////////
    std::string format = getFormatString<vobj>();
    BinarySimpleMunger<sobj,sobj> munge;
-    BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb,control);
+    BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb);

    ///////////////////////////////////////////
    // Wind forward and close the record
@@ -512,8 +511,7 @@ class ScidacWriter : public GridLimeWriter {
  ////////////////////////////////////////////////
  template <class vobj, class userRecord>
  void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord,
-                              const unsigned int recordScientificPrec = 0,
-			      int control=BINARYIO_LEXICOGRAPHIC)
+                              const unsigned int recordScientificPrec = 0) 
  {
    GridBase * grid = field.Grid();

@@ -535,7 +533,7 @@ class ScidacWriter : public GridLimeWriter {
      writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
    }
    // Collective call
-    writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA),control);      // Closes message with checksum
+    writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));      // Closes message with checksum
  }
 };

@@ -554,8 +552,7 @@ class ScidacReader : public GridLimeReader {
  // Write generic lattice field in scidac format
  ////////////////////////////////////////////////
  template <class vobj, class userRecord>
-  void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord,
-			     int control=BINARYIO_LEXICOGRAPHIC) 
+  void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord) 
  {
    typedef typename vobj::scalar_object sobj;
    GridBase * grid = field.Grid();
@@ -573,14 +570,12 @@ class ScidacReader : public GridLimeReader {
    readLimeObject(header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message 
    readLimeObject(_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
    readLimeObject(_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
-    readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA),control);
+    readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));
  }
  void skipPastBinaryRecord(void) {
    std::string rec_name(ILDG_BINARY_DATA);
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
      if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) )  ) {
-  // in principle should do the line below, but that breaks backard compatibility with old data
-  // skipPastObjectRecord(std::string(GRID_FIELD_NORM));
 	skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
 	return;
      }
@@ -657,8 +652,7 @@ class IldgWriter : public ScidacWriter {
    // Fill ILDG header data struct
    //////////////////////////////////////////////////////
    ildgFormat ildgfmt ;
-    const std::string stNC = std::to_string( Nc ) ;
-    ildgfmt.field          = std::string("su"+stNC+"gauge");
+    ildgfmt.field     = std::string("su3gauge");

    if ( format == std::string("IEEE32BIG") ) { 
      ildgfmt.precision = 32;
@@ -875,8 +869,7 @@ class IldgReader : public GridLimeReader {
    } else { 

      assert(found_ildgFormat);
-      const std::string stNC = std::to_string( Nc ) ;
-      assert ( ildgFormat_.field == std::string("su"+stNC+"gauge") );
+      assert ( ildgFormat_.field == std::string("su3gauge") );

      ///////////////////////////////////////////////////////////////////////////////////////
      // Populate our Grid metadata as best we can
@@ -884,7 +877,7 @@ class IldgReader : public GridLimeReader {

      std::ostringstream vers; vers << ildgFormat_.version;
      FieldMetaData_.hdr_version = vers.str();
-      FieldMetaData_.data_type = std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC);
+      FieldMetaData_.data_type = std::string("4D_SU3_GAUGE_3X3");

      FieldMetaData_.nd=4;
      FieldMetaData_.dimension.resize(4);
--- a/Grid/parallelIO/MetaData.h
+++ b/Grid/parallelIO/MetaData.h
@@ -6,8 +6,8 @@

    Copyright (C) 2015

+
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-    Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -182,8 +182,8 @@ class GaugeStatistics
 public:
  void operator()(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
  {
-    header.link_trace = WilsonLoops<Impl>::linkTrace(data);
-    header.plaquette  = WilsonLoops<Impl>::avgPlaquette(data);
+    header.link_trace=WilsonLoops<Impl>::linkTrace(data);
+    header.plaquette =WilsonLoops<Impl>::avgPlaquette(data);
  }
 };
 typedef GaugeStatistics<PeriodicGimplD> PeriodicGaugeStatistics;
@@ -203,24 +203,20 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
 //////////////////////////////////////////////////////////////////////
 inline void reconstruct3(LorentzColourMatrix & cm)
 {
-  assert( Nc < 4 && Nc > 1 ) ;
+  const int x=0;
+  const int y=1;
+  const int z=2;
  for(int mu=0;mu<Nd;mu++){
-    #if Nc == 2
-      cm(mu)()(1,0) = -adj(cm(mu)()(0,y)) ;
-      cm(mu)()(1,1) =  adj(cm(mu)()(0,x)) ;
-    #else
-      const int x=0 , y=1 , z=2 ; // a little disinenuous labelling
-      cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
-      cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
-      cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
-    #endif
+    cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
+    cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
+    cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
  }
 }

 ////////////////////////////////////////////////////////////////////////////////
 // Some data types for intermediate storage
 ////////////////////////////////////////////////////////////////////////////////
-template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, Nc-1>, Nd >;
+template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;

 typedef iLorentzColour2x3<Complex>  LorentzColour2x3;
 typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
@@ -282,6 +278,7 @@ struct GaugeSimpleMunger{

 template <class fobj, class sobj>
 struct GaugeSimpleUnmunger {
+
  void operator()(sobj &in, fobj &out) {
    for (int mu = 0; mu < Nd; mu++) {
      for (int i = 0; i < Nc; i++) {
@@ -320,8 +317,8 @@ template<class fobj,class sobj>
 struct Gauge3x2munger{
  void operator() (fobj &in,sobj &out){
    for(int mu=0;mu<Nd;mu++){
-      for(int i=0;i<Nc-1;i++){
-	for(int j=0;j<Nc;j++){
+      for(int i=0;i<2;i++){
+	for(int j=0;j<3;j++){
 	  out(mu)()(i,j) = in(mu)(i)(j);
 	}}
    }
@@ -333,8 +330,8 @@ template<class fobj,class sobj>
 struct Gauge3x2unmunger{
  void operator() (sobj &in,fobj &out){
    for(int mu=0;mu<Nd;mu++){
-      for(int i=0;i<Nc-1;i++){
-	for(int j=0;j<Nc;j++){
+      for(int i=0;i<2;i++){
+	for(int j=0;j<3;j++){
 	  out(mu)(i)(j) = in(mu)()(i,j);
 	}}
    }
--- a/Grid/parallelIO/NerscIO.h
+++ b/Grid/parallelIO/NerscIO.h
@@ -9,7 +9,6 @@
    Author: Matt Spraggs <matthew.spraggs@gmail.com>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: paboyle <paboyle@ph.ed.ac.uk>
-    Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -31,8 +30,6 @@
 #ifndef GRID_NERSC_IO_H
 #define GRID_NERSC_IO_H

-#include <string>
-
 NAMESPACE_BEGIN(Grid);

 using namespace Grid;
@@ -150,17 +147,15 @@ public:

    std::string format(header.floating_point);

-    const int ieee32big = (format == std::string("IEEE32BIG"));
-    const int ieee32    = (format == std::string("IEEE32"));
-    const int ieee64big = (format == std::string("IEEE64BIG"));
-    const int ieee64    = (format == std::string("IEEE64") || \
-			   format == std::string("IEEE64LITTLE"));
+    int ieee32big = (format == std::string("IEEE32BIG"));
+    int ieee32    = (format == std::string("IEEE32"));
+    int ieee64big = (format == std::string("IEEE64BIG"));
+    int ieee64    = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));

    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    // depending on datatype, set up munger;
    // munger is a function of <floating point, Real, data_type>
-    const std::string stNC = std::to_string( Nc ) ;
-    if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE") ) {
+    if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
      if ( ieee32 || ieee32big ) {
 	BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F> 
 	  (Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
@@ -171,7 +166,7 @@ public:
 	  (Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
 	   nersc_csum,scidac_csuma,scidac_csumb);
      }
-    } else if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC) ) {
+    } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
      if ( ieee32 || ieee32big ) {
 	BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF>
 	  (Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
@@ -216,29 +211,27 @@ public:
  template<class GaugeStats=PeriodicGaugeStatistics>
  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
-					std::string ens_label = std::string("DWF"),
-					std::string ens_id = std::string("UKQCD"),
-					unsigned int sequence_number = 1)
+					std::string ens_label = std::string("DWF"))
  {
-    writeConfiguration(Umu,file,0,1,ens_label,ens_id,sequence_number);
+    writeConfiguration(Umu,file,0,1,ens_label);
  }
  template<class GaugeStats=PeriodicGaugeStatistics>
  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
 					int two_row,
 					int bits32,
-					std::string ens_label = std::string("DWF"),
-					std::string ens_id = std::string("UKQCD"),
-					unsigned int sequence_number = 1)
+					std::string ens_label = std::string("DWF"))
  {
    typedef vLorentzColourMatrixD vobj;
    typedef typename vobj::scalar_object sobj;

    FieldMetaData header;
-    header.sequence_number = sequence_number;
-    header.ensemble_id     = ens_id;
+    ///////////////////////////////////////////
+    // Following should become arguments
+    ///////////////////////////////////////////
+    header.sequence_number = 1;
+    header.ensemble_id     = std::string("UKQCD");
    header.ensemble_label  = ens_label;
-    header.hdr_version     = "1.0" ;

    typedef LorentzColourMatrixD fobj3D;
    typedef LorentzColour2x3D    fobj2D;
@@ -252,14 +245,10 @@ public:

    uint64_t offset;

-    // Sod it -- always write NcxNc double
-    header.floating_point  = std::string("IEEE64BIG");
-    const std::string stNC = std::to_string( Nc ) ;
-    if( two_row ) {
-      header.data_type = std::string("4D_SU" + stNC + "_GAUGE" );
-    } else {
-      header.data_type = std::string("4D_SU" + stNC + "_GAUGE_" + stNC + "x" + stNC );
-    }
+    // Sod it -- always write 3x3 double
+    header.floating_point = std::string("IEEE64BIG");
+    header.data_type      = std::string("4D_SU3_GAUGE_3x3");
+    GaugeSimpleUnmunger<fobj3D,sobj> munge;
    if ( grid->IsBoss() ) { 
      truncate(file);
      offset = writeHeader(header,file);
@@ -267,15 +256,8 @@ public:
    grid->Broadcast(0,(void *)&offset,sizeof(offset));

    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
-    if( two_row ) {
-      Gauge3x2unmunger<fobj2D,sobj> munge;
-      BinaryIO::writeLatticeObject<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point,
-						nersc_csum,scidac_csuma,scidac_csumb);
-    } else {
-      GaugeSimpleUnmunger<fobj3D,sobj> munge;
-      BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
-						nersc_csum,scidac_csuma,scidac_csumb);
-    }
+    BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
+					      nersc_csum,scidac_csuma,scidac_csumb);
    header.checksum = nersc_csum;
    if ( grid->IsBoss() ) { 
      writeHeader(header,file);
@@ -307,7 +289,8 @@ public:
    header.plaquette=0.0;
    MachineCharacteristics(header);

-    uint64_t offset;
+	uint64_t offset;
+  
 #ifdef RNG_RANLUX
    header.floating_point = std::string("UINT64");
    header.data_type      = std::string("RANLUX48");
@@ -347,7 +330,7 @@ public:

    GridBase *grid = parallel.Grid();

-    uint64_t offset = readHeader(file,grid,header);
+	uint64_t offset = readHeader(file,grid,header);

    FieldMetaData clone(header);

--- a/Grid/perfmon/PerfCount.cc
+++ b/Grid/perfmon/PerfCount.cc
@@ -27,12 +27,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /*  END LEGAL */

 #include <Grid/GridCore.h>
-
-#include <Grid/perfmon/Timer.h>
 #include <Grid/perfmon/PerfCount.h>
-NAMESPACE_BEGIN(Grid);

-GridTimePoint theProgramStart = GridClock::now();
+NAMESPACE_BEGIN(Grid);

 #define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
 #define RawConfig(A,B) (A<<8|B)
--- a/Grid/perfmon/PerfCount.h
+++ b/Grid/perfmon/PerfCount.h
@@ -30,12 +30,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_PERFCOUNT_H
 #define GRID_PERFCOUNT_H

-
-#ifndef __SSC_START
-#define __SSC_START
-#define __SSC_STOP
-#endif
-
 #include <sys/time.h>
 #include <ctime>
 #include <chrono>
@@ -78,9 +72,17 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
 inline uint64_t cyclecount(void){ 
  return 0;
 }
+#define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx")
+#define __SSC_STOP  __SSC_MARK(0x110)
+#define __SSC_START __SSC_MARK(0x111)
+

 #else

+#define __SSC_MARK(mark) 
+#define __SSC_STOP  
+#define __SSC_START 
+
 /*
 * cycle counters arch dependent
 */
--- a/Grid/perfmon/Timer.h
+++ b/Grid/perfmon/Timer.h
@@ -35,8 +35,17 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid)

-//typedef  std::chrono::system_clock          GridClock;
-typedef  std::chrono::high_resolution_clock   GridClock;
+// Dress the output; use std::chrono
+// C++11 time facilities better?
+inline double usecond(void) {
+  struct timeval tv;
+#ifdef TIMERS_ON
+  gettimeofday(&tv,NULL);
+#endif
+  return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
+}
+
+typedef  std::chrono::system_clock          GridClock;
 typedef  std::chrono::time_point<GridClock> GridTimePoint;

 typedef  std::chrono::seconds               GridSecs;
@@ -44,15 +53,6 @@ typedef  std::chrono::milliseconds          GridMillisecs;
 typedef  std::chrono::microseconds          GridUsecs;
 typedef  std::chrono::microseconds          GridTime;

-extern GridTimePoint theProgramStart;
-// Dress the output; use std::chrono
-// C++11 time facilities better?
-inline double usecond(void) {
-  auto usecs = std::chrono::duration_cast<GridUsecs>(GridClock::now()-theProgramStart); 
-  return 1.0*usecs.count();
-}
-
-
 inline std::ostream& operator<< (std::ostream & stream, const GridSecs & time)
 {
  stream << time.count()<<" s";
--- a/Grid/perfmon/Tracing.h
+++ b/Grid/perfmon/Tracing.h
@@ -1,70 +0,0 @@
-#pragma once
-
-NAMESPACE_BEGIN(Grid);
-
-#ifdef GRID_TRACING_NVTX
-#include <nvToolsExt.h>
-class GridTracer {
-public:
-  GridTracer(const char* name) {
-    nvtxRangePushA(name);
-  }
-  ~GridTracer() {
-    nvtxRangePop();
-  }
-};
-inline void tracePush(const char *name) { nvtxRangePushA(name); }
-inline void tracePop(const char *name) { nvtxRangePop(); }
-inline int  traceStart(const char *name) {  }
-inline void traceStop(int ID) {  }
-#endif
-
-#ifdef GRID_TRACING_ROCTX
-#include <roctracer/roctx.h>
-class GridTracer {
- public:
-  GridTracer(const char* name) {
-    roctxRangePushA(name);
-    std::cout << "roctxRangePush "<<name<<std::endl;
-  }
-  ~GridTracer() {
-    roctxRangePop();
-    std::cout << "roctxRangePop "<<std::endl;
-  }
-};
-inline void tracePush(const char *name) { roctxRangePushA(name); }
-inline void tracePop(const char *name) { roctxRangePop(); }
-inline int  traceStart(const char *name) { roctxRangeStart(name); }
-inline void traceStop(int ID) { roctxRangeStop(ID); }
-#endif
-
-#ifdef GRID_TRACING_TIMER
-class GridTracer {
- public:
-  const char *name;
-  double elapsed;
-  GridTracer(const char* _name) {
-    name = _name;
-    elapsed=-usecond();
-  }
-  ~GridTracer() {
-    elapsed+=usecond();
-    std::cout << GridLogTracing << name << " took " <<elapsed<< " us" <<std::endl;
-  }
-};
-inline void tracePush(const char *name) {  }
-inline void tracePop(const char *name) {  }
-inline int  traceStart(const char *name) { return 0; }
-inline void traceStop(int ID) {  }
-#endif
-
-#ifdef GRID_TRACING_NONE
-#define GRID_TRACE(name) 
-inline void tracePush(const char *name) {  }
-inline void tracePop(const char *name) {  }
-inline int  traceStart(const char *name) { return 0;  }
-inline void traceStop(int ID) {  }
-#else
-#define GRID_TRACE(name) GridTracer uniq_name_using_macros##__COUNTER__(name);
-#endif
-NAMESPACE_END(Grid);
--- a/Grid/pugixml/pugixml.cc
+++ b/Grid/pugixml/pugixml.cc
@@ -16,12 +16,8 @@

 #ifdef __NVCC__
 #pragma push
-#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
-#pragma nv_diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
-#else
 #pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
 #endif
-#endif

 #include "pugixml.h"

--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@@ -104,7 +104,7 @@ template<typename vtype> using iSpinMatrix                = iScalar<iMatrix<iSca
 template<typename vtype> using iColourMatrix              = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
 template<typename vtype> using iSpinColourMatrix          = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
 template<typename vtype> using iLorentzColourMatrix       = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
-template<typename vtype> using iLorentzComplex            = iVector<iScalar<iScalar<vtype> >, Nd > ;
+template<typename vtype> using iLorentzVector             = iVector<iScalar<iScalar<vtype> >, Nd > ;
 template<typename vtype> using iDoubleStoredColourMatrix  = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
 template<typename vtype> using iSpinVector                = iScalar<iVector<iScalar<vtype>, Ns> >;
 template<typename vtype> using iColourVector              = iScalar<iScalar<iVector<vtype, Nc> > >;
@@ -127,7 +127,6 @@ typedef iSpinMatrix<ComplexD >          SpinMatrixD;
 typedef iSpinMatrix<vComplex >          vSpinMatrix;
 typedef iSpinMatrix<vComplexF>          vSpinMatrixF;
 typedef iSpinMatrix<vComplexD>          vSpinMatrixD;
-typedef iSpinMatrix<vComplexD2>         vSpinMatrixD2;

 // Colour Matrix
 typedef iColourMatrix<Complex  >        ColourMatrix;
@@ -137,7 +136,6 @@ typedef iColourMatrix<ComplexD >        ColourMatrixD;
 typedef iColourMatrix<vComplex >        vColourMatrix;
 typedef iColourMatrix<vComplexF>        vColourMatrixF;
 typedef iColourMatrix<vComplexD>        vColourMatrixD;
-typedef iColourMatrix<vComplexD2>       vColourMatrixD2;

 // SpinColour matrix
 typedef iSpinColourMatrix<Complex  >    SpinColourMatrix;
@@ -147,7 +145,6 @@ typedef iSpinColourMatrix<ComplexD >    SpinColourMatrixD;
 typedef iSpinColourMatrix<vComplex >    vSpinColourMatrix;
 typedef iSpinColourMatrix<vComplexF>    vSpinColourMatrixF;
 typedef iSpinColourMatrix<vComplexD>    vSpinColourMatrixD;
-typedef iSpinColourMatrix<vComplexD2>   vSpinColourMatrixD2;

 // SpinColourSpinColour matrix
 typedef iSpinColourSpinColourMatrix<Complex  >    SpinColourSpinColourMatrix;
@@ -157,7 +154,6 @@ typedef iSpinColourSpinColourMatrix<ComplexD >    SpinColourSpinColourMatrixD;
 typedef iSpinColourSpinColourMatrix<vComplex >    vSpinColourSpinColourMatrix;
 typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
 typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;
-typedef iSpinColourSpinColourMatrix<vComplexD2>   vSpinColourSpinColourMatrixD2;

 // SpinColourSpinColour matrix
 typedef iSpinColourSpinColourMatrix<Complex  >    SpinColourSpinColourMatrix;
@@ -167,46 +163,42 @@ typedef iSpinColourSpinColourMatrix<ComplexD >    SpinColourSpinColourMatrixD;
 typedef iSpinColourSpinColourMatrix<vComplex >    vSpinColourSpinColourMatrix;
 typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
 typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;
-typedef iSpinColourSpinColourMatrix<vComplexD2>   vSpinColourSpinColourMatrixD2;

-// LorentzColour
+// LorentzVector
+typedef iLorentzVector<Complex  > LorentzVector;
+typedef iLorentzVector<ComplexF > LorentzVectorF;
+typedef iLorentzVector<ComplexD > LorentzVectorD;
+
+typedef iLorentzVector<vComplex > vLorentzVector;
+typedef iLorentzVector<vComplexF> vLorentzVectorF;
+typedef iLorentzVector<vComplexD> vLorentzVectorD;
+
+// LorentzColourMatrix
 typedef iLorentzColourMatrix<Complex  > LorentzColourMatrix;
 typedef iLorentzColourMatrix<ComplexF > LorentzColourMatrixF;
 typedef iLorentzColourMatrix<ComplexD > LorentzColourMatrixD;

-typedef iLorentzColourMatrix<vComplex >  vLorentzColourMatrix;
-typedef iLorentzColourMatrix<vComplexF>  vLorentzColourMatrixF;
-typedef iLorentzColourMatrix<vComplexD>  vLorentzColourMatrixD;
-typedef iLorentzColourMatrix<vComplexD2> vLorentzColourMatrixD2;
-
-// LorentzComplex
-typedef iLorentzComplex<Complex  > LorentzComplex;
-typedef iLorentzComplex<ComplexF > LorentzComplexF;
-typedef iLorentzComplex<ComplexD > LorentzComplexD;
-
-typedef iLorentzComplex<vComplex > vLorentzComplex;
-typedef iLorentzComplex<vComplexF> vLorentzComplexF;
-typedef iLorentzComplex<vComplexD> vLorentzComplexD;
+typedef iLorentzColourMatrix<vComplex > vLorentzColourMatrix;
+typedef iLorentzColourMatrix<vComplexF> vLorentzColourMatrixF;
+typedef iLorentzColourMatrix<vComplexD> vLorentzColourMatrixD;

 // DoubleStored gauge field
 typedef iDoubleStoredColourMatrix<Complex  > DoubleStoredColourMatrix;
 typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF;
 typedef iDoubleStoredColourMatrix<ComplexD > DoubleStoredColourMatrixD;

-typedef iDoubleStoredColourMatrix<vComplex >  vDoubleStoredColourMatrix;
-typedef iDoubleStoredColourMatrix<vComplexF>  vDoubleStoredColourMatrixF;
-typedef iDoubleStoredColourMatrix<vComplexD>  vDoubleStoredColourMatrixD;
-typedef iDoubleStoredColourMatrix<vComplexD2> vDoubleStoredColourMatrixD2;
+typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
+typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
+typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;

 //G-parity flavour matrix
 typedef iGparityFlavourMatrix<Complex> GparityFlavourMatrix;
 typedef iGparityFlavourMatrix<ComplexF> GparityFlavourMatrixF;
 typedef iGparityFlavourMatrix<ComplexD> GparityFlavourMatrixD;

-typedef iGparityFlavourMatrix<vComplex>   vGparityFlavourMatrix;
-typedef iGparityFlavourMatrix<vComplexF>  vGparityFlavourMatrixF;
-typedef iGparityFlavourMatrix<vComplexD>  vGparityFlavourMatrixD;
-typedef iGparityFlavourMatrix<vComplexD2> vGparityFlavourMatrixD2;
+typedef iGparityFlavourMatrix<vComplex> vGparityFlavourMatrix;
+typedef iGparityFlavourMatrix<vComplexF> vGparityFlavourMatrixF;
+typedef iGparityFlavourMatrix<vComplexD> vGparityFlavourMatrixD;


 // Spin vector
@@ -217,7 +209,6 @@ typedef iSpinVector<ComplexD>           SpinVectorD;
 typedef iSpinVector<vComplex >           vSpinVector;
 typedef iSpinVector<vComplexF>           vSpinVectorF;
 typedef iSpinVector<vComplexD>           vSpinVectorD;
-typedef iSpinVector<vComplexD2>          vSpinVectorD2;

 // Colour vector
 typedef iColourVector<Complex >         ColourVector;
@@ -227,7 +218,6 @@ typedef iColourVector<ComplexD>         ColourVectorD;
 typedef iColourVector<vComplex >         vColourVector;
 typedef iColourVector<vComplexF>         vColourVectorF;
 typedef iColourVector<vComplexD>         vColourVectorD;
-typedef iColourVector<vComplexD2>        vColourVectorD2;

 // SpinColourVector
 typedef iSpinColourVector<Complex >     SpinColourVector;
@@ -237,7 +227,6 @@ typedef iSpinColourVector<ComplexD>     SpinColourVectorD;
 typedef iSpinColourVector<vComplex >     vSpinColourVector;
 typedef iSpinColourVector<vComplexF>     vSpinColourVectorF;
 typedef iSpinColourVector<vComplexD>     vSpinColourVectorD;
-typedef iSpinColourVector<vComplexD2>    vSpinColourVectorD2;

 // HalfSpin vector
 typedef iHalfSpinVector<Complex >       HalfSpinVector;
@@ -247,17 +236,15 @@ typedef iHalfSpinVector<ComplexD>       HalfSpinVectorD;
 typedef iHalfSpinVector<vComplex >       vHalfSpinVector;
 typedef iHalfSpinVector<vComplexF>       vHalfSpinVectorF;
 typedef iHalfSpinVector<vComplexD>       vHalfSpinVectorD;
-typedef iHalfSpinVector<vComplexD2>      vHalfSpinVectorD2;

 // HalfSpinColour vector
 typedef iHalfSpinColourVector<Complex > HalfSpinColourVector;
 typedef iHalfSpinColourVector<ComplexF> HalfSpinColourVectorF;
 typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
    
-typedef iHalfSpinColourVector<vComplex >  vHalfSpinColourVector;
-typedef iHalfSpinColourVector<vComplexF>  vHalfSpinColourVectorF;
-typedef iHalfSpinColourVector<vComplexD>  vHalfSpinColourVectorD;
-typedef iHalfSpinColourVector<vComplexD2> vHalfSpinColourVectorD2;
+typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
+typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
+typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;

 //G-parity flavour vector
 typedef iGparityFlavourVector<Complex >         GparityFlavourVector;
@@ -267,7 +254,7 @@ typedef iGparityFlavourVector<ComplexD>         GparityFlavourVectorD;
 typedef iGparityFlavourVector<vComplex >         vGparityFlavourVector;
 typedef iGparityFlavourVector<vComplexF>         vGparityFlavourVectorF;
 typedef iGparityFlavourVector<vComplexD>         vGparityFlavourVectorD;
-typedef iGparityFlavourVector<vComplexD2>        vGparityFlavourVectorD2;
+
    
 // singlets
 typedef iSinglet<Complex >         TComplex;     // FIXME This is painful. Tensor singlet complex type.
@@ -277,7 +264,6 @@ typedef iSinglet<ComplexD>         TComplexD;    // FIXME This is painful. Tenso
 typedef iSinglet<vComplex >        vTComplex ;   // what if we don't know the tensor structure
 typedef iSinglet<vComplexF>        vTComplexF;   // what if we don't know the tensor structure
 typedef iSinglet<vComplexD>        vTComplexD;   // what if we don't know the tensor structure
-typedef iSinglet<vComplexD2>       vTComplexD2;   // what if we don't know the tensor structure

 typedef iSinglet<Real >            TReal;        // Shouldn't need these; can I make it work without?
 typedef iSinglet<RealF>            TRealF;       // Shouldn't need these; can I make it work without?
@@ -295,62 +281,51 @@ typedef iSinglet<Integer >         TInteger;
 typedef Lattice<vColourMatrix>          LatticeColourMatrix;
 typedef Lattice<vColourMatrixF>         LatticeColourMatrixF;
 typedef Lattice<vColourMatrixD>         LatticeColourMatrixD;
-typedef Lattice<vColourMatrixD2>        LatticeColourMatrixD2;

 typedef Lattice<vSpinMatrix>            LatticeSpinMatrix;
 typedef Lattice<vSpinMatrixF>           LatticeSpinMatrixF;
 typedef Lattice<vSpinMatrixD>           LatticeSpinMatrixD;
-typedef Lattice<vSpinMatrixD2>          LatticeSpinMatrixD2;

 typedef Lattice<vSpinColourMatrix>      LatticeSpinColourMatrix;
 typedef Lattice<vSpinColourMatrixF>     LatticeSpinColourMatrixF;
 typedef Lattice<vSpinColourMatrixD>     LatticeSpinColourMatrixD;
-typedef Lattice<vSpinColourMatrixD2>    LatticeSpinColourMatrixD2;

 typedef Lattice<vSpinColourSpinColourMatrix>      LatticeSpinColourSpinColourMatrix;
 typedef Lattice<vSpinColourSpinColourMatrixF>     LatticeSpinColourSpinColourMatrixF;
 typedef Lattice<vSpinColourSpinColourMatrixD>     LatticeSpinColourSpinColourMatrixD;
-typedef Lattice<vSpinColourSpinColourMatrixD2>    LatticeSpinColourSpinColourMatrixD2;

-typedef Lattice<vLorentzColourMatrix>   LatticeLorentzColourMatrix;
-typedef Lattice<vLorentzColourMatrixF>  LatticeLorentzColourMatrixF;
-typedef Lattice<vLorentzColourMatrixD>  LatticeLorentzColourMatrixD;
-typedef Lattice<vLorentzColourMatrixD2> LatticeLorentzColourMatrixD2;
+typedef Lattice<vLorentzColourMatrix>  LatticeLorentzColourMatrix;
+typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
+typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD;

-typedef Lattice<vLorentzComplex>  LatticeLorentzComplex;
-typedef Lattice<vLorentzComplexF> LatticeLorentzComplexF;
-typedef Lattice<vLorentzComplexD> LatticeLorentzComplexD;
+typedef Lattice<vLorentzVector>  LatticeLorentzVector;
+typedef Lattice<vLorentzVectorF> LatticeLorentzVectorF;
+typedef Lattice<vLorentzVectorD> LatticeLorentzVectorD;

 // DoubleStored gauge field
-typedef Lattice<vDoubleStoredColourMatrix>   LatticeDoubleStoredColourMatrix;
-typedef Lattice<vDoubleStoredColourMatrixF>  LatticeDoubleStoredColourMatrixF;
-typedef Lattice<vDoubleStoredColourMatrixD>  LatticeDoubleStoredColourMatrixD;
-typedef Lattice<vDoubleStoredColourMatrixD2> LatticeDoubleStoredColourMatrixD2;
+typedef Lattice<vDoubleStoredColourMatrix>  LatticeDoubleStoredColourMatrix;
+typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF;
+typedef Lattice<vDoubleStoredColourMatrixD> LatticeDoubleStoredColourMatrixD;

 typedef Lattice<vSpinVector>            LatticeSpinVector;
 typedef Lattice<vSpinVectorF>           LatticeSpinVectorF;
 typedef Lattice<vSpinVectorD>           LatticeSpinVectorD;
-typedef Lattice<vSpinVectorD2>          LatticeSpinVectorD2;

 typedef Lattice<vColourVector>          LatticeColourVector;
 typedef Lattice<vColourVectorF>         LatticeColourVectorF;
 typedef Lattice<vColourVectorD>         LatticeColourVectorD;
-typedef Lattice<vColourVectorD2>        LatticeColourVectorD2;

 typedef Lattice<vSpinColourVector>      LatticeSpinColourVector;
 typedef Lattice<vSpinColourVectorF>     LatticeSpinColourVectorF;
 typedef Lattice<vSpinColourVectorD>     LatticeSpinColourVectorD;
-typedef Lattice<vSpinColourVectorD2>    LatticeSpinColourVectorD2;

 typedef Lattice<vHalfSpinVector>        LatticeHalfSpinVector;
 typedef Lattice<vHalfSpinVectorF>       LatticeHalfSpinVectorF;
 typedef Lattice<vHalfSpinVectorD>       LatticeHalfSpinVectorD;
-typedef Lattice<vHalfSpinVectorD2>      LatticeHalfSpinVectorD2;

-typedef Lattice<vHalfSpinColourVector>   LatticeHalfSpinColourVector;
-typedef Lattice<vHalfSpinColourVectorF>  LatticeHalfSpinColourVectorF;
-typedef Lattice<vHalfSpinColourVectorD>  LatticeHalfSpinColourVectorD;
-typedef Lattice<vHalfSpinColourVectorD2> LatticeHalfSpinColourVectorD2;
+typedef Lattice<vHalfSpinColourVector>  LatticeHalfSpinColourVector;
+typedef Lattice<vHalfSpinColourVectorF> LatticeHalfSpinColourVectorF;
+typedef Lattice<vHalfSpinColourVectorD> LatticeHalfSpinColourVectorD;

 typedef Lattice<vTReal>            LatticeReal;
 typedef Lattice<vTRealF>           LatticeRealF;
@@ -359,7 +334,6 @@ typedef Lattice<vTRealD>           LatticeRealD;
 typedef Lattice<vTComplex>         LatticeComplex;
 typedef Lattice<vTComplexF>        LatticeComplexF;
 typedef Lattice<vTComplexD>        LatticeComplexD;
-typedef Lattice<vTComplexD2>       LatticeComplexD2;

 typedef Lattice<vTInteger>         LatticeInteger; // Predicates for "where"

@@ -367,42 +341,37 @@ typedef Lattice<vTInteger>         LatticeInteger; // Predicates for "where"
 ///////////////////////////////////////////
 // Physical names for things
 ///////////////////////////////////////////
-typedef LatticeHalfSpinColourVector   LatticeHalfFermion;
-typedef LatticeHalfSpinColourVectorF  LatticeHalfFermionF;
-typedef LatticeHalfSpinColourVectorD  LatticeHalfFermionD;
-typedef LatticeHalfSpinColourVectorD2 LatticeHalfFermionD2;
+typedef LatticeHalfSpinColourVector  LatticeHalfFermion;
+typedef LatticeHalfSpinColourVectorF LatticeHalfFermionF;
+typedef LatticeHalfSpinColourVectorF LatticeHalfFermionD;

 typedef LatticeSpinColourVector      LatticeFermion;
 typedef LatticeSpinColourVectorF     LatticeFermionF;
 typedef LatticeSpinColourVectorD     LatticeFermionD;
-typedef LatticeSpinColourVectorD2    LatticeFermionD2;

 typedef LatticeSpinColourMatrix                LatticePropagator;
 typedef LatticeSpinColourMatrixF               LatticePropagatorF;
 typedef LatticeSpinColourMatrixD               LatticePropagatorD;
-typedef LatticeSpinColourMatrixD2              LatticePropagatorD2;

 typedef LatticeLorentzColourMatrix             LatticeGaugeField;
 typedef LatticeLorentzColourMatrixF            LatticeGaugeFieldF;
 typedef LatticeLorentzColourMatrixD            LatticeGaugeFieldD;
-typedef LatticeLorentzColourMatrixD2           LatticeGaugeFieldD2;

 typedef LatticeDoubleStoredColourMatrix        LatticeDoubledGaugeField;
 typedef LatticeDoubleStoredColourMatrixF       LatticeDoubledGaugeFieldF;
 typedef LatticeDoubleStoredColourMatrixD       LatticeDoubledGaugeFieldD;
-typedef LatticeDoubleStoredColourMatrixD2      LatticeDoubledGaugeFieldD2;

 template<class GF> using LorentzScalar = Lattice<iScalar<typename GF::vector_object::element> >;

+// Uhgg... typing this hurt  ;)
+// (my keyboard got burning hot when I typed this, must be the anti-Fermion)
 typedef Lattice<vColourVector>          LatticeStaggeredFermion;    
 typedef Lattice<vColourVectorF>         LatticeStaggeredFermionF;    
 typedef Lattice<vColourVectorD>         LatticeStaggeredFermionD;    
-typedef Lattice<vColourVectorD2>        LatticeStaggeredFermionD2;    

 typedef Lattice<vColourMatrix>          LatticeStaggeredPropagator; 
 typedef Lattice<vColourMatrixF>         LatticeStaggeredPropagatorF; 
 typedef Lattice<vColourMatrixD>         LatticeStaggeredPropagatorD; 
-typedef Lattice<vColourMatrixD2>        LatticeStaggeredPropagatorD2; 

 //////////////////////////////////////////////////////////////////////////////
 // Peek and Poke named after physics attributes
@@ -521,20 +490,9 @@ template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<Lorentz
 // Fermion <-> propagator assignements
 //////////////////////////////////////////////
 //template <class Prop, class Ferm>
-#define FAST_FERM_TO_PROP
 template <class Fimpl>
 void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
 {
-#ifdef FAST_FERM_TO_PROP
-  autoView(p_v,p,CpuWrite);
-  autoView(f_v,f,CpuRead);
-  thread_for(idx,p_v.oSites(),{
-      for(int ss = 0; ss < Ns; ++ss) {
-      for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
-	p_v[idx]()(ss,s)(cc,c) = f_v[idx]()(ss)(cc); // Propagator sink index is LEFT, suitable for left mult by gauge link (e.g.)
-      }}
-    });
-#else
  for(int j = 0; j < Ns; ++j)
    {
      auto pjs = peekSpin(p, j, s);
@@ -546,23 +504,12 @@ void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::Fermio
 	}
      pokeSpin(p, pjs, j, s);
    }
-#endif
 }
    
 //template <class Prop, class Ferm>
 template <class Fimpl>
 void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
 {
-#ifdef FAST_FERM_TO_PROP
-  autoView(p_v,p,CpuRead);
-  autoView(f_v,f,CpuWrite);
-  thread_for(idx,p_v.oSites(),{
-      for(int ss = 0; ss < Ns; ++ss) {
-      for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
-	f_v[idx]()(ss)(cc) = p_v[idx]()(ss,s)(cc,c); // LEFT index is copied across for s,c right index
-      }}
-    });
-#else
  for(int j = 0; j < Ns; ++j)
    {
      auto pjs = peekSpin(p, j, s);
@@ -574,7 +521,6 @@ void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::Propagato
 	}
      pokeSpin(f, fj, j);
    }
-#endif
 }
    
 //////////////////////////////////////////////
--- a/Grid/qcd/action/Action.h
+++ b/Grid/qcd/action/Action.h
@@ -30,8 +30,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#ifndef GRID_QCD_ACTION_H
-#define GRID_QCD_ACTION_H
+#pragma once

 ////////////////////////////////////////////
 // Abstract base interface
@@ -51,4 +50,4 @@ NAMESPACE_CHECK(Fermion);
 #include <Grid/qcd/action/pseudofermion/PseudoFermion.h>
 NAMESPACE_CHECK(PseudoFermion);

-#endif
+
--- a/Grid/qcd/action/ActionBase.h
+++ b/Grid/qcd/action/ActionBase.h
@@ -34,96 +34,39 @@ directory

 NAMESPACE_BEGIN(Grid);

-///////////////////////////////////
-// Smart configuration base class
-///////////////////////////////////
-template< class Field >
-class ConfigurationBase
-{
-public:
-  ConfigurationBase() {}
-  virtual ~ConfigurationBase() {}
-  virtual void set_Field(Field& U) =0;
-  virtual void smeared_force(Field&) = 0;
-  virtual Field& get_SmearedU() =0;
-  virtual Field &get_U(bool smeared = false) = 0;
-};
-
 template <class GaugeField >
 class Action 
 {
+
 public:
  bool is_smeared = false;
  RealD deriv_norm_sum;
  RealD deriv_max_sum;
-  RealD Fdt_norm_sum;
-  RealD Fdt_max_sum;
  int   deriv_num;
  RealD deriv_us;
  RealD S_us;
  RealD refresh_us;
  void  reset_timer(void)        {
    deriv_us = S_us = refresh_us = 0.0;
-    deriv_norm_sum = deriv_max_sum=0.0;
-    Fdt_max_sum =  Fdt_norm_sum = 0.0;
    deriv_num=0;
+    deriv_norm_sum = deriv_max_sum=0.0;
  }
-  void  deriv_log(RealD nrm, RealD max,RealD Fdt_nrm,RealD Fdt_max) {
-    if ( max > deriv_max_sum ) {
-      deriv_max_sum=max;
-    }
-    deriv_norm_sum+=nrm;
-    if ( Fdt_max > Fdt_max_sum ) {
-      Fdt_max_sum=Fdt_max;
-    }
-    Fdt_norm_sum+=Fdt_nrm; deriv_num++;
-  }
-  RealD deriv_max_average(void)       { return deriv_max_sum; };
-  RealD deriv_norm_average(void)      { return deriv_norm_sum/deriv_num; };
-  RealD Fdt_max_average(void)         { return Fdt_max_sum; };
-  RealD Fdt_norm_average(void)        { return Fdt_norm_sum/deriv_num; };
+  void  deriv_log(RealD nrm, RealD max) { deriv_max_sum+=max; deriv_norm_sum+=nrm; deriv_num++;}
+  RealD deriv_max_average(void)         { return deriv_max_sum/deriv_num; };
+  RealD deriv_norm_average(void)        { return deriv_norm_sum/deriv_num; };
  RealD deriv_timer(void)        { return deriv_us; };
-  RealD S_timer(void)            { return S_us; };
-  RealD refresh_timer(void)      { return refresh_us; };
+  RealD S_timer(void)            { return deriv_us; };
+  RealD refresh_timer(void)      { return deriv_us; };
  void deriv_timer_start(void)   { deriv_us-=usecond(); }
  void deriv_timer_stop(void)    { deriv_us+=usecond(); }
  void refresh_timer_start(void) { refresh_us-=usecond(); }
  void refresh_timer_stop(void)  { refresh_us+=usecond(); }
  void S_timer_start(void)       { S_us-=usecond(); }
  void S_timer_stop(void)        { S_us+=usecond(); }
-  /////////////////////////////
  // Heatbath?
-  /////////////////////////////
  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
  virtual RealD S(const GaugeField& U) = 0;                             // evaluate the action
-  virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ;  // if the refresh computes the action, can cache it. Alternately refreshAndAction() ?
  virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0;        // evaluate the action derivative
-
-  /////////////////////////////////////////////////////////////
-  // virtual smeared interface through configuration container
-  /////////////////////////////////////////////////////////////
-  virtual void refresh(ConfigurationBase<GaugeField> & U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
-  {
-    refresh(U.get_U(is_smeared),sRNG,pRNG);
-  }
-  virtual RealD S(ConfigurationBase<GaugeField>& U)
-  {
-    return S(U.get_U(is_smeared));
-  }
-  virtual RealD Sinitial(ConfigurationBase<GaugeField>& U) 
-  {
-    return Sinitial(U.get_U(is_smeared));
-  }
-  virtual void deriv(ConfigurationBase<GaugeField>& U, GaugeField& dSdU)
-  {
-    deriv(U.get_U(is_smeared),dSdU); 
-    if ( is_smeared ) {
-      U.smeared_force(dSdU);
-    }
-  }
-  ///////////////////////////////
-  // Logging
-  ///////////////////////////////
  virtual std::string action_name()    = 0;                             // return the action name
  virtual std::string LogParameters()  = 0;                             // prints action parameters
  virtual ~Action(){}
--- a/Grid/qcd/action/ActionCore.h
+++ b/Grid/qcd/action/ActionCore.h
@@ -30,8 +30,6 @@ directory
 #ifndef QCD_ACTION_CORE
 #define QCD_ACTION_CORE

-#include <Grid/qcd/action/gauge/GaugeImplementations.h>
-
 #include <Grid/qcd/action/ActionBase.h>
 NAMESPACE_CHECK(ActionBase);
 #include <Grid/qcd/action/ActionSet.h>
@@ -39,10 +37,6 @@ NAMESPACE_CHECK(ActionSet);
 #include <Grid/qcd/action/ActionParams.h>
 NAMESPACE_CHECK(ActionParams);

-#include <Grid/qcd/action/filters/MomentumFilter.h>
-#include <Grid/qcd/action/filters/DirichletFilter.h>
-#include <Grid/qcd/action/filters/DDHMCFilter.h>
-
 ////////////////////////////////////////////
 // Gauge Actions
 ////////////////////////////////////////////
@@ -64,6 +58,8 @@ NAMESPACE_CHECK(Scalar);
 ////////////////////////////////////////////
 // Utility functions
 ////////////////////////////////////////////
+#include <Grid/qcd/action/domains/Domains.h>
+
 #include <Grid/qcd/utils/Metric.h>
 NAMESPACE_CHECK(Metric);
 #include <Grid/qcd/utils/CovariantLaplacian.h>
--- a/Grid/qcd/action/ActionParams.h
+++ b/Grid/qcd/action/ActionParams.h
@@ -34,58 +34,44 @@ directory

 NAMESPACE_BEGIN(Grid);

-
+// These can move into a params header and be given MacroMagic serialisation
 struct GparityWilsonImplParams {
-  Coordinate twists;
+  Coordinate twists; //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
                     //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
-  Coordinate dirichlet; // Blocksize of dirichlet BCs
-  int  partialDirichlet;
-  GparityWilsonImplParams() : twists(Nd, 0) {
-    dirichlet.resize(0);
-    partialDirichlet=0;
-  };
+  bool locally_periodic;
+  GparityWilsonImplParams() : twists(Nd, 0), locally_periodic(false) {};
 };
  
 struct WilsonImplParams {
  bool overlapCommsCompute;
-  Coordinate dirichlet; // Blocksize of dirichlet BCs
-  int  partialDirichlet;
+  bool locally_periodic;
  AcceleratorVector<Real,Nd> twist_n_2pi_L;
  AcceleratorVector<Complex,Nd> boundary_phases;
  WilsonImplParams()  {
-    dirichlet.resize(0);
-    partialDirichlet=0;
    boundary_phases.resize(Nd, 1.0);
      twist_n_2pi_L.resize(Nd, 0.0);
+      locally_periodic = false;
  };
  WilsonImplParams(const AcceleratorVector<Complex,Nd> phi) : boundary_phases(phi), overlapCommsCompute(false) {
    twist_n_2pi_L.resize(Nd, 0.0);
-    partialDirichlet=0;
-    dirichlet.resize(0);
+    locally_periodic = false;
  }
 };

 struct StaggeredImplParams {
-  Coordinate dirichlet; // Blocksize of dirichlet BCs
-  int  partialDirichlet;
-  StaggeredImplParams()
-  {
-    partialDirichlet=0;
-    dirichlet.resize(0);
-  };
+  bool locally_periodic;
+  StaggeredImplParams() : locally_periodic(false) {};
 };
  
-  struct OneFlavourRationalParams : Serializable {
+struct OneFlavourRationalParams : Serializable {
    GRID_SERIALIZABLE_CLASS_MEMBERS(OneFlavourRationalParams, 
 				    RealD, lo, 
 				    RealD, hi, 
 				    int,   MaxIter, 
 				    RealD, tolerance, 
-				    RealD, mdtolerance, 
 				    int,   degree, 
 				    int,   precision,
-				    int,   BoundsCheckFreq,
-				    RealD, BoundsCheckTol);
+				    int,   BoundsCheckFreq);
    
  // MaxIter and tolerance, vectors??
    
@@ -96,20 +82,17 @@ struct StaggeredImplParams {
 				RealD tol      = 1.0e-8, 
                           	int _degree    = 10,
 				int _precision = 64,
-				int _BoundsCheckFreq=20,
-				RealD mdtol    = 1.0e-6,
-				double _BoundsCheckTol=1e-6)
+				int _BoundsCheckFreq=20)
      : lo(_lo),
 	hi(_hi),
 	MaxIter(_maxit),
 	tolerance(tol),
-        mdtolerance(mdtol),
 	degree(_degree),
        precision(_precision),
-        BoundsCheckFreq(_BoundsCheckFreq),
-        BoundsCheckTol(_BoundsCheckTol){};
+        BoundsCheckFreq(_BoundsCheckFreq){};
  };
-  
+
+
  /*Action parameters for the generalized rational action
    The approximation is for (M^dag M)^{1/inv_pow}
    where inv_pow is the denominator of the fractional power.
@@ -152,6 +135,7 @@ struct StaggeredImplParams {
  };


+  
 NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/domains/DDHMCFilter.h
+++ b/Grid/qcd/action/domains/DDHMCFilter.h
@@ -2,11 +2,12 @@

 Grid physics library, www.github.com/paboyle/Grid

-Source file:
+Source file: ./lib/qcd/hmc/DDHMC.h

-Copyright (C) 2015-2016
+Copyright (C) 2021

-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Christopher Kelly

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -25,29 +26,27 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
-/*  END LEGAL */
-#include <Grid/Grid.h>
+			   /*  END LEGAL */

-int main(int argc, char **argv)
+NAMESPACE_BEGIN(Grid);
+////////////////////////////////////////////////////
+// DDHMC filter with sub-block size B[mu]
+////////////////////////////////////////////////////
+
+template<typename MomentaField>
+struct DDHMCFilter: public MomentumFilterBase<MomentaField>
 {
-  using namespace Grid;
-
-  Grid_init(&argc, &argv);
-
-  Coordinate latt4  = GridDefaultLatt();
-  Coordinate mpi    = GridDefaultMpi();
-  Coordinate simd   = GridDefaultSimd(Nd,vComplexD::Nsimd());
-
-  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4,simd,mpi);
-
-  GridSerialRNG   sRNG;         sRNG.SeedUniqueString(std::string("The Serial RNG"));
-  GridParallelRNG pRNG(UGrid);  pRNG.SeedUniqueString(std::string("The 4D RNG"));
-
-  std::string rngfile("ckpoint_rng.0");
-  NerscIO::writeRNGState(sRNG, pRNG, rngfile);
+  Coordinate Block;
+  int Width;
  
-  Grid_finalize();
-}
+  DDHMCFilter(const Coordinate &_Block): Block(_Block) {}

+  void applyFilter(MomentaField &P) const override
+  {
+    DomainDecomposition Domains(Block);
+    Domains.ProjectDDHMC(P);
+  }
+};

+NAMESPACE_END(Grid);

--- a/Grid/qcd/action/domains/DirichletFilter.h
+++ b/Grid/qcd/action/domains/DirichletFilter.h
@@ -0,0 +1,98 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/momentum/DirichletFilter.h
+
+Copyright (C) 2021
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+////////////////////////////////////////////////////
+// Dirichlet filter with sub-block size B[mu]
+////////////////////////////////////////////////////
+#pragma once 
+
+#include <Grid/qcd/action/domains/DomainDecomposition.h>
+
+NAMESPACE_BEGIN(Grid);
+
+
+template<typename MomentaField>
+struct DirichletFilter: public MomentumFilterBase<MomentaField>
+{
+  Coordinate Block;
+  
+  DirichletFilter(const Coordinate &_Block): Block(_Block) {}
+
+  // Edge detect using domain projectors
+  void applyFilter (MomentaField &U) const override
+  {
+    DomainDecomposition Domains(Block);
+    GridBase *grid = U.Grid();
+    LatticeInteger  coor(grid);
+    LatticeInteger  face(grid);
+    LatticeInteger  one(grid);   one = 1;
+    LatticeInteger  zero(grid); zero = 0;
+    LatticeInteger  omega(grid);
+    LatticeInteger  omegabar(grid);
+    LatticeInteger  tmp(grid);
+
+    omega=one;    Domains.ProjectDomain(omega,0);
+    omegabar=one; Domains.ProjectDomain(omegabar,1);
+    
+    LatticeInteger nface(grid); nface=Zero();
+    
+    MomentaField projected(grid); projected=Zero();
+    typedef decltype(PeekIndex<LorentzIndex>(U,0)) MomentaLinkField;
+    MomentaLinkField  Umu(grid);
+    MomentaLinkField   zz(grid); zz=Zero();
+
+    int dims = grid->Nd();
+    Coordinate Global=grid->GlobalDimensions();
+    assert(dims==Nd);
+
+    for(int mu=0;mu<Nd;mu++){
+
+      if ( Block[mu]!=0 ) {
+
+	Umu = PeekIndex<LorentzIndex>(U,mu);
+
+	// Upper face 
+ 	tmp = Cshift(omegabar,mu,1);
+	tmp = tmp + omega;
+	face = where(tmp == Integer(2),one,zero );
+
+ 	tmp = Cshift(omega,mu,1);
+	tmp = tmp + omegabar;
+	face = where(tmp == Integer(2),one,face );
+
+	Umu = where(face,zz,Umu);
+
+	PokeIndex<LorentzIndex>(U, Umu, mu);
+      }
+    }
+  }
+  
+};
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/domains/DomainDecomposition.h
+++ b/Grid/qcd/action/domains/DomainDecomposition.h
@@ -0,0 +1,187 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/domains/DomainDecomposition.h
+
+Copyright (C) 2021
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+////////////////////////////////////////////////////
+// Dirichlet filter with sub-block size B[mu]
+////////////////////////////////////////////////////
+#pragma once 
+
+
+NAMESPACE_BEGIN(Grid);
+
+
+struct DomainDecomposition
+{
+  Coordinate Block;
+  static constexpr RealD factor = 0.6;
+
+  DomainDecomposition(const Coordinate &_Block): Block(_Block){ assert(Block.size()==Nd);};
+  
+  template<class Field>
+  void ProjectDomain(Field &f,Integer domain)
+  {
+    GridBase *grid = f.Grid();
+    int dims = grid->Nd();
+    int isDWF= (dims==Nd+1);
+    assert((dims==Nd)||(dims==Nd+1));
+
+    Field   zz(grid);  zz = Zero();
+    LatticeInteger coor(grid);
+    LatticeInteger domaincoor(grid);
+    LatticeInteger mask(grid); mask = Integer(1);
+    LatticeInteger zi(grid);     zi = Integer(0);
+    for(int d=0;d<Nd;d++){
+      Integer B= Block[d];
+      if ( B ) {
+	LatticeCoordinate(coor,d+isDWF);
+	domaincoor = mod(coor,B);
+	mask = where(domaincoor==Integer(0),zi,mask);
+	mask = where(domaincoor==Integer(B-1),zi,mask);
+      }
+    }
+    if ( !domain )
+      f = where(mask==Integer(1),f,zz);
+    else 
+      f = where(mask==Integer(0),f,zz);
+  };
+  template<class GaugeField>
+  void ProjectDDHMC(GaugeField &U)
+  {
+    GridBase *grid = U.Grid();
+    Coordinate Global=grid->GlobalDimensions();
+    GaugeField zzz(grid); zzz = Zero();
+    LatticeInteger coor(grid); 
+
+    GaugeField Uorg(grid); Uorg = U;
+    
+    auto zzz_mu = PeekIndex<LorentzIndex>(zzz,0);
+    ////////////////////////////////////////////////////
+    // Zero BDY layers
+    ////////////////////////////////////////////////////
+    for(int mu=0;mu<Nd;mu++) {
+      Integer B1 = Block[mu];
+      if ( B1 && (B1 <= Global[mu]) ) {
+	LatticeCoordinate(coor,mu);
+
+
+	////////////////////////////////
+	// OmegaBar - zero all links contained in slice B-1,0 and
+	// mu links connecting to Omega
+	////////////////////////////////
+
+	U    = where(mod(coor,B1)==Integer(B1-1),zzz,U);
+	U    = where(mod(coor,B1)==Integer(0)   ,zzz,U); 
+
+	auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
+	U_mu = where(mod(coor,B1)==Integer(B1-2),zzz_mu,U_mu); 
+	PokeIndex<LorentzIndex>(U, U_mu, mu);
+
+      }
+    }
+   
+    ////////////////////////////////////////////
+    // Omega interior slow the evolution
+    // Tricky as we need to take the smallest of values imposed by each cut
+    // Do them in order or largest to smallest and smallest writes last
+    ////////////////////////////////////////////
+    RealD f= factor;
+#if 0    
+    for(int mu=0;mu<Nd;mu++) {
+      Integer B1 = Block[mu];
+      if ( B1 && (B1 <= Global[mu]) ) {
+
+	auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
+	auto Uorg_mu= PeekIndex<LorentzIndex>(Uorg,mu);
+	// In the plane
+	U = where(mod(coor,B1)==Integer(B1-5),Uorg*f,U); 
+	U = where(mod(coor,B1)==Integer(4)   ,Uorg*f,U); 
+
+	// Perp links
+       	U_mu = where(mod(coor,B1)==Integer(B1-6),Uorg_mu*f,U_mu);
+	U_mu = where(mod(coor,B1)==Integer(4)   ,Uorg_mu*f,U_mu);
+
+	PokeIndex<LorentzIndex>(U, U_mu, mu);
+      }
+    }
+#endif
+    for(int mu=0;mu<Nd;mu++) {
+      Integer B1 = Block[mu];
+      if ( B1 && (B1 <= Global[mu]) ) {
+
+	auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
+	auto Uorg_mu= PeekIndex<LorentzIndex>(Uorg,mu);
+	// In the plane
+	U = where(mod(coor,B1)==Integer(B1-4),Uorg*f*f,U); 
+	U = where(mod(coor,B1)==Integer(3)   ,Uorg*f*f,U); 
+
+	// Perp links
+       	U_mu = where(mod(coor,B1)==Integer(B1-5),Uorg_mu*f*f,U_mu);
+	U_mu = where(mod(coor,B1)==Integer(3)   ,Uorg_mu*f*f,U_mu);
+
+	PokeIndex<LorentzIndex>(U, U_mu, mu);
+      }
+    }
+    for(int mu=0;mu<Nd;mu++) {
+      Integer B1 = Block[mu];
+      if ( B1 && (B1 <= Global[mu]) ) {
+
+	auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
+	auto Uorg_mu= PeekIndex<LorentzIndex>(Uorg,mu);
+	// In the plane
+	U = where(mod(coor,B1)==Integer(B1-3),Uorg*f*f*f,U); 
+	U = where(mod(coor,B1)==Integer(2)   ,Uorg*f*f*f,U); 
+
+	// Perp links
+       	U_mu = where(mod(coor,B1)==Integer(B1-4),Uorg_mu*f*f*f,U_mu);
+	U_mu = where(mod(coor,B1)==Integer(2)   ,Uorg_mu*f*f*f,U_mu);
+
+	PokeIndex<LorentzIndex>(U, U_mu, mu);
+      }
+    }
+    for(int mu=0;mu<Nd;mu++) {
+      Integer B1 = Block[mu];
+      if ( B1 && (B1 <= Global[mu]) ) {
+
+	auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
+	auto Uorg_mu= PeekIndex<LorentzIndex>(Uorg,mu);
+	// In the plane
+	U = where(mod(coor,B1)==Integer(B1-2),zzz,U); 
+	U = where(mod(coor,B1)==Integer(1)   ,zzz,U); 
+
+	// Perp links
+	U_mu = where(mod(coor,B1)==Integer(B1-3),Uorg_mu*f*f*f*f,U_mu);
+	U_mu = where(mod(coor,B1)==Integer(1)   ,Uorg_mu*f*f*f*f,U_mu);
+
+	PokeIndex<LorentzIndex>(U, U_mu, mu);
+      }
+    }
+  }
+};
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/domains/Domains.h
+++ b/Grid/qcd/action/domains/Domains.h
@@ -0,0 +1,39 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/momentum/Domains.h
+
+Copyright (C) 2021
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+////////////////////////////////////////////////////
+// Dirichlet filter with sub-block size B[mu]
+////////////////////////////////////////////////////
+#pragma once 
+
+#include <Grid/qcd/action/domains/DomainDecomposition.h>
+#include <Grid/qcd/action/domains/MomentumFilter.h>
+#include <Grid/qcd/action/domains/DirichletFilter.h>
+#include <Grid/qcd/action/domains/DDHMCFilter.h>
+
--- a/Grid/qcd/action/domains/MomentumFilter.h
+++ b/Grid/qcd/action/domains/MomentumFilter.h
@@ -28,8 +28,7 @@ directory
 *************************************************************************************/
 /*  END LEGAL */
 //--------------------------------------------------------------------
-#ifndef MOMENTUM_FILTER
-#define MOMENTUM_FILTER
+#pragma once 

 NAMESPACE_BEGIN(Grid);

@@ -38,7 +37,6 @@ NAMESPACE_BEGIN(Grid);
 template<typename MomentaField>
 struct MomentumFilterBase{
  virtual void applyFilter(MomentaField &P) const = 0;
-  virtual ~MomentumFilterBase(){};
 };

 //Do nothing
@@ -84,11 +82,10 @@ struct MomentumFilterApplyPhase: public MomentumFilterBase<MomentaField>{
    
  }

+
 };




 NAMESPACE_END(Grid);
-
-#endif
--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@@ -60,6 +60,8 @@ public:
  ///////////////////////////////////////////////////////////////
  virtual void Dminus(const FermionField &psi, FermionField &chi);
  virtual void DminusDag(const FermionField &psi, FermionField &chi);
+  virtual void ImportFourDimPseudoFermion(const FermionField &input,FermionField &imported);
+  virtual void ExportFourDimPseudoFermion(const FermionField &solution,FermionField &exported);
  virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
  virtual void ExportPhysicalFermionSource(const FermionField &solution5d, FermionField &exported4d);
  virtual void ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d);
@@ -68,17 +70,9 @@ public:
  ///////////////////////////////////////////////////////////////
  // Support for MADWF tricks
  ///////////////////////////////////////////////////////////////
-  RealD Mass(void) { return (mass_plus + mass_minus) / 2.0; };
-  RealD MassPlus(void) { return mass_plus; };
-  RealD MassMinus(void) { return mass_minus; };
-
+  RealD Mass(void) { return mass; };
  void  SetMass(RealD _mass) { 
-    mass_plus=mass_minus=_mass; 
-    SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs
-  } ;
-  void  SetMass(RealD _mass_plus, RealD _mass_minus) { 
-    mass_plus=_mass_plus;
-    mass_minus=_mass_minus;
+    mass=_mass; 
    SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs
  } ;
  void  P(const FermionField &psi, FermionField &chi);
@@ -116,7 +110,7 @@ public:
  void   MeooeDag5D    (const FermionField &in, FermionField &out);

  //    protected:
-  RealD mass_plus, mass_minus;
+  RealD mass;

  // Save arguments to SetCoefficientsInternal
  Vector<Coeff_t> _gamma;
@@ -183,6 +177,16 @@ public:
 		  GridRedBlackCartesian &FourDimRedBlackGrid,
 		  RealD _mass,RealD _M5,const ImplParams &p= ImplParams());

+  void CayleyReport(void);
+  void CayleyZeroCounters(void);
+
+  double M5Dflops;
+  double M5Dcalls;
+  double M5Dtime;
+
+  double MooeeInvFlops;
+  double MooeeInvCalls;
+  double MooeeInvTime;

 protected:
  virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
--- a/Grid/qcd/action/fermion/CloverHelpers.h
+++ b/Grid/qcd/action/fermion/CloverHelpers.h
@@ -1,334 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/qcd/action/fermion/WilsonCloverFermionImplementation.h
-
-    Copyright (C) 2017 - 2022
-
-    Author: paboyle <paboyle@ph.ed.ac.uk>
-    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
-    Author: Mattia Bruno <mattia.bruno@cern.ch>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-/*  END LEGAL */
-
-#pragma once
-
-#include <Grid/Grid.h>
-#include <Grid/qcd/spin/Dirac.h>
-#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
-
-////////////////////////////////////////////
-// Standard Clover
-//   (4+m0) + csw * clover_term
-// Exp Clover
-//   (4+m0) * exp(csw/(4+m0) clover_term)
-//   = (4+m0) + csw * clover_term + ...
-////////////////////////////////////////////
-
-NAMESPACE_BEGIN(Grid);
-
-
-//////////////////////////////////
-// Generic Standard Clover
-//////////////////////////////////
-
-template<class Impl>
-class CloverHelpers: public WilsonCloverHelpers<Impl> {
-public:
-
-  INHERIT_IMPL_TYPES(Impl);
-  INHERIT_CLOVER_TYPES(Impl);
-
-  typedef WilsonCloverHelpers<Impl> Helpers;
-
-  static void Instantiate(CloverField& CloverTerm, CloverField& CloverTermInv, RealD csw_t, RealD diag_mass) {
-    GridBase *grid = CloverTerm.Grid();
-    CloverTerm += diag_mass;
-
-    int lvol = grid->lSites();
-    int DimRep = Impl::Dimension;
-    {
-      autoView(CTv,CloverTerm,CpuRead);
-      autoView(CTIv,CloverTermInv,CpuWrite);
-      thread_for(site, lvol, {
-        Coordinate lcoor;
-        grid->LocalIndexToLocalCoor(site, lcoor);
-        Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
-        Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
-        typename SiteClover::scalar_object Qx = Zero(), Qxinv = Zero();
-        peekLocalSite(Qx, CTv, lcoor);
-
-        for (int j = 0; j < Ns; j++)
-          for (int k = 0; k < Ns; k++)
-            for (int a = 0; a < DimRep; a++)
-              for (int b = 0; b < DimRep; b++){
-                auto zz =  Qx()(j, k)(a, b);
-                EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
-              }
-
-        EigenInvCloverOp = EigenCloverOp.inverse();
-        for (int j = 0; j < Ns; j++)
-          for (int k = 0; k < Ns; k++)
-            for (int a = 0; a < DimRep; a++)
-              for (int b = 0; b < DimRep; b++)
-                Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
-               pokeLocalSite(Qxinv, CTIv, lcoor);
-      });
-    }
-  }
-
-  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
-    return Helpers::Cmunu(U, lambda, mu, nu);
-  }
-
-};
-
-
-//////////////////////////////////
-// Generic Exp Clover
-//////////////////////////////////
-
-template<class Impl>
-class ExpCloverHelpers: public WilsonCloverHelpers<Impl> {
-public:
-
-  INHERIT_IMPL_TYPES(Impl);
-  INHERIT_CLOVER_TYPES(Impl);
-
-  template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
-  typedef WilsonCloverHelpers<Impl> Helpers;
-
-  // Can this be avoided?
-  static void IdentityTimesC(const CloverField& in, RealD c) {
-    int DimRep = Impl::Dimension;
-
-    autoView(in_v, in, AcceleratorWrite);
-
-    accelerator_for(ss, in.Grid()->oSites(), 1, {
-      for (int sa=0; sa<Ns; sa++)
-        for (int ca=0; ca<DimRep; ca++)
-          in_v[ss]()(sa,sa)(ca,ca) = c;
-    });
-  }
-
-  static int getNMAX(RealD prec, RealD R) {
-    /* compute stop condition for exponential */
-    int NMAX=1;
-    RealD cond=R*R/2.;
-
-    while (cond*std::exp(R)>prec) {
-      NMAX++;
-      cond*=R/(double)(NMAX+1);
-    }
-    return NMAX;
-  }
-
-  static int getNMAX(Lattice<iImplClover<vComplexD2>> &t, RealD R) {return getNMAX(1e-12,R);}
-  static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
-  static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
-
-  static void Instantiate(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
-    GridBase* grid = Clover.Grid();
-    CloverField ExpClover(grid);
-
-    int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);
-
-    Clover *= (1.0/diag_mass);
-
-    // Taylor expansion, slow but generic
-    // Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
-    // qN = cN
-    // qn = cn + qn+1 X
-    std::vector<RealD> cn(NMAX+1);
-    cn[0] = 1.0;
-    for (int i=1; i<=NMAX; i++)
-      cn[i] = cn[i-1] / RealD(i);
-
-    ExpClover = Zero();
-    IdentityTimesC(ExpClover, cn[NMAX]);
-    for (int i=NMAX-1; i>=0; i--)
-      ExpClover = ExpClover * Clover + cn[i];
-
-    // prepare inverse
-    CloverInv = (-1.0)*Clover;
-
-    Clover = ExpClover * diag_mass;
-
-    ExpClover = Zero();
-    IdentityTimesC(ExpClover, cn[NMAX]);
-    for (int i=NMAX-1; i>=0; i--)
-      ExpClover = ExpClover * CloverInv + cn[i];
-
-    CloverInv = ExpClover * (1.0/diag_mass);
-
-  }
-
-  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
-    assert(0);
-    return lambda;
-  }
-
-};
-
-
-//////////////////////////////////
-// Compact Standard Clover
-//////////////////////////////////
-
-
-template<class Impl>
-class CompactCloverHelpers: public CompactWilsonCloverHelpers<Impl>,
-                            public WilsonCloverHelpers<Impl> {
-public:
-
-  INHERIT_IMPL_TYPES(Impl);
-  INHERIT_CLOVER_TYPES(Impl);
-  INHERIT_COMPACT_CLOVER_TYPES(Impl);
-
-  typedef WilsonCloverHelpers<Impl> Helpers;
-  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
-
-  static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
-    Clover += diag_mass;
-  }
-
-  static void InvertClover(CloverField& InvClover,
-                            const CloverDiagonalField& diagonal,
-                            const CloverTriangleField& triangle,
-                            CloverDiagonalField&       diagonalInv,
-                            CloverTriangleField&       triangleInv,
-                            bool fixedBoundaries) {
-
-    CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
-  }
-
-  // TODO: implement Cmunu for better performances with compact layout, but don't do it
-  // here, but rather in WilsonCloverHelpers.h -> CompactWilsonCloverHelpers
-  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
-    return Helpers::Cmunu(U, lambda, mu, nu);
-  }
-};
-
-//////////////////////////////////
-// Compact Exp Clover
-//////////////////////////////////
-
-template<class Impl>
-class CompactExpCloverHelpers: public CompactWilsonCloverHelpers<Impl> {
-public:
-
-  INHERIT_IMPL_TYPES(Impl);
-  INHERIT_CLOVER_TYPES(Impl);
-  INHERIT_COMPACT_CLOVER_TYPES(Impl);
-
-  template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
-  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
-
-  // Can this be avoided?
-  static void IdentityTimesC(const CloverField& in, RealD c) {
-    int DimRep = Impl::Dimension;
-
-    autoView(in_v, in, AcceleratorWrite);
-
-    accelerator_for(ss, in.Grid()->oSites(), 1, {
-      for (int sa=0; sa<Ns; sa++)
-        for (int ca=0; ca<DimRep; ca++)
-          in_v[ss]()(sa,sa)(ca,ca) = c;
-    });
-  }
-
-  static int getNMAX(RealD prec, RealD R) {
-    /* compute stop condition for exponential */
-    int NMAX=1;
-    RealD cond=R*R/2.;
-
-    while (cond*std::exp(R)>prec) {
-      NMAX++;
-      cond*=R/(double)(NMAX+1);
-    }
-    return NMAX;
-  }
-
-  static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
-  static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
-
-  static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
-
-    GridBase* grid = Clover.Grid();
-    CloverField ExpClover(grid);
-
-    int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);
-
-    Clover *= (1.0/diag_mass);
-
-    // Taylor expansion, slow but generic
-    // Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
-    // qN = cN
-    // qn = cn + qn+1 X
-    std::vector<RealD> cn(NMAX+1);
-    cn[0] = 1.0;
-    for (int i=1; i<=NMAX; i++)
-      cn[i] = cn[i-1] / RealD(i);
-
-    ExpClover = Zero();
-    IdentityTimesC(ExpClover, cn[NMAX]);
-    for (int i=NMAX-1; i>=0; i--)
-      ExpClover = ExpClover * Clover + cn[i];
-
-    // prepare inverse
-    CloverInv = (-1.0)*Clover;
-
-    Clover = ExpClover * diag_mass;
-
-    ExpClover = Zero();
-    IdentityTimesC(ExpClover, cn[NMAX]);
-    for (int i=NMAX-1; i>=0; i--)
-      ExpClover = ExpClover * CloverInv + cn[i];
-
-    CloverInv = ExpClover * (1.0/diag_mass);
-
-  }
-
-  static void InvertClover(CloverField& InvClover,
-                            const CloverDiagonalField& diagonal,
-                            const CloverTriangleField& triangle,
-                            CloverDiagonalField&       diagonalInv,
-                            CloverTriangleField&       triangleInv,
-                            bool fixedBoundaries) {
-
-    if (fixedBoundaries)
-    {
-      CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
-    }
-    else
-    {
-      CompactHelpers::ConvertLayout(InvClover, diagonalInv, triangleInv);
-    }
-  }
-
-  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
-    assert(0);
-    return lambda;
-  }
-
-};
-
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h
@@ -1,241 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion.h
-
-    Copyright (C) 2020 - 2022
-
-    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
-    Author: Nils Meyer <nils.meyer@ur.de>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-/*  END LEGAL */
-
-#pragma once
-
-#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
-#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
-#include <Grid/qcd/action/fermion/CloverHelpers.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// see Grid/qcd/action/fermion/WilsonCloverFermion.h for description
-//
-// Modifications done here:
-//
-// Original: clover term = 12x12 matrix per site
-//
-// But: Only two diagonal 6x6 hermitian blocks are non-zero (also true for original, verified by running)
-// Sufficient to store/transfer only the real parts of the diagonal and one triangular part
-// 2 * (6 + 15 * 2) = 72 real or 36 complex words to be stored/transfered
-//
-// Here: Above but diagonal as complex numbers, i.e., need to store/transfer
-// 2 * (6 * 2 + 15 * 2) = 84 real or 42 complex words
-//
-// Words per site and improvement compared to original (combined with the input and output spinors):
-//
-// - Original: 2*12 + 12*12 = 168 words -> 1.00 x less
-// - Minimal:  2*12 + 36    =  60 words -> 2.80 x less
-// - Here:     2*12 + 42    =  66 words -> 2.55 x less
-//
-// These improvements directly translate to wall-clock time
-//
-// Data layout:
-//
-// - diagonal and triangle part as separate lattice fields,
-//   this was faster than as 1 combined field on all tested machines
-// - diagonal: as expected
-// - triangle: store upper right triangle in row major order
-// - graphical:
-//        0  1  2  3  4
-//           5  6  7  8
-//              9 10 11 = upper right triangle indices
-//                12 13
-//                   14
-//     0
-//        1
-//           2
-//              3       = diagonal indices
-//                 4
-//                    5
-//     0
-//     1  5
-//     2  6  9          = lower left triangle indices
-//     3  7 10 12
-//     4  8 11 13 14
-//
-// Impact on total memory consumption:
-// - Original: (2 * 1 + 8 * 1/2) 12x12 matrices = 6 12x12 matrices = 864 complex words per site
-// - Here:     (2 * 1 + 4 * 1/2) diagonal parts = 4 diagonal parts =  24 complex words per site
-//           + (2 * 1 + 4 * 1/2) triangle parts = 4 triangle parts =  60 complex words per site
-//                                                                 =  84 complex words per site
-
-template<class Impl, class CloverHelpers>
-class CompactWilsonCloverFermion : public WilsonFermion<Impl>,
-                                   public WilsonCloverHelpers<Impl>,
-                                   public CompactWilsonCloverHelpers<Impl> {
-  /////////////////////////////////////////////
-  // Sizes
-  /////////////////////////////////////////////
-
-public:
-
-  INHERIT_COMPACT_CLOVER_SIZES(Impl);
-
-  /////////////////////////////////////////////
-  // Type definitions
-  /////////////////////////////////////////////
-
-public:
-
-  INHERIT_IMPL_TYPES(Impl);
-  INHERIT_CLOVER_TYPES(Impl);
-  INHERIT_COMPACT_CLOVER_TYPES(Impl);
-
-  typedef WilsonFermion<Impl>              WilsonBase;
-  typedef WilsonCloverHelpers<Impl>        Helpers;
-  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
-
-  /////////////////////////////////////////////
-  // Constructors
-  /////////////////////////////////////////////
-
-public:
-
-  CompactWilsonCloverFermion(GaugeField& _Umu,
-			    GridCartesian& Fgrid,
-			    GridRedBlackCartesian& Hgrid,
-			    const RealD _mass,
-			    const RealD _csw_r = 0.0,
-			    const RealD _csw_t = 0.0,
-			    const RealD _cF = 1.0,
-			    const WilsonAnisotropyCoefficients& clover_anisotropy = WilsonAnisotropyCoefficients(),
-			    const ImplParams& impl_p = ImplParams());
-
-  /////////////////////////////////////////////
-  // Member functions (implementing interface)
-  /////////////////////////////////////////////
-
-public:
-
-  virtual void Instantiatable() {};
-  int          ConstEE()     override { return 0; };
-  int          isTrivialEE() override { return 0; };
-
-  void Dhop(const FermionField& in, FermionField& out, int dag) override;
-
-  void DhopOE(const FermionField& in, FermionField& out, int dag) override;
-
-  void DhopEO(const FermionField& in, FermionField& out, int dag) override;
-
-  void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
-
-  void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;
-
-  void M(const FermionField& in, FermionField& out) override;
-
-  void Mdag(const FermionField& in, FermionField& out) override;
-
-  void Meooe(const FermionField& in, FermionField& out) override;
-
-  void MeooeDag(const FermionField& in, FermionField& out) override;
-
-  void Mooee(const FermionField& in, FermionField& out) override;
-
-  void MooeeDag(const FermionField& in, FermionField& out) override;
-
-  void MooeeInv(const FermionField& in, FermionField& out) override;
-
-  void MooeeInvDag(const FermionField& in, FermionField& out) override;
-
-  void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
-
-  void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;
-
-  void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
-
-  void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
-
-  void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
-
-  /////////////////////////////////////////////
-  // Member functions (internals)
-  /////////////////////////////////////////////
-
-  void MooeeInternal(const FermionField&        in,
-                     FermionField&              out,
-                     const CloverDiagonalField& diagonal,
-                     const CloverTriangleField& triangle);
-
-  /////////////////////////////////////////////
-  // Helpers
-  /////////////////////////////////////////////
-
-  void ImportGauge(const GaugeField& _Umu) override;
-
-  /////////////////////////////////////////////
-  // Helpers
-  /////////////////////////////////////////////
-
-private:
-
-  template<class Field>
-  const MaskField* getCorrectMaskField(const Field &in) const {
-    if(in.Grid()->_isCheckerBoarded) {
-      if(in.Checkerboard() == Odd) {
-        return &this->BoundaryMaskOdd;
-      } else {
-        return &this->BoundaryMaskEven;
-      }
-    } else {
-      return &this->BoundaryMask;
-    }
-  }
-
-  template<class Field>
-  void ApplyBoundaryMask(Field& f) {
-    const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
-    assert(m != nullptr);
-    CompactHelpers::ApplyBoundaryMask(f, *m);
-  }
-
-  /////////////////////////////////////////////
-  // Member Data
-  /////////////////////////////////////////////
-
-public:
-
-  RealD csw_r;
-  RealD csw_t;
-  RealD cF;
-
-  bool fixedBoundaries;
-
-  CloverDiagonalField Diagonal,    DiagonalEven,    DiagonalOdd;
-  CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
-
-  CloverTriangleField Triangle,    TriangleEven,    TriangleOdd;
-  CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
-
-  FermionField Tmp;
-
-  MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
-};
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/DWFSlow.h
+++ b/Grid/qcd/action/fermion/DWFSlow.h
@@ -1,291 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DWFSlow.h
-
-Copyright (C) 2022
-
-Author: Peter Boyle <pboyle@bnl.gov>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-			   /*  END LEGAL */
-#pragma once
-
-NAMESPACE_BEGIN(Grid);
-
-template <class Impl>
-class DWFSlowFermion : public FermionOperator<Impl>
-{
-public:
-  INHERIT_IMPL_TYPES(Impl);
-
-  ///////////////////////////////////////////////////////////////
-  // Implement the abstract base
-  ///////////////////////////////////////////////////////////////
-  GridBase *GaugeGrid(void) { return _grid4; }
-  GridBase *GaugeRedBlackGrid(void) { return _cbgrid4; }
-  GridBase *FermionGrid(void) { return _grid; }
-  GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
-
-  FermionField _tmp;
-  FermionField &tmp(void) { return _tmp; }
-
-  //////////////////////////////////////////////////////////////////
-  // override multiply; cut number routines if pass dagger argument
-  // and also make interface more uniformly consistent
-  //////////////////////////////////////////////////////////////////
-  virtual void  M(const FermionField &in, FermionField &out)
-  {
-    FermionField tmp(_grid);
-    out = (5.0 - M5) * in;
-    Dhop(in,tmp,DaggerNo);
-    out = out + tmp;
-  }
-  virtual void  Mdag(const FermionField &in, FermionField &out)
-  {
-    FermionField tmp(_grid);
-    out = (5.0 - M5) * in;
-    Dhop(in,tmp,DaggerYes);
-    out = out + tmp;
-  };
-
-  /////////////////////////////////////////////////////////
-  // half checkerboard operations 5D redblack so just site identiy
-  /////////////////////////////////////////////////////////
-  void Meooe(const FermionField &in, FermionField &out)
-  {
-    if ( in.Checkerboard() == Odd ) {
-      this->DhopEO(in,out,DaggerNo);
-    } else {
-      this->DhopOE(in,out,DaggerNo);
-    }
-  }
-  void MeooeDag(const FermionField &in, FermionField &out)
-  {
-    if ( in.Checkerboard() == Odd ) {
-      this->DhopEO(in,out,DaggerYes);
-    } else {
-      this->DhopOE(in,out,DaggerYes);
-    }
-  };
-
-  // allow override for twisted mass and clover
-  virtual void Mooee(const FermionField &in, FermionField &out)
-  {
-    out = (5.0 - M5) * in;
-  }
-  virtual void MooeeDag(const FermionField &in, FermionField &out)
-  {
-    out = (5.0 - M5) * in;
-  }
-  virtual void MooeeInv(const FermionField &in, FermionField &out)
-  {
-    out = (1.0/(5.0 - M5)) * in;
-  };
-  virtual void MooeeInvDag(const FermionField &in, FermionField &out)
-  {
-    out = (1.0/(5.0 - M5)) * in;
-  };
-
-  virtual void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _mass,std::vector<double> twist) {} ;
-
-  ////////////////////////
-  // Derivative interface
-  ////////////////////////
-  // Interface calls an internal routine
-  void DhopDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)  { assert(0);};
-  void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){ assert(0);};
-  void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){ assert(0);};
-
-  ///////////////////////////////////////////////////////////////
-  // non-hermitian hopping term; half cb or both
-  ///////////////////////////////////////////////////////////////
-  void Dhop(const FermionField &in, FermionField &out, int dag)
-  {
-    FermionField tmp(in.Grid());
-    Dhop5(in,out,MassField,MassField,dag );
-    for(int mu=0;mu<4;mu++){
-      DhopDirU(in,Umu[mu],Umu[mu],tmp,mu,dag );    out = out + tmp;
-    }
-  };
-  void DhopOE(const FermionField &in, FermionField &out, int dag)
-  {
-    FermionField tmp(in.Grid());
-    assert(in.Checkerboard()==Even);
-    Dhop5(in,out,MassFieldOdd,MassFieldEven,dag);
-    for(int mu=0;mu<4;mu++){
-      DhopDirU(in,UmuOdd[mu],UmuEven[mu],tmp,mu,dag );    out = out + tmp;
-    }
-  };
-  void DhopEO(const FermionField &in, FermionField &out, int dag)
-  {
-    FermionField tmp(in.Grid());
-    assert(in.Checkerboard()==Odd);
-    Dhop5(in,out, MassFieldEven,MassFieldOdd ,dag );  
-    for(int mu=0;mu<4;mu++){
-      DhopDirU(in,UmuEven[mu],UmuOdd[mu],tmp,mu,dag );    out = out + tmp;
-    }
-  };
-
-  ///////////////////////////////////////////////////////////////
-  // Multigrid assistance; force term uses too
-  ///////////////////////////////////////////////////////////////
-  void Mdir(const FermionField &in, FermionField &out, int dir, int disp){ assert(0);};
-  void MdirAll(const FermionField &in, std::vector<FermionField> &out)   { assert(0);};
-  void DhopDir(const FermionField &in, FermionField &out, int dir, int disp) { assert(0);};
-  void DhopDirAll(const FermionField &in, std::vector<FermionField> &out)    { assert(0);};
-  void DhopDirCalc(const FermionField &in, FermionField &out, int dirdisp,int gamma, int dag) { assert(0);};
-
-  void DhopDirU(const FermionField &in, const GaugeLinkField &U5e, const GaugeLinkField &U5o, FermionField &out, int mu, int dag)
-  {
-    RealD     sgn= 1.0;
-    if (dag ) sgn=-1.0;
-
-    Gamma::Algebra Gmu [] = {
-			 Gamma::Algebra::GammaX,
-			 Gamma::Algebra::GammaY,
-			 Gamma::Algebra::GammaZ,
-			 Gamma::Algebra::GammaT
-    };
-
-    //    mass is  1,1,1,1,-m has to multiply the round the world term
-    FermionField tmp (in.Grid());
-    tmp = U5e * Cshift(in,mu+1,1);
-    out = tmp - Gamma(Gmu[mu])*tmp*sgn;
-    
-    tmp = Cshift(adj(U5o)*in,mu+1,-1);
-    out = out + tmp + Gamma(Gmu[mu])*tmp*sgn;
-
-    out = -0.5*out;
-  };
-
-  void Dhop5(const FermionField &in, FermionField &out, ComplexField &massE, ComplexField &massO, int dag)
-  {
-    // Mass term.... must multiple the round world with mass = 1,1,1,1, -m
-    RealD     sgn= 1.0;
-    if (dag ) sgn=-1.0;
-
-    Gamma G5(Gamma::Algebra::Gamma5);
-
-    FermionField tmp (in.Grid());
-    tmp = massE*Cshift(in,0,1);
-    out = tmp - G5*tmp*sgn;
-    
-    tmp = Cshift(massO*in,0,-1);
-    out = out + tmp + G5*tmp*sgn;
-    out = -0.5*out;
-  };
-
-  // Constructor
-  DWFSlowFermion(GaugeField &_Umu, GridCartesian &Fgrid,
-		 GridRedBlackCartesian &Hgrid, RealD _mass, RealD _M5)
-    :
-    _grid(&Fgrid),
-    _cbgrid(&Hgrid),
-    _grid4(_Umu.Grid()),
-    Umu(Nd,&Fgrid),
-    UmuEven(Nd,&Hgrid),
-    UmuOdd(Nd,&Hgrid),
-    MassField(&Fgrid),
-    MassFieldEven(&Hgrid),
-    MassFieldOdd(&Hgrid),
-    M5(_M5),
-    mass(_mass),
-    _tmp(&Hgrid)
-    {
-      Ls=Fgrid._fdimensions[0];
-      ImportGauge(_Umu);
-
-      typedef typename FermionField::scalar_type scalar;
-
-      Lattice<iScalar<vInteger> > coor(&Fgrid);
-      LatticeCoordinate(coor, 0); // Scoor
-      ComplexField one(&Fgrid);
-      MassField =scalar(-mass);
-      one       =scalar(1.0);
-      MassField =where(coor==Integer(Ls-1),MassField,one);
-      for(int mu=0;mu<Nd;mu++){
-	pickCheckerboard(Even,UmuEven[mu],Umu[mu]);
-	pickCheckerboard(Odd ,UmuOdd[mu],Umu[mu]);
-      }
-      pickCheckerboard(Even,MassFieldEven,MassField);
-      pickCheckerboard(Odd ,MassFieldOdd,MassField);
-      
-    }
-  
-  // DoubleStore impl dependent
-  void ImportGauge(const GaugeField &_Umu4)
-  {
-    GaugeLinkField U4(_grid4);
-    for(int mu=0;mu<Nd;mu++){
-      U4 = PeekIndex<LorentzIndex>(_Umu4, mu);
-      for(int s=0;s<this->Ls;s++){
-	InsertSlice(U4,Umu[mu],s,0);
-      }
-    }
-  }
-
-  ///////////////////////////////////////////////////////////////
-  // Data members require to support the functionality
-  ///////////////////////////////////////////////////////////////
-
-public:
-  virtual RealD Mass(void) { return mass; }
-  virtual int   isTrivialEE(void) { return 1; };
-  RealD mass;
-  RealD M5;
-  int Ls;
-
-  GridBase *_grid4;
-  GridBase *_grid;
-  GridBase *_cbgrid4;
-  GridBase *_cbgrid;
-
-  // Copy of the gauge field , with even and odd subsets
-  std::vector<GaugeLinkField> Umu;
-  std::vector<GaugeLinkField> UmuEven;
-  std::vector<GaugeLinkField> UmuOdd;
-  ComplexField MassField;
-  ComplexField MassFieldEven;
-  ComplexField MassFieldOdd;
-
-  ///////////////////////////////////////////////////////////////
-  // Conserved current utilities
-  ///////////////////////////////////////////////////////////////
-  void ContractConservedCurrent(PropagatorField &q_in_1,
-                                PropagatorField &q_in_2,
-                                PropagatorField &q_out,
-                                PropagatorField &phys_src,
-                                Current curr_type,
-                                unsigned int mu){}
-  void SeqConservedCurrent(PropagatorField &q_in,
-                           PropagatorField &q_out,
-                           PropagatorField &phys_src,
-                           Current curr_type,
-                           unsigned int mu,
-                           unsigned int tmin,
-			   unsigned int tmax,
-			   ComplexField &lattice_cmplx){}
-};
-
-typedef DWFSlowFermion<WilsonImplF> DWFSlowFermionF;
-typedef DWFSlowFermion<WilsonImplD> DWFSlowFermionD;
-
-NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/DirichletFermionOperator.h
+++ b/Grid/qcd/action/fermion/DirichletFermionOperator.h
@@ -0,0 +1,185 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/DirichletFermionOperator.h
+
+    Copyright (C) 2021
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+////////////////////////////////////////////////////////////////
+// Wrap a fermion operator in Dirichlet BC's at node boundary
+////////////////////////////////////////////////////////////////
+    
+template<class Impl>
+class DirichletFermionOperator : public FermionOperator<Impl>
+{
+public:
+
+  INHERIT_IMPL_TYPES(Impl);
+
+  // Data members
+  int CommsMode;
+  Coordinate Block;
+  DirichletFilter<GaugeField> Filter;
+  FermionOperator<Impl> & FermOp;
+  
+  // Constructor / bespoke
+  DirichletFermionOperator(FermionOperator<Impl> & _FermOp, Coordinate &_Block)
+    : FermOp(_FermOp), Block(_Block), Filter(Block)
+  {
+    // Save what the comms mode should be under normal BCs
+    CommsMode = WilsonKernelsStatic::Comms;
+    assert((CommsMode == WilsonKernelsStatic::CommsAndCompute)
+         ||(CommsMode == WilsonKernelsStatic::CommsThenCompute));
+
+    // Check the block size divides local lattice
+    GridBase *grid = FermOp.GaugeGrid();
+
+    int blocks_per_rank = 1;
+    Coordinate LocalDims = grid->LocalDimensions();
+    Coordinate GlobalDims= grid->GlobalDimensions();
+    assert(Block.size()==LocalDims.size());
+
+    for(int d=0;d<LocalDims.size();d++){
+      if (Block[d]&&(Block[d]<=GlobalDims[d])){
+	int r = LocalDims[d] % Block[d];
+	assert(r == 0);
+	blocks_per_rank *= (LocalDims[d] / Block[d]);
+      }
+    }
+    // Even blocks per node required // could be relaxed but inefficient use of hardware as idle nodes in boundary operator R
+    assert( blocks_per_rank != 0);
+
+    // Possible checks that SIMD lanes are used with full occupancy???
+  };
+  virtual ~DirichletFermionOperator(void) = default;
+
+  void DirichletOn(void)   {
+    assert(WilsonKernelsStatic::Comms!= WilsonKernelsStatic::CommsDirichlet);
+    //    WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsDirichlet;
+  }
+  void DirichletOff(void)  {
+    //    assert(WilsonKernelsStatic::Comms== WilsonKernelsStatic::CommsDirichlet);
+    //    WilsonKernelsStatic::Comms = CommsMode;
+  }
+
+  // Implement the full interface
+  virtual FermionField &tmp(void) { return FermOp.tmp(); };
+
+  virtual GridBase *FermionGrid(void)         { return FermOp.FermionGrid(); }
+  virtual GridBase *FermionRedBlackGrid(void) { return FermOp.FermionRedBlackGrid(); }
+  virtual GridBase *GaugeGrid(void)           { return FermOp.GaugeGrid(); }
+  virtual GridBase *GaugeRedBlackGrid(void)   { return FermOp.GaugeRedBlackGrid(); }
+  
+  // override multiply
+  virtual void  M    (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.M(in,out);    DirichletOff();  };
+  virtual void  Mdag (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.Mdag(in,out); DirichletOff();  };
+
+  // half checkerboard operaions
+  virtual void   Meooe       (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.Meooe(in,out);    DirichletOff(); };  
+  virtual void   MeooeDag    (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.MeooeDag(in,out); DirichletOff(); };
+  virtual void   Mooee       (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.Mooee(in,out);    DirichletOff(); };
+  virtual void   MooeeDag    (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.MooeeDag(in,out); DirichletOff(); };
+  virtual void   MooeeInv    (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.MooeeInv(in,out); DirichletOff(); };
+  virtual void   MooeeInvDag (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.MooeeInvDag(in,out); DirichletOff(); };
+
+  // non-hermitian hopping term; half cb or both
+  virtual void Dhop  (const FermionField &in, FermionField &out,int dag) { DirichletOn(); FermOp.Dhop(in,out,dag);    DirichletOff(); };
+  virtual void DhopOE(const FermionField &in, FermionField &out,int dag) { DirichletOn(); FermOp.DhopOE(in,out,dag);  DirichletOff(); };
+  virtual void DhopEO(const FermionField &in, FermionField &out,int dag) { DirichletOn(); FermOp.DhopEO(in,out,dag);  DirichletOff(); };
+  virtual void DhopDir(const FermionField &in, FermionField &out,int dir,int disp) { DirichletOn(); FermOp.DhopDir(in,out,dir,disp);  DirichletOff(); };
+
+  // force terms; five routines; default to Dhop on diagonal
+  virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.MDeriv(mat,U,V,dag);};
+  virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.MoeDeriv(mat,U,V,dag);};
+  virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.MeoDeriv(mat,U,V,dag);};
+  virtual void MooDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.MooDeriv(mat,U,V,dag);};
+  virtual void MeeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.MeeDeriv(mat,U,V,dag);};
+
+  virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.DhopDeriv(mat,U,V,dag);};
+  virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.DhopDerivEO(mat,U,V,dag);};
+  virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.DhopDerivOE(mat,U,V,dag);};
+
+  virtual void  Mdiag  (const FermionField &in, FermionField &out) { Mooee(in,out);};
+  virtual void  Mdir   (const FermionField &in, FermionField &out,int dir,int disp){FermOp.Mdir(in,out,dir,disp);};
+  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out)    {FermOp.MdirAll(in,out);};
+
+  ///////////////////////////////////////////////
+  // Updates gauge field during HMC
+  ///////////////////////////////////////////////
+  DoubledGaugeField &GetDoubledGaugeField(void){ return FermOp.GetDoubledGaugeField(); };
+  DoubledGaugeField &GetDoubledGaugeFieldE(void){ return FermOp.GetDoubledGaugeFieldE(); };
+  DoubledGaugeField &GetDoubledGaugeFieldO(void){ return FermOp.GetDoubledGaugeFieldO(); };
+  virtual void ImportGauge(const GaugeField & _U)
+  {
+    GaugeField U = _U;
+    // Filter gauge field to apply Dirichlet
+    Filter.applyFilter(U);
+    FermOp.ImportGauge(U);
+  }
+  ///////////////////////////////////////////////
+  // Physical field import/export
+  ///////////////////////////////////////////////
+  virtual void Dminus(const FermionField &psi, FermionField &chi)    { FermOp.Dminus(psi,chi); }
+  virtual void DminusDag(const FermionField &psi, FermionField &chi) { FermOp.DminusDag(psi,chi); }
+  virtual void ImportFourDimPseudoFermion(const FermionField &input,FermionField &imported)   { FermOp.ImportFourDimPseudoFermion(input,imported);}
+  virtual void ExportFourDimPseudoFermion(const FermionField &solution,FermionField &exported){ FermOp.ExportFourDimPseudoFermion(solution,exported);}
+  virtual void ImportPhysicalFermionSource(const FermionField &input,FermionField &imported)  { FermOp.ImportPhysicalFermionSource(input,imported);}
+  virtual void ImportUnphysicalFermion(const FermionField &input,FermionField &imported)      { FermOp.ImportUnphysicalFermion(input,imported);}
+  virtual void ExportPhysicalFermionSolution(const FermionField &solution,FermionField &exported) {FermOp.ExportPhysicalFermionSolution(solution,exported);}
+  virtual void ExportPhysicalFermionSource(const FermionField &solution,FermionField &exported)   {FermOp.ExportPhysicalFermionSource(solution,exported);}
+  //////////////////////////////////////////////////////////////////////
+  // Should never be used
+  //////////////////////////////////////////////////////////////////////
+  virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);};
+  virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary,std::vector<double> twist) {assert(0);}
+  virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) { assert(0);}
+  virtual void ContractConservedCurrent(PropagatorField &q_in_1,
+					PropagatorField &q_in_2,
+					PropagatorField &q_out,
+					PropagatorField &phys_src,
+					Current curr_type,
+					unsigned int mu)
+  {assert(0);};
+  virtual void SeqConservedCurrent(PropagatorField &q_in, 
+				   PropagatorField &q_out,
+				   PropagatorField &phys_src,
+				   Current curr_type,
+				   unsigned int mu,
+				   unsigned int tmin, 
+				   unsigned int tmax,
+				   ComplexField &lattice_cmplx)
+  {assert(0);};
+      // Only reimplemented in Wilson5D 
+      // Default to just a zero correlation function
+  virtual void ContractJ5q(FermionField &q_in   ,ComplexField &J5q) { J5q=Zero(); };
+  virtual void ContractJ5q(PropagatorField &q_in,ComplexField &J5q) { J5q=Zero(); };
+  
+};
+
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/Fermion.h
+++ b/Grid/qcd/action/fermion/Fermion.h
@@ -47,14 +47,12 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 ////////////////////////////////////////////
 // Fermion operators / actions
 ////////////////////////////////////////////
-#include <Grid/qcd/action/fermion/DWFSlow.h>       // Slow DWF

 #include <Grid/qcd/action/fermion/WilsonFermion.h>       // 4d wilson like
 NAMESPACE_CHECK(Wilson);
 #include <Grid/qcd/action/fermion/WilsonTMFermion.h>       // 4d wilson like
 NAMESPACE_CHECK(WilsonTM);
 #include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
-#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
 NAMESPACE_CHECK(WilsonClover);
 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
 NAMESPACE_CHECK(Wilson5D);
@@ -103,6 +101,12 @@ NAMESPACE_CHECK(WilsonTM5);
 #include <Grid/qcd/action/fermion/PauliVillarsInverters.h>
 #include <Grid/qcd/action/fermion/Reconstruct5Dprop.h>
 #include <Grid/qcd/action/fermion/MADWF.h>
+////////////////////////////////////////////////////////////////////
+// DDHMC related 
+////////////////////////////////////////////////////////////////////
+#include <Grid/qcd/action/fermion/DirichletFermionOperator.h>
+#include <Grid/qcd/action/fermion/SchurFactoredFermionOperator.h>
+
 NAMESPACE_CHECK(DWFutils);

 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -113,161 +117,183 @@ NAMESPACE_CHECK(DWFutils);
 // Cayley 5d
 NAMESPACE_BEGIN(Grid);

-typedef WilsonFermion<WilsonImplD2> WilsonFermionD2;
+typedef WilsonFermion<WilsonImplR> WilsonFermionR;
 typedef WilsonFermion<WilsonImplF> WilsonFermionF;
 typedef WilsonFermion<WilsonImplD> WilsonFermionD;

+//typedef WilsonFermion<WilsonImplRL> WilsonFermionRL;
+//typedef WilsonFermion<WilsonImplFH> WilsonFermionFH;
+//typedef WilsonFermion<WilsonImplDF> WilsonFermionDF;
+
+typedef WilsonFermion<WilsonAdjImplR> WilsonAdjFermionR;
 typedef WilsonFermion<WilsonAdjImplF> WilsonAdjFermionF;
 typedef WilsonFermion<WilsonAdjImplD> WilsonAdjFermionD;

+typedef WilsonFermion<WilsonTwoIndexSymmetricImplR> WilsonTwoIndexSymmetricFermionR;
 typedef WilsonFermion<WilsonTwoIndexSymmetricImplF> WilsonTwoIndexSymmetricFermionF;
 typedef WilsonFermion<WilsonTwoIndexSymmetricImplD> WilsonTwoIndexSymmetricFermionD;

+typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonTwoIndexAntiSymmetricFermionR;
 typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonTwoIndexAntiSymmetricFermionF;
 typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonTwoIndexAntiSymmetricFermionD;

-// Sp(2n)
-typedef WilsonFermion<SpWilsonImplF> SpWilsonFermionF;
-typedef WilsonFermion<SpWilsonImplD> SpWilsonFermionD;
-
-typedef WilsonFermion<SpWilsonTwoIndexAntiSymmetricImplF> SpWilsonTwoIndexAntiSymmetricFermionF;
-typedef WilsonFermion<SpWilsonTwoIndexAntiSymmetricImplD> SpWilsonTwoIndexAntiSymmetricFermionD;
-
-typedef WilsonFermion<SpWilsonTwoIndexSymmetricImplF> SpWilsonTwoIndexSymmetricFermionF;
-typedef WilsonFermion<SpWilsonTwoIndexSymmetricImplD> SpWilsonTwoIndexSymmetricFermionD;
-
 // Twisted mass fermion
-typedef WilsonTMFermion<WilsonImplD2> WilsonTMFermionD2;
+typedef WilsonTMFermion<WilsonImplR> WilsonTMFermionR;
 typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
 typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;

 // Clover fermions
-template <typename WImpl> using WilsonClover = WilsonCloverFermion<WImpl, CloverHelpers<WImpl>>;
-template <typename WImpl> using WilsonExpClover = WilsonCloverFermion<WImpl, ExpCloverHelpers<WImpl>>;
+typedef WilsonCloverFermion<WilsonImplR> WilsonCloverFermionR;
+typedef WilsonCloverFermion<WilsonImplF> WilsonCloverFermionF;
+typedef WilsonCloverFermion<WilsonImplD> WilsonCloverFermionD;

-typedef WilsonClover<WilsonImplD2> WilsonCloverFermionD2;
-typedef WilsonClover<WilsonImplF> WilsonCloverFermionF;
-typedef WilsonClover<WilsonImplD> WilsonCloverFermionD;
+typedef WilsonCloverFermion<WilsonAdjImplR> WilsonCloverAdjFermionR;
+typedef WilsonCloverFermion<WilsonAdjImplF> WilsonCloverAdjFermionF;
+typedef WilsonCloverFermion<WilsonAdjImplD> WilsonCloverAdjFermionD;

-typedef WilsonExpClover<WilsonImplD2> WilsonExpCloverFermionD2;
-typedef WilsonExpClover<WilsonImplF> WilsonExpCloverFermionF;
-typedef WilsonExpClover<WilsonImplD> WilsonExpCloverFermionD;
+typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
+typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
+typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;

-typedef WilsonClover<WilsonAdjImplF> WilsonCloverAdjFermionF;
-typedef WilsonClover<WilsonAdjImplD> WilsonCloverAdjFermionD;
-
-typedef WilsonClover<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
-typedef WilsonClover<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
-
-typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
-typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
-
-// Compact Clover fermions
-template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
-template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
-
-typedef CompactWilsonClover<WilsonImplD2> CompactWilsonCloverFermionD2;
-typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
-typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
-
-typedef CompactWilsonExpClover<WilsonImplD2> CompactWilsonExpCloverFermionD2;
-typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
-typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
-
-typedef CompactWilsonClover<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
-typedef CompactWilsonClover<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
-
-typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
-typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
-
-typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
-typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
+typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
+typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
+typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;

 // Domain Wall fermions
+typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
 typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
 typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
-typedef DomainWallFermion<WilsonImplD2> DomainWallFermionD2;

-typedef DomainWallEOFAFermion<WilsonImplD2> DomainWallEOFAFermionD2;
+//typedef DomainWallFermion<WilsonImplRL> DomainWallFermionRL;
+//typedef DomainWallFermion<WilsonImplFH> DomainWallFermionFH;
+//typedef DomainWallFermion<WilsonImplDF> DomainWallFermionDF;
+
+typedef DomainWallEOFAFermion<WilsonImplR> DomainWallEOFAFermionR;
 typedef DomainWallEOFAFermion<WilsonImplF> DomainWallEOFAFermionF;
 typedef DomainWallEOFAFermion<WilsonImplD> DomainWallEOFAFermionD;

-typedef MobiusFermion<WilsonImplD2> MobiusFermionD2;
+//typedef DomainWallEOFAFermion<WilsonImplRL> DomainWallEOFAFermionRL;
+//typedef DomainWallEOFAFermion<WilsonImplFH> DomainWallEOFAFermionFH;
+//typedef DomainWallEOFAFermion<WilsonImplDF> DomainWallEOFAFermionDF;
+
+typedef MobiusFermion<WilsonImplR> MobiusFermionR;
 typedef MobiusFermion<WilsonImplF> MobiusFermionF;
 typedef MobiusFermion<WilsonImplD> MobiusFermionD;

-typedef MobiusEOFAFermion<WilsonImplD2> MobiusEOFAFermionD2;
+//typedef MobiusFermion<WilsonImplRL> MobiusFermionRL;
+//typedef MobiusFermion<WilsonImplFH> MobiusFermionFH;
+//typedef MobiusFermion<WilsonImplDF> MobiusFermionDF;
+
+typedef MobiusEOFAFermion<WilsonImplR> MobiusEOFAFermionR;
 typedef MobiusEOFAFermion<WilsonImplF> MobiusEOFAFermionF;
 typedef MobiusEOFAFermion<WilsonImplD> MobiusEOFAFermionD;

-typedef ZMobiusFermion<ZWilsonImplD2> ZMobiusFermionD2;
+//typedef MobiusEOFAFermion<WilsonImplRL> MobiusEOFAFermionRL;
+//typedef MobiusEOFAFermion<WilsonImplFH> MobiusEOFAFermionFH;
+//typedef MobiusEOFAFermion<WilsonImplDF> MobiusEOFAFermionDF;
+
+typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
 typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
 typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;

-typedef ScaledShamirFermion<WilsonImplD2> ScaledShamirFermionD2;
+//typedef ZMobiusFermion<ZWilsonImplRL> ZMobiusFermionRL;
+//typedef ZMobiusFermion<ZWilsonImplFH> ZMobiusFermionFH;
+//typedef ZMobiusFermion<ZWilsonImplDF> ZMobiusFermionDF;
+
+// Ls vectorised
+typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
 typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
 typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;

-typedef MobiusZolotarevFermion<WilsonImplD2> MobiusZolotarevFermionD2;
+typedef MobiusZolotarevFermion<WilsonImplR> MobiusZolotarevFermionR;
 typedef MobiusZolotarevFermion<WilsonImplF> MobiusZolotarevFermionF;
 typedef MobiusZolotarevFermion<WilsonImplD> MobiusZolotarevFermionD;
-typedef ShamirZolotarevFermion<WilsonImplD2> ShamirZolotarevFermionD2;
+typedef ShamirZolotarevFermion<WilsonImplR> ShamirZolotarevFermionR;
 typedef ShamirZolotarevFermion<WilsonImplF> ShamirZolotarevFermionF;
 typedef ShamirZolotarevFermion<WilsonImplD> ShamirZolotarevFermionD;

-typedef OverlapWilsonCayleyTanhFermion<WilsonImplD2> OverlapWilsonCayleyTanhFermionD2;
+typedef OverlapWilsonCayleyTanhFermion<WilsonImplR> OverlapWilsonCayleyTanhFermionR;
 typedef OverlapWilsonCayleyTanhFermion<WilsonImplF> OverlapWilsonCayleyTanhFermionF;
 typedef OverlapWilsonCayleyTanhFermion<WilsonImplD> OverlapWilsonCayleyTanhFermionD;
-typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplD2> OverlapWilsonCayleyZolotarevFermionD2;
+typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplR> OverlapWilsonCayleyZolotarevFermionR;
 typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplF> OverlapWilsonCayleyZolotarevFermionF;
 typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplD> OverlapWilsonCayleyZolotarevFermionD;

 // Continued fraction
-typedef OverlapWilsonContFracTanhFermion<WilsonImplD2> OverlapWilsonContFracTanhFermionD2;
+typedef OverlapWilsonContFracTanhFermion<WilsonImplR> OverlapWilsonContFracTanhFermionR;
 typedef OverlapWilsonContFracTanhFermion<WilsonImplF> OverlapWilsonContFracTanhFermionF;
 typedef OverlapWilsonContFracTanhFermion<WilsonImplD> OverlapWilsonContFracTanhFermionD;
-typedef OverlapWilsonContFracZolotarevFermion<WilsonImplD2> OverlapWilsonContFracZolotarevFermionD2;
+typedef OverlapWilsonContFracZolotarevFermion<WilsonImplR> OverlapWilsonContFracZolotarevFermionR;
 typedef OverlapWilsonContFracZolotarevFermion<WilsonImplF> OverlapWilsonContFracZolotarevFermionF;
 typedef OverlapWilsonContFracZolotarevFermion<WilsonImplD> OverlapWilsonContFracZolotarevFermionD;

 // Partial fraction
-typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplD2> OverlapWilsonPartialFractionTanhFermionD2;
+typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplR> OverlapWilsonPartialFractionTanhFermionR;
 typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplF> OverlapWilsonPartialFractionTanhFermionF;
 typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplD> OverlapWilsonPartialFractionTanhFermionD;

-typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplD2> OverlapWilsonPartialFractionZolotarevFermionD2;
+typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplR> OverlapWilsonPartialFractionZolotarevFermionR;
 typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplF> OverlapWilsonPartialFractionZolotarevFermionF;
 typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplD> OverlapWilsonPartialFractionZolotarevFermionD;

 // Gparity cases; partial list until tested
+typedef WilsonFermion<GparityWilsonImplR>     GparityWilsonFermionR;
 typedef WilsonFermion<GparityWilsonImplF>     GparityWilsonFermionF;
 typedef WilsonFermion<GparityWilsonImplD>     GparityWilsonFermionD;

+//typedef WilsonFermion<GparityWilsonImplRL>     GparityWilsonFermionRL;
+//typedef WilsonFermion<GparityWilsonImplFH>     GparityWilsonFermionFH;
+//typedef WilsonFermion<GparityWilsonImplDF>     GparityWilsonFermionDF;
+
+typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
 typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
 typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;

-typedef DomainWallEOFAFermion<GparityWilsonImplR> GparityDomainWallEOFAFermionD2;
+//typedef DomainWallFermion<GparityWilsonImplRL> GparityDomainWallFermionRL;
+//typedef DomainWallFermion<GparityWilsonImplFH> GparityDomainWallFermionFH;
+//typedef DomainWallFermion<GparityWilsonImplDF> GparityDomainWallFermionDF;
+
+typedef DomainWallEOFAFermion<GparityWilsonImplR> GparityDomainWallEOFAFermionR;
 typedef DomainWallEOFAFermion<GparityWilsonImplF> GparityDomainWallEOFAFermionF;
 typedef DomainWallEOFAFermion<GparityWilsonImplD> GparityDomainWallEOFAFermionD;

-typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionD2;
+//typedef DomainWallEOFAFermion<GparityWilsonImplRL> GparityDomainWallEOFAFermionRL;
+//typedef DomainWallEOFAFermion<GparityWilsonImplFH> GparityDomainWallEOFAFermionFH;
+//typedef DomainWallEOFAFermion<GparityWilsonImplDF> GparityDomainWallEOFAFermionDF;
+
+typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
 typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
 typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;

-typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionD2;
+//typedef WilsonTMFermion<GparityWilsonImplRL> GparityWilsonTMFermionRL;
+//typedef WilsonTMFermion<GparityWilsonImplFH> GparityWilsonTMFermionFH;
+//typedef WilsonTMFermion<GparityWilsonImplDF> GparityWilsonTMFermionDF;
+
+typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionR;
 typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
 typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;

-typedef MobiusEOFAFermion<GparityWilsonImplR> GparityMobiusEOFAFermionD2;
+//typedef MobiusFermion<GparityWilsonImplRL> GparityMobiusFermionRL;
+//typedef MobiusFermion<GparityWilsonImplFH> GparityMobiusFermionFH;
+//typedef MobiusFermion<GparityWilsonImplDF> GparityMobiusFermionDF;
+
+typedef MobiusEOFAFermion<GparityWilsonImplR> GparityMobiusEOFAFermionR;
 typedef MobiusEOFAFermion<GparityWilsonImplF> GparityMobiusEOFAFermionF;
 typedef MobiusEOFAFermion<GparityWilsonImplD> GparityMobiusEOFAFermionD;

+//typedef MobiusEOFAFermion<GparityWilsonImplRL> GparityMobiusEOFAFermionRL;
+//typedef MobiusEOFAFermion<GparityWilsonImplFH> GparityMobiusEOFAFermionFH;
+//typedef MobiusEOFAFermion<GparityWilsonImplDF> GparityMobiusEOFAFermionDF;
+
+typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
 typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
 typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;

+typedef NaiveStaggeredFermion<StaggeredImplR> NaiveStaggeredFermionR;
 typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
 typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;

+typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
 typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
 typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;

--- a/Grid/qcd/action/fermion/FermionCore.h
+++ b/Grid/qcd/action/fermion/FermionCore.h
@@ -25,8 +25,7 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#ifndef  GRID_QCD_FERMION_CORE_H
-#define  GRID_QCD_FERMION_CORE_H
+#pragma once

 #include <Grid/GridCore.h>
 #include <Grid/GridQCDcore.h>
@@ -45,4 +44,3 @@ NAMESPACE_CHECK(FermionOperator);
 #include <Grid/qcd/action/fermion/StaggeredKernels.h>        //used by all wilson type fermions
 NAMESPACE_CHECK(Kernels);

-#endif
--- a/Grid/qcd/action/fermion/FermionOperator.h
+++ b/Grid/qcd/action/fermion/FermionOperator.h
@@ -49,8 +49,6 @@ public:

  virtual FermionField &tmp(void) = 0;

-  virtual void DirichletBlock(const Coordinate & _Block) { assert(0); };
-  
  GridBase * Grid(void)   { return FermionGrid(); };   // this is all the linalg routines need to know
  GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };

@@ -142,6 +140,9 @@ public:
  // Updates gauge field during HMC
  ///////////////////////////////////////////////
  virtual void ImportGauge(const GaugeField & _U)=0;
+  virtual DoubledGaugeField &GetDoubledGaugeField(void)  =0;
+  virtual DoubledGaugeField &GetDoubledGaugeFieldE(void)  =0;
+  virtual DoubledGaugeField &GetDoubledGaugeFieldO(void)  =0;

  //////////////////////////////////////////////////////////////////////
  // Conserved currents, either contract at sink or insert sequentially.
@@ -173,6 +174,16 @@ public:
      ///////////////////////////////////////////////
      virtual void Dminus(const FermionField &psi, FermionField &chi)    { chi=psi; }
      virtual void DminusDag(const FermionField &psi, FermionField &chi) { chi=psi; }
+
+      virtual void ImportFourDimPseudoFermion(const FermionField &input,FermionField &imported)
+      {
+	imported = input;
+      };
+      virtual void ExportFourDimPseudoFermion(const FermionField &solution,FermionField &exported)
+      {
+	exported=solution;
+      };
+
      virtual void ImportPhysicalFermionSource(const FermionField &input,FermionField &imported)
      {
 	imported = input;
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
@@ -47,6 +47,18 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }

+  ////////////////////////////////////////
+  // Performance monitoring
+  ////////////////////////////////////////
+  void Report(void);
+  void ZeroCounters(void);
+  double DhopTotalTime;
+  double DhopCalls;
+  double DhopCommTime;
+  double DhopComputeTime;
+  double DhopComputeTime2;
+  double DhopFaceTime;
+
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
  ///////////////////////////////////////////////////////////////
@@ -129,8 +141,11 @@ public:
  void ImportGauge(const GaugeField &_Uthin, const GaugeField &_Ufat);
  void ImportGaugeSimple(const GaugeField &_UUU    ,const GaugeField &_U);
  void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
-  DoubledGaugeField &GetU(void)   { return Umu ; } ;
-  DoubledGaugeField &GetUUU(void) { return UUUmu; };
+  virtual DoubledGaugeField &GetDoubledGaugeField(void)  override { return Umu; };
+  virtual DoubledGaugeField &GetDoubledGaugeFieldE(void) override { return UmuEven; };
+  virtual DoubledGaugeField &GetDoubledGaugeFieldO(void) override { return UmuOdd; };
+  virtual DoubledGaugeField &GetU(void)   { return Umu ; } ;
+  virtual DoubledGaugeField &GetUUU(void) { return UUUmu; };
  void CopyGaugeCheckerboards(void);

  ///////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@@ -52,6 +52,18 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }

+  ////////////////////////////////////////
+  // Performance monitoring
+  ////////////////////////////////////////
+  void Report(void);
+  void ZeroCounters(void);
+  double DhopTotalTime;
+  double DhopCalls;
+  double DhopCommTime;
+  double DhopComputeTime;
+  double DhopComputeTime2;
+  double DhopFaceTime;
+
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
  ///////////////////////////////////////////////////////////////
@@ -148,17 +160,20 @@ public:
 			       RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0,
 			     const ImplParams &p= ImplParams());
    
-    // DoubleStore gauge field in operator
-    void ImportGauge      (const GaugeField &_Uthin ) { assert(0); }
+  // DoubleStore gauge field in operator
+  void ImportGauge      (const GaugeField &_Uthin ) { assert(0); }
  void ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat);
-    void ImportGaugeSimple(const GaugeField &_UUU,const GaugeField &_U);
-    void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
-    // Give a reference; can be used to do an assignment or copy back out after import
-    // if Carleton wants to cache them and not use the ImportSimple
-    DoubledGaugeField &GetU(void)   { return Umu ; } ;
-    DoubledGaugeField &GetUUU(void) { return UUUmu; };
-    void CopyGaugeCheckerboards(void);
-    
+  void ImportGaugeSimple(const GaugeField &_UUU,const GaugeField &_U);
+  void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
+  // Give a reference; can be used to do an assignment or copy back out after import
+  // if Carleton wants to cache them and not use the ImportSimple
+  virtual DoubledGaugeField &GetDoubledGaugeField(void)  override { return Umu; };
+  virtual DoubledGaugeField &GetDoubledGaugeFieldE(void) override { return UmuEven; };
+  virtual DoubledGaugeField &GetDoubledGaugeFieldO(void) override { return UmuOdd; };
+  DoubledGaugeField &GetU(void)   { return Umu ; } ;
+  DoubledGaugeField &GetUUU(void) { return UUUmu; };
+  void CopyGaugeCheckerboards(void);
+  
  ///////////////////////////////////////////////////////////////
  // Data members require to support the functionality
  ///////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
@@ -47,6 +47,18 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }

+  ////////////////////////////////////////
+  // Performance monitoring
+  ////////////////////////////////////////
+  void Report(void);
+  void ZeroCounters(void);
+  double DhopTotalTime;
+  double DhopCalls;
+  double DhopCommTime;
+  double DhopComputeTime;
+  double DhopComputeTime2;
+  double DhopFaceTime;
+
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
  ///////////////////////////////////////////////////////////////
@@ -123,6 +135,9 @@ public:

  // DoubleStore impl dependent
  void ImportGauge      (const GaugeField &_U );
+  DoubledGaugeField &GetDoubledGaugeField(void){ return Umu; };
+  DoubledGaugeField &GetDoubledGaugeFieldE(void){ return UmuEven; };
+  DoubledGaugeField &GetDoubledGaugeFieldO(void){ return UmuOdd; };
  DoubledGaugeField &GetU(void)   { return Umu ; } ;
  void CopyGaugeCheckerboards(void);

--- a/Grid/qcd/action/fermion/SchurFactoredFermionOperator.h
+++ b/Grid/qcd/action/fermion/SchurFactoredFermionOperator.h
@@ -0,0 +1,534 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/SchurFactoredFermionOperator.h
+
+    Copyright (C) 2021
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+#include <Grid/qcd/utils/MixedPrecisionOperatorFunction.h>
+#include <Grid/qcd/action/domains/Domains.h>
+
+NAMESPACE_BEGIN(Grid);
+
+  ////////////////////////////////////////////////////////
+  // Some explanation of class structure for domain decomposition:
+  //
+  // Need a dirichlet operator for two flavour determinant - acts on both Omega and OmegaBar.
+  //
+  // Possible gain if the global sums and CG are run independently?? Could measure this.
+  //
+  // Types of operations
+  //
+  // 1) assemble local det dOmega det dOmegaBar pseudofermion
+  //
+  // - DirichletFermionOperator - can either do a global solve, or independent/per cell coefficients.
+  //
+  // 2) assemble dOmegaInverse and dOmegaBarInverse in R
+  //
+  // - DirichletFermionOperator - can also be used to 
+  //                                       - need two or more cells per node. Options
+  //                                       - a) solve one cell at a time, no new code, CopyRegion and reduced /split Grids
+  //                                       - b) solve multiple cells in parallel. predicated dslash implementation
+  //
+  //                                       - b) has more parallelism, experience with block solver suggest might not be aalgorithmically inefficient
+  //                                         a) has more cache friendly and easier code.
+  //                                         b) is easy to implement in a "trial" or inefficient code with projection.
+  //
+  // 3)  Additional functionality for domain operations
+  //
+  // - SchurFactoredFermionOperator  - Need a DDHMC utility - whether used in two flavour or one flavour 
+  //
+  // - dBoundary - needs non-dirichlet operator
+  // - Contains one Dirichlet Op, and one non-Dirichlet op. Implements dBoundary etc...
+  // - The Dirichlet ops can be passed to dOmega(Bar) solvers etc...
+  //
+  ////////////////////////////////////////////////////////
+
+
+template<class ImplD,class ImplF>
+class SchurFactoredFermionOperator : public ImplD
+{
+  INHERIT_IMPL_TYPES(ImplD);
+  
+  typedef typename ImplF::FermionField FermionFieldF;
+  typedef typename ImplD::FermionField FermionFieldD;
+
+  typedef SchurDiagMooeeOperator<FermionOperator<ImplD>,FermionFieldD> LinearOperatorD;
+  typedef SchurDiagMooeeOperator<FermionOperator<ImplF>,FermionFieldF> LinearOperatorF;
+  typedef SchurDiagMooeeDagOperator<FermionOperator<ImplD>,FermionFieldD> LinearOperatorDagD;
+  typedef SchurDiagMooeeDagOperator<FermionOperator<ImplF>,FermionFieldF> LinearOperatorDagF;
+
+  typedef MixedPrecisionConjugateGradientOperatorFunction<FermionOperator<ImplD>,
+							  FermionOperator<ImplF>,
+							  LinearOperatorD,
+							  LinearOperatorF> MxPCG;
+
+  typedef MixedPrecisionConjugateGradientOperatorFunction<FermionOperator<ImplD>,
+							  FermionOperator<ImplF>,
+							  LinearOperatorDagD,
+							  LinearOperatorDagF> MxDagPCG;
+public:
+
+  GridBase *FermionGrid(void) { return PeriodicFermOpD.FermionGrid(); };
+  GridBase *GaugeGrid(void)   { return PeriodicFermOpD.GaugeGrid(); };
+  
+  FermionOperator<ImplD> & DirichletFermOpD;
+  FermionOperator<ImplF> & DirichletFermOpF;
+  FermionOperator<ImplD> & PeriodicFermOpD; 
+  FermionOperator<ImplF> & PeriodicFermOpF; 
+
+  LinearOperatorD DirichletLinOpD;
+  LinearOperatorF DirichletLinOpF;
+  LinearOperatorD PeriodicLinOpD;
+  LinearOperatorF PeriodicLinOpF;
+
+  LinearOperatorDagD DirichletLinOpDagD;
+  LinearOperatorDagF DirichletLinOpDagF;
+  LinearOperatorDagD PeriodicLinOpDagD;
+  LinearOperatorDagF PeriodicLinOpDagF;
+
+  // Can tinker with these in the pseudofermion for force vs. action solves
+  Integer maxinnerit;
+  Integer maxouterit;
+  RealD tol;
+  RealD tolinner;
+  
+  Coordinate Block;
+
+  DomainDecomposition Domains;
+
+  SchurFactoredFermionOperator(FermionOperator<ImplD>  & _PeriodicFermOpD,
+			       FermionOperator<ImplF>  & _PeriodicFermOpF,
+			       FermionOperator<ImplD>  & _DirichletFermOpD,
+			       FermionOperator<ImplF>  & _DirichletFermOpF,
+			       Coordinate &_Block)
+    : Block(_Block), Domains(Block),
+
+      PeriodicFermOpD(_PeriodicFermOpD),
+      PeriodicFermOpF(_PeriodicFermOpF),
+      DirichletFermOpD(_DirichletFermOpD),
+      DirichletFermOpF(_DirichletFermOpF),
+      DirichletLinOpD(DirichletFermOpD),
+      DirichletLinOpF(DirichletFermOpF),
+      PeriodicLinOpD(PeriodicFermOpD),
+      PeriodicLinOpF(PeriodicFermOpF),
+      DirichletLinOpDagD(DirichletFermOpD),
+      DirichletLinOpDagF(DirichletFermOpF),
+      PeriodicLinOpDagD(PeriodicFermOpD),
+      PeriodicLinOpDagF(PeriodicFermOpF)
+  {
+    tol=1.0e-10;
+    tolinner=1.0e-6;
+    maxinnerit=1000;
+    maxouterit=10;
+    assert(PeriodicFermOpD.FermionGrid() == DirichletFermOpD.FermionGrid());
+    assert(PeriodicFermOpF.FermionGrid() == DirichletFermOpF.FermionGrid());
+  };
+
+  enum Domain { Omega=0, OmegaBar=1 };
+
+  void ImportGauge(const GaugeField &Umu)
+  {
+    // Single precision will update in the mixed prec CG
+    PeriodicFermOpD.ImportGauge(Umu);
+    GaugeField dUmu(Umu.Grid());
+    dUmu=Umu;
+    //    DirchletBCs(dUmu);
+    DirichletFilter<GaugeField> Filter(Block);
+    Filter.applyFilter(dUmu);
+    DirichletFermOpD.ImportGauge(dUmu);
+  }
+
+/*
+  void ProjectBoundaryBothDomains (FermionField &f,int sgn)
+  {
+    assert((sgn==1)||(sgn==-1));
+    Real rsgn = sgn;
+
+    Gamma::Algebra Gmu [] = {
+      Gamma::Algebra::GammaX,
+      Gamma::Algebra::GammaY,
+      Gamma::Algebra::GammaZ,
+      Gamma::Algebra::GammaT
+    };
+
+    GridBase *grid = f.Grid();
+    LatticeInteger  coor(grid);
+    LatticeInteger  face(grid);
+    LatticeInteger  one(grid); one = 1;
+    LatticeInteger  zero(grid); zero = 0;
+    LatticeInteger nface(grid); nface=Zero();
+    
+    FermionField projected(grid); projected=Zero();
+    FermionField sp_proj  (grid);
+
+    int dims = grid->Nd();
+    int isDWF= (dims==Nd+1);
+    assert((dims==Nd)||(dims==Nd+1));
+    Coordinate Global=grid->GlobalDimensions();
+
+    for(int mu=0;mu<Nd;mu++){
+
+      if ( Block[mu] <= Global[mu+isDWF] ) {
+	// need to worry about DWF 5th dim first
+	LatticeCoordinate(coor,mu+isDWF); 
+      
+	face = where(mod(coor,Block[mu]) == Integer(0),one,zero );
+	nface = nface + face;
+
+	Gamma G(Gmu[mu]);
+	// Lower face receives (1-gamma)/2 in normal forward hopping term
+	sp_proj  = 0.5*(f-G*f*rsgn);
+	projected= where(face,sp_proj,projected);
+	//projected= where(face,f,projected);
+      
+	face = where(mod(coor,Block[mu]) == Integer(Block[mu]-1) ,one,zero );
+	nface = nface + face;
+
+	// Upper face receives (1+gamma)/2 in normal backward hopping term
+	sp_proj = 0.5*(f+G*f*rsgn);
+	projected= where(face,sp_proj,projected);
+	//projected= where(face,f,projected);
+      }
+      
+    }
+    // Initial Zero() where nface==0.
+    // Keep the spin projected faces where nface==1
+    // Full spinor where nface>=2
+    projected = where(nface>Integer(1),f,projected);
+    f=projected;
+  }
+*/
+  void ProjectBoundaryBothDomains (FermionField &f,int sgn)
+  {
+    assert((sgn==1)||(sgn==-1));
+    Real rsgn = sgn;
+
+    Gamma::Algebra Gmu [] = {
+      Gamma::Algebra::GammaX,
+      Gamma::Algebra::GammaY,
+      Gamma::Algebra::GammaZ,
+      Gamma::Algebra::GammaT
+    };
+
+    GridBase *grid = f.Grid();
+    LatticeInteger  coor(grid);
+    LatticeInteger  face(grid);
+    LatticeInteger  one(grid);   one = 1;
+    LatticeInteger  zero(grid); zero = 0;
+    LatticeInteger  omega(grid);
+    LatticeInteger  omegabar(grid);
+    LatticeInteger  tmp(grid);
+
+    omega=one;    Domains.ProjectDomain(omega,0);
+    omegabar=one; Domains.ProjectDomain(omegabar,1);
+    
+    LatticeInteger nface(grid); nface=Zero();
+    
+    FermionField projected(grid); projected=Zero();
+    FermionField sp_proj  (grid);
+
+    int dims = grid->Nd();
+    int isDWF= (dims==Nd+1);
+    assert((dims==Nd)||(dims==Nd+1));
+    Coordinate Global=grid->GlobalDimensions();
+
+    for(int mmu=0;mmu<Nd;mmu++){
+      Gamma G(Gmu[mmu]);
+
+      // need to worry about DWF 5th dim first
+      int mu = mmu+isDWF;
+      if ( Block[mmu] && (Block[mmu] <= Global[mu]) ) {
+
+	// Lower face receives (1-gamma)/2 in normal forward hopping term
+ 	tmp = Cshift(omegabar,mu,-1);
+	tmp = tmp + omega;
+	face = where(tmp == Integer(2),one,zero );
+
+ 	tmp = Cshift(omega,mu,-1);
+	tmp = tmp + omegabar;
+	face = where(tmp == Integer(2),one,face );
+
+	nface = nface + face;
+
+	sp_proj  = 0.5*(f-G*f*rsgn);
+	projected= where(face,sp_proj,projected);
+
+	// Upper face receives (1+gamma)/2 in normal backward hopping term
+ 	tmp = Cshift(omegabar,mu,1);
+	tmp = tmp + omega;
+	face = where(tmp == Integer(2),one,zero );
+
+ 	tmp = Cshift(omega,mu,1);
+	tmp = tmp + omegabar;
+	face = where(tmp == Integer(2),one,face );
+
+	nface = nface + face;
+
+	sp_proj = 0.5*(f+G*f*rsgn);
+	projected= where(face,sp_proj,projected);
+      }
+      
+    }
+    // Initial Zero() where nface==0.
+    // Keep the spin projected faces where nface==1
+    // Full spinor where nface>=2
+    projected = where(nface>Integer(1),f,projected);
+    f=projected;
+  }
+
+  void ProjectDomain(FermionField &f,int domain)
+  {
+/*
+    GridBase *grid = f.Grid();
+    int dims = grid->Nd();
+    int isDWF= (dims==Nd+1);
+    assert((dims==Nd)||(dims==Nd+1));
+
+    FermionField zz(grid); zz=Zero();
+    LatticeInteger coor(grid);
+    LatticeInteger domaincb(grid); domaincb=Zero();
+    for(int d=0;d<Nd;d++){
+      LatticeCoordinate(coor,d+isDWF);
+      domaincb = domaincb + div(coor,Block[d]);
+    }
+    f = where(mod(domaincb,2)==Integer(domain),f,zz);
+*/
+    Domains.ProjectDomain(f,domain);
+
+  };
+  void ProjectOmegaBar   (FermionField &f) {ProjectDomain(f,OmegaBar);}
+  void ProjectOmega      (FermionField &f) {ProjectDomain(f,Omega);}
+  // See my notes(!).
+  // Notation: Following Luscher, we introduce projectors $\hPdb$ with both spinor and space structure
+  // projecting all spinor elements in $\Omega$ connected by $\Ddb$ to $\bar{\Omega}$,
+  void ProjectBoundaryBar(FermionField &f)
+  {
+    ProjectBoundaryBothDomains(f,1);
+    ProjectOmega(f);
+  }
+  // and $\hPd$ projecting all spinor elements in $\bar{\Omega}$ connected by $\Dd$ to $\Omega$.
+  void ProjectBoundary   (FermionField &f)
+  {
+    ProjectBoundaryBothDomains(f,1);
+    ProjectOmegaBar(f);
+    //    DumpSliceNorm("ProjectBoundary",f,f.Grid()->Nd()-1);
+  };
+
+  void dBoundary    (FermionField &in,FermionField &out)
+  {
+    FermionField tmp(in);
+    ProjectOmegaBar(tmp);
+    PeriodicFermOpD.M(tmp,out);
+    ProjectOmega(out);
+  };
+  void dBoundaryDag (FermionField &in,FermionField &out)
+  {
+    FermionField tmp(in);
+    ProjectOmega(tmp);
+    PeriodicFermOpD.Mdag(tmp,out);
+    ProjectOmegaBar(out);
+  };
+  void dBoundaryBar (FermionField &in,FermionField &out)
+  {
+    FermionField tmp(in);
+    ProjectOmega(tmp);
+    PeriodicFermOpD.M(tmp,out);
+    ProjectOmegaBar(out);
+  };
+  void dBoundaryBarDag (FermionField &in,FermionField &out)
+  {
+    FermionField tmp(in);
+    ProjectOmegaBar(tmp);
+    PeriodicFermOpD.Mdag(tmp,out);
+    ProjectOmega(out);
+  };
+  void dOmega       (FermionField &in,FermionField &out)
+  {
+    FermionField tmp(in);
+    ProjectOmega(tmp);
+    DirichletFermOpD.M(tmp,out);
+    ProjectOmega(out);
+  };
+  void dOmegaBar    (FermionField &in,FermionField &out)
+  {
+    FermionField tmp(in);
+    ProjectOmegaBar(tmp);
+    DirichletFermOpD.M(tmp,out);
+    ProjectOmegaBar(out);
+  };
+  void dOmegaDag       (FermionField &in,FermionField &out)
+  {
+    FermionField tmp(in);
+    ProjectOmega(tmp);
+    DirichletFermOpD.Mdag(tmp,out);
+    ProjectOmega(out);
+  };
+  void dOmegaBarDag    (FermionField &in,FermionField &out)
+  {
+    FermionField tmp(in);
+    ProjectOmegaBar(tmp);
+    DirichletFermOpD.Mdag(tmp,out);
+    ProjectOmegaBar(out);
+  };
+  void dOmegaInv   (FermionField &in,FermionField &out)
+  {
+    FermionField tmp(in);
+    ProjectOmega(tmp);
+    dOmegaInvAndOmegaBarInv(tmp,out); // Inefficient warning
+    ProjectOmega(out);
+  };
+  void dOmegaBarInv(FermionField &in,FermionField &out)
+  {    
+    FermionField tmp(in);
+    ProjectOmegaBar(tmp);
+    dOmegaInvAndOmegaBarInv(tmp,out);
+    ProjectOmegaBar(out);
+  };
+  void dOmegaDagInv   (FermionField &in,FermionField &out)
+  {
+    FermionField tmp(in);
+    ProjectOmega(tmp);
+    dOmegaDagInvAndOmegaBarDagInv(tmp,out);
+    ProjectOmega(out);
+  };
+  void dOmegaBarDagInv(FermionField &in,FermionField &out)
+  {    
+    FermionField tmp(in);
+    ProjectOmegaBar(tmp);
+    dOmegaDagInvAndOmegaBarDagInv(tmp,out);
+    ProjectOmegaBar(out);
+  };
+  void dOmegaInvAndOmegaBarInv(FermionField &in,FermionField &out)
+  {
+    MxPCG OmegaSolver(tol,
+		      tolinner,
+		      maxinnerit,
+		      maxouterit,
+		      DirichletFermOpF.FermionRedBlackGrid(),
+		      DirichletFermOpF,
+		      DirichletFermOpD,
+		      DirichletLinOpF,
+		      DirichletLinOpD);
+    SchurRedBlackDiagMooeeSolve<FermionField> PrecSolve(OmegaSolver);
+    PrecSolve(DirichletFermOpD,in,out);
+  };
+  void dOmegaDagInvAndOmegaBarDagInv(FermionField &in,FermionField &out)
+  {
+    MxDagPCG OmegaDagSolver(tol,
+			    tolinner,
+			    maxinnerit,
+			    maxouterit,
+			    DirichletFermOpF.FermionRedBlackGrid(),
+			    DirichletFermOpF,
+			    DirichletFermOpD,
+			    DirichletLinOpDagF,
+			    DirichletLinOpDagD);
+    SchurRedBlackDiagMooeeDagSolve<FermionField> PrecSolve(OmegaDagSolver);
+    PrecSolve(DirichletFermOpD,in,out);
+  };
+
+  // Rdag = Pdbar - DdbarDag DomegabarDagInv  DdDag DomegaDagInv Pdbar 
+  void RDag(FermionField &in,FermionField &out)
+  {
+    FermionField tmp1(PeriodicFermOpD.FermionGrid());
+    FermionField tmp2(PeriodicFermOpD.FermionGrid());
+    out = in;
+    ProjectBoundaryBar(out);
+    dOmegaDagInv(out,tmp1);   
+    dBoundaryDag(tmp1,tmp2);   
+    dOmegaBarDagInv(tmp2,tmp1);
+    dBoundaryBarDag(tmp1,tmp2); 
+    out = out - tmp2;
+  };
+
+  // R = Pdbar - Pdbar DomegaInv Dd DomegabarInv Ddbar
+  void R(FermionField &in,FermionField &out)
+  {
+    FermionField tmp1(PeriodicFermOpD.FermionGrid());
+    FermionField tmp2(PeriodicFermOpD.FermionGrid());
+    out = in;
+    ProjectBoundaryBar(out);
+    dBoundaryBar(out,tmp1); 
+    dOmegaBarInv(tmp1,tmp2);
+    dBoundary(tmp2,tmp1);   
+    dOmegaInv(tmp1,tmp2);   
+    out = in - tmp2 ;       
+    ProjectBoundaryBar(out);
+    //    DumpSliceNorm("R",out,out.Grid()->Nd()-1);
+  };
+  
+  // R = Pdbar - Pdbar Dinv Ddbar 
+  void RInv(FermionField &in,FermionField &out)
+  {
+    FermionField tmp1(PeriodicFermOpD.FermionGrid());
+    dBoundaryBar(in,out);
+    Dinverse(out,tmp1);  
+    out =in -tmp1; 
+    ProjectBoundaryBar(out);
+  };
+  // R = Pdbar - DdbarDag DinvDag Pdbar 
+  void RDagInv(FermionField &in,FermionField &out)
+  {
+    FermionField tmp(PeriodicFermOpD.FermionGrid());
+    FermionField Pin(PeriodicFermOpD.FermionGrid());
+    Pin = in; ProjectBoundaryBar(Pin);
+    DinverseDag(Pin,out);  
+    dBoundaryBarDag(out,tmp);
+    out =Pin -tmp; 
+  };
+  // Non-dirichlet inverter using red-black preconditioning
+  void Dinverse(FermionField &in,FermionField &out)
+  {
+    MxPCG DSolver(tol,
+		  tolinner,
+		  maxinnerit,
+		  maxouterit,
+		  PeriodicFermOpF.FermionRedBlackGrid(),
+		  PeriodicFermOpF,
+		  PeriodicFermOpD,
+		  PeriodicLinOpF,
+		  PeriodicLinOpD);
+    SchurRedBlackDiagMooeeSolve<FermionField> Solve(DSolver);
+    Solve(PeriodicFermOpD,in,out);
+  }
+  void DinverseDag(FermionField &in,FermionField &out)
+  {
+    MxDagPCG DdagSolver(tol,
+			tolinner,
+			maxinnerit,
+			maxouterit,
+			PeriodicFermOpF.FermionRedBlackGrid(),
+			PeriodicFermOpF,
+			PeriodicFermOpD,
+			PeriodicLinOpDagF,
+			PeriodicLinOpDagD);
+    SchurRedBlackDiagMooeeDagSolve<FermionField> Solve(DdagSolver);
+    Solve(PeriodicFermOpD,in,out);
+  }
+};
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/WilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h
@@ -4,11 +4,10 @@

    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.h

-    Copyright (C) 2017 - 2022
+    Copyright (C) 2017

    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: David Preti <>
-    Author: Daniel Richtmann <daniel.richtmann@gmail.com>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -30,9 +29,7 @@

 #pragma once

-#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
-#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
-#include <Grid/qcd/action/fermion/CloverHelpers.h>
+#include <Grid/Grid.h>

 NAMESPACE_BEGIN(Grid);

@@ -52,16 +49,19 @@ NAMESPACE_BEGIN(Grid);
 // csw_r = csw_t to recover the isotropic version
 //////////////////////////////////////////////////////////////////

-template<class Impl, class CloverHelpers>
-class WilsonCloverFermion : public WilsonFermion<Impl>,
-                            public WilsonCloverHelpers<Impl>
+template <class Impl>
+class WilsonCloverFermion : public WilsonFermion<Impl>
 {
 public:
+  // Types definitions
  INHERIT_IMPL_TYPES(Impl);
-  INHERIT_CLOVER_TYPES(Impl);
+  template <typename vtype>
+  using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
+  typedef iImplClover<Simd> SiteCloverType;
+  typedef Lattice<SiteCloverType> CloverFieldType;

-  typedef WilsonFermion<Impl>       WilsonBase;
-  typedef WilsonCloverHelpers<Impl> Helpers;
+public:
+  typedef WilsonFermion<Impl> WilsonBase;

  virtual int    ConstEE(void)     { return 0; };
  virtual void Instantiatable(void){};
@@ -72,7 +72,42 @@ public:
                      const RealD _csw_r = 0.0,
                      const RealD _csw_t = 0.0,
                      const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(),
-                      const ImplParams &impl_p = ImplParams());
+                      const ImplParams &impl_p = ImplParams()) : WilsonFermion<Impl>(_Umu,
+                                                                                     Fgrid,
+                                                                                     Hgrid,
+                                                                                     _mass, impl_p, clover_anisotropy),
+                                                                 CloverTerm(&Fgrid),
+                                                                 CloverTermInv(&Fgrid),
+                                                                 CloverTermEven(&Hgrid),
+                                                                 CloverTermOdd(&Hgrid),
+                                                                 CloverTermInvEven(&Hgrid),
+                                                                 CloverTermInvOdd(&Hgrid),
+                                                                 CloverTermDagEven(&Hgrid),
+                                                                 CloverTermDagOdd(&Hgrid),
+                                                                 CloverTermInvDagEven(&Hgrid),
+                                                                 CloverTermInvDagOdd(&Hgrid)
+  {
+    assert(Nd == 4); // require 4 dimensions
+
+    if (clover_anisotropy.isAnisotropic)
+    {
+      csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0;
+      diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
+    }
+    else
+    {
+      csw_r = _csw_r * 0.5;
+      diag_mass = 4.0 + _mass;
+    }
+    csw_t = _csw_t * 0.5;
+
+    if (csw_r == 0)
+      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
+    if (csw_t == 0)
+      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
+
+    ImportGauge(_Umu);
+  }

  virtual void M(const FermionField &in, FermionField &out);
  virtual void Mdag(const FermionField &in, FermionField &out);
@@ -89,21 +124,250 @@ public:
  void ImportGauge(const GaugeField &_Umu);

  // Derivative parts unpreconditioned pseudofermions
-  void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag);
+  void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
+  {
+    conformable(X.Grid(), Y.Grid());
+    conformable(X.Grid(), force.Grid());
+    GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
+    GaugeField clover_force(force.Grid());
+    PropagatorField Lambda(force.Grid());

-public:
+    // Guido: Here we are hitting some performance issues:
+    // need to extract the components of the DoubledGaugeField
+    // for each call
+    // Possible solution
+    // Create a vector object to store them? (cons: wasting space)
+    std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
+
+    Impl::extractLinkField(U, this->Umu);
+
+    force = Zero();
+    // Derivative of the Wilson hopping term
+    this->DhopDeriv(force, X, Y, dag);
+
+    ///////////////////////////////////////////////////////////
+    // Clover term derivative
+    ///////////////////////////////////////////////////////////
+    Impl::outerProductImpl(Lambda, X, Y);
+    //std::cout << "Lambda:" << Lambda << std::endl;
+
+    Gamma::Algebra sigma[] = {
+        Gamma::Algebra::SigmaXY,
+        Gamma::Algebra::SigmaXZ,
+        Gamma::Algebra::SigmaXT,
+        Gamma::Algebra::MinusSigmaXY,
+        Gamma::Algebra::SigmaYZ,
+        Gamma::Algebra::SigmaYT,
+        Gamma::Algebra::MinusSigmaXZ,
+        Gamma::Algebra::MinusSigmaYZ,
+        Gamma::Algebra::SigmaZT,
+        Gamma::Algebra::MinusSigmaXT,
+        Gamma::Algebra::MinusSigmaYT,
+        Gamma::Algebra::MinusSigmaZT};
+
+    /*
+      sigma_{\mu \nu}=
+      | 0         sigma[0]  sigma[1]  sigma[2] |
+      | sigma[3]    0       sigma[4]  sigma[5] |
+      | sigma[6]  sigma[7]     0      sigma[8] |
+      | sigma[9]  sigma[10] sigma[11]   0      |
+    */
+
+    int count = 0;
+    clover_force = Zero();
+    for (int mu = 0; mu < 4; mu++)
+    {
+      force_mu = Zero();
+      for (int nu = 0; nu < 4; nu++)
+      {
+        if (mu == nu)
+        continue;
+        
+        RealD factor;
+        if (nu == 4 || mu == 4)
+        {
+          factor = 2.0 * csw_t;
+        }
+        else
+        {
+          factor = 2.0 * csw_r;
+        }
+        PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
+        Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
+        force_mu -= factor*Cmunu(U, lambda, mu, nu);                   // checked
+        count++;
+      }
+
+      pokeLorentz(clover_force, U[mu] * force_mu, mu);
+    }
+    //clover_force *= csw;
+    force += clover_force;
+  }
+
+  // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
+  GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
+  {
+    conformable(lambda.Grid(), U[0].Grid());
+    GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());
+    // insertion in upper staple
+    // please check redundancy of shift operations
+
+    // C1+
+    tmp = lambda * U[nu];
+    out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
+
+    // C2+
+    tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
+    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
+
+    // C3+
+    tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
+    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
+
+    // C4+
+    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
+
+    // insertion in lower staple
+    // C1-
+    out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
+
+    // C2-
+    tmp = adj(lambda) * U[nu];
+    out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
+
+    // C3-
+    tmp = lambda * U[nu];
+    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
+
+    // C4-
+    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
+
+    return out;
+  }
+
+protected:
  // here fixing the 4 dimensions, make it more general?

  RealD csw_r;                                               // Clover coefficient - spatial
  RealD csw_t;                                               // Clover coefficient - temporal
  RealD diag_mass;                                           // Mass term
-  CloverField CloverTerm, CloverTermInv;                     // Clover term
-  CloverField CloverTermEven, CloverTermOdd;                 // Clover term EO
-  CloverField CloverTermInvEven, CloverTermInvOdd;           // Clover term Inv EO
-  CloverField CloverTermDagEven, CloverTermDagOdd;           // Clover term Dag EO
-  CloverField CloverTermInvDagEven, CloverTermInvDagOdd;     // Clover term Inv Dag EO
-};
+  CloverFieldType CloverTerm, CloverTermInv;                 // Clover term
+  CloverFieldType CloverTermEven, CloverTermOdd;             // Clover term EO
+  CloverFieldType CloverTermInvEven, CloverTermInvOdd;       // Clover term Inv EO
+  CloverFieldType CloverTermDagEven, CloverTermDagOdd;       // Clover term Dag EO
+  CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO

+ public:
+  // eventually these can be compressed into 6x6 blocks instead of the 12x12
+  // using the DeGrand-Rossi basis for the gamma matrices
+  CloverFieldType fillCloverYZ(const GaugeLinkField &F)
+  {
+    CloverFieldType T(F.Grid());
+    T = Zero();
+    autoView(T_v,T,AcceleratorWrite);
+    autoView(F_v,F,AcceleratorRead);
+    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
+    {
+      T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
+      T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
+      T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
+      T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
+    });
+
+    return T;
+  }
+
+  CloverFieldType fillCloverXZ(const GaugeLinkField &F)
+  {
+    CloverFieldType T(F.Grid());
+    T = Zero();
+    
+    autoView(T_v, T,AcceleratorWrite);
+    autoView(F_v, F,AcceleratorRead);
+    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
+    {
+      T_v[i]()(0, 1) = -F_v[i]()();
+      T_v[i]()(1, 0) = F_v[i]()();
+      T_v[i]()(2, 3) = -F_v[i]()();
+      T_v[i]()(3, 2) = F_v[i]()();
+    });
+
+    return T;
+  }
+
+  CloverFieldType fillCloverXY(const GaugeLinkField &F)
+  {
+    CloverFieldType T(F.Grid());
+    T = Zero();
+
+    autoView(T_v,T,AcceleratorWrite);
+    autoView(F_v,F,AcceleratorRead);
+    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
+    {
+      T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
+      T_v[i]()(1, 1) = timesI(F_v[i]()());
+      T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
+      T_v[i]()(3, 3) = timesI(F_v[i]()());
+    });
+
+    return T;
+  }
+
+  CloverFieldType fillCloverXT(const GaugeLinkField &F)
+  {
+    CloverFieldType T(F.Grid());
+    T = Zero();
+
+    autoView( T_v , T, AcceleratorWrite);
+    autoView( F_v , F, AcceleratorRead);
+    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
+    {
+      T_v[i]()(0, 1) = timesI(F_v[i]()());
+      T_v[i]()(1, 0) = timesI(F_v[i]()());
+      T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
+      T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
+    });
+
+    return T;
+  }
+
+  CloverFieldType fillCloverYT(const GaugeLinkField &F)
+  {
+    CloverFieldType T(F.Grid());
+    T = Zero();
+    
+    autoView( T_v ,T,AcceleratorWrite);
+    autoView( F_v ,F,AcceleratorRead);
+    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
+    {
+      T_v[i]()(0, 1) = -(F_v[i]()());
+      T_v[i]()(1, 0) = (F_v[i]()());
+      T_v[i]()(2, 3) = (F_v[i]()());
+      T_v[i]()(3, 2) = -(F_v[i]()());
+    });
+
+    return T;
+  }
+
+  CloverFieldType fillCloverZT(const GaugeLinkField &F)
+  {
+    CloverFieldType T(F.Grid());
+
+    T = Zero();
+
+    autoView( T_v , T,AcceleratorWrite);
+    autoView( F_v , F,AcceleratorRead);
+    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
+    {
+      T_v[i]()(0, 0) = timesI(F_v[i]()());
+      T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
+      T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
+      T_v[i]()(3, 3) = timesI(F_v[i]()());
+    });
+
+    return T;
+  }
+};
 NAMESPACE_END(Grid);


--- a/Show More
+++ b/Show More