Fix and compiles

Typo
first try at A2A four quark offload
2026-03-16 09:16:10 +00:00 · 2020-08-12 14:35:08 -04:00 · 2020-08-12 14:24:39 -04:00 · 2020-08-12 14:17:46 -04:00
493 changed files with 4889 additions and 32692 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -88,7 +88,6 @@ Thumbs.db
 # build directory #
 ###################
 build*/*
 Documentation/_build
 # IDE related files #
 #####################
--- a/.travis.yml
+++ b/.travis.yml
@@ -0,0 +1,61 @@
 language: cpp
 cache:
  directories:
    - clang
 matrix:
  include:
    - os:        osx
      osx_image: xcode8.3
      compiler: clang
      env: PREC=single
    - os:        osx
      osx_image: xcode8.3
      compiler: clang
      env: PREC=double
 before_install:
    - export GRIDDIR=`pwd`
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc openssl; fi
 install:
    - export CWD=`pwd`
    - echo $CWD
    - export CC=$CC$VERSION
    - export CXX=$CXX$VERSION
    - echo $PATH
    - which autoconf
    - autoconf  --version
    - which automake
    - automake  --version
    - which $CC
    - $CC  --version
    - which $CXX
    - $CXX --version
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export EXTRACONF='--with-openssl=/usr/local/opt/openssl'; fi
 script:
    - ./bootstrap.sh
    - mkdir build
    - cd build
    - mkdir lime
    - cd lime
    - mkdir build
    - cd build
    - wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz
    - tar xf lime-1.3.2.tar.gz
    - cd lime-1.3.2
    - ./configure --prefix=$CWD/build/lime/install
    - make -j4
    - make install
    - cd $CWD/build
    - ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
    - make -j4 
    - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
    - make check
--- a/Grid/DisableWarnings.h
+++ b/Grid/DisableWarnings.h
@@ -34,15 +34,10 @@ directory
 #if defined __GNUC__ && __GNUC__>=6
 #pragma GCC diagnostic ignored "-Wignored-attributes"
 #endif
 #if defined __GNUC__ && __GNUC__>=6
 #pragma GCC diagnostic ignored "-Wpsabi"
 #endif
 //disables and intel compiler specific warning (in json.hpp)
 #ifdef __ICC
 #pragma warning disable 488  
 #endif
 #ifdef __NVCC__
 //disables nvcc specific warning in json.hpp
--- a/Grid/GridQCDcore.h
+++ b/Grid/GridQCDcore.h
@@ -36,7 +36,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridCore.h>
 #include <Grid/qcd/QCD.h>
 #include <Grid/qcd/spin/Spin.h>
 #include <Grid/qcd/gparity/Gparity.h>
 #include <Grid/qcd/utils/Utils.h>
 #include <Grid/qcd/representations/Representations.h>
 NAMESPACE_CHECK(GridQCDCore);
--- a/Grid/GridStd.h
+++ b/Grid/GridStd.h
@@ -28,7 +28,4 @@
 ///////////////////
 #include "Config.h"
 #ifdef TOFU
 #undef GRID_COMMS_THREADS
 #endif
 #endif /* GRID_STD_H */
--- a/Grid/Grid_Eigen_Dense.h
+++ b/Grid/Grid_Eigen_Dense.h
@@ -34,12 +34,6 @@
 #define __SYCL__REDEFINE__
 #endif
 /* HIP save and restore compile environment*/
 #ifdef GRID_HIP
 #pragma push
 #pragma push_macro("__HIP_DEVICE_COMPILE__")
 #endif
 #define EIGEN_NO_HIP
 #include <Grid/Eigen/Dense>
 #include <Grid/Eigen/unsupported/CXX11/Tensor>
@@ -48,7 +42,7 @@
 #ifdef __NVCC__REDEFINE__
 #pragma pop_macro("__CUDACC__")
 #pragma pop_macro("__NVCC__")
-#pragma pop_macro("__CUDA_ARCH__")
+#pragma pop_macro("GRID_SIMT")
 #pragma pop
 #endif
@@ -58,12 +52,6 @@
 #pragma pop
 #endif
 /*HIP restore*/
 #ifdef __HIP__REDEFINE__
 #pragma pop_macro("__HIP_DEVICE_COMPILE__")
 #pragma pop
 #endif
 #if defined __GNUC__
 #pragma GCC diagnostic pop
 #endif
--- a/Grid/Makefile.am
+++ b/Grid/Makefile.am
@@ -21,7 +21,6 @@ if BUILD_HDF5
  extra_headers+=serialisation/Hdf5Type.h
 endif
 all: version-cache Version.h
 version-cache:
@@ -54,19 +53,6 @@ Version.h: version-cache
 include Make.inc
 include Eigen.inc
 extra_sources+=$(WILS_FERMION_FILES)
 extra_sources+=$(STAG_FERMION_FILES)
 if BUILD_ZMOBIUS
  extra_sources+=$(ZWILS_FERMION_FILES)
 endif
 if BUILD_GPARITY
  extra_sources+=$(GP_FERMION_FILES)
 endif
 if BUILD_FERMION_REPS
  extra_sources+=$(ADJ_FERMION_FILES)
  extra_sources+=$(TWOIND_FERMION_FILES)
 endif
 lib_LIBRARIES = libGrid.a
 CCFILES += $(extra_sources)
--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@@ -54,7 +54,6 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
 #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -31,7 +31,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_ALGORITHM_COARSENED_MATRIX_H
 #define  GRID_ALGORITHM_COARSENED_MATRIX_H
 #include <Grid/qcd/QCD.h> // needed for Dagger(Yes|No), Inverse(Yes|No)
 NAMESPACE_BEGIN(Grid);
@@ -60,14 +59,12 @@ inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner,
 class Geometry {
 public:
  int npoint;
  int base;
  std::vector<int> directions   ;
  std::vector<int> displacements;
  std::vector<int> points_dagger;
  Geometry(int _d)  {
-    base = (_d==5) ? 1:0;
+    int base = (_d==5) ? 1:0;
    // make coarse grid stencil for 4d , not 5d
    if ( _d==5 ) _d=4;
@@ -75,51 +72,16 @@ public:
    npoint = 2*_d+1;
    directions.resize(npoint);
    displacements.resize(npoint);
    points_dagger.resize(npoint);
    for(int d=0;d<_d;d++){
      directions[d   ] = d+base;
      directions[d+_d] = d+base;
      displacements[d  ] = +1;
      displacements[d+_d]= -1;
      points_dagger[d   ] = d+_d;
      points_dagger[d+_d] = d;
    }
    directions   [2*_d]=0;
    displacements[2*_d]=0;
    points_dagger[2*_d]=2*_d;
  }
  int point(int dir, int disp) {
    assert(disp == -1 || disp == 0 || disp == 1);
    assert(base+0 <= dir && dir < base+4);
    // directions faster index = new indexing
    // 4d (base = 0):
    // point 0  1  2  3  4  5  6  7  8
    // dir   0  1  2  3  0  1  2  3  0
    // disp +1 +1 +1 +1 -1 -1 -1 -1  0
    // 5d (base = 1):
    // point 0  1  2  3  4  5  6  7  8
    // dir   1  2  3  4  1  2  3  4  0
    // disp +1 +1 +1 +1 -1 -1 -1 -1  0
    // displacements faster index = old indexing
    // 4d (base = 0):
    // point 0  1  2  3  4  5  6  7  8
    // dir   0  0  1  1  2  2  3  3  0
    // disp +1 -1 +1 -1 +1 -1 +1 -1  0
    // 5d (base = 1):
    // point 0  1  2  3  4  5  6  7  8
    // dir   1  1  2  2  3  3  4  4  0
    // disp +1 -1 +1 -1 +1 -1 +1 -1  0
    if(dir == 0 and disp == 0)
      return 8;
    else // New indexing
      return (1 - disp) / 2 * 4 + dir - base;
    // else // Old indexing
    //   return (4 * (dir - base) + 1 - disp) / 2;
  }
 };
 template<class Fobj,class CComplex,int nbasis>
@@ -296,7 +258,7 @@ public:
 // Fine Object == (per site) type of fine field
 // nbasis      == number of deflation vectors
 template<class Fobj,class CComplex,int nbasis>
-class CoarsenedMatrix : public CheckerBoardedSparseMatrixBase<Lattice<iVector<CComplex,nbasis > > >  {
+class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > >  {
 public:
  typedef iVector<CComplex,nbasis >           siteVector;
@@ -306,59 +268,33 @@ public:
  typedef iMatrix<CComplex,nbasis >  Cobj;
  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
  typedef Lattice<Fobj >        FineField;
  typedef CoarseVector FermionField;
  // enrich interface, use default implementation as in FermionOperator ///////
  void Dminus(CoarseVector const& in, CoarseVector& out) { out = in; }
  void DminusDag(CoarseVector const& in, CoarseVector& out) { out = in; }
  void ImportPhysicalFermionSource(CoarseVector const& input, CoarseVector& imported) { imported = input; }
  void ImportUnphysicalFermion(CoarseVector const& input, CoarseVector& imported) { imported = input; }
  void ExportPhysicalFermionSolution(CoarseVector const& solution, CoarseVector& exported) { exported = solution; };
  void ExportPhysicalFermionSource(CoarseVector const& solution, CoarseVector& exported) { exported = solution; };
  ////////////////////
  // Data members
  ////////////////////
  Geometry         geom;
  GridBase *       _grid; 
  GridBase*        _cbgrid;
  int hermitian;
  CartesianStencil<siteVector,siteVector,int> Stencil; 
  CartesianStencil<siteVector,siteVector,int> StencilEven;
  CartesianStencil<siteVector,siteVector,int> StencilOdd;
  std::vector<CoarseMatrix> A;
-  std::vector<CoarseMatrix> Aeven;
+    
  std::vector<CoarseMatrix> Aodd;
  CoarseMatrix AselfInv;
  CoarseMatrix AselfInvEven;
  CoarseMatrix AselfInvOdd;
  Vector<RealD> dag_factor;
  ///////////////////////
  // Interface
  ///////////////////////
  GridBase * Grid(void)         { return _grid; };   // this is all the linalg routines need to know
  GridBase * RedBlackGrid()     { return _cbgrid; };
  int ConstEE() { return 0; }
  void M (const CoarseVector &in, CoarseVector &out)
  {
    conformable(_grid,in.Grid());
    conformable(in.Grid(),out.Grid());
    out.Checkerboard() = in.Checkerboard();
    SimpleCompressor<siteVector> compressor;
    Stencil.HaloExchange(in,compressor);
    autoView( in_v , in, AcceleratorRead);
    autoView( out_v , out, AcceleratorWrite);
    autoView( Stencil_v  , Stencil, AcceleratorRead);
    auto& geom_v = geom;
    typedef LatticeView<Cobj> Aview;
    Vector<Aview> AcceleratorViewContainer;
@@ -380,14 +316,14 @@ public:
      int ptype;
      StencilEntry *SE;
-      for(int point=0;point<geom_v.npoint;point++){
+      for(int point=0;point<geom.npoint;point++){
-	SE=Stencil_v.GetEntry(ptype,point,ss);
+	SE=Stencil.GetEntry(ptype,point,ss);
 	if(SE->_is_local) { 
 	  nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
 	} else {
-	  nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]);
+	  nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
 	}
 	acceleratorSynchronise();
@@ -408,74 +344,12 @@ public:
      return M(in,out);
    } else {
      // corresponds to Galerkin coarsening
-      return MdagNonHermitian(in, out);
+      CoarseVector tmp(Grid());
      G5C(tmp, in); 
      M(tmp, out);
      G5C(out, out);
    }
  };
  void MdagNonHermitian(const CoarseVector &in, CoarseVector &out)
  {
    conformable(_grid,in.Grid());
    conformable(in.Grid(),out.Grid());
    out.Checkerboard() = in.Checkerboard();
    SimpleCompressor<siteVector> compressor;
    Stencil.HaloExchange(in,compressor);
    autoView( in_v , in, AcceleratorRead);
    autoView( out_v , out, AcceleratorWrite);
    autoView( Stencil_v  , Stencil, AcceleratorRead);
    auto& geom_v = geom;
    typedef LatticeView<Cobj> Aview;
    Vector<Aview> AcceleratorViewContainer;
    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
    Aview *Aview_p = & AcceleratorViewContainer[0];
    const int Nsimd = CComplex::Nsimd();
    typedef decltype(coalescedRead(in_v[0])) calcVector;
    typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
    int osites=Grid()->oSites();
    Vector<int> points(geom.npoint, 0);
    for(int p=0; p<geom.npoint; p++)
      points[p] = geom.points_dagger[p];
    auto points_p = &points[0];
    RealD* dag_factor_p = &dag_factor[0];
    accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
      int ss = sss/nbasis;
      int b  = sss%nbasis;
      calcComplex res = Zero();
      calcVector nbr;
      int ptype;
      StencilEntry *SE;
      for(int p=0;p<geom_v.npoint;p++){
        int point = points_p[p];
 	SE=Stencil_v.GetEntry(ptype,point,ss);
 	if(SE->_is_local) {
 	  nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
 	} else {
 	  nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]);
 	}
 	acceleratorSynchronise();
 	for(int bb=0;bb<nbasis;bb++) {
 	  res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
 	}
      }
      coalescedWrite(out_v[ss](b),res);
      });
    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
  }
  void MdirComms(const CoarseVector &in)
  {
    SimpleCompressor<siteVector> compressor;
@@ -485,7 +359,6 @@ public:
  {
    conformable(_grid,in.Grid());
    conformable(_grid,out.Grid());
    out.Checkerboard() = in.Checkerboard();
    typedef LatticeView<Cobj> Aview;
    Vector<Aview> AcceleratorViewContainer;
@@ -494,7 +367,6 @@ public:
    autoView( out_v , out, AcceleratorWrite);
    autoView( in_v  , in, AcceleratorRead);
    autoView( Stencil_v  , Stencil, AcceleratorRead);
    const int Nsimd = CComplex::Nsimd();
    typedef decltype(coalescedRead(in_v[0])) calcVector;
@@ -508,12 +380,12 @@ public:
      int ptype;
      StencilEntry *SE;
-      SE=Stencil_v.GetEntry(ptype,point,ss);
+      SE=Stencil.GetEntry(ptype,point,ss);
      if(SE->_is_local) { 
 	nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
      } else {
-	nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]);
+	nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
      }
      acceleratorSynchronise();
@@ -541,7 +413,34 @@ public:
    this->MdirComms(in);
-    MdirCalc(in,out,geom.point(dir,disp));
+    int ndim = in.Grid()->Nd();
    //////////////
    // 4D action like wilson
    // 0+ => 0 
    // 0- => 1
    // 1+ => 2 
    // 1- => 3
    // etc..
    //////////////
    // 5D action like DWF
    // 1+ => 0 
    // 1- => 1
    // 2+ => 2 
    // 2- => 3
    // etc..
    auto point = [dir, disp, ndim](){
      if(dir == 0 and disp == 0)
 	return 8;
      else if ( ndim==4 ) { 
 	return (4 * dir + 1 - disp) / 2;
      } else { 
 	return (4 * (dir-1) + 1 - disp) / 2;
      }
    }();
    MdirCalc(in,out,point);
  };
  void Mdiag(const CoarseVector &in, CoarseVector &out)
@@ -550,298 +449,23 @@ public:
    MdirCalc(in, out, point); // No comms
  };
  void Mooee(const CoarseVector &in, CoarseVector &out) {
    MooeeInternal(in, out, DaggerNo, InverseNo);
  }
  void MooeeInv(const CoarseVector &in, CoarseVector &out) {
    MooeeInternal(in, out, DaggerNo, InverseYes);
  }
  void MooeeDag(const CoarseVector &in, CoarseVector &out) {
    MooeeInternal(in, out, DaggerYes, InverseNo);
  }
  void MooeeInvDag(const CoarseVector &in, CoarseVector &out) {
    MooeeInternal(in, out, DaggerYes, InverseYes);
  }
  void Meooe(const CoarseVector &in, CoarseVector &out) {
    if(in.Checkerboard() == Odd) {
      DhopEO(in, out, DaggerNo);
    } else {
      DhopOE(in, out, DaggerNo);
    }
  }
  void MeooeDag(const CoarseVector &in, CoarseVector &out) {
    if(in.Checkerboard() == Odd) {
      DhopEO(in, out, DaggerYes);
    } else {
      DhopOE(in, out, DaggerYes);
    }
  }
  void Dhop(const CoarseVector &in, CoarseVector &out, int dag) {
    conformable(in.Grid(), _grid); // verifies full grid
    conformable(in.Grid(), out.Grid());
    out.Checkerboard() = in.Checkerboard();
    DhopInternal(Stencil, A, in, out, dag);
  }
  void DhopOE(const CoarseVector &in, CoarseVector &out, int dag) {
    conformable(in.Grid(), _cbgrid);    // verifies half grid
    conformable(in.Grid(), out.Grid()); // drops the cb check
    assert(in.Checkerboard() == Even);
    out.Checkerboard() = Odd;
    DhopInternal(StencilEven, Aodd, in, out, dag);
  }
  void DhopEO(const CoarseVector &in, CoarseVector &out, int dag) {
    conformable(in.Grid(), _cbgrid);    // verifies half grid
    conformable(in.Grid(), out.Grid()); // drops the cb check
    assert(in.Checkerboard() == Odd);
    out.Checkerboard() = Even;
    DhopInternal(StencilOdd, Aeven, in, out, dag);
  }
  void MooeeInternal(const CoarseVector &in, CoarseVector &out, int dag, int inv) {
    out.Checkerboard() = in.Checkerboard();
    assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
    CoarseMatrix *Aself = nullptr;
    if(in.Grid()->_isCheckerBoarded) {
      if(in.Checkerboard() == Odd) {
        Aself = (inv) ? &AselfInvOdd : &Aodd[geom.npoint-1];
        DselfInternal(StencilOdd, *Aself, in, out, dag);
      } else {
        Aself = (inv) ? &AselfInvEven : &Aeven[geom.npoint-1];
        DselfInternal(StencilEven, *Aself, in, out, dag);
      }
    } else {
      Aself = (inv) ? &AselfInv : &A[geom.npoint-1];
      DselfInternal(Stencil, *Aself, in, out, dag);
    }
    assert(Aself != nullptr);
  }
  void DselfInternal(CartesianStencil<siteVector,siteVector,int> &st, CoarseMatrix &a,
                       const CoarseVector &in, CoarseVector &out, int dag) {
    int point = geom.npoint-1;
    autoView( out_v, out, AcceleratorWrite);
    autoView( in_v,  in,  AcceleratorRead);
    autoView( st_v,  st,  AcceleratorRead);
    autoView( a_v,   a,   AcceleratorRead);
    const int Nsimd = CComplex::Nsimd();
    typedef decltype(coalescedRead(in_v[0])) calcVector;
    typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
    RealD* dag_factor_p = &dag_factor[0];
    if(dag) {
      accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
        int ss = sss/nbasis;
        int b  = sss%nbasis;
        calcComplex res = Zero();
        calcVector nbr;
        int ptype;
        StencilEntry *SE;
        SE=st_v.GetEntry(ptype,point,ss);
        if(SE->_is_local) {
          nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
        } else {
          nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
        }
        acceleratorSynchronise();
        for(int bb=0;bb<nbasis;bb++) {
          res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(a_v[ss](b,bb))*nbr(bb);
        }
        coalescedWrite(out_v[ss](b),res);
      });
    } else {
      accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
        int ss = sss/nbasis;
        int b  = sss%nbasis;
        calcComplex res = Zero();
        calcVector nbr;
        int ptype;
        StencilEntry *SE;
        SE=st_v.GetEntry(ptype,point,ss);
        if(SE->_is_local) {
          nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
        } else {
          nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
        }
        acceleratorSynchronise();
        for(int bb=0;bb<nbasis;bb++) {
          res = res + coalescedRead(a_v[ss](b,bb))*nbr(bb);
        }
        coalescedWrite(out_v[ss](b),res);
      });
    }
  }
  void DhopInternal(CartesianStencil<siteVector,siteVector,int> &st, std::vector<CoarseMatrix> &a,
                    const CoarseVector &in, CoarseVector &out, int dag) {
    SimpleCompressor<siteVector> compressor;
    st.HaloExchange(in,compressor);
    autoView( in_v,  in,  AcceleratorRead);
    autoView( out_v, out, AcceleratorWrite);
    autoView( st_v , st,  AcceleratorRead);
    typedef LatticeView<Cobj> Aview;
    // determine in what order we need the points
    int npoint = geom.npoint-1;
    Vector<int> points(npoint, 0);
    for(int p=0; p<npoint; p++)
      points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p;
    auto points_p = &points[0];
    Vector<Aview> AcceleratorViewContainer;
    for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead));
    Aview *Aview_p = & AcceleratorViewContainer[0];
    const int Nsimd = CComplex::Nsimd();
    typedef decltype(coalescedRead(in_v[0])) calcVector;
    typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
    RealD* dag_factor_p = &dag_factor[0];
    if(dag) {
      accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
        int ss = sss/nbasis;
        int b  = sss%nbasis;
        calcComplex res = Zero();
        calcVector nbr;
        int ptype;
        StencilEntry *SE;
        for(int p=0;p<npoint;p++){
          int point = points_p[p];
          SE=st_v.GetEntry(ptype,point,ss);
          if(SE->_is_local) {
            nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
          } else {
            nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
          }
          acceleratorSynchronise();
          for(int bb=0;bb<nbasis;bb++) {
            res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
          }
        }
        coalescedWrite(out_v[ss](b),res);
      });
    } else {
      accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
        int ss = sss/nbasis;
        int b  = sss%nbasis;
        calcComplex res = Zero();
        calcVector nbr;
        int ptype;
        StencilEntry *SE;
        for(int p=0;p<npoint;p++){
          int point = points_p[p];
          SE=st_v.GetEntry(ptype,point,ss);
          if(SE->_is_local) {
            nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
          } else {
            nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
          }
          acceleratorSynchronise();
          for(int bb=0;bb<nbasis;bb++) {
            res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
          }
        }
        coalescedWrite(out_v[ss](b),res);
      });
    }
    for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose();
  }
-  CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) 	:
+ CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) 	: 
    _grid(&CoarseGrid),
    _cbgrid(new GridRedBlackCartesian(&CoarseGrid)),
    geom(CoarseGrid._ndimension),
    hermitian(hermitian_),
    Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
-    StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements,0),
+      A(geom.npoint,&CoarseGrid)
    StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
    A(geom.npoint,&CoarseGrid),
    Aeven(geom.npoint,_cbgrid),
    Aodd(geom.npoint,_cbgrid),
    AselfInv(&CoarseGrid),
    AselfInvEven(_cbgrid),
    AselfInvOdd(_cbgrid),
    dag_factor(nbasis*nbasis)
  {
    fillFactor();
  };
  CoarsenedMatrix(GridCartesian &CoarseGrid, GridRedBlackCartesian &CoarseRBGrid, int hermitian_=0) 	:
    _grid(&CoarseGrid),
    _cbgrid(&CoarseRBGrid),
    geom(CoarseGrid._ndimension),
    hermitian(hermitian_),
    Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
    StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
    StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
    A(geom.npoint,&CoarseGrid),
    Aeven(geom.npoint,&CoarseRBGrid),
    Aodd(geom.npoint,&CoarseRBGrid),
    AselfInv(&CoarseGrid),
    AselfInvEven(&CoarseRBGrid),
    AselfInvOdd(&CoarseRBGrid),
    dag_factor(nbasis*nbasis)
  {
    fillFactor();
  };
  void fillFactor() {
    Eigen::MatrixXd dag_factor_eigen = Eigen::MatrixXd::Ones(nbasis, nbasis);
    if(!hermitian) {
      const int nb = nbasis/2;
      dag_factor_eigen.block(0,nb,nb,nb) *= -1.0;
      dag_factor_eigen.block(nb,0,nb,nb) *= -1.0;
    }
    // GPU readable prefactor
    thread_for(i, nbasis*nbasis, {
      int j = i/nbasis;
      int k = i%nbasis;
      dag_factor[i] = dag_factor_eigen(j, k);
    });
  }
  void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
 		       Aggregation<Fobj,CComplex,nbasis> & Subspace)
  {
    typedef Lattice<typename Fobj::tensor_reduced> FineComplexField;
    typedef typename Fobj::scalar_type scalar_type;
    std::cout << GridLogMessage<< "CoarsenMatrix "<< std::endl;
    FineComplexField one(FineGrid); one=scalar_type(1.0,0.0);
    FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0);
@@ -872,13 +496,11 @@ public:
    CoarseScalar InnerProd(Grid()); 
    std::cout << GridLogMessage<< "CoarsenMatrix Orthog "<< std::endl;
    // Orthogonalise the subblocks over the basis
    blockOrthogonalise(InnerProd,Subspace.subspace);
    // Compute the matrix elements of linop between this orthonormal
    // set of vectors.
    std::cout << GridLogMessage<< "CoarsenMatrix masks "<< std::endl;
    int self_stencil=-1;
    for(int p=0;p<geom.npoint;p++)
    { 
@@ -917,7 +539,7 @@ public:
      phi=Subspace.subspace[i];
-      std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl;
+      //      std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl;
      linop.OpDirAll(phi,Mphi_p);
      linop.OpDiag  (phi,Mphi_p[geom.npoint-1]);
@@ -946,18 +568,6 @@ public:
 	    autoView( A_self  , A[self_stencil], AcceleratorWrite);
 	    accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
 	    if ( hermitian && (disp==-1) ) {
 	      for(int pp=0;pp<geom.npoint;pp++){// Find the opposite link and set <j|A|i> = <i|A|j>*
 		int dirp   = geom.directions[pp];
 		int dispp  = geom.displacements[pp];
 		if ( (dirp==dir) && (dispp==1) ){
 		  auto sft = conjugate(Cshift(oZProj,dir,1));
 		  autoView( sft_v    ,  sft  , AcceleratorWrite);
 		  autoView( A_pp     ,  A[pp], AcceleratorWrite);
 		  accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_pp[ss](i,j),sft_v(ss)); });
 		}
 	      }
 	    }
 	  }
 	}
@@ -996,54 +606,28 @@ public:
    }
    if(hermitian) {
      std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
      ForceHermitian();
    }
    InvertSelfStencilLink(); std::cout << GridLogMessage << "Coarse self link inverted" << std::endl;
    FillHalfCbs(); std::cout << GridLogMessage << "Coarse half checkerboards filled" << std::endl;
  }
-  void InvertSelfStencilLink() {
+  void ForceHermitian(void) {
-    std::cout << GridLogDebug << "CoarsenedMatrix::InvertSelfStencilLink" << std::endl;
+    CoarseMatrix Diff  (Grid());
-    int localVolume = Grid()->lSites();
+    for(int p=0;p<geom.npoint;p++){
-
+      int dir   = geom.directions[p];
-    typedef typename Cobj::scalar_object scalar_object;
+      int disp  = geom.displacements[p];
-
+      if(disp==-1) {
-    autoView(Aself_v,    A[geom.npoint-1], CpuRead);
+	// Find the opposite link
-    autoView(AselfInv_v, AselfInv,         CpuWrite);
+	for(int pp=0;pp<geom.npoint;pp++){
-    thread_for(site, localVolume, { // NOTE: Not able to bring this to GPU because of Eigen + peek/poke
+	  int dirp   = geom.directions[pp];
-      Eigen::MatrixXcd selfLinkEigen    = Eigen::MatrixXcd::Zero(nbasis, nbasis);
+	  int dispp  = geom.displacements[pp];
-      Eigen::MatrixXcd selfLinkInvEigen = Eigen::MatrixXcd::Zero(nbasis, nbasis);
+	  if ( (dirp==dir) && (dispp==1) ){
-
+	    //	    Diff = adj(Cshift(A[p],dir,1)) - A[pp]; 
-      scalar_object selfLink    = Zero();
+	    //	    std::cout << GridLogMessage<<" Replacing stencil leg "<<pp<<" with leg "<<p<< " diff "<<norm2(Diff) <<std::endl;
-      scalar_object selfLinkInv = Zero();
+	    A[pp] = adj(Cshift(A[p],dir,1));
-
+	  }
-      Coordinate lcoor;
+	}
-
+      }
      Grid()->LocalIndexToLocalCoor(site, lcoor);
      peekLocalSite(selfLink, Aself_v, lcoor);
      for (int i = 0; i < nbasis; ++i)
        for (int j = 0; j < nbasis; ++j)
          selfLinkEigen(i, j) = static_cast<ComplexD>(TensorRemove(selfLink(i, j)));
      selfLinkInvEigen = selfLinkEigen.inverse();
      for(int i = 0; i < nbasis; ++i)
        for(int j = 0; j < nbasis; ++j)
          selfLinkInv(i, j) = selfLinkInvEigen(i, j);
      pokeLocalSite(selfLinkInv, AselfInv_v, lcoor);
    });
  }
  void FillHalfCbs() {
    std::cout << GridLogDebug << "CoarsenedMatrix::FillHalfCbs" << std::endl;
    for(int p = 0; p < geom.npoint; ++p) {
      pickCheckerboard(Even, Aeven[p], A[p]);
      pickCheckerboard(Odd, Aodd[p], A[p]);
    }
    pickCheckerboard(Even, AselfInvEven, AselfInv);
    pickCheckerboard(Odd, AselfInvOdd, AselfInv);
  }
 };
--- a/Grid/algorithms/FFT.h
+++ b/Grid/algorithms/FFT.h
@@ -136,7 +136,7 @@ public:
    flops=0;
    usec =0;
    Coordinate layout(Nd,1);
-    sgrid = new GridCartesian(dimensions,layout,processors,*grid);
+    sgrid = new GridCartesian(dimensions,layout,processors);
  };
  ~FFT ( void)  {
@@ -182,7 +182,7 @@ public:
    pencil_gd[dim] = G*processors[dim];
    // Pencil global vol LxLxGxLxL per node
-    GridCartesian pencil_g(pencil_gd,layout,processors,*vgrid);
+    GridCartesian pencil_g(pencil_gd,layout,processors);
    // Construct pencils
    typedef typename vobj::scalar_object sobj;
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@@ -223,14 +223,9 @@ class SchurOperatorBase :  public LinearOperatorBase<Field> {
    Mpc(in,tmp);
    MpcDag(tmp,out);
  }
  virtual  void MpcMpcDag(const Field &in, Field &out) {
    Field tmp(in.Grid());
    tmp.Checkerboard() = in.Checkerboard();
    MpcDag(in,tmp);
    Mpc(tmp,out);
  }
  virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-    HermOp(in,out);
+    out.Checkerboard() = in.Checkerboard();
    MpcDagMpc(in,out);
    ComplexD dot= innerProduct(in,out); 
    n1=real(dot);
    n2=norm2(out);
@@ -281,16 +276,6 @@ template<class Matrix,class Field>
      axpy(out,-1.0,tmp,out);
    }
 };
 // Mpc MpcDag system presented as the HermOp
 template<class Matrix,class Field>
 class SchurDiagMooeeDagOperator :  public SchurDiagMooeeOperator<Matrix,Field> {
 public:
  virtual void HermOp(const Field &in, Field &out){
    out.Checkerboard() = in.Checkerboard();
    this->MpcMpcDag(in,out);
  }
  SchurDiagMooeeDagOperator (Matrix &Mat): SchurDiagMooeeOperator<Matrix,Field>(Mat){};
 };
 template<class Matrix,class Field>
  class SchurDiagOneOperator :  public SchurOperatorBase<Field> {
 protected:
@@ -545,16 +530,6 @@ public:
 template<class Field> class LinearFunction {
 public:
  virtual void operator() (const Field &in, Field &out) = 0;
  virtual void operator() (const std::vector<Field> &in, std::vector<Field> &out)
  {
    assert(in.size() == out.size());
    for (unsigned int i = 0; i < in.size(); ++i)
    {
      (*this)(in[i], out[i]);
    }
  }
 };
 template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {
--- a/Grid/algorithms/approx/Chebyshev.h
+++ b/Grid/algorithms/approx/Chebyshev.h
@@ -292,7 +292,6 @@ public:
 template<class Field>
 class ChebyshevLanczos : public Chebyshev<Field> {
 private:
  std::vector<RealD> Coeffs;
  int order;
  RealD alpha;
--- a/Grid/algorithms/iterative/ConjugateGradient.h
+++ b/Grid/algorithms/iterative/ConjugateGradient.h
@@ -102,7 +102,7 @@ public:
    // Check if guess is really REALLY good :)
    if (cp <= rsq) {
      TrueResidual = std::sqrt(a/ssq);
-      std::cout << GridLogMessage << "ConjugateGradient guess is converged already "<<TrueResidual<< " tol "<< Tolerance<< std::endl;
+      std::cout << GridLogMessage << "ConjugateGradient guess is converged already " << std::endl;
      IterationsToComplete = 0;	
      return;
    }
--- a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
@@ -48,29 +48,19 @@ NAMESPACE_BEGIN(Grid);
    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
    RealD TrueResidual;
    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
    LinearFunction<FieldF> *guesser;
-
+    
-    MixedPrecisionConjugateGradient(RealD Tol,
+    MixedPrecisionConjugateGradient(RealD tol, 
 				    Integer maxinnerit, 
 				    Integer maxouterit, 
 				    GridBase* _sp_grid, 
 				    LinearOperatorBase<FieldF> &_Linop_f, 
 				    LinearOperatorBase<FieldD> &_Linop_d) :
      MixedPrecisionConjugateGradient(Tol, Tol, maxinnerit, maxouterit, _sp_grid, _Linop_f, _Linop_d) {};
    MixedPrecisionConjugateGradient(RealD Tol,
 				    RealD InnerTol,
 				    Integer maxinnerit, 
 				    Integer maxouterit, 
 				    GridBase* _sp_grid, 
 				    LinearOperatorBase<FieldF> &_Linop_f, 
 				    LinearOperatorBase<FieldD> &_Linop_d) :
      Linop_f(_Linop_f), Linop_d(_Linop_d),
-      Tolerance(Tol), InnerTolerance(InnerTol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
+      Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
-      OuterLoopNormMult(100.), guesser(NULL){ assert(InnerTol < 1.0e-1);};
+      OuterLoopNormMult(100.), guesser(NULL){ };
    void useGuesser(LinearFunction<FieldF> &g){
      guesser = &g;
@@ -89,11 +79,6 @@ NAMESPACE_BEGIN(Grid);
    RealD stop = src_norm * Tolerance*Tolerance;
    GridBase* DoublePrecGrid = src_d_in.Grid();
    //Generate precision change workspaces
    precisionChangeWorkspace wk_dp_from_sp(DoublePrecGrid, SinglePrecGrid);
    precisionChangeWorkspace wk_sp_from_dp(SinglePrecGrid, DoublePrecGrid);
    FieldD tmp_d(DoublePrecGrid);
    tmp_d.Checkerboard() = cb;
@@ -134,7 +119,7 @@ NAMESPACE_BEGIN(Grid);
      while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
      PrecChangeTimer.Start();
-      precisionChange(src_f, src_d, wk_sp_from_dp);
+      precisionChange(src_f, src_d);
      PrecChangeTimer.Stop();
      sol_f = Zero();
@@ -152,7 +137,7 @@ NAMESPACE_BEGIN(Grid);
      //Convert sol back to double and add to double prec solution
      PrecChangeTimer.Start();
-      precisionChange(tmp_d, sol_f, wk_dp_from_sp);
+      precisionChange(tmp_d, sol_f);
      PrecChangeTimer.Stop();
      axpy(sol_d, 1.0, tmp_d, sol_d);
@@ -164,7 +149,6 @@ NAMESPACE_BEGIN(Grid);
    ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
    CG_d(Linop_d, src_d_in, sol_d);
    TotalFinalStepIterations = CG_d.IterationsToComplete;
    TrueResidual = CG_d.TrueResidual;
    TotalTimer.Stop();
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
@@ -52,7 +52,7 @@ public:
  MultiShiftFunction shifts;
  std::vector<RealD> TrueResidualShift;
-  ConjugateGradientMultiShift(Integer maxit, const MultiShiftFunction &_shifts) : 
+  ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : 
    MaxIterations(maxit),
    shifts(_shifts)
  { 
@@ -182,9 +182,6 @@ public:
    for(int s=0;s<nshift;s++) {
      axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
    }
    std::cout << GridLogIterative << "ConjugateGradientMultiShift: initial rn (|src|^2) =" << rn << " qq (|MdagM src|^2) =" << qq << " d ( dot(src, [MdagM + m_0]src) ) =" << d << " c=" << c << std::endl;
  ///////////////////////////////////////
  // Timers
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
@@ -1,411 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Christopher Kelly <ckelly@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
 #define GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
 NAMESPACE_BEGIN(Grid);
 //CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision. 
 //The residual is stored in single precision, but the search directions and solution are stored in double precision. 
 //Every update_freq iterations the residual is corrected in double precision. 
 //For safety, a final (mixed-precision) CG is applied to each shift that has not reached its target residual
 //Linop to add shift to input linop, used in cleanup CG
 namespace ConjugateGradientMultiShiftMixedPrecSupport{

 // Adapter that presents (base HermOp followed by an axpy with `shift`) as a
 // LinearOperatorBase. Only HermOp and HermOpAndNorm are usable; every other
 // entry point of the interface asserts.
 template<typename Field>
 class ShiftedLinop: public LinearOperatorBase<Field>{
 public:
   LinearOperatorBase<Field> &linop_base; // wrapped operator (not owned)
   RealD shift;                           // coefficient passed to axpy in HermOp

   ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift)
     : linop_base(_linop_base),
       shift(_shift)
   {}

   // Unsupported parts of the operator interface.
   void OpDiag  (const Field &in, Field &out)                  { assert(0); }
   void OpDir   (const Field &in, Field &out,int dir,int disp) { assert(0); }
   void OpDirAll(const Field &in, std::vector<Field> &out)     { assert(0); }
   void Op      (const Field &in, Field &out)                  { assert(0); }
   void AdjOp   (const Field &in, Field &out)                  { assert(0); }

   // Apply the wrapped HermOp, then fold in the shift term
   // (presumably out = shift*in + out; confirm against axpy's definition).
   void HermOp(const Field &in, Field &out)
   {
     linop_base.HermOp(in, out);
     axpy(out, shift, in, out);
   }

   // HermOp plus two reductions: n1 = Re<in,out>, n2 = norm2(out).
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)
   {
     HermOp(in, out);
     const ComplexD in_dot_out = innerProduct(in, out);
     n1 = real(in_dot_out);
     n2 = norm2(out);
   }
 };

 } // namespace ConjugateGradientMultiShiftMixedPrecSupport
 // Multi-shift conjugate gradient in which the matrix application and the residual
 // are handled on FieldF while search directions and solutions are kept on FieldD.
 // The template is only enabled when getPrecision<FieldD>::value == 2 and
 // getPrecision<FieldF>::value == 1 (presumably double and single precision).
 //
 // Outline of the main solve (third operator() below):
 //   1. Convert the source to FieldF and initialise the recurrence for every shift.
 //   2. Iterate: update the FieldD search directions from the FieldF residual,
 //      apply Linop_f, update the per-shift coefficients and the FieldD solutions.
 //   3. Every ReliableUpdateFreq iterations recompute the residual with Linop_d.
 //   4. Once every shift passes its convergence test, measure the true residual of
 //      each shift with Linop_d and run a MixedPrecisionConjugateGradient cleanup
 //      on any shift that misses its target.
 template<class FieldD, class FieldF,
 	 typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
 	 typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
 class ConjugateGradientMultiShiftMixedPrec : public OperatorMultiFunction<FieldD>,
 					     public OperatorFunction<FieldD>
 {
 public:                                                
  using OperatorFunction<FieldD>::operator();
  // NOTE(review): Tolerance is never assigned by the constructor below and is not
  // read anywhere in this class; per-shift targets come from shifts.tolerances.
  RealD   Tolerance;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
  int verbose;
  MultiShiftFunction shifts;
  std::vector<RealD> TrueResidualShift;
  int ReliableUpdateFreq; //number of iterations between reliable updates
  GridBase* SinglePrecGrid; //Grid for single-precision fields
  LinearOperatorBase<FieldF> &Linop_f; //single precision
  // Stores the iteration cap, a copy of the shift description, the grid used for
  // FieldF temporaries, a reference to the FieldF operator and the reliable-update
  // period. The per-shift bookkeeping vectors are sized to _shifts.order.
  // Note: members are initialised in declaration order (ReliableUpdateFreq before
  // SinglePrecGrid and Linop_f), not in the order written in the initialiser list;
  // the initialisers are independent of each other so the result is the same.
  ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
 				       GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
 				       int _ReliableUpdateFreq
 				       ) : 
    MaxIterations(maxit),  shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq)
  { 
    verbose=1;
    IterationsToCompleteShift.resize(_shifts.order);
    TrueResidualShift.resize(_shifts.order);
  }
  // Single-output entry point: allocates one result field per shift on src's grid
  // and forwards to the overload that also accumulates the combined result in psi.
  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
  {
    GridBase *grid = src.Grid();
    int nshift = shifts.order;
    std::vector<FieldD> results(nshift,grid);
    (*this)(Linop,src,results,psi);
  }
  // Solves every shift into `results`, then forms
  //   psi = shifts.norm*src + sum_i shifts.residues[i]*results[i].
  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
  {
    int nshift = shifts.order;
    (*this)(Linop,src,results);
    psi = shifts.norm*src;
    for(int i=0;i<nshift;i++){
      psi = psi + shifts.residues[i]*results[i];
    }
    return;
  }
  // Main solve. psi_d must already hold shifts.order fields (asserted); on return
  // psi_d[s] holds the solution for shift s. Linop_d is only used for reliable
  // updates, the final residual check and the cleanup solves.
  void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
  { 
    GridBase *DoublePrecGrid = src_d.Grid();
    // Workspaces passed to precisionChange for each conversion direction
    precisionChangeWorkspace wk_f_from_d(SinglePrecGrid, DoublePrecGrid);
    precisionChangeWorkspace wk_d_from_f(DoublePrecGrid, SinglePrecGrid);
    ////////////////////////////////////////////////////////////////////////
    // Convenience references to the info stored in "MultiShiftFunction"
    ////////////////////////////////////////////////////////////////////////
    int nshift = shifts.order;
    std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
    std::vector<RealD> &mresidual(shifts.tolerances);
    std::vector<RealD> alpha(nshift,1.0);
    //Double precision search directions
    FieldD p_d(DoublePrecGrid);
    std::vector<FieldD> ps_d(nshift, DoublePrecGrid);// Search directions (double precision)
    FieldD tmp_d(DoublePrecGrid);
    FieldD r_d(DoublePrecGrid);
    FieldD mmp_d(DoublePrecGrid);
    assert(psi_d.size()==nshift);
    assert(mass.size()==nshift);
    assert(mresidual.size()==nshift);
    // dynamic sized arrays on stack; 2d is a pain with vector
    // NOTE(review): runtime-sized arrays are a compiler extension, not standard C++.
    RealD  bs[nshift];
    RealD  rsq[nshift];
    RealD  z[nshift][2];
    int     converged[nshift];
    const int       primary =0;
    //Primary shift fields CG iteration
    RealD a,b,c,d;
    RealD cp,bp,qq; //prev
    // Matrix mult fields
    FieldF r_f(SinglePrecGrid);
    FieldF p_f(SinglePrecGrid);
    FieldF tmp_f(SinglePrecGrid);
    FieldF mmp_f(SinglePrecGrid);
    FieldF src_f(SinglePrecGrid);
    precisionChange(src_f, src_d, wk_f_from_d);
    // Check lightest mass
    // (shift 0 must have the smallest pole; also clears the convergence flags)
    for(int s=0;s<nshift;s++){
      assert( mass[s]>= mass[primary] );
      converged[s]=0;
    }
    // Wire guess to zero
    // Residuals "r" are src
    // First search direction "p" is also src
    cp = norm2(src_d);
    // Handle trivial case of zero src.
    if( cp == 0. ){
      for(int s=0;s<nshift;s++){
 	psi_d[s] = Zero();
 	IterationsToCompleteShift[s] = 1;
 	TrueResidualShift[s] = 0.;
      }
      return;
    }
    // Per-shift stopping threshold on |r|^2: |src|^2 * tolerance^2
    for(int s=0;s<nshift;s++){
      rsq[s] = cp * mresidual[s] * mresidual[s];
      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
      ps_d[s] = src_d;
    }
    // r and p for primary
    r_f=src_f; //residual maintained in single
    p_f=src_f;
    p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
    //MdagM+m[0]
    Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
    axpy(mmp_f,mass[0],p_f,mmp_f);
    RealD rn = norm2(p_f);
    d += rn*mass[0];
    b = -cp /d;
    // Set up the various shift variables
    // z[s][iz] / z[s][1-iz] hold the current / previous recurrence value for shift s;
    // iz is flipped each iteration instead of copying.
    int       iz=0;
    z[0][1-iz] = 1.0;
    z[0][iz]   = 1.0;
    bs[0]      = b;
    for(int s=1;s<nshift;s++){
      z[s][1-iz] = 1.0;
      z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0]));
      bs[s]      = b*z[s][iz]; 
    }
    // r += b[0] A.p[0]
    // c= norm(r)
    c=axpy_norm(r_f,b,mmp_f,r_f);
    for(int s=0;s<nshift;s++) {
      axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
    }
    ///////////////////////////////////////
    // Timers
    ///////////////////////////////////////
    GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
    SolverTimer.Start();
    // Iteration loop
    int k;
    for (k=1;k<=MaxIterations;k++){    
      a = c /cp;
      //Update double precision search direction by residual
      PrecChangeTimer.Start();
      precisionChange(r_d, r_f, wk_d_from_f);
      PrecChangeTimer.Stop();
      AXPYTimer.Start();
      axpy(p_d,a,p_d,r_d); 
      for(int s=0;s<nshift;s++){
 	if ( ! converged[s] ) { 
 	  if (s==0){
 	    axpy(ps_d[s],a,ps_d[s],r_d);
 	  } else{
 	    RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
 	    axpby(ps_d[s],z[s][iz],as,r_d,ps_d[s]);
 	  }
 	}
      }
      AXPYTimer.Stop();
      PrecChangeTimer.Start();
      precisionChange(p_f, p_d, wk_f_from_d); //get back single prec search direction for linop
      PrecChangeTimer.Stop();
      cp=c;
      MatrixTimer.Start();  
      Linop_f.HermOp(p_f,mmp_f); 
      d=real(innerProduct(p_f,mmp_f));    
      MatrixTimer.Stop();  
      AXPYTimer.Start();
      axpy(mmp_f,mass[0],p_f,mmp_f);
      AXPYTimer.Stop();
      RealD rn = norm2(p_f);
      d += rn*mass[0];
      bp=b;
      b=-cp/d;
      // Toggle the recurrence history
      bs[0] = b;
      iz = 1-iz;
      ShiftTimer.Start();
      for(int s=1;s<nshift;s++){
 	if((!converged[s])){
 	  RealD z0 = z[s][1-iz];
 	  RealD z1 = z[s][iz];
 	  z[s][iz] = z0*z1*bp
 	    / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b)); 
 	  bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
 	}
      }
      ShiftTimer.Stop();
      //Update double precision solutions
      AXPYTimer.Start();
      for(int s=0;s<nshift;s++){
 	int ss = s;
 	if( (!converged[s]) ) { 
 	  axpy(psi_d[ss],-bs[s]*alpha[s],ps_d[s],psi_d[ss]);
 	}
      }
      //Perform reliable update if necessary; otherwise update residual from single-prec mmp
      RealD c_f = axpy_norm(r_f,b,mmp_f,r_f);
      AXPYTimer.Stop();
      c = c_f;
      // NOTE(review): ReliableUpdateFreq must be non-zero, otherwise this modulo is undefined.
      if(k % ReliableUpdateFreq == 0){
 	//Replace r with true residual
 	MatrixTimer.Start();  
 	Linop_d.HermOp(psi_d[0],mmp_d); 
 	MatrixTimer.Stop();  
 	AXPYTimer.Start();
 	axpy(mmp_d,mass[0],psi_d[0],mmp_d);
 	// presumably r_d = src_d - mmp_d with c_d its squared norm -- confirm against axpy_norm
 	RealD c_d = axpy_norm(r_d, -1.0, mmp_d, src_d);
 	AXPYTimer.Stop();
 	std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_f <<" with |r|^2 = "<<c_d<<std::endl;
 	PrecChangeTimer.Start();
 	precisionChange(r_f, r_d, wk_f_from_d);
 	PrecChangeTimer.Stop();
 	c = c_d;
      }
      // Convergence checks
      // A shift is flagged converged once c * z^2 drops below its rsq target; flagged
      // shifts are skipped by all of the per-shift updates above.
      int all_converged = 1;
      for(int s=0;s<nshift;s++){
 	if ( (!converged[s]) ){
 	  IterationsToCompleteShift[s] = k;
 	  RealD css  = c * z[s][iz]* z[s][iz];
 	  if(css<rsq[s]){
 	    if ( ! converged[s] )
 	      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
 	    converged[s]=1;
 	  } else {
 	    all_converged=0;
 	  }
 	}
      }
      if ( all_converged ){
 	SolverTimer.Stop();
 	std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
 	std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
 	// Check answers 
 	for(int s=0; s < nshift; s++) { 
 	  Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
 	  axpy(tmp_d,mass[s],psi_d[s],mmp_d);
 	  axpy(r_d,-alpha[s],src_d,tmp_d);
 	  RealD rn = norm2(r_d);
 	  RealD cn = norm2(src_d);
 	  TrueResidualShift[s] = std::sqrt(rn/cn);
 	  std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
 	  //If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
 	  if(rn >= rsq[s]){
 	    CleanupTimer.Start();
 	    std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: performing cleanup step for shift " << s << std::endl;
 	    //Setup linear operators for final cleanup
 	    ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
 	    ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
 	    MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d); 
 	    cg(src_d, psi_d[s]);
 	    TrueResidualShift[s] = cg.TrueResidual;
 	    CleanupTimer.Stop();
 	  }
 	}
 	std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrec: Time Breakdown for body"<<std::endl;
 	std::cout << GridLogMessage << "\tSolver    " << SolverTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tMatrix    " << MatrixTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
 	IterationsToComplete = k;	
 	return;
      }
    }
    // ugly hack
    // Reached only when MaxIterations is exhausted: the failure is logged but not
    // treated as fatal, and IterationsToComplete / TrueResidualShift are left unset.
    std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
    //  assert(0);
  }
 };
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/algorithms/iterative/Deflation.h
+++ b/Grid/algorithms/iterative/Deflation.h
@@ -54,23 +54,15 @@ class DeflatedGuesser: public LinearFunction<Field> {
 private:
  const std::vector<Field> &evec;
  const std::vector<RealD> &eval;
  const unsigned int       N;
 public:
-  DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval)
+  DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
  : DeflatedGuesser(_evec, _eval, _evec.size())
  {}
  DeflatedGuesser(const std::vector<Field> & _evec, const std::vector<RealD> & _eval, const unsigned int _N)
  : evec(_evec), eval(_eval), N(_N)
  {
    assert(evec.size()==eval.size());
    assert(N <= evec.size());
  } 
  virtual void operator()(const Field &src,Field &guess) {
    guess = Zero();
    assert(evec.size()==eval.size());
    auto N = evec.size();
    for (int i=0;i<N;i++) {
      const Field& tmp = evec[i];
      axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
--- a/Grid/algorithms/iterative/SchurRedBlack.h
+++ b/Grid/algorithms/iterative/SchurRedBlack.h
@@ -40,7 +40,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
   *        (-MoeMee^{-1}   1 )   
   * L^{dag} = ( 1       Mee^{-dag} Moe^{dag} )
   *           ( 0       1                    )
-   * L^{-dag}= ( 1      -Mee^{-dag} Moe^{dag} )
+   * L^{-d}  = ( 1      -Mee^{-dag} Moe^{dag} )
   *           ( 0       1                    )
   *
   * U^-1 = (1   -Mee^{-1} Meo)
@@ -82,8 +82,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
   * c) M_oo^-dag Doo^{dag} Doo Moo^-1 phi_0 = M_oo^-dag (D_oo)^dag L^{-1}  eta_o
   *                              eta_o'     = M_oo^-dag (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
   *                              psi_o = M_oo^-1 phi_o
-   *
+   * TODO: Deflation 
   *
   */
 namespace Grid {
@@ -98,7 +97,6 @@ namespace Grid {
  protected:
    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
    OperatorFunction<Field> & _HermitianRBSolver;
    int CBfactorise;
    bool subGuess;
    bool useSolnAsInitGuess; // if true user-supplied solution vector is used as initial guess for solver
@@ -134,31 +132,6 @@ namespace Grid {
      (*this)(_Matrix,in,out,guess);
    }
    // Batched source preparation: runs the single-field RedBlackSource on every
    // element of `in`, writing the result to the matching element of `src_o`.
    // One scratch field on the red-black grid is reused for the third argument.
    void RedBlackSource(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &src_o) 
    {
      GridBase *rb_grid = _Matrix.RedBlackGrid();
      Field scratch(rb_grid);
      const int count = in.size();
      int idx = 0;
      while (idx < count) {
        RedBlackSource(_Matrix, in[idx], scratch, src_o[idx]);
        ++idx;
      }
    }
    // James can write his own deflated guesser
    // with optimised code for the inner products
    //    RedBlackSolveSplitGrid();
    //    RedBlackSolve(_Matrix,src_o,sol_o); 
    // Batched solution reconstruction: for each block, picks the Even checkerboard
    // of the original source into a scratch field and hands it, together with the
    // solved field sol_o[b], to the single-field RedBlackSolution.
    void RedBlackSolution(Matrix &_Matrix, const std::vector<Field> &in, const std::vector<Field> &sol_o, std::vector<Field> &out)
    {
      GridBase *rb_grid = _Matrix.RedBlackGrid();
      Field even_src(rb_grid);
      const int count = in.size();
      int idx = 0;
      while (idx < count) {
        pickCheckerboard(Even, even_src, in[idx]);
        RedBlackSolution(_Matrix, sol_o[idx], even_src, out[idx]);
        ++idx;
      }
    }
    template<class Guesser>
    void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out,Guesser &guess) 
    {
@@ -177,29 +150,24 @@ namespace Grid {
      ////////////////////////////////////////////////
      // Prepare RedBlack source
      ////////////////////////////////////////////////
-      RedBlackSource(_Matrix,in,src_o);
+      for(int b=0;b<nblock;b++){
-	//      for(int b=0;b<nblock;b++){
+	RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
-	//	RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
+      }
 	//      }
      ////////////////////////////////////////////////
      // Make the guesses
      ////////////////////////////////////////////////
      if ( subGuess ) guess_save.resize(nblock,grid);
-      
+      for(int b=0;b<nblock;b++){
-      if(useSolnAsInitGuess) {
+        if(useSolnAsInitGuess) {
        for(int b=0;b<nblock;b++){
          pickCheckerboard(Odd, sol_o[b], out[b]);
        } else {
          guess(src_o[b],sol_o[b]); 
        }
      } else {
        guess(src_o, sol_o); 
      }
-	    if ( subGuess ) { 
+	if ( subGuess ) { 
-        for(int b=0;b<nblock;b++){
+	  guess_save[b] = sol_o[b];
-          guess_save[b] = sol_o[b];
+	}
        }
      }
      //////////////////////////////////////////////////////////////
      // Call the block solver
@@ -221,20 +189,13 @@ namespace Grid {
 	/////////////////////////////////////////////////
 	// Check unprec residual if possible
 	/////////////////////////////////////////////////
-	if ( ! subGuess ) {	  
+	if ( ! subGuess ) {
-
+	  _Matrix.M(out[b],resid); 
 	  if ( this->adjoint() ) _Matrix.Mdag(out[b],resid); 
 	  else                   _Matrix.M(out[b],resid); 
 	  resid = resid-in[b];
 	  RealD ns = norm2(in[b]);
 	  RealD nr = norm2(resid);
-	  std::cout<<GridLogMessage<< "SchurRedBlackBase adjoint "<< this->adjoint() << std::endl;
+	  std::cout<<GridLogMessage<< "SchurRedBlackBase solver true unprec resid["<<b<<"] "<<std::sqrt(nr/ns) << std::endl;
 	  if ( this->adjoint() ) 
 	    std::cout<<GridLogMessage<< "SchurRedBlackBase adjoint solver true unprec resid["<<b<<"] "<<std::sqrt(nr/ns) << std::endl;
 	  else                   
 	    std::cout<<GridLogMessage<< "SchurRedBlackBase solver true unprec resid["<<b<<"] "<<std::sqrt(nr/ns) << std::endl;
 	} else {
 	  std::cout<<GridLogMessage<< "SchurRedBlackBase Guess subtracted after solve["<<b<<"] " << std::endl;
 	}
@@ -288,21 +249,12 @@ namespace Grid {
      // Verify the unprec residual
      if ( ! subGuess ) {
-
+        _Matrix.M(out,resid); 
 	std::cout<<GridLogMessage<< "SchurRedBlackBase adjoint "<< this->adjoint() << std::endl;
 	if ( this->adjoint() ) _Matrix.Mdag(out,resid); 
 	else                   _Matrix.M(out,resid); 
        resid = resid-in;
        RealD ns = norm2(in);
        RealD nr = norm2(resid);
-	  if ( this->adjoint() ) 
+        std::cout<<GridLogMessage << "SchurRedBlackBase solver true unprec resid "<< std::sqrt(nr/ns) << std::endl;
 	    std::cout<<GridLogMessage<< "SchurRedBlackBase adjoint solver true unprec resid "<<std::sqrt(nr/ns) << std::endl;
 	  else                   
 	    std::cout<<GridLogMessage<< "SchurRedBlackBase solver true unprec resid "<<std::sqrt(nr/ns) << std::endl;
      } else {
        std::cout << GridLogMessage << "SchurRedBlackBase Guess subtracted after solve." << std::endl;
      }
@@ -311,7 +263,6 @@ namespace Grid {
    /////////////////////////////////////////////////////////////
    // Override in derived. 
    /////////////////////////////////////////////////////////////
    virtual bool adjoint(void) { return false; }
    virtual void RedBlackSource  (Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)                =0;
    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)          =0;
    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)                           =0;
@@ -665,127 +616,6 @@ namespace Grid {
        this->_HermitianRBSolver(_OpEO, src_o, sol_o); 
      }
  };
  /*
   * Red black Schur decomposition
   *
   *  M = (Mee Meo) =  (1             0 )   (Mee   0               )  (1 Mee^{-1} Meo)
   *      (Moe Moo)    (Moe Mee^-1    1 )   (0   Moo-Moe Mee^-1 Meo)  (0   1         )
   *                =         L                     D                     U
   *
   * L^-1 = (1              0 )
   *        (-MoeMee^{-1}   1 )   
   * L^{dag} = ( 1       Mee^{-dag} Moe^{dag} )
   *           ( 0       1                    )
   *
   * U^-1 = (1   -Mee^{-1} Meo)
   *        (0    1           )
   * U^{dag} = ( 1                 0)
   *           (Meo^dag Mee^{-dag} 1)
   * U^{-dag} = (  1                 0)
   *            (-Meo^dag Mee^{-dag} 1)
   *
   *
   ***********************
   *     M^dag psi = eta
   ***********************
   *
   * Really for Mobius: (Wilson - easier to just use gamma 5 hermiticity)
   *
   *    Mdag psi     =         Udag  Ddag  Ldag psi = eta
   *
   * U^{-dag} = (  1                 0)
   *            (-Meo^dag Mee^{-dag} 1)
   *
   *
   * i)                D^dag phi =  (U^{-dag}  eta)
   *                        eta'_e = eta_e
   *                        eta'_o = (eta_o - Meo^dag Mee^{-dag} eta_e)
   * 
   *      phi_o = D_oo^-dag eta'_o = D_oo^-dag (eta_o - Meo^dag Mee^{-dag} eta_e)
   *
   *      phi_e = D_ee^-dag eta'_e = D_ee^-dag eta_e
   * 
   * Solve: 
   *
   *      D_oo D_oo^dag phi_o = D_oo (eta_o - Meo^dag Mee^{-dag} eta_e)
   *
   * ii) 
   *      phi = L^dag psi => psi = L^-dag phi. 
   *
   * L^{-dag} = ( 1      -Mee^{-dag} Moe^{dag} )
   *            ( 0       1                    )
   *
   *   => sol_e = M_ee^-dag * ( src_e - Moe^dag phi_o )...
   *   => sol_o = phi_o
   */
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Site diagonal has Mooee on it, but solve the Adjoint system
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  template<class Field> class SchurRedBlackDiagMooeeDagSolve : public SchurRedBlackBase<Field> {
  public:
    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
    virtual bool adjoint(void) { return true; }
    SchurRedBlackDiagMooeeDagSolve(OperatorFunction<Field> &HermitianRBSolver,
 				   const bool initSubGuess = false,
 				   const bool _solnAsInitGuess = false)  
      : SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};
    //////////////////////////////////////////////////////
    // Override RedBlack specialisation
    //////////////////////////////////////////////////////
    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
    {
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();
      Field   tmp(grid);
      Field  Mtmp(grid);
      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd ,src_o,src);
      /////////////////////////////////////////////////////
      // src_o = (source_o - Moe^dag MeeInvDag source_e)
      /////////////////////////////////////////////////////
      _Matrix.MooeeInvDag(src_e,tmp);  assert(  tmp.Checkerboard() ==Even);
      _Matrix.MeooeDag   (tmp,Mtmp);   assert( Mtmp.Checkerboard() ==Odd);     
      tmp=src_o-Mtmp;                  assert(  tmp.Checkerboard() ==Odd);     
      // get the right Mpc
      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
      _HermOpEO.Mpc(tmp,src_o);     assert(src_o.Checkerboard() ==Odd);
    }
    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)
    {
      SchurDiagMooeeDagOperator<Matrix,Field> _HermOpEO(_Matrix);
      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
    };
    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)
    {
      SchurDiagMooeeDagOperator<Matrix,Field> _HermOpEO(_Matrix);
      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
    }
    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
    {
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();
      Field  sol_e(grid);
      Field  tmp(grid);
      ///////////////////////////////////////////////////
      // sol_e = M_ee^-dag * ( src_e - Moe^dag phi_o )...
      // sol_o = phi_o
      ///////////////////////////////////////////////////
      _Matrix.MeooeDag(sol_o,tmp);      assert(tmp.Checkerboard()==Even);
      tmp = src_e-tmp;                  assert(tmp.Checkerboard()==Even);
      _Matrix.MooeeInvDag(tmp,sol_e);   assert(sol_e.Checkerboard()==Even);
      setCheckerboard(sol,sol_e); assert(  sol_e.Checkerboard() ==Even);
      setCheckerboard(sol,sol_o); assert(  sol_o.Checkerboard() ==Odd );
    }
  };
 }
 #endif
--- a/Grid/allocator/AlignedAllocator.cc
+++ b/Grid/allocator/AlignedAllocator.cc
@@ -0,0 +1,67 @@
 #include <Grid/GridCore.h>
 #include <fcntl.h>
 NAMESPACE_BEGIN(Grid);
 // Definitions of MemoryProfiler's static members: no stats object is attached
 // initially (null pointer) and debug output starts disabled.
 MemoryStats *MemoryProfiler::stats = nullptr;
 bool         MemoryProfiler::debug = false;
 // Diagnostic helper (Linux only; a no-op elsewhere).
 //
 // Reads the /proc/self/pagemap entries covering [Buf, Buf+BYTES) and, for every
 // complete group of 512 consecutive 4k pages, checks whether the pages are
 // physically contiguous starting from the first page of the group. It prints the
 // number of 4k pages examined and how many were not at the expected address.
 //
 //   Buf   : start of the buffer to inspect
 //   BYTES : length of the buffer in bytes
 //
 // NOTE(review): pagedata is a runtime-sized stack array (compiler extension), so
 // very large buffers put npages*8 bytes on the stack.
 void check_huge_pages(void *Buf,uint64_t BYTES)
 {
 #ifdef __linux__
  int fd = open("/proc/self/pagemap", O_RDONLY);
  assert(fd >= 0);
  const int page_size = 4096;
  // pagemap holds one 64-bit entry per virtual page, indexed by virtual page number
  uint64_t virt_pfn = (uint64_t)Buf / page_size;
  off_t offset = sizeof(uint64_t) * virt_pfn;
  uint64_t npages = (BYTES + page_size-1) / page_size;
  uint64_t pagedata[npages];
  uint64_t ret = lseek(fd, offset, SEEK_SET);
  assert(ret == offset);
  ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
  assert(ret == sizeof(uint64_t) * npages);
  // All entries are in memory now; release the descriptor so repeated calls do not leak it.
  ::close(fd);
  int nhugepages = npages / 512;
  int n4ktotal, nnothuge;
  n4ktotal = 0;
  nnothuge = 0;
  for (int i = 0; i < nhugepages; ++i) {
    // The low 55 bits of an entry are kept as the page frame number
    uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
    for (int j = 0; j < 512; ++j) {
      uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
      ++n4ktotal;
      // Page j of the group should sit exactly j pages after the group's first page
      if (pageaddr != baseaddr + j * page_size)
 	++nnothuge;
    }
  }
  int rank = CartesianCommunicator::RankWorld();
  printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
 #endif
 }
 // Formats a byte count as a short human-readable string using 1024-based units,
 // e.g. 512 -> "512 B", 1024 -> "1 KB", 1536 -> "1.5 KB".
 // Values that are whole after scaling print as integers; others print with one
 // decimal place.
 std::string sizeString(const size_t bytes)
 {
  constexpr unsigned int bufSize = 256;
  constexpr unsigned int nSuffix = 7;
  const char             *suffixes[nSuffix] = {"", "K", "M", "G", "T", "P", "E"};
  char                   buf[bufSize];
  size_t                 s     = 0;
  double                 count = bytes;

  // Scale down by 1024 per step, stopping at the last suffix so that `s` can
  // never index past the end of the table.
  while (count >= 1024 && s + 1 < nSuffix)
    {
      s++;
      count /= 1024;
    }
  if (count - floor(count) == 0.0)
    {
      snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
    }
  else
    {
      snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
    }
  return std::string(buf);
 }
 NAMESPACE_END(Grid);
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -65,7 +65,8 @@ public:
    MemoryManager::CpuFree((void *)__p,bytes);
  }
-  // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
+  // FIXME: hack for the copy constructor, eventually it must be avoided
  //void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
  // Copy-construction through this allocator is deliberately disabled: the
  // two-argument construct aborts via assert(0).
  void construct(pointer __p, const _Tp& __val) { assert(0);};
  // Default construction and destruction are no-ops: no constructor or
  // destructor is run on the element.
  void construct(pointer __p) { };
  void destroy(pointer __p) { };
@@ -73,9 +74,6 @@ public:
 template<typename _Tp>  inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
 template<typename _Tp>  inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
 //////////////////////////////////////////////////////////////////////////////////////
 // Unified virtual memory
 //////////////////////////////////////////////////////////////////////////////////////
 template<typename _Tp>
 class uvmAllocator {
 public: 
@@ -111,72 +109,22 @@ public:
    MemoryManager::SharedFree((void *)__p,bytes);
  }
  // FIXME: hack for the copy constructor, eventually it must be avoided
  void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
  //void construct(pointer __p, const _Tp& __val) { };
  void construct(pointer __p) { };
  void destroy(pointer __p) { };
 };
 template<typename _Tp>  inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; }
 template<typename _Tp>  inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; }
 ////////////////////////////////////////////////////////////////////////////////
 // Device memory
 ////////////////////////////////////////////////////////////////////////////////
 // Standard-library style allocator whose storage is obtained from
 // MemoryManager::AcceleratorAllocate and returned through
 // MemoryManager::AcceleratorFree.  It carries no state, and its construct() /
 // destroy() hooks are deliberately empty: elements are never constructed or
 // destroyed through this allocator.
 template<typename _Tp>
 class devAllocator {
 public: 
  // Member types expected of an allocator.
  using value_type      = _Tp;
  using pointer         = _Tp*;
  using const_pointer   = const _Tp*;
  using reference       = _Tp&;
  using const_reference = const _Tp&;
  using size_type       = std::size_t;
  using difference_type = std::ptrdiff_t;

  template<typename _Tp1>  struct rebind { using other = devAllocator<_Tp1>; };

  // Stateless: every constructor and the destructor are no-ops.
  devAllocator() throw() { }
  devAllocator(const devAllocator&) throw() { }
  template<typename _Tp1> devAllocator(const devAllocator<_Tp1>&) throw() { }
  ~devAllocator() throw() { }

  pointer address(reference __x) const
  {
    return &__x;
  }

  // Largest element count whose byte size still fits in size_t.
  size_type max_size() const throw()
  {
    return size_t(-1) / sizeof(_Tp);
  }

  // Reserve room for __n elements; the hint pointer is ignored.
  // The request is reported to the profiler before the memory manager is asked.
  pointer allocate(size_type __n, const void* _p= 0)
  {
    size_type nbytes = __n*sizeof(_Tp);
    profilerAllocate(nbytes);
    _Tp *block = (_Tp*) MemoryManager::AcceleratorAllocate(nbytes);
    assert( block != (_Tp *)NULL );
    return block;
  }

  // Hand back storage for __n elements previously obtained from allocate().
  void deallocate(pointer __p, size_type __n)
  {
    size_type nbytes = __n * sizeof(_Tp);
    profilerFree(nbytes);
    MemoryManager::AcceleratorFree((void *)__p,nbytes);
  }

  // Element construction / destruction hooks: intentionally do nothing.
  void construct(pointer __p, const _Tp& __val) { }
  void construct(pointer __p) { }
  void destroy(pointer __p) { }
 };

 // All devAllocator instances of a given type are interchangeable.
 template<typename _Tp>
 inline bool operator==(const devAllocator<_Tp>&, const devAllocator<_Tp>&)
 {
  return true;
 }
 template<typename _Tp>
 inline bool operator!=(const devAllocator<_Tp>&, const devAllocator<_Tp>&)
 {
  return false;
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Template typedefs
 ////////////////////////////////////////////////////////////////////////////////
-#ifdef ACCELERATOR_CSHIFT
+template<class T> using commAllocator = uvmAllocator<T>;
-// Cshift on device
+template<class T> using Vector     = std::vector<T,uvmAllocator<T> >;           
-template<class T> using cshiftAllocator = devAllocator<T>;
+template<class T> using commVector = std::vector<T,uvmAllocator<T> >;
-#else
+//template<class T> using Matrix     = std::vector<std::vector<T,alignedAllocator<T> > >;
 // Cshift on host
 template<class T> using cshiftAllocator = std::allocator<T>;
 #endif
 template<class T> using Vector        = std::vector<T,uvmAllocator<T> >;           
 template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;           
 template<class T> using commVector = std::vector<T,devAllocator<T> >;
 template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
 NAMESPACE_END(Grid);
--- a/Grid/allocator/MemoryManager.cc
+++ b/Grid/allocator/MemoryManager.cc
@@ -9,30 +9,14 @@ NAMESPACE_BEGIN(Grid);
 #define AccSmall (3)
 #define Shared   (4)
 #define SharedSmall (5)
 #undef GRID_MM_VERBOSE 
 uint64_t total_shared;
 uint64_t total_device;
 uint64_t total_host;;
 void MemoryManager::PrintBytes(void)
 {
-  std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
+  std::cout << " MemoryManager : "<<total_shared<<" shared      bytes "<<std::endl;
-  std::cout << " MemoryManager : PrintBytes "<<std::endl;
+  std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
-  std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
+  std::cout << " MemoryManager : "<<total_host  <<" cpu         bytes "<<std::endl;
  std::cout << " MemoryManager : "<<(total_shared>>20)<<" shared      Mbytes "<<std::endl;
  std::cout << " MemoryManager : "<<(total_device>>20)<<" accelerator Mbytes "<<std::endl;
  std::cout << " MemoryManager : "<<(total_host>>20)  <<" cpu         Mbytes "<<std::endl;
  uint64_t cacheBytes;
  cacheBytes = CacheBytes[Cpu];
  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" cpu cache Mbytes "<<std::endl;
  cacheBytes = CacheBytes[Acc];
  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" acc cache Mbytes "<<std::endl;
  cacheBytes = CacheBytes[Shared];
  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" shared cache Mbytes "<<std::endl;
 #ifdef GRID_CUDA
  cuda_mem();
 #endif
 }
 //////////////////////////////////////////////////////////////////////
@@ -40,114 +24,86 @@ void MemoryManager::PrintBytes(void)
 //////////////////////////////////////////////////////////////////////
 MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
 int MemoryManager::Victim[MemoryManager::NallocType];
-int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 };
+int MemoryManager::Ncache[MemoryManager::NallocType] = { 8, 32, 8, 32, 8, 32 };
-uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
+
 //////////////////////////////////////////////////////////////////////
 // Actual allocation and deallocation utils
 //////////////////////////////////////////////////////////////////////
 // Hand out `bytes` of accelerator memory.  A block of exactly this size is
 // taken from the Acc free pool when Lookup returns one; otherwise a fresh block
 // is requested from acceleratorAllocDevice.
 //
 // total_device counts bytes currently handed out to callers, so it is adjusted
 // exactly once per call here and exactly once per call in AcceleratorFree.
 // (Previously a pool miss added `bytes` a second time.)
 void *MemoryManager::AcceleratorAllocate(size_t bytes)
 {
  total_device+=bytes;
  void *ptr = (void *) Lookup(bytes,Acc);
  if ( ptr == (void *) NULL ) {
    ptr = (void *) acceleratorAllocDevice(bytes);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"AcceleratorAllocate "<<std::endl;
  PrintBytes();
 #endif
  return ptr;
 }
 // Return a block obtained from AcceleratorAllocate.  The block is offered to
 // the Acc free pool via Insert; whatever pointer Insert hands back (if any) is
 // released with acceleratorFreeDevice.
 //
 // The pointer handed back by Insert can be a different block from `ptr`, so
 // `bytes` is not necessarily its size; total_device is therefore adjusted only
 // once, for the block the caller gave up.
 void  MemoryManager::AcceleratorFree    (void *ptr,size_t bytes)
 {
  total_device-=bytes;
  void *__freeme = Insert(ptr,bytes,Acc);
  if ( __freeme ) {
    acceleratorFreeDevice(__freeme);
    //    PrintBytes();
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"AcceleratorFree "<<std::endl;
  PrintBytes();
 #endif
 }
 // Hand out `bytes` of shared memory.  A block of exactly this size is taken
 // from the Shared free pool when Lookup returns one; otherwise a fresh block is
 // requested from acceleratorAllocShared.
 //
 // total_shared counts bytes currently handed out to callers, so it is adjusted
 // exactly once per call here and exactly once per call in SharedFree.
 // (Previously a pool miss added `bytes` a second time.)
 void *MemoryManager::SharedAllocate(size_t bytes)
 {
  total_shared+=bytes;
  void *ptr = (void *) Lookup(bytes,Shared);
  if ( ptr == (void *) NULL ) {
    ptr = (void *) acceleratorAllocShared(bytes);
    //    std::cout <<"AcceleratorAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
    //    PrintBytes();
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"SharedAllocate "<<std::endl;
  PrintBytes();
 #endif
  return ptr;
 }
 // Return a block obtained from SharedAllocate.  The block is offered to the
 // Shared free pool via Insert; whatever pointer Insert hands back (if any) is
 // released with acceleratorFreeShared.
 //
 // The pointer handed back by Insert can be a different block from `ptr`, so
 // `bytes` is not necessarily its size; total_shared is therefore adjusted only
 // once, for the block the caller gave up.
 void  MemoryManager::SharedFree    (void *ptr,size_t bytes)
 {
  total_shared-=bytes;
  void *__freeme = Insert(ptr,bytes,Shared);
  if ( __freeme ) {
    acceleratorFreeShared(__freeme);
    //    PrintBytes();
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"SharedFree "<<std::endl;
  PrintBytes();
 #endif
 }
 // Host-side allocation.  Both build variants share the same shape: try the Cpu
 // free pool first, fall back to a fresh allocation, and on free offer the block
 // to the pool and release whatever Insert hands back.  They differ only in the
 // backing allocator: acceleratorAllocShared / acceleratorFreeShared when
 // GRID_UVM is defined, acceleratorAllocCpu / acceleratorFreeCpu otherwise.
 //
 // total_host counts bytes currently handed out to callers, so each call
 // adjusts it exactly once.  (Previously a pool miss or a real release adjusted
 // it a second time; on the free side the released block can also be a
 // different block from the one being freed, so `bytes` was not necessarily its
 // size.)
 #ifdef GRID_UVM
 void *MemoryManager::CpuAllocate(size_t bytes)
 {
  total_host+=bytes;
  void *ptr = (void *) Lookup(bytes,Cpu);
  if ( ptr == (void *) NULL ) {
    ptr = (void *) acceleratorAllocShared(bytes);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"CpuAllocate "<<std::endl;
  PrintBytes();
 #endif
  return ptr;
 }
 void  MemoryManager::CpuFree    (void *_ptr,size_t bytes)
 {
  total_host-=bytes;
  // Tell the view bookkeeping this host pointer is going away before it is pooled.
  NotifyDeletion(_ptr);
  void *__freeme = Insert(_ptr,bytes,Cpu);
  if ( __freeme ) { 
    acceleratorFreeShared(__freeme);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"CpuFree "<<std::endl;
  PrintBytes();
 #endif
 }
 #else
 void *MemoryManager::CpuAllocate(size_t bytes)
 {
  total_host+=bytes;
  void *ptr = (void *) Lookup(bytes,Cpu);
  if ( ptr == (void *) NULL ) {
    ptr = (void *) acceleratorAllocCpu(bytes);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"CpuAllocate "<<std::endl;
  PrintBytes();
 #endif
  return ptr;
 }
 void  MemoryManager::CpuFree    (void *_ptr,size_t bytes)
 {
  total_host-=bytes;
  // Tell the view bookkeeping this host pointer is going away before it is pooled.
  NotifyDeletion(_ptr);
  void *__freeme = Insert(_ptr,bytes,Cpu);
  if ( __freeme ) { 
    acceleratorFreeCpu(__freeme);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"CpuFree "<<std::endl;
  PrintBytes();
 #endif
 }
 #endif
@@ -180,20 +136,11 @@ void MemoryManager::Init(void)
      Ncache[SharedSmall]=Nc;
    }
  }
 }
 void MemoryManager::InitMessage(void) {
 #ifndef GRID_UVM
  std::cout << GridLogMessage << "MemoryManager Cache "<< MemoryManager::DeviceMaxBytes <<" bytes "<<std::endl;
 #endif
  std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
 #ifdef ALLOCATION_CACHE
  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
 #endif
-  
+
 #ifdef GRID_UVM
  std::cout << GridLogMessage<< "MemoryManager::Init() Unified memory space"<<std::endl;
 #ifdef GRID_CUDA
@@ -217,7 +164,6 @@ void MemoryManager::InitMessage(void) {
  std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_device"<<std::endl;
 #endif
 #endif
 }
 void *MemoryManager::Insert(void *ptr,size_t bytes,int type) 
@@ -225,13 +171,13 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
 #ifdef ALLOCATION_CACHE
  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
  int cache = type + small;
-  return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);  
+  return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);  
 #else
  return ptr;
 #endif
 }
-void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes) 
+void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) 
 {
  assert(ncache>0);
 #ifdef GRID_OMP
@@ -255,7 +201,6 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
  if ( entries[v].valid ) {
    ret = entries[v].address;
    cacheBytes -= entries[v].bytes;
    entries[v].valid = 0;
    entries[v].address = NULL;
    entries[v].bytes = 0;
@@ -264,7 +209,6 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
  entries[v].address=ptr;
  entries[v].bytes  =bytes;
  entries[v].valid  =1;
  cacheBytes += bytes;
  return ret;
 }
@@ -274,13 +218,13 @@ void *MemoryManager::Lookup(size_t bytes,int type)
 #ifdef ALLOCATION_CACHE
  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
  int cache = type+small;
-  return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
+  return Lookup(bytes,Entries[cache],Ncache[cache]);
 #else
  return NULL;
 #endif
 }
-void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) 
+void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) 
 {
  assert(ncache>0);
 #ifdef GRID_OMP
@@ -289,7 +233,6 @@ void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncach
  for(int e=0;e<ncache;e++){
    if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
      entries[e].valid = 0;
      cacheBytes -= entries[e].bytes;
      return entries[e].address;
    }
  }
--- a/Grid/allocator/MemoryManager.h
+++ b/Grid/allocator/MemoryManager.h
@@ -34,6 +34,8 @@ NAMESPACE_BEGIN(Grid);
 // Move control to configure.ac and Config.h?
 #define ALLOCATION_CACHE
 #define GRID_ALLOC_ALIGN (2*1024*1024)
 #define GRID_ALLOC_SMALL_LIMIT (4096)
 /*Pinning pages is costly*/
@@ -82,22 +84,20 @@ private:
  static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
  static int Victim[NallocType];
  static int Ncache[NallocType];
  static uint64_t CacheBytes[NallocType];
  /////////////////////////////////////////////////
  // Free pool
  /////////////////////////////////////////////////
  static void *Insert(void *ptr,size_t bytes,int type) ;
  static void *Lookup(size_t bytes,int type) ;
-  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
+  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
-  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;
+  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
  static void *AcceleratorAllocate(size_t bytes);
  static void  AcceleratorFree    (void *ptr,size_t bytes);
  static void PrintBytes(void);
 public:
  static void Init(void);
  static void InitMessage(void);
  static void *AcceleratorAllocate(size_t bytes);
  static void  AcceleratorFree    (void *ptr,size_t bytes);
  static void *SharedAllocate(size_t bytes);
  static void  SharedFree    (void *ptr,size_t bytes);
  static void *CpuAllocate(size_t bytes);
--- a/Grid/allocator/MemoryManagerCache.cc
+++ b/Grid/allocator/MemoryManagerCache.cc
@@ -1,12 +1,11 @@
 #include <Grid/GridCore.h>
 #ifndef GRID_UVM
 #warning "Using explicit device memory copies"
 NAMESPACE_BEGIN(Grid);
 //#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
 #define dprintf(...)
 ////////////////////////////////////////////////////////////
 // For caching copies of data on device
 ////////////////////////////////////////////////////////////
@@ -104,7 +103,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
-   dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
+  //  dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
@@ -112,7 +111,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
    AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
    DeviceBytes   -=AccCache.bytes;
    LRUremove(AccCache);
-    dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
+    //    dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
  }
  uint64_t CpuPtr = AccCache.CpuPtr;
  EntryErase(CpuPtr);
@@ -126,7 +125,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
-  dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
+  //  dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  if(AccCache.state==AccDirty) {
@@ -137,7 +136,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
    AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
    DeviceBytes   -=AccCache.bytes;
    LRUremove(AccCache);
-    dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
+    //    dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
  }
  uint64_t CpuPtr = AccCache.CpuPtr;
  EntryErase(CpuPtr);
@@ -150,7 +149,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
  assert(AccCache.AccPtr!=(uint64_t)NULL);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
  acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
-  dprintf("MemoryManager: Flush  %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  //  dprintf("MemoryManager: Flush  %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  DeviceToHostBytes+=AccCache.bytes;
  DeviceToHostXfer++;
  AccCache.state=Consistent;
@@ -165,7 +164,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
    AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
    DeviceBytes+=AccCache.bytes;
  }
-  dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  //  dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
  HostToDeviceBytes+=AccCache.bytes;
  HostToDeviceXfer++;
@@ -228,24 +227,18 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
  // Find if present, otherwise get or force an empty
  ////////////////////////////////////////////////////////////////////////////
  if ( EntryPresent(CpuPtr)==0 ){
    EvictVictims(bytes);
    EntryCreate(CpuPtr,bytes,mode,hint);
  }
  auto AccCacheIterator = EntryLookup(CpuPtr);
  auto & AccCache = AccCacheIterator->second;
-  if (!AccCache.AccPtr) {
+  
    EvictVictims(bytes); 
  } 
  assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
  assert(AccCache.cpuLock==0);  // Programming error
  if(AccCache.state!=Empty) {
    dprintf("ViewOpen found entry %llx %llx : %lld %lld\n",
 		    (uint64_t)AccCache.CpuPtr,
 		    (uint64_t)CpuPtr,
 		    (uint64_t)AccCache.bytes,
 		    (uint64_t)bytes);
    assert(AccCache.CpuPtr == CpuPtr);
    assert(AccCache.bytes  ==bytes);
  }
@@ -292,21 +285,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
      AccCache.state  = Consistent; // CpuDirty + AccRead => Consistent
    }
    AccCache.accLock++;
-    dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
+    //    printf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
  } else if(AccCache.state==Consistent) {
    if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
      AccCache.state  = AccDirty;   // Consistent + AcceleratorWrite=> AccDirty
    else
      AccCache.state  = Consistent; // Consistent + AccRead => Consistent
    AccCache.accLock++;
-    dprintf("Consistent entry into device accLock %d\n",AccCache.accLock);
+    //    printf("Consistent entry into device accLock %d\n",AccCache.accLock);
  } else if(AccCache.state==AccDirty) {
    if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
      AccCache.state  = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
    else
      AccCache.state  = AccDirty; // AccDirty + AccRead => AccDirty
    AccCache.accLock++;
-    dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock);
+    //    printf("AccDirty entry into device accLock %d\n",AccCache.accLock);
  } else {
    assert(0);
  }
@@ -368,16 +361,13 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
  // Find if present, otherwise get or force an empty
  ////////////////////////////////////////////////////////////////////////////
  if ( EntryPresent(CpuPtr)==0 ){
    EvictVictims(bytes);
    EntryCreate(CpuPtr,bytes,mode,transient);
  }
  auto AccCacheIterator = EntryLookup(CpuPtr);
  auto & AccCache = AccCacheIterator->second;
-
+  
  if (!AccCache.AccPtr) {
     EvictVictims(bytes);
  }
  assert((mode==CpuRead)||(mode==CpuWrite));
  assert(AccCache.accLock==0);  // Programming error
@@ -429,7 +419,6 @@ void  MemoryManager::NotifyDeletion(void *_ptr)
 }
 void  MemoryManager::Print(void)
 {
  PrintBytes();
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
  std::cout << GridLogDebug << "Memory Manager                             " << std::endl;
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
--- a/Grid/allocator/MemoryManagerShared.cc
+++ b/Grid/allocator/MemoryManagerShared.cc
@@ -1,6 +1,7 @@
 #include <Grid/GridCore.h>
 #ifdef GRID_UVM
 #warning "Grid is assuming unified virtual memory address space"
 NAMESPACE_BEGIN(Grid);
 /////////////////////////////////////////////////////////////////////////////////
 // View management is 1:1 address space mapping
--- a/Grid/cartesian/Cartesian_red_black.h
+++ b/Grid/cartesian/Cartesian_red_black.h
@@ -36,7 +36,7 @@ static const int CbBlack=1;
 static const int Even   =CbRed;
 static const int Odd    =CbBlack;
-accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex,const Coordinate &rdim,const Coordinate &chk_dim_msk)
+accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex, Coordinate &rdim, Coordinate &chk_dim_msk)
 {
  int nd=rdim.size();
  Coordinate coor(nd);
--- a/Grid/communicator/Communicator_base.cc
+++ b/Grid/communicator/Communicator_base.cc
@@ -33,8 +33,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
 bool Stencil_force_mpi = true;
 ///////////////////////////////////////////////////////////////
 // Info that is set up once and is independent of the Cartesian layout
 ///////////////////////////////////////////////////////////////
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@@ -1,3 +1,4 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
@@ -35,8 +36,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
 extern bool Stencil_force_mpi ;
 class CartesianCommunicator : public SharedMemory {
 public:    
@@ -109,8 +108,6 @@ public:
  ////////////////////////////////////////////////////////////
  // Reduction
  ////////////////////////////////////////////////////////////
  void GlobalMax(RealD &);
  void GlobalMax(RealF &);
  void GlobalSum(RealF &);
  void GlobalSumVector(RealF *,int N);
  void GlobalSum(RealD &);
@@ -141,6 +138,21 @@ public:
 		      int recv_from_rank,
 		      int bytes);
  void SendRecvPacket(void *xmit,
 		      void *recv,
 		      int xmit_to_rank,
 		      int recv_from_rank,
 		      int bytes);
  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 			   void *xmit,
 			   int xmit_to_rank,
 			   void *recv,
 			   int recv_from_rank,
 			   int bytes);
  void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
  double StencilSendToRecvFrom(void *xmit,
 			       int xmit_to_rank,
 			       void *recv,
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -1,6 +1,6 @@
 /*************************************************************************************
-    Grid physics library, www.github.com/paboyle/Grid
+    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/Communicator_mpi.cc
@@ -35,7 +35,7 @@ Grid_MPI_Comm       CartesianCommunicator::communicator_world;
 ////////////////////////////////////////////
 // First initialise of comms system
 ////////////////////////////////////////////
-void CartesianCommunicator::Init(int *argc, char ***argv)
+void CartesianCommunicator::Init(int *argc, char ***argv) 
 {
  int flag;
@@ -43,16 +43,8 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
  if ( !flag ) {
 #ifndef GRID_COMMS_THREADS
    nCommThreads=1;
    // wrong results here too
    // For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs
    // other comms schemes are ok
    MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided);
 #else
    MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
-#endif
+
    //If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
    if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) {
      assert(0);
@@ -99,7 +91,7 @@ void  CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor)
 ////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Initialises from communicator_world
 ////////////////////////////////////////////////////////////////////////////////////////////////////////
-CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
+CartesianCommunicator::CartesianCommunicator(const Coordinate &processors) 
 {
  MPI_Comm optimal_comm;
  ////////////////////////////////////////////////////
@@ -118,7 +110,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
 //////////////////////////////////
 // Try to subdivide communicator
 //////////////////////////////////
-CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
+CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)    
 {
  _ndimension = processors.size();  assert(_ndimension>=1);
  int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
@@ -135,7 +127,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // split the communicator
  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  //  int Nparent = parent._processors ;
+  //  int Nparent = parent._processors ; 
  int Nparent;
  MPI_Comm_size(parent.communicator,&Nparent);
@@ -157,13 +149,13 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
  }
  // rank within subcomm ; srank is rank of subcomm within blocks of subcomms
-  int crank;
+  int crank;  
  // Mpi uses the reverse Lexico convention to us; so reversed routines called
  Lexicographic::IndexFromCoorReversed(ccoor,crank,processors); // processors is the split grid dimensions
  Lexicographic::IndexFromCoorReversed(scoor,srank,ssize);      // ssize is the number of split grids
  MPI_Comm comm_split;
-  if ( Nchild > 1 ) {
+  if ( Nchild > 1 ) { 
    ////////////////////////////////////////////////////////////////
    // Split the communicator
@@ -188,11 +180,11 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
  SetCommunicator(comm_split);
  ///////////////////////////////////////////////
-  // Free the temp communicator
+  // Free the temp communicator 
  ///////////////////////////////////////////////
  MPI_Comm_free(&comm_split);
-  if(0){
+  if(0){ 
    std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
    for(int d=0;d<processors.size();d++){
      std::cout << d<< " " << _processor_coor[d] <<" " <<  ccoor[d]<<std::endl;
@@ -253,7 +245,7 @@ CartesianCommunicator::~CartesianCommunicator()
    for(int i=0;i<communicator_halo.size();i++){
      MPI_Comm_free(&communicator_halo[i]);
    }
-  }
+  }  
 }
 void CartesianCommunicator::GlobalSum(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
@@ -275,16 +267,6 @@ void CartesianCommunicator::GlobalXOR(uint64_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
  assert(ierr==0);
 }
 // In-place MPI_MAX all-reduce of a single float over `communicator`:
 // on return f holds the reduced value.
 void CartesianCommunicator::GlobalMax(float &f)
 {
  const int status = MPI_Allreduce(MPI_IN_PLACE, &f, 1, MPI_FLOAT, MPI_MAX, communicator);
  assert(status == 0);
 }
 // In-place MPI_MAX all-reduce of a single double over `communicator`:
 // on return d holds the reduced value.
 void CartesianCommunicator::GlobalMax(double &d)
 {
  const int status = MPI_Allreduce(MPI_IN_PLACE, &d, 1, MPI_DOUBLE, MPI_MAX, communicator);
  assert(status == 0);
 }
 void CartesianCommunicator::GlobalSum(float &f){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
@@ -312,28 +294,60 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 					   int bytes)
 {
  std::vector<CommsRequest_t> reqs(0);
-  unsigned long  xcrc = crc32(0L, Z_NULL, 0);
+  //    unsigned long  xcrc = crc32(0L, Z_NULL, 0);
-  unsigned long  rcrc = crc32(0L, Z_NULL, 0);
+  //    unsigned long  rcrc = crc32(0L, Z_NULL, 0);
-
+  //    xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
  SendToRecvFromComplete(reqs);
  //    rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
  //    printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc);
 }
 void CartesianCommunicator::SendRecvPacket(void *xmit,
 					   void *recv,
 					   int sender,
 					   int receiver,
 					   int bytes)
 {
  MPI_Status stat;
  assert(sender != receiver);
  int tag = sender;
  if ( _processor == sender ) {
    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
  }
  if ( _processor == receiver ) { 
    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
  }
 }
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
 						void *recv,
 						int from,
 						int bytes)
 {
  int myrank = _processor;
  int ierr;
-  // Enforce no UVM in comms, device or host OK
+  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { 
-  assert(acceleratorIsCommunicable(xmit));
+    MPI_Request xrq;
-  assert(acceleratorIsCommunicable(recv));
+    MPI_Request rrq;
-  // Give the CPU to MPI immediately; can use threads to overlap optionally
+    ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
-  //  printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes);
+    ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
-  ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
+    
-		    recv,bytes,MPI_CHAR,from, from,
+    assert(ierr==0);
-		    communicator,MPI_STATUS_IGNORE);
+    list.push_back(xrq);
-  assert(ierr==0);
+    list.push_back(rrq);
-
+  } else { 
-  //  xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
+    // Give the CPU to MPI immediately; can use threads to overlap optionally
-  //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
+    ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
-  //  printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
+		      recv,bytes,MPI_CHAR,from, from,
 		      communicator,MPI_STATUS_IGNORE);
    assert(ierr==0);
  }
 }
-// Basic Halo comms primitive
+
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int dest,
 						     void *recv,
@@ -353,7 +367,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 							 int from,
 							 int bytes,int dir)
 {
-  int ncomm  =communicator_halo.size();
+  int ncomm  =communicator_halo.size(); 
  int commdir=dir%ncomm;
  MPI_Request xrq;
@@ -368,37 +382,36 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  assert(from != _processor);
  assert(gme  == ShmRank);
  double off_node_bytes=0.0;
  int tag;
-  if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
+  if ( gfrom ==MPI_UNDEFINED) {
-    tag= dir+from*32;
+    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq);
    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
    assert(ierr==0);
    list.push_back(rrq);
    off_node_bytes+=bytes;
  }
-  if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
+  if ( gdest == MPI_UNDEFINED ) {
-    tag= dir+_processor*32;
+    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[commdir],&xrq);
    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
    assert(ierr==0);
    list.push_back(xrq);
    off_node_bytes+=bytes;
  } else {
    // TODO : make a OMP loop on CPU, call threaded bcopy
    void *shm = (void *) this->ShmBufferTranslate(dest,recv);
    assert(shm!=NULL);
    acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
    acceleratorCopySynchronise(); // MPI prob slower
  }
-  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
+  if ( CommunicatorPolicy == CommunicatorPolicySequential ) { 
    this->StencilSendToRecvFromComplete(list,dir);
  }
  return off_node_bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
  SendToRecvFromComplete(waitall);
 }
 void CartesianCommunicator::StencilBarrier(void)
 {
  MPI_Barrier  (ShmComm);
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  int nreq=list.size();
@@ -409,13 +422,6 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
  assert(ierr==0);
  list.resize(0);
 }
 void CartesianCommunicator::StencilBarrier(void)
 {
  MPI_Barrier  (ShmComm);
 }
 //void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 //{
 //}
 void CartesianCommunicator::Barrier(void)
 {
  int ierr = MPI_Barrier(communicator);
@@ -430,8 +436,8 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 		     communicator);
  assert(ierr==0);
 }
-int CartesianCommunicator::RankWorld(void){
+int CartesianCommunicator::RankWorld(void){ 
-  int r;
+  int r; 
  MPI_Comm_rank(communicator_world,&r);
  return r;
 }
@@ -464,7 +470,7 @@ void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t
  // When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
  // (Turns up on 32^3 x 64 Gparity too)
  MPI_Datatype object;
-  int iwords;
+  int iwords; 
  int ibytes;
  iwords = words;
  ibytes = bytes;
@@ -477,3 +483,5 @@ void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t
 }
 NAMESPACE_END(Grid);
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@@ -67,8 +67,6 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
 CartesianCommunicator::~CartesianCommunicator(){}
 void CartesianCommunicator::GlobalMax(float &){}
 void CartesianCommunicator::GlobalMax(double &){}
 void CartesianCommunicator::GlobalSum(float &){}
 void CartesianCommunicator::GlobalSumVector(float *,int N){}
 void CartesianCommunicator::GlobalSum(double &){}
@@ -79,6 +77,15 @@ void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){}
 void CartesianCommunicator::GlobalXOR(uint32_t &){}
 void CartesianCommunicator::GlobalXOR(uint64_t &){}
 void CartesianCommunicator::SendRecvPacket(void *xmit,
 					   void *recv,
 					   int xmit_to_rank,
 					   int recv_from_rank,
 					   int bytes)
 {
  assert(0);
 }
 // Basic Halo comms primitive -- should never call in single node
 void CartesianCommunicator::SendToRecvFrom(void *xmit,
@@ -89,6 +96,20 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 {
  assert(0);
 }
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
 						void *recv,
 						int from,
 						int bytes)
 {
  assert(0);
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  assert(0);
 }
 void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes)
 {
  bcopy(in,out,bytes*words);
@@ -116,6 +137,10 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int recv_from_rank,
 						     int bytes, int dir)
 {
  std::vector<CommsRequest_t> list;
  // Discard the "dir"
  SendToRecvFromBegin   (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
  SendToRecvFromComplete(list);
  return 2.0*bytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
@@ -125,10 +150,13 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 							 int recv_from_rank,
 							 int bytes, int dir)
 {
  // Discard the "dir"
  SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
  return 2.0*bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
  SendToRecvFromComplete(waitall);
 }
 void CartesianCommunicator::StencilBarrier(void){};
--- a/Grid/communicator/SharedMemory.h
+++ b/Grid/communicator/SharedMemory.h
@@ -102,7 +102,7 @@ public:
  ///////////////////////////////////////////////////
  static void SharedMemoryAllocate(uint64_t bytes, int flags);
  static void SharedMemoryFree(void);
-  static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
+  static void SharedMemoryCopy(void *dest,const void *src,size_t bytes);
  static void SharedMemoryZero(void *dest,size_t bytes);
 };
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -7,7 +7,6 @@
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Christoph Lehner <christoph@lhnr.de>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -32,12 +31,6 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #ifdef GRID_CUDA
 #include <cuda_runtime_api.h>
 #endif
 #ifdef GRID_HIP
 #include <hip/hip_runtime_api.h>
 #endif
 #ifdef GRID_SYCl
 #endif
 NAMESPACE_BEGIN(Grid); 
@@ -54,12 +47,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
  /////////////////////////////////////////////////////////////////////
  // Split into groups that can share memory
  /////////////////////////////////////////////////////////////////////
 #ifndef GRID_MPI3_SHM_NONE
  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
 #else
  MPI_Comm_split(comm, WorldRank, 0, &WorldShmComm);
 #endif
  MPI_Comm_rank(WorldShmComm     ,&WorldShmRank);
  MPI_Comm_size(WorldShmComm     ,&WorldShmSize);
@@ -73,7 +61,6 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
  WorldNodes = WorldSize/WorldShmSize;
  assert( (WorldNodes * WorldShmSize) == WorldSize );
  // FIXME: Check all WorldShmSize are the same ?
  /////////////////////////////////////////////////////////////////////
@@ -174,23 +161,6 @@ static inline int divides(int a,int b)
 }
 void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
 {
  ////////////////////////////////////////////////////////////////
  // Allow user to configure through environment variable
  ////////////////////////////////////////////////////////////////
  char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
  if ( str ) {
    std::vector<int> IntShmDims;
    GridCmdOptionIntVector(std::string(str),IntShmDims);
    assert(IntShmDims.size() == WorldDims.size());
    long ShmSize = 1;
    for (int dim=0;dim<WorldDims.size();dim++) {
      ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
      assert(divides(ShmDims[dim],WorldDims[dim]));
    }
    assert(ShmSize == WorldShmSize);
    return;
  }
  ////////////////////////////////////////////////////////////////
  // Powers of 2,3,5 only in prime decomposition for now
  ////////////////////////////////////////////////////////////////
@@ -450,47 +420,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Hugetlbfs mapping intended
 ////////////////////////////////////////////////////////////////////////////////////////////
-#if defined(GRID_CUDA) ||defined(GRID_HIP)  || defined(GRID_SYCL)
+#ifdef GRID_CUDA
 //if defined(GRID_SYCL)
 #if 0
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
  void * ShmCommBuf ; 
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  // allocate the pointer array for shared windows for our group
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  MPI_Barrier(WorldShmComm);
  WorldShmCommBufs.resize(WorldShmSize);
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Each MPI rank should allocate our own buffer
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  ShmCommBuf = acceleratorAllocDevice(bytes);
  if (ShmCommBuf == (void *)NULL ) {
    std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
    exit(EXIT_FAILURE);  
  }
  std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
 	    << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
  SharedMemoryZero(ShmCommBuf,bytes);
  assert(WorldShmSize == 1);
  for(int r=0;r<WorldShmSize;r++){
    WorldShmCommBufs[r] = ShmCommBuf;
  }
  _ShmAllocBytes=bytes;
  _ShmAlloc=1;
 }
 #endif
 #if defined(GRID_CUDA) ||defined(GRID_HIP) ||defined(GRID_SYCL)  
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
  void * ShmCommBuf ; 
@@ -513,70 +443,37 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Each MPI rank should allocate our own buffer
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-  ShmCommBuf = acceleratorAllocDevice(bytes);
+  auto err =  cudaMalloc(&ShmCommBuf, bytes);
  if ( err !=  cudaSuccess) {
    std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
    exit(EXIT_FAILURE);  
  }
  if (ShmCommBuf == (void *)NULL ) {
-    std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
+    std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed NULL pointer for " << bytes<<" bytes " << std::endl;
    exit(EXIT_FAILURE);  
  }
  if ( WorldRank == 0 ){
-    std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
+    std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
 	      << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
  }
  SharedMemoryZero(ShmCommBuf,bytes);
-  std::cout<< "Setting up IPC"<<std::endl;
+
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Loop over ranks/gpu's on our node
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  for(int r=0;r<WorldShmSize;r++){
-
+    
 #ifndef GRID_MPI3_SHM_NONE
    //////////////////////////////////////////////////
    // If it is me, pass around the IPC access key
    //////////////////////////////////////////////////
    void * thisBuf = ShmCommBuf;
    if(!Stencil_force_mpi) {
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
    typedef struct { int fd; pid_t pid ; } clone_mem_t;
    auto zeDevice    = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
    auto zeContext   = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
    ze_ipc_mem_handle_t ihandle;
    clone_mem_t handle;
    if ( r==WorldShmRank ) { 
      auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle);
      if ( err != ZE_RESULT_SUCCESS ) {
 	std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
 	exit(EXIT_FAILURE);
      } else {
 	std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
      }
      memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
      handle.pid = getpid();
    }
 #endif
 #ifdef GRID_CUDA
    cudaIpcMemHandle_t handle;
    if ( r==WorldShmRank ) { 
-      auto err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
+      err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
      if ( err !=  cudaSuccess) {
 	std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
 	exit(EXIT_FAILURE);
      }
    }
 #endif
 #ifdef GRID_HIP
    hipIpcMemHandle_t handle;    
    if ( r==WorldShmRank ) { 
      auto err = hipIpcGetMemHandle(&handle,ShmCommBuf);
      if ( err !=  hipSuccess) {
 	std::cerr << " SharedMemoryMPI.cc hipIpcGetMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
 	exit(EXIT_FAILURE);
      }
    }
 #endif
    //////////////////////////////////////////////////
    // Share this IPC handle across the Shm Comm
    //////////////////////////////////////////////////
@@ -592,68 +489,23 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    ///////////////////////////////////////////////////////////////
    // If I am not the source, overwrite thisBuf with remote buffer
    ///////////////////////////////////////////////////////////////
-
+    void * thisBuf = ShmCommBuf;
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
    if ( r!=WorldShmRank ) {
      thisBuf = nullptr;
      std::cout<<"mapping seeking remote pid/fd "
 	       <<handle.pid<<"/"
 	       <<handle.fd<<std::endl;
      int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
      std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
      //      int myfd  = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
      int myfd  = syscall(438,pidfd,handle.fd,0);
      std::cout<<"Using IpcHandle myfd "<<myfd<<"\n";
      memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
      auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf);
      if ( err != ZE_RESULT_SUCCESS ) {
 	std::cout << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
 	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; 
 	exit(EXIT_FAILURE);
      } else {
 	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
 	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle pointer is "<<std::hex<<thisBuf<<std::dec<<std::endl;
      }
      assert(thisBuf!=nullptr);
    }
 #endif
 #ifdef GRID_CUDA
    if ( r!=WorldShmRank ) { 
-      auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
+      err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
      if ( err !=  cudaSuccess) {
 	std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
 	exit(EXIT_FAILURE);
      }
    }
 #endif
 #ifdef GRID_HIP
    if ( r!=WorldShmRank ) { 
      auto err = hipIpcOpenMemHandle(&thisBuf,handle,hipIpcMemLazyEnablePeerAccess);
      if ( err !=  hipSuccess) {
 	std::cerr << " SharedMemoryMPI.cc hipIpcOpenMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
 	exit(EXIT_FAILURE);
      }
    }
 #endif
    ///////////////////////////////////////////////////////////////
    // Save a copy of the device buffers
    ///////////////////////////////////////////////////////////////
    }
    WorldShmCommBufs[r] = thisBuf;
 #else
    WorldShmCommBufs[r] = ShmCommBuf;
 #endif
  }
  _ShmAllocBytes=bytes;
  _ShmAlloc=1;
 }
 #endif
 #else 
 #ifdef GRID_MPI3_SHMMMAP
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
@@ -781,6 +633,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #endif
      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
      //      std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
      if ( ptr == (void * )MAP_FAILED ) {       
 	perror("failed mmap");     
 	assert(0);    
@@ -824,16 +677,16 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 /////////////////////////////////////////////////////////////////////////
 void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
 {
-#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
+#ifdef GRID_CUDA
-  acceleratorMemSet(dest,0,bytes);
+  cudaMemset(dest,0,bytes);
 #else
  bzero(dest,bytes);
 #endif
 }
-void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
+void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
 {
-#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
+#ifdef GRID_CUDA
-  acceleratorCopyToDevice(src,dest,bytes);
+  cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
 #else   
  bcopy(src,dest,bytes);
 #endif
@@ -852,11 +705,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
  /////////////////////////////////////////////////////////////////////
  // Split into groups that can share memory
  /////////////////////////////////////////////////////////////////////
 #ifndef GRID_MPI3_SHM_NONE
  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
 #else
  MPI_Comm_split(comm, rank, 0, &ShmComm);
 #endif
  MPI_Comm_rank(ShmComm     ,&ShmRank);
  MPI_Comm_size(ShmComm     ,&ShmSize);
  ShmCommBufs.resize(ShmSize);
@@ -886,18 +735,25 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
  std::vector<int> ranks(size);   for(int r=0;r<size;r++) ranks[r]=r;
  MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]); 
-#ifdef GRID_SHM_FORCE_MPI
+#ifdef GRID_IBM_SUMMIT
-  // Hide the shared memory path between ranks
+  // Hide the shared memory path between sockets 
-  {
+  // if even number of nodes
  if ( (ShmSize & 0x1)==0 ) {
    int SocketSize = ShmSize/2;
    int mySocket = ShmRank/SocketSize; 
    for(int r=0;r<size;r++){
-      if ( r!=rank ) {
+      int hisRank=ShmRanks[r];
-	ShmRanks[r] = MPI_UNDEFINED;
+      if ( hisRank!= MPI_UNDEFINED ) {
 	int hisSocket=hisRank/SocketSize;
 	if ( hisSocket != mySocket ) {
 	  ShmRanks[r] = MPI_UNDEFINED;
 	}
      }
    }
  }
 #endif
-  //SharedMemoryTest();
+  SharedMemoryTest();
 }
 //////////////////////////////////////////////////////////////////
 // On node barrier
--- a/Grid/communicator/SharedMemoryNone.cc
+++ b/Grid/communicator/SharedMemoryNone.cc
@@ -29,7 +29,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridCore.h>
 NAMESPACE_BEGIN(Grid); 
 #define header "SharedMemoryNone: "
 /*Construct from an MPI communicator*/
 void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
@@ -56,38 +55,6 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Hugetlbfs mapping intended, use anonymous mmap
 ////////////////////////////////////////////////////////////////////////////////////////////
 #if 1
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
  std::cout << header "SharedMemoryAllocate "<< bytes<< " GPU implementation "<<std::endl;
  void * ShmCommBuf ; 
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Each MPI rank should allocate our own buffer
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  ShmCommBuf = acceleratorAllocDevice(bytes);
  if (ShmCommBuf == (void *)NULL ) {
    std::cerr << " SharedMemoryNone.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
    exit(EXIT_FAILURE);  
  }
  if ( WorldRank == 0 ){
    std::cout << WorldRank << header " SharedMemoryNone.cc acceleratorAllocDevice "<< bytes 
 	      << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
  }
  SharedMemoryZero(ShmCommBuf,bytes);
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Loop over ranks/gpu's on our node
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  WorldShmCommBufs[0] = ShmCommBuf;
  _ShmAllocBytes=bytes;
  _ShmAlloc=1;
 }
 #else
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
  void * ShmCommBuf ; 
@@ -116,15 +83,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  _ShmAllocBytes=bytes;
  _ShmAlloc=1;
 };
-#endif
+
 void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
 {
  acceleratorMemSet(dest,0,bytes);
 }
 void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
 {
  acceleratorCopyToDevice(src,dest,bytes);
 }
 ////////////////////////////////////////////////////////
 // Global shared functionality finished
 // Now move to per communicator functionality
--- a/Grid/cshift/Cshift.h
+++ b/Grid/cshift/Cshift.h
@@ -52,8 +52,23 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
-template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> 
+template<typename Op, typename T1> 
-auto Cshift(const Expression &expr,int dim,int shift)  -> decltype(closure(expr)) 
+auto Cshift(const LatticeUnaryExpression<Op,T1> &expr,int dim,int shift)
    -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> 
 {
  return Cshift(closure(expr),dim,shift);
 }
 template <class Op, class T1, class T2>
 auto Cshift(const LatticeBinaryExpression<Op,T1,T2> &expr,int dim,int shift)
  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> 
 {
  return Cshift(closure(expr),dim,shift);
 }
 template <class Op, class T1, class T2, class T3>
 auto Cshift(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr,int dim,int shift)
  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
 				   eval(0, expr.arg2),
 				   eval(0, expr.arg3)))> 
 {
  return Cshift(closure(expr),dim,shift);
 }
--- a/Grid/cshift/Cshift_common.h
+++ b/Grid/cshift/Cshift_common.h
@@ -35,7 +35,7 @@ extern Vector<std::pair<int,int> > Cshift_table;
 // Gather for when there is no need to SIMD split 
 ///////////////////////////////////////////////////////////////////
 template<class vobj> void 
-Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
+Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
 {
  int rd = rhs.Grid()->_rdimensions[dimension];
@@ -73,19 +73,12 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim
     }
  }
  {
    autoView(rhs_v , rhs, AcceleratorRead);
    auto buffer_p = & buffer[0];
    auto table = &Cshift_table[0];
-#ifdef ACCELERATOR_CSHIFT    
+    accelerator_for(i,ent,1,{
    autoView(rhs_v , rhs, AcceleratorRead);
    accelerator_for(i,ent,vobj::Nsimd(),{
 	coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
    });
 #else
    autoView(rhs_v , rhs, CpuRead);
    thread_for(i,ent,{
      buffer_p[table[i].first]=rhs_v[table[i].second];
    });
 #endif
  }
 }
@@ -110,36 +103,21 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
  int n1=rhs.Grid()->_slice_stride[dimension];
  if ( cbmask ==0x3){
 #ifdef ACCELERATOR_CSHIFT
    autoView(rhs_v , rhs, AcceleratorRead);
-    accelerator_for(nn,e1*e2,1,{
+    accelerator_for2d(n,e1,b,e2,1,{
 	int n = nn%e1;
 	int b = nn/e1;
 	int o      =   n*n1;
 	int offset = b+n*e2;
 	vobj temp =rhs_v[so+o+b];
 	extract<vobj>(temp,pointers,offset);
      });
 #else
    autoView(rhs_v , rhs, CpuRead);
    thread_for2d(n,e1,b,e2,{
 	int o      =   n*n1;
 	int offset = b+n*e2;
 	vobj temp =rhs_v[so+o+b];
 	extract<vobj>(temp,pointers,offset);
      });
 #endif
  } else { 
    autoView(rhs_v , rhs, AcceleratorRead);
    Coordinate rdim=rhs.Grid()->_rdimensions;
    Coordinate cdm =rhs.Grid()->_checker_dim_mask;
    std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
-#ifdef ACCELERATOR_CSHIFT    
+    accelerator_for2d(n,e1,b,e2,1,{
    autoView(rhs_v , rhs, AcceleratorRead);
    accelerator_for(nn,e1*e2,1,{
 	int n = nn%e1;
 	int b = nn/e1;
 	Coordinate coor;
@@ -156,33 +134,13 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
 	  extract<vobj>(temp,pointers,offset);
 	}
      });
 #else
    autoView(rhs_v , rhs, CpuRead);
    thread_for2d(n,e1,b,e2,{
 	Coordinate coor;
 	int o=n*n1;
 	int oindex = o+b;
       	int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
 	int ocb=1<<cb;
 	int offset = b+n*e2;
 	if ( ocb & cbmask ) {
 	  vobj temp =rhs_v[so+o+b];
 	  extract<vobj>(temp,pointers,offset);
 	}
      });
 #endif
  }
 }
 //////////////////////////////////////////////////////
 // Scatter for when there is no need to SIMD split
 //////////////////////////////////////////////////////
-template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
+template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
 {
  int rd = rhs.Grid()->_rdimensions[dimension];
@@ -224,19 +182,12 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<
  }
  {
    autoView( rhs_v, rhs, AcceleratorWrite);
    auto buffer_p = & buffer[0];
    auto table = &Cshift_table[0];
-#ifdef ACCELERATOR_CSHIFT    
+    accelerator_for(i,ent,1,{
-    autoView( rhs_v, rhs, AcceleratorWrite);
+	rhs_v[table[i].first]=buffer_p[table[i].second];
    accelerator_for(i,ent,vobj::Nsimd(),{
 	coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
    });
 #else
    autoView( rhs_v, rhs, CpuWrite);
    thread_for(i,ent,{
      rhs_v[table[i].first]=buffer_p[table[i].second];
    });
 #endif
  }
 }
@@ -257,32 +208,18 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
  int e2=rhs.Grid()->_slice_block[dimension];
  if(cbmask ==0x3 ) {
    int _slice_stride = rhs.Grid()->_slice_stride[dimension];
    int _slice_block = rhs.Grid()->_slice_block[dimension];
 #ifdef ACCELERATOR_CSHIFT    
    autoView( rhs_v , rhs, AcceleratorWrite);
-    accelerator_for(nn,e1*e2,1,{
+    accelerator_for2d(n,e1,b,e2,1,{
-	int n = nn%e1;
+	int o      = n*rhs.Grid()->_slice_stride[dimension];
-	int b = nn/e1;
+	int offset = b+n*rhs.Grid()->_slice_block[dimension];
 	int o      = n*_slice_stride;
 	int offset = b+n*_slice_block;
 	merge(rhs_v[so+o+b],pointers,offset);
      });
 #else
    autoView( rhs_v , rhs, CpuWrite);
    thread_for2d(n,e1,b,e2,{
 	int o      = n*_slice_stride;
 	int offset = b+n*_slice_block;
 	merge(rhs_v[so+o+b],pointers,offset);
    });
 #endif
  } else { 
    // Case of SIMD split AND checker dim cannot currently be hit, except in 
    // Test_cshift_red_black code.
-    std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
+    //    std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
    std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
    assert(0); // This will fail if hit on GPU
    autoView( rhs_v, rhs, CpuWrite);
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
@@ -340,20 +277,12 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
  }
  {
    auto table = &Cshift_table[0];
 #ifdef ACCELERATOR_CSHIFT    
    autoView(rhs_v , rhs, AcceleratorRead);
    autoView(lhs_v , lhs, AcceleratorWrite);
-    accelerator_for(i,ent,vobj::Nsimd(),{
+    auto table = &Cshift_table[0];
-      coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
+    accelerator_for(i,ent,1,{
    });
 #else
    autoView(rhs_v , rhs, CpuRead);
    autoView(lhs_v , lhs, CpuWrite);
    thread_for(i,ent,{
      lhs_v[table[i].first]=rhs_v[table[i].second];
    });
 #endif
  }
 }
@@ -392,20 +321,12 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
  }
  {
    auto table = &Cshift_table[0];
 #ifdef ACCELERATOR_CSHIFT    
    autoView( rhs_v, rhs, AcceleratorRead);
    autoView( lhs_v, lhs, AcceleratorWrite);
    auto table = &Cshift_table[0];
    accelerator_for(i,ent,1,{
      permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
    });
 #else
    autoView( rhs_v, rhs, CpuRead);
    autoView( lhs_v, lhs, CpuWrite);
    thread_for(i,ent,{
      permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
    });
 #endif
  }
 }
--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@@ -101,8 +101,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
    Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
  }
 }
-#define ACCELERATOR_CSHIFT_NO_COPY
+
 #ifdef ACCELERATOR_CSHIFT_NO_COPY
 template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
 {
  typedef typename vobj::vector_type vector_type;
@@ -122,9 +121,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  assert(shift<fd);
  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
-  static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size);
+  commVector<vobj> send_buf(buffer_size);
-  static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size);
+  commVector<vobj> recv_buf(buffer_size);
-    
+
  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
@@ -139,7 +138,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
    } else {
-      int words = buffer_size;
+      int words = send_buf.size();
      if (cbmask != 0x3) words=words>>1;
      int bytes = words * sizeof(vobj);
@@ -151,14 +150,12 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
      int xmit_to_rank;
      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
      grid->Barrier();
      grid->SendToRecvFrom((void *)&send_buf[0],
 			   xmit_to_rank,
 			   (void *)&recv_buf[0],
 			   recv_from_rank,
 			   bytes);
      grid->Barrier();
      Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
@@ -198,15 +195,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
  //  int words = sizeof(vobj)/sizeof(vector_type);
-  static std::vector<cshiftVector<scalar_object> >  send_buf_extract; send_buf_extract.resize(Nsimd);
+  std::vector<commVector<scalar_object> >   send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
-  static std::vector<cshiftVector<scalar_object> >  recv_buf_extract; recv_buf_extract.resize(Nsimd);
+  std::vector<commVector<scalar_object> >   recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
  scalar_object *  recv_buf_extract_mpi;
  scalar_object *  send_buf_extract_mpi;
  for(int s=0;s<Nsimd;s++){
    send_buf_extract[s].resize(buffer_size);
    recv_buf_extract[s].resize(buffer_size);
  }
  int bytes = buffer_size*sizeof(scalar_object);
@@ -252,204 +242,11 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
      if(nbr_proc){
 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
-	grid->Barrier();
+	grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
 	send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
 	recv_buf_extract_mpi = &recv_buf_extract[i][0];
 	grid->SendToRecvFrom((void *)send_buf_extract_mpi,
 			     xmit_to_rank,
-			     (void *)recv_buf_extract_mpi,
+			     (void *)&recv_buf_extract[i][0],
 			     recv_from_rank,
 			     bytes);
 	grid->Barrier();
 	rpointers[i] = &recv_buf_extract[i][0];
      } else { 
 	rpointers[i] = &send_buf_extract[nbr_lane][0];
      }
    }
    Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
  }
 }
 #else 
 // Circular shift of rhs into ret along `dimension`, for the case where that
 // dimension is split across processors and is not SIMD-vectorised (both are
 // asserted below).  This is the variant compiled when
 // ACCELERATOR_CSHIFT_NO_COPY is not defined: each off-node plane is gathered
 // into a static staging vector, copied into a buffer obtained from
 // ShmBufferMalloc, exchanged with SendToRecvFrom, copied back into a second
 // staging vector and scattered into ret.
 //
 //   ret       - destination lattice
 //   rhs       - source lattice
 //   dimension - direction of the shift
 //   shift     - shift distance; asserted to satisfy 0 <= shift < fd
 //   cbmask    - checkerboard mask; 0x3 transfers full planes, any other value
 //               halves the number of words transferred
 template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
 {
  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::scalar_type scalar_type;
  GridBase *grid=rhs.Grid();
  // NOTE(review): temp is not referenced again in this function.
  Lattice<vobj> temp(rhs.Grid());
  int fd              = rhs.Grid()->_fdimensions[dimension];
  int rd              = rhs.Grid()->_rdimensions[dimension];
  int pd              = rhs.Grid()->_processors[dimension];
  int simd_layout     = rhs.Grid()->_simd_layout[dimension];
  int comm_dim        = rhs.Grid()->_processors[dimension] >1 ;
  // Preconditions: no SIMD splitting in this dimension, more than one
  // processor in this dimension, and a non-negative shift smaller than fd.
  assert(simd_layout==1);
  assert(comm_dim==1);
  assert(shift>=0);
  assert(shift<fd);
  // Number of vobj elements in one plane orthogonal to `dimension`.
  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
  // Staging vectors are static, so they persist between calls; they are
  // resized to one plane on every call.
  static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
  static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
  vobj *send_buf;
  vobj *recv_buf;
  {
    // Communication buffers: presumably ShmBufferFreeAll releases every
    // earlier ShmBufferMalloc allocation before two plane-sized buffers are
    // taken -- TODO confirm against the communicator implementation.
    grid->ShmBufferFreeAll();
    size_t bytes = buffer_size*sizeof(vobj);
    send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
    recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
  }
  // Checkerboard used to compute the effective shift: Odd only for cbmask==0x2.
  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
  // Loop over the rd planes of the destination in this dimension.
  for(int x=0;x<rd;x++){       
    // sx: source plane index; comm_proc: processor offset holding that plane
    // (0 means the plane is local).
    int sx        =  (x+sshift)%rd;
    int comm_proc = ((x+sshift)/rd)%pd;
    if (comm_proc==0) {
      Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
    } else {
      // Only half a plane is exchanged unless both checkerboards are selected.
      int words = buffer_size;
      if (cbmask != 0x3) words=words>>1;
      int bytes = words * sizeof(vobj);
      Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
      //      int rank           = grid->_processor;
      int recv_from_rank;
      int xmit_to_rank;
      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
      grid->Barrier();
      // Staging vector -> communication buffer (argument order is presumably
      // source, destination, bytes -- confirm against the accelerator API).
      acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
      grid->SendToRecvFrom((void *)&send_buf[0],
 			   xmit_to_rank,
 			   (void *)&recv_buf[0],
 			   recv_from_rank,
 			   bytes);
      // Communication buffer -> staging vector, then scatter into plane x of ret.
      acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
      grid->Barrier();
      Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
    }
  }
 }
 template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
 {
  GridBase *grid=rhs.Grid();
  const int Nsimd = grid->Nsimd();
  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::scalar_object scalar_object;
  typedef typename vobj::scalar_type scalar_type;
  int fd = grid->_fdimensions[dimension];
  int rd = grid->_rdimensions[dimension];
  int ld = grid->_ldimensions[dimension];
  int pd = grid->_processors[dimension];
  int simd_layout     = grid->_simd_layout[dimension];
  int comm_dim        = grid->_processors[dimension] >1 ;
  //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
  //    << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout 
  //    << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
  assert(comm_dim==1);
  assert(simd_layout==2);
  assert(shift>=0);
  assert(shift<fd);
  int permute_type=grid->PermuteType(dimension);
  ///////////////////////////////////////////////
  // Simd direction uses an extract/merge pair
  ///////////////////////////////////////////////
  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
  //  int words = sizeof(vobj)/sizeof(vector_type);
  static std::vector<cshiftVector<scalar_object> >  send_buf_extract; send_buf_extract.resize(Nsimd);
  static std::vector<cshiftVector<scalar_object> >  recv_buf_extract; recv_buf_extract.resize(Nsimd);
  scalar_object *  recv_buf_extract_mpi;
  scalar_object *  send_buf_extract_mpi;
  {
    size_t bytes = sizeof(scalar_object)*buffer_size;
    grid->ShmBufferFreeAll();
    send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
    recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
  }
  for(int s=0;s<Nsimd;s++){
    send_buf_extract[s].resize(buffer_size);
    recv_buf_extract[s].resize(buffer_size);
  }
  int bytes = buffer_size*sizeof(scalar_object);
  ExtractPointerArray<scalar_object>  pointers(Nsimd); // 
  ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
  ///////////////////////////////////////////
  // Work out what to send where
  ///////////////////////////////////////////
  int cb    = (cbmask==0x2)? Odd : Even;
  int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
  // loop over outer coord planes orthog to dim
  for(int x=0;x<rd;x++){       
    // FIXME call local permute copy if none are offnode.
    for(int i=0;i<Nsimd;i++){       
      pointers[i] = &send_buf_extract[i][0];
    }
    int sx   = (x+sshift)%rd;
    Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
    for(int i=0;i<Nsimd;i++){
      int inner_bit = (Nsimd>>(permute_type+1));
      int ic= (i&inner_bit)? 1:0;
      int my_coor          = rd*ic + x;
      int nbr_coor         = my_coor+sshift;
      int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
      int nbr_ic   = (nbr_coor%ld)/rd;    // inner coord of peer
      int nbr_ox   = (nbr_coor%rd);       // outer coord of peer
      int nbr_lane = (i&(~inner_bit));
      int recv_from_rank;
      int xmit_to_rank;
      if (nbr_ic) nbr_lane|=inner_bit;
      assert (sx == nbr_ox);
      if(nbr_proc){
 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
 	grid->Barrier();
 	acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
 	grid->SendToRecvFrom((void *)send_buf_extract_mpi,
 			     xmit_to_rank,
 			     (void *)recv_buf_extract_mpi,
 			     recv_from_rank,
 			     bytes);
 	acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
 	grid->Barrier();
 	rpointers[i] = &recv_buf_extract[i][0];
      } else { 
@@ -461,7 +258,7 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  }
 }
-#endif
+
 NAMESPACE_END(Grid); 
 #endif
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@@ -36,8 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_local.h>
 #include <Grid/lattice/Lattice_reduction.h>
 #include <Grid/lattice/Lattice_peekpoke.h>
-#include <Grid/lattice/Lattice_reality.h>
+//#include <Grid/lattice/Lattice_reality.h>
 #include <Grid/lattice/Lattice_real_imag.h>
 #include <Grid/lattice/Lattice_comparison_utils.h>
 #include <Grid/lattice/Lattice_comparison.h>
 #include <Grid/lattice/Lattice_coordinate.h>
@@ -46,4 +45,3 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_unary.h>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
 #include <Grid/lattice/Lattice_crc.h>
--- a/Grid/lattice/Lattice_ET.h
+++ b/Grid/lattice/Lattice_ET.h
@@ -42,24 +42,9 @@ NAMESPACE_BEGIN(Grid);
 ////////////////////////////////////////////////////
 // Predicated where support
 ////////////////////////////////////////////////////
 #ifdef GRID_SIMT
 // drop to scalar in SIMT; cleaner in fact
 template <class iobj, class vobj, class robj>
-accelerator_inline vobj predicatedWhere(const iobj &predicate, 
+accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
-					const vobj &iftrue, 
+                            const robj &iffalse) {
 					const robj &iffalse) 
 {
  Integer mask = TensorRemove(predicate);
  typename std::remove_const<vobj>::type ret= iffalse;
  if (mask) ret=iftrue;
  return ret;
 }
 #else
 template <class iobj, class vobj, class robj>
 accelerator_inline vobj predicatedWhere(const iobj &predicate, 
 					const vobj &iftrue, 
 					const robj &iffalse) 
 {
  typename std::remove_const<vobj>::type ret;
  typedef typename vobj::scalar_object scalar_object;
@@ -83,7 +68,6 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate,
  merge(ret, falsevals);
  return ret;
 }
 #endif
 /////////////////////////////////////////////////////
 //Specialization of getVectorType for lattices
@@ -97,62 +81,32 @@ struct getVectorType<Lattice<T> >{
 //--  recursive evaluation of expressions; --
 // handle leaves of syntax tree
 ///////////////////////////////////////////////////
-template<class sobj,
+template<class sobj> accelerator_inline 
  typename std::enable_if<!is_lattice<sobj>::value&&!is_lattice_expr<sobj>::value,sobj>::type * = nullptr> 
 accelerator_inline 
 sobj eval(const uint64_t ss, const sobj &arg)
 {
  return arg;
 }
 // Leaf of the expression tree: a lattice view.  Evaluates to arg(ss), i.e. the
 // view's call operator applied to the site index; the return type is whatever
 // that call yields.
 template <class lobj> accelerator_inline 
 auto eval(const uint64_t ss, const LatticeView<lobj> &arg) -> decltype(arg(ss))
 {
  return arg(ss);
 }
 ////////////////////////////////////////////
 //--  recursive evaluation of expressions; --
 // whole vector return, used only for expression return type inference
 ///////////////////////////////////////////////////
 // Leaf of the expression tree for the whole-vector evaluation path: the
 // argument is returned by value unchanged; the site index ss is not used.
 template<class sobj> accelerator_inline 
 sobj vecEval(const uint64_t ss, const sobj &arg)
 {
  return arg;
 }
 template <class lobj> accelerator_inline 
-const lobj & vecEval(const uint64_t ss, const LatticeView<lobj> &arg) 
+const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg) 
 {
  return arg[ss];
 }
-///////////////////////////////////////////////////
+// What needs this?
-// handle nodes in syntax tree- eval one operand
+// Cannot be legal on accelerator
-// vecEval needed (but never called as all expressions offloaded) to infer the return type
+// Comparison must convert
-// in SIMT contexts of closure.
+#if 1
-///////////////////////////////////////////////////
+template <class lobj> accelerator_inline 
-template <typename Op, typename T1> accelerator_inline 
+const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg) 
 auto vecEval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)  
  -> decltype(expr.op.func( vecEval(ss, expr.arg1)))
 {
-  return expr.op.func( vecEval(ss, expr.arg1) );
+  auto view = arg.View(AcceleratorRead);
-}
+  return view[ss];
 // vecEval two operands: applies the node's op.func to the vecEval results of
 // both children (arg1, arg2) at site index ss.  The trailing return type
 // repeats the body expression so the result type is deduced from it.
 template <typename Op, typename T1, typename T2> accelerator_inline
 auto vecEval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)  
  -> decltype(expr.op.func( vecEval(ss,expr.arg1),vecEval(ss,expr.arg2)))
 {
  return expr.op.func( vecEval(ss,expr.arg1), vecEval(ss,expr.arg2) );
 }
 // vecEval three operands: applies the node's op.func to the vecEval results
 // of all three children (arg1, arg2, arg3) at site index ss.  The trailing
 // return type repeats the body expression so the result type is deduced from it.
 template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
 auto vecEval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)  
  -> decltype(expr.op.func(vecEval(ss, expr.arg1), vecEval(ss, expr.arg2), vecEval(ss, expr.arg3)))
 {
  return expr.op.func(vecEval(ss, expr.arg1), vecEval(ss, expr.arg2), vecEval(ss, expr.arg3));
 }
 #endif
 ///////////////////////////////////////////////////
-// handle nodes in syntax tree- eval one operand coalesced
+// handle nodes in syntax tree- eval one operand
 ///////////////////////////////////////////////////
 template <typename Op, typename T1> accelerator_inline 
 auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)  
@@ -160,41 +114,23 @@ auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
 {
  return expr.op.func( eval(ss, expr.arg1) );
 }
 ///////////////////////
 // eval two operands
 ///////////////////////
 // Binary node of the expression tree: recursively evaluates both children at
 // site index ss and combines them with the node's op.func.  The trailing
 // return type repeats the body expression so the result type is deduced from it.
 template <typename Op, typename T1, typename T2> accelerator_inline
 auto eval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)  
  -> decltype(expr.op.func( eval(ss,expr.arg1),eval(ss,expr.arg2)))
 {
  return expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) );
 }
 ///////////////////////
 // eval three operands
 ///////////////////////
 template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
 auto eval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)  
-  -> decltype(expr.op.func(eval(ss, expr.arg1), 
+  -> decltype(expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3)))
 			   eval(ss, expr.arg2), 
 			   eval(ss, expr.arg3)))
 {
-#ifdef GRID_SIMT
+  return expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3));
  // Handles Nsimd (vInteger) != Nsimd(ComplexD)
  typedef decltype(vecEval(ss, expr.arg2)) rvobj;
  typedef typename std::remove_reference<rvobj>::type vobj;
  const int Nsimd = vobj::vector_type::Nsimd();
  auto vpred = vecEval(ss,expr.arg1);
  ExtractBuffer<Integer> mask(Nsimd);
  extract<vInteger, Integer>(TensorRemove(vpred), mask);
  int s = acceleratorSIMTlane(Nsimd);
  return expr.op.func(mask[s],
 		      eval(ss, expr.arg2), 
 		      eval(ss, expr.arg3));
 #else
  return expr.op.func(eval(ss, expr.arg1),
 		      eval(ss, expr.arg2), 
 		      eval(ss, expr.arg3));
 #endif
 }
 //////////////////////////////////////////////////////////////////////////
@@ -292,7 +228,7 @@ template <typename Op, typename T1, typename T2> inline
 void ExpressionViewOpen(LatticeBinaryExpression<Op, T1, T2> &expr) 
 {
  ExpressionViewOpen(expr.arg1);  // recurse AST
-  ExpressionViewOpen(expr.arg2);  // rrecurse AST
+  ExpressionViewOpen(expr.arg2);  // recurse AST
 }
 template <typename Op, typename T1, typename T2, typename T3>
 inline void ExpressionViewOpen(LatticeTrinaryExpression<Op, T1, T2, T3> &expr) 
@@ -336,20 +272,28 @@ inline void ExpressionViewClose(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
 // Unary operators and funcs
 ////////////////////////////////////////////
 #define GridUnopClass(name, ret)					\
  template <class arg>							\
  struct name {								\
-    template<class _arg> static auto accelerator_inline func(const _arg a) -> decltype(ret) { return ret; } \
+    static auto accelerator_inline func(const arg a) -> decltype(ret) { return ret; } \
  };
 GridUnopClass(UnarySub, -a);
 GridUnopClass(UnaryNot, Not(a));
 GridUnopClass(UnaryAdj, adj(a));
 GridUnopClass(UnaryConj, conjugate(a));
 GridUnopClass(UnaryTrace, trace(a));
 GridUnopClass(UnaryTranspose, transpose(a));
 GridUnopClass(UnaryTa, Ta(a));
 GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
 GridUnopClass(UnaryReal, real(a));
 GridUnopClass(UnaryImag, imag(a));
 GridUnopClass(UnaryToReal, toReal(a));
 GridUnopClass(UnaryToComplex, toComplex(a));
 GridUnopClass(UnaryTimesI, timesI(a));
 GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
 GridUnopClass(UnaryAbs, abs(a));
 GridUnopClass(UnarySqrt, sqrt(a));
 GridUnopClass(UnaryRsqrt, rsqrt(a));
 GridUnopClass(UnarySin, sin(a));
 GridUnopClass(UnaryCos, cos(a));
 GridUnopClass(UnaryAsin, asin(a));
@@ -361,10 +305,10 @@ GridUnopClass(UnaryExp, exp(a));
 // Binary operators
 ////////////////////////////////////////////
 #define GridBinOpClass(name, combination)			\
  template <class left, class right>				\
  struct name {							\
    template <class _left, class _right>			\
    static auto accelerator_inline				\
-    func(const _left &lhs, const _right &rhs)			\
+    func(const left &lhs, const right &rhs)			\
      -> decltype(combination) const				\
    {								\
      return combination;					\
@@ -384,10 +328,10 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
 // Trinary conditional op
 ////////////////////////////////////////////////////
 #define GridTrinOpClass(name, combination)				\
  template <class predicate, class left, class right>			\
  struct name {								\
    template <class _predicate,class _left, class _right>		\
    static auto accelerator_inline					\
-    func(const _predicate &pred, const _left &lhs, const _right &rhs)	\
+    func(const predicate &pred, const left &lhs, const right &rhs)	\
      -> decltype(combination) const					\
    {									\
      return combination;						\
@@ -395,17 +339,17 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
  };
 GridTrinOpClass(TrinaryWhere,
-		(predicatedWhere<
+		(predicatedWhere<predicate, 
-		 typename std::remove_reference<_predicate>::type, 
+		 typename std::remove_reference<left>::type,
-		 typename std::remove_reference<_left>::type,
+		 typename std::remove_reference<right>::type>(pred, lhs,rhs)));
 		 typename std::remove_reference<_right>::type>(pred, lhs,rhs)));
 ////////////////////////////////////////////
 // Operator syntactical glue
 ////////////////////////////////////////////
-#define GRID_UNOP(name)   name
+
-#define GRID_BINOP(name)  name
+#define GRID_UNOP(name)   name<decltype(eval(0, arg))>
-#define GRID_TRINOP(name) name
+#define GRID_BINOP(name)  name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
 #define GRID_TRINOP(name) name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
 #define GRID_DEF_UNOP(op, name)						\
  template <typename T1, typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
@@ -451,17 +395,22 @@ GridTrinOpClass(TrinaryWhere,
 GRID_DEF_UNOP(operator-, UnarySub);
 GRID_DEF_UNOP(Not, UnaryNot);
 GRID_DEF_UNOP(operator!, UnaryNot);
-//GRID_DEF_UNOP(adj, UnaryAdj);
+GRID_DEF_UNOP(adj, UnaryAdj);
-//GRID_DEF_UNOP(conjugate, UnaryConj);
+GRID_DEF_UNOP(conjugate, UnaryConj);
 GRID_DEF_UNOP(trace, UnaryTrace);
 GRID_DEF_UNOP(transpose, UnaryTranspose);
 GRID_DEF_UNOP(Ta, UnaryTa);
 GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
 GRID_DEF_UNOP(real, UnaryReal);
 GRID_DEF_UNOP(imag, UnaryImag);
 GRID_DEF_UNOP(toReal, UnaryToReal);
 GRID_DEF_UNOP(toComplex, UnaryToComplex);
 GRID_DEF_UNOP(timesI, UnaryTimesI);
 GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
 GRID_DEF_UNOP(abs, UnaryAbs);  // abs overloaded in cmath C++98; DON'T do the
                               // abs-fabs-dabs-labs thing
 GRID_DEF_UNOP(sqrt, UnarySqrt);
 GRID_DEF_UNOP(rsqrt, UnaryRsqrt);
 GRID_DEF_UNOP(sin, UnarySin);
 GRID_DEF_UNOP(cos, UnaryCos);
 GRID_DEF_UNOP(asin, UnaryAsin);
@@ -486,36 +435,29 @@ GRID_DEF_TRINOP(where, TrinaryWhere);
 /////////////////////////////////////////////////////////////
 template <class Op, class T1>
 auto closure(const LatticeUnaryExpression<Op, T1> &expr)
-  -> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1)))>::type > 
+  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> 
 {
-  Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1)))>::type > ret(expr);
+  Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> ret(expr);
  return ret;
 }
 template <class Op, class T1, class T2>
 auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
-  -> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>::type >
+  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> 
 {
-  Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>::type > ret(expr);
+  Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> ret(expr);
  return ret;
 }
 template <class Op, class T1, class T2, class T3>
 auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-  -> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),
+  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
-				   vecEval(0, expr.arg2),
+				   eval(0, expr.arg2),
-				   vecEval(0, expr.arg3)))>::type >
+				   eval(0, expr.arg3)))> 
 {
-  Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),
+  Lattice<decltype(expr.op.func(eval(0, expr.arg1),
-				vecEval(0, expr.arg2),
+				eval(0, expr.arg2),
-			        vecEval(0, expr.arg3)))>::type >  ret(expr);
+				eval(0, expr.arg3)))>  ret(expr);
  return ret;
 }
 #define EXPRESSION_CLOSURE(function)					\
  template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> \
    auto function(Expression &expr) -> decltype(function(closure(expr))) \
  {									\
    return function(closure(expr));					\
  }
 #undef GRID_UNOP
 #undef GRID_BINOP
--- a/Grid/lattice/Lattice_arith.h
+++ b/Grid/lattice/Lattice_arith.h
@@ -60,9 +60,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  autoView( lhs_v , lhs, AcceleratorRead);
  autoView( rhs_v , rhs, AcceleratorRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto lhs_t=lhs_v(ss);
    auto rhs_t=rhs_v(ss);
    auto tmp  =ret_v(ss);
    mac(&tmp,&lhs_t,&rhs_t);
    coalescedWrite(ret_v[ss],tmp);
  });
@@ -124,7 +124,7 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  autoView( ret_v , ret, AcceleratorWrite);
  autoView( lhs_v , lhs, AcceleratorRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
-    auto tmp  =ret_v(ss);
+    decltype(coalescedRead(obj1())) tmp;
    auto lhs_t=lhs_v(ss);
    mac(&tmp,&lhs_t,&rhs);
    coalescedWrite(ret_v[ss],tmp);
@@ -182,7 +182,7 @@ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  autoView( ret_v , ret, AcceleratorWrite);
  autoView( rhs_v , lhs, AcceleratorRead);
  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
-    auto tmp  =ret_v(ss);
+    decltype(coalescedRead(obj1())) tmp;
    auto rhs_t=rhs_v(ss);
    mac(&tmp,&lhs,&rhs_t);
    coalescedWrite(ret_v[ss],tmp);
@@ -225,7 +225,7 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
  autoView( x_v , x, AcceleratorRead);
  autoView( y_v , y, AcceleratorRead);
  accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
-    auto tmp = a*coalescedRead(x_v[ss])+coalescedRead(y_v[ss]);
+    auto tmp = a*x_v(ss)+y_v(ss);
    coalescedWrite(ret_v[ss],tmp);
  });
 }
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -123,9 +123,9 @@ public:
    auto exprCopy = expr;
    ExpressionViewOpen(exprCopy);
    auto me  = View(AcceleratorWriteDiscard);
-    accelerator_for(ss,me.size(),vobj::Nsimd(),{
+    accelerator_for(ss,me.size(),1,{
      auto tmp = eval(ss,exprCopy);
-      coalescedWrite(me[ss],tmp);
+      vstream(me[ss],tmp);
    });
    me.ViewClose();
    ExpressionViewClose(exprCopy);
@@ -146,9 +146,9 @@ public:
    auto exprCopy = expr;
    ExpressionViewOpen(exprCopy);
    auto me  = View(AcceleratorWriteDiscard);
-    accelerator_for(ss,me.size(),vobj::Nsimd(),{
+    accelerator_for(ss,me.size(),1,{
      auto tmp = eval(ss,exprCopy);
-      coalescedWrite(me[ss],tmp);
+      vstream(me[ss],tmp);
    });
    me.ViewClose();
    ExpressionViewClose(exprCopy);
@@ -168,9 +168,9 @@ public:
    auto exprCopy = expr;
    ExpressionViewOpen(exprCopy);
    auto me  = View(AcceleratorWriteDiscard);
-    accelerator_for(ss,me.size(),vobj::Nsimd(),{
+    accelerator_for(ss,me.size(),1,{
      auto tmp = eval(ss,exprCopy);
-      coalescedWrite(me[ss],tmp);
+      vstream(me[ss],tmp);
    });
    me.ViewClose();
    ExpressionViewClose(exprCopy);
--- a/Grid/lattice/Lattice_basis.h
+++ b/Grid/lattice/Lattice_basis.h
@@ -54,34 +54,13 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
  typedef decltype(basis[0].View(AcceleratorRead)) View;
  Vector<View> basis_v; basis_v.reserve(basis.size());
  typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
  typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
  GridBase* grid = basis[0].Grid();
  for(int k=0;k<basis.size();k++){
    basis_v.push_back(basis[k].View(AcceleratorWrite));
  }
-#if ( (!defined(GRID_CUDA)) )
+
  int max_threads = thread_max();
  Vector < vobj > Bt(Nm * max_threads);
  thread_region
    {
      vobj* B = &Bt[Nm * thread_num()];
      thread_for_in_region(ss, grid->oSites(),{
 	  for(int j=j0; j<j1; ++j) B[j]=0.;
 	  for(int j=j0; j<j1; ++j){
 	    for(int k=k0; k<k1; ++k){
 	      B[j] +=Qt(j,k) * basis_v[k][ss];
 	    }
 	  }
 	  for(int j=j0; j<j1; ++j){
 	    basis_v[j][ss] = B[j];
 	  }
 	});
    }
 #else
  View *basis_vp = &basis_v[0];
  int nrot = j1-j0;
@@ -91,12 +70,14 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
  uint64_t oSites   =grid->oSites();
  uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
  typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
  Vector <vobj> Bt(siteBlock * nrot); 
  auto Bp=&Bt[0];
  // GPU readable copy of matrix
-  Vector<Coeff_t> Qt_jv(Nm*Nm);
+  Vector<double> Qt_jv(Nm*Nm);
-  Coeff_t *Qt_p = & Qt_jv[0];
+  double *Qt_p = & Qt_jv[0];
  thread_for(i,Nm*Nm,{
      int j = i/Nm;
      int k = i%Nm;
@@ -125,7 +106,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
 	for(int k=k0; k<k1; ++k){
 	  auto tmp = coalescedRead(Bp[ss*nrot+j]);
-	  coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_vp[k][sss]));
+	  coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss]));
 	}
      });
@@ -134,10 +115,9 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
 	int jj  =j0+j;
 	int ss =sj/nrot;
 	int sss=ss+s;
-	coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
+	coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
      });
  }
 #endif
  for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
 }
@@ -161,13 +141,11 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
  double * Qt_j = & Qt_jv[0];
  for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
  auto basis_vp=& basis_v[0];
  autoView(result_v,result,AcceleratorWrite);
  accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
-    vobj zzz=Zero();
+    auto B=coalescedRead(zz);
    auto B=coalescedRead(zzz);
    for(int k=k0; k<k1; ++k){
-      B +=Qt_j[k] * coalescedRead(basis_vp[k][ss]);
+      B +=Qt_j[k] * coalescedRead(basis_v[k][ss]);
    }
    coalescedWrite(result_v[ss], B);
  });
--- a/Grid/lattice/Lattice_comparison.h
+++ b/Grid/lattice/Lattice_comparison.h
@@ -42,6 +42,34 @@ NAMESPACE_BEGIN(Grid);
 typedef iScalar<vInteger> vPredicate ;
 /*
 template <class iobj, class vobj, class robj> accelerator_inline 
 vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, const robj &iffalse) 
 {
  typename std::remove_const<vobj>::type ret;
  typedef typename vobj::scalar_object scalar_object;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  const int Nsimd = vobj::vector_type::Nsimd();
  ExtractBuffer<Integer> mask(Nsimd);
  ExtractBuffer<scalar_object> truevals(Nsimd);
  ExtractBuffer<scalar_object> falsevals(Nsimd);
  extract(iftrue, truevals);
  extract(iffalse, falsevals);
  extract<vInteger, Integer>(TensorRemove(predicate), mask);
  for (int s = 0; s < Nsimd; s++) {
    if (mask[s]) falsevals[s] = truevals[s];
  }
  merge(ret, falsevals);
  return ret;
 }
 */
 //////////////////////////////////////////////////////////////////////////
 // compare lattice to lattice
 //////////////////////////////////////////////////////////////////////////
--- a/Grid/lattice/Lattice_crc.h
+++ b/Grid/lattice/Lattice_crc.h
@@ -1,55 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_crc.h
    Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1)
 {
  auto ff = localNorm2(f);
  if ( mu==-1 ) mu = f.Grid()->Nd()-1;
  typedef typename vobj::tensor_reduced normtype;
  typedef typename normtype::scalar_object scalar;
  std::vector<scalar> sff;
  sliceSum(ff,sff,mu);
  for(int t=0;t<sff.size();t++){
    std::cout << s<<" "<<t<<" "<<sff[t]<<std::endl;
  }
 }
 // Fingerprint of a lattice: CRC32 (global ::crc32, seeded with 0) over
 // sizeof(vobj)*oSites() bytes starting at element 0 of a CpuRead view.
 template<class vobj> uint32_t crc(Lattice<vobj> & buf)
 {
  autoView( host_v , buf, CpuRead);
  unsigned char *first_byte = (unsigned char *)&host_v[0];
  return ::crc32(0L,
		 first_byte,
		 (size_t)sizeof(vobj)*buf.oSites());
 }
 #define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
 NAMESPACE_END(Grid);
--- a/Grid/lattice/Lattice_peekpoke.h
+++ b/Grid/lattice/Lattice_peekpoke.h
@@ -182,14 +182,6 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
  return;
 };
 template<class vobj,class sobj>
 inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site)
 {
  autoView(lv,l,CpuRead);
  peekLocalSite(s,lv,site);
  return;
 };
 // Must be CPU write view
 template<class vobj,class sobj>
 inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
@@ -218,14 +210,6 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
  return;
 };
 template<class vobj,class sobj>
 inline void pokeLocalSite(const sobj &s, Lattice<vobj> &l,Coordinate &site)
 {
  autoView(lv,l,CpuWrite);
  pokeLocalSite(s,lv,site);
  return;
 };
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/lattice/Lattice_real_imag.h
+++ b/Grid/lattice/Lattice_real_imag.h
@@ -1,79 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_reality.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_LATTICE_REAL_IMAG_H
 #define GRID_LATTICE_REAL_IMAG_H
 // FIXME: this is the part of the code whose
 // design direction I am most worried about.
 // The choice of burying complex in the SIMD types
 // makes the use of "real" and "imag" very cumbersome
 NAMESPACE_BEGIN(Grid);
 // Lattice-wide real(): builds a new lattice on lhs's grid, copies lhs's checkerboard,
 // and sets the element at each index of the view to real() of the matching lhs element.
 template<class vobj> inline Lattice<vobj> real(const Lattice<vobj> &lhs){
   Lattice<vobj> result(lhs.Grid());
   autoView( in_v , lhs   , AcceleratorRead);
   autoView( out_v, result, AcceleratorWrite);
   result.Checkerboard() = lhs.Checkerboard();
   accelerator_for( idx, in_v.size(), 1, {
     out_v[idx] = real(in_v[idx]);
   });
   return result;
 }
 // Lattice-wide imag(): builds a new lattice on lhs's grid, copies lhs's checkerboard,
 // and sets the element at each index of the view to imag() of the matching lhs element.
 template<class vobj> inline Lattice<vobj> imag(const Lattice<vobj> &lhs){
   Lattice<vobj> result(lhs.Grid());
   autoView( in_v , lhs   , AcceleratorRead);
   autoView( out_v, result, AcceleratorWrite);
   result.Checkerboard() = lhs.Checkerboard();
   accelerator_for( idx, in_v.size(), 1, {
     out_v[idx] = imag(in_v[idx]);
   });
   return result;
 }
 // real() of a lattice expression: evaluate the expression with closure(),
 // then apply the Lattice overload of real() to the evaluated field.
 template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
  auto real(const Expression &expr) -> decltype(real(closure(expr)))
 {
   auto evaluated = closure(expr);
   return real(evaluated);
 }
 // imag() of a lattice expression: evaluate the expression with closure(),
 // then apply the Lattice overload of imag() to the evaluated field.
 template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
  auto imag(const Expression &expr) -> decltype(imag(closure(expr)))
 {
   auto evaluated = closure(expr);
   return imag(evaluated);
 }
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/lattice/Lattice_reality.h
+++ b/Grid/lattice/Lattice_reality.h
@@ -45,8 +45,8 @@ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
  autoView( ret_v, ret, AcceleratorWrite);
  ret.Checkerboard()=lhs.Checkerboard();
-  accelerator_for( ss, lhs_v.size(), 1, {
+  accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
-     ret_v[ss] = adj(lhs_v[ss]);
+    coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
  });
  return ret;
 };
@@ -64,53 +64,6 @@ template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
  return ret;
 };
 // Lattice-wide toComplex(): builds a Lattice<vobj::Complexified> on lhs's grid, copies
 // lhs's checkerboard, and sets each element to toComplex() of the matching lhs element.
 template<class vobj> inline Lattice<typename vobj::Complexified> toComplex(const Lattice<vobj> &lhs){
   typedef typename vobj::Complexified cobj;
   Lattice<cobj> result(lhs.Grid());
   autoView( in_v , lhs   , AcceleratorRead);
   autoView( out_v, result, AcceleratorWrite);
   result.Checkerboard() = lhs.Checkerboard();
   accelerator_for( idx, in_v.size(), 1, {
     out_v[idx] = toComplex(in_v[idx]);
   });
   return result;
 }
 // Lattice-wide toReal(): builds a Lattice<vobj::Realified> on lhs's grid, copies
 // lhs's checkerboard, and sets each element to toReal() of the matching lhs element.
 template<class vobj> inline Lattice<typename vobj::Realified> toReal(const Lattice<vobj> &lhs){
   typedef typename vobj::Realified robj;
   Lattice<robj> result(lhs.Grid());
   autoView( in_v , lhs   , AcceleratorRead);
   autoView( out_v, result, AcceleratorWrite);
   result.Checkerboard() = lhs.Checkerboard();
   accelerator_for( idx, in_v.size(), 1, {
     out_v[idx] = toReal(in_v[idx]);
   });
   return result;
 }
 // toComplex() of a lattice expression: evaluate the expression with closure(),
 // then apply the Lattice overload of toComplex().
 // The return type is deduced from the call actually returned. The Lattice overload
 // yields Lattice<vobj::Complexified>, which is not decltype(closure(expr)) in general.
 // This matches how the real()/imag() expression overloads declare their return type.
 template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
 auto toComplex(const Expression &expr)  -> decltype(toComplex(closure(expr)))
 {
   return toComplex(closure(expr));
 }
 // toReal() of a lattice expression: evaluate the expression with closure(),
 // then apply the Lattice overload of toReal().
 // The return type is deduced from the call actually returned. The Lattice overload
 // yields Lattice<vobj::Realified>, which is not decltype(closure(expr)) in general.
 // This matches how the real()/imag() expression overloads declare their return type.
 template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
 auto toReal(const Expression &expr)  -> decltype(toReal(closure(expr)))
 {
   return toReal(closure(expr));
 }
 // adj() of a lattice expression: evaluate the expression with closure(),
 // then apply the Lattice overload of adj() to the evaluated field.
 template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
 auto adj(const Expression &expr)  -> decltype(closure(expr))
 {
   auto evaluated = closure(expr);
   return adj(evaluated);
 }
 // conjugate() of a lattice expression: evaluate the expression with closure(),
 // then apply the Lattice overload of conjugate() to the evaluated field.
 template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
 auto conjugate(const Expression &expr)  -> decltype(closure(expr))
 {
   auto evaluated = closure(expr);
   return conjugate(evaluated);
 }
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -96,34 +96,8 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
  ssobj ret = ssum;
  return ret;
 }
 /*
 Threaded max, don't use for now
 template<class Double>
 inline Double max(const Double *arg, Integer osites)
 {
  //  const int Nsimd = vobj::Nsimd();
  const int nthread = GridThread::GetThreads();
-  std::vector<Double> maxarray(nthread);
+
  thread_for(thr,nthread, {
    int nwork, mywork, myoff;
    nwork = osites;
    GridThread::GetWork(nwork,thr,mywork,myoff);
    Double max=arg[0];
    for(int ss=myoff;ss<mywork+myoff; ss++){
      if( arg[ss] > max ) max = arg[ss];
    }
    maxarray[thr]=max;
  });
  Double tmax=maxarray[0];
  for(int i=0;i<nthread;i++){
    if (maxarray[i]>tmax) tmax = maxarray[i];
  } 
  return tmax;
 }
 */
 template<class vobj>
 inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 {
@@ -167,32 +141,6 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
  return real(nrm); 
 }
 //The global maximum of the site norm2
 // Computes localNorm2(arg), scans every local site (grid->lSites()) for the largest
 // value, passes that local maximum through grid->GlobalMax and returns the result.
 template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
 {
   typedef typename vobj::tensor_reduced vscalar;  //iScalar<iScalar<.... <vPODtype> > >
   typedef typename vscalar::scalar_object  scalar;   //iScalar<iScalar<.... <PODtype> > >
   Lattice<vscalar> inner = localNorm2(arg);
   auto grid = arg.Grid();
   // Initialised so GlobalMax never reads an indeterminate value when there are no local sites
   RealD max = 0.0;
   {
     // Open the CPU view once for the whole scan instead of once per site
     autoView( inner_v, inner, CpuRead);
     for(int l=0;l<grid->lSites();l++){
       Coordinate coor;
       scalar val;
       grid->LocalIndexToLocalCoor(l,coor);
       peekLocalSite(val,inner_v,coor);
       RealD r=real(TensorRemove(val));
       // The first site always seeds the running maximum
       if( (l==0) || (r>max)){
         max=r;
       }
     }
   }
   grid->GlobalMax(max);
   return max;
 }
 // Double inner product
 template<class vobj>
 inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
@@ -361,7 +309,6 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  // But easily avoided by using double precision fields
  ///////////////////////////////////////////////////////
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_object::scalar_type scalar_type;
  GridBase  *grid = Data.Grid();
  assert(grid!=NULL);
@@ -420,19 +367,20 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  }
  // sum over nodes.
  sobj gsum;
  for(int t=0;t<fd;t++){
    int pt = t/ld; // processor plane
    int lt = t%ld;
    if ( pt == grid->_processor_coor[orthogdim] ) {
-      result[t]=lsSum[lt];
+      gsum=lsSum[lt];
    } else {
-      result[t]=Zero();
+      gsum=Zero();
    }
    grid->GlobalSum(gsum);
    result[t]=gsum;
  }
  scalar_type * ptr = (scalar_type *) &result[0];
  int words = fd*sizeof(sobj)/sizeof(scalar_type);
  grid->GlobalSumVector(ptr, words);
 }
 template<class vobj>
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -2,13 +2,12 @@ NAMESPACE_BEGIN(Grid);
 #ifdef GRID_HIP
 extern hipDeviceProp_t *gpu_props;
 #define WARP_SIZE 64
 #endif
 #ifdef GRID_CUDA
 extern cudaDeviceProp *gpu_props;
 #define WARP_SIZE 32
 #endif
 #define WARP_SIZE 32
 __device__ unsigned int retirementCount = 0;
 template <class Iterator>
@@ -65,7 +64,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
  // cannot use overloaded operators for sobj as they are not volatile-qualified
  memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
-  acceleratorSynchronise();
+  __syncwarp();
  const Iterator VEC = WARP_SIZE;
  const Iterator vid = tid & (VEC-1);
@@ -79,9 +78,9 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
      beta += temp;
      memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
    }
-    acceleratorSynchronise();
+    __syncwarp();
  }
-  acceleratorSynchroniseAll();
+  __syncthreads();
  if (threadIdx.x == 0) {
    beta  = Zero();
@@ -91,7 +90,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
    }
    memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
  }
-  acceleratorSynchroniseAll();
+  __syncthreads();
 }
--- a/Grid/lattice/Lattice_rng.h
+++ b/Grid/lattice/Lattice_rng.h
@@ -32,9 +32,8 @@
 #include <random>
 #ifdef RNG_SITMO
-#include <Grid/random/sitmo_prng_engine.hpp>
+#include <Grid/sitmo_rng/sitmo_prng_engine.hpp>
 #endif 
 #include <Grid/random/gaussian.h>
 #if defined(RNG_SITMO)
 #define RNG_FAST_DISCARD
@@ -143,7 +142,7 @@ public:
  std::vector<RngEngine>                             _generators;
  std::vector<std::uniform_real_distribution<RealD> > _uniform;
-  std::vector<Grid::gaussian_distribution<RealD> >    _gaussian;
+  std::vector<std::normal_distribution<RealD> >       _gaussian;
  std::vector<std::discrete_distribution<int32_t> >   _bernoulli;
  std::vector<std::uniform_int_distribution<uint32_t> > _uid;
@@ -244,7 +243,7 @@ public:
  GridSerialRNG() : GridRNGbase() {
    _generators.resize(1);
    _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
-    _gaussian.resize(1,gaussian_distribution<RealD>(0.0,1.0) );
+    _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
    _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
    _uid.resize(1,std::uniform_int_distribution<uint32_t>() );
  }
@@ -358,7 +357,7 @@ public:
    _generators.resize(_vol);
    _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
-    _gaussian.resize(_vol,gaussian_distribution<RealD>(0.0,1.0) );
+    _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
    _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
    _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
  }
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -97,20 +97,6 @@ accelerator_inline void convertType(ComplexF & out, const std::complex<float> &
  out = in;
 }
 // Same-type conversion, enabled only when isGridFundamental<T> holds: plain assignment of in to out.
 template<typename T>
 accelerator_inline EnableIf<isGridFundamental<T>> convertType(T & out, const T & in) {
  out = in;
 }
 // This would allow for conversions between GridFundamental types, but is not strictly needed as yet
 /*template<typename T1, typename T2>
 accelerator_inline typename std::enable_if<isGridFundamental<T1>::value && isGridFundamental<T2>::value>::type
 // Or to make this very broad, conversions between anything that's not a GridTensor could be allowed
 //accelerator_inline typename std::enable_if<!isGridTensor<T1>::value && !isGridTensor<T2>::value>::type
 convertType(T1 & out, const T2 & in) {
  out = in;
 }*/
 #ifdef GRID_SIMT
 accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
  ((ComplexF*)&out)[acceleratorSIMTlane(vComplexF::Nsimd())] = in;
@@ -131,18 +117,18 @@ accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) {
  Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v);
 }
-template<typename T1,typename T2>
+template<typename T1,typename T2,int N>
-accelerator_inline void convertType(iScalar<T1> & out, const iScalar<T2> & in) {
+  accelerator_inline void convertType(iMatrix<T1,N> & out, const iMatrix<T2,N> & in);
-  convertType(out._internal,in._internal);
+template<typename T1,typename T2,int N>
-}
+  accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & in);
-template<typename T1,typename T2>
+template<typename T1,typename T2, typename std::enable_if<!isGridScalar<T1>::value, T1>::type* = nullptr>
-accelerator_inline NotEnableIf<isGridScalar<T1>> convertType(T1 & out, const iScalar<T2> & in) {
+accelerator_inline void convertType(T1 & out, const iScalar<T2> & in) {
  convertType(out,in._internal);
 }
 template<typename T1,typename T2>
-accelerator_inline NotEnableIf<isGridScalar<T2>> convertType(iScalar<T1> & out, const T2 & in) {
+accelerator_inline void convertType(iScalar<T1> & out, const T2 & in) {
  convertType(out._internal,in);
 }
@@ -159,6 +145,11 @@ accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & i
    convertType(out._internal[i],in._internal[i]);
 }
 // Same-type conversion, enabled only when isGridFundamental<T>::value is true: plain assignment of in to out.
 template<typename T, typename std::enable_if<isGridFundamental<T>::value, T>::type* = nullptr>
 accelerator_inline void convertType(T & out, const T & in) {
  out = in;
 }
 template<typename T1,typename T2>
 accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
  autoView( out_v , out,AcceleratorWrite);
@@ -249,8 +240,6 @@ template<class vobj,class vobj2,class CComplex>
  autoView( fineX_  , fineX, AcceleratorRead);
  autoView( fineY_  , fineY, AcceleratorRead);
  autoView( coarseA_, coarseA, AcceleratorRead);
  Coordinate fine_rdimensions = fine->_rdimensions;
  Coordinate coarse_rdimensions = coarse->_rdimensions;
  accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
@@ -258,9 +247,9 @@ template<class vobj,class vobj2,class CComplex>
      Coordinate coor_c(_ndimension);
      Coordinate coor_f(_ndimension);
-      Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions);
+      Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
      for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
-      Lexicographic::IndexFromCoor(coor_c,sc,coarse_rdimensions);
+      Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
      // z = A x + y
 #ifdef GRID_SIMT
@@ -364,22 +353,13 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
  autoView( coarseData_ , coarseData, AcceleratorWrite);
  autoView( fineData_   , fineData, AcceleratorRead);
  auto coarseData_p = &coarseData_[0];
  auto fineData_p = &fineData_[0];
  Coordinate fine_rdimensions = fine->_rdimensions;
  Coordinate coarse_rdimensions = coarse->_rdimensions;
  vobj zz = Zero();
  accelerator_for(sc,coarse->oSites(),1,{
      // One thread per sub block
      Coordinate coor_c(_ndimension);
-      Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions);  // Block coordinate
+      Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions);  // Block coordinate
      coarseData_[sc]=Zero();
      vobj cd = zz;
      for(int sb=0;sb<blockVol;sb++){
 	int sf;
@@ -387,13 +367,11 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
 	Coordinate coor_f(_ndimension);
 	Lexicographic::CoorFromIndex(coor_b,sb,block_r);               // Block sub coordinate
 	for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
-	Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
+	Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
-	cd=cd+fineData_p[sf];
+	coarseData_[sc]=coarseData_[sc]+fineData_[sf];
      }
      coarseData_p[sc] = cd;
    });
  return;
 }
@@ -785,7 +763,7 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
 template<class vobj>
-void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
+void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
 {
  typedef typename vobj::scalar_object sobj;
@@ -1010,95 +988,53 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
  });
 }
 //Workspace for a precision change operation. It holds, in device memory, a table giving the
 //(input outer index, input lane) for every (output outer index, output lane), so that the
 //mapping can be built once and reused on subsequent calls.
 class precisionChangeWorkspace{
   std::pair<Integer,Integer>* fmap_device; //device pointer
 public:
   //Build the output -> input site/lane table on the host and copy it to the device.
   //Both grids must have the same number of dimensions and the same FullDimensions().
   precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid){
     typedef std::pair<Integer,Integer> map_entry;

     //The Grids cannot be used on the device, hence the explicit table
     assert(out_grid->Nd() == in_grid->Nd());
     for(int mu=0;mu<out_grid->Nd();mu++){
       assert(out_grid->FullDimensions()[mu] == in_grid->FullDimensions()[mu]);
     }

     int nsimd_out = out_grid->Nsimd();

     //Inner (lane) coordinates of the output grid, computed once and shared by all outer sites
     std::vector<Coordinate> lane_icoor(out_grid->Nsimd());
     for(int lane=0; lane < out_grid->Nsimd(); lane++){
       out_grid->iCoorFromIindex(lane_icoor[lane], lane);
     }

     std::vector<map_entry> host_map(out_grid->lSites()); //lsites = osites*Nsimd
     thread_for(osite,out_grid->oSites(),{
       Coordinate ocoor;
       out_grid->oCoorFromOindex(ocoor, osite);
       Coordinate lcoor; //local coordinate, common to the input and output grids
       for(int lane=0; lane < nsimd_out; lane++){
         out_grid->InOutCoorToLocalCoor(ocoor, lane_icoor[lane], lcoor);
         //oIndex/iIndex are deliberately not used here: they are not the inverse of
         //oCoorFromOindex/iCoorFromIindex on a checkerboarded lattice (the former are defined on
         //the full lattice, the latter on the reduced one). The indices are therefore assembled
         //by hand from coordinates defined on the reduced lattice.
         int src_osite = 0;
         int src_lane  = 0;
         for(int mu=0;mu<in_grid->_ndimension;mu++){
           src_osite += in_grid->_ostride[mu] * ( lcoor[mu] % in_grid->_rdimensions[mu] );
           src_lane  += in_grid->_istride[mu] * ( lcoor[mu] / in_grid->_rdimensions[mu] );
         }
         host_map[lane + nsimd_out*osite] = map_entry( src_osite, src_lane );
       }
     });

     //Copy the table to the device (with a way to tell whether an accelerator is in use,
     //this copy could be avoided on CPU-only machines)
     size_t map_bytes = out_grid->lSites() * sizeof(map_entry);
     fmap_device = (std::pair<Integer,Integer>*)acceleratorAllocDevice(map_bytes);
     acceleratorCopyToDevice(host_map.data(), fmap_device, map_bytes);
   }

   //Neither copyable nor movable: the object is the sole holder of the device allocation
   precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete;
   precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete;
   precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete;
   precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete;

   //Device pointer to the table, indexed as [output lane + Nsimd_out * output outer index]
   std::pair<Integer,Integer> const* getMap() const{ return fmap_device; }

   //Release the device allocation made in the constructor
   ~precisionChangeWorkspace(){
     acceleratorFreeDevice(fmap_device);
   }
 };
 //Convert a lattice of one precision to another using a pre-built workspace.
 //The workspace table supplies, for each (output outer index, output lane), the input
 //outer index and lane whose value is copied there with copyLane.
 //The checkerboard of out is set to that of in.
 template<class VobjOut, class VobjIn>
 void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
   static_assert( std::is_same<typename VobjOut::DoublePrecision, typename VobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same

   out.Checkerboard() = in.Checkerboard();

   constexpr int lanes_out = VobjOut::Nsimd();
   std::pair<Integer,Integer> const* site_map = workspace.getMap();

   //Do the copy/precision change
   autoView( out_v , out, AcceleratorWrite);
   autoView( in_v , in, AcceleratorRead);
   accelerator_for(osite, out.Grid()->oSites(), 1,{
     //Table entries for this output outer site, one per output lane
     std::pair<Integer,Integer> const* lane_map = site_map + osite*lanes_out;
     for(int lane=0; lane < lanes_out; lane++){
       int src_osite = lane_map[lane].first;
       int src_lane  = lane_map[lane].second;
       copyLane(out_v[osite], lane, in_v[src_osite], src_lane);
     }
   });
 }
 //Convert a Lattice from one precision to another
 //Generate the workspace in place; if multiple calls with the same mapping are performed, consider pregenerating the workspace and reusing
 template<class VobjOut, class VobjIn>
-void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
+void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
-  precisionChangeWorkspace workspace(out.Grid(), in.Grid());
+{
-  precisionChange(out, in, workspace);
+  assert(out.Grid()->Nd() == in.Grid()->Nd());
-}
+  for(int d=0;d<out.Grid()->Nd();d++){
    assert(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
  }
  out.Checkerboard() = in.Checkerboard();
  GridBase *in_grid=in.Grid();
  GridBase *out_grid = out.Grid();
  typedef typename VobjOut::scalar_object SobjOut;
  typedef typename VobjIn::scalar_object SobjIn;
  int ndim = out.Grid()->Nd();
  int out_nsimd = out_grid->Nsimd();
  std::vector<Coordinate > out_icoor(out_nsimd);
  for(int lane=0; lane < out_nsimd; lane++){
    out_icoor[lane].resize(ndim);
    out_grid->iCoorFromIindex(out_icoor[lane], lane);
  }
  std::vector<SobjOut> in_slex_conv(in_grid->lSites());
  unvectorizeToLexOrdArray(in_slex_conv, in);
  autoView( out_v , out, CpuWrite);
  thread_for(out_oidx,out_grid->oSites(),{
    Coordinate out_ocoor(ndim);
    out_grid->oCoorFromOindex(out_ocoor, out_oidx);
    ExtractPointerArray<SobjOut> ptrs(out_nsimd);      
    Coordinate lcoor(out_grid->Nd());
    for(int lane=0; lane < out_nsimd; lane++){
      for(int mu=0;mu<ndim;mu++)
 	lcoor[mu] = out_ocoor[mu] + out_grid->_rdimensions[mu]*out_icoor[lane][mu];
      int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions);
      ptrs[lane] = &in_slex_conv[llex];
    }
    merge(out_v[out_oidx], ptrs, 0);
  });
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Communicate between grids
--- a/Grid/lattice/Lattice_view.h
+++ b/Grid/lattice/Lattice_view.h
@@ -67,14 +67,9 @@ public:
  accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
 #endif
 #if 1
  //  accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
  accelerator_inline vobj       & operator[](size_t i) const { return this->_odata[i]; };
 #else
  accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
  accelerator_inline vobj       & operator[](size_t i)       { return this->_odata[i]; };
-#endif
+
  accelerator_inline uint64_t begin(void) const { return 0;};
  accelerator_inline uint64_t end(void)   const { return this->_odata_size; };
  accelerator_inline uint64_t size(void)  const { return this->_odata_size; };
--- a/Grid/lattice/Lattice_where.h
+++ b/Grid/lattice/Lattice_where.h
@@ -43,7 +43,7 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<
  conformable(iftrue,predicate);
  conformable(iftrue,ret);
-  GridBase *grid=iftrue.Grid();
+  GridBase *grid=iftrue._grid;
  typedef typename vobj::scalar_object scalar_object;
  typedef typename vobj::scalar_type scalar_type;
@@ -52,23 +52,22 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<
  const int Nsimd = grid->Nsimd();
-  autoView(iftrue_v,iftrue,CpuRead);
+  std::vector<Integer> mask(Nsimd);
-  autoView(iffalse_v,iffalse,CpuRead);
+  std::vector<scalar_object> truevals (Nsimd);
-  autoView(predicate_v,predicate,CpuRead);
+  std::vector<scalar_object> falsevals(Nsimd);
-  autoView(ret_v,ret,CpuWrite);
+
-  Integer NN= grid->oSites();
+  parallel_for(int ss=0;ss<iftrue._grid->oSites(); ss++){
-  thread_for(ss,NN,{
+
-    Integer mask;
+    extract(iftrue._odata[ss]   ,truevals);
-    scalar_object trueval;
+    extract(iffalse._odata[ss]  ,falsevals);
-    scalar_object falseval;
+    extract<vInteger,Integer>(TensorRemove(predicate._odata[ss]),mask);
-    for(int l=0;l<Nsimd;l++){
+
-      trueval =extractLane(l,iftrue_v[ss]);
+    for(int s=0;s<Nsimd;s++){
-      falseval=extractLane(l,iffalse_v[ss]);
+      if (mask[s]) falsevals[s]=truevals[s];
      mask    =extractLane(l,predicate_v[ss]);
      if (mask) falseval=trueval;
      insertLane(l,ret_v[ss],falseval);
    }
-  });
+
    merge(ret._odata[ss],falsevals);
  }
 }
 template<class vobj,class iobj>
@@ -77,9 +76,9 @@ inline Lattice<vobj> whereWolf(const Lattice<iobj> &predicate,Lattice<vobj> &ift
  conformable(iftrue,iffalse);
  conformable(iftrue,predicate);
-  Lattice<vobj> ret(iftrue.Grid());
+  Lattice<vobj> ret(iftrue._grid);
-  whereWolf(ret,predicate,iftrue,iffalse);
+  where(ret,predicate,iftrue,iffalse);
  return ret;
 }
--- a/Grid/log/Log.cc
+++ b/Grid/log/Log.cc
@@ -69,7 +69,6 @@ GridLogger GridLogDebug  (1, "Debug", GridLogColours, "PURPLE");
 GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
 GridLogger GridLogIterative  (1, "Iterative", GridLogColours, "BLUE");
 GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
 GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE");
 void GridLogConfigure(std::vector<std::string> &logstreams) {
  GridLogError.Active(0);
@@ -80,7 +79,6 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
  GridLogPerformance.Active(0);
  GridLogIntegrator.Active(1);
  GridLogColours.Active(0);
  GridLogHMC.Active(1);
  for (int i = 0; i < logstreams.size(); i++) {
    if (logstreams[i] == std::string("Error"))       GridLogError.Active(1);
@@ -89,8 +87,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
    if (logstreams[i] == std::string("Iterative"))   GridLogIterative.Active(1);
    if (logstreams[i] == std::string("Debug"))       GridLogDebug.Active(1);
    if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
-    if (logstreams[i] == std::string("NoIntegrator"))  GridLogIntegrator.Active(0);
+    if (logstreams[i] == std::string("Integrator"))  GridLogIntegrator.Active(1);
    if (logstreams[i] == std::string("NoHMC"))         GridLogHMC.Active(0);
    if (logstreams[i] == std::string("Colours"))     GridLogColours.Active(1);
  }
 }
--- a/Grid/log/Log.h
+++ b/Grid/log/Log.h
@@ -130,8 +130,6 @@ public:
  friend std::ostream& operator<< (std::ostream& stream, Logger& log){
    if ( log.active ) {
      std::ios_base::fmtflags f(stream.flags());
      stream << log.background()<<  std::left;
      if (log.topWidth > 0)
      {
@@ -154,8 +152,6 @@ public:
 	       << now	       << log.background() << " : " ;
      }
      stream << log.colour();
      stream.flags(f);
      return stream;
    } else { 
      return devnull;
@@ -182,7 +178,6 @@ extern GridLogger GridLogDebug  ;
 extern GridLogger GridLogPerformance;
 extern GridLogger GridLogIterative  ;
 extern GridLogger GridLogIntegrator  ;
 extern GridLogger GridLogHMC;
 extern Colours    GridLogColours;
 std::string demangle(const char* name) ;
--- a/Grid/parallelIO/BinaryIO.cc
+++ b/Grid/parallelIO/BinaryIO.cc
@@ -1,4 +1,3 @@
 #include <Grid/GridCore.h>
-int                    Grid::BinaryIO::latticeWriteMaxRetry = -1;
+int Grid::BinaryIO::latticeWriteMaxRetry = -1;
 Grid::BinaryIO::IoPerf Grid::BinaryIO::lastPerf;
--- a/Grid/parallelIO/BinaryIO.h
+++ b/Grid/parallelIO/BinaryIO.h
@@ -79,13 +79,6 @@ inline void removeWhitespace(std::string &key)
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 class BinaryIO {
 public:
  // Performance record of an I/O operation; all fields default to zero.
  // As filled in by the IOobject code in this class: size is a byte count,
  // time is taken from timer.useconds(), and mbytesPerSecond is
  // size/1024/1024 divided by (time/1.0e6).
  struct IoPerf
  {
    uint64_t size{0},time{0};
    double   mbytesPerSecond{0.};
  };
  static IoPerf lastPerf;
  static int latticeWriteMaxRetry;
  /////////////////////////////////////////////////////////////////////////////
@@ -509,15 +502,12 @@ class BinaryIO {
      timer.Stop();
    }
    lastPerf.size            = sizeof(fobj)*iodata.size()*nrank;
    lastPerf.time            = timer.useconds();
    lastPerf.mbytesPerSecond = lastPerf.size/1024./1024./(lastPerf.time/1.0e6);
    std::cout<<GridLogMessage<<"IOobject: ";
    if ( control & BINARYIO_READ) std::cout << " read  ";
    else                          std::cout << " write ";
    uint64_t bytes = sizeof(fobj)*iodata.size()*nrank;
-    std::cout<< lastPerf.size <<" bytes in "<< timer.Elapsed() <<" "
+    std::cout<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << lastPerf.mbytesPerSecond <<" MB/s "<<std::endl;
+	     << (double)bytes/ (double)timer.useconds() <<" MB/s "<<std::endl;
    std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed()  <<std::endl;
@@ -673,15 +663,10 @@ class BinaryIO {
 	     nersc_csum,scidac_csuma,scidac_csumb);
    timer.Start();
-    thread_for(lidx,lsites,{  // FIX ME, suboptimal implementation
+    thread_for(lidx,lsites,{
      std::vector<RngStateType> tmp(RngStateCount);
      std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
-      Coordinate lcoor;
+      parallel_rng.SetState(tmp,lidx);
      grid->LocalIndexToLocalCoor(lidx, lcoor);
      int o_idx=grid->oIndex(lcoor);
      int i_idx=grid->iIndex(lcoor);
      int gidx=parallel_rng.generator_idx(o_idx,i_idx);
      parallel_rng.SetState(tmp,gidx);
      });
    timer.Stop();
@@ -738,12 +723,7 @@ class BinaryIO {
    std::vector<RNGstate> iodata(lsites);
    thread_for(lidx,lsites,{
      std::vector<RngStateType> tmp(RngStateCount);
-      Coordinate lcoor;
+      parallel_rng.GetState(tmp,lidx);
      grid->LocalIndexToLocalCoor(lidx, lcoor);
      int o_idx=grid->oIndex(lcoor);
      int i_idx=grid->iIndex(lcoor);
      int gidx=parallel_rng.generator_idx(o_idx,i_idx);
      parallel_rng.GetState(tmp,gidx);
      std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
    });
    timer.Stop();
--- a/Grid/parallelIO/IldgIO.h
+++ b/Grid/parallelIO/IldgIO.h
@@ -123,7 +123,7 @@ assert(GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) < 1.0e-5);
 ////////////////////////////////////////////////////////////
 // Helper to fill out metadata
 ////////////////////////////////////////////////////////////
-template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
+ template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
 					  FieldMetaData &header,
 					  scidacRecord & _scidacRecord,
 					  scidacFile   & _scidacFile) 
@@ -619,12 +619,12 @@ class IldgWriter : public ScidacWriter {
  // Don't require scidac records EXCEPT checksum
  // Use Grid MetaData object if present.
  ////////////////////////////////////////////////////////////////
-  template <class stats = PeriodicGaugeStatistics>
+  template <class vsimd>
-  void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,int sequence,std::string LFN,std::string description) 
+  void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description) 
  {
    GridBase * grid = Umu.Grid();
-    typedef Lattice<vLorentzColourMatrixD> GaugeField;
+    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
-    typedef vLorentzColourMatrixD vobj;
+    typedef iLorentzColourMatrix<vsimd> vobj;
    typedef typename vobj::scalar_object sobj;
    ////////////////////////////////////////
@@ -636,9 +636,6 @@ class IldgWriter : public ScidacWriter {
    ScidacMetaData(Umu,header,_scidacRecord,_scidacFile);
    stats Stats;
    Stats(Umu,header);
    std::string format = header.floating_point;
    header.ensemble_id    = description;
    header.ensemble_label = description;
@@ -708,10 +705,10 @@ class IldgReader : public GridLimeReader {
  // Else use ILDG MetaData object if present.
  // Else use SciDAC MetaData object if present.
  ////////////////////////////////////////////////////////////////
-  template <class stats = PeriodicGaugeStatistics>
+  template <class vsimd>
-  void readConfiguration(Lattice<vLorentzColourMatrixD> &Umu, FieldMetaData &FieldMetaData_) {
+  void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, FieldMetaData &FieldMetaData_) {
-    typedef Lattice<vLorentzColourMatrixD > GaugeField;
+    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
    typedef typename GaugeField::vector_object  vobj;
    typedef typename vobj::scalar_object sobj;
@@ -924,8 +921,7 @@ class IldgReader : public GridLimeReader {
    if ( found_FieldMetaData || found_usqcdInfo ) {
      FieldMetaData checker;
-      stats Stats;
+      GaugeStatistics(Umu,checker);
      Stats(Umu,checker);
      assert(fabs(checker.plaquette  - FieldMetaData_.plaquette )<1.0e-5);
      assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
      std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl;
--- a/Grid/parallelIO/MetaData.h
+++ b/Grid/parallelIO/MetaData.h
@@ -128,7 +128,7 @@ inline void MachineCharacteristics(FieldMetaData &header)
  std::time_t t = std::time(nullptr);
  std::tm tm_ = *std::localtime(&t);
  std::ostringstream oss; 
-  oss << std::put_time(&tm_, "%c %Z");
+  //      oss << std::put_time(&tm_, "%c %Z");
  header.creation_date = oss.str();
  header.archive_date  = header.creation_date;
@@ -176,18 +176,29 @@ template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMet
  GridMetaData(grid,header); 
  MachineCharacteristics(header);
 }
-template<class Impl>
+inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header)
 class GaugeStatistics
 {
-public:
+  // How to convert data precision etc...
-  void operator()(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
+  header.link_trace=WilsonLoops<PeriodicGimplF>::linkTrace(data);
-  {
+  header.plaquette =WilsonLoops<PeriodicGimplF>::avgPlaquette(data);
-    header.link_trace=WilsonLoops<Impl>::linkTrace(data);
+}
-    header.plaquette =WilsonLoops<Impl>::avgPlaquette(data);
+inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
-  }
+{
-};
+  // How to convert data precision etc...
-typedef GaugeStatistics<PeriodicGimplD> PeriodicGaugeStatistics;
+  header.link_trace=WilsonLoops<PeriodicGimplD>::linkTrace(data);
-typedef GaugeStatistics<ConjugateGimplD> ConjugateGaugeStatistics;
+  header.plaquette =WilsonLoops<PeriodicGimplD>::avgPlaquette(data);
 }
 template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header)
 {
  GridBase *grid = field.Grid();
  std::string format = getFormatString<vLorentzColourMatrixF>();
  header.floating_point = format;
  header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
  GridMetaData(grid,header); 
  GaugeStatistics(field,header);
  MachineCharacteristics(header);
 }
 template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header)
 {
  GridBase *grid = field.Grid();
@@ -195,6 +206,7 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
  header.floating_point = format;
  header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
  GridMetaData(grid,header); 
  GaugeStatistics(field,header);
  MachineCharacteristics(header);
 }
--- a/Grid/parallelIO/NerscIO.h
+++ b/Grid/parallelIO/NerscIO.h
@@ -39,10 +39,6 @@ using namespace Grid;
 ////////////////////////////////////////////////////////////////////////////////
 class NerscIO : public BinaryIO { 
 public:
  typedef Lattice<vLorentzColourMatrixD> GaugeField;
  // Enable/disable exiting if the plaquette in the header does not match the value computed (default true)
  static bool & exitOnReadPlaquetteMismatch(){ static bool v=true; return v; }
  static inline void truncate(std::string file){
    std::ofstream fout(file,std::ios::out);
@@ -133,12 +129,12 @@ public:
  // Now the meat: the object readers
  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-  template<class GaugeStats=PeriodicGaugeStatistics>
+  template<class vsimd>
-  static inline void readConfiguration(GaugeField &Umu,
+  static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
 				       FieldMetaData& header,
-				       std::string file,
+				       std::string file)
 				       GaugeStats GaugeStatisticsCalculator=GaugeStats())
  {
    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
    GridBase *grid = Umu.Grid();
    uint64_t offset = readHeader(file,Umu.Grid(),header);
@@ -157,23 +153,23 @@ public:
    // munger is a function of <floating point, Real, data_type>
    if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
      if ( ieee32 || ieee32big ) {
-	BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F> 
+	BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F> 
 	  (Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
 	   nersc_csum,scidac_csuma,scidac_csumb);
      }
      if ( ieee64 || ieee64big ) {
-	BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3D> 
+	BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> 
 	  (Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
 	   nersc_csum,scidac_csuma,scidac_csumb);
      }
    } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
      if ( ieee32 || ieee32big ) {
-	BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF>
+	BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
 	  (Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
 	   nersc_csum,scidac_csuma,scidac_csumb);
      }
      if ( ieee64 || ieee64big ) {
-	BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixD>
+	BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
 	  (Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
 	   nersc_csum,scidac_csuma,scidac_csumb);
      }
@@ -181,7 +177,7 @@ public:
      assert(0);
    }
-    GaugeStats Stats; Stats(Umu,clone);
+    GaugeStatistics(Umu,clone);
    std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
 	     <<" header   "<<std::hex<<header.checksum<<std::dec <<std::endl;
@@ -200,29 +196,22 @@ public:
      std::cerr << " nersc_csum  " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
      exit(0);
    }
-    if(exitOnReadPlaquetteMismatch()) assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
+    assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
    assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
    assert(nersc_csum == header.checksum );
    std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
  }
-  // Preferred interface
+  template<class vsimd>
-  template<class GaugeStats=PeriodicGaugeStatistics>
+  static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
 					std::string ens_label = std::string("DWF"))
  {
    writeConfiguration(Umu,file,0,1,ens_label);
  }
  template<class GaugeStats=PeriodicGaugeStatistics>
  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
 					int two_row,
-					int bits32,
+					int bits32)
 					std::string ens_label = std::string("DWF"))
  {
-    typedef vLorentzColourMatrixD vobj;
+    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
    typedef iLorentzColourMatrix<vsimd> vobj;
    typedef typename vobj::scalar_object sobj;
    FieldMetaData header;
@@ -230,8 +219,8 @@ public:
    // Following should become arguments
    ///////////////////////////////////////////
    header.sequence_number = 1;
-    header.ensemble_id     = std::string("UKQCD");
+    header.ensemble_id     = "UKQCD";
-    header.ensemble_label  = ens_label;
+    header.ensemble_label  = "DWF";
    typedef LorentzColourMatrixD fobj3D;
    typedef LorentzColour2x3D    fobj2D;
@@ -240,28 +229,28 @@ public:
    GridMetaData(grid,header);
    assert(header.nd==4);
-    GaugeStats Stats; Stats(Umu,header);
+    GaugeStatistics(Umu,header);
    MachineCharacteristics(header);
-    uint64_t offset;
+	uint64_t offset;
    // Sod it -- always write 3x3 double
    header.floating_point = std::string("IEEE64BIG");
    header.data_type      = std::string("4D_SU3_GAUGE_3x3");
    GaugeSimpleUnmunger<fobj3D,sobj> munge;
-    if ( grid->IsBoss() ) { 
+	if ( grid->IsBoss() ) { 
-      truncate(file);
+	  truncate(file);
-      offset = writeHeader(header,file);
+    offset = writeHeader(header,file);
-    }
+	}
-    grid->Broadcast(0,(void *)&offset,sizeof(offset));
+	grid->Broadcast(0,(void *)&offset,sizeof(offset));
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
 					      nersc_csum,scidac_csuma,scidac_csumb);
    header.checksum = nersc_csum;
-    if ( grid->IsBoss() ) { 
+	if ( grid->IsBoss() ) { 
-      writeHeader(header,file);
+    writeHeader(header,file);
-    }
+	}
    std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "
 	     <<std::hex<<header.checksum
--- a/Grid/parallelIO/OpenQcdIO.h
+++ b/Grid/parallelIO/OpenQcdIO.h
@@ -154,7 +154,7 @@ public:
    grid->Barrier(); timer.Stop();
    std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl;
-    PeriodicGaugeStatistics Stats; Stats(Umu, clone);
+    GaugeStatistics(Umu, clone);
    RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
--- a/Grid/parallelIO/OpenQcdIOChromaReference.h
+++ b/Grid/parallelIO/OpenQcdIOChromaReference.h
@@ -208,7 +208,7 @@ public:
    FieldMetaData clone(header);
-    PeriodicGaugeStatistics Stats; Stats(Umu, clone);
+    GaugeStatistics(Umu, clone);
    RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@@ -47,7 +47,7 @@ static constexpr int Ym = 5;
 static constexpr int Zm = 6;
 static constexpr int Tm = 7;
-static constexpr int Nc=Config_Nc;
+static constexpr int Nc=3;
 static constexpr int Ns=4;
 static constexpr int Nd=4;
 static constexpr int Nhs=2; // half spinor
@@ -63,7 +63,6 @@ static constexpr int Ngp=2; // gparity index range
 #define ColourIndex  (2)
 #define SpinIndex    (1)
 #define LorentzIndex (0)
 #define GparityFlavourIndex (0)
 // Also should make these a named enum type
 static constexpr int DaggerNo=0;
@@ -81,15 +80,6 @@ template<typename T> struct isSpinor {
 template <typename T> using IfSpinor    = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
 template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;
 const int CoarseIndex = 4;
 template<typename T> struct isCoarsened {
   static constexpr bool value = (CoarseIndex<=T::TensorLevel);
 };
 template <typename T> using IfCoarsened    = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
 template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;
 const int GparityFlavourTensorIndex = 3; //TensorLevel counts from the bottom!
 // ChrisK very keen to add extra space for Gparity doubling.
 //
 // Also add domain wall index, in a way where Wilson operator 
@@ -104,7 +94,6 @@ template<typename vtype> using iSpinMatrix                = iScalar<iMatrix<iSca
 template<typename vtype> using iColourMatrix              = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
 template<typename vtype> using iSpinColourMatrix          = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
 template<typename vtype> using iLorentzColourMatrix       = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
 template<typename vtype> using iLorentzVector             = iVector<iScalar<iScalar<vtype> >, Nd > ;
 template<typename vtype> using iDoubleStoredColourMatrix  = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
 template<typename vtype> using iSpinVector                = iScalar<iVector<iScalar<vtype>, Ns> >;
 template<typename vtype> using iColourVector              = iScalar<iScalar<iVector<vtype, Nc> > >;
@@ -114,10 +103,8 @@ template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVec
    template<typename vtype> using iSpinColourSpinColourMatrix  = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;
 template<typename vtype> using iGparityFlavourVector                = iVector<iScalar<iScalar<vtype> >, Ngp>;
 template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
 template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
 template<typename vtype> using iGparityFlavourMatrix = iMatrix<iScalar<iScalar<vtype> >, Ngp>;
 // Spin matrix
 typedef iSpinMatrix<Complex  >          SpinMatrix;
@@ -164,16 +151,7 @@ typedef iSpinColourSpinColourMatrix<vComplex >    vSpinColourSpinColourMatrix;
 typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
 typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;
-// LorentzVector
+// LorentzColour
 typedef iLorentzVector<Complex  > LorentzVector;
 typedef iLorentzVector<ComplexF > LorentzVectorF;
 typedef iLorentzVector<ComplexD > LorentzVectorD;
 typedef iLorentzVector<vComplex > vLorentzVector;
 typedef iLorentzVector<vComplexF> vLorentzVectorF;
 typedef iLorentzVector<vComplexD> vLorentzVectorD;
 // LorentzColourMatrix
 typedef iLorentzColourMatrix<Complex  > LorentzColourMatrix;
 typedef iLorentzColourMatrix<ComplexF > LorentzColourMatrixF;
 typedef iLorentzColourMatrix<ComplexD > LorentzColourMatrixD;
@@ -191,16 +169,6 @@ typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
 typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
 typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;
 //G-parity flavour matrix
 typedef iGparityFlavourMatrix<Complex> GparityFlavourMatrix;
 typedef iGparityFlavourMatrix<ComplexF> GparityFlavourMatrixF;
 typedef iGparityFlavourMatrix<ComplexD> GparityFlavourMatrixD;
 typedef iGparityFlavourMatrix<vComplex> vGparityFlavourMatrix;
 typedef iGparityFlavourMatrix<vComplexF> vGparityFlavourMatrixF;
 typedef iGparityFlavourMatrix<vComplexD> vGparityFlavourMatrixD;
 // Spin vector
 typedef iSpinVector<Complex >           SpinVector;
 typedef iSpinVector<ComplexF>           SpinVectorF;
@@ -245,16 +213,6 @@ typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
 typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
 typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
 typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
 //G-parity flavour vector
 typedef iGparityFlavourVector<Complex >         GparityFlavourVector;
 typedef iGparityFlavourVector<ComplexF>         GparityFlavourVectorF;
 typedef iGparityFlavourVector<ComplexD>         GparityFlavourVectorD;
 typedef iGparityFlavourVector<vComplex >         vGparityFlavourVector;
 typedef iGparityFlavourVector<vComplexF>         vGparityFlavourVectorF;
 typedef iGparityFlavourVector<vComplexD>         vGparityFlavourVectorD;
 // singlets
 typedef iSinglet<Complex >         TComplex;     // FIXME This is painful. Tensor singlet complex type.
@@ -298,10 +256,6 @@ typedef Lattice<vLorentzColourMatrix>  LatticeLorentzColourMatrix;
 typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
 typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD;
 typedef Lattice<vLorentzVector>  LatticeLorentzVector;
 typedef Lattice<vLorentzVectorF> LatticeLorentzVectorF;
 typedef Lattice<vLorentzVectorD> LatticeLorentzVectorD;
 // DoubleStored gauge field
 typedef Lattice<vDoubleStoredColourMatrix>  LatticeDoubleStoredColourMatrix;
 typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF;
--- a/Grid/qcd/action/Action.h
+++ b/Grid/qcd/action/Action.h
@@ -30,7 +30,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#pragma once
+#ifndef GRID_QCD_ACTION_H
 #define GRID_QCD_ACTION_H
 ////////////////////////////////////////////
 // Abstract base interface
@@ -50,4 +51,4 @@ NAMESPACE_CHECK(Fermion);
 #include <Grid/qcd/action/pseudofermion/PseudoFermion.h>
 NAMESPACE_CHECK(PseudoFermion);
-
+#endif
--- a/Grid/qcd/action/ActionBase.h
+++ b/Grid/qcd/action/ActionBase.h
@@ -40,31 +40,8 @@ class Action
 public:
  bool is_smeared = false;
  RealD deriv_norm_sum;
  RealD deriv_max_sum;
  int   deriv_num;
  RealD deriv_us;
  RealD S_us;
  RealD refresh_us;
  void  reset_timer(void)        {
    deriv_us = S_us = refresh_us = 0.0;
    deriv_num=0;
    deriv_norm_sum = deriv_max_sum=0.0;
  }
  void  deriv_log(RealD nrm, RealD max) { deriv_max_sum+=max; deriv_norm_sum+=nrm; deriv_num++;}
  RealD deriv_max_average(void)         { return deriv_max_sum/deriv_num; };
  RealD deriv_norm_average(void)        { return deriv_norm_sum/deriv_num; };
  RealD deriv_timer(void)        { return deriv_us; };
  RealD S_timer(void)            { return deriv_us; };
  RealD refresh_timer(void)      { return deriv_us; };
  void deriv_timer_start(void)   { deriv_us-=usecond(); }
  void deriv_timer_stop(void)    { deriv_us+=usecond(); }
  void refresh_timer_start(void) { refresh_us-=usecond(); }
  void refresh_timer_stop(void)  { refresh_us+=usecond(); }
  void S_timer_start(void)       { S_us-=usecond(); }
  void S_timer_stop(void)        { S_us+=usecond(); }
  // Heatbath?
-  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
+  virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
  virtual RealD S(const GaugeField& U) = 0;                             // evaluate the action
  virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0;        // evaluate the action derivative
  virtual std::string action_name()    = 0;                             // return the action name
--- a/Grid/qcd/action/ActionCore.h
+++ b/Grid/qcd/action/ActionCore.h
@@ -58,8 +58,6 @@ NAMESPACE_CHECK(Scalar);
 ////////////////////////////////////////////
 // Utility functions
 ////////////////////////////////////////////
 #include <Grid/qcd/action/domains/Domains.h>
 #include <Grid/qcd/utils/Metric.h>
 NAMESPACE_CHECK(Metric);
 #include <Grid/qcd/utils/CovariantLaplacian.h>
--- a/Grid/qcd/action/ActionParams.h
+++ b/Grid/qcd/action/ActionParams.h
@@ -36,34 +36,28 @@ NAMESPACE_BEGIN(Grid);
 // These can move into a params header and be given MacroMagic serialisation
 struct GparityWilsonImplParams {
-  Coordinate twists; //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
+  Coordinate twists;
-                     //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
+  GparityWilsonImplParams() : twists(Nd, 0) {};
  bool locally_periodic;
  GparityWilsonImplParams() : twists(Nd, 0), locally_periodic(false) {};
 };
 // Parameter bundle for WilsonImpl: comms/compute overlap flag, local
 // periodicity flag, per-direction twist values and boundary phases.
 struct WilsonImplParams {
   bool overlapCommsCompute;
   bool locally_periodic;
   AcceleratorVector<Real,Nd> twist_n_2pi_L;
   AcceleratorVector<Complex,Nd> boundary_phases;
   // Default: both flags false, unit boundary phases and zero twist in all Nd directions.
   // overlapCommsCompute used to be left uninitialised here; it now matches the
   // value the phase-taking constructor below has always used.
   WilsonImplParams() : overlapCommsCompute(false), locally_periodic(false) {
     boundary_phases.resize(Nd, 1.0);
     twist_n_2pi_L.resize(Nd, 0.0);
   };
   // Construct with caller-supplied boundary phases; twist is zero, both flags false.
   // Initialisers are listed in member declaration order (the order in which they run).
   WilsonImplParams(const AcceleratorVector<Complex,Nd> phi)
     : overlapCommsCompute(false), locally_periodic(false), boundary_phases(phi) {
     twist_n_2pi_L.resize(Nd, 0.0);
   }
 };
 struct StaggeredImplParams {
-  bool locally_periodic;
+  StaggeredImplParams()  {};
  StaggeredImplParams() : locally_periodic(false) {};
 };
-struct OneFlavourRationalParams : Serializable {
+  struct OneFlavourRationalParams : Serializable {
    GRID_SERIALIZABLE_CLASS_MEMBERS(OneFlavourRationalParams, 
 				    RealD, lo, 
 				    RealD, hi, 
@@ -91,50 +85,6 @@ struct OneFlavourRationalParams : Serializable {
        precision(_precision),
        BoundsCheckFreq(_BoundsCheckFreq){};
  };
  /*Action parameters for the generalized rational action
    The approximation is for (M^dag M)^{1/inv_pow}
    where inv_pow is the denominator of the fractional power.
    Default inv_pow=2 for square root, making this equivalent to 
    the OneFlavourRational action
  */
     // Serializable parameter set for the generalized rational action.
     // All fields are plain values copied from the constructor arguments.
     struct RationalActionParams : Serializable {
     GRID_SERIALIZABLE_CLASS_MEMBERS(RationalActionParams, 
 				    int, inv_pow, 
 				    RealD, lo, //low eigenvalue bound of rational approx
 				    RealD, hi, //high eigenvalue bound of rational approx
 				    int,   MaxIter,  //maximum iterations in msCG
 				    RealD, action_tolerance,  //msCG tolerance in action evaluation
 				    int,   action_degree, //rational approx degree in action evaluation (was labelled "tolerance"; field is an int degree by name -- TODO confirm)
 				    RealD, md_tolerance,  //msCG tolerance in MD integration
 				    int,   md_degree, //rational approx degree in MD integration (was labelled "tolerance"; field is an int degree by name -- TODO confirm)
 				    int,   precision, //precision of floating point arithmetic
 				    int,   BoundsCheckFreq); //frequency the approximation is tested (with Metropolis degree/tolerance); 0 disables the check
   // constructor 
   // Every argument has a default, so RationalActionParams() is valid;
   // each argument is stored unchanged in the member of the matching name
   // (_maxit goes to MaxIter).
   RationalActionParams(int _inv_pow = 2,
 		       RealD _lo      = 0.0, 
 		       RealD _hi      = 1.0, 
 		       int _maxit     = 1000,
 		       RealD _action_tolerance      = 1.0e-8, 
 		       int _action_degree    = 10,
 		       RealD _md_tolerance      = 1.0e-8, 
 		       int _md_degree    = 10,
 		       int _precision = 64,
 		       int _BoundsCheckFreq=20)
     : inv_pow(_inv_pow), 
       lo(_lo),
       hi(_hi),
       MaxIter(_maxit),
       action_tolerance(_action_tolerance),
       action_degree(_action_degree),
       md_tolerance(_md_tolerance),
       md_degree(_md_degree),
       precision(_precision),
       BoundsCheckFreq(_BoundsCheckFreq){};
   };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/domains/DDHMCFilter.h
+++ b/Grid/qcd/action/domains/DDHMCFilter.h
@@ -1,52 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/hmc/DDHMC.h
 Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Christopher Kelly
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
 NAMESPACE_BEGIN(Grid);
 ////////////////////////////////////////////////////
 // DDHMC filter with sub-block size B[mu]
 ////////////////////////////////////////////////////
 // Momentum filter that applies DomainDecomposition::ProjectDDHMC, built
 // from the stored block sizes, to the momentum field.
 template<typename MomentaField>
 struct DDHMCFilter: public MomentumFilterBase<MomentaField>
 {
   Coordinate Block;  // block sizes handed to DomainDecomposition
   // Not read anywhere in this struct and not set by the constructor;
   // given a default so that any external read is well defined.
   int Width = 0;
   DDHMCFilter(const Coordinate &_Block): Block(_Block) {}
   // Builds a DomainDecomposition from Block and projects P in place.
   void applyFilter(MomentaField &P) const override
   {
     DomainDecomposition Domains(Block);
     Domains.ProjectDDHMC(P);
   }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/domains/DirichletFilter.h
+++ b/Grid/qcd/action/domains/DirichletFilter.h
@@ -1,98 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/momentum/DirichletFilter.h
 Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
 ////////////////////////////////////////////////////
 // Dirichlet filter with sub-block size B[mu]
 ////////////////////////////////////////////////////
 #pragma once 
 #include <Grid/qcd/action/domains/DomainDecomposition.h>
 NAMESPACE_BEGIN(Grid);
 // Momentum filter that zeroes, in every direction mu with Block[mu]!=0,
 // the mu-component of U on sites flagged as a face between the two domain
 // projections produced by DomainDecomposition::ProjectDomain.
 template<typename MomentaField>
 struct DirichletFilter: public MomentumFilterBase<MomentaField>
 {
   Coordinate Block;  // block sizes handed to DomainDecomposition
   DirichletFilter(const Coordinate &_Block): Block(_Block) {}
   // Edge detect using domain projectors
   void applyFilter (MomentaField &U) const override
   {
     DomainDecomposition Domains(Block);
     GridBase *grid = U.Grid();
     LatticeInteger  face(grid);
     LatticeInteger  one(grid);   one = 1;
     LatticeInteger  zero(grid); zero = 0;
     LatticeInteger  omega(grid);
     LatticeInteger  omegabar(grid);
     LatticeInteger  tmp(grid);
     // Indicator fields: 1 on sites kept by ProjectDomain for domain 0 / domain 1, else 0.
     omega=one;    Domains.ProjectDomain(omega,0);
     omegabar=one; Domains.ProjectDomain(omegabar,1);
     typedef decltype(PeekIndex<LorentzIndex>(U,0)) MomentaLinkField;
     MomentaLinkField  Umu(grid);
     MomentaLinkField   zz(grid); zz=Zero();
     int dims = grid->Nd();
     assert(dims==Nd);
     for(int mu=0;mu<Nd;mu++){
       if ( Block[mu]!=0 ) {
         Umu = PeekIndex<LorentzIndex>(U,mu);
         // A site is a face site when one indicator at the site and the other
         // indicator shifted by one in mu are both 1 (their sum equals 2).
         // NOTE(review): shift direction follows Cshift's convention -- confirm.
         tmp = Cshift(omegabar,mu,1);
         tmp = tmp + omega;
         face = where(tmp == Integer(2),one,zero );
         tmp = Cshift(omega,mu,1);
         tmp = tmp + omegabar;
         face = where(tmp == Integer(2),one,face );
         // Zero the mu-component on face sites and write it back into U.
         Umu = where(face,zz,Umu);
         PokeIndex<LorentzIndex>(U, Umu, mu);
       }
     }
   }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/domains/DomainDecomposition.h
+++ b/Grid/qcd/action/domains/DomainDecomposition.h
@@ -1,187 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/domains/DomainDecomposition.h
 Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
 ////////////////////////////////////////////////////
 // Domain decomposition with sub-block size B[mu]
 ////////////////////////////////////////////////////
 #pragma once 
 NAMESPACE_BEGIN(Grid);
 // Splits the lattice into blocks of size Block[mu] per direction (a zero
 // entry disables blocking in that direction) and offers two projections:
 // ProjectDomain, which keeps a field on block-interior or block-boundary
 // sites, and ProjectDDHMC, which zeroes or rescales a gauge-like field
 // on slices near the block boundaries.
 struct DomainDecomposition
 {
   Coordinate Block;
   // Base damping factor; ProjectDDHMC applies powers of it (f*f ... f*f*f*f).
   static constexpr RealD factor = 0.6;
   // Block must have exactly Nd entries.
   DomainDecomposition(const Coordinate &_Block): Block(_Block){ assert(Block.size()==Nd);};
   // Project f in place onto one of two site sets.
   // A site is "boundary" if, for some direction d with Block[d]!=0, its
   // coordinate modulo Block[d] is 0 or Block[d]-1; otherwise it is "interior".
   // domain==0 keeps f on interior sites, any other value keeps f on
   // boundary sites; everything else is set to zero.
   // The grid must have Nd or Nd+1 dimensions; for Nd+1 the coordinate index
   // is shifted by one (d+isDWF).
   template<class Field>
   void ProjectDomain(Field &f,Integer domain)
   {
     GridBase *grid = f.Grid();
     int dims = grid->Nd();
     int isDWF= (dims==Nd+1);
     assert((dims==Nd)||(dims==Nd+1));
     Field   zz(grid);  zz = Zero();
     LatticeInteger coor(grid);
     LatticeInteger domaincoor(grid);
     // mask starts at 1 everywhere and is cleared on boundary slices.
     LatticeInteger mask(grid); mask = Integer(1);
     LatticeInteger zi(grid);     zi = Integer(0);
     for(int d=0;d<Nd;d++){
       Integer B= Block[d];
       if ( B ) {
 	LatticeCoordinate(coor,d+isDWF);
 	domaincoor = mod(coor,B);
 	mask = where(domaincoor==Integer(0),zi,mask);
 	mask = where(domaincoor==Integer(B-1),zi,mask);
       }
     }
     if ( !domain )
       f = where(mask==Integer(1),f,zz);
     else 
       f = where(mask==Integer(0),f,zz);
   };
   // Modify U in place: zero it on the block boundary layers and replace it
   // by rescaled copies of the original field (Uorg times powers of factor)
   // on slices approaching the boundaries. Only directions with
   // 0 < Block[mu] <= global extent are processed.
   template<class GaugeField>
   void ProjectDDHMC(GaugeField &U)
   {
     GridBase *grid = U.Grid();
     Coordinate Global=grid->GlobalDimensions();
     GaugeField zzz(grid); zzz = Zero();
     LatticeInteger coor(grid); 
     // Unmodified copy of the input, used as the source for rescaled links.
     GaugeField Uorg(grid); Uorg = U;
     auto zzz_mu = PeekIndex<LorentzIndex>(zzz,0);
     ////////////////////////////////////////////////////
     // Zero BDY layers
     ////////////////////////////////////////////////////
     for(int mu=0;mu<Nd;mu++) {
       Integer B1 = Block[mu];
       if ( B1 && (B1 <= Global[mu]) ) {
 	LatticeCoordinate(coor,mu);
 	////////////////////////////////
 	// OmegaBar - zero all links contained in slice B-1,0 and
 	// mu links connecting to Omega
 	////////////////////////////////
 	U    = where(mod(coor,B1)==Integer(B1-1),zzz,U);
 	U    = where(mod(coor,B1)==Integer(0)   ,zzz,U); 
 	auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
 	U_mu = where(mod(coor,B1)==Integer(B1-2),zzz_mu,U_mu); 
 	PokeIndex<LorentzIndex>(U, U_mu, mu);
       }
     }
     ////////////////////////////////////////////
     // Omega interior slow the evolution
     // Tricky as we need to take the smallest of values imposed by each cut
     // Do them in order of largest to smallest and smallest writes last
     ////////////////////////////////////////////
     // NOTE(review): the loops below use coor without calling
     // LatticeCoordinate(coor,mu) again, so coor still holds whatever the
     // last enabled direction of the boundary loop above wrote -- confirm
     // this is intended.
     // NOTE(review): in the loops below U_mu is peeked from U before the
     // in-plane updates and poked back afterwards, so the mu component ends
     // up determined by the U_mu updates alone -- confirm this is intended.
     RealD f= factor;
 #if 0    
     for(int mu=0;mu<Nd;mu++) {
       Integer B1 = Block[mu];
       if ( B1 && (B1 <= Global[mu]) ) {
 	auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
 	auto Uorg_mu= PeekIndex<LorentzIndex>(Uorg,mu);
 	// In the plane
 	U = where(mod(coor,B1)==Integer(B1-5),Uorg*f,U); 
 	U = where(mod(coor,B1)==Integer(4)   ,Uorg*f,U); 
 	// Perp links
       	U_mu = where(mod(coor,B1)==Integer(B1-6),Uorg_mu*f,U_mu);
 	U_mu = where(mod(coor,B1)==Integer(4)   ,Uorg_mu*f,U_mu);
 	PokeIndex<LorentzIndex>(U, U_mu, mu);
       }
     }
 #endif
     // Scale by f^2: slices B1-4 and 3 (all components), B1-5 and 3 (mu component).
     for(int mu=0;mu<Nd;mu++) {
       Integer B1 = Block[mu];
       if ( B1 && (B1 <= Global[mu]) ) {
 	auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
 	auto Uorg_mu= PeekIndex<LorentzIndex>(Uorg,mu);
 	// In the plane
 	U = where(mod(coor,B1)==Integer(B1-4),Uorg*f*f,U); 
 	U = where(mod(coor,B1)==Integer(3)   ,Uorg*f*f,U); 
 	// Perp links
       	U_mu = where(mod(coor,B1)==Integer(B1-5),Uorg_mu*f*f,U_mu);
 	U_mu = where(mod(coor,B1)==Integer(3)   ,Uorg_mu*f*f,U_mu);
 	PokeIndex<LorentzIndex>(U, U_mu, mu);
       }
     }
     // Scale by f^3: slices B1-3 and 2 (all components), B1-4 and 2 (mu component).
     for(int mu=0;mu<Nd;mu++) {
       Integer B1 = Block[mu];
       if ( B1 && (B1 <= Global[mu]) ) {
 	auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
 	auto Uorg_mu= PeekIndex<LorentzIndex>(Uorg,mu);
 	// In the plane
 	U = where(mod(coor,B1)==Integer(B1-3),Uorg*f*f*f,U); 
 	U = where(mod(coor,B1)==Integer(2)   ,Uorg*f*f*f,U); 
 	// Perp links
       	U_mu = where(mod(coor,B1)==Integer(B1-4),Uorg_mu*f*f*f,U_mu);
 	U_mu = where(mod(coor,B1)==Integer(2)   ,Uorg_mu*f*f*f,U_mu);
 	PokeIndex<LorentzIndex>(U, U_mu, mu);
       }
     }
     // Innermost cut: slices B1-2 and 1 are zeroed (all components), while
     // the mu component on slices B1-3 and 1 is set to f^4 times the original.
     for(int mu=0;mu<Nd;mu++) {
       Integer B1 = Block[mu];
       if ( B1 && (B1 <= Global[mu]) ) {
 	auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
 	auto Uorg_mu= PeekIndex<LorentzIndex>(Uorg,mu);
 	// In the plane
 	U = where(mod(coor,B1)==Integer(B1-2),zzz,U); 
 	U = where(mod(coor,B1)==Integer(1)   ,zzz,U); 
 	// Perp links
 	U_mu = where(mod(coor,B1)==Integer(B1-3),Uorg_mu*f*f*f*f,U_mu);
 	U_mu = where(mod(coor,B1)==Integer(1)   ,Uorg_mu*f*f*f*f,U_mu);
 	PokeIndex<LorentzIndex>(U, U_mu, mu);
       }
     }
   }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/domains/Domains.h
+++ b/Grid/qcd/action/domains/Domains.h
@@ -1,39 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/momentum/Domains.h
 Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
 ////////////////////////////////////////////////////
 // Dirichlet filter with sub-block size B[mu]
 ////////////////////////////////////////////////////
 #pragma once 
 #include <Grid/qcd/action/domains/DomainDecomposition.h>
 #include <Grid/qcd/action/domains/MomentumFilter.h>
 #include <Grid/qcd/action/domains/DirichletFilter.h>
 #include <Grid/qcd/action/domains/DDHMCFilter.h>
--- a/Grid/qcd/action/domains/MomentumFilter.h
+++ b/Grid/qcd/action/domains/MomentumFilter.h
@@ -1,91 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/hmc/integrators/MomentumFilter.h
 Copyright (C) 2015
 Author: Christopher Kelly <ckelly@bnl.gov>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 //--------------------------------------------------------------------
 #pragma once 
 NAMESPACE_BEGIN(Grid);
 //These filter objects allow the user to manipulate the conjugate momentum as part of the update / refresh
 // Abstract interface for momentum filters.
 // A concrete filter implements applyFilter(), which modifies the momentum field in place.
 template<typename MomentaField>
 struct MomentumFilterBase{
  // Virtual destructor: concrete filters are used polymorphically, so destroying one
  // through a MomentumFilterBase pointer/reference must run the derived destructor.
  virtual ~MomentumFilterBase() = default;
  // Apply the filter to the momentum field P (modified in place).
  virtual void applyFilter(MomentaField &P) const = 0;
 };
 //Do nothing
 // Identity filter: applyFilter() has an empty body, so the momentum field is left untouched.
 template<class MomentaField>
 struct MomentumFilterNone : MomentumFilterBase<MomentaField> {
  void applyFilter(MomentaField &) const override {
    // intentionally empty
  }
 };
 //Multiply each site/direction by a Lorentz vector complex number field
 //Can be used to implement a mask, zeroing out sites
 template<typename MomentaField>
 struct MomentumFilterApplyPhase: public MomentumFilterBase<MomentaField>{
  typedef typename MomentaField::vector_type vector_type; //SIMD-vectorized complex type
  typedef typename MomentaField::scalar_type scalar_type; //scalar complex type
  typedef iVector<iScalar<iScalar<vector_type> >, Nd > LorentzScalarType; //complex phase for each site/direction
  typedef Lattice<LorentzScalarType> LatticeLorentzScalarType;

  //The per-site, per-direction multiplier applied by applyFilter
  LatticeLorentzScalarType phase;

  //Construct from a user-supplied phase field.
  //Taken by const reference so the field is copied exactly once (into the member),
  //rather than once into a by-value parameter and again into the member.
  MomentumFilterApplyPhase(const LatticeLorentzScalarType &_phase): phase(_phase){}

  //Default to uniform field of (1,0)
  MomentumFilterApplyPhase(GridBase* _grid): phase(_grid){
    LorentzScalarType one;
    for(int mu=0;mu<Nd;mu++)
      one(mu)()() = scalar_type(1.);
    phase = one;
  }

  //Multiply every Lorentz component of P, site by site, by the matching component of phase.
  //P is modified in place; P and phase are checked with conformable() first.
  void applyFilter(MomentaField &P) const override{
    conformable(P,phase);
    autoView( P_v , P, AcceleratorWrite);
    autoView( phase_v , phase, AcceleratorRead);
    accelerator_for(ss,P_v.size(),MomentaField::vector_type::Nsimd(),{
    	auto site_mom = P_v(ss);
    	auto site_phase = phase_v(ss);
 	for(int mu=0;mu<Nd;mu++)
 	  site_mom(mu) = site_mom(mu) * site_phase(mu);
    	coalescedWrite(P_v[ss], site_mom);
      });
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@@ -60,8 +60,6 @@ public:
  ///////////////////////////////////////////////////////////////
  virtual void Dminus(const FermionField &psi, FermionField &chi);
  virtual void DminusDag(const FermionField &psi, FermionField &chi);
  virtual void ImportFourDimPseudoFermion(const FermionField &input,FermionField &imported);
  virtual void ExportFourDimPseudoFermion(const FermionField &solution,FermionField &exported);
  virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
  virtual void ExportPhysicalFermionSource(const FermionField &solution5d, FermionField &exported4d);
  virtual void ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d);
--- a/Grid/qcd/action/fermion/DirichletFermionOperator.h
+++ b/Grid/qcd/action/fermion/DirichletFermionOperator.h
@@ -1,185 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/DirichletFermionOperator.h
    Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 ////////////////////////////////////////////////////////////////
 // Wrap a fermion operator in Dirichlet BC's at node boundary
 ////////////////////////////////////////////////////////////////
 // DirichletFermionOperator wraps an existing FermionOperator<Impl> (held by reference)
 // and forwards the operator interface to it.  The gauge field handed to ImportGauge()
 // is first passed through a DirichletFilter built from Block before being imported into
 // the wrapped operator.  Matrix and hopping-term calls are bracketed by
 // DirichletOn()/DirichletOff(); as written, DirichletOn() only asserts on the global
 // comms mode and DirichletOff() does nothing (the mode switches are commented out).
 template<class Impl>
 class DirichletFermionOperator : public FermionOperator<Impl>
 {
 public:
  INHERIT_IMPL_TYPES(Impl);
  // Data members
  // NOTE: members are initialised in the order declared here (Block, Filter, FermOp),
  // not in the order written in the constructor's initialiser list.  Filter(Block)
  // relies on Block being declared before Filter.
  int CommsMode;                       // value of WilsonKernelsStatic::Comms captured at construction
  Coordinate Block;                    // per-dimension block size (copy of the constructor argument)
  DirichletFilter<GaugeField> Filter;  // gauge-field filter built from Block; applied in ImportGauge
  FermionOperator<Impl> & FermOp;      // wrapped operator; stored by reference, not owned
  // Constructor / bespoke
  // _FermOp : operator to wrap
  // _Block  : block size per dimension; a dimension whose entry is zero or larger than
  //           the global extent is skipped by the divisibility check below
  DirichletFermionOperator(FermionOperator<Impl> & _FermOp, Coordinate &_Block)
    : FermOp(_FermOp), Block(_Block), Filter(Block)
  {
    // Save what the comms mode should be under normal BCs
    CommsMode = WilsonKernelsStatic::Comms;
    assert((CommsMode == WilsonKernelsStatic::CommsAndCompute)
         ||(CommsMode == WilsonKernelsStatic::CommsThenCompute));
    // Check the block size divides local lattice
    GridBase *grid = FermOp.GaugeGrid();
    int blocks_per_rank = 1;
    Coordinate LocalDims = grid->LocalDimensions();
    Coordinate GlobalDims= grid->GlobalDimensions();
    assert(Block.size()==LocalDims.size());
    for(int d=0;d<LocalDims.size();d++){
      if (Block[d]&&(Block[d]<=GlobalDims[d])){
 	int r = LocalDims[d] % Block[d];
 	assert(r == 0);
 	blocks_per_rank *= (LocalDims[d] / Block[d]);
      }
    }
    // Even blocks per node required // could be relaxed but inefficient use of hardware as idle nodes in boundary operator R
    // NOTE(review): the comment above says "even", but the assertion only checks that the
    // product of blocks per rank is non-zero -- confirm which condition is intended.
    assert( blocks_per_rank != 0);
    // Possible checks that SIMD lanes are used with full occupancy???
  };
  virtual ~DirichletFermionOperator(void) = default;
  // Called before each forwarded matrix/hopping operation.  Currently only asserts that the
  // global comms mode is not CommsDirichlet; the actual switch is commented out.
  void DirichletOn(void)   {
    assert(WilsonKernelsStatic::Comms!= WilsonKernelsStatic::CommsDirichlet);
    //    WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsDirichlet;
  }
  // Called after each forwarded matrix/hopping operation.  Currently a no-op; restoring the
  // saved CommsMode is commented out.
  void DirichletOff(void)  {
    //    assert(WilsonKernelsStatic::Comms== WilsonKernelsStatic::CommsDirichlet);
    //    WilsonKernelsStatic::Comms = CommsMode;
  }
  // Implement the full interface
  // Grid accessors and the scratch field are forwarded unchanged to the wrapped operator.
  virtual FermionField &tmp(void) { return FermOp.tmp(); };
  virtual GridBase *FermionGrid(void)         { return FermOp.FermionGrid(); }
  virtual GridBase *FermionRedBlackGrid(void) { return FermOp.FermionRedBlackGrid(); }
  virtual GridBase *GaugeGrid(void)           { return FermOp.GaugeGrid(); }
  virtual GridBase *GaugeRedBlackGrid(void)   { return FermOp.GaugeRedBlackGrid(); }
  // override multiply
  // Each call below is DirichletOn(); forward to FermOp; DirichletOff().
  virtual void  M    (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.M(in,out);    DirichletOff();  };
  virtual void  Mdag (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.Mdag(in,out); DirichletOff();  };
  // half checkerboard operations
  virtual void   Meooe       (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.Meooe(in,out);    DirichletOff(); };  
  virtual void   MeooeDag    (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.MeooeDag(in,out); DirichletOff(); };
  virtual void   Mooee       (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.Mooee(in,out);    DirichletOff(); };
  virtual void   MooeeDag    (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.MooeeDag(in,out); DirichletOff(); };
  virtual void   MooeeInv    (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.MooeeInv(in,out); DirichletOff(); };
  virtual void   MooeeInvDag (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.MooeeInvDag(in,out); DirichletOff(); };
  // non-hermitian hopping term; half cb or both
  virtual void Dhop  (const FermionField &in, FermionField &out,int dag) { DirichletOn(); FermOp.Dhop(in,out,dag);    DirichletOff(); };
  virtual void DhopOE(const FermionField &in, FermionField &out,int dag) { DirichletOn(); FermOp.DhopOE(in,out,dag);  DirichletOff(); };
  virtual void DhopEO(const FermionField &in, FermionField &out,int dag) { DirichletOn(); FermOp.DhopEO(in,out,dag);  DirichletOff(); };
  virtual void DhopDir(const FermionField &in, FermionField &out,int dir,int disp) { DirichletOn(); FermOp.DhopDir(in,out,dir,disp);  DirichletOff(); };
  // force terms; five routines; default to Dhop on diagonal
  // These are forwarded directly, without the DirichletOn()/DirichletOff() bracket.
  virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.MDeriv(mat,U,V,dag);};
  virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.MoeDeriv(mat,U,V,dag);};
  virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.MeoDeriv(mat,U,V,dag);};
  virtual void MooDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.MooDeriv(mat,U,V,dag);};
  virtual void MeeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.MeeDeriv(mat,U,V,dag);};
  virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.DhopDeriv(mat,U,V,dag);};
  virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.DhopDerivEO(mat,U,V,dag);};
  virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.DhopDerivOE(mat,U,V,dag);};
  // Mdiag goes through this class's Mooee (and so through the DirichletOn/Off bracket);
  // Mdir and MdirAll are forwarded directly.
  virtual void  Mdiag  (const FermionField &in, FermionField &out) { Mooee(in,out);};
  virtual void  Mdir   (const FermionField &in, FermionField &out,int dir,int disp){FermOp.Mdir(in,out,dir,disp);};
  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out)    {FermOp.MdirAll(in,out);};
  ///////////////////////////////////////////////
  // Updates gauge field during HMC
  ///////////////////////////////////////////////
  // The doubled gauge field accessors return the wrapped operator's fields by reference.
  DoubledGaugeField &GetDoubledGaugeField(void){ return FermOp.GetDoubledGaugeField(); };
  DoubledGaugeField &GetDoubledGaugeFieldE(void){ return FermOp.GetDoubledGaugeFieldE(); };
  DoubledGaugeField &GetDoubledGaugeFieldO(void){ return FermOp.GetDoubledGaugeFieldO(); };
  // Import a gauge field: work on a local copy so the caller's field is not modified,
  // apply the Dirichlet filter to the copy, then import the filtered copy into FermOp.
  virtual void ImportGauge(const GaugeField & _U)
  {
    GaugeField U = _U;
    // Filter gauge field to apply Dirichlet
    Filter.applyFilter(U);
    FermOp.ImportGauge(U);
  }
  ///////////////////////////////////////////////
  // Physical field import/export
  ///////////////////////////////////////////////
  // All forwarded unchanged to the wrapped operator.
  virtual void Dminus(const FermionField &psi, FermionField &chi)    { FermOp.Dminus(psi,chi); }
  virtual void DminusDag(const FermionField &psi, FermionField &chi) { FermOp.DminusDag(psi,chi); }
  virtual void ImportFourDimPseudoFermion(const FermionField &input,FermionField &imported)   { FermOp.ImportFourDimPseudoFermion(input,imported);}
  virtual void ExportFourDimPseudoFermion(const FermionField &solution,FermionField &exported){ FermOp.ExportFourDimPseudoFermion(solution,exported);}
  virtual void ImportPhysicalFermionSource(const FermionField &input,FermionField &imported)  { FermOp.ImportPhysicalFermionSource(input,imported);}
  virtual void ImportUnphysicalFermion(const FermionField &input,FermionField &imported)      { FermOp.ImportUnphysicalFermion(input,imported);}
  virtual void ExportPhysicalFermionSolution(const FermionField &solution,FermionField &exported) {FermOp.ExportPhysicalFermionSolution(solution,exported);}
  virtual void ExportPhysicalFermionSource(const FermionField &solution,FermionField &exported)   {FermOp.ExportPhysicalFermionSource(solution,exported);}
  //////////////////////////////////////////////////////////////////////
  // Should never be used
  //////////////////////////////////////////////////////////////////////
  // Each of the following fails an assert(0) if called.
  virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);};
  virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary,std::vector<double> twist) {assert(0);}
  virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) { assert(0);}
  virtual void ContractConservedCurrent(PropagatorField &q_in_1,
 					PropagatorField &q_in_2,
 					PropagatorField &q_out,
 					PropagatorField &phys_src,
 					Current curr_type,
 					unsigned int mu)
  {assert(0);};
  virtual void SeqConservedCurrent(PropagatorField &q_in, 
 				   PropagatorField &q_out,
 				   PropagatorField &phys_src,
 				   Current curr_type,
 				   unsigned int mu,
 				   unsigned int tmin, 
 				   unsigned int tmax,
 				   ComplexField &lattice_cmplx)
  {assert(0);};
      // Only reimplemented in Wilson5D 
      // Default to just a zero correlation function
  virtual void ContractJ5q(FermionField &q_in   ,ComplexField &J5q) { J5q=Zero(); };
  virtual void ContractJ5q(PropagatorField &q_in,ComplexField &J5q) { J5q=Zero(); };
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/Fermion.h
+++ b/Grid/qcd/action/fermion/Fermion.h
@@ -101,12 +101,6 @@ NAMESPACE_CHECK(WilsonTM5);
 #include <Grid/qcd/action/fermion/PauliVillarsInverters.h>
 #include <Grid/qcd/action/fermion/Reconstruct5Dprop.h>
 #include <Grid/qcd/action/fermion/MADWF.h>
 ////////////////////////////////////////////////////////////////////
 // DDHMC related 
 ////////////////////////////////////////////////////////////////////
 #include <Grid/qcd/action/fermion/DirichletFermionOperator.h>
 #include <Grid/qcd/action/fermion/SchurFactoredFermionOperator.h>
 NAMESPACE_CHECK(DWFutils);
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -121,9 +115,9 @@ typedef WilsonFermion<WilsonImplR> WilsonFermionR;
 typedef WilsonFermion<WilsonImplF> WilsonFermionF;
 typedef WilsonFermion<WilsonImplD> WilsonFermionD;
-//typedef WilsonFermion<WilsonImplRL> WilsonFermionRL;
+typedef WilsonFermion<WilsonImplRL> WilsonFermionRL;
-//typedef WilsonFermion<WilsonImplFH> WilsonFermionFH;
+typedef WilsonFermion<WilsonImplFH> WilsonFermionFH;
-//typedef WilsonFermion<WilsonImplDF> WilsonFermionDF;
+typedef WilsonFermion<WilsonImplDF> WilsonFermionDF;
 typedef WilsonFermion<WilsonAdjImplR> WilsonAdjFermionR;
 typedef WilsonFermion<WilsonAdjImplF> WilsonAdjFermionF;
@@ -164,41 +158,41 @@ typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
 typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
 typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
-//typedef DomainWallFermion<WilsonImplRL> DomainWallFermionRL;
+typedef DomainWallFermion<WilsonImplRL> DomainWallFermionRL;
-//typedef DomainWallFermion<WilsonImplFH> DomainWallFermionFH;
+typedef DomainWallFermion<WilsonImplFH> DomainWallFermionFH;
-//typedef DomainWallFermion<WilsonImplDF> DomainWallFermionDF;
+typedef DomainWallFermion<WilsonImplDF> DomainWallFermionDF;
 typedef DomainWallEOFAFermion<WilsonImplR> DomainWallEOFAFermionR;
 typedef DomainWallEOFAFermion<WilsonImplF> DomainWallEOFAFermionF;
 typedef DomainWallEOFAFermion<WilsonImplD> DomainWallEOFAFermionD;
-//typedef DomainWallEOFAFermion<WilsonImplRL> DomainWallEOFAFermionRL;
+typedef DomainWallEOFAFermion<WilsonImplRL> DomainWallEOFAFermionRL;
-//typedef DomainWallEOFAFermion<WilsonImplFH> DomainWallEOFAFermionFH;
+typedef DomainWallEOFAFermion<WilsonImplFH> DomainWallEOFAFermionFH;
-//typedef DomainWallEOFAFermion<WilsonImplDF> DomainWallEOFAFermionDF;
+typedef DomainWallEOFAFermion<WilsonImplDF> DomainWallEOFAFermionDF;
 typedef MobiusFermion<WilsonImplR> MobiusFermionR;
 typedef MobiusFermion<WilsonImplF> MobiusFermionF;
 typedef MobiusFermion<WilsonImplD> MobiusFermionD;
-//typedef MobiusFermion<WilsonImplRL> MobiusFermionRL;
+typedef MobiusFermion<WilsonImplRL> MobiusFermionRL;
-//typedef MobiusFermion<WilsonImplFH> MobiusFermionFH;
+typedef MobiusFermion<WilsonImplFH> MobiusFermionFH;
-//typedef MobiusFermion<WilsonImplDF> MobiusFermionDF;
+typedef MobiusFermion<WilsonImplDF> MobiusFermionDF;
 typedef MobiusEOFAFermion<WilsonImplR> MobiusEOFAFermionR;
 typedef MobiusEOFAFermion<WilsonImplF> MobiusEOFAFermionF;
 typedef MobiusEOFAFermion<WilsonImplD> MobiusEOFAFermionD;
-//typedef MobiusEOFAFermion<WilsonImplRL> MobiusEOFAFermionRL;
+typedef MobiusEOFAFermion<WilsonImplRL> MobiusEOFAFermionRL;
-//typedef MobiusEOFAFermion<WilsonImplFH> MobiusEOFAFermionFH;
+typedef MobiusEOFAFermion<WilsonImplFH> MobiusEOFAFermionFH;
-//typedef MobiusEOFAFermion<WilsonImplDF> MobiusEOFAFermionDF;
+typedef MobiusEOFAFermion<WilsonImplDF> MobiusEOFAFermionDF;
 typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
 typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
 typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
-//typedef ZMobiusFermion<ZWilsonImplRL> ZMobiusFermionRL;
+typedef ZMobiusFermion<ZWilsonImplRL> ZMobiusFermionRL;
-//typedef ZMobiusFermion<ZWilsonImplFH> ZMobiusFermionFH;
+typedef ZMobiusFermion<ZWilsonImplFH> ZMobiusFermionFH;
-//typedef ZMobiusFermion<ZWilsonImplDF> ZMobiusFermionDF;
+typedef ZMobiusFermion<ZWilsonImplDF> ZMobiusFermionDF;
 // Ls vectorised
 typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
@@ -241,49 +235,49 @@ typedef WilsonFermion<GparityWilsonImplR>     GparityWilsonFermionR;
 typedef WilsonFermion<GparityWilsonImplF>     GparityWilsonFermionF;
 typedef WilsonFermion<GparityWilsonImplD>     GparityWilsonFermionD;
-//typedef WilsonFermion<GparityWilsonImplRL>     GparityWilsonFermionRL;
+typedef WilsonFermion<GparityWilsonImplRL>     GparityWilsonFermionRL;
-//typedef WilsonFermion<GparityWilsonImplFH>     GparityWilsonFermionFH;
+typedef WilsonFermion<GparityWilsonImplFH>     GparityWilsonFermionFH;
-//typedef WilsonFermion<GparityWilsonImplDF>     GparityWilsonFermionDF;
+typedef WilsonFermion<GparityWilsonImplDF>     GparityWilsonFermionDF;
 typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
 typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
 typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;
-//typedef DomainWallFermion<GparityWilsonImplRL> GparityDomainWallFermionRL;
+typedef DomainWallFermion<GparityWilsonImplRL> GparityDomainWallFermionRL;
-//typedef DomainWallFermion<GparityWilsonImplFH> GparityDomainWallFermionFH;
+typedef DomainWallFermion<GparityWilsonImplFH> GparityDomainWallFermionFH;
-//typedef DomainWallFermion<GparityWilsonImplDF> GparityDomainWallFermionDF;
+typedef DomainWallFermion<GparityWilsonImplDF> GparityDomainWallFermionDF;
 typedef DomainWallEOFAFermion<GparityWilsonImplR> GparityDomainWallEOFAFermionR;
 typedef DomainWallEOFAFermion<GparityWilsonImplF> GparityDomainWallEOFAFermionF;
 typedef DomainWallEOFAFermion<GparityWilsonImplD> GparityDomainWallEOFAFermionD;
-//typedef DomainWallEOFAFermion<GparityWilsonImplRL> GparityDomainWallEOFAFermionRL;
+typedef DomainWallEOFAFermion<GparityWilsonImplRL> GparityDomainWallEOFAFermionRL;
-//typedef DomainWallEOFAFermion<GparityWilsonImplFH> GparityDomainWallEOFAFermionFH;
+typedef DomainWallEOFAFermion<GparityWilsonImplFH> GparityDomainWallEOFAFermionFH;
-//typedef DomainWallEOFAFermion<GparityWilsonImplDF> GparityDomainWallEOFAFermionDF;
+typedef DomainWallEOFAFermion<GparityWilsonImplDF> GparityDomainWallEOFAFermionDF;
 typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
 typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
 typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;
-//typedef WilsonTMFermion<GparityWilsonImplRL> GparityWilsonTMFermionRL;
+typedef WilsonTMFermion<GparityWilsonImplRL> GparityWilsonTMFermionRL;
-//typedef WilsonTMFermion<GparityWilsonImplFH> GparityWilsonTMFermionFH;
+typedef WilsonTMFermion<GparityWilsonImplFH> GparityWilsonTMFermionFH;
-//typedef WilsonTMFermion<GparityWilsonImplDF> GparityWilsonTMFermionDF;
+typedef WilsonTMFermion<GparityWilsonImplDF> GparityWilsonTMFermionDF;
 typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionR;
 typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
 typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
-//typedef MobiusFermion<GparityWilsonImplRL> GparityMobiusFermionRL;
+typedef MobiusFermion<GparityWilsonImplRL> GparityMobiusFermionRL;
-//typedef MobiusFermion<GparityWilsonImplFH> GparityMobiusFermionFH;
+typedef MobiusFermion<GparityWilsonImplFH> GparityMobiusFermionFH;
-//typedef MobiusFermion<GparityWilsonImplDF> GparityMobiusFermionDF;
+typedef MobiusFermion<GparityWilsonImplDF> GparityMobiusFermionDF;
 typedef MobiusEOFAFermion<GparityWilsonImplR> GparityMobiusEOFAFermionR;
 typedef MobiusEOFAFermion<GparityWilsonImplF> GparityMobiusEOFAFermionF;
 typedef MobiusEOFAFermion<GparityWilsonImplD> GparityMobiusEOFAFermionD;
-//typedef MobiusEOFAFermion<GparityWilsonImplRL> GparityMobiusEOFAFermionRL;
+typedef MobiusEOFAFermion<GparityWilsonImplRL> GparityMobiusEOFAFermionRL;
-//typedef MobiusEOFAFermion<GparityWilsonImplFH> GparityMobiusEOFAFermionFH;
+typedef MobiusEOFAFermion<GparityWilsonImplFH> GparityMobiusEOFAFermionFH;
-//typedef MobiusEOFAFermion<GparityWilsonImplDF> GparityMobiusEOFAFermionDF;
+typedef MobiusEOFAFermion<GparityWilsonImplDF> GparityMobiusEOFAFermionDF;
 typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
 typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
@@ -297,6 +291,12 @@ typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
 typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
 typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
 #ifndef GRID_CUDA
 typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
 typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
 typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;
 #endif
 NAMESPACE_END(Grid);
 ////////////////////
--- a/Grid/qcd/action/fermion/FermionCore.h
+++ b/Grid/qcd/action/fermion/FermionCore.h
@@ -25,7 +25,8 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#pragma once
+#ifndef  GRID_QCD_FERMION_CORE_H
 #define  GRID_QCD_FERMION_CORE_H
 #include <Grid/GridCore.h>
 #include <Grid/GridQCDcore.h>
@@ -44,3 +45,4 @@ NAMESPACE_CHECK(FermionOperator);
 #include <Grid/qcd/action/fermion/StaggeredKernels.h>        //used by all wilson type fermions
 NAMESPACE_CHECK(Kernels);
 #endif
--- a/Grid/qcd/action/fermion/FermionOperator.h
+++ b/Grid/qcd/action/fermion/FermionOperator.h
@@ -140,9 +140,6 @@ public:
  // Updates gauge field during HMC
  ///////////////////////////////////////////////
  virtual void ImportGauge(const GaugeField & _U)=0;
  virtual DoubledGaugeField &GetDoubledGaugeField(void)  =0;
  virtual DoubledGaugeField &GetDoubledGaugeFieldE(void)  =0;
  virtual DoubledGaugeField &GetDoubledGaugeFieldO(void)  =0;
  //////////////////////////////////////////////////////////////////////
  // Conserved currents, either contract at sink or insert sequentially.
@@ -174,16 +171,6 @@ public:
      ///////////////////////////////////////////////
      // Default Dminus: copy psi into chi unchanged.
      virtual void Dminus(const FermionField &psi, FermionField &chi)
      {
 	chi = psi;
      }
      // Default DminusDag: copy psi into chi unchanged.
      virtual void DminusDag(const FermionField &psi, FermionField &chi)
      {
 	chi = psi;
      }
      // Default import: the imported field is a plain copy of the input field.
      virtual void ImportFourDimPseudoFermion(const FermionField &input,FermionField &imported) { imported = input; }
      // Default export: the exported field is a plain copy of the solution field.
      virtual void ExportFourDimPseudoFermion(const FermionField &solution,FermionField &exported) { exported = solution; }
      virtual void ImportPhysicalFermionSource(const FermionField &input,FermionField &imported)
      {
 	imported = input;
--- a/Grid/qcd/action/fermion/FermionOperatorImpl.h
+++ b/Grid/qcd/action/fermion/FermionOperatorImpl.h
@@ -153,8 +153,8 @@ public:
  typedef typename Impl::StencilImpl             StencilImpl;		\
  typedef typename Impl::ImplParams               ImplParams;	        \
  typedef typename Impl::StencilImpl::View_type  StencilView;		\
-  typedef const typename ViewMap<FermionField>::Type      FermionFieldView;	\
+  typedef typename ViewMap<FermionField>::Type      FermionFieldView;	\
-  typedef const typename ViewMap<DoubledGaugeField>::Type DoubledGaugeFieldView;
+  typedef typename ViewMap<DoubledGaugeField>::Type DoubledGaugeFieldView;
 #define INHERIT_IMPL_TYPES(Base)		\
  INHERIT_GIMPL_TYPES(Base)			\
@@ -183,8 +183,7 @@ NAMESPACE_CHECK(ImplStaggered);
 /////////////////////////////////////////////////////////////////////////////
 // Single flavour one component spinors with colour index. 5d vec
 /////////////////////////////////////////////////////////////////////////////
-// Deprecate Vec5d
+#include <Grid/qcd/action/fermion/StaggeredVec5dImpl.h> 
-//#include <Grid/qcd/action/fermion/StaggeredVec5dImpl.h> 
+NAMESPACE_CHECK(ImplStaggered5dVec);  
 //NAMESPACE_CHECK(ImplStaggered5dVec);  
--- a/Grid/qcd/action/fermion/GparityWilsonImpl.h
+++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h
@@ -30,18 +30,6 @@ directory
 NAMESPACE_BEGIN(Grid);
 /*
  Policy implementation for G-parity boundary conditions
  Rather than treating the gauge field as a flavored field, the Grid implementation of G-parity treats the gauge field as a regular
  field with complex conjugate boundary conditions. In order to ensure the second flavor interacts with the conjugate links and the first
  with the regular links we overload the functionality of doubleStore, whose purpose is to store the gauge field and the barrel-shifted gauge field
  to avoid communicating links when applying the Dirac operator, such that the double-stored field contains also a flavor index which maps to
  either the link or the conjugate link. This flavored field is then used by multLink to apply the correct link to a spinor.
  Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
  mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
 */
 template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
 class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
 public:
@@ -109,30 +97,42 @@ public:
    Coordinate icoor;
 #ifdef GRID_SIMT
    _Spinor tmp;
    const int Nsimd =SiteDoubledGaugeField::Nsimd();
    int s = acceleratorSIMTlane(Nsimd);
    St.iCoorFromIindex(icoor,s);
    int mmu = mu % Nd;
    if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
      int permute_lane = (sl==1) 
    	|| ((distance== 1)&&(icoor[direction]==1))
 	|| ((distance==-1)&&(icoor[direction]==0));
-    auto UU0=coalescedRead(U(0)(mu));
+      if ( permute_lane ) { 
-    auto UU1=coalescedRead(U(1)(mu));
+	tmp(0) = chi(1);
-    
+	tmp(1) = chi(0);
-    //Decide whether we do a G-parity flavor twist
+      } else {
-    //Note: this assumes (but does not check) that sl==1 || sl==2 i.e. max 2 SIMD lanes in G-parity dir
+	tmp(0) = chi(0);
-    //It also assumes (but does not check) that abs(distance) == 1
+	tmp(1) = chi(1);
-    int permute_lane = (sl==1) 
+      }
    || ((distance== 1)&&(icoor[direction]==1))
    || ((distance==-1)&&(icoor[direction]==0));
-    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu] && mmu < Nd-1; //only if we are going around the world in a spatial direction
+      auto UU0=coalescedRead(U(0)(mu));
      auto UU1=coalescedRead(U(1)(mu));
-    //Apply the links
+      mult(&phi(0),&UU0,&tmp(0));
-    int f_upper = permute_lane ? 1 : 0;
+      mult(&phi(1),&UU1,&tmp(1));
    int f_lower = !f_upper;
-    mult(&phi(0),&UU0,&chi(f_upper));
+    } else {
-    mult(&phi(1),&UU1,&chi(f_lower));
+
      auto UU0=coalescedRead(U(0)(mu));
      auto UU1=coalescedRead(U(1)(mu));
      mult(&phi(0),&UU0,&chi(0));
      mult(&phi(1),&UU1,&chi(1));
    }
 #else
    typedef _Spinor vobj;
@@ -151,10 +151,10 @@ public:
    assert((distance == 1) || (distance == -1));  // nearest neighbour stencil hard code
    assert((sl == 1) || (sl == 2));
-    //If this site is an global boundary site, perform the G-parity flavor twist
+    if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
-    if ( mmu < Nd-1 && SE->_around_the_world && St.parameters.twists[mmu] ) {
+
      if ( sl == 2 ) {
-	//Only do the twist for lanes on the edge of the physical node
+       
 	ExtractBuffer<sobj> vals(Nsimd);
 	extract(chi,vals);
@@ -209,19 +209,6 @@ public:
    reg = memory;
  }
  //Poke 'poke_f0' onto flavor 0 and 'poke_f1' onto flavor 1 in direction mu of the doubled gauge field Uds
  inline void pokeGparityDoubledGaugeField(DoubledGaugeField &Uds, const GaugeLinkField &poke_f0, const GaugeLinkField &poke_f1, const int mu)
  {
    // Open views on the two source link fields (read) and the doubled field (write).
    autoView(flav0_view, poke_f0, CpuRead);
    autoView(flav1_view, poke_f1, CpuRead);
    autoView(doubled_view, Uds, CpuWrite);
    // For every site, write the flavour-0 and flavour-1 links into slot mu of the doubled field.
    thread_foreach(site, flav0_view, {
      doubled_view[site](0)(mu) = flav0_view[site]();
      doubled_view[site](1)(mu) = flav1_view[site]();
    });
  }
  inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
  {
    conformable(Uds.Grid(),GaugeGrid);
@@ -232,19 +219,14 @@ public:
    GaugeLinkField Uconj(GaugeGrid);
    Lattice<iScalar<vInteger> > coor(GaugeGrid);
-
+        
-    //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
+    for(int mu=0;mu<Nd;mu++){
-    //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs        
+          
-    for(int mu=0;mu<Nd-1;mu++){
+      LatticeCoordinate(coor,mu);
      if( Params.twists[mu] ){
 	LatticeCoordinate(coor,mu);
      }
      U     = PeekIndex<LorentzIndex>(Umu,mu);
      Uconj = conjugate(U);
      // Implement the isospin rotation sign on the boundary between f=1 and f=0
      // This phase could come from a simple bc 1,1,-1,1 ..
      int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
      if ( Params.twists[mu] ) { 
@@ -259,7 +241,7 @@ public:
 	thread_foreach(ss,U_v,{
 	    Uds_v[ss](0)(mu) = U_v[ss]();
 	    Uds_v[ss](1)(mu) = Uconj_v[ss]();
-	});
+	  });
      }
      U     = adj(Cshift(U    ,mu,-1));      // correct except for spanning the boundary
@@ -290,38 +272,6 @@ public:
        });
      }
    }
    { //periodic / antiperiodic temporal BCs
      int mu = Nd-1;
      int L   = GaugeGrid->GlobalDimensions()[mu];
      int Lmu = L - 1;
      LatticeCoordinate(coor, mu);
      U = PeekIndex<LorentzIndex>(Umu, mu); //Get t-directed links
      GaugeLinkField *Upoke = &U;
      if(Params.twists[mu]){ //antiperiodic
 	Utmp =  where(coor == Lmu, -U, U);
 	Upoke = &Utmp;
      }
      Uconj = conjugate(*Upoke); //second flavor interacts with conjugate links      
      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu);
      //Get the barrel-shifted field
      Utmp = adj(Cshift(U, mu, -1)); //is a forward shift!
      Upoke = &Utmp;
      if(Params.twists[mu]){
 	U = where(coor == 0, -Utmp, Utmp);  //boundary phase
 	Upoke = &U;
      }
      Uconj = conjugate(*Upoke);
      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4);
    }
  }
  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
@@ -360,48 +310,28 @@ public:
  inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
    assert(0);
  }
  // Accelerator version of the 5D force insertion.
  // For every 4D outer site sU, sums over the Ls fifth-dimension slices and over spin
  // the colour outer product of Btilde and Atilde for flavor 0, plus the complex
  // conjugate of the same outer product for flavor 1, and writes the sum into the
  // mu component of mat.
  //   mat:    gauge field receiving the result in direction mu (overwritten there)
  //   Btilde, Atilde: 5D fermion fields; Ls is read from Btilde's grid (_fdimensions[0])
  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
    int Ls=Btilde.Grid()->_fdimensions[0];
    {
      GridBase *GaugeGrid = mat.Grid();
      Lattice<iScalar<vInteger> > coor(GaugeGrid);
      // NOTE(review): coor is filled here but not read anywhere below in this function.
      if( Params.twists[mu] ){
 	LatticeCoordinate(coor,mu);
      }
      autoView( mat_v , mat, AcceleratorWrite);
      autoView( Btilde_v , Btilde, AcceleratorRead);
      autoView( Atilde_v , Atilde, AcceleratorRead);
      accelerator_for(sss,mat.Grid()->oSites(), FermionField::vector_type::Nsimd(),{	  
  	  int sU=sss;
  	  typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
  	  ColorMatrixType sum;
  	  zeroit(sum);
  	  for(int s=0;s<Ls;s++){
  	    int sF = s+Ls*sU;
  	    for(int spn=0;spn<Ns;spn++){ //sum over spin
 	      //Flavor 0
  	      auto bb = coalescedRead(Btilde_v[sF](0)(spn) ); //color vector
  	      auto aa = coalescedRead(Atilde_v[sF](0)(spn) );
  	      sum = sum + outerProduct(bb,aa);
  	      //Flavor 1
  	      bb = coalescedRead(Btilde_v[sF](1)(spn) );
  	      aa = coalescedRead(Atilde_v[sF](1)(spn) );
  	      sum = sum + conjugate(outerProduct(bb,aa));
  	    }
  	  }	    
  	  coalescedWrite(mat_v[sU](mu)(), sum);
  	});
    }
  }
  // CPU version of the 5D force insertion.
  // Accumulates, per 4D outer site and over the Ls fifth-dimension slices, the spin
  // trace of outerProduct(Btilde, Atilde); the flavor (0,0) entry is added as is and
  // the flavor (1,1) entry is added complex-conjugated. The result is poked into
  // Lorentz component mu of mat.
  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
    int Ls = Btilde.Grid()->_fdimensions[0];
    GaugeLinkField tmp(mat.Grid());
    tmp = Zero();
    {
      // Views are scoped so they are released before PokeIndex touches mat.
      autoView( tmp_v , tmp, CpuWrite);
      autoView( Atilde_v , Atilde, CpuRead);
      autoView( Btilde_v , Btilde, CpuRead);
      thread_for(ss,tmp.Grid()->oSites(),{
 	  for (int s = 0; s < Ls; s++) {
 	    int sF = s + Ls * ss;
 	    auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
 	    tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
 	  }
 	});
    }
    PokeIndex<LorentzIndex>(mat, tmp, mu);
    return;
  }
 };
@@ -409,8 +339,8 @@ typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffReal> Gparit
 typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffReal> GparityWilsonImplF;  // Float
 typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffReal> GparityWilsonImplD;  // Double
-//typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplRL;  // Real.. whichever prec
+typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplRL;  // Real.. whichever prec
-//typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplFH;  // Float
+typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplFH;  // Float
-//typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplDF;  // Double
+typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplDF;  // Double
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
@@ -141,11 +141,8 @@ public:
  void ImportGauge(const GaugeField &_Uthin, const GaugeField &_Ufat);
  void ImportGaugeSimple(const GaugeField &_UUU    ,const GaugeField &_U);
  void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
-  virtual DoubledGaugeField &GetDoubledGaugeField(void)  override { return Umu; };
+  DoubledGaugeField &GetU(void)   { return Umu ; } ;
-  virtual DoubledGaugeField &GetDoubledGaugeFieldE(void) override { return UmuEven; };
+  DoubledGaugeField &GetUUU(void) { return UUUmu; };
  virtual DoubledGaugeField &GetDoubledGaugeFieldO(void) override { return UmuOdd; };
  virtual DoubledGaugeField &GetU(void)   { return Umu ; } ;
  virtual DoubledGaugeField &GetUUU(void) { return UUUmu; };
  void CopyGaugeCheckerboards(void);
  ///////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@@ -160,20 +160,17 @@ public:
 			       RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0,
 			     const ImplParams &p= ImplParams());
-  // DoubleStore gauge field in operator
+    // DoubleStore gauge field in operator
-  void ImportGauge      (const GaugeField &_Uthin ) { assert(0); }
+    void ImportGauge      (const GaugeField &_Uthin ) { assert(0); }
  void ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat);
-  void ImportGaugeSimple(const GaugeField &_UUU,const GaugeField &_U);
+    void ImportGaugeSimple(const GaugeField &_UUU,const GaugeField &_U);
-  void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
+    void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
-  // Give a reference; can be used to do an assignment or copy back out after import
+    // Give a reference; can be used to do an assignment or copy back out after import
-  // if Carleton wants to cache them and not use the ImportSimple
+    // if Carleton wants to cache them and not use the ImportSimple
-  virtual DoubledGaugeField &GetDoubledGaugeField(void)  override { return Umu; };
+    DoubledGaugeField &GetU(void)   { return Umu ; } ;
-  virtual DoubledGaugeField &GetDoubledGaugeFieldE(void) override { return UmuEven; };
+    DoubledGaugeField &GetUUU(void) { return UUUmu; };
-  virtual DoubledGaugeField &GetDoubledGaugeFieldO(void) override { return UmuOdd; };
+    void CopyGaugeCheckerboards(void);
-  DoubledGaugeField &GetU(void)   { return Umu ; } ;
+    
  DoubledGaugeField &GetUUU(void) { return UUUmu; };
  void CopyGaugeCheckerboards(void);
  ///////////////////////////////////////////////////////////////
  // Data members require to support the functionality
  ///////////////////////////////////////////////////////////////
@@ -211,7 +208,7 @@ public:
  LebesgueOrder LebesgueEvenOdd;
  // Comms buffer
-  //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
+  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
  ///////////////////////////////////////////////////////////////
  // Conserved current utilities
--- a/Grid/qcd/action/fermion/MADWF.h
+++ b/Grid/qcd/action/fermion/MADWF.h
@@ -85,7 +85,7 @@ class MADWF
      maxiter     =_maxiter;
    };
-  void operator() (const FermionFieldo &src,FermionFieldo &sol5)
+  void operator() (const FermionFieldo &src4,FermionFieldo &sol5)
  {
    std::cout << GridLogMessage<< " ************************************************" << std::endl;
    std::cout << GridLogMessage<< "  MADWF-like algorithm                           " << std::endl;
@@ -114,16 +114,8 @@ class MADWF
    ///////////////////////////////////////
    //Import source, include Dminus factors
    ///////////////////////////////////////
-    GridBase *src_grid = src.Grid();
+    Mato.ImportPhysicalFermionSource(src4,b); 
-
+    std::cout << GridLogMessage << " src4 " <<norm2(src4)<<std::endl;
    assert( (src_grid == Mato.GaugeGrid()) || (src_grid == Mato.FermionGrid()));
    if ( src_grid == Mato.GaugeGrid() ) {
      Mato.ImportPhysicalFermionSource(src,b);
    } else {
      b=src;
    }
    std::cout << GridLogMessage << " src " <<norm2(src)<<std::endl;
    std::cout << GridLogMessage << " b    " <<norm2(b)<<std::endl;
    defect = b;
--- a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
@@ -135,9 +135,6 @@ public:
  // DoubleStore impl dependent
  void ImportGauge      (const GaugeField &_U );
  DoubledGaugeField &GetDoubledGaugeField(void){ return Umu; };
  DoubledGaugeField &GetDoubledGaugeFieldE(void){ return UmuEven; };
  DoubledGaugeField &GetDoubledGaugeFieldO(void){ return UmuOdd; };
  DoubledGaugeField &GetU(void)   { return Umu ; } ;
  void CopyGaugeCheckerboards(void);
--- a/Grid/qcd/action/fermion/SchurFactoredFermionOperator.h
+++ b/Grid/qcd/action/fermion/SchurFactoredFermionOperator.h
@@ -1,534 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/SchurFactoredFermionOperator.h
    Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 #include <Grid/qcd/utils/MixedPrecisionOperatorFunction.h>
 #include <Grid/qcd/action/domains/Domains.h>
 NAMESPACE_BEGIN(Grid);
  ////////////////////////////////////////////////////////
  // Some explanation of class structure for domain decomposition:
  //
  // Need a dirichlet operator for two flavour determinant - acts on both Omega and OmegaBar.
  //
  // Possible gain if the global sums and CG are run independently?? Could measure this.
  //
  // Types of operations
  //
  // 1) assemble local det dOmega det dOmegaBar pseudofermion
  //
  // - DirichletFermionOperator - can either do a global solve, or independent/per cell coefficients.
  //
  // 2) assemble dOmegaInverse and dOmegaBarInverse in R
  //
  // - DirichletFermionOperator - can also be used to 
  //                                       - need two or more cells per node. Options
  //                                       - a) solve one cell at a time, no new code, CopyRegion and reduced /split Grids
  //                                       - b) solve multiple cells in parallel. predicated dslash implementation
  //
  //                                       - b) has more parallelism, experience with block solver suggests it might not be algorithmically inefficient
  //                                         a) has more cache friendly and easier code.
  //                                         b) is easy to implement in a "trial" or inefficient code with projection.
  //
  // 3)  Additional functionality for domain operations
  //
  // - SchurFactoredFermionOperator  - Need a DDHMC utility - whether used in two flavour or one flavour 
  //
  // - dBoundary - needs non-dirichlet operator
  // - Contains one Dirichlet Op, and one non-Dirichlet op. Implements dBoundary etc...
  // - The Dirichlet ops can be passed to dOmega(Bar) solvers etc...
  //
  ////////////////////////////////////////////////////////
 template<class ImplD,class ImplF>
 class SchurFactoredFermionOperator : public ImplD
 {
  INHERIT_IMPL_TYPES(ImplD);
  typedef typename ImplF::FermionField FermionFieldF;
  typedef typename ImplD::FermionField FermionFieldD;
  typedef SchurDiagMooeeOperator<FermionOperator<ImplD>,FermionFieldD> LinearOperatorD;
  typedef SchurDiagMooeeOperator<FermionOperator<ImplF>,FermionFieldF> LinearOperatorF;
  typedef SchurDiagMooeeDagOperator<FermionOperator<ImplD>,FermionFieldD> LinearOperatorDagD;
  typedef SchurDiagMooeeDagOperator<FermionOperator<ImplF>,FermionFieldF> LinearOperatorDagF;
  typedef MixedPrecisionConjugateGradientOperatorFunction<FermionOperator<ImplD>,
 							  FermionOperator<ImplF>,
 							  LinearOperatorD,
 							  LinearOperatorF> MxPCG;
  typedef MixedPrecisionConjugateGradientOperatorFunction<FermionOperator<ImplD>,
 							  FermionOperator<ImplF>,
 							  LinearOperatorDagD,
 							  LinearOperatorDagF> MxDagPCG;
 public:
  // Fermion grid, as reported by the periodic double precision operator.
  GridBase *FermionGrid(void)
  {
    return PeriodicFermOpD.FermionGrid();
  }
  // Gauge grid, as reported by the periodic double precision operator.
  GridBase *GaugeGrid(void)
  {
    return PeriodicFermOpD.GaugeGrid();
  }
  FermionOperator<ImplD> & DirichletFermOpD;
  FermionOperator<ImplF> & DirichletFermOpF;
  FermionOperator<ImplD> & PeriodicFermOpD; 
  FermionOperator<ImplF> & PeriodicFermOpF; 
  LinearOperatorD DirichletLinOpD;
  LinearOperatorF DirichletLinOpF;
  LinearOperatorD PeriodicLinOpD;
  LinearOperatorF PeriodicLinOpF;
  LinearOperatorDagD DirichletLinOpDagD;
  LinearOperatorDagF DirichletLinOpDagF;
  LinearOperatorDagD PeriodicLinOpDagD;
  LinearOperatorDagF PeriodicLinOpDagF;
  // Can tinker with these in the pseudofermion for force vs. action solves
  Integer maxinnerit;
  Integer maxouterit;
  RealD tol;
  RealD tolinner;
  Coordinate Block;
  DomainDecomposition Domains;
  // Binds the four externally owned operators (periodic / Dirichlet, double / single
  // precision), builds the Schur linear-operator wrappers around them and sets the
  // default solver controls.
  //
  //   _PeriodicFermOpD/F  : operators with the normal boundary conditions
  //   _DirichletFermOpD/F : operators fed with the Dirichlet-filtered gauge field
  //   _Block              : domain block size, also handed to the DomainDecomposition
  //
  // The mem-initializers are listed in member declaration order, which is the order
  // C++ actually initialises them in regardless of how the list is written. In
  // particular the operator references are bound before the linear operators that
  // wrap them, and Block is set before Domains(Block) reads it.
  SchurFactoredFermionOperator(FermionOperator<ImplD>  & _PeriodicFermOpD,
			       FermionOperator<ImplF>  & _PeriodicFermOpF,
			       FermionOperator<ImplD>  & _DirichletFermOpD,
			       FermionOperator<ImplF>  & _DirichletFermOpF,
			       Coordinate &_Block)
    : DirichletFermOpD(_DirichletFermOpD),
      DirichletFermOpF(_DirichletFermOpF),
      PeriodicFermOpD(_PeriodicFermOpD),
      PeriodicFermOpF(_PeriodicFermOpF),
      DirichletLinOpD(DirichletFermOpD),
      DirichletLinOpF(DirichletFermOpF),
      PeriodicLinOpD(PeriodicFermOpD),
      PeriodicLinOpF(PeriodicFermOpF),
      DirichletLinOpDagD(DirichletFermOpD),
      DirichletLinOpDagF(DirichletFermOpF),
      PeriodicLinOpDagD(PeriodicFermOpD),
      PeriodicLinOpDagF(PeriodicFermOpF),
      maxinnerit(1000),   // inner (single precision) iteration cap
      maxouterit(10),     // outer (double precision) restart cap
      tol(1.0e-10),       // outer solver tolerance
      tolinner(1.0e-6),   // inner solver tolerance
      Block(_Block),
      Domains(Block)
  {
    // Periodic and Dirichlet operators must live on the same fermion grid per precision.
    assert(PeriodicFermOpD.FermionGrid() == DirichletFermOpD.FermionGrid());
    assert(PeriodicFermOpF.FermionGrid() == DirichletFermOpF.FermionGrid());
  };
  enum Domain { Omega=0, OmegaBar=1 };
  void ImportGauge(const GaugeField &Umu)
  {
    // Single precision will update in the mixed prec CG
    PeriodicFermOpD.ImportGauge(Umu);
    GaugeField dUmu(Umu.Grid());
    dUmu=Umu;
    //    DirchletBCs(dUmu);
    DirichletFilter<GaugeField> Filter(Block);
    Filter.applyFilter(dUmu);
    DirichletFermOpD.ImportGauge(dUmu);
  }
 /*
  void ProjectBoundaryBothDomains (FermionField &f,int sgn)
  {
    assert((sgn==1)||(sgn==-1));
    Real rsgn = sgn;
    Gamma::Algebra Gmu [] = {
      Gamma::Algebra::GammaX,
      Gamma::Algebra::GammaY,
      Gamma::Algebra::GammaZ,
      Gamma::Algebra::GammaT
    };
    GridBase *grid = f.Grid();
    LatticeInteger  coor(grid);
    LatticeInteger  face(grid);
    LatticeInteger  one(grid); one = 1;
    LatticeInteger  zero(grid); zero = 0;
    LatticeInteger nface(grid); nface=Zero();
    FermionField projected(grid); projected=Zero();
    FermionField sp_proj  (grid);
    int dims = grid->Nd();
    int isDWF= (dims==Nd+1);
    assert((dims==Nd)||(dims==Nd+1));
    Coordinate Global=grid->GlobalDimensions();
    for(int mu=0;mu<Nd;mu++){
      if ( Block[mu] <= Global[mu+isDWF] ) {
 	// need to worry about DWF 5th dim first
 	LatticeCoordinate(coor,mu+isDWF); 
 	face = where(mod(coor,Block[mu]) == Integer(0),one,zero );
 	nface = nface + face;
 	Gamma G(Gmu[mu]);
 	// Lower face receives (1-gamma)/2 in normal forward hopping term
 	sp_proj  = 0.5*(f-G*f*rsgn);
 	projected= where(face,sp_proj,projected);
 	//projected= where(face,f,projected);
 	face = where(mod(coor,Block[mu]) == Integer(Block[mu]-1) ,one,zero );
 	nface = nface + face;
 	// Upper face receives (1+gamma)/2 in normal backward hopping term
 	sp_proj = 0.5*(f+G*f*rsgn);
 	projected= where(face,sp_proj,projected);
 	//projected= where(face,f,projected);
      }
    }
    // Initial Zero() where nface==0.
    // Keep the spin projected faces where nface==1
    // Full spinor where nface>=2
    projected = where(nface>Integer(1),f,projected);
    f=projected;
  }
 */
  // Replaces f, in place, by its restriction to the domain-boundary sites.
  //   f   : field to project (overwritten)
  //   sgn : must be +1 or -1; flips the sign of the gamma term in both spin projectors
  // Per site the result is:
  //   - zero                       if the site was flagged as a face in no direction,
  //   - the spin projected value   if it was flagged exactly once,
  //   - the full spinor f          if it was flagged more than once.
  void ProjectBoundaryBothDomains (FermionField &f,int sgn)
  {
    assert((sgn==1)||(sgn==-1));
    Real rsgn = sgn;
    Gamma::Algebra Gmu [] = {
      Gamma::Algebra::GammaX,
      Gamma::Algebra::GammaY,
      Gamma::Algebra::GammaZ,
      Gamma::Algebra::GammaT
    };
    GridBase *grid = f.Grid();
    // NOTE(review): coor is declared but not referenced again in this function.
    LatticeInteger  coor(grid);
    LatticeInteger  face(grid);
    LatticeInteger  one(grid);   one = 1;
    LatticeInteger  zero(grid); zero = 0;
    LatticeInteger  omega(grid);
    LatticeInteger  omegabar(grid);
    LatticeInteger  tmp(grid);
    // Integer masks of the two domains: start from all ones and let the domain
    // decomposition project them (presumably leaving 1 inside the domain and 0
    // outside - confirm against Domains.ProjectDomain).
    omega=one;    Domains.ProjectDomain(omega,0);
    omegabar=one; Domains.ProjectDomain(omegabar,1);
    // nface counts, per site, how many times the site has been flagged as a face.
    LatticeInteger nface(grid); nface=Zero();
    FermionField projected(grid); projected=Zero();
    FermionField sp_proj  (grid);
    int dims = grid->Nd();
    // A grid with Nd+1 dimensions is treated as 5D; its space-time axes then start at index 1.
    int isDWF= (dims==Nd+1);
    assert((dims==Nd)||(dims==Nd+1));
    Coordinate Global=grid->GlobalDimensions();
    for(int mmu=0;mmu<Nd;mmu++){
      Gamma G(Gmu[mmu]);
      // need to worry about DWF 5th dim first
      int mu = mmu+isDWF;
      // Skip directions with a zero block size or a block larger than the global extent.
      if ( Block[mmu] && (Block[mmu] <= Global[mu]) ) {
 	// Lower face receives (1-gamma)/2 in normal forward hopping term
 	// A site is flagged when it is in one domain and the mask of the other
 	// domain, shifted by -1 along mu, is also set there (sum of masks == 2).
 	tmp = Cshift(omegabar,mu,-1);
 	tmp = tmp + omega;
 	face = where(tmp == Integer(2),one,zero );
 	tmp = Cshift(omega,mu,-1);
 	tmp = tmp + omegabar;
 	face = where(tmp == Integer(2),one,face );
 	nface = nface + face;
 	sp_proj  = 0.5*(f-G*f*rsgn);
 	projected= where(face,sp_proj,projected);
 	// Upper face receives (1+gamma)/2 in normal backward hopping term
 	// Same test with the masks shifted by +1 along mu.
 	tmp = Cshift(omegabar,mu,1);
 	tmp = tmp + omega;
 	face = where(tmp == Integer(2),one,zero );
 	tmp = Cshift(omega,mu,1);
 	tmp = tmp + omegabar;
 	face = where(tmp == Integer(2),one,face );
 	nface = nface + face;
 	sp_proj = 0.5*(f+G*f*rsgn);
 	projected= where(face,sp_proj,projected);
      }
    }
    // Initial Zero() where nface==0.
    // Keep the spin projected faces where nface==1
    // Full spinor where nface>=2
    projected = where(nface>Integer(1),f,projected);
    f=projected;
  }
  // Projects f, in place, onto the given domain by delegating to the
  // DomainDecomposition member. `domain` is expected to be one of the Domain enum
  // values (Omega=0, OmegaBar=1).
  void ProjectDomain(FermionField &f,int domain)
  {
    // The block below is an earlier in-class implementation, kept for reference only:
    // it sums coordinate/Block over the directions and selects sites by the parity of
    // that sum.
/*
    GridBase *grid = f.Grid();
    int dims = grid->Nd();
    int isDWF= (dims==Nd+1);
    assert((dims==Nd)||(dims==Nd+1));
    FermionField zz(grid); zz=Zero();
    LatticeInteger coor(grid);
    LatticeInteger domaincb(grid); domaincb=Zero();
    for(int d=0;d<Nd;d++){
      LatticeCoordinate(coor,d+isDWF);
      domaincb = domaincb + div(coor,Block[d]);
    }
    f = where(mod(domaincb,2)==Integer(domain),f,zz);
*/
    Domains.ProjectDomain(f,domain);
  };
  // Projects f in place onto the OmegaBar domain.
  void ProjectOmegaBar(FermionField &f)
  {
    ProjectDomain(f, OmegaBar);
  }
  // Projects f in place onto the Omega domain.
  void ProjectOmega(FermionField &f)
  {
    ProjectDomain(f, Omega);
  }
  // See my notes(!).
  // Notation: Following Luscher, we introduce projectors $\hPdb$ with both spinor and space structure
  // projecting all spinor elements in $\Omega$ connected by $\Ddb$ to $\bar{\Omega}$,
  // Boundary projection (sign +1) followed by restriction to Omega, applied in place.
  void ProjectBoundaryBar(FermionField &f)
  {
    const int sign = 1;
    ProjectBoundaryBothDomains(f, sign);
    ProjectOmega(f);
  }
  // and $\hPd$ projecting all spinor elements in $\bar{\Omega}$ connected by $\Dd$ to $\Omega$.
  // Boundary projection (sign +1) followed by restriction to OmegaBar, applied in place.
  void ProjectBoundary(FermionField &f)
  {
    const int sign = 1;
    ProjectBoundaryBothDomains(f, sign);
    ProjectOmegaBar(f);
  }
  // out = P_Omega * PeriodicFermOpD.M * P_OmegaBar * in  (in is not modified)
  void dBoundary(FermionField &in, FermionField &out)
  {
    FermionField src(in);
    ProjectOmegaBar(src);
    PeriodicFermOpD.M(src, out);
    ProjectOmega(out);
  }
  // out = P_OmegaBar * PeriodicFermOpD.Mdag * P_Omega * in  (in is not modified)
  void dBoundaryDag(FermionField &in, FermionField &out)
  {
    FermionField src(in);
    ProjectOmega(src);
    PeriodicFermOpD.Mdag(src, out);
    ProjectOmegaBar(out);
  }
  // out = P_OmegaBar * PeriodicFermOpD.M * P_Omega * in  (in is not modified)
  void dBoundaryBar(FermionField &in, FermionField &out)
  {
    FermionField src(in);
    ProjectOmega(src);
    PeriodicFermOpD.M(src, out);
    ProjectOmegaBar(out);
  }
  // out = P_Omega * PeriodicFermOpD.Mdag * P_OmegaBar * in  (in is not modified)
  void dBoundaryBarDag(FermionField &in, FermionField &out)
  {
    FermionField src(in);
    ProjectOmegaBar(src);
    PeriodicFermOpD.Mdag(src, out);
    ProjectOmega(out);
  }
  // out = P_Omega * DirichletFermOpD.M * P_Omega * in  (in is not modified)
  void dOmega(FermionField &in, FermionField &out)
  {
    FermionField src(in);
    ProjectOmega(src);
    DirichletFermOpD.M(src, out);
    ProjectOmega(out);
  }
  // out = P_OmegaBar * DirichletFermOpD.M * P_OmegaBar * in  (in is not modified)
  void dOmegaBar(FermionField &in, FermionField &out)
  {
    FermionField src(in);
    ProjectOmegaBar(src);
    DirichletFermOpD.M(src, out);
    ProjectOmegaBar(out);
  }
  // out = P_Omega * DirichletFermOpD.Mdag * P_Omega * in  (in is not modified)
  void dOmegaDag(FermionField &in, FermionField &out)
  {
    FermionField src(in);
    ProjectOmega(src);
    DirichletFermOpD.Mdag(src, out);
    ProjectOmega(out);
  }
  // out = P_OmegaBar * DirichletFermOpD.Mdag * P_OmegaBar * in  (in is not modified)
  void dOmegaBarDag(FermionField &in, FermionField &out)
  {
    FermionField src(in);
    ProjectOmegaBar(src);
    DirichletFermOpD.Mdag(src, out);
    ProjectOmegaBar(out);
  }
  // Dirichlet solve restricted to Omega: project the source, solve, project the result.
  // Inefficient: the solve called here is the combined Omega + OmegaBar one.
  void dOmegaInv(FermionField &in, FermionField &out)
  {
    FermionField src(in);
    ProjectOmega(src);
    dOmegaInvAndOmegaBarInv(src, out);
    ProjectOmega(out);
  }
  // Dirichlet solve restricted to OmegaBar: project the source, run the combined
  // solve, project the result.
  void dOmegaBarInv(FermionField &in, FermionField &out)
  {
    FermionField src(in);
    ProjectOmegaBar(src);
    dOmegaInvAndOmegaBarInv(src, out);
    ProjectOmegaBar(out);
  }
  // Daggered Dirichlet solve restricted to Omega: project the source, run the
  // combined daggered solve, project the result.
  void dOmegaDagInv(FermionField &in, FermionField &out)
  {
    FermionField src(in);
    ProjectOmega(src);
    dOmegaDagInvAndOmegaBarDagInv(src, out);
    ProjectOmega(out);
  }
  // Daggered Dirichlet solve restricted to OmegaBar: project the source, run the
  // combined daggered solve, project the result.
  void dOmegaBarDagInv(FermionField &in, FermionField &out)
  {
    FermionField src(in);
    ProjectOmegaBar(src);
    dOmegaDagInvAndOmegaBarDagInv(src, out);
    ProjectOmegaBar(out);
  }
  // Solves with the Dirichlet operator on the whole lattice (both domains at once)
  // using the mixed precision CG wrapped in a red-black Schur solve.
  // Solver controls come from the tol / tolinner / maxinnerit / maxouterit members.
  void dOmegaInvAndOmegaBarInv(FermionField &in, FermionField &out)
  {
    auto rbGridF = DirichletFermOpF.FermionRedBlackGrid();
    MxPCG mixedCG(tol, tolinner,
                  maxinnerit, maxouterit,
                  rbGridF,
                  DirichletFermOpF, DirichletFermOpD,
                  DirichletLinOpF, DirichletLinOpD);
    SchurRedBlackDiagMooeeSolve<FermionField> schurSolve(mixedCG);
    schurSolve(DirichletFermOpD, in, out);
  }
  // Daggered counterpart of dOmegaInvAndOmegaBarInv: mixed precision CG built on the
  // daggered Dirichlet linear operators, wrapped in the daggered red-black Schur solve.
  void dOmegaDagInvAndOmegaBarDagInv(FermionField &in, FermionField &out)
  {
    auto rbGridF = DirichletFermOpF.FermionRedBlackGrid();
    MxDagPCG mixedDagCG(tol, tolinner,
                        maxinnerit, maxouterit,
                        rbGridF,
                        DirichletFermOpF, DirichletFermOpD,
                        DirichletLinOpDagF, DirichletLinOpDagD);
    SchurRedBlackDiagMooeeDagSolve<FermionField> schurDagSolve(mixedDagCG);
    schurDagSolve(DirichletFermOpD, in, out);
  }
  // Rdag = Pdbar - DdbarDag DomegabarDagInv  DdDag DomegaDagInv Pdbar 
  // Applies RDag to in. The boundary-projected input is pushed through
  // dOmegaDagInv, dBoundaryDag, dOmegaBarDagInv and dBoundaryBarDag in that order,
  // and the result is subtracted from the projected input.
  void RDag(FermionField &in, FermionField &out)
  {
    FermionField bufA(PeriodicFermOpD.FermionGrid());
    FermionField bufB(PeriodicFermOpD.FermionGrid());

    out = in;
    ProjectBoundaryBar(out);

    // Ping-pong between the two work fields through the operator chain.
    dOmegaDagInv(out, bufA);
    dBoundaryDag(bufA, bufB);
    dOmegaBarDagInv(bufB, bufA);
    dBoundaryBarDag(bufA, bufB);

    out = out - bufB;
  }
  // R = Pdbar - Pdbar DomegaInv Dd DomegabarInv Ddbar
  // Applies R to in. The boundary-projected input is pushed through dBoundaryBar,
  // dOmegaBarInv, dBoundary and dOmegaInv in that order; the result is subtracted
  // from the unprojected input and the difference is boundary-projected.
  void R(FermionField &in, FermionField &out)
  {
    FermionField bufA(PeriodicFermOpD.FermionGrid());
    FermionField bufB(PeriodicFermOpD.FermionGrid());

    out = in;
    ProjectBoundaryBar(out);

    // Ping-pong between the two work fields through the operator chain.
    dBoundaryBar(out, bufA);
    dOmegaBarInv(bufA, bufB);
    dBoundary(bufB, bufA);
    dOmegaInv(bufA, bufB);

    out = in - bufB;
    ProjectBoundaryBar(out);
  }
  // RInv = Pdbar - Pdbar Dinv Ddbar
  // Applies the inverse of R using the full (periodic) solve:
  // out = ProjectBoundaryBar( in - Dinverse( dBoundaryBar(in) ) ).
  void RInv(FermionField &in, FermionField &out)
  {
    FermionField solved(PeriodicFermOpD.FermionGrid());

    // out is used as scratch for the boundary operator before receiving the result.
    dBoundaryBar(in, out);
    Dinverse(out, solved);

    out = in - solved;
    ProjectBoundaryBar(out);
  }
  // RDagInv = Pdbar - DdbarDag DinvDag Pdbar
  // Applies the inverse of RDag using the full (periodic) daggered solve:
  // with p = ProjectBoundaryBar(in), out = p - dBoundaryBarDag( DinverseDag(p) ).
  void RDagInv(FermionField &in, FermionField &out)
  {
    FermionField correction(PeriodicFermOpD.FermionGrid());
    FermionField projectedIn(PeriodicFermOpD.FermionGrid());

    projectedIn = in;
    ProjectBoundaryBar(projectedIn);

    // out is used as scratch for the solve before receiving the result.
    DinverseDag(projectedIn, out);
    dBoundaryBarDag(out, correction);

    out = projectedIn - correction;
  }
  // Non-dirichlet inverter using red-black preconditioning
  // Solve with the periodic (non-Dirichlet) operator: mixed precision CG wrapped in
  // a red-black Schur solve, using the class-level tolerances and iteration caps.
  void Dinverse(FermionField &in, FermionField &out)
  {
    auto rbGridF = PeriodicFermOpF.FermionRedBlackGrid();
    MxPCG mixedCG(tol, tolinner,
                  maxinnerit, maxouterit,
                  rbGridF,
                  PeriodicFermOpF, PeriodicFermOpD,
                  PeriodicLinOpF, PeriodicLinOpD);
    SchurRedBlackDiagMooeeSolve<FermionField> schurSolve(mixedCG);
    schurSolve(PeriodicFermOpD, in, out);
  }
  // Daggered counterpart of Dinverse: mixed precision CG on the daggered periodic
  // linear operators, wrapped in the daggered red-black Schur solve.
  void DinverseDag(FermionField &in, FermionField &out)
  {
    auto rbGridF = PeriodicFermOpF.FermionRedBlackGrid();
    MxDagPCG mixedDagCG(tol, tolinner,
                        maxinnerit, maxouterit,
                        rbGridF,
                        PeriodicFermOpF, PeriodicFermOpD,
                        PeriodicLinOpDagF, PeriodicLinOpDagD);
    SchurRedBlackDiagMooeeDagSolve<FermionField> schurDagSolve(mixedDagCG);
    schurDagSolve(PeriodicFermOpD, in, out);
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/StaggeredImpl.h
+++ b/Grid/qcd/action/fermion/StaggeredImpl.h
@@ -72,23 +72,19 @@ public:
  StaggeredImpl(const ImplParams &p = ImplParams()) : Params(p){};
-  template<class _Spinor>
+  static accelerator_inline void multLink(SiteSpinor &phi,
  static accelerator_inline void multLink(_Spinor &phi,
 		       const SiteDoubledGaugeField &U,
-		       const _Spinor &chi,
+		       const SiteSpinor &chi,
 		       int mu)
  {
-    auto UU = coalescedRead(U(mu));
+    mult(&phi(), &U(mu), &chi());
    mult(&phi(), &UU, &chi());
  }
-  template<class _Spinor>
+  static accelerator_inline void multLinkAdd(SiteSpinor &phi,
  static accelerator_inline void multLinkAdd(_Spinor &phi,
 			  const SiteDoubledGaugeField &U,
-			  const _Spinor &chi,
+			  const SiteSpinor &chi,
 			  int mu)
  {
-    auto UU = coalescedRead(U(mu));
+    mac(&phi(), &U(mu), &chi());
    mac(&phi(), &UU, &chi());
  }
  template <class ref>
--- a/Grid/qcd/action/fermion/StaggeredKernels.h
+++ b/Grid/qcd/action/fermion/StaggeredKernels.h
@@ -63,20 +63,17 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
   ///////////////////////////////////////////////////////////////////////////////////////
   // Generic Nc kernels
   ///////////////////////////////////////////////////////////////////////////////////////
-   template<int Naik> 
+   template<int Naik> accelerator_inline
   static accelerator_inline
   void DhopSiteGeneric(StencilView &st, 
 			DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
 			const FermionFieldView &in, FermionFieldView &out,int dag);
-   
+   template<int Naik> accelerator_inline
   template<int Naik> static accelerator_inline
   void DhopSiteGenericInt(StencilView &st, 
 			   DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 			   SiteSpinor * buf, int LLs, int sU, 
 			   const FermionFieldView &in, FermionFieldView &out,int dag);
-   
+   template<int Naik> accelerator_inline
   template<int Naik> static accelerator_inline
   void DhopSiteGenericExt(StencilView &st, 
 			   DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 			   SiteSpinor * buf, int LLs, int sU, 
@@ -85,20 +82,17 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
   ///////////////////////////////////////////////////////////////////////////////////////
   // Nc=3 specific kernels
   ///////////////////////////////////////////////////////////////////////////////////////
-   
+   template<int Naik> accelerator_inline
   template<int Naik> static accelerator_inline
   void DhopSiteHand(StencilView &st, 
 		     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 		     SiteSpinor * buf, int LLs, int sU, 
 		     const FermionFieldView &in, FermionFieldView &out,int dag);
-   
+   template<int Naik> accelerator_inline
   template<int Naik> static accelerator_inline
   void DhopSiteHandInt(StencilView &st, 
 			DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
 			const FermionFieldView &in, FermionFieldView &out,int dag);
-   
+   template<int Naik> accelerator_inline
   template<int Naik> static accelerator_inline
   void DhopSiteHandExt(StencilView &st, 
 			DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
@@ -107,7 +101,6 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
   ///////////////////////////////////////////////////////////////////////////////////////
   // Asm Nc=3 specific kernels
   ///////////////////////////////////////////////////////////////////////////////////////
   void DhopSiteAsm(StencilView &st, 
 		    DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 		    SiteSpinor * buf, int LLs, int sU, 
--- a/Grid/qcd/action/fermion/WilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h
@@ -245,7 +245,7 @@ public:
    return out;
  }
-protected:
+private:
  // here fixing the 4 dimensions, make it more general?
  RealD csw_r;                                               // Clover coefficient - spatial
--- a/Grid/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@@ -61,19 +61,18 @@ public:
  typedef typename SiteHalfSpinor::vector_type     vComplexHigh;
  constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);
-  accelerator_inline int CommDatumSize(void) const {
+  accelerator_inline int CommDatumSize(void) {
    return sizeof(SiteHalfCommSpinor);
  }
  /*****************************************************/
  /* Compress includes precision change if mpi data is not same */
  /*****************************************************/
-  accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const {
+  template<class _SiteHalfSpinor, class _SiteSpinor>
-    typedef decltype(coalescedRead(buf)) sobj;
+  accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) {
-    sobj sp;
+    _SiteHalfSpinor tmp;
-    auto sin = coalescedRead(in);
+    projector::Proj(tmp,in,mu,dag);
-    projector::Proj(sp,sin,mu,dag);
+    vstream(buf[o],tmp);
    coalescedWrite(buf,sp);
  }
  /*****************************************************/
@@ -82,24 +81,19 @@ public:
  accelerator_inline void Exchange(SiteHalfSpinor *mp,
 				   const SiteHalfSpinor * __restrict__ vp0,
 				   const SiteHalfSpinor * __restrict__ vp1,
-				   Integer type,Integer o) const {
+				   Integer type,Integer o){
 #ifdef GRID_SIMT
    exchangeSIMT(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type);
 #else
    SiteHalfSpinor tmp1;
    SiteHalfSpinor tmp2;
    exchange(tmp1,tmp2,vp0[o],vp1[o],type);
    vstream(mp[2*o  ],tmp1);
    vstream(mp[2*o+1],tmp2);
 #endif
  }
  /*****************************************************/
  /* Have a decompression step if mpi data is not same */
  /*****************************************************/
  accelerator_inline void Decompress(SiteHalfSpinor * __restrict__ out,
-				     SiteHalfSpinor * __restrict__ in, Integer o) const {    
+				     SiteHalfSpinor * __restrict__ in, Integer o) {    
    assert(0);
  }
@@ -109,30 +103,8 @@ public:
  accelerator_inline void CompressExchange(SiteHalfSpinor * __restrict__ out0,
 					   SiteHalfSpinor * __restrict__ out1,
 					   const SiteSpinor * __restrict__ in,
-					   Integer j,Integer k, Integer m,Integer type) const
+					   Integer j,Integer k, Integer m,Integer type)
  {
 #ifdef GRID_SIMT
    typedef SiteSpinor vobj;
    typedef SiteHalfSpinor hvobj;
    typedef decltype(coalescedRead(*in))    sobj;
    typedef decltype(coalescedRead(*out0)) hsobj;
    unsigned int Nsimd = vobj::Nsimd();
    unsigned int mask = Nsimd >> (type + 1);
    int lane = acceleratorSIMTlane(Nsimd);
    int j0 = lane &(~mask); // inner coor zero
    int j1 = lane |(mask) ; // inner coor one
    const vobj *vp0 = &in[k];
    const vobj *vp1 = &in[m];
    const vobj *vp = (lane&mask) ? vp1:vp0;
    auto sa = coalescedRead(*vp,j0);
    auto sb = coalescedRead(*vp,j1);
    hsobj psa, psb;
    projector::Proj(psa,sa,mu,dag);
    projector::Proj(psb,sb,mu,dag);
    coalescedWrite(out0[j],psa);
    coalescedWrite(out1[j],psb);
 #else
    SiteHalfSpinor temp1, temp2;
    SiteHalfSpinor temp3, temp4;
    projector::Proj(temp1,in[k],mu,dag);
@@ -140,17 +112,15 @@ public:
    exchange(temp3,temp4,temp1,temp2,type);
    vstream(out0[j],temp3);
    vstream(out1[j],temp4);
 #endif
  }
  /*****************************************************/
  /* Pass the info to the stencil */
  /*****************************************************/
-  accelerator_inline bool DecompressionStep(void) const { return false; }
+  accelerator_inline bool DecompressionStep(void) { return false; }
 };
 #if 0
 template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
 class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
 				typename std::enable_if<!std::is_same<_HCspinor,_Hspinor>::value>::type >
@@ -172,30 +142,20 @@ public:
  typedef typename SiteHalfSpinor::vector_type     vComplexHigh;
  constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);
-  accelerator_inline int CommDatumSize(void) const {
+  accelerator_inline int CommDatumSize(void) {
    return sizeof(SiteHalfCommSpinor);
  }
  /*****************************************************/
  /* Compress includes precision change if mpi data is not same */
  /*****************************************************/
-  accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const {
+  template<class _SiteHalfSpinor, class _SiteSpinor>
-    SiteHalfSpinor hsp;
+  accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) {
    _SiteHalfSpinor hsp;
    SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf;
    projector::Proj(hsp,in,mu,dag);
    precisionChange((vComplexLow *)&hbuf[o],(vComplexHigh *)&hsp,Nw);
  }
  accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const {
 #ifdef GRID_SIMT
    typedef decltype(coalescedRead(buf)) sobj;
    sobj sp;
    auto sin = coalescedRead(in);
    projector::Proj(sp,sin,mu,dag);
    coalescedWrite(buf,sp);
 #else
    projector::Proj(buf,in,mu,dag);
 #endif
  }
  /*****************************************************/
  /* Exchange includes precision change if mpi data is not same */
@@ -203,7 +163,7 @@ public:
  accelerator_inline void Exchange(SiteHalfSpinor *mp,
                       SiteHalfSpinor *vp0,
                       SiteHalfSpinor *vp1,
-		       Integer type,Integer o) const {
+		       Integer type,Integer o){
    SiteHalfSpinor vt0,vt1;
    SiteHalfCommSpinor *vpp0 = (SiteHalfCommSpinor *)vp0;
    SiteHalfCommSpinor *vpp1 = (SiteHalfCommSpinor *)vp1;
@@ -215,7 +175,7 @@ public:
  /*****************************************************/
  /* Have a decompression step if mpi data is not same */
  /*****************************************************/
-  accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o) const {
+  accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o){
    SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in;
    precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw);
  }
@@ -226,7 +186,7 @@ public:
  accelerator_inline void CompressExchange(SiteHalfSpinor *out0,
 			       SiteHalfSpinor *out1,
 			       const SiteSpinor *in,
-			       Integer j,Integer k, Integer m,Integer type) const {
+			       Integer j,Integer k, Integer m,Integer type){
    SiteHalfSpinor temp1, temp2,temp3,temp4;
    SiteHalfCommSpinor *hout0 = (SiteHalfCommSpinor *)out0;
    SiteHalfCommSpinor *hout1 = (SiteHalfCommSpinor *)out1;
@@ -240,10 +200,9 @@ public:
  /*****************************************************/
  /* Pass the info to the stencil */
  /*****************************************************/
-  accelerator_inline bool DecompressionStep(void) const { return true; }
+  accelerator_inline bool DecompressionStep(void) { return true; }
 };
 #endif
 #define DECLARE_PROJ(Projector,Compressor,spProj)			\
  class Projector {							\
@@ -294,8 +253,33 @@ public:
  typedef typename Base::View_type View_type;
  typedef typename Base::StencilVector StencilVector;
-  void ZeroCountersi(void)  {  }
+  double timer0;
-  void Reporti(int calls)  {  }
+  double timer1;
  double timer2;
  double timer3;
  double timer4;
  double timer5;
  double timer6;
  uint64_t callsi;
  void ZeroCountersi(void)
  {
    timer0=0;
    timer1=0;
    timer2=0;
    timer3=0;
    timer4=0;
    timer5=0;
    timer6=0;
    callsi=0;
  }
  void Reporti(int calls)
  {
    if ( timer0 ) std::cout << GridLogMessage << " timer0 (HaloGatherOpt) " <<timer0/calls <<std::endl;
    if ( timer1 ) std::cout << GridLogMessage << " timer1 (Communicate)   " <<timer1/calls <<std::endl;
    if ( timer2 ) std::cout << GridLogMessage << " timer2 (CommsMerge )   " <<timer2/calls <<std::endl;
    if ( timer3 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " <<timer3/calls <<std::endl;
    if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
  }
  std::vector<int> surface_list;
@@ -303,11 +287,9 @@ public:
 		int npoints,
 		int checkerboard,
 		const std::vector<int> &directions,
-		const std::vector<int> &distances,
+		const std::vector<int> &distances,Parameters p)  
-		bool locally_periodic,
+    : CartesianStencil<vobj,cobj,Parameters> (grid,npoints,checkerboard,directions,distances,p) 
-		Parameters p)  
+  { 
    : CartesianStencil<vobj,cobj,Parameters> (grid,npoints,checkerboard,directions,distances,locally_periodic,p)
  {
    ZeroCountersi();
    surface_list.resize(0);
    this->same_node.resize(npoints);
@@ -339,18 +321,26 @@ public:
  {
    std::vector<std::vector<CommsRequest_t> > reqs;
    this->HaloExchangeOptGather(source,compress);
    double t1=usecond();
    // Asynchronous MPI calls multidirectional, Isend etc...
    // Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways.
    this->Communicate();
    double t2=usecond(); timer1 += t2-t1;
    this->CommsMerge(compress);
    double t3=usecond(); timer2 += t3-t2;
    this->CommsMergeSHM(compress);
    double t4=usecond(); timer3 += t4-t3;
  }
  template <class compressor>
  void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress) 
  {
    this->Prepare();
    double t0=usecond();
    this->HaloGatherOpt(source,compress);
    double t1=usecond();
    timer0 += t1-t0;
    callsi++;
  }
  template <class compressor>
@@ -362,9 +352,12 @@ public:
    typedef typename compressor::SiteHalfSpinor     SiteHalfSpinor;
    typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor;
    this->mpi3synctime_g-=usecond();
    this->_grid->StencilBarrier();
    this->mpi3synctime_g+=usecond();
    assert(source.Grid()==this->_grid);
    this->halogtime-=usecond();
    this->u_comm_offset=0;
@@ -400,6 +393,7 @@ public:
    }
    this->face_table_computed=1;
    assert(this->u_comm_offset==this->_unified_buffer_size);
    this->halogtime+=usecond();
    accelerator_barrier();
  }
--- a/Grid/qcd/action/fermion/WilsonFermion.h
+++ b/Grid/qcd/action/fermion/WilsonFermion.h
@@ -50,14 +50,14 @@ public:
  double, nu);
  WilsonAnisotropyCoefficients():
-    isAnisotropic(false),
+    isAnisotropic(false), 
-    t_direction(Nd-1),
+    t_direction(Nd-1), 
-    xi_0(1.0),
+    xi_0(1.0), 
    nu(1.0){}
 };
 template <class Impl>
-class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic
+class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic 
 {
 public:
  INHERIT_IMPL_TYPES(Impl);
@@ -74,20 +74,6 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }
  void Report(void);
  void ZeroCounters(void);
  double DhopCalls;
  double DhopCommTime;
  double DhopComputeTime;
  double DhopComputeTime2;
  double DhopFaceTime;
  double DhopTotalTime;
  double DerivCalls;
  double DerivCommTime;
  double DerivComputeTime;
  double DerivDhopComputeTime;
  //////////////////////////////////////////////////////////////////
  // override multiply; cut number routines if pass dagger argument
  // and also make interface more uniformly consistent
@@ -146,24 +132,18 @@ public:
  void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
                    const FermionField &in, FermionField &out, int dag);
  void DhopInternalDirichletComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
 				  const FermionField &in, FermionField &out, int dag);
  void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-				   const FermionField &in, FermionField &out, int dag);
+                    const FermionField &in, FermionField &out, int dag);
  // Constructor
  WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
                GridRedBlackCartesian &Hgrid, RealD _mass,
-                const ImplParams &p = ImplParams(),
+                const ImplParams &p = ImplParams(), 
                const WilsonAnisotropyCoefficients &anis = WilsonAnisotropyCoefficients() );
  // DoubleStore impl dependent
  void ImportGauge(const GaugeField &_Umu);
-  DoubledGaugeField &GetDoubledGaugeField(void){ return Umu; };
+
  DoubledGaugeField &GetDoubledGaugeFieldE(void){ return UmuEven; };
  DoubledGaugeField &GetDoubledGaugeFieldO(void){ return UmuOdd; };
  ///////////////////////////////////////////////////////////////
  // Data members require to support the functionality
  ///////////////////////////////////////////////////////////////
@@ -190,9 +170,9 @@ public:
  LebesgueOrder Lebesgue;
  LebesgueOrder LebesgueEvenOdd;
-
+  
  WilsonAnisotropyCoefficients anisotropyCoeff;
-
+  
  ///////////////////////////////////////////////////////////////
  // Conserved current utilities
  ///////////////////////////////////////////////////////////////
@@ -206,7 +186,7 @@ public:
                           PropagatorField &q_out,
                           PropagatorField &phys_src,
                           Current curr_type,
-                           unsigned int mu,
+                           unsigned int mu, 
                           unsigned int tmin,
 			   unsigned int tmax,
 			   ComplexField &lattice_cmplx);
@@ -216,3 +196,5 @@ typedef WilsonFermion<WilsonImplF> WilsonFermionF;
 typedef WilsonFermion<WilsonImplD> WilsonFermionD;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@@ -165,14 +165,7 @@ public:
 			       const FermionField &in, 
 			       FermionField &out,
 			       int dag);
-
+    
  void DhopInternalDirichletComms(StencilImpl & st,
 				  LebesgueOrder &lo,
 				  DoubledGaugeField &U,
 				  const FermionField &in, 
 				  FermionField &out,
 				  int dag);
  // Constructors
  WilsonFermion5D(GaugeField &_Umu,
 		  GridCartesian         &FiveDimGrid,
@@ -181,11 +174,19 @@ public:
 		  GridRedBlackCartesian &FourDimRedBlackGrid,
 		  double _M5,const ImplParams &p= ImplParams());
  // Constructors
  /*
    WilsonFermion5D(int simd, 
    GaugeField &_Umu,
    GridCartesian         &FiveDimGrid,
    GridRedBlackCartesian &FiveDimRedBlackGrid,
    GridCartesian         &FourDimGrid,
    double _M5,const ImplParams &p= ImplParams());
  */
  // DoubleStore
  void ImportGauge(const GaugeField &_Umu);
-  DoubledGaugeField &GetDoubledGaugeField(void){ return Umu; };
+    
  DoubledGaugeField &GetDoubledGaugeFieldE(void){ return UmuEven; };
  DoubledGaugeField &GetDoubledGaugeFieldO(void){ return UmuOdd; };
  ///////////////////////////////////////////////////////////////
  // Data members require to support the functionality
  ///////////////////////////////////////////////////////////////
@@ -214,7 +215,7 @@ public:
  LebesgueOrder LebesgueEvenOdd;
  // Comms buffer
-  //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
+  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
 };
--- a/Grid/qcd/action/fermion/WilsonImpl.h
+++ b/Grid/qcd/action/fermion/WilsonImpl.h
@@ -72,7 +72,7 @@ public:
  typedef WilsonCompressor<SiteHalfCommSpinor,SiteHalfSpinor, SiteSpinor> Compressor;
  typedef WilsonImplParams ImplParams;
  typedef WilsonStencil<SiteSpinor, SiteHalfSpinor,ImplParams> StencilImpl;
-  typedef const typename StencilImpl::View_type StencilView;
+  typedef typename StencilImpl::View_type StencilView;
  ImplParams Params;
@@ -106,15 +106,11 @@ public:
 			    const _SpinorField & phi,
 			    int mu)
  {
    const int Nsimd = SiteHalfSpinor::Nsimd();
    autoView( out_v, out, AcceleratorWrite);
    autoView( phi_v, phi, AcceleratorRead);
    autoView( Umu_v, Umu, AcceleratorRead);
-    typedef decltype(coalescedRead(out_v[0]))   calcSpinor;
+    accelerator_for(sss,out.Grid()->oSites(),1,{
-    accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
+	multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu);
 	calcSpinor tmp;
 	multLink(tmp,Umu_v[sss],phi_v(sss),mu);
 	coalescedWrite(out_v[sss],tmp);
    });
  }
@@ -184,22 +180,18 @@ public:
      mat = TraceIndex<SpinIndex>(P); 
    }
-    inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds)
+    inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
    {
      for (int mu = 0; mu < Nd; mu++)
      mat[mu] = PeekIndex<LorentzIndex>(Uds, mu);
    }
-  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu)
+
-  {
+  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
-#undef USE_OLD_INSERT_FORCE    
+      
    int Ls=Btilde.Grid()->_fdimensions[0];
    autoView( mat_v , mat, AcceleratorWrite);
 #ifdef USE_OLD_INSERT_FORCE    
    GaugeLinkField tmp(mat.Grid());
    tmp = Zero();
    {
      const int Nsimd = SiteSpinor::Nsimd();
      autoView( tmp_v , tmp, AcceleratorWrite);
      autoView( Btilde_v , Btilde, AcceleratorRead);
      autoView( Atilde_v , Atilde, AcceleratorRead);
@@ -212,29 +204,6 @@ public:
 	});
    }
    PokeIndex<LorentzIndex>(mat,tmp,mu);
 #else
    {
      const int Nsimd = SiteSpinor::Nsimd();
      autoView( Btilde_v , Btilde, AcceleratorRead);
      autoView( Atilde_v , Atilde, AcceleratorRead);
      accelerator_for(sss,mat.Grid()->oSites(),Nsimd,{
 	  int sU=sss;
  	  typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
  	  ColorMatrixType sum;
 	  zeroit(sum);  
 	  for(int s=0;s<Ls;s++){
 	    int sF = s+Ls*sU;
  	    for(int spn=0;spn<Ns;spn++){ //sum over spin
  	      auto bb = coalescedRead(Btilde_v[sF]()(spn) ); //color vector
  	      auto aa = coalescedRead(Atilde_v[sF]()(spn) );
 	      auto op = outerProduct(bb,aa);
  	      sum = sum + op;
 	    }
 	  }
  	  coalescedWrite(mat_v[sU](mu)(), sum);
      });
    }
 #endif    
  }
 };
@@ -243,17 +212,17 @@ typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffReal > WilsonImplR
 typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffReal > WilsonImplF;  // Float
 typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffReal > WilsonImplD;  // Double
-//typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffRealHalfComms > WilsonImplRL;  // Real.. whichever prec
+typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffRealHalfComms > WilsonImplRL;  // Real.. whichever prec
-//typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplFH;  // Float
+typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplFH;  // Float
-//typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplDF;  // Double
+typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplDF;  // Double
 typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffComplex > ZWilsonImplR; // Real.. whichever prec
 typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplex > ZWilsonImplF; // Float
 typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplex > ZWilsonImplD; // Double
-//typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplRL; // Real.. whichever prec
+typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplRL; // Real.. whichever prec
-//typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplFH; // Float
+typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplFH; // Float
-//typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplDF; // Double
+typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplDF; // Double
 typedef WilsonImpl<vComplex,  AdjointRepresentation, CoeffReal > WilsonAdjImplR;   // Real.. whichever prec
 typedef WilsonImpl<vComplexF, AdjointRepresentation, CoeffReal > WilsonAdjImplF;  // Float
--- a/Grid/qcd/action/fermion/WilsonKernels.h
+++ b/Grid/qcd/action/fermion/WilsonKernels.h
@@ -39,7 +39,7 @@ NAMESPACE_BEGIN(Grid);
 class WilsonKernelsStatic { 
 public:
  enum { OptGeneric, OptHandUnroll, OptInlineAsm };
-  enum { CommsAndCompute, CommsThenCompute, CommsDirichlet };
+  enum { CommsAndCompute, CommsThenCompute };
  static int Opt;  
  static int Comms;
 };
@@ -49,17 +49,9 @@ public:
  INHERIT_IMPL_TYPES(Impl);
  typedef FermionOperator<Impl> Base;
-  typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;   
+   
 public:
 #ifdef GRID_SYCL
 #define SYCL_HACK
 #endif  
 #ifdef SYCL_HACK
  static void HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor  *buf,
 			       int ss,int sU,const SiteSpinor *in, SiteSpinor *out);
 #endif
  static void DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 			 int Ls, int Nsite, const FermionField &in, FermionField &out,
 			 int interior=1,int exterior=1) ;
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -112,6 +112,7 @@ void CayleyFermion5D<Impl>::ImportUnphysicalFermion(const FermionField &input4d,
  axpby_ssp_pminus(tmp, 0., tmp, 1., tmp, Ls-1, Ls-1);
  imported5d=tmp;
 }
 template<class Impl>  
 void CayleyFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
 {
@@ -126,37 +127,6 @@ void CayleyFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &inpu
  axpby_ssp_pminus(tmp, 0., tmp, 1., tmp, Ls-1, Ls-1);
  Dminus(tmp,imported5d);
 }
 ////////////////////////////////////////////////////
 // Added for fourD pseudofermion det estimation
 ////////////////////////////////////////////////////
 template<class Impl>  
 void CayleyFermion5D<Impl>::ImportFourDimPseudoFermion(const FermionField &input4d,FermionField &imported5d)
 {
  int Ls = this->Ls;
  FermionField tmp(this->FermionGrid());
  conformable(imported5d.Grid(),this->FermionGrid());
  conformable(input4d.Grid()   ,this->GaugeGrid());
  tmp = Zero();
  InsertSlice(input4d, tmp, 0   , 0);
  InsertSlice(input4d, tmp, Ls-1, 0);
  axpby_ssp_pminus(tmp, 0., tmp, 1., tmp, 0, 0);
  axpby_ssp_pplus (tmp, 0., tmp, 1., tmp, Ls-1, Ls-1);
  imported5d=tmp;
 }
 template<class Impl>  
 void CayleyFermion5D<Impl>::ExportFourDimPseudoFermion(const FermionField &solution5d,FermionField &exported4d)
 {
  int Ls = this->Ls;
  FermionField tmp(this->FermionGrid());
  tmp = solution5d;
  conformable(solution5d.Grid(),this->FermionGrid());
  conformable(exported4d.Grid(),this->GaugeGrid());
  axpby_ssp_pminus(tmp, 0., solution5d, 1., solution5d, 0, 0);
  axpby_ssp_pplus (tmp, 1., tmp       , 1., solution5d, 0, Ls-1);
  ExtractSlice(exported4d, tmp, 0, 0);
 }
 // Dminus
 template<class Impl>  
 void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
 {
@@ -672,7 +642,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
 						      Current curr_type,
 						      unsigned int mu)
 {
-#if (!defined(GRID_HIP))
+#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
@@ -829,7 +799,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  PropagatorField tmp(UGrid);
  PropagatorField Utmp(UGrid);
-  PropagatorField zz (UGrid);   zz=0.0;
+  LatticeInteger zz (UGrid);   zz=0.0;
  LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
  for (int s=0;s<Ls;s++) {
@@ -856,7 +826,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  }
 #endif
-#if (!defined(GRID_HIP))
+#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
  int tshift = (mu == Nd-1) ? 1 : 0;
  ////////////////////////////////////////////////
  // GENERAL CAYLEY CASE
@@ -880,7 +850,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  PropagatorField tmp(UGrid);
  PropagatorField Utmp(UGrid);
-  PropagatorField  zz (UGrid);   zz=0.0;
+  LatticeInteger zz (UGrid);   zz=0.0;
  LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
  for(int s=0;s<Ls;s++){
@@ -910,29 +880,17 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  }
  std::vector<RealD> G_s(Ls,1.0);
  RealD sign = 1; // sign flip for vector/tadpole
  if ( curr_type == Current::Axial ) {
    for(int s=0;s<Ls/2;s++){
      G_s[s] = -1.0;
    }
  }
  else if ( curr_type == Current::Tadpole ) {
    auto b=this->_b;
    auto c=this->_c;
    if ( b == 1 && c == 0 ) {
      sign = -1;    
    }
    else {
      std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl;
      assert(b==1 && c==0);
    }
  }
  for(int s=0;s<Ls;s++){
    int sp = (s+1)%Ls;
-    //    int sr = Ls-1-s;
+    int sr = Ls-1-s;
-    //    int srp= (sr+1)%Ls;
+    int srp= (sr+1)%Ls;
    // Mobius parameters
    auto b=this->bs[s];
@@ -949,7 +907,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
    tmp    = Cshift(tmp,mu,1);
    Impl::multLinkField(Utmp,this->Umu,tmp,mu);
-    tmp    = sign*G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop
+    tmp    = G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop
    tmp    = where((lcoor>=tmin),tmp,zz); // Mask the time 
    L_Q    = where((lcoor<=tmax),tmp,zz); // Position of current complicated
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h
@@ -680,8 +680,7 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilView &st,
  gauge2 =(uint64_t)&UU[sU]( Z );				\
  gauge3 =(uint64_t)&UU[sU]( T ); 
-#undef STAG_VEC5D
+
 #ifdef STAG_VEC5D
  // This is the single precision 5th direction vectorised kernel
 #include <Grid/simd/Intel512single.h>
 template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView &st,
@@ -791,7 +790,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView
 #endif
 }
-#endif   
+   
 #define PERMUTE_DIR3 __asm__ (	\
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h
@@ -32,50 +32,25 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
-#ifdef GRID_SIMT
+#define LOAD_CHI(b)		\
 #define LOAD_CHI(ptype,b)			\
  const SiteSpinor & ref (b[offset]);				\
  Chi_0=coalescedReadPermute<ptype>(ref()()(0),perm,lane);	\
  Chi_1=coalescedReadPermute<ptype>(ref()()(1),perm,lane);	\
  Chi_2=coalescedReadPermute<ptype>(ref()()(2),perm,lane);
 #define LOAD_CHI_COMMS(b)		\
  const SiteSpinor & ref (b[offset]);	\
-  Chi_0=coalescedRead(ref()()(0),lane);	\
+    Chi_0=ref()()(0);\
-  Chi_1=coalescedRead(ref()()(1),lane);	\
+    Chi_1=ref()()(1);\
-  Chi_2=coalescedRead(ref()()(2),lane);
+    Chi_2=ref()()(2);
 #define PERMUTE_DIR(dir)	;
 #else
 #define LOAD_CHI(ptype,b)      LOAD_CHI_COMMS(b)
 #define LOAD_CHI_COMMS(b)		\
  const SiteSpinor & ref (b[offset]);	\
  Chi_0=ref()()(0);			\
  Chi_1=ref()()(1);			\
  Chi_2=ref()()(2);
 #define PERMUTE_DIR(dir)			\
  permute##dir(Chi_0,Chi_0);			\
  permute##dir(Chi_1,Chi_1);			\
  permute##dir(Chi_2,Chi_2);
 #endif
 // To splat or not to splat depends on the implementation
 #define MULT(A,UChi)				\
  auto & ref(U[sU](A));			\
-    U_00=coalescedRead(ref()(0,0),lane);				\
+   Impl::loadLinkElement(U_00,ref()(0,0));      \
-    U_10=coalescedRead(ref()(1,0),lane);				\
+   Impl::loadLinkElement(U_10,ref()(1,0));      \
-    U_20=coalescedRead(ref()(2,0),lane);				\
+   Impl::loadLinkElement(U_20,ref()(2,0));      \
-    U_01=coalescedRead(ref()(0,1),lane);				\
+   Impl::loadLinkElement(U_01,ref()(0,1));      \
-    U_11=coalescedRead(ref()(1,1),lane);				\
+   Impl::loadLinkElement(U_11,ref()(1,1));      \
-    U_21=coalescedRead(ref()(2,1),lane);				\
+   Impl::loadLinkElement(U_21,ref()(2,1));      \
-    U_02=coalescedRead(ref()(0,2),lane);				\
+   Impl::loadLinkElement(U_02,ref()(0,2));     \
-    U_12=coalescedRead(ref()(1,2),lane);				\
+   Impl::loadLinkElement(U_12,ref()(1,2));     \
-    U_22=coalescedRead(ref()(2,2),lane);				\
+   Impl::loadLinkElement(U_22,ref()(2,2));     \
    UChi ## _0  = U_00*Chi_0;	       \
    UChi ## _1  = U_10*Chi_0;\
    UChi ## _2  = U_20*Chi_0;\
@@ -88,15 +63,15 @@ NAMESPACE_BEGIN(Grid);
 #define MULT_ADD(U,A,UChi)			\
  auto & ref(U[sU](A));			\
-    U_00=coalescedRead(ref()(0,0),lane);				\
+   Impl::loadLinkElement(U_00,ref()(0,0));      \
-    U_10=coalescedRead(ref()(1,0),lane);				\
+   Impl::loadLinkElement(U_10,ref()(1,0));      \
-    U_20=coalescedRead(ref()(2,0),lane);				\
+   Impl::loadLinkElement(U_20,ref()(2,0));      \
-    U_01=coalescedRead(ref()(0,1),lane);				\
+   Impl::loadLinkElement(U_01,ref()(0,1));      \
-    U_11=coalescedRead(ref()(1,1),lane);				\
+   Impl::loadLinkElement(U_11,ref()(1,1));      \
-    U_21=coalescedRead(ref()(2,1),lane);				\
+   Impl::loadLinkElement(U_21,ref()(2,1));      \
-    U_02=coalescedRead(ref()(0,2),lane);				\
+   Impl::loadLinkElement(U_02,ref()(0,2));     \
-    U_12=coalescedRead(ref()(1,2),lane);				\
+   Impl::loadLinkElement(U_12,ref()(1,2));     \
-    U_22=coalescedRead(ref()(2,2),lane);				\
+   Impl::loadLinkElement(U_22,ref()(2,2));     \
    UChi ## _0 += U_00*Chi_0;	       \
    UChi ## _1 += U_10*Chi_0;\
    UChi ## _2 += U_20*Chi_0;\
@@ -108,18 +83,24 @@ NAMESPACE_BEGIN(Grid);
    UChi ## _2 += U_22*Chi_2;
 #define PERMUTE_DIR(dir)			\
  permute##dir(Chi_0,Chi_0);			\
  permute##dir(Chi_1,Chi_1);			\
  permute##dir(Chi_2,Chi_2);
 #define HAND_STENCIL_LEG_BASE(Dir,Perm,skew)	\
  SE=st.GetEntry(ptype,Dir+skew,sF);	\
  offset = SE->_offset;			\
  local  = SE->_is_local;		\
  perm   = SE->_permute;		\
  if ( local ) {						\
-    LOAD_CHI(Perm,in);						\
+    LOAD_CHI(in);					\
    if ( perm) {						\
      PERMUTE_DIR(Perm);					\
    }								\
  } else {							\
-    LOAD_CHI_COMMS(buf);					\
+    LOAD_CHI(buf);						\
  }								
 #define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even)		\
@@ -135,18 +116,19 @@ NAMESPACE_BEGIN(Grid);
  }
 #define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even)	\
  SE=st.GetEntry(ptype,Dir+skew,sF);			\
  offset = SE->_offset;					\
  local  = SE->_is_local;				\
  perm   = SE->_permute;				\
  if ( local ) {					\
-    LOAD_CHI(Perm,in);					\
+    LOAD_CHI(in);				\
    if ( perm) {					\
      PERMUTE_DIR(Perm);				\
    }							\
  } else if ( st.same_node[Dir] ) {			\
-    LOAD_CHI_COMMS(buf);				\
+    LOAD_CHI(buf);					\
  }							\
  if (local || st.same_node[Dir] ) {		\
    MULT_ADD(U,Dir,even);				\
@@ -158,35 +140,13 @@ NAMESPACE_BEGIN(Grid);
  local  = SE->_is_local;				\
  if ((!local) && (!st.same_node[Dir]) ) {		\
    nmu++;							\
-    { LOAD_CHI_COMMS(buf);	  }				\
+    { LOAD_CHI(buf);	  }					\
    { MULT_ADD(U,Dir,even); }					\
  }								
 #define HAND_DECLARATIONS(Simd) \
  Simd even_0;			\
  Simd even_1;			\
  Simd even_2;			\
  Simd odd_0;			\
  Simd odd_1;			\
  Simd odd_2;		        \
 		      		\
  Simd Chi_0;			\
  Simd Chi_1;			\
  Simd Chi_2;			\
 				\
  Simd U_00;			\
  Simd U_10;			\
  Simd U_20;			\
  Simd U_01;			\
  Simd U_11;			\
  Simd U_21;			\
  Simd U_02;			\
  Simd U_12;			\
  Simd U_22;			
 template <class Impl>
-template <int Naik> accelerator_inline
+template <int Naik>
 void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
 					  DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
 					  SiteSpinor *buf, int sF, int sU, 
@@ -195,14 +155,28 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  Simd even_0; // 12 regs on knc
  Simd even_1;
  Simd even_2;
  Simd odd_0; // 12 regs on knc
  Simd odd_1;
  Simd odd_2;
-  const int Nsimd = SiteHalfSpinor::Nsimd();
+  Simd Chi_0;    // two spinor; 6 regs
-  const int lane=acceleratorSIMTlane(Nsimd);
+  Simd Chi_1;
-  typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
+  Simd Chi_2;
-  HAND_DECLARATIONS(Simt);
+  
  Simd U_00;  // two rows of U matrix
  Simd U_10;
  Simd U_20;  
  Simd U_01;
  Simd U_11;
  Simd U_21;  // 2 reg left.
  Simd U_02;
  Simd U_12;
  Simd U_22; 
-  typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
+  SiteSpinor result;
  calcSiteSpinor result;
  int offset,local,perm, ptype;
  StencilEntry *SE;
@@ -241,13 +215,13 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
      result()()(1) = even_1 + odd_1;
      result()()(2) = even_2 + odd_2;
    }
-    coalescedWrite(out[sF],result);
+    vstream(out[sF],result);
  }
 }
 template <class Impl>
-template <int Naik> accelerator_inline
+template <int Naik>
 void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st, 
 					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 					     SiteSpinor *buf, int sF, int sU, 
@@ -256,13 +230,28 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
-  const int Nsimd = SiteHalfSpinor::Nsimd();
+  Simd even_0; // 12 regs on knc
-  const int lane=acceleratorSIMTlane(Nsimd);
+  Simd even_1;
-  typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
+  Simd even_2;
-  HAND_DECLARATIONS(Simt);
+  Simd odd_0; // 12 regs on knc
  Simd odd_1;
  Simd odd_2;
-  typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
+  Simd Chi_0;    // two spinor; 6 regs
-  calcSiteSpinor result;
+  Simd Chi_1;
  Simd Chi_2;
  Simd U_00;  // two rows of U matrix
  Simd U_10;
  Simd U_20;  
  Simd U_01;
  Simd U_11;
  Simd U_21;  // 2 reg left.
  Simd U_02;
  Simd U_12;
  Simd U_22; 
  SiteSpinor result;
  int offset, ptype, local, perm;
  StencilEntry *SE;
@@ -272,8 +261,8 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
  //    int sF=s+LLs*sU;
  {
-    zeroit(even_0);    zeroit(even_1);    zeroit(even_2);
+    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
-    zeroit(odd_0);    zeroit(odd_1);    zeroit(odd_2);
+     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
    skew = 0;
    HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);  
@@ -305,13 +294,13 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
      result()()(1) = even_1 + odd_1;
      result()()(2) = even_2 + odd_2;
    }
-    coalescedWrite(out[sF],result);
+    vstream(out[sF],result);
  }
 }
 template <class Impl>
-template <int Naik> accelerator_inline
+template <int Naik>
 void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
 					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 					     SiteSpinor *buf, int sF, int sU, 
@@ -320,13 +309,28 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
-  const int Nsimd = SiteHalfSpinor::Nsimd();
+  Simd even_0; // 12 regs on knc
-  const int lane=acceleratorSIMTlane(Nsimd);
+  Simd even_1;
-  typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
+  Simd even_2;
-  HAND_DECLARATIONS(Simt);
+  Simd odd_0; // 12 regs on knc
  Simd odd_1;
  Simd odd_2;
-  typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
+  Simd Chi_0;    // two spinor; 6 regs
-  calcSiteSpinor result;
+  Simd Chi_1;
  Simd Chi_2;
  Simd U_00;  // two rows of U matrix
  Simd U_10;
  Simd U_20;  
  Simd U_01;
  Simd U_11;
  Simd U_21;  // 2 reg left.
  Simd U_02;
  Simd U_12;
  Simd U_22; 
  SiteSpinor result;
  int offset, ptype, local;
  StencilEntry *SE;
@@ -336,8 +340,8 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
  //    int sF=s+LLs*sU;
  {
-    zeroit(even_0);    zeroit(even_1);    zeroit(even_2);
+    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
-    zeroit(odd_0);    zeroit(odd_1);    zeroit(odd_2);
+     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
    int nmu=0;
    skew = 0;
    HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);  
@@ -370,7 +374,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
 	result()()(1) = even_1 + odd_1;
 	result()()(2) = even_2 + odd_2;
      }
-      coalescedWrite(out[sF] , out(sF)+ result);
+      out[sF] = out[sF] + result;
    }
  }
 }
@@ -393,7 +397,6 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
 						     const FermionFieldView &in, FermionFieldView &out, int dag); \
 */
 #undef LOAD_CHI
 #undef HAND_DECLARATIONS
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
@@ -35,32 +35,39 @@ NAMESPACE_BEGIN(Grid);
 #define GENERIC_STENCIL_LEG(U,Dir,skew,multLink)		\
  SE = st.GetEntry(ptype, Dir+skew, sF);			\
  if (SE->_is_local ) {						\
-    int perm= SE->_permute;						\
+    if (SE->_permute) {						\
-    chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\
+      chi_p = &chi;						\
      permute(chi,  in[SE->_offset], ptype);			\
    } else {							\
      chi_p = &in[SE->_offset];					\
    }								\
  } else {							\
-    chi = coalescedRead(buf[SE->_offset],lane);			\
+    chi_p = &buf[SE->_offset];					\
  }								\
-  acceleratorSynchronise();					\
+  multLink(Uchi, U[sU], *chi_p, Dir);			
  multLink(Uchi, U[sU], chi, Dir);			
 #define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink)		\
  SE = st.GetEntry(ptype, Dir+skew, sF);			\
  if (SE->_is_local ) {						\
-    int perm= SE->_permute;						\
+    if (SE->_permute) {						\
-    chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\
+      chi_p = &chi;						\
      permute(chi,  in[SE->_offset], ptype);			\
    } else {							\
      chi_p = &in[SE->_offset];					\
    }								\
  } else if ( st.same_node[Dir] ) {				\
-    chi = coalescedRead(buf[SE->_offset],lane);                 \
+    chi_p = &buf[SE->_offset];					\
  }								\
  if (SE->_is_local || st.same_node[Dir] ) {			\
-    multLink(Uchi, U[sU], chi, Dir);				\
+    multLink(Uchi, U[sU], *chi_p, Dir);				\
  }
 #define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink)		\
  SE = st.GetEntry(ptype, Dir+skew, sF);			\
  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
    nmu++;							\
-    chi = coalescedRead(buf[SE->_offset],lane);			\
+    chi_p = &buf[SE->_offset];					\
-    multLink(Uchi, U[sU], chi, Dir);				\
+    multLink(Uchi, U[sU], *chi_p, Dir);				\
  }
 template <class Impl>
@@ -71,20 +78,18 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
 // Int, Ext, Int+Ext cases for comms overlap
 ////////////////////////////////////////////////////////////////////////////////////
 template <class Impl>
-template <int Naik> accelerator_inline
+template <int Naik>
 void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st, 
 					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 					     SiteSpinor *buf, int sF, int sU, 
 					     const FermionFieldView &in, FermionFieldView &out, int dag) 
 {
-  typedef decltype(coalescedRead(in[0])) calcSpinor;
+  const SiteSpinor *chi_p;
-  calcSpinor chi;
+  SiteSpinor chi;
-  calcSpinor Uchi;
+  SiteSpinor Uchi;
  StencilEntry *SE;
  int ptype;
  int skew;
  const int Nsimd = SiteHalfSpinor::Nsimd();
  const int lane=acceleratorSIMTlane(Nsimd);
  //  for(int s=0;s<LLs;s++){
  //
@@ -113,7 +118,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
    if ( dag ) { 
      Uchi = - Uchi;
    } 
-    coalescedWrite(out[sF], Uchi,lane);
+    vstream(out[sF], Uchi);
  }
 };
@@ -121,20 +126,17 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
  // Only contributions from interior of our node
  ///////////////////////////////////////////////////
 template <class Impl>
-template <int Naik> accelerator_inline
+template <int Naik>
 void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st, 
 						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 						SiteSpinor *buf, int sF, int sU, 
-						const FermionFieldView &in, FermionFieldView &out,int dag)
+						const FermionFieldView &in, FermionFieldView &out,int dag) {
-{
+  const SiteSpinor *chi_p;
-  typedef decltype(coalescedRead(in[0])) calcSpinor;
+  SiteSpinor chi;
-  calcSpinor chi;
+  SiteSpinor Uchi;
  calcSpinor Uchi;
  StencilEntry *SE;
  int ptype;
  int skew ;
  const int Nsimd = SiteHalfSpinor::Nsimd();
  const int lane=acceleratorSIMTlane(Nsimd);
  //  for(int s=0;s<LLs;s++){
  //    int sF=LLs*sU+s;
@@ -163,7 +165,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
    if ( dag ) {
      Uchi = - Uchi;
    }
-    coalescedWrite(out[sF], Uchi,lane);
+    vstream(out[sF], Uchi);
  }
 };
@@ -172,21 +174,18 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
  // Only contributions from exterior of our node
  ///////////////////////////////////////////////////
 template <class Impl>
-template <int Naik> accelerator_inline
+template <int Naik>
 void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st, 
 						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 						SiteSpinor *buf, int sF, int sU,
-						const FermionFieldView &in, FermionFieldView &out,int dag)
+						const FermionFieldView &in, FermionFieldView &out,int dag) {
-{
+  const SiteSpinor *chi_p;
-  typedef decltype(coalescedRead(in[0])) calcSpinor;
+  //  SiteSpinor chi;
-  calcSpinor chi;
+  SiteSpinor Uchi;
  calcSpinor Uchi;
  StencilEntry *SE;
  int ptype;
  int nmu=0;
  int skew ;
  const int Nsimd = SiteHalfSpinor::Nsimd();
  const int lane=acceleratorSIMTlane(Nsimd);
  //  for(int s=0;s<LLs;s++){
  //    int sF=LLs*sU+s;
@@ -212,12 +211,11 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
    GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
    }
-    if ( nmu ) {
+    if ( nmu ) { 
-      auto _out = coalescedRead(out[sF],lane);
+      if ( dag ) { 
-      if ( dag ) {
+	out[sF] = out[sF] - Uchi;
 	coalescedWrite(out[sF], _out-Uchi,lane);
      } else { 
-	coalescedWrite(out[sF], _out+Uchi,lane);
+	out[sF] = out[sF] + Uchi;
      }
    }
  }
@@ -226,7 +224,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
 ////////////////////////////////////////////////////////////////////////////////////
 // Driving / wrapping routine to select right kernel
 ////////////////////////////////////////////////////////////////////////////////////
-template <class Impl> 
+template <class Impl>
 void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
 					   int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp)
 {
@@ -255,7 +253,7 @@ void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldVie
      ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag);		\
  });
-template <class Impl> 
+template <class Impl>
 void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo, 
 					  DoubledGaugeField &U, DoubledGaugeField &UUU, 
 					  const FermionField &in, FermionField &out, int dag, int interior,int exterior)
@@ -263,8 +261,6 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
  GridBase *FGrid=in.Grid();  
  GridBase *UGrid=U.Grid();  
  typedef StaggeredKernels<Impl> ThisKernel;
  const int Nsimd = SiteHalfSpinor::Nsimd();
  const int lane=acceleratorSIMTlane(Nsimd);
  autoView( UUU_v , UUU, AcceleratorRead);
  autoView( U_v   ,   U, AcceleratorRead);
  autoView( in_v  ,  in, AcceleratorRead);
@@ -297,7 +293,7 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
  }
  assert(0 && " Kernel optimisation case not covered ");
 }
-template <class Impl> 
+template <class Impl>
 void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo, 
 				       DoubledGaugeField &U,
 				       const FermionField &in, FermionField &out, int dag, int interior,int exterior)
@@ -305,8 +301,6 @@ void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
  GridBase *FGrid=in.Grid();  
  GridBase *UGrid=U.Grid();  
  typedef StaggeredKernels<Impl> ThisKernel;
  const int Nsimd = SiteHalfSpinor::Nsimd();
  const int lane=acceleratorSIMTlane(Nsimd);
  autoView( UUU_v ,   U, AcceleratorRead);
  autoView( U_v   ,   U, AcceleratorRead);
  autoView( in_v  ,  in, AcceleratorRead);
--- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
@@ -92,16 +92,20 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
  int lvol = _Umu.Grid()->lSites();
  int DimRep = Impl::Dimension;
  Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
  Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
  Coordinate lcoor;
  typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
  {
    autoView(CTv,CloverTerm,CpuRead);
    autoView(CTIv,CloverTermInv,CpuWrite);
-    thread_for(site, lvol, {
+    for (int site = 0; site < lvol; site++) {
      Coordinate lcoor;
      grid->LocalIndexToLocalCoor(site, lcoor);
-      Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
+      EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
      Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
      typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
      peekLocalSite(Qx, CTv, lcoor);
      Qxinv = Zero();
      //if (csw!=0){
      for (int j = 0; j < Ns; j++)
 	for (int k = 0; k < Ns; k++)
@@ -122,21 +126,21 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
      //    if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
      //  }
      pokeLocalSite(Qxinv, CTIv, lcoor);
-    });
+    }
  }
  // Separate the even and odd parts
  pickCheckerboard(Even, CloverTermEven, CloverTerm);
  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
-  pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
+  pickCheckerboard(Even, CloverTermDagEven, closure(adj(CloverTerm)));
-  pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
+  pickCheckerboard(Odd, CloverTermDagOdd, closure(adj(CloverTerm)));
  pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
  pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
-  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
+  pickCheckerboard(Even, CloverTermInvDagEven, closure(adj(CloverTermInv)));
-  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
+  pickCheckerboard(Odd, CloverTermInvDagOdd, closure(adj(CloverTermInv)));
 }
 template <class Impl>
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@@ -51,9 +51,9 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
  _FourDimGrid        (&FourDimGrid),
  _FourDimRedBlackGrid(&FourDimRedBlackGrid),
-  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements,p.locally_periodic,p),
+  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements,p),
-  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements,p.locally_periodic,p), // source is Even
+  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements,p), // source is Even
-  StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements,p.locally_periodic,p), // source is Odd
+  StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements,p), // source is Odd
  M5(_M5),
  Umu(_FourDimGrid),
  UmuEven(_FourDimRedBlackGrid),
@@ -361,21 +361,10 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
                                         const FermionField &in, FermionField &out,int dag)
 {
  DhopTotalTime-=usecond();
-
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
  assert(  (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
 	 ||(WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
         ||(WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsDirichlet) );
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) {
    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
-  }
+  else 
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute ) {
    DhopInternalSerialComms(st,lo,U,in,out,dag);
  }
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsDirichlet ) {
    DhopInternalDirichletComms(st,lo,U,in,out,dag);
  }
  DhopTotalTime+=usecond();
 }
@@ -442,30 +431,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
  DhopComputeTime2+=usecond();
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalDirichletComms(StencilImpl & st, LebesgueOrder &lo,
 						       DoubledGaugeField & U,
 						       const FermionField &in, FermionField &out,int dag)
 {
  Compressor compressor(dag);
  int LLs = in.Grid()->_rdimensions[0];
  int len =  U.Grid()->oSites();
  /////////////////////////////
  // do the compute interior
  /////////////////////////////
  int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
  DhopComputeTime-=usecond();
  if (dag == DaggerYes) {
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
  } else {
    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
  }
  accelerator_barrier();
  DhopComputeTime+=usecond();
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@@ -43,13 +43,13 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
                                   GridRedBlackCartesian &Hgrid, RealD _mass,
                                   const ImplParams &p,
                                   const WilsonAnisotropyCoefficients &anis)
-  :
+  : 
    Kernels(p),
    _grid(&Fgrid),
    _cbgrid(&Hgrid),
-    Stencil(&Fgrid, npoint, Even, directions, displacements,p.locally_periodic,p),
+    Stencil(&Fgrid, npoint, Even, directions, displacements,p),
-    StencilEven(&Hgrid, npoint, Even, directions,displacements,p.locally_periodic,p),  // source is Even
+    StencilEven(&Hgrid, npoint, Even, directions,displacements,p),  // source is Even
-    StencilOdd(&Hgrid, npoint, Odd, directions,displacements,p.locally_periodic,p),  // source is Odd
+    StencilOdd(&Hgrid, npoint, Odd, directions,displacements,p),  // source is Odd
    mass(_mass),
    Lebesgue(_grid),
    LebesgueEvenOdd(_cbgrid),
@@ -75,93 +75,8 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
  StencilOdd.BuildSurfaceList(1,vol4);
 }
 template<class Impl>
 void WilsonFermion<Impl>::Report(void)
 {
  RealD NP = _grid->_Nprocessors;
  RealD NN = _grid->NodeCount();
  RealD volume = 1;
  Coordinate latt = _grid->GlobalDimensions();
  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  if ( DhopCalls > 0 ) {
    std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
    std::cout << GridLogMessage << "WilsonFermion Number of DhopEO Calls   : " << DhopCalls   << std::endl;
    std::cout << GridLogMessage << "WilsonFermion TotalTime   /Calls        : " << DhopTotalTime   / DhopCalls << " us" << std::endl;
    std::cout << GridLogMessage << "WilsonFermion CommTime    /Calls        : " << DhopCommTime    / DhopCalls << " us" << std::endl;
    std::cout << GridLogMessage << "WilsonFermion FaceTime    /Calls        : " << DhopFaceTime    / DhopCalls << " us" << std::endl;
    std::cout << GridLogMessage << "WilsonFermion ComputeTime1/Calls        : " << DhopComputeTime / DhopCalls << " us" << std::endl;
    std::cout << GridLogMessage << "WilsonFermion ComputeTime2/Calls        : " << DhopComputeTime2/ DhopCalls << " us" << std::endl;
    // Average the compute time
    _grid->GlobalSum(DhopComputeTime);
    DhopComputeTime/=NP;
    RealD mflops = 1320*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
    RealD Fullmflops = 1320*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
   }
  if ( DerivCalls > 0 ) {
    std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
    std::cout << GridLogMessage << "WilsonFermion Number of Deriv Calls    : " <<DerivCalls <<std::endl;
    std::cout << GridLogMessage << "WilsonFermion CommTime/Calls           : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
    std::cout << GridLogMessage << "WilsonFermion ComputeTime/Calls        : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
    std::cout << GridLogMessage << "WilsonFermion Dhop ComputeTime/Calls   : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
    // how to count flops here?
    RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
    std::cout << GridLogMessage << "Average mflops/s per call               ? : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node      ? : " << mflops/NP << std::endl;
    // how to count flops here?
    RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call (full)        ? : " << Fullmflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node (full) ? : " << Fullmflops/NP << std::endl;  }
  if (DerivCalls > 0 || DhopCalls > 0){
    std::cout << GridLogMessage << "WilsonFermion Stencil"    <<std::endl;  Stencil.Report();
    std::cout << GridLogMessage << "WilsonFermion StencilEven"<<std::endl;  StencilEven.Report();
    std::cout << GridLogMessage << "WilsonFermion StencilOdd" <<std::endl;  StencilOdd.Report();
  }
  if ( DhopCalls > 0){
    std::cout << GridLogMessage << "WilsonFermion Stencil     Reporti()"    <<std::endl;  Stencil.Reporti(DhopCalls);
    std::cout << GridLogMessage << "WilsonFermion StencilEven Reporti()"<<std::endl;  StencilEven.Reporti(DhopCalls);
    std::cout << GridLogMessage << "WilsonFermion StencilOdd  Reporti()" <<std::endl;  StencilOdd.Reporti(DhopCalls);
  }
 }
 template<class Impl>
 void WilsonFermion<Impl>::ZeroCounters(void) {
  DhopCalls       = 0; // ok
  DhopCommTime    = 0;
  DhopComputeTime = 0;
  DhopComputeTime2= 0;
  DhopFaceTime    = 0;
  DhopTotalTime   = 0;
  DerivCalls       = 0; // ok
  DerivCommTime    = 0;
  DerivComputeTime = 0;
  DerivDhopComputeTime = 0;
  Stencil.ZeroCounters();
  StencilEven.ZeroCounters();
  StencilOdd.ZeroCounters();
  Stencil.ZeroCountersi();
  StencilEven.ZeroCountersi();
  StencilOdd.ZeroCountersi();
 }
 template <class Impl>
-void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
+void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) 
 {
  GaugeField HUmu(_Umu.Grid());
@@ -192,7 +107,7 @@ void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
 /////////////////////////////
 template <class Impl>
-void WilsonFermion<Impl>::M(const FermionField &in, FermionField &out)
+void WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) 
 {
  out.Checkerboard() = in.Checkerboard();
  Dhop(in, out, DaggerNo);
@@ -200,7 +115,7 @@ void WilsonFermion<Impl>::M(const FermionField &in, FermionField &out)
 }
 template <class Impl>
-void WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
+void WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) 
 {
  out.Checkerboard() = in.Checkerboard();
  Dhop(in, out, DaggerYes);
@@ -208,7 +123,7 @@ void WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
 }
 template <class Impl>
-void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out)
+void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) 
 {
  if (in.Checkerboard() == Odd) {
    DhopEO(in, out, DaggerNo);
@@ -218,7 +133,7 @@ void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out)
 }
 template <class Impl>
-void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out)
+void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) 
 {
  if (in.Checkerboard() == Odd) {
    DhopEO(in, out, DaggerYes);
@@ -226,9 +141,9 @@ void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out)
    DhopOE(in, out, DaggerYes);
  }
 }
-
+  
 template <class Impl>
-void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
+void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) 
 {
  out.Checkerboard() = in.Checkerboard();
  typename FermionField::scalar_type scal(diag_mass);
@@ -236,80 +151,80 @@ void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
 }
 template <class Impl>
-void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
+void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) 
 {
  out.Checkerboard() = in.Checkerboard();
  Mooee(in, out);
 }
 template<class Impl>
-void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
+void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) 
 {
  out.Checkerboard() = in.Checkerboard();
  out = (1.0/(diag_mass))*in;
 }
-
+  
 template<class Impl>
-void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
+void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) 
 {
  out.Checkerboard() = in.Checkerboard();
  MooeeInv(in,out);
 }
 template<class Impl>
 void WilsonFermion<Impl>::MomentumSpacePropagator(FermionField &out, const FermionField &in,RealD _m,std::vector<double> twist)
-{
+{  
  typedef typename FermionField::vector_type vector_type;
  typedef typename FermionField::scalar_type ScalComplex;
  typedef Lattice<iSinglet<vector_type> > LatComplex;
-
+  
-  // what type LatticeComplex
+  // what type LatticeComplex 
  conformable(_grid,out.Grid());
-
+  
  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT
  };
-
+  
  Coordinate latt_size   = _grid->_fdimensions;
-
+  
  FermionField   num  (_grid); num  = Zero();
  LatComplex    wilson(_grid); wilson= Zero();
  LatComplex     one  (_grid); one = ScalComplex(1.0,0.0);
-
+  
  LatComplex denom(_grid); denom= Zero();
-  LatComplex kmu(_grid);
+  LatComplex kmu(_grid); 
  ScalComplex ci(0.0,1.0);
  // momphase = n * 2pi / L
  for(int mu=0;mu<Nd;mu++) {
-
+    
    LatticeCoordinate(kmu,mu);
-
+    
    RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
-
+    
    kmu = TwoPiL * kmu;
    kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions
-
+    
    wilson = wilson + 2.0*sin(kmu*0.5)*sin(kmu*0.5); // Wilson term
-
+    
    num = num - sin(kmu)*ci*(Gamma(Gmu[mu])*in);    // derivative term
-
+    
    denom=denom + sin(kmu)*sin(kmu);
  }
-
+  
  wilson = wilson + _m;     // 2 sin^2 k/2 + m
-
+  
  num   = num + wilson*in;     // -i gmu sin k + 2 sin^2 k/2 + m
-
+  
  denom= denom+wilson*wilson; // sin^2 k + (2 sin^2 k/2 + m)^2
-
+  
  denom= one/denom;
-
+  
  out = num*denom; // [ -i gmu sin k + 2 sin^2 k/2 + m] / [ sin^2 k + (2 sin^2 k/2 + m)^2 ]
-
+  
 }
-
+  
 ///////////////////////////////////
 // Internal
@@ -319,7 +234,6 @@ template <class Impl>
 void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
                                        GaugeField &mat, const FermionField &A,
                                        const FermionField &B, int dag) {
  DerivCalls++;
  assert((dag == DaggerNo) || (dag == DaggerYes));
  Compressor compressor(dag);
@@ -328,11 +242,8 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
  FermionField Atilde(B.Grid());
  Atilde = A;
  DerivCommTime-=usecond();
  st.HaloExchange(B, compressor);
  DerivCommTime+=usecond();
  DerivComputeTime-=usecond();
  for (int mu = 0; mu < Nd; mu++) {
    ////////////////////////////////////////////////////////////////////////
    // Flip gamma (1+g)<->(1-g) if dag
@@ -340,7 +251,6 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
    int gamma = mu;
    if (!dag) gamma += Nd;
    DerivDhopComputeTime -= usecond();
    int Ls=1;
    Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma);
@@ -348,13 +258,11 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
    // spin trace outer product
    //////////////////////////////////////////////////
    Impl::InsertForce4D(mat, Btilde, Atilde, mu);
    DerivDhopComputeTime += usecond();
  }
  DerivComputeTime += usecond();
 }
 template <class Impl>
-void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
+void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) 
 {
  conformable(U.Grid(), _grid);
  conformable(U.Grid(), V.Grid());
@@ -366,13 +274,13 @@ void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, cons
 }
 template <class Impl>
-void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
+void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) 
 {
  conformable(U.Grid(), _cbgrid);
  conformable(U.Grid(), V.Grid());
  //conformable(U.Grid(), mat.Grid()); not general, leaving as a comment (Guido)
  // Motivation: look at the SchurDiff operator
-
+  
  assert(V.Checkerboard() == Even);
  assert(U.Checkerboard() == Odd);
  mat.Checkerboard() = Odd;
@@ -381,7 +289,7 @@ void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, co
 }
 template <class Impl>
-void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
+void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) 
 {
  conformable(U.Grid(), _cbgrid);
  conformable(U.Grid(), V.Grid());
@@ -395,9 +303,8 @@ void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, co
 }
 template <class Impl>
-void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
+void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) 
 {
  DhopCalls+=2;
  conformable(in.Grid(), _grid);  // verifies full grid
  conformable(in.Grid(), out.Grid());
@@ -407,9 +314,8 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
 }
 template <class Impl>
-void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
+void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) 
 {
  DhopCalls++;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check
@@ -420,9 +326,8 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
 }
 template <class Impl>
-void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
+void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) 
 {
  DhopCalls++;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check
@@ -433,18 +338,18 @@ void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int d
 }
 template <class Impl>
-void WilsonFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp)
+void WilsonFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) 
 {
  DhopDir(in, out, dir, disp);
 }
 template <class Impl>
-void WilsonFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out)
+void WilsonFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out) 
 {
  DhopDirAll(in, out);
 }
 template <class Impl>
-void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp)
+void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) 
 {
  Compressor compressor(DaggerNo);
  Stencil.HaloExchange(in, compressor);
@@ -456,12 +361,12 @@ void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int
  DhopDirCalc(in, out, dirdisp, gamma, DaggerNo);
 };
 template <class Impl>
-void WilsonFermion<Impl>::DhopDirAll(const FermionField &in, std::vector<FermionField> &out)
+void WilsonFermion<Impl>::DhopDirAll(const FermionField &in, std::vector<FermionField> &out) 
 {
  Compressor compressor(DaggerNo);
  Stencil.HaloExchange(in, compressor);
-  assert((out.size()==8)||(out.size()==9));
+  assert((out.size()==8)||(out.size()==9)); 
  for(int dir=0;dir<Nd;dir++){
    for(int disp=-1;disp<=1;disp+=2){
@@ -474,7 +379,7 @@ void WilsonFermion<Impl>::DhopDirAll(const FermionField &in, std::vector<Fermion
  }
 }
 template <class Impl>
-void WilsonFermion<Impl>::DhopDirCalc(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag)
+void WilsonFermion<Impl>::DhopDirCalc(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) 
 {
  int Ls=1;
  uint64_t Nsite=in.oSites();
@@ -485,32 +390,22 @@ template <class Impl>
 void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
                                       DoubledGaugeField &U,
                                       const FermionField &in,
-                                       FermionField &out, int dag)
+                                       FermionField &out, int dag) 
 {
-  DhopTotalTime-=usecond();
+#ifdef GRID_OMP
-
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
  assert(  (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
 	 ||(WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
         ||(WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsDirichlet) );
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) {
    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
-  }
+  else
-  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute ) {
+#endif 
    DhopInternalSerial(st,lo,U,in,out,dag);
-  }
+
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsDirichlet ) {
    DhopInternalDirichletComms(st,lo,U,in,out,dag);
  }
  DhopTotalTime+=usecond();
 }
 template <class Impl>
 void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
 						      DoubledGaugeField &U,
 						      const FermionField &in,
-						      FermionField &out, int dag)
+						      FermionField &out, int dag) 
 {
  assert((dag == DaggerNo) || (dag == DaggerYes));
@@ -522,105 +417,63 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
  /////////////////////////////
  std::vector<std::vector<CommsRequest_t> > requests;
  st.Prepare();
  DhopFaceTime-=usecond();
  st.HaloGather(in,compressor);
  DhopFaceTime+=usecond();
  DhopCommTime -=usecond();
  st.CommunicateBegin(requests);
  /////////////////////////////
  // Overlap with comms
  /////////////////////////////
  DhopFaceTime-=usecond();
  st.CommsMergeSHM(compressor);
  DhopFaceTime+=usecond();
  /////////////////////////////
  // do the compute interior
  /////////////////////////////
  int Opt = WilsonKernelsStatic::Opt;
  DhopComputeTime-=usecond();
  if (dag == DaggerYes) {
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
  } else {
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
-  }
+  } 
  DhopComputeTime+=usecond();
  /////////////////////////////
  // Complete comms
  /////////////////////////////
  st.CommunicateComplete(requests);
  DhopCommTime   +=usecond();
  DhopFaceTime-=usecond();
  st.CommsMerge(compressor);
  DhopFaceTime+=usecond();
  /////////////////////////////
  // do the compute exterior
  /////////////////////////////
  DhopComputeTime2-=usecond();
  if (dag == DaggerYes) {
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
  } else {
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
  }
  DhopComputeTime2+=usecond();
 };
 template <class Impl>
 void WilsonFermion<Impl>::DhopInternalDirichletComms(StencilImpl &st, LebesgueOrder &lo,
 						     DoubledGaugeField &U,
 						     const FermionField &in,
 						     FermionField &out, int dag)
 {
  assert((dag == DaggerNo) || (dag == DaggerYes));
  Compressor compressor(dag);
  int len =  U.Grid()->oSites();
  /////////////////////////////
  // do the compute interior
  /////////////////////////////
  int Opt = WilsonKernelsStatic::Opt;
  DhopComputeTime-=usecond();
  if (dag == DaggerYes) {
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
  } else {
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
  }
  DhopComputeTime+=usecond();
 };
 template <class Impl>
 void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
                                       DoubledGaugeField &U,
                                       const FermionField &in,
-                                       FermionField &out, int dag)
+                                       FermionField &out, int dag) 
 {
  assert((dag == DaggerNo) || (dag == DaggerYes));
  Compressor compressor(dag);
  DhopCommTime-=usecond();
  st.HaloExchange(in, compressor);
  DhopCommTime+=usecond();
  DhopComputeTime-=usecond();
  int Opt = WilsonKernelsStatic::Opt;
  if (dag == DaggerYes) {
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
  } else {
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
  }
  DhopComputeTime+=usecond();
 };
 /*Change ends */
 /*******************************************************************************
 * Conserved current utilities for Wilson fermions, for contracting propagators
- * to make a conserved current sink or inserting the conserved current
+ * to make a conserved current sink or inserting the conserved current 
 * sequentially.
 ******************************************************************************/
 template <class Impl>
@@ -640,12 +493,12 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
 template <class Impl>
-void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
+void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in, 
                                              PropagatorField &q_out,
                                              PropagatorField &src,
                                              Current curr_type,
                                              unsigned int mu,
-                                              unsigned int tmin,
+                                              unsigned int tmin, 
                                              unsigned int tmax,
 					      ComplexField &lattice_cmplx)
 {
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h
@@ -1,450 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmA64FX.h
    Copyright (C) 2020
 Author: Nils Meyer  <nils.meyer@ur.de>  Regensburg University
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 //#if defined(A64FXASM)
 #if defined(A64FX)
 // safety include
 #include <arm_sve.h>
 // undefine everything related to kernels
 #include <simd/Fujitsu_A64FX_undef.h>
    ///////////////////////////////////////////////////////////
    // If we are A64FX specialise the single precision routine
    ///////////////////////////////////////////////////////////
 #if defined(DSLASHINTRIN)
 //#pragma message ("A64FX Dslash: intrin")
 #include <simd/Fujitsu_A64FX_intrin_single.h>
 #else
 #pragma message ("A64FX Dslash: asm")
 #include <simd/Fujitsu_A64FX_asm_single.h>
 #endif
 /// Switch off the 5d vectorised code optimisations
 #undef DWFVEC5D
 /////////////////////////////////////////////////////////////////
 // XYZT vectorised, undag Kernel, single
 /////////////////////////////////////////////////////////////////
 #undef KERNEL_DAG
 #define INTERIOR_AND_EXTERIOR
 #undef INTERIOR
 #undef EXTERIOR
 #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 //#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 //template<> void
 //WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 //						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 //#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 //#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 //template<> void
 //WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 //						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 //#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
 #undef EXTERIOR
 #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 //#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 //template<> void
 //WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 //						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 //#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 //#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 //template<> void
 //WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 //						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 //#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
 #define EXTERIOR
 #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 //#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 //template<> void
 //WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 //						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 //#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 //#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 //template<> void
 //WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 //						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 //#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 /////////////////////////////////////////////////////////////////
 // XYZT vectorised, dag Kernel, single
 /////////////////////////////////////////////////////////////////
 #define KERNEL_DAG
 #define INTERIOR_AND_EXTERIOR
 #undef INTERIOR
 #undef EXTERIOR
 #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 //#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 //template<> void
 //WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 //						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 //#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 //#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 //template<> void
 //WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 //						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 //#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
 #undef EXTERIOR
 #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 //#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 //template<> void
 //WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 //						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 //#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 //#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 //template<> void
 //WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 //						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 //#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
 #define EXTERIOR
 #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 //#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 //template<> void
 //WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 //						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 //#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 //#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 //template<> void
 //WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 //						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 //#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 // undefine
 #include <simd/Fujitsu_A64FX_undef.h>
 ///////////////////////////////////////////////////////////
 // If we are A64FX specialise the double precision routine
 ///////////////////////////////////////////////////////////
 #if defined(DSLASHINTRIN)
 #include <simd/Fujitsu_A64FX_intrin_double.h>
 #else
 #include <simd/Fujitsu_A64FX_asm_double.h>
 #endif
 // former KNL
 //#define MAYBEPERM(A,perm) if (perm) { A ; }
 //#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 //#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0];
 #define INTERIOR_AND_EXTERIOR
 #undef  INTERIOR
 #undef  EXTERIOR
 /////////////////////////////////////////////////////////////////
 // XYZT vectorised, undag Kernel, double
 /////////////////////////////////////////////////////////////////
 #undef KERNEL_DAG
 #define INTERIOR_AND_EXTERIOR
 #undef INTERIOR
 #undef EXTERIOR
 #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 // #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 // template<> void
 // WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 // 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 // #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 // #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 // template<> void
 // WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 // 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 // #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
 #undef EXTERIOR
 #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 // #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 // template<> void
 // WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 // 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 // #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 // #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 // template<> void
 // WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 // 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 // #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
 #define EXTERIOR
 #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 // #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 // template<> void
 // WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 // 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 // #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 // #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 // template<> void
 // WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 // 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 // #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 /////////////////////////////////////////////////////////////////
 // XYZT vectorised, dag Kernel, double
 /////////////////////////////////////////////////////////////////
 #define KERNEL_DAG
 #define INTERIOR_AND_EXTERIOR
 #undef INTERIOR
 #undef EXTERIOR
 #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 // #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 // template<> void
 // WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 // 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 // #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 // #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 // template<> void
 // WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 // 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 // #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
 #undef EXTERIOR
 #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 // #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 // template<> void
 // WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 // 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 // #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 // #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 // template<> void
 // WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 // 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 // #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
 #define EXTERIOR
 #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 // #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 // template<> void
 // WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 // 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 // #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 // #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 // template<> void
 // WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 // 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 // #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 // undefs
 #include <simd/Fujitsu_A64FX_undef.h>
 #endif //A64FXASM
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h
@@ -74,15 +74,15 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//
+
-//template<> void
+template<> void 
-//WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
@@ -97,15 +97,15 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldVi
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//
+
-//template<> void
+template<> void 
-//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #undef INTERIOR_AND_EXTERIOR
@@ -121,15 +121,15 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldVi
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//
+
-//template<> void
+template<> void 
-//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 /////////////////////////////////////////////////////////////////
 // XYZT vectorised, dag Kernel, single
@@ -148,15 +148,15 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldVi
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//
+
-//template<> void
+template<> void 
-//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
@@ -171,15 +171,15 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFiel
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//
+
-//template<> void
+template<> void 
-//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
@@ -194,15 +194,15 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFiel
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//
+				    
-//template<> void
+template<> void 
-//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #undef MAYBEPERM
 #undef MULT_2SPIN
@@ -228,14 +228,14 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSite(StencilView &st, DoubledGaugeF
 							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
@@ -249,14 +249,14 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteInt(StencilView &st, DoubledGau
 							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
@@ -273,15 +273,15 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteExt(StencilView &st, DoubledGau
 							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//
+				    
-//template<> void
+template<> void 
-//WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 /////////////////////////////////////////////////////////////////
 // Ls vectorised, dag Kernel, single
@@ -299,14 +299,14 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDag(StencilView &st, DoubledGau
 							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-//							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-//							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
@@ -320,14 +320,14 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilView &st, Doubled
 							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-//							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-//							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
@@ -341,14 +341,14 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, Doubled
 							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-//							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-//							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif  // VEC 5D
@@ -392,14 +392,14 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
@@ -413,14 +413,14 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldVi
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
@@ -434,14 +434,14 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldVi
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 /////////////////////////////////////////////////////////////////
 // XYZT vectorised, dag Kernel, single
@@ -459,14 +459,14 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldVi
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
@@ -480,14 +480,14 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFiel
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
@@ -501,14 +501,14 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFiel
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #undef MAYBEPERM
 #undef MULT_2SPIN
@@ -533,14 +533,14 @@ WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSite(StencilView &st, DoubledGaugeF
 							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
@@ -554,14 +554,14 @@ WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteInt(StencilView &st, DoubledGau
 							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
@@ -577,14 +577,14 @@ WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteExt(StencilView &st, DoubledGau
 							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-//							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 /////////////////////////////////////////////////////////////////
 // Ls vectorised, dag Kernel, single
@@ -602,14 +602,14 @@ WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDag(StencilView &st, DoubledGau
 							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-//							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-//							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
@@ -623,14 +623,14 @@ WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilView &st, Doubled
 							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-//							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-//							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
@@ -645,14 +645,14 @@ WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilView &st, Doubled
 							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-//							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-//template<> void
+template<> void 
-//WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-//							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
+#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif  // VEC 5D
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h
@@ -1,395 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: WilsonKernelsAsmBodyA64FX.h
    Copyright (C) 2020
 Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 // GCC 10 messes up SVE instruction scheduling at plain -O3, but
 // -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders:
 // performance is now better than with armclang 20.2.
 // Direction table for the eight stencil legs: leg n is bound to a projection
 // macro (DIRn_PROJ) and a reconstruction macro (DIRn_RECON).
 //  - KERNEL_DAG defined : legs 0-3 use the XP/YP/ZP/TP macros, legs 4-7 the
 //                         XM/YM/ZM/TM macros.
 //  - otherwise          : the two halves are swapped.
 // In both cases leg 0 maps to the plain *_RECON macro and legs 1-7 map to the
 // *_RECON_ACCUM variants.
 #if defined(KERNEL_DAG)
 #define DIR0_PROJ    XP_PROJ
 #define DIR0_RECON   XP_RECON
 #define DIR1_PROJ    YP_PROJ
 #define DIR1_RECON   YP_RECON_ACCUM
 #define DIR2_PROJ    ZP_PROJ
 #define DIR2_RECON   ZP_RECON_ACCUM
 #define DIR3_PROJ    TP_PROJ
 #define DIR3_RECON   TP_RECON_ACCUM
 #define DIR4_PROJ    XM_PROJ
 #define DIR4_RECON   XM_RECON_ACCUM
 #define DIR5_PROJ    YM_PROJ
 #define DIR5_RECON   YM_RECON_ACCUM
 #define DIR6_PROJ    ZM_PROJ
 #define DIR6_RECON   ZM_RECON_ACCUM
 #define DIR7_PROJ    TM_PROJ
 #define DIR7_RECON   TM_RECON_ACCUM
 #else
 #define DIR0_PROJ    XM_PROJ
 #define DIR0_RECON   XM_RECON
 #define DIR1_PROJ    YM_PROJ
 #define DIR1_RECON   YM_RECON_ACCUM
 #define DIR2_PROJ    ZM_PROJ
 #define DIR2_RECON   ZM_RECON_ACCUM
 #define DIR3_PROJ    TM_PROJ
 #define DIR3_RECON   TM_RECON_ACCUM
 #define DIR4_PROJ    XP_PROJ
 #define DIR4_RECON   XP_RECON_ACCUM
 #define DIR5_PROJ    YP_PROJ
 #define DIR5_RECON   YP_RECON_ACCUM
 #define DIR6_PROJ    ZP_PROJ
 #define DIR6_RECON   ZP_RECON_ACCUM
 #define DIR7_PROJ    TP_PROJ
 #define DIR7_RECON   TP_RECON_ACCUM
 #endif
 //using namespace std;
 // Per-leg debug tracing is compiled in only when SHOW is defined. It is
 // switched off here; uncomment the #define below to enable it.
 #undef SHOW
 //#define SHOW
 // WHERE is the label printed by the SHOW traces. It is reset on every
 // inclusion and then set from whichever of INTERIOR_AND_EXTERIOR / INTERIOR /
 // EXTERIOR the including file has defined.
 #undef WHERE
 #if defined(INTERIOR_AND_EXTERIOR)
 #define WHERE "INT_AND_EXT"
 #endif
 #if defined(INTERIOR)
 #define WHERE "INT"
 #endif
 #if defined(EXTERIOR)
 #define WHERE "EXT"
 #endif
 //#pragma message("here")
 ////////////////////////////////////////////////////////////////////////////////
 // Comms then compute kernel
 ////////////////////////////////////////////////////////////////////////////////
 #ifdef INTERIOR_AND_EXTERIOR
 // ASM_LEG: one stencil leg for direction Dir.
 // Expands in a scope that must provide st, base, basep, ent, nent, plocal,
 // ptype, local, perm and s.
 //  1. Looks up the prefetch address for entry nent (basep) and advances nent.
 //  2. If `local` is set: loads chimu from base, loads the permute table,
 //     runs PROJ and MAYBEPERM. Otherwise loads chi from base.
 //     `local`, `perm` and `base` are the values left by the previous GetInfo
 //     call (ASM_LEG_XP for the first leg, the preceding ASM_LEG afterwards).
 //  3. Calls st.GetInfo for NxtDir, i.e. already fetches the lookup for the
 //     following leg, and advances ent.
 //  4. Runs MULT_2SPIN_1(Dir) / MULT_2SPIN_2 with the chimu prefetches issued
 //     in between; for s == 0 and Dir equal to 0 or 4 it also issues
 //     PREFETCH_GAUGE_L2(Dir).
 //  5. Runs RECON.
 // The statement order is deliberate (see the NB below); do not reorder it
 // without re-measuring.
 #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
      basep = st.GetPFInfo(nent,plocal); nent++;			\
      if ( local ) {							            \
    LOAD_CHIMU(base);                                       \
    LOAD_TABLE(PERMUTE_DIR);                                \
    PROJ;							                        \
    MAYBEPERM(PERMUTE_DIR,perm);					        \
      } else {								                \
 	  LOAD_CHI(base);							                \
      }									                    \
      base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++;	\
    MULT_2SPIN_1(Dir);					                    \
    PREFETCH_CHIMU(base);                                   \
    PREFETCH_CHIMU_L2(basep);                               \
    /* PREFETCH_GAUGE_L1(NxtDir); */                        \
    MULT_2SPIN_2;					                        \
    if (s == 0) {                                           \
      if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
    }                                                       \
    RECON;								                    \
 /*
 NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in a performance penalty,
    though I expected that it would improve performance
 */
 // ASM_LEG_XP: first leg of a site. Performs the GetInfo lookup for Dir itself
 // (no earlier leg has done it), issues PREFETCH1_CHIMU on the result and then
 // continues with the regular ASM_LEG.
 #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)	    \
  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
  PREFETCH1_CHIMU(base);						            \
  ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
 // RESULT: forwards both addresses to SAVE_RESULT.
 #define RESULT(base,basep) SAVE_RESULT(base,basep);
 #endif
 ////////////////////////////////////////////////////////////////////////////////
 // Pre comms kernel -- prefetch like normal because it is mostly right
 ////////////////////////////////////////////////////////////////////////////////
 #ifdef INTERIOR
 // ASM_LEG: one stencil leg for direction Dir in the pre-comms pass.
 // Expands in a scope that must provide st, base, basep, ent, nent, plocal,
 // ptype, local and perm.
 //  - Looks up the prefetch address for entry nent (basep) and advances nent.
 //  - `local` set: load chimu, load the permute table, PROJ, MAYBEPERM.
 //  - not local but st.same_node[Dir] set: load chi from base.
 //  - The multiply and RECON run only when one of those two cases applied;
 //    any other leg contributes nothing in this pass.
 //  - Finally fetches the GetInfo lookup for NxtDir (used by the next leg),
 //    advances ent and issues the chimu prefetches.
 // The last line of the replacement list deliberately has no trailing
 // backslash: a continuation there would splice the following #define into
 // this macro.
 #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
       basep = st.GetPFInfo(nent,plocal); nent++;			\
       if ( local ) {							\
         LOAD_CHIMU(base);						\
         LOAD_TABLE(PERMUTE_DIR);					\
         PROJ;								\
         MAYBEPERM(PERMUTE_DIR,perm);					\
       }else if ( st.same_node[Dir] ) {LOAD_CHI(base);}			\
       if ( local || st.same_node[Dir] ) {				\
         MULT_2SPIN_1(Dir);						\
         MULT_2SPIN_2;							\
         RECON;								\
       }									\
       base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++;	\
       PREFETCH_CHIMU(base);						\
       PREFETCH_CHIMU_L2(basep);
 // ASM_LEG_XP: first leg of a site. Performs the GetInfo lookup for Dir itself,
 // issues PREFETCH1_CHIMU, runs ZERO_PSI (legs may be skipped in this pass, so
 // the first executed RECON cannot be relied on to start from a clean state --
 // NOTE(review): confirm against the ZERO_PSI / *_RECON_ACCUM definitions) and
 // then continues with the regular ASM_LEG.
 #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
   base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;		\
   PREFETCH1_CHIMU(base);						\
   { ZERO_PSI; }								\
   ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
 // RESULT: forwards both addresses to SAVE_RESULT.
 #define RESULT(base,basep) SAVE_RESULT(base,basep);
 #endif
 ////////////////////////////////////////////////////////////////////////////////
 // Post comms kernel
 ////////////////////////////////////////////////////////////////////////////////
 #ifdef EXTERIOR
 // ASM_LEG: one stencil leg for direction Dir in the post-comms pass.
 // Expands in a scope that must provide st, base, ent, plocal, ptype, local,
 // perm and nmu. Fetches the GetInfo lookup for Dir and advances ent. Only a
 // leg that is neither `local` nor flagged in st.same_node[Dir] is processed:
 // load chi, multiply, RECON, and count the leg in nmu.
 // NxtDir, PERMUTE_DIR and PROJ are accepted for signature parity with the
 // other kernel flavours but are not used here.
 #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
   base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;		\
   if ( !(local || st.same_node[Dir]) ) {				\
     LOAD_CHI(base);							\
     MULT_2SPIN_1(Dir);							\
     MULT_2SPIN_2;							\
     RECON;								\
     nmu++;								\
   }
 // ASM_LEG_XP: first leg of a site. Resets the processed-leg counter, runs
 // ZERO_PSI and then handles Dir exactly like any other leg.
 #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
   nmu=0;								\
   { ZERO_PSI;}								\
   ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
 // RESULT: runs ADD_RESULT only when at least one leg was processed (nmu != 0).
 // Note that `base` is passed for both arguments; `basep` is not used by this
 // flavour.
 #define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);}
 #endif
 // Kernel body shared by all flavours selected above.
 // Expects st, U, in, out, ss, ssU, Ls and Ns to be in scope where this text is
 // included (presumably the parameters of the enclosing AsmDhopSite*
 // specialisation -- confirm against the including translation unit).
 //
 // For each of Ns consecutive outer sites starting at ssU, and for each of the
 // Ls inner indices s, the eight legs Xp,Yp,Zp,Tp,Xm,Ym,Zm,Tm are run through
 // ASM_LEG_XP / ASM_LEG and the accumulated value is handed to RESULT for
 // out[ss]. `ssn` tracks the following outer site (wrapping to 0 at
 // U.oSites()) and is used for prefetch lookups only.
 //
 // SHOW_LEG prints the per-leg debug trace when SHOW is defined and expands to
 // nothing otherwise. It relies on the local `rescale` declared in the first
 // SHOW section of the inner loop.
 #ifdef SHOW
 #define SHOW_LEG(Dir)							\
       std::cout << "Dir = " << Dir << "        "  << WHERE<< std::endl; \
       std::cout << "ent  nent  local  perm       = " << ent << "  " << nent << "  " << local << "  "  << perm << std::endl; \
       std::cout << "st.same_node[Dir] = " << st.same_node[Dir] << std::endl; \
       std::cout << "base              = " << (base - plocal)/rescale << std::endl; \
       std::cout << "Basep             = " << (basep - plocal)/rescale << std::endl; \
       std::cout << "----------------------------------------------------" << std::endl;
 #else
 #define SHOW_LEG(Dir)
 #endif
 {
   int nmu;                 // legs processed for the current site; only the EXTERIOR macros read or write it
   int local,perm, ptype;   // filled in by st.GetInfo inside the leg macros
   uint64_t base;
   uint64_t basep;
   const uint64_t plocal =(uint64_t) & in[0];
   MASK_REGS;
   int nmax=U.oSites();
   for(int site=0;site<Ns;site++) {
     // Outer-site indices are used as they are (a lo.Reorder() mapping was
     // disabled upstream); this is the same for every kernel flavour.
     int sU =ssU;
     int ssn=ssU+1;     if(ssn>=nmax) ssn=0;
     int sUn=ssn;
     for(int s=0;s<Ls;s++) {
       ss =sU*Ls+s;
       ssn=sUn*Ls+s;
       int  ent=ss*8;// 2*Ndim
       int nent=ssn*8;
       ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJ,DIR0_RECON);
 #ifdef SHOW
       float rescale = 64. * 12.;
       std::cout << "=================================================================" << std::endl;
       std::cout << "ss = " << ss << "   ssn = " << ssn << std::endl;
       std::cout << "sU = " << sU << "   ssU = " << ssU << std::endl;
       std::cout << " " << std::endl;
 #endif
       SHOW_LEG(Xp)
       ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJ,DIR1_RECON);
       SHOW_LEG(Yp)
       ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJ,DIR2_RECON);
       SHOW_LEG(Zp)
       ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJ,DIR3_RECON);
       SHOW_LEG(Tp)
       ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJ,DIR4_RECON);
       SHOW_LEG(Xm)
       // DC ZVA test (disabled), also tried after the Ym and Zm legs:
       // { uint64_t basestore = (uint64_t)&out[ss];
       //   PREFETCH_RESULT_L2_STORE(basestore); }
       ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON);
       SHOW_LEG(Ym)
       ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON);
       SHOW_LEG(Zm)
       ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON);
       SHOW_LEG(Tm)
 #ifdef EXTERIOR
       // No leg needed post-comms work for this s: leave the s loop for this site.
       if (nmu==0) break;
       //      if (nmu!=0) std::cout << "EXT "<<sU<<std::endl;
 #endif
       base = (uint64_t) &out[ss];
       // NOTE(review): the value fetched here is overwritten on the next line.
       // The call is retained because GetPFInfo is defined outside this file
       // and cannot be shown here to be free of side effects.
       basep= st.GetPFInfo(nent,plocal);
       basep = (uint64_t) &out[ssn];
       //PREFETCH_RESULT_L1_STORE(base);
       RESULT(base,basep);
 #ifdef SHOW
       std::cout << "Dir = FINAL        " <<  WHERE<< std::endl;
       std::cout << "base              = " << (base - (uint64_t) &out[0])/rescale << std::endl;
       std::cout << "Basep             = " << (basep - plocal)/rescale << std::endl;
       std::cout << "----------------------------------------------------" << std::endl;
 #endif
     }
     ssU++;
     UNLOCK_GAUGE(0);
   }
 }
 #undef SHOW_LEG
 #undef DIR0_PROJ
 #undef DIR1_PROJ
 #undef DIR2_PROJ
 #undef DIR3_PROJ
 #undef DIR4_PROJ
 #undef DIR5_PROJ
 #undef DIR6_PROJ
 #undef DIR7_PROJ
 #undef DIR0_RECON
 #undef DIR1_RECON
 #undef DIR2_RECON
 #undef DIR3_RECON
 #undef DIR4_RECON
 #undef DIR5_RECON
 #undef DIR6_RECON
 #undef DIR7_RECON
 #undef ASM_LEG
 #undef ASM_LEG_XP
 #undef RESULT
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Peter Boyle	d8c0c0ba0a	Fix and compiles	2020-08-12 14:35:08 -04:00
Peter Boyle	c6cf918d4c	Typo	2020-08-12 14:24:39 -04:00
Peter Boyle	6d0a907c5c	first try at A2A four quark offload	2020-08-12 14:17:46 -04:00