Fixed test for very much non-unit det

Merge branch 'develop' into feature/conjugate-bc-dirs
Gparity fix, and plaquette IO
2025-10-14 05:04:42 +01:00 · 2021-01-15 09:16:02 -05:00 · 2021-01-14 21:01:22 -05:00 · 2021-01-14 21:00:36 -05:00 · 2021-01-14 20:49:13 -05:00 · 2021-01-14 20:48:35 -05:00
223 changed files with 8502 additions and 5480 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,11 +9,6 @@ matrix:
    - os:        osx
      osx_image: xcode8.3
      compiler: clang
-      env: PREC=single
-    - os:        osx
-      osx_image: xcode8.3
-      compiler: clang
-      env: PREC=double
      
 before_install:
    - export GRIDDIR=`pwd`
@@ -55,7 +50,7 @@ script:
    - make -j4
    - make install
    - cd $CWD/build
-    - ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
+    - ../configure --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
    - make -j4 
    - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
    - make check
--- a/Grid/DisableWarnings.h
+++ b/Grid/DisableWarnings.h
@@ -37,7 +37,9 @@ directory
 #endif

 //disables and intel compiler specific warning (in json.hpp)
+#ifdef __ICC
 #pragma warning disable 488  
+#endif

 #ifdef __NVCC__
 //disables nvcc specific warning in json.hpp
--- a/Grid/GridStd.h
+++ b/Grid/GridStd.h
@@ -28,4 +28,7 @@
 ///////////////////
 #include "Config.h"

+#ifdef TOFU
+#undef GRID_COMMS_THREADS
+#endif
 #endif /* GRID_STD_H */
--- a/Grid/Makefile.am
+++ b/Grid/Makefile.am
@@ -21,6 +21,7 @@ if BUILD_HDF5
  extra_headers+=serialisation/Hdf5Type.h
 endif

+
 all: version-cache Version.h

 version-cache:
@@ -53,6 +54,17 @@ Version.h: version-cache
 include Make.inc
 include Eigen.inc

+extra_sources+=$(ZWILS_FERMION_FILES)
+extra_sources+=$(WILS_FERMION_FILES)
+extra_sources+=$(STAG_FERMION_FILES)
+if BUILD_GPARITY
+  extra_sources+=$(GP_FERMION_FILES)
+endif
+if BUILD_FERMION_REPS
+  extra_sources+=$(ADJ_FERMION_FILES)
+  extra_sources+=$(TWOIND_FERMION_FILES)
+endif
+
 lib_LIBRARIES = libGrid.a

 CCFILES += $(extra_sources)
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -31,6 +31,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_ALGORITHM_COARSENED_MATRIX_H
 #define  GRID_ALGORITHM_COARSENED_MATRIX_H

+#include <Grid/qcd/QCD.h> // needed for Dagger(Yes|No), Inverse(Yes|No)

 NAMESPACE_BEGIN(Grid);

@@ -59,12 +60,14 @@ inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner,
 class Geometry {
 public:
  int npoint;
+  int base;
  std::vector<int> directions   ;
  std::vector<int> displacements;
+  std::vector<int> points_dagger;

  Geometry(int _d)  {
    
-    int base = (_d==5) ? 1:0;
+    base = (_d==5) ? 1:0;

    // make coarse grid stencil for 4d , not 5d
    if ( _d==5 ) _d=4;
@@ -72,16 +75,51 @@ public:
    npoint = 2*_d+1;
    directions.resize(npoint);
    displacements.resize(npoint);
+    points_dagger.resize(npoint);
    for(int d=0;d<_d;d++){
      directions[d   ] = d+base;
      directions[d+_d] = d+base;
      displacements[d  ] = +1;
      displacements[d+_d]= -1;
+      points_dagger[d   ] = d+_d;
+      points_dagger[d+_d] = d;
    }
    directions   [2*_d]=0;
    displacements[2*_d]=0;
+    points_dagger[2*_d]=2*_d;
  }

+  // Stencil point index for (dir, disp); (0, 0) is the self point (index 8).
+  // Non-self points use the "new indexing" tabulated below.
+  int point(int dir, int disp) {
+    assert(disp == -1 || disp == 0 || disp == 1);
+    if(dir == 0 and disp == 0) return 8; // before the dir range check: dir 0 is out of range when base == 1
+    assert(base+0 <= dir && dir < base+4);
+    // directions faster index = new indexing
+    // 4d (base = 0):
+    // point 0  1  2  3  4  5  6  7  8
+    // dir   0  1  2  3  0  1  2  3  0
+    // disp +1 +1 +1 +1 -1 -1 -1 -1  0
+    // 5d (base = 1):
+    // point 0  1  2  3  4  5  6  7  8
+    // dir   1  2  3  4  1  2  3  4  0
+    // disp +1 +1 +1 +1 -1 -1 -1 -1  0
+
+    // displacements faster index = old indexing
+    // 4d (base = 0):
+    // point 0  1  2  3  4  5  6  7  8
+    // dir   0  0  1  1  2  2  3  3  0
+    // disp +1 -1 +1 -1 +1 -1 +1 -1  0
+    // 5d (base = 1):
+    // point 0  1  2  3  4  5  6  7  8
+    // dir   1  1  2  2  3  3  4  4  0
+    // disp +1 -1 +1 -1 +1 -1 +1 -1  0
+
+    // New indexing
+    return (1 - disp) / 2 * 4 + dir - base;
+    // Old indexing
+    //   return (4 * (dir - base) + 1 - disp) / 2;
+  }
 };
  
 template<class Fobj,class CComplex,int nbasis>
@@ -258,7 +296,7 @@ public:
 // Fine Object == (per site) type of fine field
 // nbasis      == number of deflation vectors
 template<class Fobj,class CComplex,int nbasis>
-class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > >  {
+class CoarsenedMatrix : public CheckerBoardedSparseMatrixBase<Lattice<iVector<CComplex,nbasis > > >  {
 public:
    
  typedef iVector<CComplex,nbasis >           siteVector;
@@ -268,33 +306,59 @@ public:
  typedef iMatrix<CComplex,nbasis >  Cobj;
  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
  typedef Lattice<Fobj >        FineField;
+  typedef CoarseVector FermionField;
+
+  // enrich interface, use default implementation as in FermionOperator ///////
+  void Dminus(CoarseVector const& in, CoarseVector& out) { out = in; }
+  void DminusDag(CoarseVector const& in, CoarseVector& out) { out = in; }
+  void ImportPhysicalFermionSource(CoarseVector const& input, CoarseVector& imported) { imported = input; }
+  void ImportUnphysicalFermion(CoarseVector const& input, CoarseVector& imported) { imported = input; }
+  void ExportPhysicalFermionSolution(CoarseVector const& solution, CoarseVector& exported) { exported = solution; };
+  void ExportPhysicalFermionSource(CoarseVector const& solution, CoarseVector& exported) { exported = solution; };

  ////////////////////
  // Data members
  ////////////////////
  Geometry         geom;
  GridBase *       _grid; 
+  GridBase*        _cbgrid;
  int hermitian;

  CartesianStencil<siteVector,siteVector,int> Stencil; 
+  CartesianStencil<siteVector,siteVector,int> StencilEven;
+  CartesianStencil<siteVector,siteVector,int> StencilOdd;

  std::vector<CoarseMatrix> A;
-    
+  std::vector<CoarseMatrix> Aeven;
+  std::vector<CoarseMatrix> Aodd;
+
+  CoarseMatrix AselfInv;
+  CoarseMatrix AselfInvEven;
+  CoarseMatrix AselfInvOdd;
+
+  Vector<RealD> dag_factor;
+
  ///////////////////////
  // Interface
  ///////////////////////
  GridBase * Grid(void)         { return _grid; };   // this is all the linalg routines need to know
+  GridBase * RedBlackGrid()     { return _cbgrid; };
+
+  int ConstEE() { return 0; }

  void M (const CoarseVector &in, CoarseVector &out)
  {
    conformable(_grid,in.Grid());
    conformable(in.Grid(),out.Grid());
+    out.Checkerboard() = in.Checkerboard();

    SimpleCompressor<siteVector> compressor;

    Stencil.HaloExchange(in,compressor);
    autoView( in_v , in, AcceleratorRead);
    autoView( out_v , out, AcceleratorWrite);
+    autoView( Stencil_v  , Stencil, AcceleratorRead);
+    auto& geom_v = geom;
    typedef LatticeView<Cobj> Aview;
      
    Vector<Aview> AcceleratorViewContainer;
@@ -316,14 +380,14 @@ public:
      int ptype;
      StencilEntry *SE;

-      for(int point=0;point<geom.npoint;point++){
+      for(int point=0;point<geom_v.npoint;point++){

-	SE=Stencil.GetEntry(ptype,point,ss);
+	SE=Stencil_v.GetEntry(ptype,point,ss);
 	  
 	if(SE->_is_local) { 
 	  nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
 	} else {
-	  nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
+	  nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]);
 	}
 	acceleratorSynchronise();

@@ -344,12 +408,72 @@ public:
      return M(in,out);
    } else {
      // corresponds to Galerkin coarsening
-      CoarseVector tmp(Grid());
-      G5C(tmp, in); 
-      M(tmp, out);
-      G5C(out, out);
+      return MdagNonHermitian(in, out);
    }
  };
+
+  // Called from the else branch of Mdag. Visits the stencil points in
+  // geom.points_dagger order and weights each matrix element with dag_factor.
+  void MdagNonHermitian(const CoarseVector &in, CoarseVector &out)
+  {
+    conformable(_grid,in.Grid());
+    conformable(in.Grid(),out.Grid());
+    out.Checkerboard() = in.Checkerboard();
+
+    SimpleCompressor<siteVector> compressor;
+    Stencil.HaloExchange(in,compressor);
+
+    autoView( in_v , in, AcceleratorRead);
+    autoView( out_v , out, AcceleratorWrite);
+    autoView( Stencil_v  , Stencil, AcceleratorRead);
+    typedef LatticeView<Cobj> Aview;
+
+    Vector<Aview> AcceleratorViewContainer;
+    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
+    Aview *Aview_p = & AcceleratorViewContainer[0];
+
+    const int Nsimd = CComplex::Nsimd();
+    typedef decltype(coalescedRead(in_v[0])) calcVector;
+    typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
+
+    // Hand the kernel plain scalars and raw pointers (as already done for Aview_p
+    // and dag_factor_p) rather than the host-side geom and points containers.
+    const int npoint = geom.npoint;
+    Vector<int> points(npoint, 0);
+    for(int p=0; p<npoint; p++)
+      points[p] = geom.points_dagger[p];
+    int*   points_p     = &points[0];
+    RealD* dag_factor_p = &dag_factor[0];
+
+    accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
+      int ss = sss/nbasis;
+      int b  = sss%nbasis;
+      calcComplex res = Zero();
+      calcVector nbr;
+      int ptype;
+      StencilEntry *SE;
+
+      for(int p=0;p<npoint;p++){
+        int point = points_p[p];
+        SE=Stencil_v.GetEntry(ptype,point,ss);
+
+        if(SE->_is_local) {
+          nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
+        } else {
+          nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]);
+        }
+        acceleratorSynchronise();
+
+        for(int bb=0;bb<nbasis;bb++) {
+          res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
+        }
+      }
+      coalescedWrite(out_v[ss](b),res);
+      });
+
+    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
+  }
+
  void MdirComms(const CoarseVector &in)
  {
    SimpleCompressor<siteVector> compressor;
@@ -359,6 +483,7 @@ public:
  {
    conformable(_grid,in.Grid());
    conformable(_grid,out.Grid());
+    out.Checkerboard() = in.Checkerboard();

    typedef LatticeView<Cobj> Aview;
    Vector<Aview> AcceleratorViewContainer;
@@ -367,6 +492,7 @@ public:

    autoView( out_v , out, AcceleratorWrite);
    autoView( in_v  , in, AcceleratorRead);
+    autoView( Stencil_v  , Stencil, AcceleratorRead);

    const int Nsimd = CComplex::Nsimd();
    typedef decltype(coalescedRead(in_v[0])) calcVector;
@@ -380,12 +506,12 @@ public:
      int ptype;
      StencilEntry *SE;

-      SE=Stencil.GetEntry(ptype,point,ss);
+      SE=Stencil_v.GetEntry(ptype,point,ss);
 	  
      if(SE->_is_local) { 
 	nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
      } else {
-	nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
+	nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]);
      }
      acceleratorSynchronise();

@@ -413,34 +539,7 @@ public:

    this->MdirComms(in);

-    int ndim = in.Grid()->Nd();
-
-    //////////////
-    // 4D action like wilson
-    // 0+ => 0 
-    // 0- => 1
-    // 1+ => 2 
-    // 1- => 3
-    // etc..
-    //////////////
-    // 5D action like DWF
-    // 1+ => 0 
-    // 1- => 1
-    // 2+ => 2 
-    // 2- => 3
-    // etc..
-    auto point = [dir, disp, ndim](){
-      if(dir == 0 and disp == 0)
-	return 8;
-      else if ( ndim==4 ) { 
-	return (4 * dir + 1 - disp) / 2;
-      } else { 
-	return (4 * (dir-1) + 1 - disp) / 2;
-      }
-    }();
-
-    MdirCalc(in,out,point);
-
+    MdirCalc(in,out,geom.point(dir,disp));
  };

  void Mdiag(const CoarseVector &in, CoarseVector &out)
@@ -449,23 +548,296 @@ public:
    MdirCalc(in, out, point); // No comms
  };

-  
- CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) 	: 
+  void Mooee(const CoarseVector &in, CoarseVector &out) {
+    MooeeInternal(in, out, DaggerNo, InverseNo);
+  }

+  void MooeeInv(const CoarseVector &in, CoarseVector &out) {
+    MooeeInternal(in, out, DaggerNo, InverseYes);
+  }
+
+  void MooeeDag(const CoarseVector &in, CoarseVector &out) {
+    MooeeInternal(in, out, DaggerYes, InverseNo);
+  }
+
+  void MooeeInvDag(const CoarseVector &in, CoarseVector &out) {
+    MooeeInternal(in, out, DaggerYes, InverseYes);
+  }
+
+  void Meooe(const CoarseVector &in, CoarseVector &out) {
+    if(in.Checkerboard() == Odd) {
+      DhopEO(in, out, DaggerNo);
+    } else {
+      DhopOE(in, out, DaggerNo);
+    }
+  }
+
+  void MeooeDag(const CoarseVector &in, CoarseVector &out) {
+    if(in.Checkerboard() == Odd) {
+      DhopEO(in, out, DaggerYes);
+    } else {
+      DhopOE(in, out, DaggerYes);
+    }
+  }
+
+  void Dhop(const CoarseVector &in, CoarseVector &out, int dag) {
+    conformable(in.Grid(), _grid); // verifies full grid
+    conformable(in.Grid(), out.Grid());
+
+    out.Checkerboard() = in.Checkerboard();
+
+    DhopInternal(Stencil, A, in, out, dag);
+  }
+
+  void DhopOE(const CoarseVector &in, CoarseVector &out, int dag) {
+    conformable(in.Grid(), _cbgrid);    // verifies half grid
+    conformable(in.Grid(), out.Grid()); // drops the cb check
+
+    assert(in.Checkerboard() == Even);
+    out.Checkerboard() = Odd;
+
+    DhopInternal(StencilEven, Aodd, in, out, dag);
+  }
+
+  void DhopEO(const CoarseVector &in, CoarseVector &out, int dag) {
+    conformable(in.Grid(), _cbgrid);    // verifies half grid
+    conformable(in.Grid(), out.Grid()); // drops the cb check
+
+    assert(in.Checkerboard() == Odd);
+    out.Checkerboard() = Even;
+
+    DhopInternal(StencilOdd, Aeven, in, out, dag);
+  }
+
+  // Apply the self-link term (its stored inverse when inv is set) for the grid/checkerboard of `in`.
+  void MooeeInternal(const CoarseVector &in, CoarseVector &out, int dag, int inv) {
+    out.Checkerboard() = in.Checkerboard();
+    assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
+    CoarseMatrix *Aself = nullptr;
+    CartesianStencil<siteVector,siteVector,int> *st = nullptr;
+    if(!in.Grid()->_isCheckerBoarded) {
+      st    = &Stencil;
+      Aself = (inv) ? &AselfInv : &A[geom.npoint-1];
+    } else if(in.Checkerboard() == Odd) {
+      st    = &StencilOdd;
+      Aself = (inv) ? &AselfInvOdd : &Aodd[geom.npoint-1];
+    } else {
+      st    = &StencilEven;
+      Aself = (inv) ? &AselfInvEven : &Aeven[geom.npoint-1];
+    }
+    assert(st != nullptr && Aself != nullptr); // checked before use, not after
+    DselfInternal(*st, *Aself, in, out, dag);
+  }
+
+  void DselfInternal(CartesianStencil<siteVector,siteVector,int> &st, CoarseMatrix &a,
+                       const CoarseVector &in, CoarseVector &out, int dag) {
+    int point = geom.npoint-1;
+    autoView( out_v, out, AcceleratorWrite);
+    autoView( in_v,  in,  AcceleratorRead);
+    autoView( st_v,  st,  AcceleratorRead);
+    autoView( a_v,   a,   AcceleratorRead);
+
+    const int Nsimd = CComplex::Nsimd();
+    typedef decltype(coalescedRead(in_v[0])) calcVector;
+    typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
+
+    RealD* dag_factor_p = &dag_factor[0];
+
+    if(dag) {
+      accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
+        int ss = sss/nbasis;
+        int b  = sss%nbasis;
+        calcComplex res = Zero();
+        calcVector nbr;
+        int ptype;
+        StencilEntry *SE;
+
+        SE=st_v.GetEntry(ptype,point,ss);
+
+        if(SE->_is_local) {
+          nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
+        } else {
+          nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
+        }
+        acceleratorSynchronise();
+
+        for(int bb=0;bb<nbasis;bb++) {
+          res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(a_v[ss](b,bb))*nbr(bb);
+        }
+        coalescedWrite(out_v[ss](b),res);
+      });
+    } else {
+      accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
+        int ss = sss/nbasis;
+        int b  = sss%nbasis;
+        calcComplex res = Zero();
+        calcVector nbr;
+        int ptype;
+        StencilEntry *SE;
+
+        SE=st_v.GetEntry(ptype,point,ss);
+
+        if(SE->_is_local) {
+          nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
+        } else {
+          nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
+        }
+        acceleratorSynchronise();
+
+        for(int bb=0;bb<nbasis;bb++) {
+          res = res + coalescedRead(a_v[ss](b,bb))*nbr(bb);
+        }
+        coalescedWrite(out_v[ss](b),res);
+      });
+    }
+  }
+
+  // Hopping term: accumulate the npoint-1 non-self stencil legs of `a` acting on `in`.
+  // With dag set, elements are weighted by dag_factor and, unless hermitian, legs follow points_dagger order.
+  void DhopInternal(CartesianStencil<siteVector,siteVector,int> &st, std::vector<CoarseMatrix> &a,
+                    const CoarseVector &in, CoarseVector &out, int dag) {
+    SimpleCompressor<siteVector> compressor;
+    st.HaloExchange(in,compressor);
+    autoView( in_v,  in,  AcceleratorRead);
+    autoView( out_v, out, AcceleratorWrite);
+    autoView( st_v , st,  AcceleratorRead);
+    typedef LatticeView<Cobj> Aview;
+
+    // determine in what order we need the points
+    int npoint = geom.npoint-1;
+    Vector<int> points(npoint, 0);
+    for(int p=0; p<npoint; p++)
+      points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p;
+    int* points_p = &points[0]; // raw pointer for the kernels, like Aview_p and dag_factor_p
+
+    Vector<Aview> AcceleratorViewContainer;
+    for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead));
+    Aview *Aview_p = & AcceleratorViewContainer[0];
+    const int Nsimd = CComplex::Nsimd();
+    typedef decltype(coalescedRead(in_v[0])) calcVector;
+    typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
+    RealD* dag_factor_p = &dag_factor[0];
+
+    if(dag) {
+      accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
+        int ss = sss/nbasis;
+        int b  = sss%nbasis;
+        calcComplex res = Zero();
+        calcVector nbr;
+        int ptype;
+        StencilEntry *SE;
+
+        for(int p=0;p<npoint;p++){
+          int point = points_p[p];
+          SE=st_v.GetEntry(ptype,point,ss);
+
+          if(SE->_is_local) {
+            nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
+          } else {
+            nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
+          }
+          acceleratorSynchronise();
+
+          for(int bb=0;bb<nbasis;bb++) {
+            res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
+          }
+        }
+        coalescedWrite(out_v[ss](b),res);
+      });
+    } else {
+      accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
+        int ss = sss/nbasis;
+        int b  = sss%nbasis;
+        calcComplex res = Zero();
+        calcVector nbr;
+        int ptype;
+        StencilEntry *SE;
+
+        for(int p=0;p<npoint;p++){
+          int point = points_p[p];
+          SE=st_v.GetEntry(ptype,point,ss);
+
+          if(SE->_is_local) {
+            nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
+          } else {
+            nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
+          }
+          acceleratorSynchronise();
+
+          for(int bb=0;bb<nbasis;bb++) {
+            res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
+          }
+        }
+        coalescedWrite(out_v[ss](b),res);
+      });
+    }
+
+    for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose();
+  }
+  
+  CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) 	:
    _grid(&CoarseGrid),
+    _cbgrid(new GridRedBlackCartesian(&CoarseGrid)),
    geom(CoarseGrid._ndimension),
    hermitian(hermitian_),
    Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
-      A(geom.npoint,&CoarseGrid)
+    StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements,0),
+    StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
+    A(geom.npoint,&CoarseGrid),
+    Aeven(geom.npoint,_cbgrid),
+    Aodd(geom.npoint,_cbgrid),
+    AselfInv(&CoarseGrid),
+    AselfInvEven(_cbgrid),
+    AselfInvOdd(_cbgrid),
+    dag_factor(nbasis*nbasis)
  {
+    fillFactor();
  };

+  CoarsenedMatrix(GridCartesian &CoarseGrid, GridRedBlackCartesian &CoarseRBGrid, int hermitian_=0) 	:
+
+    _grid(&CoarseGrid),
+    _cbgrid(&CoarseRBGrid),
+    geom(CoarseGrid._ndimension),
+    hermitian(hermitian_),
+    Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
+    StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
+    StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
+    A(geom.npoint,&CoarseGrid),
+    Aeven(geom.npoint,&CoarseRBGrid),
+    Aodd(geom.npoint,&CoarseRBGrid),
+    AselfInv(&CoarseGrid),
+    AselfInvEven(&CoarseRBGrid),
+    AselfInvOdd(&CoarseRBGrid),
+    dag_factor(nbasis*nbasis)
+  {
+    fillFactor();
+  };
+
+  void fillFactor() {
+    Eigen::MatrixXd dag_factor_eigen = Eigen::MatrixXd::Ones(nbasis, nbasis);
+    if(!hermitian) {
+      const int nb = nbasis/2;
+      dag_factor_eigen.block(0,nb,nb,nb) *= -1.0;
+      dag_factor_eigen.block(nb,0,nb,nb) *= -1.0;
+    }
+
+    // GPU readable prefactor
+    thread_for(i, nbasis*nbasis, {
+      int j = i/nbasis;
+      int k = i%nbasis;
+      dag_factor[i] = dag_factor_eigen(j, k);
+    });
+  }
+
  void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
 		       Aggregation<Fobj,CComplex,nbasis> & Subspace)
  {
    typedef Lattice<typename Fobj::tensor_reduced> FineComplexField;
    typedef typename Fobj::scalar_type scalar_type;

+    std::cout << GridLogMessage<< "CoarsenMatrix "<< std::endl;
+
    FineComplexField one(FineGrid); one=scalar_type(1.0,0.0);
    FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0);

@@ -496,11 +868,13 @@ public:

    CoarseScalar InnerProd(Grid()); 

+    std::cout << GridLogMessage<< "CoarsenMatrix Orthog "<< std::endl;
    // Orthogonalise the subblocks over the basis
    blockOrthogonalise(InnerProd,Subspace.subspace);

    // Compute the matrix elements of linop between this orthonormal
    // set of vectors.
+    std::cout << GridLogMessage<< "CoarsenMatrix masks "<< std::endl;
    int self_stencil=-1;
    for(int p=0;p<geom.npoint;p++)
    { 
@@ -539,7 +913,7 @@ public:

      phi=Subspace.subspace[i];

-      //      std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl;
+      std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl;
      linop.OpDirAll(phi,Mphi_p);
      linop.OpDiag  (phi,Mphi_p[geom.npoint-1]);

@@ -568,6 +942,18 @@ public:
 	    autoView( A_self  , A[self_stencil], AcceleratorWrite);

 	    accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
+	    if ( hermitian && (disp==-1) ) {
+	      for(int pp=0;pp<geom.npoint;pp++){// Find the opposite link and set <j|A|i> = <i|A|j>*
+		int dirp   = geom.directions[pp];
+		int dispp  = geom.displacements[pp];
+		if ( (dirp==dir) && (dispp==1) ){
+		  auto sft = conjugate(Cshift(oZProj,dir,1));
+		  autoView( sft_v    ,  sft  , AcceleratorWrite);
+		  autoView( A_pp     ,  A[pp], AcceleratorWrite);
+		  accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_pp[ss](i,j),sft_v(ss)); });
+		}
+	      }
+	    }

 	  }
 	}
@@ -606,28 +992,54 @@ public:
    }
    if(hermitian) {
      std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
-      ForceHermitian();
    }
+
+    InvertSelfStencilLink(); std::cout << GridLogMessage << "Coarse self link inverted" << std::endl;
+    FillHalfCbs(); std::cout << GridLogMessage << "Coarse half checkerboards filled" << std::endl;
  }

-  void ForceHermitian(void) {
-    CoarseMatrix Diff  (Grid());
-    for(int p=0;p<geom.npoint;p++){
-      int dir   = geom.directions[p];
-      int disp  = geom.displacements[p];
-      if(disp==-1) {
-	// Find the opposite link
-	for(int pp=0;pp<geom.npoint;pp++){
-	  int dirp   = geom.directions[pp];
-	  int dispp  = geom.displacements[pp];
-	  if ( (dirp==dir) && (dispp==1) ){
-	    //	    Diff = adj(Cshift(A[p],dir,1)) - A[pp]; 
-	    //	    std::cout << GridLogMessage<<" Replacing stencil leg "<<pp<<" with leg "<<p<< " diff "<<norm2(Diff) <<std::endl;
-	    A[pp] = adj(Cshift(A[p],dir,1));
-	  }
-	}
-      }
+  void InvertSelfStencilLink() {
+    std::cout << GridLogDebug << "CoarsenedMatrix::InvertSelfStencilLink" << std::endl;
+    int localVolume = Grid()->lSites();
+
+    typedef typename Cobj::scalar_object scalar_object;
+
+    autoView(Aself_v,    A[geom.npoint-1], CpuRead);
+    autoView(AselfInv_v, AselfInv,         CpuWrite);
+    thread_for(site, localVolume, { // NOTE: Not able to bring this to GPU because of Eigen + peek/poke
+      Eigen::MatrixXcd selfLinkEigen    = Eigen::MatrixXcd::Zero(nbasis, nbasis);
+      Eigen::MatrixXcd selfLinkInvEigen = Eigen::MatrixXcd::Zero(nbasis, nbasis);
+
+      scalar_object selfLink    = Zero();
+      scalar_object selfLinkInv = Zero();
+
+      Coordinate lcoor;
+
+      Grid()->LocalIndexToLocalCoor(site, lcoor);
+      peekLocalSite(selfLink, Aself_v, lcoor);
+
+      for (int i = 0; i < nbasis; ++i)
+        for (int j = 0; j < nbasis; ++j)
+          selfLinkEigen(i, j) = static_cast<ComplexD>(TensorRemove(selfLink(i, j)));
+
+      selfLinkInvEigen = selfLinkEigen.inverse();
+
+      for(int i = 0; i < nbasis; ++i)
+        for(int j = 0; j < nbasis; ++j)
+          selfLinkInv(i, j) = selfLinkInvEigen(i, j);
+
+      pokeLocalSite(selfLinkInv, AselfInv_v, lcoor);
+    });
+  }
+
+  void FillHalfCbs() {
+    std::cout << GridLogDebug << "CoarsenedMatrix::FillHalfCbs" << std::endl;
+    for(int p = 0; p < geom.npoint; ++p) {
+      pickCheckerboard(Even, Aeven[p], A[p]);
+      pickCheckerboard(Odd, Aodd[p], A[p]);
    }
+    pickCheckerboard(Even, AselfInvEven, AselfInv);
+    pickCheckerboard(Odd, AselfInvOdd, AselfInv);
  }
 };

--- a/Grid/allocator/AlignedAllocator.cc
+++ b/Grid/allocator/AlignedAllocator.cc
@@ -1,67 +0,0 @@
-#include <Grid/GridCore.h>
-#include <fcntl.h>
-
-NAMESPACE_BEGIN(Grid);
-
-MemoryStats *MemoryProfiler::stats = nullptr;
-bool         MemoryProfiler::debug = false;
-
-void check_huge_pages(void *Buf,uint64_t BYTES)
-{
-#ifdef __linux__
-  int fd = open("/proc/self/pagemap", O_RDONLY);
-  assert(fd >= 0);
-  const int page_size = 4096;
-  uint64_t virt_pfn = (uint64_t)Buf / page_size;
-  off_t offset = sizeof(uint64_t) * virt_pfn;
-  uint64_t npages = (BYTES + page_size-1) / page_size;
-  uint64_t pagedata[npages];
-  uint64_t ret = lseek(fd, offset, SEEK_SET);
-  assert(ret == offset);
-  ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
-  assert(ret == sizeof(uint64_t) * npages);
-  int nhugepages = npages / 512;
-  int n4ktotal, nnothuge;
-  n4ktotal = 0;
-  nnothuge = 0;
-  for (int i = 0; i < nhugepages; ++i) {
-    uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
-    for (int j = 0; j < 512; ++j) {
-      uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
-      ++n4ktotal;
-      if (pageaddr != baseaddr + j * page_size)
-	++nnothuge;
-    }
-  }
-  int rank = CartesianCommunicator::RankWorld();
-  printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
-#endif
-}
-
-std::string sizeString(const size_t bytes)
-{
-  constexpr unsigned int bufSize = 256;
-  const char             *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"};
-  char                   buf[256];
-  size_t                 s     = 0;
-  double                 count = bytes;
-  
-  while (count >= 1024 && s < 7)
-    {
-      s++;
-      count /= 1024;
-    }
-  if (count - floor(count) == 0.0)
-    {
-      snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
-    }
-  else
-    {
-      snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
-    }
-  
-  return std::string(buf);
-}
-
-NAMESPACE_END(Grid);
-
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -165,9 +165,18 @@ template<typename _Tp>  inline bool operator!=(const devAllocator<_Tp>&, const d
 ////////////////////////////////////////////////////////////////////////////////
 // Template typedefs
 ////////////////////////////////////////////////////////////////////////////////
-//template<class T> using commAllocator = devAllocator<T>;
-template<class T> using Vector     = std::vector<T,uvmAllocator<T> >;           
+#ifdef ACCELERATOR_CSHIFT
+// Cshift on device
+template<class T> using cshiftAllocator = devAllocator<T>;
+#else
+// Cshift on host
+template<class T> using cshiftAllocator = std::allocator<T>;
+#endif
+
+template<class T> using Vector        = std::vector<T,uvmAllocator<T> >;           
+template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;           
 template<class T> using commVector = std::vector<T,devAllocator<T> >;
+template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;

 NAMESPACE_END(Grid);

--- a/Grid/allocator/MemoryManager.h
+++ b/Grid/allocator/MemoryManager.h
@@ -34,8 +34,6 @@ NAMESPACE_BEGIN(Grid);

 // Move control to configure.ac and Config.h?

-#define ALLOCATION_CACHE
-#define GRID_ALLOC_ALIGN (2*1024*1024)
 #define GRID_ALLOC_SMALL_LIMIT (4096)

 /*Pinning pages is costly*/
--- a/Grid/allocator/MemoryManagerCache.cc
+++ b/Grid/allocator/MemoryManagerCache.cc
@@ -1,11 +1,12 @@
 #include <Grid/GridCore.h>
-
 #ifndef GRID_UVM

 #warning "Using explicit device memory copies"
 NAMESPACE_BEGIN(Grid);
+//define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
 #define dprintf(...)

+
 ////////////////////////////////////////////////////////////
 // For caching copies of data on device
 ////////////////////////////////////////////////////////////
@@ -103,7 +104,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
  
-  //  dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
+   dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
@@ -111,7 +112,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
    AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
    DeviceBytes   -=AccCache.bytes;
    LRUremove(AccCache);
-    //    dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
+    dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
  }
  uint64_t CpuPtr = AccCache.CpuPtr;
  EntryErase(CpuPtr);
@@ -125,7 +126,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
  
-  //  dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
+  dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  if(AccCache.state==AccDirty) {
@@ -136,7 +137,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
    AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
    DeviceBytes   -=AccCache.bytes;
    LRUremove(AccCache);
-    //    dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
+    dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
  }
  uint64_t CpuPtr = AccCache.CpuPtr;
  EntryErase(CpuPtr);
@@ -149,7 +150,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
  assert(AccCache.AccPtr!=(uint64_t)NULL);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
  acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
-  //  dprintf("MemoryManager: Flush  %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  dprintf("MemoryManager: Flush  %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  DeviceToHostBytes+=AccCache.bytes;
  DeviceToHostXfer++;
  AccCache.state=Consistent;
@@ -164,7 +165,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
    AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
    DeviceBytes+=AccCache.bytes;
  }
-  //  dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
  HostToDeviceBytes+=AccCache.bytes;
  HostToDeviceXfer++;
@@ -227,18 +228,24 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
  // Find if present, otherwise get or force an empty
  ////////////////////////////////////////////////////////////////////////////
  if ( EntryPresent(CpuPtr)==0 ){
-    EvictVictims(bytes);
    EntryCreate(CpuPtr,bytes,mode,hint);
  }

  auto AccCacheIterator = EntryLookup(CpuPtr);
  auto & AccCache = AccCacheIterator->second;
-  
+  if (!AccCache.AccPtr) {
+    EvictVictims(bytes); 
+  } 
  assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));

  assert(AccCache.cpuLock==0);  // Programming error

  if(AccCache.state!=Empty) {
+    dprintf("ViewOpen found entry %llx %llx : %lld %lld\n",
+		    (uint64_t)AccCache.CpuPtr,
+		    (uint64_t)CpuPtr,
+		    (uint64_t)AccCache.bytes,
+		    (uint64_t)bytes);
    assert(AccCache.CpuPtr == CpuPtr);
    assert(AccCache.bytes  ==bytes);
  }
@@ -285,21 +292,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
      AccCache.state  = Consistent; // CpuDirty + AccRead => Consistent
    }
    AccCache.accLock++;
-    //    printf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
+    dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
  } else if(AccCache.state==Consistent) {
    if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
      AccCache.state  = AccDirty;   // Consistent + AcceleratorWrite=> AccDirty
    else
      AccCache.state  = Consistent; // Consistent + AccRead => Consistent
    AccCache.accLock++;
-    //    printf("Consistent entry into device accLock %d\n",AccCache.accLock);
+    dprintf("Consistent entry into device accLock %d\n",AccCache.accLock);
  } else if(AccCache.state==AccDirty) {
    if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
      AccCache.state  = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
    else
      AccCache.state  = AccDirty; // AccDirty + AccRead => AccDirty
    AccCache.accLock++;
-    //    printf("AccDirty entry into device accLock %d\n",AccCache.accLock);
+    dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock);
  } else {
    assert(0);
  }
@@ -361,13 +368,16 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
  // Find if present, otherwise get or force an empty
  ////////////////////////////////////////////////////////////////////////////
  if ( EntryPresent(CpuPtr)==0 ){
-    EvictVictims(bytes);
    EntryCreate(CpuPtr,bytes,mode,transient);
  }

  auto AccCacheIterator = EntryLookup(CpuPtr);
  auto & AccCache = AccCacheIterator->second;
-  
+
+  if (!AccCache.AccPtr) {
+     EvictVictims(bytes);
+  }
+
  assert((mode==CpuRead)||(mode==CpuWrite));
  assert(AccCache.accLock==0);  // Programming error

--- a/Grid/allocator/MemoryManagerShared.cc
+++ b/Grid/allocator/MemoryManagerShared.cc
@@ -1,7 +1,6 @@
 #include <Grid/GridCore.h>
 #ifdef GRID_UVM

-#warning "Grid is assuming unified virtual memory address space"
 NAMESPACE_BEGIN(Grid);
 /////////////////////////////////////////////////////////////////////////////////
 // View management is 1:1 address space mapping
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -44,7 +44,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
  if ( !flag ) {

-#if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori
+#ifndef GRID_COMMS_THREADS
    nCommThreads=1;
    // wrong results here too
    // For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs
@@ -358,16 +358,19 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  assert(from != _processor);
  assert(gme  == ShmRank);
  double off_node_bytes=0.0;
+  int tag;

  if ( gfrom ==MPI_UNDEFINED) {
-    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq);
+    tag= dir+from*32;
+    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
    assert(ierr==0);
    list.push_back(rrq);
    off_node_bytes+=bytes;
  }

  if ( gdest == MPI_UNDEFINED ) {
-    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[commdir],&xrq);
+    tag= dir+_processor*32;
+    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
    assert(ierr==0);
    list.push_back(xrq);
    off_node_bytes+=bytes;
--- a/Grid/communicator/SharedMemory.h
+++ b/Grid/communicator/SharedMemory.h
@@ -102,7 +102,7 @@ public:
  ///////////////////////////////////////////////////
  static void SharedMemoryAllocate(uint64_t bytes, int flags);
  static void SharedMemoryFree(void);
-  static void SharedMemoryCopy(void *dest,const void *src,size_t bytes);
+  static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
  static void SharedMemoryZero(void *dest,size_t bytes);

 };
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -457,8 +457,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
    exit(EXIT_FAILURE);  
  }
-  if ( WorldRank == 0 ){
-    std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes 
+  //  if ( WorldRank == 0 ){
+  if ( 1 ){
+    std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
 	      << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
  }
  SharedMemoryZero(ShmCommBuf,bytes);
@@ -665,7 +666,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #endif
      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
      
-      //      std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
      if ( ptr == (void * )MAP_FAILED ) {       
 	perror("failed mmap");     
 	assert(0);    
@@ -715,7 +715,7 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
  bzero(dest,bytes);
 #endif
 }
-void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
+void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
 {
 #ifdef GRID_CUDA
  cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
@@ -771,19 +771,12 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
  std::vector<int> ranks(size);   for(int r=0;r<size;r++) ranks[r]=r;
  MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]); 

-#ifdef GRID_IBM_SUMMIT
-  // Hide the shared memory path between sockets 
-  // if even number of nodes
-  if ( (ShmSize & 0x1)==0 ) {
-    int SocketSize = ShmSize/2;
-    int mySocket = ShmRank/SocketSize; 
+#ifdef GRID_SHM_FORCE_MPI
+  // Hide the shared memory path between ranks
+  {
    for(int r=0;r<size;r++){
-      int hisRank=ShmRanks[r];
-      if ( hisRank!= MPI_UNDEFINED ) {
-	int hisSocket=hisRank/SocketSize;
-	if ( hisSocket != mySocket ) {
-	  ShmRanks[r] = MPI_UNDEFINED;
-	}
+      if ( r!=rank ) {
+	ShmRanks[r] = MPI_UNDEFINED;
      }
    }
  }
--- a/Grid/communicator/SharedMemoryNone.cc
+++ b/Grid/communicator/SharedMemoryNone.cc
@@ -29,6 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridCore.h>

 NAMESPACE_BEGIN(Grid); 
+#define header "SharedMemoryNone: "

 /*Construct from an MPI communicator*/
 void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
@@ -55,6 +56,38 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Hugetlbfs mapping intended, use anonymous mmap
 ////////////////////////////////////////////////////////////////////////////////////////////
+#if 1
+void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
+{
+  std::cout << header "SharedMemoryAllocate "<< bytes<< " GPU implementation "<<std::endl;
+  void * ShmCommBuf ; 
+  assert(_ShmSetup==1);
+  assert(_ShmAlloc==0);
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Each rank allocates its own device buffer for comms
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
+  ShmCommBuf = acceleratorAllocDevice(bytes);
+
+  if (ShmCommBuf == (void *)NULL ) {
+    std::cerr << " SharedMemoryNone.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
+    exit(EXIT_FAILURE);  
+  }
+  if ( WorldRank == 0 ){
+    std::cout << WorldRank << header " SharedMemoryNone.cc acceleratorAllocDevice "<< bytes 
+	      << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
+  }
+  SharedMemoryZero(ShmCommBuf,bytes);
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // No loop over ranks/GPUs here: the single buffer is recorded in slot 0
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
+  WorldShmCommBufs[0] = ShmCommBuf;
+
+  _ShmAllocBytes=bytes;
+  _ShmAlloc=1;
+}
+#else
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
  void * ShmCommBuf ; 
@@ -83,7 +116,15 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  _ShmAllocBytes=bytes;
  _ShmAlloc=1;
 };
-
+#endif
+void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
+{
+  acceleratorMemSet(dest,0,bytes);
+}
+void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
+{
+  acceleratorCopyToDevice(src,dest,bytes);
+}
 ////////////////////////////////////////////////////////
 // Global shared functionality finished
 // Now move to per communicator functionality
--- a/Grid/cshift/Cshift_common.h
+++ b/Grid/cshift/Cshift_common.h
@@ -35,7 +35,7 @@ extern Vector<std::pair<int,int> > Cshift_table;
 // Gather for when there is no need to SIMD split 
 ///////////////////////////////////////////////////////////////////
 template<class vobj> void 
-Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
+Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
 {
  int rd = rhs.Grid()->_rdimensions[dimension];

@@ -73,12 +73,19 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
     }
  }
  {
-    autoView(rhs_v , rhs, AcceleratorRead);
    auto buffer_p = & buffer[0];
    auto table = &Cshift_table[0];
+#ifdef ACCELERATOR_CSHIFT    
+    autoView(rhs_v , rhs, AcceleratorRead);
    accelerator_for(i,ent,vobj::Nsimd(),{
 	coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
    });
+#else
+    autoView(rhs_v , rhs, CpuRead);
+    thread_for(i,ent,{
+      buffer_p[table[i].first]=rhs_v[table[i].second];
+    });
+#endif
  }
 }

@@ -103,6 +110,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
  int n1=rhs.Grid()->_slice_stride[dimension];

  if ( cbmask ==0x3){
+#ifdef ACCELERATOR_CSHIFT    
    autoView(rhs_v , rhs, AcceleratorRead);
    accelerator_for2d(n,e1,b,e2,1,{
 	int o      =   n*n1;
@@ -111,12 +119,22 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
 	vobj temp =rhs_v[so+o+b];
 	extract<vobj>(temp,pointers,offset);
      });
+#else
+    autoView(rhs_v , rhs, CpuRead);
+    thread_for2d(n,e1,b,e2,{
+	int o      =   n*n1;
+	int offset = b+n*e2;
+	
+	vobj temp =rhs_v[so+o+b];
+	extract<vobj>(temp,pointers,offset);
+      });
+#endif
  } else { 
-    autoView(rhs_v , rhs, AcceleratorRead);
-
    Coordinate rdim=rhs.Grid()->_rdimensions;
    Coordinate cdm =rhs.Grid()->_checker_dim_mask;
    std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
+#ifdef ACCELERATOR_CSHIFT    
+    autoView(rhs_v , rhs, AcceleratorRead);
    accelerator_for2d(n,e1,b,e2,1,{

 	Coordinate coor;
@@ -134,13 +152,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
 	  extract<vobj>(temp,pointers,offset);
 	}
      });
+#else
+    autoView(rhs_v , rhs, CpuRead);
+    thread_for2d(n,e1,b,e2,{
+
+	Coordinate coor;
+
+	int o=n*n1;
+	int oindex = o+b;
+
+       	int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
+
+	int ocb=1<<cb;
+	int offset = b+n*e2;
+
+	if ( ocb & cbmask ) {
+	  vobj temp =rhs_v[so+o+b];
+	  extract<vobj>(temp,pointers,offset);
+	}
+      });
+#endif
  }
 }

 //////////////////////////////////////////////////////
 // Scatter for when there is no need to SIMD split
 //////////////////////////////////////////////////////
-template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
+template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
 {
  int rd = rhs.Grid()->_rdimensions[dimension];

@@ -182,12 +220,19 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
  }
  
  {
-    autoView( rhs_v, rhs, AcceleratorWrite);
    auto buffer_p = & buffer[0];
    auto table = &Cshift_table[0];
+#ifdef ACCELERATOR_CSHIFT    
+    autoView( rhs_v, rhs, AcceleratorWrite);
    accelerator_for(i,ent,vobj::Nsimd(),{
 	coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
    });
+#else
+    autoView( rhs_v, rhs, CpuWrite);
+    thread_for(i,ent,{
+      rhs_v[table[i].first]=buffer_p[table[i].second];
+    });
+#endif
  }
 }

@@ -208,14 +253,23 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
  int e2=rhs.Grid()->_slice_block[dimension];

  if(cbmask ==0x3 ) {
-    autoView( rhs_v , rhs, AcceleratorWrite);
    int _slice_stride = rhs.Grid()->_slice_stride[dimension];
    int _slice_block = rhs.Grid()->_slice_block[dimension];
+#ifdef ACCELERATOR_CSHIFT    
+    autoView( rhs_v , rhs, AcceleratorWrite);
    accelerator_for2d(n,e1,b,e2,1,{
 	int o      = n*_slice_stride;
 	int offset = b+n*_slice_block;
 	merge(rhs_v[so+o+b],pointers,offset);
      });
+#else
+    autoView( rhs_v , rhs, CpuWrite);
+    thread_for2d(n,e1,b,e2,{
+	int o      = n*_slice_stride;
+	int offset = b+n*_slice_block;
+	merge(rhs_v[so+o+b],pointers,offset);
+    });
+#endif
  } else { 

    // Case of SIMD split AND checker dim cannot currently be hit, except in 
@@ -280,12 +334,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
  }

  {
+    auto table = &Cshift_table[0];
+#ifdef ACCELERATOR_CSHIFT    
    autoView(rhs_v , rhs, AcceleratorRead);
    autoView(lhs_v , lhs, AcceleratorWrite);
-    auto table = &Cshift_table[0];
    accelerator_for(i,ent,vobj::Nsimd(),{
      coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
    });
+#else
+    autoView(rhs_v , rhs, CpuRead);
+    autoView(lhs_v , lhs, CpuWrite);
+    thread_for(i,ent,{
+      lhs_v[table[i].first]=rhs_v[table[i].second];
+    });
+#endif
  }
 }

@@ -324,12 +386,20 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
  }

  {
+    auto table = &Cshift_table[0];
+#ifdef ACCELERATOR_CSHIFT    
    autoView( rhs_v, rhs, AcceleratorRead);
    autoView( lhs_v, lhs, AcceleratorWrite);
-    auto table = &Cshift_table[0];
    accelerator_for(i,ent,1,{
      permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
    });
+#else
+    autoView( rhs_v, rhs, CpuRead);
+    autoView( lhs_v, lhs, CpuWrite);
+    thread_for(i,ent,{
+      permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
+    });
+#endif
  }
 }

--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@@ -101,7 +101,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
    Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
  }
 }
-
+#define ACCELERATOR_CSHIFT_NO_COPY
+#ifdef ACCELERATOR_CSHIFT_NO_COPY
 template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
 {
  typedef typename vobj::vector_type vector_type;
@@ -121,9 +122,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  assert(shift<fd);
  
  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
-  commVector<vobj> send_buf(buffer_size);
-  commVector<vobj> recv_buf(buffer_size);
-
+  cshiftVector<vobj> send_buf(buffer_size);
+  cshiftVector<vobj> recv_buf(buffer_size);
+    
  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);

@@ -138,7 +139,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r

    } else {

-      int words = send_buf.size();
+      int words = buffer_size;
      if (cbmask != 0x3) words=words>>1;

      int bytes = words * sizeof(vobj);
@@ -150,12 +151,14 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
      int xmit_to_rank;
      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);

+      grid->Barrier();

      grid->SendToRecvFrom((void *)&send_buf[0],
 			   xmit_to_rank,
 			   (void *)&recv_buf[0],
 			   recv_from_rank,
 			   bytes);
+
      grid->Barrier();

      Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
@@ -195,8 +198,15 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
  //  int words = sizeof(vobj)/sizeof(vector_type);

-  std::vector<commVector<scalar_object> >   send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
-  std::vector<commVector<scalar_object> >   recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
+  std::vector<cshiftVector<scalar_object> >  send_buf_extract(Nsimd);
+  std::vector<cshiftVector<scalar_object> >  recv_buf_extract(Nsimd);
+  scalar_object *  recv_buf_extract_mpi;
+  scalar_object *  send_buf_extract_mpi;
+ 
+  for(int s=0;s<Nsimd;s++){
+    send_buf_extract[s].resize(buffer_size);
+    recv_buf_extract[s].resize(buffer_size);
+  }

  int bytes = buffer_size*sizeof(scalar_object);

@@ -242,11 +252,204 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
      if(nbr_proc){
 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 

-	grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
+	grid->Barrier();
+
+	send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
+	recv_buf_extract_mpi = &recv_buf_extract[i][0];
+	grid->SendToRecvFrom((void *)send_buf_extract_mpi,
 			     xmit_to_rank,
-			     (void *)&recv_buf_extract[i][0],
+			     (void *)recv_buf_extract_mpi,
 			     recv_from_rank,
 			     bytes);
+
+	grid->Barrier();
+
+	rpointers[i] = &recv_buf_extract[i][0];
+      } else { 
+	rpointers[i] = &send_buf_extract[nbr_lane][0];
+      }
+
+    }
+    Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
+  }
+
+}
+#else 
+template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
+{
+  typedef typename vobj::vector_type vector_type;
+  typedef typename vobj::scalar_type scalar_type;
+
+  GridBase *grid=rhs.Grid();
+  Lattice<vobj> temp(rhs.Grid());
+
+  int fd              = rhs.Grid()->_fdimensions[dimension];
+  int rd              = rhs.Grid()->_rdimensions[dimension];
+  int pd              = rhs.Grid()->_processors[dimension];
+  int simd_layout     = rhs.Grid()->_simd_layout[dimension];
+  int comm_dim        = rhs.Grid()->_processors[dimension] >1 ;
+  assert(simd_layout==1);
+  assert(comm_dim==1);
+  assert(shift>=0);
+  assert(shift<fd);
+  
+  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
+  cshiftVector<vobj> send_buf_v(buffer_size);
+  cshiftVector<vobj> recv_buf_v(buffer_size);
+  vobj *send_buf;
+  vobj *recv_buf;
+  {
+    grid->ShmBufferFreeAll();
+    size_t bytes = buffer_size*sizeof(vobj);
+    send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
+    recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
+  }
+    
+  int cb= (cbmask==0x2)? Odd : Even;
+  int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
+
+  for(int x=0;x<rd;x++){       
+
+    int sx        =  (x+sshift)%rd;
+    int comm_proc = ((x+sshift)/rd)%pd;
+    
+    if (comm_proc==0) {
+
+      Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
+
+    } else {
+
+      int words = buffer_size;
+      if (cbmask != 0x3) words=words>>1;
+
+      int bytes = words * sizeof(vobj);
+
+      Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
+
+      //      int rank           = grid->_processor;
+      int recv_from_rank;
+      int xmit_to_rank;
+      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
+
+
+      grid->Barrier();
+
+      acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
+      grid->SendToRecvFrom((void *)&send_buf[0],
+			   xmit_to_rank,
+			   (void *)&recv_buf[0],
+			   recv_from_rank,
+			   bytes);
+      acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
+
+      grid->Barrier();
+
+      Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
+    }
+  }
+}
+
+template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
+{
+  GridBase *grid=rhs.Grid();
+  const int Nsimd = grid->Nsimd();
+  typedef typename vobj::vector_type vector_type;
+  typedef typename vobj::scalar_object scalar_object;
+  typedef typename vobj::scalar_type scalar_type;
+   
+  int fd = grid->_fdimensions[dimension];
+  int rd = grid->_rdimensions[dimension];
+  int ld = grid->_ldimensions[dimension];
+  int pd = grid->_processors[dimension];
+  int simd_layout     = grid->_simd_layout[dimension];
+  int comm_dim        = grid->_processors[dimension] >1 ;
+
+  //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
+  //    << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout 
+  //    << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
+
+  assert(comm_dim==1);
+  assert(simd_layout==2);
+  assert(shift>=0);
+  assert(shift<fd);
+
+  int permute_type=grid->PermuteType(dimension);
+
+  ///////////////////////////////////////////////
+  // Simd direction uses an extract/merge pair
+  ///////////////////////////////////////////////
+  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
+  //  int words = sizeof(vobj)/sizeof(vector_type);
+
+  std::vector<cshiftVector<scalar_object> >  send_buf_extract(Nsimd);
+  std::vector<cshiftVector<scalar_object> >  recv_buf_extract(Nsimd);
+  scalar_object *  recv_buf_extract_mpi;
+  scalar_object *  send_buf_extract_mpi;
+  {
+    size_t bytes = sizeof(scalar_object)*buffer_size;
+    grid->ShmBufferFreeAll();
+    send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
+    recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
+  }
+  for(int s=0;s<Nsimd;s++){
+    send_buf_extract[s].resize(buffer_size);
+    recv_buf_extract[s].resize(buffer_size);
+  }
+
+  int bytes = buffer_size*sizeof(scalar_object);
+
+  ExtractPointerArray<scalar_object>  pointers(Nsimd); // 
+  ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
+
+  ///////////////////////////////////////////
+  // Work out what to send where
+  ///////////////////////////////////////////
+  int cb    = (cbmask==0x2)? Odd : Even;
+  int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
+
+  // loop over outer coord planes orthog to dim
+  for(int x=0;x<rd;x++){       
+
+    // FIXME call local permute copy if none are offnode.
+    for(int i=0;i<Nsimd;i++){       
+      pointers[i] = &send_buf_extract[i][0];
+    }
+    int sx   = (x+sshift)%rd;
+    Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
+
+    for(int i=0;i<Nsimd;i++){
+      
+      int inner_bit = (Nsimd>>(permute_type+1));
+      int ic= (i&inner_bit)? 1:0;
+
+      int my_coor          = rd*ic + x;
+      int nbr_coor         = my_coor+sshift;
+      int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
+
+      int nbr_ic   = (nbr_coor%ld)/rd;    // inner coord of peer
+      int nbr_ox   = (nbr_coor%rd);       // outer coord of peer
+      int nbr_lane = (i&(~inner_bit));
+
+      int recv_from_rank;
+      int xmit_to_rank;
+
+      if (nbr_ic) nbr_lane|=inner_bit;
+
+      assert (sx == nbr_ox);
+
+      if(nbr_proc){
+	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
+
+	grid->Barrier();
+
+	acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
+	grid->SendToRecvFrom((void *)send_buf_extract_mpi,
+			     xmit_to_rank,
+			     (void *)recv_buf_extract_mpi,
+			     recv_from_rank,
+			     bytes);
+	acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
+
 	grid->Barrier();
 	rpointers[i] = &recv_buf_extract[i][0];
      } else { 
@@ -258,7 +461,7 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  }

 }
-
+#endif
 NAMESPACE_END(Grid); 

 #endif
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@@ -36,7 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_local.h>
 #include <Grid/lattice/Lattice_reduction.h>
 #include <Grid/lattice/Lattice_peekpoke.h>
-//#include <Grid/lattice/Lattice_reality.h>
+#include <Grid/lattice/Lattice_reality.h>
 #include <Grid/lattice/Lattice_real_imag.h>
 #include <Grid/lattice/Lattice_comparison_utils.h>
 #include <Grid/lattice/Lattice_comparison.h>
--- a/Grid/lattice/Lattice_ET.h
+++ b/Grid/lattice/Lattice_ET.h
@@ -342,19 +342,14 @@ inline void ExpressionViewClose(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)

 GridUnopClass(UnarySub, -a);
 GridUnopClass(UnaryNot, Not(a));
-GridUnopClass(UnaryAdj, adj(a));
-GridUnopClass(UnaryConj, conjugate(a));
 GridUnopClass(UnaryTrace, trace(a));
 GridUnopClass(UnaryTranspose, transpose(a));
 GridUnopClass(UnaryTa, Ta(a));
 GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
-GridUnopClass(UnaryToReal, toReal(a));
-GridUnopClass(UnaryToComplex, toComplex(a));
 GridUnopClass(UnaryTimesI, timesI(a));
 GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
 GridUnopClass(UnaryAbs, abs(a));
 GridUnopClass(UnarySqrt, sqrt(a));
-GridUnopClass(UnaryRsqrt, rsqrt(a));
 GridUnopClass(UnarySin, sin(a));
 GridUnopClass(UnaryCos, cos(a));
 GridUnopClass(UnaryAsin, asin(a));
@@ -456,20 +451,17 @@ GridTrinOpClass(TrinaryWhere,
 GRID_DEF_UNOP(operator-, UnarySub);
 GRID_DEF_UNOP(Not, UnaryNot);
 GRID_DEF_UNOP(operator!, UnaryNot);
-GRID_DEF_UNOP(adj, UnaryAdj);
-GRID_DEF_UNOP(conjugate, UnaryConj);
+//GRID_DEF_UNOP(adj, UnaryAdj);
+//GRID_DEF_UNOP(conjugate, UnaryConj);
 GRID_DEF_UNOP(trace, UnaryTrace);
 GRID_DEF_UNOP(transpose, UnaryTranspose);
 GRID_DEF_UNOP(Ta, UnaryTa);
 GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
-GRID_DEF_UNOP(toReal, UnaryToReal);
-GRID_DEF_UNOP(toComplex, UnaryToComplex);
 GRID_DEF_UNOP(timesI, UnaryTimesI);
 GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
 GRID_DEF_UNOP(abs, UnaryAbs);  // abs overloaded in cmath C++98; DON'T do the
                               // abs-fabs-dabs-labs thing
 GRID_DEF_UNOP(sqrt, UnarySqrt);
-GRID_DEF_UNOP(rsqrt, UnaryRsqrt);
 GRID_DEF_UNOP(sin, UnarySin);
 GRID_DEF_UNOP(cos, UnaryCos);
 GRID_DEF_UNOP(asin, UnaryAsin);
@@ -494,27 +486,27 @@ GRID_DEF_TRINOP(where, TrinaryWhere);
 /////////////////////////////////////////////////////////////
 template <class Op, class T1>
 auto closure(const LatticeUnaryExpression<Op, T1> &expr)
-  -> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1)))> 
+  -> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1)))>::type > 
 {
-  Lattice<decltype(expr.op.func(vecEval(0, expr.arg1)))> ret(expr);
+  Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1)))>::type > ret(expr);
  return ret;
 }
 template <class Op, class T1, class T2>
 auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
-  -> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))> 
+  -> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>::type >
 {
-  Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))> ret(expr);
+  Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>::type > ret(expr);
  return ret;
 }
 template <class Op, class T1, class T2, class T3>
 auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-  -> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),
+  -> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),
 				   vecEval(0, expr.arg2),
-				   vecEval(0, expr.arg3)))> 
+				   vecEval(0, expr.arg3)))>::type >
 {
-  Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),
+  Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),
 				vecEval(0, expr.arg2),
-			        vecEval(0, expr.arg3)))>  ret(expr);
+			        vecEval(0, expr.arg3)))>::type >  ret(expr);
  return ret;
 }
 #define EXPRESSION_CLOSURE(function)					\
--- a/Grid/lattice/Lattice_basis.h
+++ b/Grid/lattice/Lattice_basis.h
@@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
    basis_v.push_back(basis[k].View(AcceleratorWrite));
  }

-#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) )
+#if ( (!defined(GRID_CUDA)) )
  int max_threads = thread_max();
  Vector < vobj > Bt(Nm * max_threads);
  thread_region
@@ -161,11 +161,13 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
  double * Qt_j = & Qt_jv[0];
  for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);

+  auto basis_vp=& basis_v[0];
  autoView(result_v,result,AcceleratorWrite);
  accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
-    auto B=coalescedRead(zz);
+    vobj zzz=Zero();
+    auto B=coalescedRead(zzz);
    for(int k=k0; k<k1; ++k){
-      B +=Qt_j[k] * coalescedRead(basis_v[k][ss]);
+      B +=Qt_j[k] * coalescedRead(basis_vp[k][ss]);
    }
    coalescedWrite(result_v[ss], B);
  });
--- a/Grid/lattice/Lattice_reality.h
+++ b/Grid/lattice/Lattice_reality.h
@@ -45,8 +45,8 @@ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
  autoView( ret_v, ret, AcceleratorWrite);

  ret.Checkerboard()=lhs.Checkerboard();
-  accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
-    coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
+  accelerator_for( ss, lhs_v.size(), 1, {
+     ret_v[ss] = adj(lhs_v[ss]);
  });
  return ret;
 };
@@ -64,6 +64,53 @@ template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
  return ret;
 };

+template<class vobj> inline Lattice<typename vobj::Complexified> toComplex(const Lattice<vobj> &lhs){
+  Lattice<typename vobj::Complexified> out(lhs.Grid());
+  // Site-wise toComplex(): the result shares lhs's grid and takes over its checkerboard.
+  autoView( in_v , lhs, AcceleratorRead);
+  autoView( out_v, out, AcceleratorWrite);
+
+  out.Checkerboard() = lhs.Checkerboard();
+  accelerator_for( site, in_v.size(), 1, {
+    out_v[site] = toComplex(in_v[site]);
+  });
+  return out;
+};
+template<class vobj> inline Lattice<typename vobj::Realified> toReal(const Lattice<vobj> &lhs){
+  Lattice<typename vobj::Realified> out(lhs.Grid());
+  // Site-wise toReal(): the result shares lhs's grid and takes over its checkerboard.
+  autoView( in_v , lhs, AcceleratorRead);
+  autoView( out_v, out, AcceleratorWrite);
+
+  out.Checkerboard() = lhs.Checkerboard();
+  accelerator_for( site, in_v.size(), 1, {
+    out_v[site] = toReal(in_v[site]);
+  });
+  return out;
+};
+
+
+template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> 
+auto toComplex(const Expression &expr)  -> decltype(toComplex(closure(expr))) // type of the converted result, which need not equal the closure's own type
+{
+  return toComplex(closure(expr)); // hand the result of closure(expr) to the Lattice overload of toComplex
+}
+template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> 
+auto toReal(const Expression &expr)  -> decltype(toReal(closure(expr))) // type of the converted result, which need not equal the closure's own type
+{
+  return toReal(closure(expr)); // hand the result of closure(expr) to the Lattice overload of toReal
+}
+template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> 
+auto adj(const Expression &expr)  -> decltype(closure(expr)) // overload participates only when is_lattice_expr<Expression>::value holds
+{
+  return adj(closure(expr)); // apply adj to whatever closure(expr) yields (presumably the evaluated Lattice - confirm)
+}
+template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> 
+auto conjugate(const Expression &expr)  -> decltype(closure(expr)) // overload participates only when is_lattice_expr<Expression>::value holds
+{
+  return conjugate(closure(expr)); // apply conjugate to whatever closure(expr) yields (presumably the evaluated Lattice - confirm)
+}
+
 NAMESPACE_END(Grid);

 #endif
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -127,6 +127,11 @@ accelerator_inline void convertType(T1 & out, const iScalar<T2> & in) {
  convertType(out,in._internal);
 }

+template<typename T1, typename std::enable_if<!isGridScalar<T1>::value, T1>::type* = nullptr>
+accelerator_inline void convertType(T1 & out, const iScalar<T1> & in) { // in wraps the same type as out; enabled only when T1 is not a Grid scalar
+  convertType(out,in._internal); // drop the iScalar layer and dispatch again on the wrapped value
+}
+
 template<typename T1,typename T2>
 accelerator_inline void convertType(iScalar<T1> & out, const T2 & in) {
  convertType(out._internal,in);
--- a/Grid/parallelIO/BinaryIO.cc
+++ b/Grid/parallelIO/BinaryIO.cc
@@ -1,3 +1,4 @@
 #include <Grid/GridCore.h>

-int Grid::BinaryIO::latticeWriteMaxRetry = -1;
+int                    Grid::BinaryIO::latticeWriteMaxRetry = -1;
+Grid::BinaryIO::IoPerf Grid::BinaryIO::lastPerf;
--- a/Grid/parallelIO/BinaryIO.h
+++ b/Grid/parallelIO/BinaryIO.h
@@ -79,6 +79,13 @@ inline void removeWhitespace(std::string &key)
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 class BinaryIO {
 public:
+  struct IoPerf // throughput figures recorded by the IOobject transfer path and exposed through lastPerf
+  {
+    uint64_t size{0},time{0};     // bytes transferred; elapsed time as reported by timer.useconds()
+    double   mbytesPerSecond{0.}; // size divided by time, expressed in MiB per second
+  };
+
+  static IoPerf lastPerf;
  static int latticeWriteMaxRetry;

  /////////////////////////////////////////////////////////////////////////////
@@ -502,12 +509,15 @@ class BinaryIO {
      timer.Stop();
    }
    
+    lastPerf.size            = sizeof(fobj)*iodata.size()*nrank;
+    lastPerf.time            = timer.useconds();
+    lastPerf.mbytesPerSecond = lastPerf.size/1024./1024./(lastPerf.time/1.0e6);
    std::cout<<GridLogMessage<<"IOobject: ";
    if ( control & BINARYIO_READ) std::cout << " read  ";
    else                          std::cout << " write ";
    uint64_t bytes = sizeof(fobj)*iodata.size()*nrank;
-    std::cout<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/ (double)timer.useconds() <<" MB/s "<<std::endl;
+    std::cout<< lastPerf.size <<" bytes in "<< timer.Elapsed() <<" "
+	     << lastPerf.mbytesPerSecond <<" MB/s "<<std::endl;

    std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed()  <<std::endl;

@@ -663,10 +673,15 @@ class BinaryIO {
 	     nersc_csum,scidac_csuma,scidac_csumb);

    timer.Start();
-    thread_for(lidx,lsites,{
+    thread_for(lidx,lsites,{  // FIX ME, suboptimal implementation
      std::vector<RngStateType> tmp(RngStateCount);
      std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
-      parallel_rng.SetState(tmp,lidx);
+      Coordinate lcoor;
+      grid->LocalIndexToLocalCoor(lidx, lcoor);
+      int o_idx=grid->oIndex(lcoor);
+      int i_idx=grid->iIndex(lcoor);
+      int gidx=parallel_rng.generator_idx(o_idx,i_idx);
+      parallel_rng.SetState(tmp,gidx);
      });
    timer.Stop();

@@ -723,7 +738,12 @@ class BinaryIO {
    std::vector<RNGstate> iodata(lsites);
    thread_for(lidx,lsites,{
      std::vector<RngStateType> tmp(RngStateCount);
-      parallel_rng.GetState(tmp,lidx);
+      Coordinate lcoor;
+      grid->LocalIndexToLocalCoor(lidx, lcoor);
+      int o_idx=grid->oIndex(lcoor);
+      int i_idx=grid->iIndex(lcoor);
+      int gidx=parallel_rng.generator_idx(o_idx,i_idx);
+      parallel_rng.GetState(tmp,gidx);
      std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
    });
    timer.Stop();
--- a/Grid/parallelIO/IldgIO.h
+++ b/Grid/parallelIO/IldgIO.h
@@ -123,7 +123,7 @@ assert(GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) < 1.0e-5);
 ////////////////////////////////////////////////////////////
 // Helper to fill out metadata
 ////////////////////////////////////////////////////////////
- template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
+template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
 					  FieldMetaData &header,
 					  scidacRecord & _scidacRecord,
 					  scidacFile   & _scidacFile) 
@@ -619,12 +619,12 @@ class IldgWriter : public ScidacWriter {
  // Don't require scidac records EXCEPT checksum
  // Use Grid MetaData object if present.
  ////////////////////////////////////////////////////////////////
-  template <class vsimd>
-  void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description) 
+  template <class stats = PeriodicGaugeStatistics>
+  void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,int sequence,std::string LFN,std::string description) 
  {
    GridBase * grid = Umu.Grid();
-    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
-    typedef iLorentzColourMatrix<vsimd> vobj;
+    typedef Lattice<vLorentzColourMatrixD> GaugeField;
+    typedef vLorentzColourMatrixD vobj;
    typedef typename vobj::scalar_object sobj;

    ////////////////////////////////////////
@@ -636,6 +636,9 @@ class IldgWriter : public ScidacWriter {

    ScidacMetaData(Umu,header,_scidacRecord,_scidacFile);

+    stats Stats;
+    Stats(Umu,header);
+    
    std::string format = header.floating_point;
    header.ensemble_id    = description;
    header.ensemble_label = description;
@@ -705,10 +708,10 @@ class IldgReader : public GridLimeReader {
  // Else use ILDG MetaData object if present.
  // Else use SciDAC MetaData object if present.
  ////////////////////////////////////////////////////////////////
-  template <class vsimd>
-  void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, FieldMetaData &FieldMetaData_) {
+  template <class stats = PeriodicGaugeStatistics>
+  void readConfiguration(Lattice<vLorentzColourMatrixD> &Umu, FieldMetaData &FieldMetaData_) {

-    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
+    typedef Lattice<vLorentzColourMatrixD > GaugeField;
    typedef typename GaugeField::vector_object  vobj;
    typedef typename vobj::scalar_object sobj;

@@ -921,7 +924,8 @@ class IldgReader : public GridLimeReader {

    if ( found_FieldMetaData || found_usqcdInfo ) {
      FieldMetaData checker;
-      GaugeStatistics(Umu,checker);
+      stats Stats;
+      Stats(Umu,checker);
      assert(fabs(checker.plaquette  - FieldMetaData_.plaquette )<1.0e-5);
      assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
      std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl;
--- a/Grid/parallelIO/MetaData.h
+++ b/Grid/parallelIO/MetaData.h
@@ -176,29 +176,18 @@ template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMet
  GridMetaData(grid,header); 
  MachineCharacteristics(header);
 }
-inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header)
+template<class Impl>
+class GaugeStatistics
 {
-  // How to convert data precision etc...
-  header.link_trace=WilsonLoops<PeriodicGimplF>::linkTrace(data);
-  header.plaquette =WilsonLoops<PeriodicGimplF>::avgPlaquette(data);
-}
-inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
-{
-  // How to convert data precision etc...
-  header.link_trace=WilsonLoops<PeriodicGimplD>::linkTrace(data);
-  header.plaquette =WilsonLoops<PeriodicGimplD>::avgPlaquette(data);
-}
-template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header)
-{
-   
-  GridBase *grid = field.Grid();
-  std::string format = getFormatString<vLorentzColourMatrixF>();
-  header.floating_point = format;
-  header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
-  GridMetaData(grid,header); 
-  GaugeStatistics(field,header);
-  MachineCharacteristics(header);
-}
+public:
+  void operator()(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header) // writes link_trace and plaquette of header from data
+  {
+    header.link_trace=WilsonLoops<Impl>::linkTrace(data);    // link trace as computed by WilsonLoops for this Impl
+    header.plaquette =WilsonLoops<Impl>::avgPlaquette(data); // average plaquette as computed by WilsonLoops for this Impl
+  }
+};
+typedef GaugeStatistics<PeriodicGimplD> PeriodicGaugeStatistics;
+typedef GaugeStatistics<ConjugateGimplD> ConjugateGaugeStatistics;
 template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header)
 {
  GridBase *grid = field.Grid();
@@ -206,7 +195,6 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
  header.floating_point = format;
  header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
  GridMetaData(grid,header); 
-  GaugeStatistics(field,header);
  MachineCharacteristics(header);
 }

--- a/Grid/parallelIO/NerscIO.h
+++ b/Grid/parallelIO/NerscIO.h
@@ -40,6 +40,8 @@ using namespace Grid;
 class NerscIO : public BinaryIO { 
 public:

+  typedef Lattice<vLorentzColourMatrixD> GaugeField;
+
  static inline void truncate(std::string file){
    std::ofstream fout(file,std::ios::out);
  }
@@ -129,12 +131,12 @@ public:
  // Now the meat: the object readers
  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

-  template<class vsimd>
-  static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
+  template<class GaugeStats=PeriodicGaugeStatistics>
+  static inline void readConfiguration(GaugeField &Umu,
 				       FieldMetaData& header,
-				       std::string file)
+				       std::string file,
+				       GaugeStats GaugeStatisticsCalculator=GaugeStats())
  {
-    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;

    GridBase *grid = Umu.Grid();
    uint64_t offset = readHeader(file,Umu.Grid(),header);
@@ -153,23 +155,23 @@ public:
    // munger is a function of <floating point, Real, data_type>
    if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
      if ( ieee32 || ieee32big ) {
-	BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F> 
+	BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F> 
 	  (Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
 	   nersc_csum,scidac_csuma,scidac_csumb);
      }
      if ( ieee64 || ieee64big ) {
-	BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> 
+	BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3D> 
 	  (Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
 	   nersc_csum,scidac_csuma,scidac_csumb);
      }
    } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
      if ( ieee32 || ieee32big ) {
-	BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
+	BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF>
 	  (Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
 	   nersc_csum,scidac_csuma,scidac_csumb);
      }
      if ( ieee64 || ieee64big ) {
-	BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
+	BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixD>
 	  (Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
 	   nersc_csum,scidac_csuma,scidac_csumb);
      }
@@ -177,7 +179,7 @@ public:
      assert(0);
    }

-    GaugeStatistics(Umu,clone);
+    GaugeStats Stats; Stats(Umu,clone);

    std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
 	     <<" header   "<<std::hex<<header.checksum<<std::dec <<std::endl;
@@ -203,15 +205,13 @@ public:
    std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
  }

-  template<class vsimd>
-  static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
+  template<class GaugeStats=PeriodicGaugeStatistics>
+  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
 					int two_row,
 					int bits32)
  {
-    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
-
-    typedef iLorentzColourMatrix<vsimd> vobj;
+    typedef vLorentzColourMatrixD vobj;
    typedef typename vobj::scalar_object sobj;

    FieldMetaData header;
@@ -229,7 +229,7 @@ public:

    GridMetaData(grid,header);
    assert(header.nd==4);
-    GaugeStatistics(Umu,header);
+    GaugeStats Stats; Stats(Umu,header);
    MachineCharacteristics(header);

 	uint64_t offset;
@@ -238,19 +238,19 @@ public:
    header.floating_point = std::string("IEEE64BIG");
    header.data_type      = std::string("4D_SU3_GAUGE_3x3");
    GaugeSimpleUnmunger<fobj3D,sobj> munge;
-	if ( grid->IsBoss() ) { 
-	  truncate(file);
-    offset = writeHeader(header,file);
-	}
-	grid->Broadcast(0,(void *)&offset,sizeof(offset));
+    if ( grid->IsBoss() ) { 
+      truncate(file);
+      offset = writeHeader(header,file);
+    }
+    grid->Broadcast(0,(void *)&offset,sizeof(offset));

    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
 					      nersc_csum,scidac_csuma,scidac_csumb);
    header.checksum = nersc_csum;
-	if ( grid->IsBoss() ) { 
-    writeHeader(header,file);
-	}
+    if ( grid->IsBoss() ) { 
+      writeHeader(header,file);
+    }

    std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "
 	     <<std::hex<<header.checksum
--- a/Grid/parallelIO/OpenQcdIO.h
+++ b/Grid/parallelIO/OpenQcdIO.h
@@ -154,7 +154,7 @@ public:
    grid->Barrier(); timer.Stop();
    std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl;

-    GaugeStatistics(Umu, clone);
+    PeriodicGaugeStatistics Stats; Stats(Umu, clone);

    RealD plaq_diff = fabs(clone.plaquette - header.plaquette);

--- a/Grid/parallelIO/OpenQcdIOChromaReference.h
+++ b/Grid/parallelIO/OpenQcdIOChromaReference.h
@@ -208,7 +208,7 @@ public:

    FieldMetaData clone(header);

-    GaugeStatistics(Umu, clone);
+    PeriodicGaugeStatistics Stats; Stats(Umu, clone);

    RealD plaq_diff = fabs(clone.plaquette - header.plaquette);

--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@@ -47,7 +47,7 @@ static constexpr int Ym = 5;
 static constexpr int Zm = 6;
 static constexpr int Tm = 7;

-static constexpr int Nc=3;
+static constexpr int Nc=Config_Nc;
 static constexpr int Ns=4;
 static constexpr int Nd=4;
 static constexpr int Nhs=2; // half spinor
@@ -80,6 +80,13 @@ template<typename T> struct isSpinor {
 template <typename T> using IfSpinor    = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
 template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;

+static constexpr int CoarseIndex = 4; // TensorLevel threshold at and above which isCoarsened reports true
+template<typename T> struct isCoarsened {
+   static constexpr bool value = (T::TensorLevel >= CoarseIndex);
+};
+template <typename T> using IfCoarsened    = Invoke<std::enable_if< isCoarsened<T>::value,int> > ; // selector for coarsened types
+template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ; // selector for all other types
+
 // ChrisK very keen to add extra space for Gparity doubling.
 //
 // Also add domain wall index, in a way where Wilson operator 
--- a/Grid/qcd/action/fermion/GparityWilsonImpl.h
+++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h
@@ -97,42 +97,30 @@ public:
    Coordinate icoor;

 #ifdef GRID_SIMT
-    _Spinor tmp;
-
    const int Nsimd =SiteDoubledGaugeField::Nsimd();
    int s = acceleratorSIMTlane(Nsimd);
    St.iCoorFromIindex(icoor,s);

    int mmu = mu % Nd;
-    if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
-      
-      int permute_lane = (sl==1) 
-    	|| ((distance== 1)&&(icoor[direction]==1))
-	|| ((distance==-1)&&(icoor[direction]==0));

-      if ( permute_lane ) { 
-	tmp(0) = chi(1);
-	tmp(1) = chi(0);
-      } else {
-	tmp(0) = chi(0);
-	tmp(1) = chi(1);
-      }
+    auto UU0=coalescedRead(U(0)(mu));
+    auto UU1=coalescedRead(U(1)(mu));
+    
+    //Decide whether we do a G-parity flavor twist
+    //Note: this assumes (but does not check) that sl==1 || sl==2 i.e. max 2 SIMD lanes in G-parity dir
+    //It also assumes (but does not check) that abs(distance) == 1
+    int permute_lane = (sl==1) 
+    || ((distance== 1)&&(icoor[direction]==1))
+    || ((distance==-1)&&(icoor[direction]==0));

-      auto UU0=coalescedRead(U(0)(mu));
-      auto UU1=coalescedRead(U(1)(mu));
+    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world

-      mult(&phi(0),&UU0,&tmp(0));
-      mult(&phi(1),&UU1,&tmp(1));
+    //Apply the links
+    int f_upper = permute_lane ? 1 : 0;
+    int f_lower = !f_upper;

-    } else {
-
-      auto UU0=coalescedRead(U(0)(mu));
-      auto UU1=coalescedRead(U(1)(mu));
-
-      mult(&phi(0),&UU0,&chi(0));
-      mult(&phi(1),&UU1,&chi(1));
-
-    }
+    mult(&phi(0),&UU0,&chi(f_upper));
+    mult(&phi(1),&UU1,&chi(f_lower));

 #else
    typedef _Spinor vobj;
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -642,7 +642,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
 						      Current curr_type,
 						      unsigned int mu)
 {
-#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
+#if (!defined(GRID_HIP))
  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
@@ -826,7 +826,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  }
 #endif

-#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
+#if (!defined(GRID_HIP))
  int tshift = (mu == Nd-1) ? 1 : 0;
  ////////////////////////////////////////////////
  // GENERAL CAYLEY CASE
--- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
@@ -92,20 +92,16 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
  int lvol = _Umu.Grid()->lSites();
  int DimRep = Impl::Dimension;

-  Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
-  Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
-
-  Coordinate lcoor;
-  typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
-
  {
    autoView(CTv,CloverTerm,CpuRead);
    autoView(CTIv,CloverTermInv,CpuWrite);
-    for (int site = 0; site < lvol; site++) {
+    thread_for(site, lvol, {
+      Coordinate lcoor;
      grid->LocalIndexToLocalCoor(site, lcoor);
-      EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
+      Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
+      Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
+      typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
      peekLocalSite(Qx, CTv, lcoor);
-      Qxinv = Zero();
      //if (csw!=0){
      for (int j = 0; j < Ns; j++)
 	for (int k = 0; k < Ns; k++)
@@ -126,21 +122,21 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
      //    if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
      //  }
      pokeLocalSite(Qxinv, CTIv, lcoor);
-    }
+    });
  }

  // Separate the even and odd parts
  pickCheckerboard(Even, CloverTermEven, CloverTerm);
  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);

-  pickCheckerboard(Even, CloverTermDagEven, closure(adj(CloverTerm)));
-  pickCheckerboard(Odd, CloverTermDagOdd, closure(adj(CloverTerm)));
+  pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
+  pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));

  pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
  pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);

-  pickCheckerboard(Even, CloverTermInvDagEven, closure(adj(CloverTermInv)));
-  pickCheckerboard(Odd, CloverTermInvDagOdd, closure(adj(CloverTermInv)));
+  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
+  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
 }

 template <class Impl>
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h
@@ -38,9 +38,6 @@ Author: Nils Meyer  <nils.meyer@ur.de>  Regensburg University
 // undefine everything related to kernels
 #include <simd/Fujitsu_A64FX_undef.h>

-// enable A64FX body
-#define WILSONKERNELSASMBODYA64FX
-//#pragma message("A64FX Dslash: WilsonKernelsAsmBodyA64FX.h")

    ///////////////////////////////////////////////////////////
    // If we are A64FX specialise the single precision routine
@@ -63,119 +60,89 @@ Author: Nils Meyer  <nils.meyer@ur.de>  Regensburg University
 #define INTERIOR_AND_EXTERIOR
 #undef INTERIOR
 #undef EXTERIOR
+
+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif
+

 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
 #undef EXTERIOR
+
+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif
+

 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
 #define EXTERIOR
+
+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif
+


 /////////////////////////////////////////////////////////////////
@@ -185,119 +152,89 @@ WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV
 #define INTERIOR_AND_EXTERIOR
 #undef INTERIOR
 #undef EXTERIOR
+
+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif
+

 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
 #undef EXTERIOR
+
+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif
+

 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
 #define EXTERIOR
+
+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif
+


 // undefine
@@ -330,119 +267,89 @@ WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFie
 #define INTERIOR_AND_EXTERIOR
 #undef INTERIOR
 #undef EXTERIOR
+
+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif
+

 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
 #undef EXTERIOR
+
+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif
+

 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
 #define EXTERIOR
+
+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif
+

 /////////////////////////////////////////////////////////////////
 // XYZT vectorised, dag Kernel, double
@@ -451,124 +358,93 @@ WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV
 #define INTERIOR_AND_EXTERIOR
 #undef INTERIOR
 #undef EXTERIOR
+
+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif
+

 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
 #undef EXTERIOR
+
+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif
+

 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
 #define EXTERIOR
+
+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif

+#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 template<> void
 WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
-#else
-#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
-#endif
+



 // undefs
-#undef WILSONKERNELSASMBODYA64FX
 #include <simd/Fujitsu_A64FX_undef.h>

 #endif //A64FXASM
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h
@@ -25,6 +25,11 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
+
+// GCC 10 messes up SVE instruction scheduling at plain -O3, but
+// -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders:
+// performance is now better than with armclang 20.2.
+
 #ifdef KERNEL_DAG
 #define DIR0_PROJ    XP_PROJ
 #define DIR1_PROJ    YP_PROJ
@@ -97,7 +102,7 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University
    PROJ;							                        \
    MAYBEPERM(PERMUTE_DIR,perm);					        \
      } else {								                \
-	LOAD_CHI(base);							                \
+	  LOAD_CHI(base);							                \
      }									                    \
      base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++;	\
    MULT_2SPIN_1(Dir);					                    \
@@ -110,6 +115,11 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University
    }                                                       \
    RECON;								                    \

+/*
+NB: using PREFETCH_GAUGE_L2(Dir+4); here results in a performance penalty,
+    although I expected it to improve performance
+*/
+
 #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)	    \
  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
  PREFETCH1_CHIMU(base);						            \
@@ -126,73 +136,63 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University

 #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
      basep = st.GetPFInfo(nent,plocal); nent++;			\
-      if ( local ) {							            \
-    LOAD_CHIMU(base);                                       \
-    LOAD_TABLE(PERMUTE_DIR);                                \
-    PROJ;							                        \
-    MAYBEPERM(PERMUTE_DIR,perm);					        \
-      }else if ( st.same_node[Dir] ) {LOAD_CHI(base);}	    \
-      base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++;	\
-      if ( local || st.same_node[Dir] ) {				    \
-    MULT_2SPIN_1(Dir);					                    \
-    PREFETCH_CHIMU(base);                                   \
-    /* PREFETCH_GAUGE_L1(NxtDir); */                        \
-    MULT_2SPIN_2;					                        \
-    if (s == 0) {                                           \
-       if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
-    }                                                       \
-    RECON;								                    \
-    PREFETCH_CHIMU_L2(basep);                               \
-      } else { PREFETCH_CHIMU(base); }								                    \
+      if ( local ) {							\
+  LOAD_CHIMU(base);                                       \
+  LOAD_TABLE(PERMUTE_DIR);                                \
+  PROJ;							                        \
+  MAYBEPERM(PERMUTE_DIR,perm);					        \
+      }else if ( st.same_node[Dir] ) {LOAD_CHI(base);}			\
+      if ( local || st.same_node[Dir] ) {				\
+  MULT_2SPIN_1(Dir);					                    \
+  MULT_2SPIN_2;					                        \
+  RECON;								\
+      }									\
+  base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++;	\
+  PREFETCH_CHIMU(base);						\
+  PREFETCH_CHIMU_L2(basep);                               \

 #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;		\
  PREFETCH1_CHIMU(base);						\
+  { ZERO_PSI; }								\
  ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)

 #define RESULT(base,basep) SAVE_RESULT(base,basep);

 #endif
+
 ////////////////////////////////////////////////////////////////////////////////
 // Post comms kernel
 ////////////////////////////////////////////////////////////////////////////////
 #ifdef EXTERIOR

-
 #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
-  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
-  if((!local)&&(!st.same_node[Dir]) ) {					    \
-    LOAD_CHI(base);							                \
+  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;		\
+  if((!local)&&(!st.same_node[Dir]) ) {					\
+    LOAD_CHI(base);							\
    MULT_2SPIN_1(Dir);					                    \
-    PREFETCH_CHIMU(base);                                   \
-    /* PREFETCH_GAUGE_L1(NxtDir); */                        \
    MULT_2SPIN_2;					                        \
-    if (s == 0) {                                           \
-      if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
-    }                                                       \
-    RECON;								                    \
-    nmu++;								                    \
+    RECON;								\
+    nmu++;								\
  }

-#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)	    \
-  nmu=0;								                    \
-  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;\
-  if((!local)&&(!st.same_node[Dir]) ) {					    \
-    LOAD_CHI(base);							                \
+#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)			\
+  nmu=0;								\
+  { ZERO_PSI;}								\
+  base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;		\
+  if((!local)&&(!st.same_node[Dir]) ) {					\
+    LOAD_CHI(base);							\
    MULT_2SPIN_1(Dir);					                    \
-    PREFETCH_CHIMU(base);                                   \
-    /* PREFETCH_GAUGE_L1(NxtDir); */                        \
    MULT_2SPIN_2;					                        \
-    if (s == 0) {                                           \
-      if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
-    }                                                       \
-    RECON;								                    \
-    nmu++;								                    \
+    RECON;								\
+    nmu++;								\
  }

 #define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);}

 #endif
+
+
 {
  int nmu;
  int local,perm, ptype;
@@ -209,7 +209,6 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University
    int ssn=ssU+1;     if(ssn>=nmax) ssn=0;
    //    int sUn=lo.Reorder(ssn);
    int sUn=ssn;
-    LOCK_GAUGE(0);
 #else
    int sU =ssU;
    int ssn=ssU+1;     if(ssn>=nmax) ssn=0;
@@ -295,6 +294,11 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University
      std::cout << "----------------------------------------------------" << std::endl;
 #endif

+      // DC ZVA test
+      // { uint64_t basestore = (uint64_t)&out[ss];
+      //   PREFETCH_RESULT_L2_STORE(basestore); }
+
+
      ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON);

 #ifdef SHOW
@@ -308,6 +312,11 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University
      std::cout << "----------------------------------------------------" << std::endl;
 #endif

+      // DC ZVA test
+      //{ uint64_t basestore = (uint64_t)&out[ss];
+      //  PREFETCH_RESULT_L2_STORE(basestore); }
+
+
      ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON);

 #ifdef SHOW
@@ -321,6 +330,11 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University
      std::cout << "----------------------------------------------------" << std::endl;
 #endif

+      // DC ZVA test
+      //{ uint64_t basestore = (uint64_t)&out[ss];
+      //  PREFETCH_RESULT_L2_STORE(basestore); }
+
+
      ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON);

 #ifdef SHOW
@@ -341,6 +355,7 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University
      base = (uint64_t) &out[ss];
      basep= st.GetPFInfo(nent,plocal); ent++;
      basep = (uint64_t) &out[ssn];
+      //PREFETCH_RESULT_L1_STORE(base);
      RESULT(base,basep);

 #ifdef SHOW
--- a/Grid/qcd/action/gauge/Gauge.cc
+++ b/Grid/qcd/action/gauge/Gauge.cc
@@ -0,0 +1,38 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/gauge/Gauge.cc
+
+Copyright (C) 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+NAMESPACE_BEGIN(Grid);
+
+std::vector<int> ConjugateGaugeImplBase::_conjDirs;
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/gauge/GaugeImplTypes.h
+++ b/Grid/qcd/action/gauge/GaugeImplTypes.h
@@ -154,6 +154,10 @@ public:
    return Hsum.real();
  }

+  static inline void Project(Field &U) {
+    ProjectSUn(U);
+  }
+
  static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
    SU<Nc>::HotConfiguration(pRNG, U);
  }
--- a/Grid/qcd/action/gauge/GaugeImplementations.h
+++ b/Grid/qcd/action/gauge/GaugeImplementations.h
@@ -59,14 +59,14 @@ public:
  }
  static inline GaugeLinkField
  CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
-    return Cshift(adj(Link), mu, -1);
+    return PeriodicBC::CovShiftIdentityBackward(Link, mu);
  }
  static inline GaugeLinkField
  CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
-    return Link;
+    return PeriodicBC::CovShiftIdentityForward(Link,mu);
  }
  static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
-    return Cshift(Link, mu, 1);
+    return PeriodicBC::ShiftStaple(Link,mu);
  }

  static inline bool isPeriodicGaugeField(void) { return true; }
@@ -74,7 +74,13 @@ public:

 // Composition with smeared link, bc's etc.. probably need multiple inheritance
 // Variable precision "S" and variable Nc
-template <class GimplTypes> class ConjugateGaugeImpl : public GimplTypes {
+class ConjugateGaugeImplBase {
+protected:
+  static std::vector<int> _conjDirs;
+};
+
+  template <class GimplTypes> class ConjugateGaugeImpl : public GimplTypes, ConjugateGaugeImplBase {
+private:
 public:
  INHERIT_GIMPL_TYPES(GimplTypes);

@@ -84,47 +90,56 @@ public:
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////
  template <class covariant>
  static Lattice<covariant> CovShiftForward(const GaugeLinkField &Link, int mu,
-                                            const Lattice<covariant> &field) {
-    return ConjugateBC::CovShiftForward(Link, mu, field);
+                                            const Lattice<covariant> &field)
+  {
+    assert(_conjDirs.size() == Nd);
+    if(_conjDirs[mu]) 
+      return ConjugateBC::CovShiftForward(Link, mu, field);
+    else
+      return PeriodicBC::CovShiftForward(Link, mu, field);
  }

  template <class covariant>
  static Lattice<covariant> CovShiftBackward(const GaugeLinkField &Link, int mu,
-                                             const Lattice<covariant> &field) {
-    return ConjugateBC::CovShiftBackward(Link, mu, field);
+                                             const Lattice<covariant> &field)
+  {
+    assert(_conjDirs.size() == Nd);
+    if(_conjDirs[mu]) 
+      return ConjugateBC::CovShiftBackward(Link, mu, field);
+    else 
+      return PeriodicBC::CovShiftBackward(Link, mu, field);
  }

  static inline GaugeLinkField
-  CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
-    GridBase *grid = Link.Grid();
-    int Lmu = grid->GlobalDimensions()[mu] - 1;
-
-    Lattice<iScalar<vInteger>> coor(grid);
-    LatticeCoordinate(coor, mu);
-
-    GaugeLinkField tmp(grid);
-    tmp = adj(Link);
-    tmp = where(coor == Lmu, conjugate(tmp), tmp);
-    return Cshift(tmp, mu, -1); // moves towards positive mu
+  CovShiftIdentityBackward(const GaugeLinkField &Link, int mu)
+  {
+    assert(_conjDirs.size() == Nd);
+    if(_conjDirs[mu]) 
+      return ConjugateBC::CovShiftIdentityBackward(Link, mu);
+    else 
+      return PeriodicBC::CovShiftIdentityBackward(Link, mu);
  }
  static inline GaugeLinkField
-  CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
-    return Link;
+  CovShiftIdentityForward(const GaugeLinkField &Link, int mu)
+  {
+    assert(_conjDirs.size() == Nd);
+    if(_conjDirs[mu]) 
+      return ConjugateBC::CovShiftIdentityForward(Link,mu);
+    else
+      return PeriodicBC::CovShiftIdentityForward(Link,mu);
  }

-  static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
-    GridBase *grid = Link.Grid();
-    int Lmu = grid->GlobalDimensions()[mu] - 1;
-
-    Lattice<iScalar<vInteger>> coor(grid);
-    LatticeCoordinate(coor, mu);
-
-    GaugeLinkField tmp(grid);
-    tmp = Cshift(Link, mu, 1);
-    tmp = where(coor == Lmu, conjugate(tmp), tmp);
-    return tmp;
+  static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu)
+  {
+    assert(_conjDirs.size() == Nd);
+    if(_conjDirs[mu]) 
+      return ConjugateBC::ShiftStaple(Link,mu);
+    else     
+      return PeriodicBC::ShiftStaple(Link,mu);
  }

+  static inline void       setDirections(const std::vector<int> &conjDirs) { _conjDirs=conjDirs; } // per-direction flags: nonzero => ConjugateBC, zero => PeriodicBC; the shift routines assert size()==Nd
+  static inline std::vector<int> getDirections(void) { return _conjDirs; }
  static inline bool isPeriodicGaugeField(void) { return false; }
 };

--- a/Grid/qcd/action/scalar/ScalarImpl.h
+++ b/Grid/qcd/action/scalar/ScalarImpl.h
@@ -54,6 +54,10 @@ public:
  static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
    U = 1.0;
  }
+
+  static inline void Project(Field &U) {
+    return;
+  }
    
  static void MomentumSpacePropagator(Field &out, RealD m)
  {
@@ -234,6 +238,10 @@ public:
 #endif //USE_FFT_ACCELERATION
  }

+  static inline void Project(Field &U) {
+    return;
+  }
+
  static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
    Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U);
  }
--- a/Grid/qcd/hmc/GenericHMCrunner.h
+++ b/Grid/qcd/hmc/GenericHMCrunner.h
@@ -159,6 +159,13 @@ private:
      Resources.GetCheckPointer()->CheckpointRestore(Parameters.StartTrajectory, U,
 						     Resources.GetSerialRNG(),
 						     Resources.GetParallelRNG());
+    } else {
+      // any other StartingType is unrecognized: report the valid choices and exit
+      std::cout << GridLogError << "Unrecognized StartingType\n";
+      std::cout
+	<< GridLogError
+	<< "Valid [HotStart, ColdStart, TepidStart, CheckpointStart]\n";
+      exit(1);
    }

    Smearing.set_Field(U);
--- a/Grid/qcd/hmc/HMC.h
+++ b/Grid/qcd/hmc/HMC.h
@@ -95,7 +95,7 @@ private:

  typedef typename IntegratorType::Field Field;
  typedef std::vector< HmcObservable<Field> * > ObsListType;
-  
+
  //pass these from the resource manager
  GridSerialRNG &sRNG;   
  GridParallelRNG &pRNG; 
--- a/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h
@@ -74,7 +74,7 @@ public:
      conf_file = os.str();
    }
  } 
-
+  virtual ~BaseHmcCheckpointer(){};
  void check_filename(const std::string &filename){
    std::ifstream f(filename.c_str());
    if(!f.good()){
@@ -82,7 +82,6 @@ public:
      abort();
    };
  }
-
  virtual void initialize(const CheckpointerParameters &Params) = 0;

  virtual void CheckpointRestore(int traj, typename Impl::Field &U,
--- a/Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h
@@ -45,6 +45,7 @@ private:

 public:
  INHERIT_GIMPL_TYPES(Implementation);
+  typedef GaugeStatistics<Implementation> GaugeStats;

  ILDGHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); }

@@ -78,7 +79,7 @@ public:
      BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
      IldgWriter _IldgWriter(grid->IsBoss());
      _IldgWriter.open(config);
-      _IldgWriter.writeConfiguration(U, traj, config, config);
+      _IldgWriter.writeConfiguration<GaugeStats>(U, traj, config, config);
      _IldgWriter.close();

      std::cout << GridLogMessage << "Written ILDG Configuration on " << config
@@ -105,7 +106,7 @@ public:
    FieldMetaData header;
    IldgReader _IldgReader;
    _IldgReader.open(config);
-    _IldgReader.readConfiguration(U,header);  // format from the header
+    _IldgReader.readConfiguration<GaugeStats>(U,header);  // format from the header
    _IldgReader.close();

    std::cout << GridLogMessage << "Read ILDG Configuration from " << config
--- a/Grid/qcd/hmc/checkpointers/NerscCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/NerscCheckpointer.h
@@ -43,7 +43,8 @@ private:

 public:
  INHERIT_GIMPL_TYPES(Gimpl);  // only for gauge configurations
-
+  typedef GaugeStatistics<Gimpl> GaugeStats;
+  
  NerscHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); }

  void initialize(const CheckpointerParameters &Params_) {
@@ -60,7 +61,7 @@ public:
      int precision32 = 1;
      int tworow = 0;
      NerscIO::writeRNGState(sRNG, pRNG, rng);
-      NerscIO::writeConfiguration(U, config, tworow, precision32);
+      NerscIO::writeConfiguration<GaugeStats>(U, config, tworow, precision32);
    }
  };

@@ -74,7 +75,7 @@ public:

    FieldMetaData header;
    NerscIO::readRNGState(sRNG, pRNG, header, rng);
-    NerscIO::readConfiguration(U, header, config);
+    NerscIO::readConfiguration<GaugeStats>(U, header, config);
  };
 };

--- a/Grid/qcd/hmc/integrators/Integrator.h
+++ b/Grid/qcd/hmc/integrators/Integrator.h
@@ -313,6 +313,8 @@ public:
      std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl;
    }

+    FieldImplementation::Project(U);
+
    // and that we indeed got to the end of the trajectory
    assert(fabs(t_U - Params.trajL) < 1.0e-6);

--- a/Grid/qcd/modules/Modules.h
+++ b/Grid/qcd/modules/Modules.h
@@ -99,7 +99,7 @@ public:
  virtual Prod* getPtr() = 0;

  // add a getReference? 
-  
+  virtual ~HMCModuleBase(){};
  virtual void print_parameters(){};  // default to nothing
 };

--- a/Grid/qcd/spin/TwoSpinor.h
+++ b/Grid/qcd/spin/TwoSpinor.h
@@ -128,7 +128,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProjTm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  hspin(0)=fspin(0)-fspin(2);
  hspin(1)=fspin(1)-fspin(3);
 }
@@ -138,40 +137,50 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 *  0 0 -1  0
 *  0 0  0 -1
 */
-
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5p (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  hspin(0)=fspin(0);
  hspin(1)=fspin(1);
 }

 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5m (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  hspin(0)=fspin(2);
  hspin(1)=fspin(3);
 }
  
-//  template<class vtype> accelerator_inline void fspProj5p (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin)
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5p (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  rfspin(0)=fspin(0);
  rfspin(1)=fspin(1);
  rfspin(2)=Zero();
  rfspin(3)=Zero();
 }
-//  template<class vtype> accelerator_inline void fspProj5m (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin)
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5m (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  rfspin(0)=Zero();
  rfspin(1)=Zero();
  rfspin(2)=fspin(2);
  rfspin(3)=fspin(3);
 }

+template<class vtype,int N,IfCoarsened<iVector<vtype,N> > = 0> accelerator_inline void spProj5p (iVector<vtype,N> &rfspin,const iVector<vtype,N> &fspin)
+{
+  static_assert((N%2)==0,"spProj5p on a coarse vector splits it into two equal halves: N must be even");
+  for(int s=0;s<N/2;s++){
+    rfspin(s)=fspin(s);       // first half is copied through
+    rfspin(s+N/2)=Zero();     // second half is zeroed
+  }
+}
+template<class vtype,int N,IfCoarsened<iVector<vtype,N> > = 0> accelerator_inline void spProj5m (iVector<vtype,N> &rfspin,const iVector<vtype,N> &fspin)
+{
+  static_assert((N%2)==0,"spProj5m on a coarse vector splits it into two equal halves: N must be even");
+  for(int s=0;s<N/2;s++){
+    rfspin(s)=Zero();             // first half is zeroed
+    rfspin(s+N/2)=fspin(s+N/2);   // second half is copied through
+  }
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Reconstruction routines to move back again to four spin
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -183,7 +192,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 */
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconXp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)=timesMinusI(hspin(1));
@@ -191,7 +199,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconXm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)=timesI(hspin(1));
@@ -199,7 +206,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconXp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)-=timesI(hspin(1));
@@ -207,7 +213,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconXm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)+=timesI(hspin(1));
@@ -221,7 +226,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a

 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconYp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)= hspin(1);
@@ -229,7 +233,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconYm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)=-hspin(1);
@@ -237,7 +240,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconYp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)+=hspin(1);
@@ -245,7 +247,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconYm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)-=hspin(1);
@@ -260,7 +261,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
 */
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconZp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)=timesMinusI(hspin(0));
@@ -268,7 +268,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconZm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)=     timesI(hspin(0));
@@ -276,7 +275,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconZp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)-=timesI(hspin(0));
@@ -284,7 +282,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconZm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)+=timesI(hspin(0));
@@ -298,7 +295,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
 */
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconTp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)=hspin(0);
@@ -306,7 +302,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconTm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)=-hspin(0);
@@ -314,7 +309,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconTp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)+=hspin(0);
@@ -322,7 +316,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconTm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)-=hspin(0);
@@ -336,7 +329,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
 */
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spRecon5p (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0)+hspin(0); // add is lower latency than mul
-  fspin(1)=hspin(1)+hspin(1); // probably no measurable diffence though
+  fspin(1)=hspin(1)+hspin(1); // probably no measurable difference though
  fspin(2)=Zero();
@@ -344,7 +336,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spRecon5m (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=Zero();
  fspin(1)=Zero();
  fspin(2)=hspin(0)+hspin(0);
@@ -352,7 +343,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumRecon5p (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0)+hspin(0);
  fspin(1)+=hspin(1)+hspin(1);
 }
@@ -372,7 +362,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
 //////////
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjXp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProjXp(hspin._internal[i],fspin._internal[i]);
  }
@@ -426,26 +415,21 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconXp (iM
    }}
 }

-
-
 ////////
 // Xm
 ////////
 template<class rtype,class vtype> accelerator_inline void spProjXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProjXm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProjXm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spProjXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProjXm(hspin._internal[i][j],fspin._internal[i][j]);
@@ -455,19 +439,16 @@ template<class rtype,class vtype,int N> accelerator_inline void spProjXm (iMatri

 template<class rtype,class vtype> accelerator_inline void spReconXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spReconXm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spReconXm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spReconXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spReconXm(hspin._internal[i][j],fspin._internal[i][j]);
@@ -476,45 +457,37 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconXm (iMatr

 template<class rtype,class vtype> accelerator_inline void accumReconXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumReconXm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumReconXm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumReconXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumReconXm(hspin._internal[i][j],fspin._internal[i][j]);
    }}
 }

-
-
 ////////
 // Yp
 ////////
 template<class rtype,class vtype> accelerator_inline void spProjYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProjYp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProjYp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spProjYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProjYp(hspin._internal[i][j],fspin._internal[i][j]);
@@ -524,19 +497,16 @@ template<class rtype,class vtype,int N> accelerator_inline void spProjYp (iMatri

 template<class rtype,class vtype> accelerator_inline void spReconYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spReconYp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spReconYp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spReconYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spReconYp(hspin._internal[i][j],fspin._internal[i][j]);
@@ -545,66 +515,55 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconYp (iMatr

 template<class rtype,class vtype> accelerator_inline void accumReconYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumReconYp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumReconYp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumReconYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumReconYp(hspin._internal[i][j],fspin._internal[i][j]);
    }}
 }

-
 ////////
 // Ym
 ////////
 template<class rtype,class vtype> accelerator_inline void spProjYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProjYm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProjYm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spProjYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProjYm(hspin._internal[i][j],fspin._internal[i][j]);
    }}
 }

-
 template<class rtype,class vtype> accelerator_inline void spReconYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spReconYm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,const iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spReconYm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spReconYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spReconYm(hspin._internal[i][j],fspin._internal[i][j]);
@@ -613,19 +572,16 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconYm (iMatr

 template<class rtype,class vtype> accelerator_inline void accumReconYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumReconYm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumReconYm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumReconYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumReconYm(hspin._internal[i][j],fspin._internal[i][j]);
@@ -638,66 +594,57 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconYm (iM
 ////////
 template<class rtype,class vtype> accelerator_inline void spProjZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProjZp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProjZp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spProjZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProjZp(hspin._internal[i][j],fspin._internal[i][j]);
-    }}
+  }}
 }


 template<class rtype,class vtype> accelerator_inline void spReconZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spReconZp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spReconZp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spReconZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spReconZp(hspin._internal[i][j],fspin._internal[i][j]);
-    }}
+  }}
 }

 template<class rtype,class vtype> accelerator_inline void accumReconZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumReconZp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumReconZp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumReconZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumReconZp(hspin._internal[i][j],fspin._internal[i][j]);
-    }}
+  }}
 }


@@ -706,62 +653,53 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconZp (iM
 ////////
 template<class rtype,class vtype> accelerator_inline void spProjZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProjZm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProjZm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spProjZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProjZm(hspin._internal[i][j],fspin._internal[i][j]);
-    }}
+  }}
 }


 template<class rtype,class vtype> accelerator_inline void spReconZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spReconZm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spReconZm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spReconZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spReconZm(hspin._internal[i][j],fspin._internal[i][j]);
-    }}
+  }}
 }

 template<class rtype,class vtype> accelerator_inline void accumReconZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumReconZm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumReconZm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumReconZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumReconZm(hspin._internal[i][j],fspin._internal[i][j]);
@@ -774,41 +712,35 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconZm (iM
 ////////
 template<class rtype,class vtype> accelerator_inline void spProjTp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProjTp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjTp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProjTp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spProjTp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProjTp(hspin._internal[i][j],fspin._internal[i][j]);
-    }}
+  }}
 }


 template<class rtype,class vtype> accelerator_inline void spReconTp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spReconTp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconTp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spReconTp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spReconTp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spReconTp(hspin._internal[i][j],fspin._internal[i][j]);
@@ -817,44 +749,37 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconTp (iMatr

 template<class rtype,class vtype> accelerator_inline void accumReconTp (iScalar<rtype> &hspin, iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumReconTp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconTp (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumReconTp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumReconTp (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumReconTp(hspin._internal[i][j],fspin._internal[i][j]);
    }}
 }

-
 ////////
 // Tm
 ////////
 template<class rtype,class vtype> accelerator_inline void spProjTm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProjTm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjTm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProjTm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spProjTm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProjTm(hspin._internal[i][j],fspin._internal[i][j]);
@@ -864,19 +789,16 @@ template<class rtype,class vtype,int N> accelerator_inline void spProjTm (iMatri

 template<class rtype,class vtype> accelerator_inline void spReconTm (iScalar<rtype> &hspin, const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spReconTm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconTm (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spReconTm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spReconTm (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spReconTm(hspin._internal[i][j],fspin._internal[i][j]);
@@ -885,44 +807,37 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconTm (iMatr

 template<class rtype,class vtype> accelerator_inline void accumReconTm (iScalar<rtype> &hspin, const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumReconTm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconTm (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumReconTm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumReconTm (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumReconTm(hspin._internal[i][j],fspin._internal[i][j]);
    }}
 }

-
 ////////
 // 5p
 ////////
-template<class rtype,class vtype> accelerator_inline void spProj5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
+template<class rtype,class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProj5p(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProj5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProj5p(hspin._internal[i],fspin._internal[i]);
  }
 }
-template<class rtype,class vtype,int N> accelerator_inline void spProj5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
+template<class rtype,class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProj5p(hspin._internal[i][j],fspin._internal[i][j]);
@@ -931,19 +846,16 @@ template<class rtype,class vtype,int N> accelerator_inline void spProj5p (iMatri

 template<class rtype,class vtype> accelerator_inline void spRecon5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spRecon5p(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spRecon5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spRecon5p(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spRecon5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spRecon5p(hspin._internal[i][j],fspin._internal[i][j]);
@@ -952,19 +864,16 @@ template<class rtype,class vtype,int N> accelerator_inline void spRecon5p (iMatr

 template<class rtype,class vtype> accelerator_inline void accumRecon5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumRecon5p(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumRecon5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumRecon5p(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumRecon5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumRecon5p(hspin._internal[i][j],fspin._internal[i][j]);
@@ -972,24 +881,18 @@ template<class rtype,class vtype,int N> accelerator_inline void accumRecon5p (iM
 }

 // four spinor projectors for chiral proj
-//  template<class vtype> accelerator_inline void fspProj5p (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
-template<class vtype> accelerator_inline void spProj5p (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
+template<class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProj5p(hspin._internal,fspin._internal);
 }
-//  template<class vtype,int N> accelerator_inline void fspProj5p (iVector<vtype,N> &hspin,iVector<vtype,N> &fspin)
-template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProj5p (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin)
+template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProj5p(hspin._internal[i],fspin._internal[i]);
  }
 }
-//  template<class vtype,int N> accelerator_inline void fspProj5p (iMatrix<vtype,N> &hspin,iMatrix<vtype,N> &fspin)
-template<class vtype,int N> accelerator_inline void spProj5p (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin)
+template<class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProj5p(hspin._internal[i][j],fspin._internal[i][j]);
@@ -1001,17 +904,17 @@ template<class vtype,int N> accelerator_inline void spProj5p (iMatrix<vtype,N> &
 // 5m
 ////////

-template<class rtype,class vtype> accelerator_inline void spProj5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
+template<class rtype,class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
  spProj5m(hspin._internal,fspin._internal);
 }
-template<class rtype,class vtype,int N,IfNotSpinor<iVector<rtype,N> > = 0> accelerator_inline void spProj5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
+template<class rtype,class vtype,int N,IfNotSpinor<iVector<rtype,N> > = 0,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
  for(int i=0;i<N;i++) {
    spProj5m(hspin._internal[i],fspin._internal[i]);
  }
 }
-template<class rtype,class vtype,int N> accelerator_inline void spProj5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
+template<class rtype,class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
@@ -1021,40 +924,34 @@ template<class rtype,class vtype,int N> accelerator_inline void spProj5m (iMatri

 template<class rtype,class vtype> accelerator_inline void spRecon5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spRecon5m(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spRecon5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spRecon5m(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spRecon5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spRecon5m(hspin._internal[i][j],fspin._internal[i][j]);
-    }}
+  }}
 }

 template<class rtype,class vtype> accelerator_inline void accumRecon5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumRecon5m(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumRecon5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumRecon5m(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumRecon5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumRecon5m(hspin._internal[i][j],fspin._internal[i][j]);
@@ -1063,24 +960,18 @@ template<class rtype,class vtype,int N> accelerator_inline void accumRecon5m (iM


 // four spinor projectors for chiral proj
-//  template<class vtype> accelerator_inline void fspProj5m (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
-template<class vtype> accelerator_inline void spProj5m (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
+template<class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProj5m(hspin._internal,fspin._internal);
 }
-//  template<class vtype,int N> accelerator_inline void fspProj5m (iVector<vtype,N> &hspin,iVector<vtype,N> &fspin)
-template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProj5m (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin)
+template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProj5m(hspin._internal[i],fspin._internal[i]);
  }
 }
-//  template<class vtype,int N> accelerator_inline void fspProj5m (iMatrix<vtype,N> &hspin,iMatrix<vtype,N> &fspin)
-template<class vtype,int N> accelerator_inline void spProj5m (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin)
+template<class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProj5m(hspin._internal[i][j],fspin._internal[i][j]);
--- a/Grid/qcd/utils/BaryonUtils.h
+++ b/Grid/qcd/utils/BaryonUtils.h
@@ -51,7 +51,7 @@ public:

  private: 
  template <class mobj, class robj>
-  static void baryon_site(const mobj &D1,
+  static void BaryonSite(const mobj &D1,
 				 const mobj &D2,
 				 const mobj &D3,
 				 const Gamma GammaA_left,
@@ -61,8 +61,18 @@ public:
 				 const int parity,
 				 const bool * wick_contractions,
  				 robj &result);
+  template <class mobj, class robj>
+  static void BaryonSiteMatrix(const mobj &D1,
+         const mobj &D2,
+         const mobj &D3,
+         const Gamma GammaA_left,
+         const Gamma GammaB_left,
+         const Gamma GammaA_right,
+         const Gamma GammaB_right,
+         const bool * wick_contractions,
+           robj &result);
  public:
-  static void Wick_Contractions(std::string qi, 
+  static void WickContractions(std::string qi, 
                 std::string qf, 
                 bool* wick_contractions);
  static void ContractBaryons(const PropagatorField &q1_left,
@@ -75,8 +85,17 @@ public:
 				 const bool* wick_contractions,
 				 const int parity,
 				 ComplexField &baryon_corr);
+  static void ContractBaryonsMatrix(const PropagatorField &q1_left,
+         const PropagatorField &q2_left,
+         const PropagatorField &q3_left,
+         const Gamma GammaA_left,
+         const Gamma GammaB_left,
+         const Gamma GammaA_right,
+         const Gamma GammaB_right,
+         const bool* wick_contractions,
+         SpinMatrixField &baryon_corr);
  template <class mobj, class robj>
-  static void ContractBaryons_Sliced(const mobj &D1,
+  static void ContractBaryonsSliced(const mobj &D1,
 				 const mobj &D2,
 				 const mobj &D3,
 				 const Gamma GammaA_left,
@@ -87,9 +106,20 @@ public:
 				 const int parity,
 				 const int nt,
 				 robj &result);
+  template <class mobj, class robj>
+  static void ContractBaryonsSlicedMatrix(const mobj &D1,
+         const mobj &D2,
+         const mobj &D3,
+         const Gamma GammaA_left,
+         const Gamma GammaB_left,
+         const Gamma GammaA_right,
+         const Gamma GammaB_right,
+         const bool* wick_contractions,
+         const int nt,
+         robj &result);
  private:
  template <class mobj, class mobj2, class robj>
-  static void Baryon_Gamma_3pt_Group1_Site(
+  static void BaryonGamma3ptGroup1Site(
           const mobj &Dq1_ti,
           const mobj2 &Dq2_spec,
           const mobj2 &Dq3_spec,
@@ -101,7 +131,7 @@ public:
           robj &result);

  template <class mobj, class mobj2, class robj>
-  static void Baryon_Gamma_3pt_Group2_Site(
+  static void BaryonGamma3ptGroup2Site(
           const mobj2 &Dq1_spec,
           const mobj &Dq2_ti,
           const mobj2 &Dq3_spec,
@@ -113,7 +143,7 @@ public:
           robj &result);

  template <class mobj, class mobj2, class robj>
-  static void Baryon_Gamma_3pt_Group3_Site(
+  static void BaryonGamma3ptGroup3Site(
           const mobj2 &Dq1_spec,
           const mobj2 &Dq2_spec,
           const mobj &Dq3_ti,
@@ -125,7 +155,7 @@ public:
           robj &result);
  public:
  template <class mobj>
-  static void Baryon_Gamma_3pt(
+  static void BaryonGamma3pt(
           const PropagatorField &q_ti,
           const mobj &Dq_spec1,
           const mobj &Dq_spec2,
@@ -138,7 +168,7 @@ public:
           SpinMatrixField &stn_corr);
  private: 
  template <class mobj, class mobj2, class robj>
-  static void Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop,
+  static void SigmaToNucleonQ1EyeSite(const mobj &Dq_loop,
 						 const mobj2 &Du_spec,
 						 const mobj &Dd_tf,
 						 const mobj &Ds_ti,
@@ -147,7 +177,7 @@ public:
 		                 		 const Gamma GammaB_nucl,
 						 robj &result);
  template <class mobj, class mobj2, class robj>
-  static void Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_ti,
+  static void SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti,
 						 const mobj &Du_tf,
 						 const mobj2 &Du_spec,
 						 const mobj &Dd_tf,
@@ -159,7 +189,7 @@ public:


  template <class mobj, class mobj2, class robj>
-  static void Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop,
+  static void SigmaToNucleonQ2EyeSite(const mobj &Dq_loop,
 						 const mobj2 &Du_spec,
 						 const mobj &Dd_tf,
 						 const mobj &Ds_ti,
@@ -168,7 +198,7 @@ public:
 		                 		 const Gamma GammaB_nucl,
 						 robj &result);
  template <class mobj, class mobj2, class robj>
-  static void Sigma_to_Nucleon_Q2_NonEye_site(const mobj &Du_ti,
+  static void SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti,
 						 const mobj &Du_tf,
 						 const mobj2 &Du_spec,
 						 const mobj &Dd_tf,
@@ -179,7 +209,7 @@ public:
 						 robj &result);
  public:
  template <class mobj>
-  static void Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
+  static void SigmaToNucleonEye(const PropagatorField &qq_loop,
 				 const mobj &Du_spec,
 				 const PropagatorField &qd_tf,
 				 const PropagatorField &qs_ti,
@@ -189,7 +219,7 @@ public:
 		                 const std::string op,
 				 SpinMatrixField &stn_corr);
  template <class mobj>
-  static void Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti,
+  static void SigmaToNucleonNonEye(const PropagatorField &qq_ti,
 				 const PropagatorField &qq_tf,
 				 const mobj &Du_spec,
 				 const PropagatorField &qd_tf,
@@ -217,7 +247,7 @@ const Real BaryonUtils<FImpl>::epsilon_sgn[6] = {1.,1.,1.,-1.,-1.,-1.};
 //This is the old version
 template <class FImpl>
 template <class mobj, class robj>
-void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
+void BaryonUtils<FImpl>::BaryonSite(const mobj &D1,
                const mobj &D2,
                const mobj &D3,
                         const Gamma GammaA_i,
@@ -329,12 +359,132 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
    }}
 }

+// Site-level baryon contraction without parity projection or spin trace; accumulates into the (rho_f,rho_i) entries of result
+template <class FImpl>
+template <class mobj, class robj>
+void BaryonUtils<FImpl>::BaryonSiteMatrix(const mobj &D1, // D1, D2, D3: per-site values of the three propagators (see the call in ContractBaryonsMatrix)
+                const mobj &D2,
+                const mobj &D3,
+                         const Gamma GammaA_i, // GammaA_i / GammaB_i multiply D1 / D2 from the right
+                         const Gamma GammaB_i,
+                         const Gamma GammaA_f, // GammaA_f / GammaB_f multiply from the left
+                         const Gamma GammaB_f,
+                const bool * wick_contraction, // entries [0]..[5] each switch one of the six terms below on or off
+                robj &result) // only ever updated with += / -= here; the caller supplies the starting value
+{
+
+    auto D1_GAi =  D1 * GammaA_i; // gamma-dressed products are built once, outside the index loops
+    auto GAf_D1_GAi = GammaA_f * D1_GAi;
+    auto GBf_D1_GAi = GammaB_f * D1_GAi;
+
+    auto D2_GBi = D2 * GammaB_i;
+    auto GBf_D2_GBi = GammaB_f * D2_GBi;
+    auto GAf_D2_GBi = GammaA_f * D2_GBi;
+
+    auto GBf_D3 = GammaB_f * D3;
+    auto GAf_D3 = GammaA_f * D3;
+
+    for (int ie_f=0; ie_f < 6 ; ie_f++){ // outer epsilon-table loop: indices carrying the _f suffix
+        int a_f = epsilon[ie_f][0]; //a
+        int b_f = epsilon[ie_f][1]; //b
+        int c_f = epsilon[ie_f][2]; //c
+    for (int ie_i=0; ie_i < 6 ; ie_i++){ // inner epsilon-table loop: indices carrying the _i suffix
+        int a_i = epsilon[ie_i][0]; //a'
+        int b_i = epsilon[ie_i][1]; //b'
+        int c_i = epsilon[ie_i][2]; //c'
+
+        Real ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i]; // relative sign: product of the epsilon_sgn entries of the two rows
+        //This is the \delta_{456}^{123} part
+        if (wick_contraction[0]){
+            for (int rho_i=0; rho_i<Ns; rho_i++){
+            for (int rho_f=0; rho_f<Ns; rho_f++){
+                auto GAf_D1_GAi_rr_cc = GAf_D1_GAi()(rho_f,rho_i)(c_f,c_i); // hoisted: independent of alpha_f and beta_i
+                for (int alpha_f=0; alpha_f<Ns; alpha_f++){
+                for (int beta_i=0; beta_i<Ns; beta_i++){
+                    result()(rho_f,rho_i)() += ee  * GAf_D1_GAi_rr_cc
+                                        * D2_GBi    ()(alpha_f,beta_i)(a_f,a_i)
+                                        * GBf_D3    ()(alpha_f,beta_i)(b_f,b_i);
+                }}
+            }}
+        }   
+        //This is the \delta_{456}^{231} part
+        if (wick_contraction[1]){
+            for (int rho_i=0; rho_i<Ns; rho_i++){
+            for (int alpha_f=0; alpha_f<Ns; alpha_f++){
+                auto D1_GAi_ar_ac = D1_GAi()(alpha_f,rho_i)(a_f,c_i);
+                for (int beta_i=0; beta_i<Ns; beta_i++){
+                  auto GBf_D2_GBi_ab_ba = GBf_D2_GBi ()(alpha_f,beta_i)(b_f,a_i);
+                for (int rho_f=0; rho_f<Ns; rho_f++){
+                    result()(rho_f,rho_i)() += ee  * D1_GAi_ar_ac
+                                        * GBf_D2_GBi_ab_ba
+                                        * GAf_D3        ()(rho_f,beta_i)(c_f,b_i);
+                }}
+            }}
+        }   
+        //This is the \delta_{456}^{312} part
+        if (wick_contraction[2]){
+            for (int rho_i=0; rho_i<Ns; rho_i++){
+            for (int alpha_f=0; alpha_f<Ns; alpha_f++){
+                auto GBf_D1_GAi_ar_bc = GBf_D1_GAi()(alpha_f,rho_i)(b_f,c_i);
+                for (int beta_i=0; beta_i<Ns; beta_i++){
+                  auto D3_ab_ab = D3 ()(alpha_f,beta_i)(a_f,b_i);
+                for (int rho_f=0; rho_f<Ns; rho_f++){
+                    result()(rho_f,rho_i)() += ee  * GBf_D1_GAi_ar_bc
+                                        * GAf_D2_GBi    ()(rho_f,beta_i)(c_f,a_i)
+                                        * D3_ab_ab;
+                }}
+            }}
+        }   
+        //This is the \delta_{456}^{132} part
+        if (wick_contraction[3]){ // terms [3]..[5] are subtracted from result, terms [0]..[2] are added
+            for (int rho_i=0; rho_i<Ns; rho_i++){
+            for (int rho_f=0; rho_f<Ns; rho_f++){
+                auto GAf_D1_GAi_rr_cc = GAf_D1_GAi()(rho_f,rho_i)(c_f,c_i);
+                for (int alpha_f=0; alpha_f<Ns; alpha_f++){
+                for (int beta_i=0; beta_i<Ns; beta_i++){
+                    result()(rho_f,rho_i)() -= ee  * GAf_D1_GAi_rr_cc
+                                        * GBf_D2_GBi    ()(alpha_f,beta_i)(b_f,a_i)
+                                        * D3            ()(alpha_f,beta_i)(a_f,b_i);
+                }}
+            }}
+        }   
+        //This is the \delta_{456}^{321} part
+        if (wick_contraction[4]){
+            for (int rho_i=0; rho_i<Ns; rho_i++){
+            for (int alpha_f=0; alpha_f<Ns; alpha_f++){
+                auto GBf_D1_GAi_ar_bc = GBf_D1_GAi()(alpha_f,rho_i)(b_f,c_i);
+                for (int beta_i=0; beta_i<Ns; beta_i++){
+                  auto D2_GBi_ab_aa = D2_GBi()(alpha_f,beta_i)(a_f,a_i);
+                for (int rho_f=0; rho_f<Ns; rho_f++){
+                    result()(rho_f,rho_i)() -= ee  * GBf_D1_GAi_ar_bc
+                                        * D2_GBi_ab_aa
+                                        * GAf_D3    ()(rho_f,beta_i)(c_f,b_i);
+                }}
+            }}
+        }   
+        //This is the \delta_{456}^{213} part
+        if (wick_contraction[5]){
+            for (int rho_i=0; rho_i<Ns; rho_i++){
+            for (int alpha_f=0; alpha_f<Ns; alpha_f++){
+                auto D1_GAi_ar_ac = D1_GAi()(alpha_f,rho_i)(a_f,c_i);
+                for (int beta_i=0; beta_i<Ns; beta_i++){
+                  auto GBf_D3_ab_bb = GBf_D3()(alpha_f,beta_i)(b_f,b_i);
+                for (int rho_f=0; rho_f<Ns; rho_f++){
+                    result()(rho_f,rho_i)() -= ee  * D1_GAi_ar_ac
+                                        * GAf_D2_GBi    ()(rho_f,beta_i)(c_f,a_i)
+                                        * GBf_D3_ab_bb;
+                }}
+            }}
+        }
+    }}
+}
+
 /* Computes which wick contractions should be performed for a    *
 * baryon 2pt function given the initial and finals state quark  *
 * flavours.                                                     *
 * The array wick_contractions must be of length 6               */
 template<class FImpl>
-void BaryonUtils<FImpl>::Wick_Contractions(std::string qi, std::string qf, bool* wick_contractions) {
+void BaryonUtils<FImpl>::WickContractions(std::string qi, std::string qf, bool* wick_contractions) {
    const int epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
    for (int ie=0; ie < 6 ; ie++) {
        wick_contractions[ie] = (qi.size() == 3 && qf.size() == 3
@@ -364,11 +514,6 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,

  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
-
-  std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
-  std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
-  std::cout << "GammaA (right) " << (GammaA_right.g) <<  std::endl;
-  std::cout << "GammaB (right) " << (GammaB_right.g) <<  std::endl;
 
  assert(parity==1 || parity == -1 && "Parity must be +1 or -1");

@@ -397,13 +542,62 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
    auto D2 = v2[ss];
    auto D3 = v3[ss];
    vobj result=Zero();
-    baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result);
+    BaryonSite(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result);
    vbaryon_corr[ss] = result; 
  }  );//end loop over lattice sites

  t += usecond();

-  std::cout << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl;
+  std::cout << GridLogDebug << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl;
+}
+
+template<class FImpl>
+void BaryonUtils<FImpl>::ContractBaryonsMatrix(const PropagatorField &q1_left, // field-level driver: runs BaryonSiteMatrix on every outer site
+             const PropagatorField &q2_left,
+             const PropagatorField &q3_left,
+                         const Gamma GammaA_left,
+                         const Gamma GammaB_left,
+                         const Gamma GammaA_right,
+                         const Gamma GammaB_right,
+             const bool* wick_contractions, // forwarded unchanged to BaryonSiteMatrix, which reads entries [0]..[5]
+             SpinMatrixField &baryon_corr) // output: each outer site is overwritten with that site's result
+{
+
+  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+ 
+  GridBase *grid = q1_left.Grid(); // the grid of q1_left sets the site count and SIMD width of the loop below
+  
+  autoView(vbaryon_corr, baryon_corr,CpuWrite); // NOTE(review): views use Cpu* modes but are indexed inside accelerator_for below - confirm this is valid on accelerator builds
+  autoView( v1 , q1_left, CpuRead);
+  autoView( v2 , q2_left, CpuRead);
+  autoView( v3 , q3_left, CpuRead);
+
+  // Real bytes =0.;
+  // bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real));
+  // for (int ie=0; ie < 6 ; ie++){
+  //   if(ie==0 or ie==3){
+  //      bytes += grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) * wick_contractions[ie];
+  //   }
+  //   else{
+  //      bytes += grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) * wick_contractions[ie];
+  //   }
+  // }
+  // Real t=0.;
+  // t =-usecond();
+
+  accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
+    auto D1 = v1[ss];
+    auto D2 = v2[ss];
+    auto D3 = v3[ss];
+    sobj result=Zero(); // fresh zeroed accumulator per site, because BaryonSiteMatrix only adds to / subtracts from it
+    BaryonSiteMatrix(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,wick_contractions,result); // NOTE(review): wick_contractions is a raw bool pointer used inside the accelerator_for body - confirm it is dereferenceable there
+    vbaryon_corr[ss] = result; 
+  }  );//end loop over lattice sites
+
+  // t += usecond();
+
+  // std::cout << GridLogDebug << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl;
 
 }

@@ -414,7 +608,7 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
 * Wick_Contractions function above                               */
 template <class FImpl>
 template <class mobj, class robj>
-void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
+void BaryonUtils<FImpl>::ContractBaryonsSliced(const mobj &D1,
 						 const mobj &D2,
 						 const mobj &D3,
 				                 const Gamma GammaA_left,
@@ -429,16 +623,33 @@ void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,

  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
-
-  std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
-  std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
-  std::cout << "GammaA (right) " << (GammaA_right.g) <<  std::endl;
-  std::cout << "GammaB (right) " << (GammaB_right.g) <<  std::endl;
 
  assert(parity==1 || parity == -1 && "Parity must be +1 or -1");

  for (int t=0; t<nt; t++) {
-    baryon_site(D1[t],D2[t],D3[t],GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result[t]);
+    BaryonSite(D1[t],D2[t],D3[t],GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result[t]);
+  }
+}
+
+template <class FImpl>
+template <class mobj, class robj>
+void BaryonUtils<FImpl>::ContractBaryonsSlicedMatrix(const mobj &D1, // slice-wise driver: D1, D2, D3 and result are all indexed by t below
+             const mobj &D2,
+             const mobj &D3,
+                         const Gamma GammaA_left,
+                         const Gamma GammaB_left,
+                         const Gamma GammaA_right,
+                         const Gamma GammaB_right,
+             const bool* wick_contractions, // forwarded unchanged to BaryonSiteMatrix
+             const int nt, // number of slices processed; t runs over 0 .. nt-1
+             robj &result) // result[t] is accumulated into by BaryonSiteMatrix and is not reset in this function
+{
+
+  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+
+  for (int t=0; t<nt; t++) {
+    BaryonSiteMatrix(D1[t],D2[t],D3[t],GammaA_left,GammaB_left,GammaA_right,GammaB_right,wick_contractions,result[t]); // no parity argument, unlike the BaryonSite call in ContractBaryonsSliced
   }
 }

@@ -454,7 +665,7 @@ void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
 * Dq4_tf is a quark line from t_f to t_J */
 template<class FImpl>
 template <class mobj, class mobj2, class robj>
-void BaryonUtils<FImpl>::Baryon_Gamma_3pt_Group1_Site(
+void BaryonUtils<FImpl>::BaryonGamma3ptGroup1Site(
                        const mobj &Dq1_ti,
                        const mobj2 &Dq2_spec,
                        const mobj2 &Dq3_spec,
@@ -546,7 +757,7 @@ void BaryonUtils<FImpl>::Baryon_Gamma_3pt_Group1_Site(
 * Dq4_tf is a quark line from t_f to t_J */
 template<class FImpl>
 template <class mobj, class mobj2, class robj>
-void BaryonUtils<FImpl>::Baryon_Gamma_3pt_Group2_Site(
+void BaryonUtils<FImpl>::BaryonGamma3ptGroup2Site(
                        const mobj2 &Dq1_spec,
                        const mobj &Dq2_ti,
                        const mobj2 &Dq3_spec,
@@ -636,7 +847,7 @@ void BaryonUtils<FImpl>::Baryon_Gamma_3pt_Group2_Site(
 * Dq4_tf is a quark line from t_f to t_J */
 template<class FImpl>
 template <class mobj, class mobj2, class robj>
-void BaryonUtils<FImpl>::Baryon_Gamma_3pt_Group3_Site(
+void BaryonUtils<FImpl>::BaryonGamma3ptGroup3Site(
                        const mobj2 &Dq1_spec,
                        const mobj2 &Dq2_spec,
                        const mobj &Dq3_ti,
@@ -728,7 +939,7 @@ void BaryonUtils<FImpl>::Baryon_Gamma_3pt_Group3_Site(
 * https://aportelli.github.io/Hadrons-doc/#/mcontraction        */
 template<class FImpl>
 template <class mobj>
-void BaryonUtils<FImpl>::Baryon_Gamma_3pt(
+void BaryonUtils<FImpl>::BaryonGamma3pt(
                        const PropagatorField &q_ti,
                        const mobj &Dq_spec1,
                        const mobj &Dq_spec2,
@@ -751,7 +962,7 @@ void BaryonUtils<FImpl>::Baryon_Gamma_3pt(
            auto Dq_ti = vq_ti[ss];
            auto Dq_tf = vq_tf[ss];
            sobj result=Zero();
-            Baryon_Gamma_3pt_Group1_Site(Dq_ti,Dq_spec1,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result);
+            BaryonGamma3ptGroup1Site(Dq_ti,Dq_spec1,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result);
            vcorr[ss] += result; 
        });//end loop over lattice sites
    } else if (group == 2) {
@@ -759,7 +970,7 @@ void BaryonUtils<FImpl>::Baryon_Gamma_3pt(
            auto Dq_ti = vq_ti[ss];
            auto Dq_tf = vq_tf[ss];
            sobj result=Zero();
-            Baryon_Gamma_3pt_Group2_Site(Dq_spec1,Dq_ti,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result);
+            BaryonGamma3ptGroup2Site(Dq_spec1,Dq_ti,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result);
            vcorr[ss] += result; 
        });//end loop over lattice sites
    } else if (group == 3) {
@@ -767,7 +978,7 @@ void BaryonUtils<FImpl>::Baryon_Gamma_3pt(
            auto Dq_ti = vq_ti[ss];
            auto Dq_tf = vq_tf[ss];
            sobj result=Zero();
-            Baryon_Gamma_3pt_Group3_Site(Dq_spec1,Dq_spec2,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result);
+            BaryonGamma3ptGroup3Site(Dq_spec1,Dq_spec2,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result);

            vcorr[ss] += result; 
        });//end loop over lattice sites
@@ -787,7 +998,7 @@ void BaryonUtils<FImpl>::Baryon_Gamma_3pt(
 * Ds_ti is a quark line from t_i to t_H */
 template <class FImpl>
 template <class mobj, class mobj2, class robj>
-void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop,
+void BaryonUtils<FImpl>::SigmaToNucleonQ1EyeSite(const mobj &Dq_loop,
 						 const mobj2 &Du_spec,
 						 const mobj &Dd_tf,
 						 const mobj &Ds_ti,
@@ -838,7 +1049,7 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop,
 * Ds_ti is a quark line from t_i to t_H */
 template <class FImpl>
 template <class mobj, class mobj2, class robj>
-void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_ti,
+void BaryonUtils<FImpl>::SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti,
 						 const mobj &Du_tf,
 						 const mobj2 &Du_spec,
 						 const mobj &Dd_tf,
@@ -897,7 +1108,7 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_ti,
 * Ds_ti is a quark line from t_i to t_H */
 template <class FImpl>
 template <class mobj, class mobj2, class robj>
-void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop,
+void BaryonUtils<FImpl>::SigmaToNucleonQ2EyeSite(const mobj &Dq_loop,
 						 const mobj2 &Du_spec,
 						 const mobj &Dd_tf,
 						 const mobj &Ds_ti,
@@ -948,7 +1159,7 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop,
 * Ds_ti is a quark line from t_i to t_H */
 template <class FImpl>
 template <class mobj, class mobj2, class robj>
-void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q2_NonEye_site(const mobj &Du_ti,
+void BaryonUtils<FImpl>::SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti,
 						 const mobj &Du_tf,
 						 const mobj2 &Du_spec,
 						 const mobj &Dd_tf,
@@ -1002,7 +1213,7 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q2_NonEye_site(const mobj &Du_ti,

 template<class FImpl>
 template <class mobj>
-void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
+void BaryonUtils<FImpl>::SigmaToNucleonEye(const PropagatorField &qq_loop,
 						 const mobj &Du_spec,
 						 const PropagatorField &qd_tf,
 						 const PropagatorField &qs_ti,
@@ -1029,9 +1240,9 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
    auto Ds_ti = vs_ti[ss];
    sobj result=Zero();
    if(op == "Q1"){
-      Sigma_to_Nucleon_Q1_Eye_site(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
+      SigmaToNucleonQ1EyeSite(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
    } else if(op == "Q2"){
-      Sigma_to_Nucleon_Q2_Eye_site(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
+      SigmaToNucleonQ2EyeSite(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
    } else {
      assert(0 && "Weak Operator not correctly specified");
    }
@@ -1041,7 +1252,7 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,

 template<class FImpl>
 template <class mobj>
-void BaryonUtils<FImpl>::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti,
+void BaryonUtils<FImpl>::SigmaToNucleonNonEye(const PropagatorField &qq_ti,
 						 const PropagatorField &qq_tf,
 						 const mobj &Du_spec,
 						 const PropagatorField &qd_tf,
@@ -1071,9 +1282,9 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti,
    auto Ds_ti = vs_ti[ss];
    sobj result=Zero();
    if(op == "Q1"){
-      Sigma_to_Nucleon_Q1_NonEye_site(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
+      SigmaToNucleonQ1NonEyeSite(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
    } else if(op == "Q2"){
-      Sigma_to_Nucleon_Q2_NonEye_site(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
+      SigmaToNucleonQ2NonEyeSite(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
    } else {
      assert(0 && "Weak Operator not correctly specified");
    }
--- a/Grid/qcd/utils/CovariantCshift.h
+++ b/Grid/qcd/utils/CovariantCshift.h
@@ -53,6 +53,24 @@ namespace PeriodicBC {
    return Cshift(tmp,mu,-1);// moves towards positive mu
  }

+  template<class gauge> Lattice<gauge>
+  CovShiftIdentityBackward(const Lattice<gauge> &Link, int mu) 
+  { // Periodic BC: returns adj(Link) shifted by -1 along mu; presumably CovShiftBackward applied to an identity field - confirm
+    return Cshift(adj(Link), mu, -1); // same Cshift(...,mu,-1) as CovShiftBackward above: moves towards positive mu
+  }
+
+  template<class gauge> Lattice<gauge>
+  CovShiftIdentityForward(const Lattice<gauge> &Link, int mu)
+  { // Returns a copy of Link unchanged; mu is accepted but not used in this overload
+    return Link;
+  }
+
+  template<class gauge> Lattice<gauge>
+  ShiftStaple(const Lattice<gauge> &Link, int mu)
+  { // Periodic BC: just Cshift by +1 along mu, with nothing extra done at the boundary
+    return Cshift(Link, mu, 1);
+  }
+  
  template<class gauge,class Expr,typename std::enable_if<is_lattice_expr<Expr>::value,void>::type * = nullptr>
    auto  CovShiftForward(const Lattice<gauge> &Link, 
 			  int mu,
@@ -70,6 +88,7 @@ namespace PeriodicBC {
    return CovShiftBackward(Link,mu,arg);
  }

+
 }


@@ -139,6 +158,38 @@ namespace ConjugateBC {
    //    std::cout<<"Gparity::CovCshiftBackward mu="<<mu<<std::endl;
    return Cshift(tmp,mu,-1);// moves towards positive mu
  }
+  template<class gauge> Lattice<gauge>
+  CovShiftIdentityBackward(const Lattice<gauge> &Link, int mu) { // adj(Link), conjugated where coor == Lmu, then shifted by -1
+    GridBase *grid = Link.Grid();
+    int Lmu = grid->GlobalDimensions()[mu] - 1; // global extent along mu minus one (last coordinate if counting from 0)
+    // coor is compared against Lmu below to pick out the sites that get conjugated
+    Lattice<iScalar<vInteger>> coor(grid);
+    LatticeCoordinate(coor, mu); // presumably fills coor with each site's mu coordinate - confirm
+    // The conjugation is applied before the shift, i.e. selected on the unshifted field
+    Lattice<gauge> tmp(grid);
+    tmp = adj(Link);
+    tmp = where(coor == Lmu, conjugate(tmp), tmp);
+    return Cshift(tmp, mu, -1); // moves towards positive mu
+  }
+  template<class gauge> Lattice<gauge>
+  CovShiftIdentityForward(const Lattice<gauge> &Link, int mu) { // returns Link unchanged; mu is not used
+    return Link;
+  }
+
+  template<class gauge> Lattice<gauge>
+  ShiftStaple(const Lattice<gauge> &Link, int mu)
+  { // Conjugate-BC version: Cshift by +1 along mu, then conjugate the sites where coor == Lmu
+    GridBase *grid = Link.Grid();
+    int Lmu = grid->GlobalDimensions()[mu] - 1; // global extent along mu minus one (last coordinate if counting from 0)
+    // coor is compared against Lmu below to pick out the sites that get conjugated
+    Lattice<iScalar<vInteger>> coor(grid);
+    LatticeCoordinate(coor, mu); // presumably fills coor with each site's mu coordinate - confirm
+    // Shift first; the conjugation is then applied to the already-shifted field
+    Lattice<gauge> tmp(grid);
+    tmp = Cshift(Link, mu, 1);
+    tmp = where(coor == Lmu, conjugate(tmp), tmp);
+    return tmp;
+  }

  template<class gauge,class Expr,typename std::enable_if<is_lattice_expr<Expr>::value,void>::type * = nullptr>
    auto  CovShiftForward(const Lattice<gauge> &Link, 
--- a/Grid/qcd/utils/LinalgUtils.h
+++ b/Grid/qcd/utils/LinalgUtils.h
@@ -154,8 +154,8 @@ void axpby_ssp_pminus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,co
  accelerator_for(sss,nloop,vobj::Nsimd(),{
    uint64_t ss = sss*Ls;
    decltype(coalescedRead(y_v[ss+sp])) tmp;
-    spProj5m(tmp,y_v(ss+sp));
-    tmp = a*x_v(ss+s)+b*tmp;
+    spProj5m(tmp,y_v(ss+sp)); 
+   tmp = a*x_v(ss+s)+b*tmp;
    coalescedWrite(z_v[ss+s],tmp);
  });
 }
@@ -188,7 +188,6 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
  z.Checkerboard() = x.Checkerboard();
  conformable(x,z);
  int Ls = grid->_rdimensions[0];
-  Gamma G5(Gamma::Algebra::Gamma5);
  autoView( x_v, x, AcceleratorRead);
  autoView( z_v, z, AcceleratorWrite);
  uint64_t nloop = grid->oSites()/Ls;
@@ -196,7 +195,13 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
    uint64_t ss = sss*Ls;
    for(int s=0;s<Ls;s++){
      int sp = Ls-1-s;
-      coalescedWrite(z_v[ss+sp],G5*x_v(ss+s));
+      auto tmp = x_v(ss+s);
+      decltype(tmp) tmp_p;
+      decltype(tmp) tmp_m;
+      spProj5p(tmp_p,tmp);
+      spProj5m(tmp_m,tmp);
+      // Use of spProj5m, 5p captures the coarse space too
+      coalescedWrite(z_v[ss+sp],tmp_p - tmp_m);
    }
  });
 }
@@ -208,10 +213,20 @@ void G5C(Lattice<vobj> &z, const Lattice<vobj> &x)
  z.Checkerboard() = x.Checkerboard();
  conformable(x, z);

-  Gamma G5(Gamma::Algebra::Gamma5);
-  z = G5 * x;
+  autoView( x_v, x, AcceleratorRead);
+  autoView( z_v, z, AcceleratorWrite);
+  uint64_t nloop = grid->oSites();
+  accelerator_for(ss,nloop,vobj::Nsimd(),{
+    auto tmp = x_v(ss);
+    decltype(tmp) tmp_p;
+    decltype(tmp) tmp_m;
+    spProj5p(tmp_p,tmp);
+    spProj5m(tmp_m,tmp);
+    coalescedWrite(z_v[ss],tmp_p - tmp_m);
+  });
 }

+/*
 template<class CComplex, int nbasis>
 void G5C(Lattice<iVector<CComplex, nbasis>> &z, const Lattice<iVector<CComplex, nbasis>> &x)
 {
@@ -234,6 +249,7 @@ void G5C(Lattice<iVector<CComplex, nbasis>> &z, const Lattice<iVector<CComplex,
    }
  });
 }
+*/

 NAMESPACE_END(Grid);

--- a/Grid/qcd/utils/SUn.h
+++ b/Grid/qcd/utils/SUn.h
@@ -449,7 +449,8 @@ public:
    LatticeReal alpha(grid);

    //    std::cout<<GridLogMessage<<"xi "<<xi <<std::endl;
-    alpha = toReal(2.0 * xi);
+    xi = 2.0 *xi;
+    alpha = toReal(xi);

    do {
      // A. Generate two uniformly distributed pseudo-random numbers R and R',
@@ -734,7 +735,6 @@ public:
    }
  }

-
  template <typename GaugeField>
  static void HotConfiguration(GridParallelRNG &pRNG, GaugeField &out) {
    typedef typename GaugeField::vector_type vector_type;
@@ -799,6 +799,88 @@ public:
  }
 };

+template<int N>
+LatticeComplexD Determinant(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
+{ // Returns a complex lattice holding, for each local site, the determinant of that site's NxN matrix
+  GridBase *grid=Umu.Grid();
+  auto lvol = grid->lSites();
+  LatticeComplexD ret(grid);
+  // Sites are handled one at a time through CPU views, using Eigen for the determinant itself
+  autoView(Umu_v,Umu,CpuRead);
+  autoView(ret_v,ret,CpuWrite);
+  thread_for(site,lvol,{
+    Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N);
+    Coordinate lcoor;
+    grid->LocalIndexToLocalCoor(site, lcoor);
+    iScalar<iScalar<iMatrix<ComplexD, N> > > Us; // scalar (non-vectorised) copy of one site's matrix
+    peekLocalSite(Us, Umu_v, lcoor);
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	EigenU(i,j) = Us()()(i,j); // element-wise copy into the dense Eigen matrix
+      }}
+    ComplexD det = EigenU.determinant();
+    pokeLocalSite(det,ret_v,lcoor); // store the result at the same local coordinate
+  });
+  return ret;
+}
+template<int N>
+static void ProjectSUn(Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
+{ // In place: ProjectOnGroup(Umu), then multiply row N-1 of every site matrix by conj(det)
+  Umu      = ProjectOnGroup(Umu);
+  auto det = Determinant(Umu);
+  // Scaling a single row by conj(det) multiplies the site determinant by conj(det)
+  det = conjugate(det);
+  // Peek and poke must address the same row N-1; poking row Nc-1 hit the wrong row whenever N != Nc
+  for(int i=0;i<N;i++){
+    auto element = PeekIndex<ColourIndex>(Umu,N-1,i);
+    element = element * det;
+    PokeIndex<ColourIndex>(Umu,element,N-1,i);
+  }
+}
+template<int N>
+static void ProjectSUn(Lattice<iVector<iScalar<iMatrix<vComplexD, N> >,Nd> > &U)
+{
+  GridBase *grid=U.Grid(); // not referenced again in this function
+  // Reunitarise: project each of the Nd Lorentz components in turn and write it back into U
+  for(int mu=0;mu<Nd;mu++){
+    auto Umu = PeekIndex<LorentzIndex>(U,mu);
+    Umu      = ProjectOnGroup(Umu);
+    ProjectSUn(Umu); // the single-matrix overload calls ProjectOnGroup again before fixing the determinant
+    PokeIndex<LorentzIndex>(U,Umu,mu);
+  }
+}
+// SU(3)-only projection (a plain function, not a template specialisation): ProjectOnGroup,
+// then overwrite row 2 of every site matrix with adj() of the cross product of rows 0 and 1.
+static void
+ProjectSU3 (Lattice<iScalar<iScalar<iMatrix<vComplexD, 3> > > > &Umu)
+{
+  GridBase *grid=Umu.Grid();
+  const int x=0;
+  const int y=1;
+  const int z=2;
+  // Reunitarise
+  Umu = ProjectOnGroup(Umu);
+  autoView(Umu_v,Umu,CpuWrite); // NOTE(review): the view is also read below - confirm CpuWrite guarantees valid existing data
+  thread_for(ss,grid->oSites(),{
+      auto cm = Umu_v[ss];
+      cm()()(2,x) = adj(cm()()(0,y)*cm()()(1,z)-cm()()(0,z)*cm()()(1,y)); //x= yz-zy
+      cm()()(2,y) = adj(cm()()(0,z)*cm()()(1,x)-cm()()(0,x)*cm()()(1,z)); //y= zx-xz
+      cm()()(2,z) = adj(cm()()(0,x)*cm()()(1,y)-cm()()(0,y)*cm()()(1,x)); //z= xy-yx
+      Umu_v[ss]=cm;
+  });
+}
+static void ProjectSU3(Lattice<iVector<iScalar<iMatrix<vComplexD, 3> >,Nd> > &U)
+{
+  GridBase *grid=U.Grid(); // not referenced again in this function
+  // Reunitarise: project each of the Nd Lorentz components in turn and write it back into U
+  for(int mu=0;mu<Nd;mu++){
+    auto Umu = PeekIndex<LorentzIndex>(U,mu);
+    Umu      = ProjectOnGroup(Umu);
+    ProjectSU3(Umu); // the single-matrix overload calls ProjectOnGroup again before rebuilding row 2
+    PokeIndex<LorentzIndex>(U,Umu,mu);
+  }
+}
+
 typedef SU<2> SU2;
 typedef SU<3> SU3;
 typedef SU<4> SU4;
--- a/Grid/serialisation/JSON_IO.cc
+++ b/Grid/serialisation/JSON_IO.cc
@@ -26,7 +26,7 @@
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
-#ifndef __NVCC__
+#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))

 NAMESPACE_BEGIN(Grid);

--- a/Grid/simd/Fujitsu_A64FX_asm_double.h
+++ b/Grid/simd/Fujitsu_A64FX_asm_double.h
@@ -1,779 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: Fujitsu_A64FX_asm_double.h
-
-    Copyright (C) 2020
-
-Author: Nils Meyer <nils.meyer@ur.de>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#define LOAD_CHIMU(base)               LOAD_CHIMU_INTERLEAVED_A64FXd(base)  
-#define PREFETCH_CHIMU_L1(A)           PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A)  
-#define PREFETCH_GAUGE_L1(A)           PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)  
-#define PREFETCH_CHIMU_L2(A)           PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A)  
-#define PREFETCH_GAUGE_L2(A)           PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)  
-#define PF_GAUGE(A)  
-#define PREFETCH_RESULT_L2_STORE(A)    PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(A)  
-#define PREFETCH_RESULT_L1_STORE(A)    PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(A)  
-#define PREFETCH1_CHIMU(A)             PREFETCH_CHIMU_L1(A)  
-#define PREFETCH_CHIMU(A)              PREFETCH_CHIMU_L1(A)  
-#define LOCK_GAUGE(A)  
-#define UNLOCK_GAUGE(A)  
-#define MASK_REGS                      DECLARATIONS_A64FXd  
-#define SAVE_RESULT(A,B)               RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B)  
-#define MULT_2SPIN_1(Dir)              MULT_2SPIN_1_A64FXd(Dir)  
-#define MULT_2SPIN_2                   MULT_2SPIN_2_A64FXd  
-#define LOAD_CHI(base)                 LOAD_CHI_A64FXd(base)  
-#define ADD_RESULT(base,basep)         LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)  
-#define XP_PROJ                        XP_PROJ_A64FXd  
-#define YP_PROJ                        YP_PROJ_A64FXd  
-#define ZP_PROJ                        ZP_PROJ_A64FXd  
-#define TP_PROJ                        TP_PROJ_A64FXd  
-#define XM_PROJ                        XM_PROJ_A64FXd  
-#define YM_PROJ                        YM_PROJ_A64FXd  
-#define ZM_PROJ                        ZM_PROJ_A64FXd  
-#define TM_PROJ                        TM_PROJ_A64FXd  
-#define XP_RECON                       XP_RECON_A64FXd  
-#define XM_RECON                       XM_RECON_A64FXd  
-#define XM_RECON_ACCUM                 XM_RECON_ACCUM_A64FXd  
-#define YM_RECON_ACCUM                 YM_RECON_ACCUM_A64FXd  
-#define ZM_RECON_ACCUM                 ZM_RECON_ACCUM_A64FXd  
-#define TM_RECON_ACCUM                 TM_RECON_ACCUM_A64FXd  
-#define XP_RECON_ACCUM                 XP_RECON_ACCUM_A64FXd  
-#define YP_RECON_ACCUM                 YP_RECON_ACCUM_A64FXd  
-#define ZP_RECON_ACCUM                 ZP_RECON_ACCUM_A64FXd  
-#define TP_RECON_ACCUM                 TP_RECON_ACCUM_A64FXd  
-#define PERMUTE_DIR0                   0  
-#define PERMUTE_DIR1                   1  
-#define PERMUTE_DIR2                   2  
-#define PERMUTE_DIR3                   3  
-#define PERMUTE                        PERMUTE_A64FXd;  
-#define LOAD_TABLE(Dir)                if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; }  
-#define MAYBEPERM(Dir,perm)            if (Dir != 3) { if (perm) { PERMUTE; } }  
-// DECLARATIONS
-#define DECLARATIONS_A64FXd  \
-    const uint64_t lut[4][8] = { \
-        {4, 5, 6, 7, 0, 1, 2, 3}, \
-        {2, 3, 0, 1, 6, 7, 4, 5}, \
-        {1, 0, 3, 2, 5, 4, 7, 6}, \
-        {0, 1, 2, 4, 5, 6, 7, 8} };\
-asm ( \
-    "fmov z31.d , 0 \n\t" \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// RESULT
-#define RESULT_A64FXd(base)  \
-{ \
-asm ( \
-    "str z0, [%[storeptr], -6, mul vl] \n\t" \
-    "str z1, [%[storeptr], -5, mul vl] \n\t" \
-    "str z2, [%[storeptr], -4, mul vl] \n\t" \
-    "str z3, [%[storeptr], -3, mul vl] \n\t" \
-    "str z4, [%[storeptr], -2, mul vl] \n\t" \
-    "str z5, [%[storeptr], -1, mul vl] \n\t" \
-    "str z6, [%[storeptr], 0, mul vl] \n\t" \
-    "str z7, [%[storeptr], 1, mul vl] \n\t" \
-    "str z8, [%[storeptr], 2, mul vl] \n\t" \
-    "str z9, [%[storeptr], 3, mul vl] \n\t" \
-    "str z10, [%[storeptr], 4, mul vl] \n\t" \
-    "str z11, [%[storeptr], 5, mul vl] \n\t" \
-    :  \
-    : [storeptr] "r" (base + 2 * 3 * 64) \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
-); \
-}
-// PREFETCH_CHIMU_L2 (prefetch to L2)
-#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base)  \
-{ \
-asm ( \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
-    :  \
-    : [fetchptr] "r" (base) \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
-); \
-}
-// PREFETCH_CHIMU_L1 (prefetch to L1)
-#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base)  \
-{ \
-asm ( \
-    "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
-    "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
-    "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
-    :  \
-    : [fetchptr] "r" (base) \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
-); \
-}
-// PREFETCH_GAUGE_L2 (prefetch to L2)
-#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)  \
-{ \
-    const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \
-asm ( \
-    "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \
-    :  \
-    : [fetchptr] "r" (baseU) \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
-); \
-}
-// PREFETCH_GAUGE_L1 (prefetch to L1)
-#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)  \
-{ \
-    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
-asm ( \
-    "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
-    "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
-    "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
-    :  \
-    : [fetchptr] "r" (baseU) \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
-); \
-}
-// LOAD_CHI
-#define LOAD_CHI_A64FXd(base)  \
-{ \
-asm ( \
-    "ldr z12, [%[fetchptr], 0, mul vl] \n\t" \
-    "ldr z13, [%[fetchptr], 1, mul vl] \n\t" \
-    "ldr z14, [%[fetchptr], 2, mul vl] \n\t" \
-    "ldr z15, [%[fetchptr], 3, mul vl] \n\t" \
-    "ldr z16, [%[fetchptr], 4, mul vl] \n\t" \
-    "ldr z17, [%[fetchptr], 5, mul vl] \n\t" \
-    :  \
-    : [fetchptr] "r" (base) \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
-); \
-}
-// LOAD_CHIMU
-#define LOAD_CHIMU_INTERLEAVED_A64FXd(base)  \
-{ \
-asm ( \
-    "ptrue p5.d \n\t" \
-    "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
-    "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
-    "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
-    "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
-    "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
-    "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
-    "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
-    "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
-    "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
-    "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
-    "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
-    "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
-    :  \
-    : [fetchptr] "r" (base + 2 * 3 * 64) \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
-); \
-}
-// LOAD_CHIMU_0213
-#define LOAD_CHIMU_0213_A64FXd  \
-{ \
-    const SiteSpinor & ref(in[offset]); \
-asm ( \
-    "ptrue p5.d \n\t" \
-    "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
-    "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
-    "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
-    "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
-    "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
-    "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
-    "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
-    "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
-    "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
-    "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
-    "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
-    "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
-    :  \
-    : [fetchptr] "r" (&ref[2][0]) \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
-); \
-}
-// LOAD_CHIMU_0312
-#define LOAD_CHIMU_0312_A64FXd  \
-{ \
-    const SiteSpinor & ref(in[offset]); \
-asm ( \
-    "ptrue p5.d \n\t" \
-    "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
-    "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
-    "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
-    "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
-    "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
-    "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
-    "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
-    "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
-    "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
-    "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
-    "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
-    "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
-    :  \
-    : [fetchptr] "r" (&ref[2][0]) \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
-); \
-}
-// LOAD_TABLE0
-#define LOAD_TABLE0  \
-asm ( \
-    "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
-    :  \
-    : [tableptr] "r" (&lut[0]),[index] "i" (0) \
-    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// LOAD_TABLE1
-#define LOAD_TABLE1  \
-asm ( \
-    "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
-    :  \
-    : [tableptr] "r" (&lut[0]),[index] "i" (1) \
-    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// LOAD_TABLE2
-#define LOAD_TABLE2  \
-asm ( \
-    "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
-    :  \
-    : [tableptr] "r" (&lut[0]),[index] "i" (2) \
-    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// LOAD_TABLE3
-#define LOAD_TABLE3  \
-asm ( \
-    "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
-    :  \
-    : [tableptr] "r" (&lut[0]),[index] "i" (3) \
-    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// PERMUTE
-#define PERMUTE_A64FXd  \
-asm ( \
-    "tbl z12.d, { z12.d }, z30.d \n\t"  \
-    "tbl z13.d, { z13.d }, z30.d \n\t"  \
-    "tbl z14.d, { z14.d }, z30.d \n\t"  \
-    "tbl z15.d, { z15.d }, z30.d \n\t"  \
-    "tbl z16.d, { z16.d }, z30.d \n\t"  \
-    "tbl z17.d, { z17.d }, z30.d \n\t"  \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// LOAD_GAUGE
-#define LOAD_GAUGE  \
-    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
-{ \
-asm ( \
-    "ptrue p5.d \n\t" \
-    "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \
-    "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \
-    "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \
-    "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \
-    "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \
-    "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \
-    :  \
-    : [fetchptr] "r" (baseU + 2 * 3 * 64) \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
-); \
-}
-// MULT_2SPIN
-#define MULT_2SPIN_1_A64FXd(A)  \
-{ \
-    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
-asm ( \
-    "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \
-    "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \
-    "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \
-    "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \
-    "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \
-    "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \
-    "movprfx z18.d, p5/m, z31.d \n\t" \
-    "fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \
-    "movprfx z21.d, p5/m, z31.d \n\t" \
-    "fcmla z21.d, p5/m, z24.d, z15.d, 0 \n\t" \
-    "movprfx z19.d, p5/m, z31.d \n\t" \
-    "fcmla z19.d, p5/m, z25.d, z12.d, 0 \n\t" \
-    "movprfx z22.d, p5/m, z31.d \n\t" \
-    "fcmla z22.d, p5/m, z25.d, z15.d, 0 \n\t" \
-    "movprfx z20.d, p5/m, z31.d \n\t" \
-    "fcmla z20.d, p5/m, z26.d, z12.d, 0 \n\t" \
-    "movprfx z23.d, p5/m, z31.d \n\t" \
-    "fcmla z23.d, p5/m, z26.d, z15.d, 0 \n\t" \
-    "fcmla z18.d, p5/m, z24.d, z12.d, 90 \n\t" \
-    "fcmla z21.d, p5/m, z24.d, z15.d, 90 \n\t" \
-    "fcmla z19.d, p5/m, z25.d, z12.d, 90 \n\t" \
-    "fcmla z22.d, p5/m, z25.d, z15.d, 90 \n\t" \
-    "fcmla z20.d, p5/m, z26.d, z12.d, 90 \n\t" \
-    "fcmla z23.d, p5/m, z26.d, z15.d, 90 \n\t" \
-    "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \
-    "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \
-    "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \
-    :  \
-    : [fetchptr] "r" (baseU + 2 * 3 * 64) \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
-); \
-}
-// MULT_2SPIN_BACKEND
-#define MULT_2SPIN_2_A64FXd  \
-{ \
-asm ( \
-    "fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \
-    "fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \
-    "fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \
-    "fcmla z22.d, p5/m, z28.d, z16.d, 0 \n\t" \
-    "fcmla z20.d, p5/m, z29.d, z13.d, 0 \n\t" \
-    "fcmla z23.d, p5/m, z29.d, z16.d, 0 \n\t" \
-    "fcmla z18.d, p5/m, z27.d, z13.d, 90 \n\t" \
-    "fcmla z21.d, p5/m, z27.d, z16.d, 90 \n\t" \
-    "fcmla z19.d, p5/m, z28.d, z13.d, 90 \n\t" \
-    "fcmla z22.d, p5/m, z28.d, z16.d, 90 \n\t" \
-    "fcmla z20.d, p5/m, z29.d, z13.d, 90 \n\t" \
-    "fcmla z23.d, p5/m, z29.d, z16.d, 90 \n\t" \
-    "fcmla z18.d, p5/m, z24.d, z14.d, 0 \n\t" \
-    "fcmla z21.d, p5/m, z24.d, z17.d, 0 \n\t" \
-    "fcmla z19.d, p5/m, z25.d, z14.d, 0 \n\t" \
-    "fcmla z22.d, p5/m, z25.d, z17.d, 0 \n\t" \
-    "fcmla z20.d, p5/m, z26.d, z14.d, 0 \n\t" \
-    "fcmla z23.d, p5/m, z26.d, z17.d, 0 \n\t" \
-    "fcmla z18.d, p5/m, z24.d, z14.d, 90 \n\t" \
-    "fcmla z21.d, p5/m, z24.d, z17.d, 90 \n\t" \
-    "fcmla z19.d, p5/m, z25.d, z14.d, 90 \n\t" \
-    "fcmla z22.d, p5/m, z25.d, z17.d, 90 \n\t" \
-    "fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \
-    "fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); \
-}
-// XP_PROJ
-#define XP_PROJ_A64FXd  \
-{ \
-asm ( \
-    "fcadd z12.d, p5/m, z12.d, z21.d, 90 \n\t" \
-    "fcadd z13.d, p5/m, z13.d, z22.d, 90 \n\t" \
-    "fcadd z14.d, p5/m, z14.d, z23.d, 90 \n\t" \
-    "fcadd z15.d, p5/m, z15.d, z18.d, 90 \n\t" \
-    "fcadd z16.d, p5/m, z16.d, z19.d, 90 \n\t" \
-    "fcadd z17.d, p5/m, z17.d, z20.d, 90 \n\t" \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); \
-}
-// XP_RECON
-#define XP_RECON_A64FXd  \
-asm ( \
-    "movprfx z6.d, p5/m, z31.d \n\t" \
-    "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \
-    "movprfx z7.d, p5/m, z31.d \n\t" \
-    "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \
-    "movprfx z8.d, p5/m, z31.d \n\t" \
-    "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \
-    "movprfx z9.d, p5/m, z31.d \n\t" \
-    "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \
-    "movprfx z10.d, p5/m, z31.d \n\t" \
-    "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \
-    "movprfx z11.d, p5/m, z31.d \n\t" \
-    "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \
-    "mov z0.d, p5/m, z18.d \n\t" \
-    "mov z1.d, p5/m, z19.d \n\t" \
-    "mov z2.d, p5/m, z20.d \n\t" \
-    "mov z3.d, p5/m, z21.d \n\t" \
-    "mov z4.d, p5/m, z22.d \n\t" \
-    "mov z5.d, p5/m, z23.d \n\t" \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// XP_RECON_ACCUM
-#define XP_RECON_ACCUM_A64FXd  \
-asm ( \
-    "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \
-    "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \
-    "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \
-    "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \
-    "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \
-    "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \
-    "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \
-    "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \
-    "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \
-    "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \
-    "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \
-    "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// YP_PROJ
-#define YP_PROJ_A64FXd  \
-{ \
-asm ( \
-    "fsub z12.d, p5/m, z12.d, z21.d \n\t" \
-    "fsub z13.d, p5/m, z13.d, z22.d \n\t" \
-    "fsub z14.d, p5/m, z14.d, z23.d \n\t" \
-    "fadd z15.d, p5/m, z15.d, z18.d \n\t"  \
-    "fadd z16.d, p5/m, z16.d, z19.d \n\t"  \
-    "fadd z17.d, p5/m, z17.d, z20.d \n\t"  \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); \
-}
-// ZP_PROJ
-#define ZP_PROJ_A64FXd  \
-{ \
-asm ( \
-    "fcadd z12.d, p5/m, z12.d, z18.d, 90 \n\t" \
-    "fcadd z13.d, p5/m, z13.d, z19.d, 90 \n\t" \
-    "fcadd z14.d, p5/m, z14.d, z20.d, 90 \n\t" \
-    "fcadd z15.d, p5/m, z15.d, z21.d, 270 \n\t" \
-    "fcadd z16.d, p5/m, z16.d, z22.d, 270 \n\t" \
-    "fcadd z17.d, p5/m, z17.d, z23.d, 270 \n\t" \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); \
-}
-// TP_PROJ
-#define TP_PROJ_A64FXd  \
-{ \
-asm ( \
-    "fadd z12.d, p5/m, z12.d, z18.d \n\t"  \
-    "fadd z13.d, p5/m, z13.d, z19.d \n\t"  \
-    "fadd z14.d, p5/m, z14.d, z20.d \n\t"  \
-    "fadd z15.d, p5/m, z15.d, z21.d \n\t"  \
-    "fadd z16.d, p5/m, z16.d, z22.d \n\t"  \
-    "fadd z17.d, p5/m, z17.d, z23.d \n\t"  \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); \
-}
-// XM_PROJ
-#define XM_PROJ_A64FXd  \
-{ \
-asm ( \
-    "fcadd z12.d, p5/m, z12.d, z21.d, 270 \n\t" \
-    "fcadd z13.d, p5/m, z13.d, z22.d, 270 \n\t" \
-    "fcadd z14.d, p5/m, z14.d, z23.d, 270 \n\t" \
-    "fcadd z15.d, p5/m, z15.d, z18.d, 270 \n\t" \
-    "fcadd z16.d, p5/m, z16.d, z19.d, 270 \n\t" \
-    "fcadd z17.d, p5/m, z17.d, z20.d, 270 \n\t" \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); \
-}
-// XM_RECON
-#define XM_RECON_A64FXd  \
-asm ( \
-    "movprfx z6.d, p5/m, z31.d \n\t" \
-    "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \
-    "movprfx z7.d, p5/m, z31.d \n\t" \
-    "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \
-    "movprfx z8.d, p5/m, z31.d \n\t" \
-    "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \
-    "movprfx z9.d, p5/m, z31.d \n\t" \
-    "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \
-    "movprfx z10.d, p5/m, z31.d \n\t" \
-    "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \
-    "movprfx z11.d, p5/m, z31.d \n\t" \
-    "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \
-    "mov z0.d, p5/m, z18.d \n\t" \
-    "mov z1.d, p5/m, z19.d \n\t" \
-    "mov z2.d, p5/m, z20.d \n\t" \
-    "mov z3.d, p5/m, z21.d \n\t" \
-    "mov z4.d, p5/m, z22.d \n\t" \
-    "mov z5.d, p5/m, z23.d \n\t" \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// YM_PROJ
-#define YM_PROJ_A64FXd  \
-{ \
-asm ( \
-    "fadd z12.d, p5/m, z12.d, z21.d \n\t"  \
-    "fadd z13.d, p5/m, z13.d, z22.d \n\t"  \
-    "fadd z14.d, p5/m, z14.d, z23.d \n\t"  \
-    "fsub z15.d, p5/m, z15.d, z18.d \n\t" \
-    "fsub z16.d, p5/m, z16.d, z19.d \n\t" \
-    "fsub z17.d, p5/m, z17.d, z20.d \n\t" \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); \
-}
-// ZM_PROJ
-#define ZM_PROJ_A64FXd  \
-{ \
-asm ( \
-    "fcadd z12.d, p5/m, z12.d, z18.d, 270 \n\t" \
-    "fcadd z13.d, p5/m, z13.d, z19.d, 270 \n\t" \
-    "fcadd z14.d, p5/m, z14.d, z20.d, 270 \n\t" \
-    "fcadd z15.d, p5/m, z15.d, z21.d, 90 \n\t" \
-    "fcadd z16.d, p5/m, z16.d, z22.d, 90 \n\t" \
-    "fcadd z17.d, p5/m, z17.d, z23.d, 90 \n\t" \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); \
-}
-// TM_PROJ
-#define TM_PROJ_A64FXd  \
-{ \
-asm ( \
-    "ptrue p5.d \n\t" \
-    "fsub z12.d, p5/m, z12.d, z18.d \n\t" \
-    "fsub z13.d, p5/m, z13.d, z19.d \n\t" \
-    "fsub z14.d, p5/m, z14.d, z20.d \n\t" \
-    "fsub z15.d, p5/m, z15.d, z21.d \n\t" \
-    "fsub z16.d, p5/m, z16.d, z22.d \n\t" \
-    "fsub z17.d, p5/m, z17.d, z23.d \n\t" \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); \
-}
-// XM_RECON_ACCUM
-#define XM_RECON_ACCUM_A64FXd  \
-asm ( \
-    "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \
-    "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \
-    "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \
-    "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \
-    "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \
-    "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \
-    "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \
-    "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \
-    "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \
-    "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \
-    "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \
-    "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// YP_RECON_ACCUM
-#define YP_RECON_ACCUM_A64FXd  \
-asm ( \
-    "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \
-    "fsub z9.d, p5/m, z9.d, z18.d \n\t" \
-    "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \
-    "fsub z10.d, p5/m, z10.d, z19.d \n\t" \
-    "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \
-    "fsub z11.d, p5/m, z11.d, z20.d \n\t" \
-    "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \
-    "fadd z6.d, p5/m, z6.d, z21.d \n\t"  \
-    "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \
-    "fadd z7.d, p5/m, z7.d, z22.d \n\t"  \
-    "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \
-    "fadd z8.d, p5/m, z8.d, z23.d \n\t"  \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// YM_RECON_ACCUM
-#define YM_RECON_ACCUM_A64FXd  \
-asm ( \
-    "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \
-    "fadd z9.d, p5/m, z9.d, z18.d \n\t"  \
-    "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \
-    "fadd z10.d, p5/m, z10.d, z19.d \n\t"  \
-    "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \
-    "fadd z11.d, p5/m, z11.d, z20.d \n\t"  \
-    "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \
-    "fsub z6.d, p5/m, z6.d, z21.d \n\t" \
-    "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \
-    "fsub z7.d, p5/m, z7.d, z22.d \n\t" \
-    "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \
-    "fsub z8.d, p5/m, z8.d, z23.d \n\t" \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// ZP_RECON_ACCUM
-#define ZP_RECON_ACCUM_A64FXd  \
-asm ( \
-    "fcadd z6.d, p5/m, z6.d, z18.d, 270 \n\t" \
-    "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \
-    "fcadd z7.d, p5/m, z7.d, z19.d, 270 \n\t" \
-    "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \
-    "fcadd z8.d, p5/m, z8.d, z20.d, 270 \n\t" \
-    "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \
-    "fcadd z9.d, p5/m, z9.d, z21.d, 90 \n\t" \
-    "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \
-    "fcadd z10.d, p5/m, z10.d, z22.d, 90 \n\t" \
-    "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \
-    "fcadd z11.d, p5/m, z11.d, z23.d, 90 \n\t" \
-    "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// ZM_RECON_ACCUM
-#define ZM_RECON_ACCUM_A64FXd  \
-asm ( \
-    "fcadd z6.d, p5/m, z6.d, z18.d, 90 \n\t" \
-    "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \
-    "fcadd z7.d, p5/m, z7.d, z19.d, 90 \n\t" \
-    "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \
-    "fcadd z8.d, p5/m, z8.d, z20.d, 90 \n\t" \
-    "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \
-    "fcadd z9.d, p5/m, z9.d, z21.d, 270 \n\t" \
-    "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \
-    "fcadd z10.d, p5/m, z10.d, z22.d, 270 \n\t" \
-    "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \
-    "fcadd z11.d, p5/m, z11.d, z23.d, 270 \n\t" \
-    "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// TP_RECON_ACCUM
-#define TP_RECON_ACCUM_A64FXd  \
-asm ( \
-    "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \
-    "fadd z6.d, p5/m, z6.d, z18.d \n\t"  \
-    "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \
-    "fadd z7.d, p5/m, z7.d, z19.d \n\t"  \
-    "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \
-    "fadd z8.d, p5/m, z8.d, z20.d \n\t"  \
-    "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \
-    "fadd z9.d, p5/m, z9.d, z21.d \n\t"  \
-    "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \
-    "fadd z10.d, p5/m, z10.d, z22.d \n\t"  \
-    "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \
-    "fadd z11.d, p5/m, z11.d, z23.d \n\t"  \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// TM_RECON_ACCUM
-#define TM_RECON_ACCUM_A64FXd  \
-asm ( \
-    "fadd z0.d, p5/m, z0.d, z18.d \n\t"  \
-    "fsub z6.d, p5/m, z6.d, z18.d \n\t" \
-    "fadd z1.d, p5/m, z1.d, z19.d \n\t"  \
-    "fsub z7.d, p5/m, z7.d, z19.d \n\t" \
-    "fadd z2.d, p5/m, z2.d, z20.d \n\t"  \
-    "fsub z8.d, p5/m, z8.d, z20.d \n\t" \
-    "fadd z3.d, p5/m, z3.d, z21.d \n\t"  \
-    "fsub z9.d, p5/m, z9.d, z21.d \n\t" \
-    "fadd z4.d, p5/m, z4.d, z22.d \n\t"  \
-    "fsub z10.d, p5/m, z10.d, z22.d \n\t" \
-    "fadd z5.d, p5/m, z5.d, z23.d \n\t"  \
-    "fsub z11.d, p5/m, z11.d, z23.d \n\t" \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// ZERO_PSI
-#define ZERO_PSI_A64FXd  \
-asm ( \
-    "ptrue p5.d \n\t" \
-    "fmov z0.d , 0 \n\t" \
-    "fmov z1.d , 0 \n\t" \
-    "fmov z2.d , 0 \n\t" \
-    "fmov z3.d , 0 \n\t" \
-    "fmov z4.d , 0 \n\t" \
-    "fmov z5.d , 0 \n\t" \
-    "fmov z6.d , 0 \n\t" \
-    "fmov z7.d , 0 \n\t" \
-    "fmov z8.d , 0 \n\t" \
-    "fmov z9.d , 0 \n\t" \
-    "fmov z10.d , 0 \n\t" \
-    "fmov z11.d , 0 \n\t" \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
-#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base)  \
-{ \
-asm ( \
-    "prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
-    "prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
-    "prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
-    :  \
-    : [fetchptr] "r" (base) \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
-); \
-}
-// PREFETCH_RESULT_L1_STORE (prefetch store to L1)
-#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base)  \
-{ \
-asm ( \
-    "prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
-    "prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
-    "prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
-    :  \
-    : [fetchptr] "r" (base) \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
-); \
-}
-// ADD_RESULT_INTERNAL
-#define ADD_RESULT_INTERNAL_A64FXd  \
-asm ( \
-    "fadd z0.d, p5/m, z0.d, z12.d \n\t"  \
-    "fadd z1.d, p5/m, z1.d, z13.d \n\t"  \
-    "fadd z2.d, p5/m, z2.d, z14.d \n\t"  \
-    "fadd z3.d, p5/m, z3.d, z15.d \n\t"  \
-    "fadd z4.d, p5/m, z4.d, z16.d \n\t"  \
-    "fadd z5.d, p5/m, z5.d, z17.d \n\t"  \
-    "fadd z6.d, p5/m, z6.d, z18.d \n\t"  \
-    "fadd z7.d, p5/m, z7.d, z19.d \n\t"  \
-    "fadd z8.d, p5/m, z8.d, z20.d \n\t"  \
-    "fadd z9.d, p5/m, z9.d, z21.d \n\t"  \
-    "fadd z10.d, p5/m, z10.d, z22.d \n\t"  \
-    "fadd z11.d, p5/m, z11.d, z23.d \n\t"  \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
--- a/Grid/simd/Fujitsu_A64FX_asm_single.h
+++ b/Grid/simd/Fujitsu_A64FX_asm_single.h
@@ -1,779 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: Fujitsu_A64FX_asm_single.h
-
-    Copyright (C) 2020
-
-Author: Nils Meyer <nils.meyer@ur.de>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#define LOAD_CHIMU(base)               LOAD_CHIMU_INTERLEAVED_A64FXf(base)  
-#define PREFETCH_CHIMU_L1(A)           PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A)  
-#define PREFETCH_GAUGE_L1(A)           PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)  
-#define PREFETCH_CHIMU_L2(A)           PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A)  
-#define PREFETCH_GAUGE_L2(A)           PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A)  
-#define PF_GAUGE(A)  
-#define PREFETCH_RESULT_L2_STORE(A)    PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(A)  
-#define PREFETCH_RESULT_L1_STORE(A)    PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(A)  
-#define PREFETCH1_CHIMU(A)             PREFETCH_CHIMU_L1(A)  
-#define PREFETCH_CHIMU(A)              PREFETCH_CHIMU_L1(A)  
-#define LOCK_GAUGE(A)  
-#define UNLOCK_GAUGE(A)  
-#define MASK_REGS                      DECLARATIONS_A64FXf  
-#define SAVE_RESULT(A,B)               RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B)  
-#define MULT_2SPIN_1(Dir)              MULT_2SPIN_1_A64FXf(Dir)  
-#define MULT_2SPIN_2                   MULT_2SPIN_2_A64FXf  
-#define LOAD_CHI(base)                 LOAD_CHI_A64FXf(base)  
-#define ADD_RESULT(base,basep)         LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)  
-#define XP_PROJ                        XP_PROJ_A64FXf  
-#define YP_PROJ                        YP_PROJ_A64FXf  
-#define ZP_PROJ                        ZP_PROJ_A64FXf  
-#define TP_PROJ                        TP_PROJ_A64FXf  
-#define XM_PROJ                        XM_PROJ_A64FXf  
-#define YM_PROJ                        YM_PROJ_A64FXf  
-#define ZM_PROJ                        ZM_PROJ_A64FXf  
-#define TM_PROJ                        TM_PROJ_A64FXf  
-#define XP_RECON                       XP_RECON_A64FXf  
-#define XM_RECON                       XM_RECON_A64FXf  
-#define XM_RECON_ACCUM                 XM_RECON_ACCUM_A64FXf  
-#define YM_RECON_ACCUM                 YM_RECON_ACCUM_A64FXf  
-#define ZM_RECON_ACCUM                 ZM_RECON_ACCUM_A64FXf  
-#define TM_RECON_ACCUM                 TM_RECON_ACCUM_A64FXf  
-#define XP_RECON_ACCUM                 XP_RECON_ACCUM_A64FXf  
-#define YP_RECON_ACCUM                 YP_RECON_ACCUM_A64FXf  
-#define ZP_RECON_ACCUM                 ZP_RECON_ACCUM_A64FXf  
-#define TP_RECON_ACCUM                 TP_RECON_ACCUM_A64FXf  
-#define PERMUTE_DIR0                   0  
-#define PERMUTE_DIR1                   1  
-#define PERMUTE_DIR2                   2  
-#define PERMUTE_DIR3                   3  
-#define PERMUTE                        PERMUTE_A64FXf;  
-#define LOAD_TABLE(Dir)                if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1 } else if (Dir == 2) { LOAD_TABLE2; } else if (Dir == 3) { LOAD_TABLE3; }  
-#define MAYBEPERM(A,perm)              if (perm) { PERMUTE; }  
-// DECLARATIONS
-#define DECLARATIONS_A64FXf  \
-    const uint32_t lut[4][16] = { \
-        {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \
-        {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \
-        {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \
-        {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \
-asm ( \
-    "fmov z31.s , 0 \n\t" \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// RESULT
-#define RESULT_A64FXf(base)  \
-{ \
-asm ( \
-    "str z0, [%[storeptr], -6, mul vl] \n\t" \
-    "str z1, [%[storeptr], -5, mul vl] \n\t" \
-    "str z2, [%[storeptr], -4, mul vl] \n\t" \
-    "str z3, [%[storeptr], -3, mul vl] \n\t" \
-    "str z4, [%[storeptr], -2, mul vl] \n\t" \
-    "str z5, [%[storeptr], -1, mul vl] \n\t" \
-    "str z6, [%[storeptr], 0, mul vl] \n\t" \
-    "str z7, [%[storeptr], 1, mul vl] \n\t" \
-    "str z8, [%[storeptr], 2, mul vl] \n\t" \
-    "str z9, [%[storeptr], 3, mul vl] \n\t" \
-    "str z10, [%[storeptr], 4, mul vl] \n\t" \
-    "str z11, [%[storeptr], 5, mul vl] \n\t" \
-    :  \
-    : [storeptr] "r" (base + 2 * 3 * 64) \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
-); \
-}
-// PREFETCH_CHIMU_L2 (prefetch to L2)
-#define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base)  \
-{ \
-asm ( \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
-    :  \
-    : [fetchptr] "r" (base) \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
-); \
-}
-// PREFETCH_CHIMU_L1 (prefetch to L1)
-#define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base)  \
-{ \
-asm ( \
-    "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
-    "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
-    "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
-    :  \
-    : [fetchptr] "r" (base) \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
-); \
-}
-// PREFETCH_GAUGE_L2 (prefetch to L2)
-#define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A)  \
-{ \
-    const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \
-asm ( \
-    "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \
-    "prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \
-    :  \
-    : [fetchptr] "r" (baseU) \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
-); \
-}
-// PREFETCH_GAUGE_L1 (prefetch to L1)
-#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)  \
-{ \
-    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
-asm ( \
-    "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
-    "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
-    "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
-    :  \
-    : [fetchptr] "r" (baseU) \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
-); \
-}
-// LOAD_CHI
-#define LOAD_CHI_A64FXf(base)  \
-{ \
-asm ( \
-    "ldr z12, [%[fetchptr], 0, mul vl] \n\t" \
-    "ldr z13, [%[fetchptr], 1, mul vl] \n\t" \
-    "ldr z14, [%[fetchptr], 2, mul vl] \n\t" \
-    "ldr z15, [%[fetchptr], 3, mul vl] \n\t" \
-    "ldr z16, [%[fetchptr], 4, mul vl] \n\t" \
-    "ldr z17, [%[fetchptr], 5, mul vl] \n\t" \
-    :  \
-    : [fetchptr] "r" (base) \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
-); \
-}
-// LOAD_CHIMU
-#define LOAD_CHIMU_INTERLEAVED_A64FXf(base)  \
-{ \
-asm ( \
-    "ptrue p5.s \n\t" \
-    "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
-    "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
-    "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
-    "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
-    "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
-    "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
-    "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
-    "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
-    "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
-    "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
-    "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
-    "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
-    :  \
-    : [fetchptr] "r" (base + 2 * 3 * 64) \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
-); \
-}
-// LOAD_CHIMU_0213
-#define LOAD_CHIMU_0213_A64FXf  \
-{ \
-    const SiteSpinor & ref(in[offset]); \
-asm ( \
-    "ptrue p5.s \n\t" \
-    "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
-    "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
-    "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
-    "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
-    "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
-    "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
-    "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
-    "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
-    "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
-    "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
-    "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
-    "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
-    :  \
-    : [fetchptr] "r" (&ref[2][0]) \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
-); \
-}
-// LOAD_CHIMU_0312
-#define LOAD_CHIMU_0312_A64FXf  \
-{ \
-    const SiteSpinor & ref(in[offset]); \
-asm ( \
-    "ptrue p5.s \n\t" \
-    "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \
-    "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \
-    "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \
-    "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \
-    "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \
-    "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \
-    "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \
-    "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \
-    "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \
-    "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \
-    "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \
-    "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \
-    :  \
-    : [fetchptr] "r" (&ref[2][0]) \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
-); \
-}
-// LOAD_TABLE0
-#define LOAD_TABLE0  \
-asm ( \
-    "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
-    :  \
-    : [tableptr] "r" (&lut[0]),[index] "i" (0) \
-    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// LOAD_TABLE1
-#define LOAD_TABLE1  \
-asm ( \
-    "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
-    :  \
-    : [tableptr] "r" (&lut[0]),[index] "i" (1) \
-    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// LOAD_TABLE2
-#define LOAD_TABLE2  \
-asm ( \
-    "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
-    :  \
-    : [tableptr] "r" (&lut[0]),[index] "i" (2) \
-    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// LOAD_TABLE3
-#define LOAD_TABLE3  \
-asm ( \
-    "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \
-    :  \
-    : [tableptr] "r" (&lut[0]),[index] "i" (3) \
-    : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// PERMUTE
-#define PERMUTE_A64FXf  \
-asm ( \
-    "tbl z12.s, { z12.s }, z30.s \n\t"  \
-    "tbl z13.s, { z13.s }, z30.s \n\t"  \
-    "tbl z14.s, { z14.s }, z30.s \n\t"  \
-    "tbl z15.s, { z15.s }, z30.s \n\t"  \
-    "tbl z16.s, { z16.s }, z30.s \n\t"  \
-    "tbl z17.s, { z17.s }, z30.s \n\t"  \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// LOAD_GAUGE
-#define LOAD_GAUGE  \
-    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
-{ \
-asm ( \
-    "ptrue p5.s \n\t" \
-    "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \
-    "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \
-    "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \
-    "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \
-    "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \
-    "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \
-    :  \
-    : [fetchptr] "r" (baseU + 2 * 3 * 64) \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
-); \
-}
-// MULT_2SPIN
-#define MULT_2SPIN_1_A64FXf(A)  \
-{ \
-    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
-asm ( \
-    "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \
-    "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \
-    "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \
-    "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \
-    "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \
-    "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \
-    "movprfx z18.s, p5/m, z31.s \n\t" \
-    "fcmla z18.s, p5/m, z24.s, z12.s, 0 \n\t" \
-    "movprfx z21.s, p5/m, z31.s \n\t" \
-    "fcmla z21.s, p5/m, z24.s, z15.s, 0 \n\t" \
-    "movprfx z19.s, p5/m, z31.s \n\t" \
-    "fcmla z19.s, p5/m, z25.s, z12.s, 0 \n\t" \
-    "movprfx z22.s, p5/m, z31.s \n\t" \
-    "fcmla z22.s, p5/m, z25.s, z15.s, 0 \n\t" \
-    "movprfx z20.s, p5/m, z31.s \n\t" \
-    "fcmla z20.s, p5/m, z26.s, z12.s, 0 \n\t" \
-    "movprfx z23.s, p5/m, z31.s \n\t" \
-    "fcmla z23.s, p5/m, z26.s, z15.s, 0 \n\t" \
-    "fcmla z18.s, p5/m, z24.s, z12.s, 90 \n\t" \
-    "fcmla z21.s, p5/m, z24.s, z15.s, 90 \n\t" \
-    "fcmla z19.s, p5/m, z25.s, z12.s, 90 \n\t" \
-    "fcmla z22.s, p5/m, z25.s, z15.s, 90 \n\t" \
-    "fcmla z20.s, p5/m, z26.s, z12.s, 90 \n\t" \
-    "fcmla z23.s, p5/m, z26.s, z15.s, 90 \n\t" \
-    "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \
-    "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \
-    "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \
-    :  \
-    : [fetchptr] "r" (baseU + 2 * 3 * 64) \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
-); \
-}
-// MULT_2SPIN_BACKEND
-#define MULT_2SPIN_2_A64FXf  \
-{ \
-asm ( \
-    "fcmla z18.s, p5/m, z27.s, z13.s, 0 \n\t" \
-    "fcmla z21.s, p5/m, z27.s, z16.s, 0 \n\t" \
-    "fcmla z19.s, p5/m, z28.s, z13.s, 0 \n\t" \
-    "fcmla z22.s, p5/m, z28.s, z16.s, 0 \n\t" \
-    "fcmla z20.s, p5/m, z29.s, z13.s, 0 \n\t" \
-    "fcmla z23.s, p5/m, z29.s, z16.s, 0 \n\t" \
-    "fcmla z18.s, p5/m, z27.s, z13.s, 90 \n\t" \
-    "fcmla z21.s, p5/m, z27.s, z16.s, 90 \n\t" \
-    "fcmla z19.s, p5/m, z28.s, z13.s, 90 \n\t" \
-    "fcmla z22.s, p5/m, z28.s, z16.s, 90 \n\t" \
-    "fcmla z20.s, p5/m, z29.s, z13.s, 90 \n\t" \
-    "fcmla z23.s, p5/m, z29.s, z16.s, 90 \n\t" \
-    "fcmla z18.s, p5/m, z24.s, z14.s, 0 \n\t" \
-    "fcmla z21.s, p5/m, z24.s, z17.s, 0 \n\t" \
-    "fcmla z19.s, p5/m, z25.s, z14.s, 0 \n\t" \
-    "fcmla z22.s, p5/m, z25.s, z17.s, 0 \n\t" \
-    "fcmla z20.s, p5/m, z26.s, z14.s, 0 \n\t" \
-    "fcmla z23.s, p5/m, z26.s, z17.s, 0 \n\t" \
-    "fcmla z18.s, p5/m, z24.s, z14.s, 90 \n\t" \
-    "fcmla z21.s, p5/m, z24.s, z17.s, 90 \n\t" \
-    "fcmla z19.s, p5/m, z25.s, z14.s, 90 \n\t" \
-    "fcmla z22.s, p5/m, z25.s, z17.s, 90 \n\t" \
-    "fcmla z20.s, p5/m, z26.s, z14.s, 90 \n\t" \
-    "fcmla z23.s, p5/m, z26.s, z17.s, 90 \n\t" \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); \
-}
-// XP_PROJ
-#define XP_PROJ_A64FXf  \
-{ \
-asm ( \
-    "fcadd z12.s, p5/m, z12.s, z21.s, 90 \n\t" \
-    "fcadd z13.s, p5/m, z13.s, z22.s, 90 \n\t" \
-    "fcadd z14.s, p5/m, z14.s, z23.s, 90 \n\t" \
-    "fcadd z15.s, p5/m, z15.s, z18.s, 90 \n\t" \
-    "fcadd z16.s, p5/m, z16.s, z19.s, 90 \n\t" \
-    "fcadd z17.s, p5/m, z17.s, z20.s, 90 \n\t" \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); \
-}
-// XP_RECON
-#define XP_RECON_A64FXf  \
-asm ( \
-    "movprfx z6.s, p5/m, z31.s \n\t" \
-    "fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \
-    "movprfx z7.s, p5/m, z31.s \n\t" \
-    "fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \
-    "movprfx z8.s, p5/m, z31.s \n\t" \
-    "fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \
-    "movprfx z9.s, p5/m, z31.s \n\t" \
-    "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \
-    "movprfx z10.s, p5/m, z31.s \n\t" \
-    "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \
-    "movprfx z11.s, p5/m, z31.s \n\t" \
-    "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \
-    "mov z0.s, p5/m, z18.s \n\t" \
-    "mov z1.s, p5/m, z19.s \n\t" \
-    "mov z2.s, p5/m, z20.s \n\t" \
-    "mov z3.s, p5/m, z21.s \n\t" \
-    "mov z4.s, p5/m, z22.s \n\t" \
-    "mov z5.s, p5/m, z23.s \n\t" \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// XP_RECON_ACCUM
-#define XP_RECON_ACCUM_A64FXf  \
-asm ( \
-    "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \
-    "fadd z0.s, p5/m, z0.s, z18.s \n\t"  \
-    "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \
-    "fadd z1.s, p5/m, z1.s, z19.s \n\t"  \
-    "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \
-    "fadd z2.s, p5/m, z2.s, z20.s \n\t"  \
-    "fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \
-    "fadd z3.s, p5/m, z3.s, z21.s \n\t"  \
-    "fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \
-    "fadd z4.s, p5/m, z4.s, z22.s \n\t"  \
-    "fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \
-    "fadd z5.s, p5/m, z5.s, z23.s \n\t"  \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// YP_PROJ
-#define YP_PROJ_A64FXf  \
-{ \
-asm ( \
-    "fsub z12.s, p5/m, z12.s, z21.s \n\t" \
-    "fsub z13.s, p5/m, z13.s, z22.s \n\t" \
-    "fsub z14.s, p5/m, z14.s, z23.s \n\t" \
-    "fadd z15.s, p5/m, z15.s, z18.s \n\t"  \
-    "fadd z16.s, p5/m, z16.s, z19.s \n\t"  \
-    "fadd z17.s, p5/m, z17.s, z20.s \n\t"  \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); \
-}
-// ZP_PROJ
-#define ZP_PROJ_A64FXf  \
-{ \
-asm ( \
-    "fcadd z12.s, p5/m, z12.s, z18.s, 90 \n\t" \
-    "fcadd z13.s, p5/m, z13.s, z19.s, 90 \n\t" \
-    "fcadd z14.s, p5/m, z14.s, z20.s, 90 \n\t" \
-    "fcadd z15.s, p5/m, z15.s, z21.s, 270 \n\t" \
-    "fcadd z16.s, p5/m, z16.s, z22.s, 270 \n\t" \
-    "fcadd z17.s, p5/m, z17.s, z23.s, 270 \n\t" \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); \
-}
-// TP_PROJ
-#define TP_PROJ_A64FXf  \
-{ \
-asm ( \
-    "fadd z12.s, p5/m, z12.s, z18.s \n\t"  \
-    "fadd z13.s, p5/m, z13.s, z19.s \n\t"  \
-    "fadd z14.s, p5/m, z14.s, z20.s \n\t"  \
-    "fadd z15.s, p5/m, z15.s, z21.s \n\t"  \
-    "fadd z16.s, p5/m, z16.s, z22.s \n\t"  \
-    "fadd z17.s, p5/m, z17.s, z23.s \n\t"  \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); \
-}
-// XM_PROJ
-#define XM_PROJ_A64FXf  \
-{ \
-asm ( \
-    "fcadd z12.s, p5/m, z12.s, z21.s, 270 \n\t" \
-    "fcadd z13.s, p5/m, z13.s, z22.s, 270 \n\t" \
-    "fcadd z14.s, p5/m, z14.s, z23.s, 270 \n\t" \
-    "fcadd z15.s, p5/m, z15.s, z18.s, 270 \n\t" \
-    "fcadd z16.s, p5/m, z16.s, z19.s, 270 \n\t" \
-    "fcadd z17.s, p5/m, z17.s, z20.s, 270 \n\t" \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); \
-}
-// XM_RECON
-#define XM_RECON_A64FXf  \
-asm ( \
-    "movprfx z6.s, p5/m, z31.s \n\t" \
-    "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \
-    "movprfx z7.s, p5/m, z31.s \n\t" \
-    "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \
-    "movprfx z8.s, p5/m, z31.s \n\t" \
-    "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \
-    "movprfx z9.s, p5/m, z31.s \n\t" \
-    "fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \
-    "movprfx z10.s, p5/m, z31.s \n\t" \
-    "fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \
-    "movprfx z11.s, p5/m, z31.s \n\t" \
-    "fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \
-    "mov z0.s, p5/m, z18.s \n\t" \
-    "mov z1.s, p5/m, z19.s \n\t" \
-    "mov z2.s, p5/m, z20.s \n\t" \
-    "mov z3.s, p5/m, z21.s \n\t" \
-    "mov z4.s, p5/m, z22.s \n\t" \
-    "mov z5.s, p5/m, z23.s \n\t" \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// YM_PROJ
-#define YM_PROJ_A64FXf  \
-{ \
-asm ( \
-    "fadd z12.s, p5/m, z12.s, z21.s \n\t"  \
-    "fadd z13.s, p5/m, z13.s, z22.s \n\t"  \
-    "fadd z14.s, p5/m, z14.s, z23.s \n\t"  \
-    "fsub z15.s, p5/m, z15.s, z18.s \n\t" \
-    "fsub z16.s, p5/m, z16.s, z19.s \n\t" \
-    "fsub z17.s, p5/m, z17.s, z20.s \n\t" \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); \
-}
-// ZM_PROJ
-#define ZM_PROJ_A64FXf  \
-{ \
-asm ( \
-    "fcadd z12.s, p5/m, z12.s, z18.s, 270 \n\t" \
-    "fcadd z13.s, p5/m, z13.s, z19.s, 270 \n\t" \
-    "fcadd z14.s, p5/m, z14.s, z20.s, 270 \n\t" \
-    "fcadd z15.s, p5/m, z15.s, z21.s, 90 \n\t" \
-    "fcadd z16.s, p5/m, z16.s, z22.s, 90 \n\t" \
-    "fcadd z17.s, p5/m, z17.s, z23.s, 90 \n\t" \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); \
-}
-// TM_PROJ
-#define TM_PROJ_A64FXf  \
-{ \
-asm ( \
-    "ptrue p5.s \n\t" \
-    "fsub z12.s, p5/m, z12.s, z18.s \n\t" \
-    "fsub z13.s, p5/m, z13.s, z19.s \n\t" \
-    "fsub z14.s, p5/m, z14.s, z20.s \n\t" \
-    "fsub z15.s, p5/m, z15.s, z21.s \n\t" \
-    "fsub z16.s, p5/m, z16.s, z22.s \n\t" \
-    "fsub z17.s, p5/m, z17.s, z23.s \n\t" \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); \
-}
-// XM_RECON_ACCUM
-#define XM_RECON_ACCUM_A64FXf  \
-asm ( \
-    "fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \
-    "fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \
-    "fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \
-    "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \
-    "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \
-    "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \
-    "fadd z0.s, p5/m, z0.s, z18.s \n\t"  \
-    "fadd z1.s, p5/m, z1.s, z19.s \n\t"  \
-    "fadd z2.s, p5/m, z2.s, z20.s \n\t"  \
-    "fadd z3.s, p5/m, z3.s, z21.s \n\t"  \
-    "fadd z4.s, p5/m, z4.s, z22.s \n\t"  \
-    "fadd z5.s, p5/m, z5.s, z23.s \n\t"  \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// YP_RECON_ACCUM
-#define YP_RECON_ACCUM_A64FXf  \
-asm ( \
-    "fadd z0.s, p5/m, z0.s, z18.s \n\t"  \
-    "fsub z9.s, p5/m, z9.s, z18.s \n\t" \
-    "fadd z1.s, p5/m, z1.s, z19.s \n\t"  \
-    "fsub z10.s, p5/m, z10.s, z19.s \n\t" \
-    "fadd z2.s, p5/m, z2.s, z20.s \n\t"  \
-    "fsub z11.s, p5/m, z11.s, z20.s \n\t" \
-    "fadd z3.s, p5/m, z3.s, z21.s \n\t"  \
-    "fadd z6.s, p5/m, z6.s, z21.s \n\t"  \
-    "fadd z4.s, p5/m, z4.s, z22.s \n\t"  \
-    "fadd z7.s, p5/m, z7.s, z22.s \n\t"  \
-    "fadd z5.s, p5/m, z5.s, z23.s \n\t"  \
-    "fadd z8.s, p5/m, z8.s, z23.s \n\t"  \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// YM_RECON_ACCUM
-#define YM_RECON_ACCUM_A64FXf  \
-asm ( \
-    "fadd z0.s, p5/m, z0.s, z18.s \n\t"  \
-    "fadd z9.s, p5/m, z9.s, z18.s \n\t"  \
-    "fadd z1.s, p5/m, z1.s, z19.s \n\t"  \
-    "fadd z10.s, p5/m, z10.s, z19.s \n\t"  \
-    "fadd z2.s, p5/m, z2.s, z20.s \n\t"  \
-    "fadd z11.s, p5/m, z11.s, z20.s \n\t"  \
-    "fadd z3.s, p5/m, z3.s, z21.s \n\t"  \
-    "fsub z6.s, p5/m, z6.s, z21.s \n\t" \
-    "fadd z4.s, p5/m, z4.s, z22.s \n\t"  \
-    "fsub z7.s, p5/m, z7.s, z22.s \n\t" \
-    "fadd z5.s, p5/m, z5.s, z23.s \n\t"  \
-    "fsub z8.s, p5/m, z8.s, z23.s \n\t" \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// ZP_RECON_ACCUM
-#define ZP_RECON_ACCUM_A64FXf  \
-asm ( \
-    "fcadd z6.s, p5/m, z6.s, z18.s, 270 \n\t" \
-    "fadd z0.s, p5/m, z0.s, z18.s \n\t"  \
-    "fcadd z7.s, p5/m, z7.s, z19.s, 270 \n\t" \
-    "fadd z1.s, p5/m, z1.s, z19.s \n\t"  \
-    "fcadd z8.s, p5/m, z8.s, z20.s, 270 \n\t" \
-    "fadd z2.s, p5/m, z2.s, z20.s \n\t"  \
-    "fcadd z9.s, p5/m, z9.s, z21.s, 90 \n\t" \
-    "fadd z3.s, p5/m, z3.s, z21.s \n\t"  \
-    "fcadd z10.s, p5/m, z10.s, z22.s, 90 \n\t" \
-    "fadd z4.s, p5/m, z4.s, z22.s \n\t"  \
-    "fcadd z11.s, p5/m, z11.s, z23.s, 90 \n\t" \
-    "fadd z5.s, p5/m, z5.s, z23.s \n\t"  \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// ZM_RECON_ACCUM
-#define ZM_RECON_ACCUM_A64FXf  \
-asm ( \
-    "fcadd z6.s, p5/m, z6.s, z18.s, 90 \n\t" \
-    "fadd z0.s, p5/m, z0.s, z18.s \n\t"  \
-    "fcadd z7.s, p5/m, z7.s, z19.s, 90 \n\t" \
-    "fadd z1.s, p5/m, z1.s, z19.s \n\t"  \
-    "fcadd z8.s, p5/m, z8.s, z20.s, 90 \n\t" \
-    "fadd z2.s, p5/m, z2.s, z20.s \n\t"  \
-    "fcadd z9.s, p5/m, z9.s, z21.s, 270 \n\t" \
-    "fadd z3.s, p5/m, z3.s, z21.s \n\t"  \
-    "fcadd z10.s, p5/m, z10.s, z22.s, 270 \n\t" \
-    "fadd z4.s, p5/m, z4.s, z22.s \n\t"  \
-    "fcadd z11.s, p5/m, z11.s, z23.s, 270 \n\t" \
-    "fadd z5.s, p5/m, z5.s, z23.s \n\t"  \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// TP_RECON_ACCUM
-#define TP_RECON_ACCUM_A64FXf  \
-asm ( \
-    "fadd z0.s, p5/m, z0.s, z18.s \n\t"  \
-    "fadd z6.s, p5/m, z6.s, z18.s \n\t"  \
-    "fadd z1.s, p5/m, z1.s, z19.s \n\t"  \
-    "fadd z7.s, p5/m, z7.s, z19.s \n\t"  \
-    "fadd z2.s, p5/m, z2.s, z20.s \n\t"  \
-    "fadd z8.s, p5/m, z8.s, z20.s \n\t"  \
-    "fadd z3.s, p5/m, z3.s, z21.s \n\t"  \
-    "fadd z9.s, p5/m, z9.s, z21.s \n\t"  \
-    "fadd z4.s, p5/m, z4.s, z22.s \n\t"  \
-    "fadd z10.s, p5/m, z10.s, z22.s \n\t"  \
-    "fadd z5.s, p5/m, z5.s, z23.s \n\t"  \
-    "fadd z11.s, p5/m, z11.s, z23.s \n\t"  \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// TM_RECON_ACCUM
-#define TM_RECON_ACCUM_A64FXf  \
-asm ( \
-    "fadd z0.s, p5/m, z0.s, z18.s \n\t"  \
-    "fsub z6.s, p5/m, z6.s, z18.s \n\t" \
-    "fadd z1.s, p5/m, z1.s, z19.s \n\t"  \
-    "fsub z7.s, p5/m, z7.s, z19.s \n\t" \
-    "fadd z2.s, p5/m, z2.s, z20.s \n\t"  \
-    "fsub z8.s, p5/m, z8.s, z20.s \n\t" \
-    "fadd z3.s, p5/m, z3.s, z21.s \n\t"  \
-    "fsub z9.s, p5/m, z9.s, z21.s \n\t" \
-    "fadd z4.s, p5/m, z4.s, z22.s \n\t"  \
-    "fsub z10.s, p5/m, z10.s, z22.s \n\t" \
-    "fadd z5.s, p5/m, z5.s, z23.s \n\t"  \
-    "fsub z11.s, p5/m, z11.s, z23.s \n\t" \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// ZERO_PSI
-#define ZERO_PSI_A64FXf  \
-asm ( \
-    "ptrue p5.s \n\t" \
-    "fmov z0.s , 0 \n\t" \
-    "fmov z1.s , 0 \n\t" \
-    "fmov z2.s , 0 \n\t" \
-    "fmov z3.s , 0 \n\t" \
-    "fmov z4.s , 0 \n\t" \
-    "fmov z5.s , 0 \n\t" \
-    "fmov z6.s , 0 \n\t" \
-    "fmov z7.s , 0 \n\t" \
-    "fmov z8.s , 0 \n\t" \
-    "fmov z9.s , 0 \n\t" \
-    "fmov z10.s , 0 \n\t" \
-    "fmov z11.s , 0 \n\t" \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
-// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
-#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base)  \
-{ \
-asm ( \
-    "prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
-    "prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
-    "prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
-    :  \
-    : [fetchptr] "r" (base) \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
-); \
-}
-// PREFETCH_RESULT_L1_STORE (prefetch store to L1)
-#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base)  \
-{ \
-asm ( \
-    "prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \
-    "prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \
-    "prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \
-    :  \
-    : [fetchptr] "r" (base) \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \
-); \
-}
-// ADD_RESULT_INTERNAL
-#define ADD_RESULT_INTERNAL_A64FXf  \
-asm ( \
-    "fadd z0.s, p5/m, z0.s, z12.s \n\t"  \
-    "fadd z1.s, p5/m, z1.s, z13.s \n\t"  \
-    "fadd z2.s, p5/m, z2.s, z14.s \n\t"  \
-    "fadd z3.s, p5/m, z3.s, z15.s \n\t"  \
-    "fadd z4.s, p5/m, z4.s, z16.s \n\t"  \
-    "fadd z5.s, p5/m, z5.s, z17.s \n\t"  \
-    "fadd z6.s, p5/m, z6.s, z18.s \n\t"  \
-    "fadd z7.s, p5/m, z7.s, z19.s \n\t"  \
-    "fadd z8.s, p5/m, z8.s, z20.s \n\t"  \
-    "fadd z9.s, p5/m, z9.s, z21.s \n\t"  \
-    "fadd z10.s, p5/m, z10.s, z22.s \n\t"  \
-    "fadd z11.s, p5/m, z11.s, z23.s \n\t"  \
-    :  \
-    :  \
-    : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \
-); 
-
--- a/Grid/simd/Fujitsu_A64FX_intrin_double.h
+++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h
@@ -38,10 +38,11 @@ Author: Nils Meyer <nils.meyer@ur.de>
 #define LOCK_GAUGE(A)  
 #define UNLOCK_GAUGE(A)  
 #define MASK_REGS                      DECLARATIONS_A64FXd  
-#define SAVE_RESULT(A,B)               RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B)  
+#define SAVE_RESULT(A,B)               RESULT_A64FXd(A);  
 #define MULT_2SPIN_1(Dir)              MULT_2SPIN_1_A64FXd(Dir)  
 #define MULT_2SPIN_2                   MULT_2SPIN_2_A64FXd  
 #define LOAD_CHI(base)                 LOAD_CHI_A64FXd(base)  
+#define ZERO_PSI                       ZERO_PSI_A64FXd  
 #define ADD_RESULT(base,basep)         LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base)  
 #define XP_PROJ                        XP_PROJ_A64FXd  
 #define YP_PROJ                        YP_PROJ_A64FXd  
@@ -70,6 +71,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
 #define MAYBEPERM(Dir,perm)            if (Dir != 3) { if (perm) { PERMUTE; } }  
 // DECLARATIONS
 #define DECLARATIONS_A64FXd  \
+    uint64_t baseU; \
    const uint64_t lut[4][8] = { \
        {4, 5, 6, 7, 0, 1, 2, 3}, \
        {2, 3, 0, 1, 6, 7, 4, 5}, \
@@ -126,114 +128,114 @@ Author: Nils Meyer <nils.meyer@ur.de>
 // RESULT
 #define RESULT_A64FXd(base)  \
 { \
-    svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64), result_00);  \
-    svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64), result_01);  \
-    svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64), result_02);  \
-    svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64), result_10);  \
-    svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64), result_11);  \
-    svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64), result_12);  \
-    svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64), result_20);  \
-    svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64), result_21);  \
-    svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64), result_22);  \
-    svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64), result_30);  \
-    svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64), result_31);  \
-    svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64), result_32);  \
+    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-6), result_00);  \
+    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-5), result_01);  \
+    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-4), result_02);  \
+    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-3), result_10);  \
+    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-2), result_11);  \
+    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-1), result_12);  \
+    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(0), result_20);  \
+    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(1), result_21);  \
+    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(2), result_22);  \
+    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(3), result_30);  \
+    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(4), result_31);  \
+    svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(5), result_32);  \
 }
 // PREFETCH_CHIMU_L2 (prefetch to L2)
 #define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base)  \
 { \
-    svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL2STRM); \
 }
 // PREFETCH_CHIMU_L1 (prefetch to L1)
 #define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base)  \
 { \
-    svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \
-    svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \
-    svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \
+    svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL1STRM); \
+    svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL1STRM); \
+    svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL1STRM); \
 }
 // PREFETCH_GAUGE_L2 (prefetch to L2)
 #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A)  \
 { \
-    const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \
-    svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \
+    const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(-4), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(12), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(16), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(20), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(24), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(28), SV_PLDL2STRM); \
 }
 // PREFETCH_GAUGE_L1 (prefetch to L1)
 #define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A)  \
 { \
-    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
-    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \
+    const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL1STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL1STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL1STRM); \
 }
 // LOAD_CHI
 #define LOAD_CHI_A64FXd(base)  \
 { \
-    Chi_00 = svld1(pg1, (float64_t*)(base + 0 * 64));  \
-    Chi_01 = svld1(pg1, (float64_t*)(base + 1 * 64));  \
-    Chi_02 = svld1(pg1, (float64_t*)(base + 2 * 64));  \
-    Chi_10 = svld1(pg1, (float64_t*)(base + 3 * 64));  \
-    Chi_11 = svld1(pg1, (float64_t*)(base + 4 * 64));  \
-    Chi_12 = svld1(pg1, (float64_t*)(base + 5 * 64));  \
+    Chi_00 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(0));  \
+    Chi_01 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(1));  \
+    Chi_02 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(2));  \
+    Chi_10 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(3));  \
+    Chi_11 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(4));  \
+    Chi_12 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(5));  \
 }
 // LOAD_CHIMU
 #define LOAD_CHIMU_INTERLEAVED_A64FXd(base)  \
 { \
-    Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64));  \
-    Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64));  \
-    Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64));  \
-    Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64));  \
-    Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64));  \
-    Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64));  \
-    Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64));  \
-    Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64));  \
-    Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64));  \
-    Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64));  \
-    Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64));  \
-    Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64));  \
+    Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6));  \
+    Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3));  \
+    Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3));  \
+    Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0));  \
+    Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5));  \
+    Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4));  \
+    Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2));  \
+    Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1));  \
+    Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4));  \
+    Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5));  \
+    Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1));  \
+    Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2));  \
 }
 // LOAD_CHIMU_0213
 #define LOAD_CHIMU_0213_A64FXd  \
 { \
    const SiteSpinor & ref(in[offset]); \
-    Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64));  \
-    Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64));  \
-    Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64));  \
-    Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64));  \
-    Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64));  \
-    Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64));  \
-    Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64));  \
-    Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64));  \
-    Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64));  \
-    Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64));  \
-    Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64));  \
-    Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64));  \
+    Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6));  \
+    Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0));  \
+    Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5));  \
+    Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1));  \
+    Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4));  \
+    Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2));  \
+    Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3));  \
+    Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3));  \
+    Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2));  \
+    Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4));  \
+    Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1));  \
+    Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5));  \
 }
 // LOAD_CHIMU_0312
 #define LOAD_CHIMU_0312_A64FXd  \
 { \
    const SiteSpinor & ref(in[offset]); \
-    Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64));  \
-    Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64));  \
-    Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64));  \
-    Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64));  \
-    Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64));  \
-    Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64));  \
-    Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64));  \
-    Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64));  \
-    Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64));  \
-    Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64));  \
-    Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64));  \
-    Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64));  \
+    Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6));  \
+    Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3));  \
+    Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5));  \
+    Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4));  \
+    Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4));  \
+    Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5));  \
+    Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3));  \
+    Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0));  \
+    Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2));  \
+    Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1));  \
+    Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1));  \
+    Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2));  \
 }
 // LOAD_TABLE0
 #define LOAD_TABLE0  \
@@ -261,26 +263,26 @@ Author: Nils Meyer <nils.meyer@ur.de>
    Chi_12 = svtbl(Chi_12, table0);    

 // LOAD_GAUGE
-#define LOAD_GAUGE  \
-    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
+#define LOAD_GAUGE(A)  \
 { \
-    U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64));  \
-    U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64));  \
-    U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64));  \
-    U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64));  \
-    U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64));  \
-    U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64));  \
+    const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
+    U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-6));  \
+    U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-3));  \
+    U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(0));  \
+    U_01 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-5));  \
+    U_11 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-2));  \
+    U_21 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(1));  \
 }
 // MULT_2SPIN
 #define MULT_2SPIN_1_A64FXd(A)  \
 { \
-    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
-    U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64));  \
-    U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64));  \
-    U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64));  \
-    U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64));  \
-    U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64));  \
-    U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64));  \
+    const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
+    U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-6));  \
+    U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-3));  \
+    U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(0));  \
+    U_01 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-5));  \
+    U_11 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-2));  \
+    U_21 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(1));  \
    UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \
    UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \
    UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \
@@ -293,9 +295,9 @@ Author: Nils Meyer <nils.meyer@ur.de>
    UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \
    UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \
    UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \
-    U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64));  \
-    U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64));  \
-    U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64));  \
+    U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-4));  \
+    U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-1));  \
+    U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(2));  \
 }
 // MULT_2SPIN_BACKEND
 #define MULT_2SPIN_2_A64FXd  \
@@ -570,12 +572,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
    result_31 = svdup_f64(0.); \
    result_32 = svdup_f64(0.); 

-// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
+// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing)
 #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base)  \
 { \
-    svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \
-    svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \
-    svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \
+    asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 0) : "memory" ); \
+    asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 1) : "memory" ); \
+    asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 2) : "memory" ); \
 }
 // PREFETCH_RESULT_L1_STORE (prefetch store to L1)
 #define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base)  \
--- a/Grid/simd/Fujitsu_A64FX_intrin_single.h
+++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h
@@ -38,10 +38,11 @@ Author: Nils Meyer <nils.meyer@ur.de>
 #define LOCK_GAUGE(A)  
 #define UNLOCK_GAUGE(A)  
 #define MASK_REGS                      DECLARATIONS_A64FXf  
-#define SAVE_RESULT(A,B)               RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B)  
+#define SAVE_RESULT(A,B)               RESULT_A64FXf(A);  
 #define MULT_2SPIN_1(Dir)              MULT_2SPIN_1_A64FXf(Dir)  
 #define MULT_2SPIN_2                   MULT_2SPIN_2_A64FXf  
 #define LOAD_CHI(base)                 LOAD_CHI_A64FXf(base)  
+#define ZERO_PSI                       ZERO_PSI_A64FXf  
 #define ADD_RESULT(base,basep)         LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base)  
 #define XP_PROJ                        XP_PROJ_A64FXf  
 #define YP_PROJ                        YP_PROJ_A64FXf  
@@ -70,6 +71,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
 #define MAYBEPERM(A,perm)              if (perm) { PERMUTE; }  
 // DECLARATIONS
 #define DECLARATIONS_A64FXf  \
+    uint64_t baseU; \
    const uint32_t lut[4][16] = { \
        {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \
        {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \
@@ -126,114 +128,114 @@ Author: Nils Meyer <nils.meyer@ur.de>
 // RESULT
 #define RESULT_A64FXf(base)  \
 { \
-    svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64), result_00);  \
-    svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64), result_01);  \
-    svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64), result_02);  \
-    svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64), result_10);  \
-    svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64), result_11);  \
-    svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64), result_12);  \
-    svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64), result_20);  \
-    svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64), result_21);  \
-    svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64), result_22);  \
-    svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64), result_30);  \
-    svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64), result_31);  \
-    svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64), result_32);  \
+    svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-6), result_00);  \
+    svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-5), result_01);  \
+    svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-4), result_02);  \
+    svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-3), result_10);  \
+    svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-2), result_11);  \
+    svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-1), result_12);  \
+    svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(0), result_20);  \
+    svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(1), result_21);  \
+    svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(2), result_22);  \
+    svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(3), result_30);  \
+    svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(4), result_31);  \
+    svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(5), result_32);  \
 }
 // PREFETCH_CHIMU_L2 (prefetch to L2)
 #define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base)  \
 { \
-    svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL2STRM); \
 }
 // PREFETCH_CHIMU_L1 (prefetch to L1)
 #define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base)  \
 { \
-    svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \
-    svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \
-    svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \
+    svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL1STRM); \
+    svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL1STRM); \
+    svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL1STRM); \
 }
 // PREFETCH_GAUGE_L2 (prefetch to L2)
 #define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A)  \
 { \
-    const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \
-    svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \
+    const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(-4), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(12), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(16), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(20), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(24), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(28), SV_PLDL2STRM); \
 }
 // PREFETCH_GAUGE_L1 (prefetch to L1)
 #define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A)  \
 { \
-    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
-    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \
+    const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL1STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL1STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL1STRM); \
 }
 // LOAD_CHI
 #define LOAD_CHI_A64FXf(base)  \
 { \
-    Chi_00 = svld1(pg1, (float32_t*)(base + 0 * 64));  \
-    Chi_01 = svld1(pg1, (float32_t*)(base + 1 * 64));  \
-    Chi_02 = svld1(pg1, (float32_t*)(base + 2 * 64));  \
-    Chi_10 = svld1(pg1, (float32_t*)(base + 3 * 64));  \
-    Chi_11 = svld1(pg1, (float32_t*)(base + 4 * 64));  \
-    Chi_12 = svld1(pg1, (float32_t*)(base + 5 * 64));  \
+    Chi_00 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(0));  \
+    Chi_01 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(1));  \
+    Chi_02 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(2));  \
+    Chi_10 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(3));  \
+    Chi_11 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(4));  \
+    Chi_12 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(5));  \
 }
 // LOAD_CHIMU
 #define LOAD_CHIMU_INTERLEAVED_A64FXf(base)  \
 { \
-    Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64));  \
-    Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64));  \
-    Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64));  \
-    Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64));  \
-    Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64));  \
-    Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64));  \
-    Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64));  \
-    Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64));  \
-    Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64));  \
-    Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64));  \
-    Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64));  \
-    Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64));  \
+    Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6));  \
+    Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3));  \
+    Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3));  \
+    Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0));  \
+    Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5));  \
+    Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4));  \
+    Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2));  \
+    Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1));  \
+    Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4));  \
+    Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5));  \
+    Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1));  \
+    Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2));  \
 }
 // LOAD_CHIMU_0213
 #define LOAD_CHIMU_0213_A64FXf  \
 { \
    const SiteSpinor & ref(in[offset]); \
-    Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64));  \
-    Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64));  \
-    Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64));  \
-    Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64));  \
-    Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64));  \
-    Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64));  \
-    Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64));  \
-    Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64));  \
-    Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64));  \
-    Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64));  \
-    Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64));  \
-    Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64));  \
+    Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6));  \
+    Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0));  \
+    Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5));  \
+    Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1));  \
+    Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4));  \
+    Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2));  \
+    Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3));  \
+    Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3));  \
+    Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2));  \
+    Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4));  \
+    Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1));  \
+    Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5));  \
 }
 // LOAD_CHIMU_0312
 #define LOAD_CHIMU_0312_A64FXf  \
 { \
    const SiteSpinor & ref(in[offset]); \
-    Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64));  \
-    Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64));  \
-    Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64));  \
-    Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64));  \
-    Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64));  \
-    Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64));  \
-    Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64));  \
-    Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64));  \
-    Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64));  \
-    Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64));  \
-    Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64));  \
-    Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64));  \
+    Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6));  \
+    Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3));  \
+    Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5));  \
+    Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4));  \
+    Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4));  \
+    Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5));  \
+    Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3));  \
+    Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0));  \
+    Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2));  \
+    Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1));  \
+    Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1));  \
+    Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2));  \
 }
 // LOAD_TABLE0
 #define LOAD_TABLE0  \
@@ -261,26 +263,26 @@ Author: Nils Meyer <nils.meyer@ur.de>
    Chi_12 = svtbl(Chi_12, table0);    

 // LOAD_GAUGE
-#define LOAD_GAUGE  \
-    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
+#define LOAD_GAUGE(A)  \
 { \
-    U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64));  \
-    U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64));  \
-    U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64));  \
-    U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64));  \
-    U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64));  \
-    U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64));  \
+    const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
+    U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-6));  \
+    U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-3));  \
+    U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(0));  \
+    U_01 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-5));  \
+    U_11 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-2));  \
+    U_21 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(1));  \
 }
 // MULT_2SPIN
 #define MULT_2SPIN_1_A64FXf(A)  \
 { \
-    const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \
-    U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64));  \
-    U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64));  \
-    U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64));  \
-    U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64));  \
-    U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64));  \
-    U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64));  \
+    const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
+    U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-6));  \
+    U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-3));  \
+    U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(0));  \
+    U_01 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-5));  \
+    U_11 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-2));  \
+    U_21 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(1));  \
    UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \
    UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \
    UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \
@@ -293,9 +295,9 @@ Author: Nils Meyer <nils.meyer@ur.de>
    UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \
    UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \
    UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \
-    U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64));  \
-    U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64));  \
-    U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64));  \
+    U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-4));  \
+    U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-1));  \
+    U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(2));  \
 }
 // MULT_2SPIN_BACKEND
 #define MULT_2SPIN_2_A64FXf  \
@@ -570,12 +572,12 @@ Author: Nils Meyer <nils.meyer@ur.de>
    result_31 = svdup_f32(0.); \
    result_32 = svdup_f32(0.); 

-// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
+// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing)
 #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base)  \
 { \
-    svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \
-    svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \
-    svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \
+    asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 0) : "memory" ); \
+    asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 1) : "memory" ); \
+    asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 2) : "memory" ); \
 }
 // PREFETCH_RESULT_L1_STORE (prefetch store to L1)
 #define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base)  \
--- a/Grid/simd/Fujitsu_A64FX_undef.h
+++ b/Grid/simd/Fujitsu_A64FX_undef.h
@@ -46,6 +46,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
 #undef MULT_2SPIN_2
 #undef MAYBEPERM
 #undef LOAD_CHI
+#undef ZERO_PSI
 #undef XP_PROJ
 #undef YP_PROJ
 #undef ZP_PROJ
--- a/Grid/simd/Grid_gpu_vec.h
+++ b/Grid/simd/Grid_gpu_vec.h
@@ -38,12 +38,20 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_HIP
 #include <hip/hip_fp16.h>
 #endif
+#ifdef GRID_SYCL
+namespace Grid {
+  typedef struct { uint16_t x;} half;
+  typedef struct { half   x; half   y;} half2;
+  typedef struct { float  x; float  y;} float2;
+  typedef struct { double x; double y;} double2;
+}
+#endif
+

 namespace Grid {

-#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
-typedef struct { uint16_t x;} half;
-#endif
+
+
 typedef struct Half2_t { half x; half y; } Half2;

 #define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH )
@@ -156,7 +164,7 @@ accelerator_inline float half2float(half h)
  f = __half2float(h);
 #else 
  Grid_half hh; 
-  hh.x = hr.x;
+  hh.x = h.x;
  f=  sfw_half_to_float(hh);
 #endif
  return f;
--- a/Grid/simd/Grid_vector_unops.h
+++ b/Grid/simd/Grid_vector_unops.h
@@ -125,14 +125,6 @@ accelerator_inline Grid_simd<S, V> sqrt(const Grid_simd<S, V> &r) {
  return SimdApply(SqrtRealFunctor<S>(), r);
 }
 template <class S, class V>
-accelerator_inline Grid_simd<S, V> rsqrt(const Grid_simd<S, V> &r) {
-  return SimdApply(RSqrtRealFunctor<S>(), r);
-}
-template <class Scalar>
-accelerator_inline Scalar rsqrt(const Scalar &r) {
-  return (RSqrtRealFunctor<Scalar>(), r);
-}
-template <class S, class V>
 accelerator_inline Grid_simd<S, V> cos(const Grid_simd<S, V> &r) {
  return SimdApply(CosRealFunctor<S>(), r);
 }
--- a/Grid/simd/gridverter.py
+++ b/Grid/simd/gridverter.py
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -269,7 +269,7 @@ public:
  std::vector<Vector<std::pair<int,int> > > face_table ;
  Vector<int> surface_list;

-  Vector<StencilEntry>  _entries; // Resident in managed memory
+  stencilVector<StencilEntry>  _entries; // Resident in managed memory
  std::vector<Packet> Packets;
  std::vector<Merge> Mergers;
  std::vector<Merge> MergersSHM;
--- a/Grid/tensors/Tensor_Ta.h
+++ b/Grid/tensors/Tensor_Ta.h
@@ -95,14 +95,18 @@ accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
  vtype nrm;
  vtype inner;
  for(int c1=0;c1<N;c1++){
+
+    // Normalises row c1
    zeroit(inner);	
    for(int c2=0;c2<N;c2++)
      inner += innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]);

-    nrm = rsqrt(inner);
+    nrm = sqrt(inner);
+    nrm = 1.0/nrm;
    for(int c2=0;c2<N;c2++)
      ret._internal[c1][c2]*= nrm;
      
+    // Remove c1 from rows c1+1...N-1
    for (int b=c1+1; b<N; ++b){
      decltype(ret._internal[b][b]*ret._internal[b][b]) pr;
      zeroit(pr);
@@ -113,7 +117,19 @@ accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
 	ret._internal[b][c] -= pr * ret._internal[c1][c];
      }
    }
-	  
+  }
+
+  // Normalise last row
+  {
+    int c1 = N-1;
+    zeroit(inner);	
+    for(int c2=0;c2<N;c2++)
+      inner += innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]);
+
+    nrm = sqrt(inner);
+    nrm = 1.0/nrm;
+    for(int c2=0;c2<N;c2++)
+      ret._internal[c1][c2]*= nrm;
  }
  // assuming the determinant is ok
  return ret;
--- a/Grid/tensors/Tensor_unary.h
+++ b/Grid/tensors/Tensor_unary.h
@@ -84,7 +84,6 @@ NAMESPACE_BEGIN(Grid);
  }

 UNARY(sqrt);
-UNARY(rsqrt);
 UNARY(sin);
 UNARY(cos);
 UNARY(asin);
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -21,22 +21,26 @@ void acceleratorInit(void)
 #define ENV_RANK_SLURM         "SLURM_PROCID"
 #define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
 #define ENV_RANK_MVAPICH       "MV2_COMM_WORLD_RANK"
-  // We extract the local rank initialization using an environment variable
-  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) {
-    printf("OPENMPI detected\n");
-    rank = atoi(localRankStr);		
-  } else if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) {
-    printf("MVAPICH detected\n");
-    rank = atoi(localRankStr);		
-  } else if ((localRankStr = getenv(ENV_LOCAL_RANK_SLURM)) != NULL) {
-    printf("SLURM detected\n");
-    rank = atoi(localRankStr);		
-  } else { 
-    printf("MPI version is unknown - bad things may happen\n");
-  }
  if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);}
  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
  if ((localRankStr = getenv(ENV_RANK_SLURM  )) != NULL) { world_rank = atoi(localRankStr);}
+  // We extract the local rank initialization using an environment variable
+  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) {
+    if (!world_rank)
+      printf("OPENMPI detected\n");
+    rank = atoi(localRankStr);		
+  } else if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) {
+    if (!world_rank)
+      printf("MVAPICH detected\n");
+    rank = atoi(localRankStr);		
+  } else if ((localRankStr = getenv(ENV_LOCAL_RANK_SLURM)) != NULL) {
+    if (!world_rank)
+      printf("SLURM detected\n");
+    rank = atoi(localRankStr);		
+  } else { 
+    if (!world_rank)
+      printf("MPI version is unknown - bad things may happen\n");
+  }

  size_t totalDeviceMem=0;
  for (int i = 0; i < nDevices; i++) {
@@ -48,7 +52,7 @@ void acceleratorInit(void)
    prop = gpu_props[i];
    totalDeviceMem = prop.totalGlobalMem;
    if ( world_rank == 0) {
-#ifndef GRID_IBM_SUMMIT
+#ifndef GRID_DEFAULT_GPU
      if ( i==rank ) {
 	printf("AcceleratorCudaInit[%d]: ========================\n",rank);
 	printf("AcceleratorCudaInit[%d]: Device Number    : %d\n", rank,i);
@@ -73,11 +77,17 @@ void acceleratorInit(void)
 #undef GPU_PROP_FMT    
 #undef GPU_PROP

-#ifdef GRID_IBM_SUMMIT
+#ifdef GRID_DEFAULT_GPU
  // IBM Jsrun makes cuda Device numbering screwy and not match rank
-  if ( world_rank == 0 )  printf("AcceleratorCudaInit: IBM Summit or similar - use default device\n");
+  if ( world_rank == 0 ) {
+    printf("AcceleratorCudaInit: using default device \n");
+    printf("AcceleratorCudaInit: assume user either uses a) IBM jsrun, or \n");
+    printf("AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
+    printf("AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no \n");
+  }
 #else
  printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank);
+  printf("AcceleratorCudaInit: Configure options --enable-select-gpu=yes \n");
  cudaSetDevice(rank);
 #endif
  if ( world_rank == 0 )  printf("AcceleratorCudaInit: ================================================\n");
@@ -139,11 +149,18 @@ void acceleratorInit(void)
  MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
 #undef GPU_PROP_FMT    
 #undef GPU_PROP
-#ifdef GRID_IBM_SUMMIT
-  // IBM Jsrun makes cuda Device numbering screwy and not match rank
-  if ( world_rank == 0 )  printf("AcceleratorHipInit: IBM Summit or similar - NOT setting device to node rank\n");
+
+#ifdef GRID_DEFAULT_GPU
+  if ( world_rank == 0 ) {
+    printf("AcceleratorHipInit: using default device \n");
+    printf("AcceleratorHipInit: assume user either uses a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
+    printf("AcceleratorHipInit: Configure options --enable-summit, --enable-select-gpu=no \n");
+  }
 #else
-  if ( world_rank == 0 )  printf("AcceleratorHipInit: setting device to node rank\n");
+  if ( world_rank == 0 ) {
+    printf("AcceleratorHipInit: rank %d setting device to node rank %d\n",world_rank,rank);
+    printf("AcceleratorHipInit: Configure options --enable-select-gpu=yes \n");
+  }
  hipSetDevice(rank);
 #endif
  if ( world_rank == 0 )  printf("AcceleratorHipInit: ================================================\n");
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -166,15 +166,18 @@ inline void *acceleratorAllocDevice(size_t bytes)
 inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
+inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToDevice);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
+inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
 inline int  acceleratorIsCommunicable(void *ptr)
 {
-  int uvm;
-  auto 
-  cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr);
-  assert(cuerr == cudaSuccess );
-  if(uvm) return 0;
-  else    return 1;
+  //  int uvm=0;
+  //  auto 
+  //  cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr);
+  //  assert(cuerr == cudaSuccess );
+  //  if(uvm) return 0;
+  //  else    return 1;
+    return 1;
 }

 #endif
@@ -229,8 +232,10 @@ inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*t
 inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
 inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
+inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)  { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
+inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();}
 inline int  acceleratorIsCommunicable(void *ptr)
 {
 #if 0
@@ -328,10 +333,12 @@ inline void *acceleratorAllocDevice(size_t bytes)
  return ptr;
 };

-inline void acceleratorFreeShared(void *ptr){ free(ptr);};
+inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
+inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
+inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);}

 #endif

@@ -354,7 +361,7 @@ inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemc
 //////////////////////////////////////////////
 // CPU Target - No accelerator just thread instead
 //////////////////////////////////////////////
-#define GRID_ALLOC_ALIGN (2*1024*1024) // 2MB aligned 
+
 #if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) )

 #undef GRID_SIMT
@@ -369,8 +376,10 @@ inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemc
 accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
+inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes);}

 inline int  acceleratorIsCommunicable(void *ptr){ return 1; }
+inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}
 #ifdef HAVE_MM_MALLOC_H
 inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
 inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
@@ -393,6 +402,8 @@ inline void *acceleratorAllocCpu(size_t bytes){return memalign(GRID_ALLOC_ALIGN,
 inline void acceleratorFreeCpu  (void *ptr){free(ptr);};
 #endif

+
+
 ///////////////////////////////////////////////////
 // Synchronise across local threads for divergence resynch
 ///////////////////////////////////////////////////
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -473,11 +473,13 @@ void Grid_init(int *argc,char ***argv)
    LebesgueOrder::UseLebesgueOrder=1;
  }
  CartesianCommunicator::nCommThreads = 1;
+#ifdef GRID_COMMS_THREADS  
  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads");
    GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
    assert(CartesianCommunicator::nCommThreads > 0);
  }
+#endif  
  if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
    GridCmdOptionIntVector(arg,LebesgueOrder::Block);
--- a/33
+++ b/33
@@ -111,11 +111,10 @@ Now you can execute the `configure` script to generate makefiles (here from a bu

 ``` bash
 mkdir build; cd build
-../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path>
+../configure --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path>
 ```

-where `--enable-precision=` set the default precision,
-`--enable-simd=` set the SIMD type, `--enable-
+where `--enable-simd=` set the SIMD type, `--enable-
 comms=`, and `<path>` should be replaced by the prefix path where you want to
 install Grid. Other options are detailed in the next section, you can also use `configure
 --help` to display them. Like with any other program using GNU autotool, the
@@ -146,8 +145,8 @@ If you want to build all the tests at once just use `make tests`.
 - `--enable-numa`: enable NUMA first touch optimisation
 - `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below.
 - `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
- `--enable-precision={single|double}`: set the default precision (default: `double`).
- `--enable-precision=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below.
+- `--enable-precision={single|double}`: set the default precision (default: `double`). **Deprecated option**
+- `--enable-comms=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible communication interfaces is detailed in a section below.
 - `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `).
 - `--disable-timers`: disable system dependent high-resolution timers.
 - `--enable-chroma`: enable Chroma regression tests.
@@ -201,8 +200,7 @@ Alternatively, some CPU codenames can be directly used:
 The following configuration is recommended for the Intel Knights Landing platform:

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=KNL        \
+../configure --enable-simd=KNL        \
             --enable-comms=mpi-auto  \
             --enable-mkl             \
             CXX=icpc MPICXX=mpiicpc
@@ -212,8 +210,7 @@ The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
 If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=KNL        \
+../configure --enable-simd=KNL        \
             --enable-comms=mpi       \
             --enable-mkl             \
             CXX=CC CC=cc
@@ -232,8 +229,7 @@ for interior communication. This is the mpi3 communications implementation.
 We recommend four ranks per node for best performance, but optimum is local volume dependent.

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=KNL        \
+../configure --enable-simd=KNL        \
             --enable-comms=mpi3-auto \
             --enable-mkl             \
             CC=icpc MPICXX=mpiicpc 
@@ -244,8 +240,7 @@ We recommend four ranks per node for best performance, but optimum is local volu
 The following configuration is recommended for the Intel Haswell platform:

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=AVX2       \
+../configure --enable-simd=AVX2       \
             --enable-comms=mpi3-auto \
             --enable-mkl             \
             CXX=icpc MPICXX=mpiicpc
@@ -262,8 +257,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.
 If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=AVX2       \
+../configure --enable-simd=AVX2       \
             --enable-comms=mpi3      \
             --enable-mkl             \
             CXX=CC CC=cc
@@ -280,8 +274,7 @@ This is the default.
 The following configuration is recommended for the Intel Skylake platform:

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=AVX512     \
+../configure --enable-simd=AVX512     \
             --enable-comms=mpi3      \
             --enable-mkl             \
             CXX=mpiicpc
@@ -298,8 +291,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.
 If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=AVX512     \
+../configure --enable-simd=AVX512     \
             --enable-comms=mpi3      \
             --enable-mkl             \
             CXX=CC CC=cc
@@ -330,8 +322,7 @@ and 8 threads per rank.
 The following configuration is recommended for the AMD EPYC platform.

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=AVX2       \
+../configure --enable-simd=AVX2       \
             --enable-comms=mpi3 \
             CXX=mpicxx 
 ```
--- a/README.md
+++ b/README.md
@@ -115,11 +115,10 @@ Now you can execute the `configure` script to generate makefiles (here from a bu

 ``` bash
 mkdir build; cd build
-../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path>
+../configure --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path>
 ```

-where `--enable-precision=` set the default precision,
-`--enable-simd=` set the SIMD type, `--enable-
+where `--enable-simd=` set the SIMD type, `--enable-
 comms=`, and `<path>` should be replaced by the prefix path where you want to
 install Grid. Other options are detailed in the next section, you can also use `configure
 --help` to display them. Like with any other program using GNU autotool, the
@@ -150,8 +149,8 @@ If you want to build all the tests at once just use `make tests`.
 - `--enable-numa`: enable NUMA first touch optimisation
 - `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below.
 - `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
- `--enable-precision={single|double}`: set the default precision (default: `double`).
- `--enable-precision=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below.
+- `--enable-precision={single|double}`: set the default precision (default: `double`). **Deprecated option**
+- `--enable-comms=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible communication interfaces is detailed in a section below.
 - `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `).
 - `--disable-timers`: disable system dependent high-resolution timers.
 - `--enable-chroma`: enable Chroma regression tests.
@@ -205,8 +204,7 @@ Alternatively, some CPU codenames can be directly used:
 The following configuration is recommended for the Intel Knights Landing platform:

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=KNL        \
+../configure --enable-simd=KNL        \
             --enable-comms=mpi-auto  \
             --enable-mkl             \
             CXX=icpc MPICXX=mpiicpc
@@ -216,8 +214,7 @@ The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
 If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=KNL        \
+../configure --enable-simd=KNL        \
             --enable-comms=mpi       \
             --enable-mkl             \
             CXX=CC CC=cc
@@ -236,8 +233,7 @@ for interior communication. This is the mpi3 communications implementation.
 We recommend four ranks per node for best performance, but optimum is local volume dependent.

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=KNL        \
+../configure --enable-simd=KNL        \
             --enable-comms=mpi3-auto \
             --enable-mkl             \
             CC=icpc MPICXX=mpiicpc 
@@ -248,8 +244,7 @@ We recommend four ranks per node for best performance, but optimum is local volu
 The following configuration is recommended for the Intel Haswell platform:

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=AVX2       \
+../configure --enable-simd=AVX2       \
             --enable-comms=mpi3-auto \
             --enable-mkl             \
             CXX=icpc MPICXX=mpiicpc
@@ -266,8 +261,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.
 If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=AVX2       \
+../configure --enable-simd=AVX2       \
             --enable-comms=mpi3      \
             --enable-mkl             \
             CXX=CC CC=cc
@@ -284,8 +278,7 @@ This is the default.
 The following configuration is recommended for the Intel Skylake platform:

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=AVX512     \
+../configure --enable-simd=AVX512     \
             --enable-comms=mpi3      \
             --enable-mkl             \
             CXX=mpiicpc
@@ -302,8 +295,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.
 If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=AVX512     \
+../configure --enable-simd=AVX512     \
             --enable-comms=mpi3      \
             --enable-mkl             \
             CXX=CC CC=cc
@@ -334,8 +326,7 @@ and 8 threads per rank.
 The following configuration is recommended for the AMD EPYC platform.

 ``` bash
-../configure --enable-precision=double\
-             --enable-simd=AVX2       \
+../configure --enable-simd=AVX2       \
             --enable-comms=mpi3 \
             CXX=mpicxx 
 ```
--- a/SVE_README.txt
+++ b/SVE_README.txt
@@ -12,31 +12,31 @@ module load mpi/openmpi-aarch64

 scl enable gcc-toolset-10 bash

-../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++ CC=gcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"
+../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++ CC=gcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"

 * gcc 10.1 prebuild w/ MPI, QPACE4 interactive login

 scl enable gcc-toolset-10 bash
 module load mpi/openmpi-aarch64

-../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi-auto --enable-shm=shmget --enable-openmp CXX=mpicxx CC=mpicc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"
+../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi-auto --enable-shm=shmget --enable-openmp CXX=mpicxx CC=mpicc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"

 ------------------------------------------------------------------------------

 * armclang 20.2 (qp4)

-../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DARMCLANGCOMPAT -DA64FXASM -DDSLASHINTRIN"
+../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DARMCLANGCOMPAT -DA64FXASM -DDSLASHINTRIN"

 ------------------------------------------------------------------------------

 * gcc 10.0.1 VLA (merlin)

-../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static
+../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static


 * gcc 10.0.1 fixed-size ACLE (merlin)

-../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"
+../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN"


 * gcc 10.0.1 fixed-size ACLE (fjt) w/ MPI
@@ -46,34 +46,34 @@ export OMPI_CXX=g++-10.0.1
 export MPICH_CC=gcc-10.0.1
 export MPICH_CXX=g++-10.0.1

-$ ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64 -lrt"
+$ ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64 -lrt"

 --------------------------------------------------------

 * armclang 20.0 VLA (merlin)

-../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2 -march=armv8-a+sve -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static
+../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2 -march=armv8-a+sve -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static

 TODO check ARMCLANGCOMPAT


 * armclang 20.1 VLA (merlin)

-../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static
+../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static

 TODO check ARMCLANGCOMPAT


 * armclang 20.1 VLA (fjt cluster)

-../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU"
+../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU"

 TODO check ARMCLANGCOMPAT


 * armclang 20.1 VLA w/MPI (fjt cluster)

-../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64"
+../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64"

 No ARMCLANGCOMPAT -> still correct ?

@@ -81,9 +81,9 @@ No ARMCLANGCOMPAT -> still correct ?

 * Fujitsu fcc

-../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=FCC CC=fcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN"
+../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=FCC CC=fcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN"


 * Fujitsu fcc w/ MPI

-../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=mpiFCC CC=mpifcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU"
+../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=mpiFCC CC=mpifcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU"
--- a/benchmarks/Benchmark_IO.cc
+++ b/benchmarks/Benchmark_IO.cc
@@ -1,10 +1,18 @@
-
 #include "Benchmark_IO.hpp"

-#ifndef BENCH_IO_LMAX
-#define BENCH_IO_LMAX 40
+#ifndef BENCH_IO_LMIN
+#define BENCH_IO_LMIN 8
 #endif

+#ifndef BENCH_IO_LMAX
+#define BENCH_IO_LMAX 32
+#endif
+
+#ifndef BENCH_IO_NPASS
+#define BENCH_IO_NPASS 10
+#endif
+
+#ifdef HAVE_LIME
 using namespace Grid;

 std::string filestem(const int l)
@@ -12,64 +20,182 @@ std::string filestem(const int l)
  return "iobench_l" + std::to_string(l);
 }

+int vol(const int i) // local lattice extent for volume index i: LMIN, LMIN+2, LMIN+4, ...
+{
+  return BENCH_IO_LMIN + 2*i;
+}
+
+int volInd(const int l) // row index used for local extent l (integer division; exact for l = LMIN + 2k)
+{
+  return (l - BENCH_IO_LMIN)/2;
+}
+
+template <typename Mat>
+void stats(Mat &mean, Mat &stdDev, const std::vector<Mat> &data)
+{
+  // Entry-wise mean and unbiased sample std dev over data; needs >= 2 samples.
+  assert(data.size() > 1); // checked before data[0] is touched
+  auto            nr = data[0].rows(), nc = data[0].cols();
+  Eigen::MatrixXd sqSum = Eigen::MatrixXd::Zero(nr, nc);
+  double          n = static_cast<double>(data.size());
+  mean  = Mat::Zero(nr, nc);
+  for (auto &d: data)
+  {
+    mean  += d;
+    sqSum += d.cwiseProduct(d);
+  }
+  // Clamp at zero: rounding can make the variance estimate slightly negative (NaN after sqrt).
+  stdDev = ((sqSum - mean.cwiseProduct(mean)/n)/(n - 1.)).cwiseMax(0.).cwiseSqrt();
+  mean  /= n;
+}
+
+#define grid_printf(...) /* printf-style formatting sent to the MSG stream */ \
+{\
+  char _buf[1024];\
+  snprintf(_buf, sizeof(_buf), __VA_ARGS__); /* bounded: output is truncated, never overruns _buf */\
+  MSG << _buf;\
+}
+
+enum {sRead = 0, sWrite = 1, gRead = 2, gWrite = 3};
+
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);

-  int64_t          threads = GridThread::GetThreads();
-  auto             mpi     = GridDefaultMpi();
-  std::vector<int> latt;
+  int64_t                      threads = GridThread::GetThreads();
+  auto                         mpi     = GridDefaultMpi();
+  unsigned int                 nVol    = (BENCH_IO_LMAX - BENCH_IO_LMIN)/2 + 1;
+  unsigned int                 nRelVol = (BENCH_IO_LMAX - 24)/2 + 1;
+  std::vector<Eigen::MatrixXd> perf(BENCH_IO_NPASS, Eigen::MatrixXd::Zero(nVol, 4));
+  std::vector<Eigen::VectorXd> avPerf(BENCH_IO_NPASS, Eigen::VectorXd::Zero(4));
+  std::vector<int>             latt;

  MSG << "Grid is setup to use " << threads << " threads" << std::endl;
  MSG << "MPI partition " << mpi << std::endl;
-
-  MSG << SEP << std::endl;
-  MSG << "Benchmark std write" << std::endl;
-  MSG << SEP << std::endl;
-  for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
+  for (unsigned int i = 0; i < BENCH_IO_NPASS; ++i)
  {
-    latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
+    MSG << BIGSEP << std::endl;
+    MSG << "Pass " << i + 1 << "/" << BENCH_IO_NPASS << std::endl;
+    MSG << BIGSEP << std::endl;
+    MSG << SEP << std::endl;
+    MSG << "Benchmark std write" << std::endl;
+    MSG << SEP << std::endl;
+    for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
+    {
+      latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};

-    MSG << "-- Local volume " << l << "^4" << std::endl;
-    writeBenchmark<LatticeFermion>(latt, filestem(l), stdWrite<LatticeFermion>);
-  }
+      MSG << "-- Local volume " << l << "^4" << std::endl;
+      writeBenchmark<LatticeFermion>(latt, filestem(l), stdWrite<LatticeFermion>);
+      perf[i](volInd(l), sWrite) = BinaryIO::lastPerf.mbytesPerSecond;
+    }

-  MSG << SEP << std::endl;
-  MSG << "Benchmark std read" << std::endl;
-  MSG << SEP << std::endl;
-  for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
-  {
-    latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
+    MSG << SEP << std::endl;
+    MSG << "Benchmark std read" << std::endl;
+    MSG << SEP << std::endl;
+    for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
+    {
+      latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};

-    MSG << "-- Local volume " << l << "^4" << std::endl;
-    readBenchmark<LatticeFermion>(latt, filestem(l), stdRead<LatticeFermion>);
-  }
+      MSG << "-- Local volume " << l << "^4" << std::endl;
+      readBenchmark<LatticeFermion>(latt, filestem(l), stdRead<LatticeFermion>);
+      perf[i](volInd(l), sRead) = BinaryIO::lastPerf.mbytesPerSecond;
+    }

-#ifdef HAVE_LIME
-  MSG << SEP << std::endl;
-  MSG << "Benchmark Grid C-Lime write" << std::endl;
-  MSG << SEP << std::endl;
-  for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
-  {
-    latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
+  #ifdef HAVE_LIME
+    MSG << SEP << std::endl;
+    MSG << "Benchmark Grid C-Lime write" << std::endl;
+    MSG << SEP << std::endl;
+    for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
+    {
+      latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};

-    MSG << "-- Local volume " << l << "^4" << std::endl;
-    writeBenchmark<LatticeFermion>(latt, filestem(l), limeWrite<LatticeFermion>);
-  }
+      MSG << "-- Local volume " << l << "^4" << std::endl;
+      writeBenchmark<LatticeFermion>(latt, filestem(l), limeWrite<LatticeFermion>);
+      perf[i](volInd(l), gWrite) = BinaryIO::lastPerf.mbytesPerSecond;
+    }

-  MSG << SEP << std::endl;
-  MSG << "Benchmark Grid C-Lime read" << std::endl;
-  MSG << SEP << std::endl;
-  for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
-  {
-    latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
+    MSG << SEP << std::endl;
+    MSG << "Benchmark Grid C-Lime read" << std::endl;
+    MSG << SEP << std::endl;
+    for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
+    {
+      latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};

-    MSG << "-- Local volume " << l << "^4" << std::endl;
-    readBenchmark<LatticeFermion>(latt, filestem(l), limeRead<LatticeFermion>);
-  }
+      MSG << "-- Local volume " << l << "^4" << std::endl;
+      readBenchmark<LatticeFermion>(latt, filestem(l), limeRead<LatticeFermion>);
+      perf[i](volInd(l), gRead) = BinaryIO::lastPerf.mbytesPerSecond;
+    }
 #endif
+    avPerf[i].fill(0.);
+    for (int f = 0; f < 4; ++f)
+    for (int l = 24; l <= BENCH_IO_LMAX; l += 2)
+    {
+      avPerf[i](f) += perf[i](volInd(l), f);
+    }
+    avPerf[i] /= nRelVol;
+  }
+
+  Eigen::MatrixXd mean(nVol, 4), stdDev(nVol, 4), rob(nVol, 4);
+  Eigen::VectorXd avMean(4), avStdDev(4), avRob(4);
+  // Robustness below is 100 minus the relative standard deviation (std dev / |mean|) in percent.
+
+  stats(mean, stdDev, perf);
+  stats(avMean, avStdDev, avPerf);
+  rob.fill(100.);
+  rob -= 100.*stdDev.cwiseQuotient(mean.cwiseAbs());
+  avRob.fill(100.);
+  avRob -= 100.*avStdDev.cwiseQuotient(avMean.cwiseAbs());
+
+  MSG << BIGSEP << std::endl;
+  MSG << "SUMMARY" << std::endl;
+  MSG << BIGSEP << std::endl;
+  MSG << "Summary of individual results (all results in MB/s)." << std::endl;
+  MSG << "Every second column gives the standard deviation of the previous column." << std::endl;
+  MSG << std::endl;
+  grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n",
+              "L", "std read", "std dev", "std write", "std dev",
+              "Grid read", "std dev", "Grid write", "std dev");
+  for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
+  {
+    grid_printf("%4d %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n",
+                l, mean(volInd(l), sRead), stdDev(volInd(l), sRead),
+                mean(volInd(l), sWrite), stdDev(volInd(l), sWrite),
+                mean(volInd(l), gRead), stdDev(volInd(l), gRead),
+                mean(volInd(l), gWrite), stdDev(volInd(l), gWrite));
+  }
+  MSG << std::endl;
+  MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)" << std::endl;
+  MSG << std::endl;
+  grid_printf("%4s %12s %12s %12s %12s\n",
+              "L", "std read", "std write", "Grid read", "Grid write");
+  for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
+  {
+    grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n",
+                l, rob(volInd(l), sRead), rob(volInd(l), sWrite),
+                rob(volInd(l), gRead), rob(volInd(l), gWrite));
+  }
+  MSG << std::endl;
+  MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX << "^4 (all results in MB/s)." << std::endl;
+  MSG << "Every second column gives the standard deviation of the previous column." << std::endl;
+  MSG << std::endl;
+  grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n",
+              "std read", "std dev", "std write", "std dev",
+              "Grid read", "std dev", "Grid write", "std dev");
+  grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n",
+              avMean(sRead), avStdDev(sRead), avMean(sWrite), avStdDev(sWrite),
+              avMean(gRead), avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
+  MSG << std::endl;
+  MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)" << std::endl;
+  MSG << std::endl;
+  grid_printf("%12s %12s %12s %12s\n",
+              "std read", "std write", "Grid read", "Grid write");
+  grid_printf("%12.1f %12.1f %12.1f %12.1f\n",
+              avRob(sRead), avRob(sWrite), avRob(gRead), avRob(gWrite));

  Grid_finalize();

  return EXIT_SUCCESS;
 }
+#else
+int main(int argc,char ** argv){}
+#endif
--- a/benchmarks/Benchmark_IO.hpp
+++ b/benchmarks/Benchmark_IO.hpp
@@ -2,10 +2,12 @@
 #define Benchmark_IO_hpp_

 #include <Grid/Grid.h>
-#ifdef HAVE_LIME
 #define MSG std::cout << GridLogMessage
 #define SEP \
+"-----------------------------------------------------------------------------"
+#define BIGSEP \
 "============================================================================="
+#ifdef HAVE_LIME

 namespace Grid {

@@ -37,9 +39,12 @@ using ReaderFn = std::function<void(Field &, const std::string)>;
 //   ioWatch.Stop();
 //   std::fclose(file);
 //   size *= vec.Grid()->ProcessorCount();
-//   MSG << "Std I/O write: Wrote " << size << " bytes in " << ioWatch.Elapsed() 
-//       << ", performance " << size/1024./1024./(ioWatch.useconds()/1.e6) 
-//       << " MB/s" << std::endl;
+//   auto &p = BinaryIO::lastPerf;
+//   p.size            = size;
+//   p.time            = ioWatch.useconds();
+//   p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
+//   MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() 
+//       << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
 //   MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
 // }
 //
@@ -72,9 +77,12 @@ using ReaderFn = std::function<void(Field &, const std::string)>;
 //   MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
 //   assert(crcData == crcRead);
 //   size *= vec.Grid()->ProcessorCount();
-//   MSG << "Std I/O read: Read " << size << " bytes in " << ioWatch.Elapsed() 
-//       << ", performance " << size/1024./1024./(ioWatch.useconds()/1.e6) 
-//       << " MB/s" << std::endl;
+//   auto &p = BinaryIO::lastPerf;
+//   p.size            = size;
+//   p.time            = ioWatch.useconds();
+//   p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
+//   MSG << "Std I/O read: Read " <<  p.size << " bytes in " << ioWatch.Elapsed() 
+//       << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
 //   MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
 // }

@@ -100,9 +108,12 @@ void stdWrite(const std::string filestem, Field &vec)
  file.flush();
  ioWatch.Stop();
  size *= vec.Grid()->ProcessorCount();
-  MSG << "Std I/O write: Wrote " << size << " bytes in " << ioWatch.Elapsed() 
-      << ", " << size/1024./1024./(ioWatch.useconds()/1.e6) 
-      << " MB/s" << std::endl;
+  auto &p = BinaryIO::lastPerf;
+  p.size            = size;
+  p.time            = ioWatch.useconds();
+  p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
+  MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() 
+      << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
  MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
 }

@@ -135,9 +146,12 @@ void stdRead(Field &vec, const std::string filestem)
  MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
  assert(crcData == crcRead);
  size *= vec.Grid()->ProcessorCount();
-  MSG << "Std I/O read: Read " << size << " bytes in " << ioWatch.Elapsed() 
-      << ", " << size/1024./1024./(ioWatch.useconds()/1.e6) 
-      << " MB/s" << std::endl;
+  auto &p = BinaryIO::lastPerf;
+  p.size            = size;
+  p.time            = ioWatch.useconds();
+  p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
+  MSG << "Std I/O read: Read " <<  p.size << " bytes in " << ioWatch.Elapsed() 
+      << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
  MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
 }

@@ -200,12 +214,18 @@ void writeBenchmark(const Coordinate &latt, const std::string filename,
  auto                           simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd());
  std::shared_ptr<GridCartesian> gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi));
  std::shared_ptr<GridBase>      gPt;
+  std::random_device             rd;

  makeGrid(gPt, gBasePt, Ls, rb);

-  GridBase                       *g = gPt.get();
-  GridParallelRNG                rng(g);
-  Field                          vec(g);
+  GridBase         *g = gPt.get();
+  GridParallelRNG  rng(g);
+  Field            vec(g);
+
+  rng.SeedFixedIntegers({static_cast<int>(rd()), static_cast<int>(rd()),
+                         static_cast<int>(rd()), static_cast<int>(rd()),
+                         static_cast<int>(rd()), static_cast<int>(rd()),
+                         static_cast<int>(rd()), static_cast<int>(rd())});

  random(rng, vec);
  write(filename, vec);
@@ -223,8 +243,8 @@ void readBenchmark(const Coordinate &latt, const std::string filename,

  makeGrid(gPt, gBasePt, Ls, rb);

-  GridBase                       *g = gPt.get();
-  Field                          vec(g);
+  GridBase *g = gPt.get();
+  Field    vec(g);

  read(vec, filename);
 }
--- a/benchmarks/Benchmark_IO_vs_dir.cc
+++ b/benchmarks/Benchmark_IO_vs_dir.cc
@@ -1,9 +1,5 @@
 #include "Benchmark_IO.hpp"
-
-#define MSG std::cout << GridLogMessage
-#define SEP \
-"============================================================================="
-
+#ifdef HAVE_LIME
 using namespace Grid;

 int main (int argc, char ** argv)
@@ -101,3 +97,6 @@ int main (int argc, char ** argv)

  return EXIT_SUCCESS;
 }
+#else
+int main(int argc,char ** argv){}
+#endif
--- a/benchmarks/Benchmark_ITT.cc
+++ b/benchmarks/Benchmark_ITT.cc
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -62,7 +62,7 @@ struct time_statistics{

 void comms_header(){
  std::cout <<GridLogMessage << " L  "<<"\t"<<" Ls  "<<"\t"
-            <<std::setw(11)<<"bytes"<<"MB/s uni (err/min/max)"<<"\t\t"<<"MB/s bidi (err/min/max)"<<std::endl;
+            <<"bytes\t MB/s uni (err/min/max) \t\t MB/s bidi (err/min/max)"<<std::endl;
 };

 Gamma::Algebra Gmu [] = {
@@ -125,7 +125,7 @@ public:
 	      lat*mpi_layout[1],
 	      lat*mpi_layout[2],
 	      lat*mpi_layout[3]});
-	std::cout << GridLogMessage<< latt_size <<std::endl;
+
 	GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 	RealD Nrank = Grid._Nprocessors;
 	RealD Nnode = Grid.NodeCount();
@@ -137,8 +137,8 @@ public:
 	for(int d=0;d<8;d++){
 	  xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	  rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-	  bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-	  bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	  //	  bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	  //	  bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	}

 	int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
@@ -189,11 +189,11 @@ public:
 	//	double rbytes    = dbytes*0.5;
 	double bidibytes = dbytes;

-	std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
-		 <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
-		 <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
+	std::cout<<GridLogMessage << lat<<"\t"<<Ls<<"\t "
+		 << bytes << " \t "
+		 <<xbytes/timestat.mean<<" \t "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " \t "
 		 <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
-		 << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
+		 << "\t\t"<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
 		 << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
 	
 	    }
@@ -202,6 +202,8 @@ public:
    return;
  }

+
+  
  static void Memory(void)
  {
    const int Nvec=8;
@@ -222,7 +224,7 @@ public:


  uint64_t lmax=32;
-#define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
+#define NLOOP (1000*lmax*lmax*lmax*lmax/lat/lat/lat/lat)

    GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
    for(int lat=8;lat<=lmax;lat+=8){
@@ -247,11 +249,6 @@ public:
      double start=usecond();
      for(int i=0;i<Nloop;i++){
 	z=a*x-y;
-	autoView( x_v , x, CpuWrite);
-	autoView( y_v , y, CpuWrite);
-	autoView( z_v , z, CpuRead);
-        x_v[0]=z_v[0]; // force serial dependency to prevent optimise away
-        y_v[4]=z_v[4];
      }
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;
@@ -266,6 +263,61 @@ public:
  };


+  static void SU4(void)
+  {
+    const int Nc4=4;
+    typedef Lattice< iMatrix< vComplexF,Nc4> > LatticeSU4;
+
+    Coordinate simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
+    Coordinate mpi_layout  = GridDefaultMpi();
+    
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "= Benchmarking z = y*x SU(4) bandwidth"<<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<< "\t\tGB/s / node"<<std::endl;
+    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
+  
+    uint64_t NN;
+
+
+    uint64_t lmax=32;
+#define NLOOP (1000*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
+
+    GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+    for(int lat=8;lat<=lmax;lat+=8){
+
+      Coordinate latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
+      int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+
+      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+
+      NN =Grid.NodeCount();
+
+
+      LatticeSU4 z(&Grid); z=Zero();
+      LatticeSU4 x(&Grid); x=Zero();
+      LatticeSU4 y(&Grid); y=Zero();
+      double a=2.0;
+
+      uint64_t Nloop=NLOOP;
+
+      double start=usecond();
+      for(int i=0;i<Nloop;i++){
+	z=x*y;
+      }
+      double stop=usecond();
+      double time = (stop-start)/Nloop*1000;
+     
+      double flops=vol*Nc4*Nc4*(6+(Nc4-1)*8);// mul,add
+      double bytes=3.0*vol*Nc4*Nc4*2*sizeof(RealF);
+      std::cout<<GridLogMessage<<std::setprecision(3) 
+	       << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.
+	       << "\t\t"<< bytes/time/NN <<std::endl;
+
+    }
+  };
+
+
  static double DWF(int Ls,int L)
  {
    RealD mass=0.1;
@@ -282,8 +334,9 @@ public:
    int threads = GridThread::GetThreads();
    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
    Coordinate local({L,L,L,L});
+    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});

-    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}), 
+    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, 
 								       GridDefaultSimd(Nd,vComplex::Nsimd()),
 								       GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
@@ -291,11 +344,11 @@ public:
    NN_global=NN;
    uint64_t SHM=NP/NN;

-    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});

    ///////// Welcome message ////////////
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
+    std::cout<<GridLogMessage << "* Nc             : "<<Nc<<std::endl;
    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
    std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl;
@@ -324,7 +377,7 @@ public:
    typedef LatticeGaugeFieldF Gauge;
    
    ///////// Source preparation ////////////
-    Gauge Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu); 
+    Gauge Umu(UGrid);  SU<Nc>::HotConfiguration(RNG4,Umu); 
    Fermion src   (FGrid); random(RNG5,src);
    Fermion src_e (FrbGrid);
    Fermion src_o (FrbGrid);
@@ -369,7 +422,7 @@ public:
 	}
 	FGrid->Barrier();
 	double t1=usecond();
-	uint64_t ncall = 50;
+	uint64_t ncall = 500;

 	FGrid->Broadcast(0,&ncall,sizeof(ncall));

@@ -387,7 +440,17 @@ public:
 	FGrid->Barrier();
 	
 	double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-	double flops=(1344.0*volume)/2;
+
+	// Flops per site; for Nc=3 this must reproduce the previous hard-coded count:
+	// 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8
+	// 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2  + Nd*Nc*Ns*2
+	//	double flops=(1344.0*volume)/2;
+#if 0
+	double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + Nd*Nc*Ns  + Nd*Nc*Ns*2; // 1200 when Nc=3, Ns=Nd=4
+#else
+	double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + 2*Nd*Nc*Ns  + 2*Nd*Nc*Ns*2; // 1344 when Nc=3, Ns=Nd=4
+#endif
+	double flops=(fps*volume)/2;
 	double mf_hi, mf_lo, mf_err;

 	timestat.statistics(t_time);
@@ -402,6 +465,7 @@ public:
 	if ( mflops>mflops_best ) mflops_best = mflops;
 	if ( mflops<mflops_worst) mflops_worst= mflops;

+	std::cout<<GridLogMessage<< "Deo FlopsPerSite is "<<fps<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
@@ -438,8 +502,9 @@ public:
    int threads = GridThread::GetThreads();
    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
    Coordinate local({L,L,L,L});
+    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
    
-    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}), 
+    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(latt4,
 								       GridDefaultSimd(Nd,vComplex::Nsimd()),
 								       GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
@@ -447,7 +512,6 @@ public:
    NN_global=NN;
    uint64_t SHM=NP/NN;

-    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});

    ///////// Welcome message ////////////
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
@@ -478,7 +542,7 @@ public:
    typedef typename Action::FermionField Fermion; 
    typedef LatticeGaugeFieldF Gauge;
    
-    Gauge Umu(FGrid);  SU3::HotConfiguration(RNG4,Umu); 
+    Gauge Umu(FGrid);  SU<Nc>::HotConfiguration(RNG4,Umu); 

    typename Action::ImplParams params;
    Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params);
@@ -596,11 +660,12 @@ int main (int argc, char ** argv)
 #endif
  Benchmark::Decomposition();

+  int do_su4=1;
  int do_memory=1;
  int do_comms =1;

-  int sel=2;
-  std::vector<int> L_list({16,24,32});
+  int sel=4;
+  std::vector<int> L_list({8,12,16,24,32});
  int selm1=sel-1;

  std::vector<double> wilson;
@@ -624,7 +689,6 @@ int main (int argc, char ** argv)
    dwf4.push_back(result);
  }

-  /*
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Improved Staggered dslash 4D vectorised" <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
@@ -632,14 +696,13 @@ int main (int argc, char ** argv)
    double result = Benchmark::Staggered(L_list[l]) ;
    staggered.push_back(result);
  }
-  */

  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\tt Staggered" <<std::endl;
+  std::cout<<GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered" <<std::endl;
  for(int l=0;l<L_list.size();l++){
-    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t\t "<<dwf4[l] <<std::endl;
+    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl;
  }
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

@@ -651,6 +714,13 @@ int main (int argc, char ** argv)
    Benchmark::Memory();
  }

+  if ( do_su4 ) { // banner names the SU(4) test so it is not confused with the Memory section
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " SU(4) benchmark " <<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    Benchmark::SU4();
+  }
+  
  if ( do_comms && (NN>1) ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Communications benchmark " <<std::endl;
@@ -661,9 +731,9 @@ int main (int argc, char ** argv)
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4  " <<std::endl;
+    std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4\t\t Staggered " <<std::endl;
    for(int l=0;l<L_list.size();l++){
-      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<std::endl;
+      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl;
    }
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -94,8 +94,8 @@ int main (int argc, char ** argv)
      RealD Nnode = Grid.NodeCount();
      RealD ppn = Nrank/Nnode;

-      std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
-      std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
+      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8);
+      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8);

      for(int mu=0;mu<8;mu++){
 	xbuf[mu].resize(lat*lat*lat*Ls);
--- a/benchmarks/Benchmark_comms_host_device.cc
+++ b/benchmarks/Benchmark_comms_host_device.cc
@@ -0,0 +1,260 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./benchmarks/Benchmark_comms_host_device.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+
+struct time_statistics{
+  double mean;
+  double err;
+  double min;
+  double max;
+
+  void statistics(std::vector<double> v){
+      double sum = std::accumulate(v.begin(), v.end(), 0.0);
+      mean = sum / v.size();
+
+      std::vector<double> diff(v.size());
+      std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
+      double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
+      err = std::sqrt(sq_sum / (v.size()*(v.size() - 1)));
+
+      auto result = std::minmax_element(v.begin(), v.end());
+      min = *result.first;
+      max = *result.second;
+}
+};
+
+void header(){
+  std::cout <<GridLogMessage << " L  "<<"\t"<<" Ls  "<<"\t"
+            <<std::setw(11)<<"bytes\t\t"<<"MB/s uni (err/min/max)"<<"\t\t"<<"MB/s bidi (err/min/max)"<<std::endl;
+};
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
+  Coordinate mpi_layout  = GridDefaultMpi();
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  int Nloop=250;
+  int nmu=0;
+  int maxlat=32;
+  for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
+
+  std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
+  std::vector<double> t_time(Nloop);
+  time_statistics timestat;
+
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange from host memory "<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  header();
+
+  for(int lat=8;lat<=maxlat;lat+=4){
+    for(int Ls=8;Ls<=8;Ls*=2){
+
+      Coordinate latt_size  ({lat*mpi_layout[0],
+	                      lat*mpi_layout[1],
+      			      lat*mpi_layout[2],
+      			      lat*mpi_layout[3]});
+
+      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+      RealD Nrank = Grid._Nprocessors;
+      RealD Nnode = Grid.NodeCount();
+      RealD ppn = Nrank/Nnode;
+
+      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8);
+      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8);
+
+      for(int mu=0;mu<8;mu++){
+	xbuf[mu].resize(lat*lat*lat*Ls);
+	rbuf[mu].resize(lat*lat*lat*Ls);
+      }
+      uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+
+      int ncomm;
+
+      for(int mu=0;mu<4;mu++){
+	if (mpi_layout[mu]>1 ) {
+	double start=usecond();
+	for(int i=0;i<Nloop;i++){
+
+	  ncomm=0;
+	
+	  
+	    ncomm++;
+	    int comm_proc=1;
+	    int xmit_to_rank;
+	    int recv_from_rank;
+	    
+	    {
+	      std::vector<CommsRequest_t> requests;
+	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+	      Grid.SendToRecvFrom((void *)&xbuf[mu][0],
+				  xmit_to_rank,
+				  (void *)&rbuf[mu][0],
+				  recv_from_rank,
+				  bytes);
+	    }
+
+	    comm_proc = mpi_layout[mu]-1;
+	    {
+	      std::vector<CommsRequest_t> requests;
+	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+	      Grid.SendToRecvFrom((void *)&xbuf[mu+4][0],
+				  xmit_to_rank,
+				  (void *)&rbuf[mu+4][0],
+				  recv_from_rank,
+				  bytes);
+	    }
+	}
+	Grid.Barrier();
+	double stop=usecond();
+        double mean=(stop-start)/Nloop;      
+      double dbytes    = bytes*ppn;
+      double xbytes    = dbytes*2.0*ncomm;
+      double rbytes    = xbytes;
+      double bidibytes = xbytes+rbytes;
+
+      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
+               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)<<" "
+               <<std::right<< xbytes/mean<<"  "
+               << "\t\t"<<std::setw(7)<< bidibytes/mean<< std::endl;
+
+
+	
+	}
+      }
+
+
+      
+    }
+  }
+
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange from GPU memory "<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  header();
+
+  for(int lat=8;lat<=maxlat;lat+=4){
+    for(int Ls=8;Ls<=8;Ls*=2){
+
+      Coordinate latt_size  ({lat*mpi_layout[0],
+	                      lat*mpi_layout[1],
+      			      lat*mpi_layout[2],
+      			      lat*mpi_layout[3]});
+
+      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+      RealD Nrank = Grid._Nprocessors;
+      RealD Nnode = Grid.NodeCount();
+      RealD ppn = Nrank/Nnode;
+
+
+      std::vector<HalfSpinColourVectorD *> xbuf(8);
+      std::vector<HalfSpinColourVectorD *> rbuf(8);
+
+      uint64_t bytes = lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+      for(int d=0;d<8;d++){
+	xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
+	rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
+      }
+
+      int ncomm;
+
+      for(int mu=0;mu<4;mu++){
+	if (mpi_layout[mu]>1 ) {
+	double start=usecond();
+	for(int i=0;i<Nloop;i++){
+
+	  ncomm=0;
+	
+	  
+	    ncomm++;
+	    int comm_proc=1;
+	    int xmit_to_rank;
+	    int recv_from_rank;
+	    
+	    {
+	      std::vector<CommsRequest_t> requests;
+	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+	      Grid.SendToRecvFrom((void *)&xbuf[mu][0],
+				  xmit_to_rank,
+				  (void *)&rbuf[mu][0],
+				  recv_from_rank,
+				  bytes);
+	    }
+
+	    comm_proc = mpi_layout[mu]-1;
+	    {
+	      std::vector<CommsRequest_t> requests;
+	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+	      Grid.SendToRecvFrom((void *)&xbuf[mu+4][0],
+				  xmit_to_rank,
+				  (void *)&rbuf[mu+4][0],
+				  recv_from_rank,
+				  bytes);
+	    }
+	}
+	Grid.Barrier();
+	double stop=usecond();
+        double mean=(stop-start)/Nloop;      
+      double dbytes    = bytes*ppn;
+      double xbytes    = dbytes*2.0*ncomm;
+      double rbytes    = xbytes;
+      double bidibytes = xbytes+rbytes;
+
+      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
+               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)<<" "
+               <<std::right<< xbytes/mean<<"  "
+               << "\t\t"<<std::setw(7)<< bidibytes/mean<< std::endl;
+
+
+	
+	}
+      }
+
+      for(int d=0;d<8;d++){
+	acceleratorFreeDevice(xbuf[d]);
+	acceleratorFreeDevice(rbuf[d]);
+      }
+
+      
+    }
+  }
+
+
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+
+  Grid_finalize();
+}
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -108,7 +108,7 @@ int main (int argc, char ** argv)

  std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
  LatticeGaugeField Umu(UGrid);
-  SU3::HotConfiguration(RNG4,Umu);
+  SU<Nc>::HotConfiguration(RNG4,Umu);
  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
 #if 0
  Umu=1.0;
--- a/benchmarks/Benchmark_dwf_fp32.cc
+++ b/benchmarks/Benchmark_dwf_fp32.cc
@@ -0,0 +1,364 @@
+ /*************************************************************************************
+    Grid physics library, www.github.com/paboyle/Grid
+    Source file: ./benchmarks/Benchmark_dwf_fp32.cc
+    Copyright (C) 2015
+
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+    Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#ifdef GRID_CUDA
+#define CUDA_PROFILE
+#endif
+
+#ifdef CUDA_PROFILE
+#include <cuda_profiler_api.h>
+#endif
+
+using namespace std;
+using namespace Grid;
+
+template<class d>
+struct scal {
+  d internal;
+};
+
+  Gamma::Algebra Gmu [] = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ,
+    Gamma::Algebra::GammaT
+  };
+
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+
+  int threads = GridThread::GetThreads();
+
+  Coordinate latt4 = GridDefaultLatt();
+  int Ls=8;
+  for(int i=0;i<argc-1;i++)  // stop one short: the value for "-Ls" is read from argv[i+1]
+    if(std::string(argv[i]) == "-Ls"){
+      std::stringstream ss(argv[i+1]); ss >> Ls;
+    }
+
+  GridLogLayout();
+
+  long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
+
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
+  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
+  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
+  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
+  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+
+  std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString(std::string("The 4D RNG"));
+  std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
+  GridParallelRNG          RNG5(FGrid);  RNG5.SeedUniqueString(std::string("The 5D RNG"));
+  std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+  LatticeFermionF src   (FGrid); random(RNG5,src);
+#if 0
+  src = Zero();
+  {
+    Coordinate origin({0,0,0,latt4[2]-1,0});
+    SpinColourVectorF tmp;
+    tmp=Zero();
+    tmp()(0)(0)=Complex(-2.0,0.0);
+    std::cout << " source site 0 " << tmp<<std::endl;
+    pokeSite(tmp,src,origin);
+  }
+#else
+  RealD N2 = 1.0/::sqrt(norm2(src));
+  src = src*N2;
+#endif
+
+
+  LatticeFermionF result(FGrid); result=Zero();
+  LatticeFermionF    ref(FGrid);    ref=Zero();
+  LatticeFermionF    tmp(FGrid);
+  LatticeFermionF    err(FGrid);
+
+  std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
+  LatticeGaugeFieldF Umu(UGrid);
+  SU<Nc>::HotConfiguration(RNG4,Umu);
+  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
+#if 0
+  Umu=1.0;
+  for(int mu=0;mu<Nd;mu++){
+    LatticeColourMatrixF ttmp(UGrid);
+    ttmp = PeekIndex<LorentzIndex>(Umu,mu);
+    //    if (mu !=2 ) ttmp = 0;
+    //    ttmp = ttmp* pow(10.0,mu);
+    PokeIndex<LorentzIndex>(Umu,ttmp,mu);
+  }
+  std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
+#endif
+
+  ////////////////////////////////////
+  // Naive wilson implementation
+  ////////////////////////////////////
+  // replicate across fifth dimension
+  LatticeGaugeFieldF Umu5d(FGrid);
+  std::vector<LatticeColourMatrixF> U(4,FGrid);
+  {
+    autoView( Umu5d_v, Umu5d, CpuWrite);
+    autoView( Umu_v  , Umu  , CpuRead);
+    for(int ss=0;ss<Umu.Grid()->oSites();ss++){
+      for(int s=0;s<Ls;s++){
+	Umu5d_v[Ls*ss+s] = Umu_v[ss];
+      }
+    }
+  }
+  for(int mu=0;mu<Nd;mu++){
+    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
+  }
+  std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
+
+  if (1)
+  {
+    ref = Zero();
+    for(int mu=0;mu<Nd;mu++){
+
+      tmp = U[mu]*Cshift(src,mu+1,1);
+      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
+
+      tmp =adj(U[mu])*src;
+      tmp =Cshift(tmp,mu+1,-1);
+      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
+    }
+    ref = -0.5*ref;
+  }
+
+  RealD mass=0.1;
+  RealD M5  =1.8;
+
+  RealD NP = UGrid->_Nprocessors;
+  RealD NN = UGrid->NodeCount();
+
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop                  "<<std::endl;
+  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
+  std::cout << GridLogMessage<< "* VComplexF size is "<<sizeof(vComplexF)<< " B"<<std::endl;
+  if ( sizeof(RealF)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+  if ( sizeof(RealF)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+#ifdef GRID_OMP
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
+#endif
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+
+  DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  int ncall =1000;
+
+  if (1) {
+    FGrid->Barrier();
+    Dw.ZeroCounters();
+    Dw.Dhop(src,result,0);
+    std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+      __SSC_START;
+      Dw.Dhop(src,result,0);
+      __SSC_STOP;
+    }
+    double t1=usecond();
+    FGrid->Barrier();
+
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=single_site_flops*volume*ncall;
+
+    auto nsimd = vComplexF::Nsimd();     // fields here are single precision (LatticeFermionF etc.)
+    auto simdwidth = sizeof(vComplexF);  // so simdwidth/nsimd is the byte size of one ComplexF
+
+    // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
+    double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
+
+    // mem: Nd Wilson * Ls, Nd gauge, Nc colors
+    double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
+
+    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
+    //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
+    //    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl;
+    std::cout<<GridLogMessage << "RF  GiB/s (base 2) =   "<< 1000000. * data_rf/((t1-t0))<<std::endl;
+    std::cout<<GridLogMessage << "mem GiB/s (base 2) =   "<< 1000000. * data_mem/((t1-t0))<<std::endl;
+    err = ref-result;
+    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
+    //exit(0);
+
+    if(( norm2(err)>1.0e-4) ) {
+      /*
+      std::cout << "RESULT\n " << result<<std::endl;
+      std::cout << "REF   \n " << ref   <<std::endl;
+      std::cout << "ERR   \n " << err   <<std::endl;
+      */
+      std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
+      FGrid->Barrier();
+      exit(-1);
+    }
+    assert (norm2(err)< 1.0e-4 );
+    Dw.Report();
+  }
+
+  if (1)
+  { // Naive wilson dag implementation
+    ref = Zero();
+    for(int mu=0;mu<Nd;mu++){
+
+      //    ref =  src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
+      tmp = U[mu]*Cshift(src,mu+1,1);
+      {
+	autoView( ref_v, ref, CpuWrite);
+	autoView( tmp_v, tmp, CpuRead);
+	for(int i=0;i<ref_v.size();i++){
+	  ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
+	}
+      }
+
+      tmp =adj(U[mu])*src;
+      tmp =Cshift(tmp,mu+1,-1);
+      {
+	autoView( ref_v, ref, CpuWrite);
+	autoView( tmp_v, tmp, CpuRead);
+	for(int i=0;i<ref_v.size();i++){
+	  ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
+	}
+      }
+    }
+    ref = -0.5*ref;
+  }
+  //  dump=1;
+  Dw.Dhop(src,result,1);
+  std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
+  std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
+  std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
+  std::cout<<GridLogMessage << "norm dag ref    "<< norm2(ref)<<std::endl;
+  err = ref-result;
+  std::cout<<GridLogMessage << "norm dag diff   "<< norm2(err)<<std::endl;
+  if((norm2(err)>1.0e-4)){
+/*
+	std::cout<< "DAG RESULT\n "  <<ref     << std::endl;
+	std::cout<< "DAG sRESULT\n " <<result  << std::endl;
+	std::cout<< "DAG ERR   \n "  << err    <<std::endl;
+*/
+  }
+  LatticeFermionF src_e (FrbGrid);
+  LatticeFermionF src_o (FrbGrid);
+  LatticeFermionF r_e   (FrbGrid);
+  LatticeFermionF r_o   (FrbGrid);
+  LatticeFermionF r_eo  (FGrid);
+
+  std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
+  pickCheckerboard(Even,src_e,src);
+  pickCheckerboard(Odd,src_o,src);
+
+  std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
+  std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
+
+
+  // S-direction is INNERMOST and takes no part in the parity.
+  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionF::DhopEO                "<<std::endl;
+  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
+  if ( sizeof(RealF)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+  if ( sizeof(RealF)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+#ifdef GRID_OMP
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
+#endif
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+  {
+    Dw.ZeroCounters();
+    FGrid->Barrier();
+    Dw.DhopEO(src_o,r_e,DaggerNo);
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+#ifdef CUDA_PROFILE
+      if(i==10) cudaProfilerStart();
+#endif
+      Dw.DhopEO(src_o,r_e,DaggerNo);
+#ifdef CUDA_PROFILE
+      if(i==20) cudaProfilerStop();
+#endif
+    }
+    double t1=usecond();
+    FGrid->Barrier();
+
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=(single_site_flops*volume*ncall)/2.0;
+
+    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
+    std::cout<<GridLogMessage << "Deo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NN<<std::endl;
+    Dw.Report();
+  }
+  Dw.DhopEO(src_o,r_e,DaggerNo);
+  Dw.DhopOE(src_e,r_o,DaggerNo);
+  Dw.Dhop  (src  ,result,DaggerNo);
+
+  std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
+  std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
+  std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
+
+  setCheckerboard(r_eo,r_o);
+  setCheckerboard(r_eo,r_e);
+
+  err = r_eo-result;
+  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
+  if((norm2(err)>1.0e-4)){
+    /*
+	std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
+	std::cout<< "Deo REF\n " <<result  << std::endl;
+	std::cout<< "Deo ERR   \n " << err <<std::endl;
+    */
+  }
+
+  pickCheckerboard(Even,src_e,err);
+  pickCheckerboard(Odd,src_o,err);
+  std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
+  std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
+
+  assert(norm2(src_e)<1.0e-4);
+  assert(norm2(src_o)<1.0e-4);
+  Grid_finalize();
+  exit(0);
+}
--- a/benchmarks/Benchmark_gparity.cc
+++ b/benchmarks/Benchmark_gparity.cc
@@ -24,7 +24,7 @@ typedef typename GparityDomainWallFermionD::FermionField GparityLatticeFermionD;
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
-
+#ifdef ENABLE_GPARITY
  int Ls=16;
  for(int i=0;i<argc;i++)
    if(std::string(argv[i]) == "-Ls"){
@@ -63,7 +63,7 @@ int main (int argc, char ** argv)

  std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
  LatticeGaugeFieldF Umu(UGrid); 
-  SU3::HotConfiguration(RNG4,Umu); 
+  SU<Nc>::HotConfiguration(RNG4,Umu); 
  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;

  RealD mass=0.1;
@@ -184,7 +184,7 @@ int main (int argc, char ** argv)
    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl;
    DwD.Report();
  }
-
+#endif
  Grid_finalize();
 }

--- a/benchmarks/Benchmark_mooee.cc
+++ b/benchmarks/Benchmark_mooee.cc
@@ -30,7 +30,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 using namespace std;
 using namespace Grid;
- ;
+


 int main (int argc, char ** argv)
@@ -53,7 +53,7 @@ int main (int argc, char ** argv)
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
  std::cout << GridLogMessage << "Seeded"<<std::endl;

-  LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
+  LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);

  std::cout << GridLogMessage << "made random gauge fields"<<std::endl;

--- a/benchmarks/benchmark-io-csv.sh
+++ b/benchmarks/benchmark-io-csv.sh
@@ -1,76 +0,0 @@
-#!/usr/bin/env bash
-
-awkscript='
-BEGIN{
-  i = 0;
-  print "local L,std read (MB/s),std write (MB/s),Grid Lime read (MB/s),Grid Lime write (MB/s)"
-}
-
-/Benchmark std write/{
-  i    = 0; 
-  mode = "stdWrite";
-} 
-
-/Benchmark std read/{
-  i    = 0; 
-  mode = "stdRead"
-} 
-
-/Benchmark Grid C-Lime write/{
-  i    = 0; 
-  mode = "gridWrite";
-} 
-
-/Benchmark Grid C-Lime read/{
-  i    = 0; 
-  mode = "gridRead";
-}
-
-/Local volume/{
-  match($0, "[0-9]+\\^4");
-  l[i] = substr($0, RSTART, RLENGTH-2);
-}
-
-/MB\/s/{
-  match($0, "[0-9.eE]+ MB/s");
-  p = substr($0, RSTART, RLENGTH-5);
-  if (mode == "stdWrite")
-  {
-    sw[i] = p;
-  }
-  else if (mode == "stdRead")
-  {
-    sr[i] = p;
-  }
-  else if (mode == "gridWrite")
-  {
-    gw[i] = p;
-  }
-  else if (mode == "gridRead")
-  {
-    gr[i] = p;
-  }
-  i++;
-}
-
-END{
-  s = 0
-  for (a in l)
-  {
-    s++;
-  }
-  for (j = 0; j < s; j++)
-  {
-    printf("%s,%s,%s,%s,%s\n", l[j], sr[j], sw[j], gr[j], gw[j]);
-  }
-  printf("\n");
-}
-'
-
-if (( $# != 1 )); then
-    echo "usage: `basename $0` <log file>" 1>&2
-    exit 1
-fi
-LOG=$1
-
-awk "${awkscript}" ${LOG} 
--- a/configure.ac
+++ b/configure.ac
@@ -123,6 +123,42 @@ case ${ac_LAPACK} in
        AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
 esac

+############### fermions: optional fermion representations and G-parity (both default to yes)
+AC_ARG_ENABLE([fermion-reps],
+     [AC_HELP_STRING([--enable-fermion-reps=yes|no], [enable extra fermion representation support])],
+     [ac_FERMION_REPS=${enable_fermion_reps}], [ac_FERMION_REPS=yes])
+
+AM_CONDITIONAL(BUILD_FERMION_REPS, [ test "${ac_FERMION_REPS}X" = "yesX" ])
+
+AC_ARG_ENABLE([gparity],
+     [AC_HELP_STRING([--enable-gparity=yes|no], [enable G-parity support])],
+     [ac_GPARITY=${enable_gparity}], [ac_GPARITY=yes])
+
+AM_CONDITIONAL(BUILD_GPARITY, [ test "${ac_GPARITY}X" = "yesX" ])
+case ${ac_FERMION_REPS} in
+   yes) AC_DEFINE([ENABLE_FERMION_REPS],[1],[non QCD fermion reps]);;
+esac
+case ${ac_GPARITY} in
+   yes) AC_DEFINE([ENABLE_GPARITY],[1],[fermion actions with GPARITY BCs]);;
+esac
+############### Nc: number of colours of the gauge group (default 3; 2-5 accepted)
+AC_ARG_ENABLE([Nc],
+    [AC_HELP_STRING([--enable-Nc=2|3|4|5], [enable number of colours])],
+    [ac_Nc=${enable_Nc}], [ac_Nc=3])
+
+case ${ac_Nc} in
+    2)
+        AC_DEFINE([Config_Nc],[2],[Gauge group Nc]);;
+    3)
+        AC_DEFINE([Config_Nc],[3],[Gauge group Nc]);;
+    4)
+        AC_DEFINE([Config_Nc],[4],[Gauge group Nc]);;
+    5)
+        AC_DEFINE([Config_Nc],[5],[Gauge group Nc]);;
+    *)
+      AC_MSG_ERROR(["Unsupported gauge group choice Nc = ${ac_Nc}"]);;
+esac
+
 ############### FP16 conversions
 AC_ARG_ENABLE([sfw-fp16],
    [AC_HELP_STRING([--enable-sfw-fp16=yes|no], [enable software fp16 comms])],
@@ -135,18 +171,28 @@ case ${ac_SFW_FP16} in
      AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);;
 esac

-############### SUMMIT JSRUN
-AC_ARG_ENABLE([summit],
-    [AC_HELP_STRING([--enable-summit=yes|no], [enable IBMs jsrun resource manager for SUMMIT])],
-    [ac_SUMMIT=${enable_summit}], [ac_SUMMIT=no])
-case ${ac_SUMMIT} in
-    no);;
+############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons
+AC_ARG_ENABLE([accelerator-cshift],
+    [AC_HELP_STRING([--enable-accelerator-cshift=yes|no], [run cshift on the device])],
+    [ac_ACC_CSHIFT=${enable_accelerator_cshift}], [ac_ACC_CSHIFT=yes])
+
+AC_ARG_ENABLE([ucx-buggy],
+    [AC_HELP_STRING([--enable-ucx-buggy=yes|no], [enable workaround for UCX device buffer bugs])],
+    [ac_UCXBUGGY=${enable_ucx_buggy}], [ac_UCXBUGGY=no])
+
+case ${ac_UCXBUGGY} in
    yes)
-      AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);;
-    *)
-      AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);;
+    ac_ACC_CSHIFT=no;;
+    *);;
 esac

+case ${ac_ACC_CSHIFT} in
+    yes)
+      AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ UCX device buffer bugs are not present]);;
+    *);;
+esac
+
+
 ############### SYCL/CUDA/HIP/none
 AC_ARG_ENABLE([accelerator],
    [AC_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none], [enable none,cuda,sycl,hip acceleration])],
@@ -163,8 +209,9 @@ case ${ac_ACCELERATOR} in
      echo HIP acceleration
      AC_DEFINE([GRID_HIP],[1],[Use HIP offload]);;
    none)
-      echo NO acceleration
-    ;;
+      echo NO acceleration    ;;
+    no)
+      echo NO acceleration    ;;
    *)
      AC_MSG_ERROR(["Acceleration not suppoorted ${ac_ACCELERATOR}"]);;
 esac
@@ -459,27 +506,48 @@ esac
 AM_CXXFLAGS="$SIMD_FLAGS $AM_CXXFLAGS"
 AM_CFLAGS="$SIMD_FLAGS $AM_CFLAGS"

-############### Precision selection
-AC_ARG_ENABLE([precision],
-              [AC_HELP_STRING([--enable-precision=single|double],
-                              [Select default word size of Real])],
-              [ac_PRECISION=${enable_precision}],[ac_PRECISION=double])
+###### PRECISION ALWAYS DOUBLE
+AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] )

-case ${ac_PRECISION} in
-     single)
-       AC_DEFINE([GRID_DEFAULT_PRECISION_SINGLE],[1],[GRID_DEFAULT_PRECISION is SINGLE] )
-     ;;
-     double)
-       AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] )
-     ;;
-     *)
-     AC_MSG_ERROR([${ac_PRECISION} unsupported --enable-precision option]);
-     ;;
+#########################################################
+######################  GRID ALLOCATOR ALIGNMENT ##
+#########################################################
+AC_ARG_ENABLE([alloc-align],[AC_HELP_STRING([--enable-alloc-align=2MB|4k],
+              [Alignment in bytes of GRID Allocator ])],[ac_ALLOC_ALIGN=${enable_alloc_align}],[ac_ALLOC_ALIGN=2MB])
+case ${ac_ALLOC_ALIGN} in
+    4k)
+     AC_DEFINE([GRID_ALLOC_ALIGN],[(4096)],[GRID_ALLOC_ALIGN]);;
+    2MB)
+     AC_DEFINE([GRID_ALLOC_ALIGN],[(2*1024*1024)],[GRID_ALLOC_ALIGN]);;
+    *);;
 esac

-######################  Shared memory allocation technique under MPI3
-AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|shmget|hugetlbfs|shmnone],
-              [Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=shmopen])
+AC_ARG_ENABLE([alloc-cache],[AC_HELP_STRING([--enable-alloc-cache ],
+              [Cache a pool of recent "frees" to reuse])],[ac_ALLOC_CACHE=${enable_alloc_cache}],[ac_ALLOC_CACHE=yes])
+case ${ac_ALLOC_CACHE} in
+    yes)
+     AC_DEFINE([ALLOCATION_CACHE],[1],[ALLOCATION_CACHE]);;
+    *);;
+esac
+
+
+#########################################################
+######################  set GPU device to rank in node ##
+#########################################################
+AC_ARG_ENABLE([setdevice],[AC_HELP_STRING([--enable-setdevice | --disable-setdevice],
+              [Set GPU to rank in node with cudaSetDevice or similar])],[ac_SETDEVICE=${enable_setdevice}],[ac_SETDEVICE=no])
+case ${ac_SETDEVICE} in
+    yes);;
+    no)
+     AC_DEFINE([GRID_DEFAULT_GPU],[1],[GRID_DEFAULT_GPU] )
+    ;;
+esac
+
+#########################################################
+######################  Shared memory intranode #########
+#########################################################
+AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|shmget|hugetlbfs|shmnone|nvlink|no],
+              [Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=no])

 case ${ac_SHM} in

@@ -498,10 +566,14 @@ case ${ac_SHM} in
     AC_DEFINE([GRID_MPI3_SHMGET],[1],[GRID_MPI3_SHMGET] )
     ;;

-     shmnone)
+     shmnone | no)
     AC_DEFINE([GRID_MPI3_SHM_NONE],[1],[GRID_MPI3_SHM_NONE] )
     ;;

+     nvlink)
+     AC_DEFINE([GRID_MPI3_SHM_NVLINK],[1],[GRID_MPI3_SHM_NVLINK] )
+     ;;
+
     hugetlbfs)
     AC_DEFINE([GRID_MPI3_SHMMMAP],[1],[GRID_MPI3_SHMMMAP] )
     ;;
@@ -518,10 +590,32 @@ AC_ARG_ENABLE([shmpath],[AC_HELP_STRING([--enable-shmpath=path],
 	      [ac_SHMPATH=/var/lib/hugetlbfs/global/pagesize-2MB/])
 AC_DEFINE_UNQUOTED([GRID_SHM_PATH],["$ac_SHMPATH"],[Path to a hugetlbfs filesystem for MMAPing])

+############### force MPI in SMP
+AC_ARG_ENABLE([shm-force-mpi],[AC_HELP_STRING([--enable-shm-force-mpi],
+              [Force MPI within shared memory])],[ac_SHM_FORCE_MPI=${enable_shm_force_mpi}],[ac_SHM_FORCE_MPI=no])
+case ${ac_SHM_FORCE_MPI} in
+     yes)
+        AC_DEFINE([GRID_SHM_FORCE_MPI],[1],[GRID_SHM_FORCE_MPI] )
+      ;;
+     *) ;;
+esac
+
+############### MPI threading: allow multiple threads to make MPI calls
+AC_ARG_ENABLE([comms-threads],[AC_HELP_STRING([--enable-comms-threads | --disable-comms-threads],
+              [Use multiple threads in MPI calls])],[ac_COMMS_THREADS=${enable_comms_threads}],[ac_COMMS_THREADS=yes])
+
+case ${ac_COMMS_THREADS} in
+     yes)
+        AC_DEFINE([GRID_COMMS_THREADING],[1],[GRID_COMMS_THREADING] )
+      ;;
+     *) ;;
+esac
+
 ############### communication type selection
 AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto],
              [Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])

+
 case ${ac_COMMS} in
     none)
        AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
@@ -656,6 +750,7 @@ os (target)                 : $target_os
 compiler vendor             : ${ax_cv_cxx_compiler_vendor}
 compiler version            : ${ax_cv_gxx_version}
 ----- BUILD OPTIONS -----------------------------------
+Nc                          : ${ac_Nc}
 SIMD                        : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
 Threading                   : ${ac_openmp}
 Acceleration                : ${ac_ACCELERATOR}
--- a/documentation/GridXcode/readme.md
+++ b/documentation/GridXcode/readme.md
@@ -184,19 +184,19 @@ Below are shown the `configure` script invocations for three recommended configu

 This is the build for every day developing and debugging with Xcode. It uses the Xcode clang c++ compiler, without MPI, and defaults to double-precision. Xcode builds the `Debug` configuration with debug symbols for full debugging:

-    ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --enable-precision=double --prefix=$GridPre/Debug
+    ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --prefix=$GridPre/Debug

 #### 2. `Release`

-Since Grid itself doesn't really have debug configurations, the release build is recommended to be the same as `Debug`, except using single-precision (handy for validation):
+Since Grid itself doesn't really have debug configurations, the release build is recommended to be the same as `Debug`:

-    ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --enable-precision=single --prefix=$GridPre/Release
+    ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --prefix=$GridPre/Release

 #### 3. `MPIDebug`

 Debug configuration with MPI:

-    ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx --enable-precision=double --prefix=$GridPre/MPIDebug
+    ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx --prefix=$GridPre/MPIDebug

 ### 5.3 Build Grid

--- a/documentation/manual.rst
+++ b/documentation/manual.rst
@@ -178,15 +178,10 @@ Then enter the cloned directory and set up the build system::
 Now you can execute the `configure` script to generate makefiles (here from a build directory)::

  mkdir build; cd build
-  ../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto \
+  ../configure --enable-simd=AVX --enable-comms=mpi-auto \
      --prefix=<path>

-where::
-
-  --enable-precision=single|double
-
-sets the **default precision**. Since this is largely a benchmarking convenience, it is anticipated that the default precision may be removed in future implementations,
-and that explicit type selection be made at all points. Naturally, most code will be type templated in any case.::
+::

   --enable-simd=GEN|SSE4|AVX|AVXFMA|AVXFMA4|AVX2|AVX512|NEONv8|QPX

@@ -236,7 +231,7 @@ Detailed build configuration options
  --enable-mkl[=path]                     use Intel MKL for FFT (and LAPACK if enabled) routines. A UNIX prefix containing the library can be specified (optional).
  --enable-simd=code                      setup Grid for the SIMD target `<code>`(default: `GEN`). A list of possible SIMD targets is detailed in a section below.
  --enable-gen-simd-width=size            select the size (in bytes) of the generic SIMD vector type (default: 32 bytes). E.g. SSE 128 bit corresponds to 16 bytes.
-  --enable-precision=single|double        set the default precision (default: `double`).
+  --enable-precision=single|double        **Deprecated option**: no longer has any effect; the default precision is always `double`.
  --enable-comms=mpi|none                 use `<comm>` for message passing (default: `none`).
  --enable-rng=sitmo|ranlux48|mt19937     choose the RNG (default: `sitmo`).
  --disable-timers                        disable system dependent high-resolution timers.
@@ -304,8 +299,7 @@ Build setup for Intel Knights Landing platform

 The following configuration is recommended for the Intel Knights Landing platform::

-  ../configure --enable-precision=double\
-             --enable-simd=KNL        \
+  ../configure --enable-simd=KNL        \
             --enable-comms=mpi-auto  \
             --enable-mkl             \
             CXX=icpc MPICXX=mpiicpc
@@ -314,8 +308,7 @@ The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.

 If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use::

-  ../configure --enable-precision=double\
-             --enable-simd=KNL        \
+  ../configure --enable-simd=KNL        \
             --enable-comms=mpi       \
             --enable-mkl             \
             CXX=CC CC=cc
@@ -332,8 +325,7 @@ presently performs better with use of more than one rank per node, using shared
 for interior communication.
 We recommend four ranks per node for best performance, but optimum is local volume dependent. ::

-   ../configure --enable-precision=double\
-             --enable-simd=KNL        \
+   ../configure --enable-simd=KNL        \
             --enable-comms=mpi-auto \
             --enable-mkl             \
             CC=icpc MPICXX=mpiicpc 
@@ -343,8 +335,7 @@ Build setup for Intel Haswell Xeon platform

 The following configuration is recommended for the Intel Haswell platform::

-  ../configure --enable-precision=double\
-             --enable-simd=AVX2       \
+  ../configure --enable-simd=AVX2       \
             --enable-comms=mpi-auto \
             --enable-mkl             \
             CXX=icpc MPICXX=mpiicpc
@@ -360,8 +351,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.

 If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use::

-  ../configure --enable-precision=double\
-             --enable-simd=AVX2       \
+  ../configure --enable-simd=AVX2       \
             --enable-comms=mpi      \
             --enable-mkl             \
             CXX=CC CC=cc
@@ -379,8 +369,7 @@ Build setup for Intel Skylake Xeon platform

 The following configuration is recommended for the Intel Skylake platform::

-  ../configure --enable-precision=double\
-             --enable-simd=AVX512     \
+  ../configure --enable-simd=AVX512     \
             --enable-comms=mpi      \
             --enable-mkl             \
             CXX=mpiicpc
@@ -396,8 +385,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed.

 If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use::

-  ../configure --enable-precision=double\
-             --enable-simd=AVX512     \
+  ../configure --enable-simd=AVX512     \
             --enable-comms=mpi      \
             --enable-mkl             \
             CXX=CC CC=cc
@@ -422,8 +410,7 @@ and 8 threads per rank.
 The following configuration is recommended for the AMD EPYC platform::


-  ../configure --enable-precision=double\
-             --enable-simd=AVX2       \
+  ../configure --enable-simd=AVX2       \
             --enable-comms=mpi \
             CXX=mpicxx 

--- a/scripts/filelist
+++ b/scripts/filelist
@@ -6,13 +6,27 @@ home=`pwd`
 cd $home/Grid
 HFILES=`find . -type f -name '*.h' -not -name '*Hdf5*' -not -path '*/gamma-gen/*' -not -path '*/Old/*' -not -path '*/Eigen/*'`
 HFILES="$HFILES"
-CCFILES=`find . -name '*.cc' -not -path '*/gamma-gen/*' -not -name '*Communicator*.cc' -not -name '*SharedMemory*.cc' -not -name '*Hdf5*'`
+CCFILES=`find . -name '*.cc' -not -path '*/instantiation/*/*' -not -path '*/gamma-gen/*' -not -name '*Communicator*.cc' -not -name '*SharedMemory*.cc' -not -name '*Hdf5*'`
+
+
+ZWILS_FERMION_FILES=` find . -name '*.cc' -path '*/instantiation/*' -path '*/instantiation/ZWilsonImpl*' `
+WILS_FERMION_FILES=`  find . -name '*.cc' -path '*/instantiation/*' -path '*/instantiation/WilsonImpl*' `
+STAG_FERMION_FILES=`  find . -name '*.cc' -path '*/instantiation/*' -path '*/instantiation/Staggered*' `
+GP_FERMION_FILES=`    find . -name '*.cc' -path '*/instantiation/*' -path '*/instantiation/Gparity*' `
+ADJ_FERMION_FILES=`   find . -name '*.cc' -path '*/instantiation/*' -path '*/instantiation/WilsonAdj*' `
+TWOIND_FERMION_FILES=`find . -name '*.cc' -path '*/instantiation/*' -path '*/instantiation/WilsonTwoIndex*'`
+
 HPPFILES=`find . -type f -name '*.hpp'`
 echo HFILES=$HFILES $HPPFILES > Make.inc
 echo >> Make.inc
 echo CCFILES=$CCFILES >> Make.inc

-
+echo ZWILS_FERMION_FILES=$ZWILS_FERMION_FILES >> Make.inc
+echo WILS_FERMION_FILES=$WILS_FERMION_FILES   >> Make.inc
+echo STAG_FERMION_FILES=$STAG_FERMION_FILES   >> Make.inc
+echo GP_FERMION_FILES=$GP_FERMION_FILES   >> Make.inc
+echo ADJ_FERMION_FILES=$ADJ_FERMION_FILES   >> Make.inc
+echo TWOIND_FERMION_FILES=$TWOIND_FERMION_FILES   >> Make.inc

 # tests Make.inc
 cd $home/tests
@@ -26,11 +40,10 @@ for subdir in $dirs; do
    echo "tests-local: ${TESTLIST} " > Make.inc
    echo ${PREF}_PROGRAMS = ${TESTLIST} >> Make.inc
    echo >> Make.inc
-    HADLINK=`[ $subdir = './hadrons' ] && echo '-lHadrons '`
    for f in $TESTS; do
 	   BNAME=`basename $f .cc`
 	   echo ${BNAME}_SOURCES=$f >> Make.inc
-	   echo ${BNAME}_LDADD=${HADLINK}-lGrid  >> Make.inc
+	   echo ${BNAME}_LDADD='$(top_builddir)/Grid/libGrid.a' >> Make.inc
 	   echo >> Make.inc
    done
    if [ $subdir != '.' ]; then
@@ -49,7 +62,7 @@ echo >> Make.inc
 for f in $TESTS; do
    BNAME=`basename $f .cc`
    echo ${BNAME}_SOURCES=$f  >> Make.inc
-    echo ${BNAME}_LDADD=-lGrid>> Make.inc
+    echo ${BNAME}_LDADD='$(top_builddir)/Grid/libGrid.a' >> Make.inc
    echo >> Make.inc
 done
 cd ..
@@ -65,7 +78,7 @@ echo >> Make.inc
 for f in $TESTS; do
    BNAME=`basename $f .cc`
    echo ${BNAME}_SOURCES=$f  >> Make.inc
-    echo ${BNAME}_LDADD=-lGrid>> Make.inc
+    echo ${BNAME}_LDADD='$(top_builddir)/Grid/libGrid.a'>> Make.inc
    echo >> Make.inc
 done
 cd ..
--- a/tests/IO/Test_ildg_io.cc
+++ b/tests/IO/Test_ildg_io.cc
@@ -69,7 +69,7 @@ int main (int argc, char ** argv)

  std::vector<LatticeColourMatrix> U(4,&Fine);
  
-  SU3::HotConfiguration(pRNGa,Umu);
+  SU<Nc>::HotConfiguration(pRNGa,Umu);


  FieldMetaData header;
--- a/tests/IO/Test_nersc_io.cc
+++ b/tests/IO/Test_nersc_io.cc
@@ -84,7 +84,7 @@ int main (int argc, char ** argv)

  std::vector<LatticeColourMatrix> U(4,&Fine);
  
-  SU3::HotConfiguration(pRNGa,Umu);
+  SU<Nc>::HotConfiguration(pRNGa,Umu);

  FieldMetaData header;
  std::string file("./ckpoint_lat.4000");
--- a/tests/Test_cayley_even_odd_vec.cc
+++ b/tests/Test_cayley_even_odd_vec.cc
@@ -80,7 +80,7 @@ int main (int argc, char ** argv)
  GridParallelRNG          sRNG5(sFGrid);  sRNG5.SeedFixedIntegers(seeds5);

  LatticeGaugeField Umu(UGrid);
-  SU3::HotConfiguration(RNG4,Umu);
+  SU<Nc>::HotConfiguration(RNG4,Umu);

  RealD mass=0.1;
  RealD M5  =1.8;
--- a/tests/Test_compressed_lanczos_hot_start.cc
+++ b/tests/Test_compressed_lanczos_hot_start.cc
@@ -202,7 +202,7 @@ int main (int argc, char ** argv) {
  std::vector<int> seeds4({1,2,3,4});
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
  LatticeGaugeField Umu(UGrid);
-  SU3::HotConfiguration(RNG4,Umu);
+  SU<Nc>::HotConfiguration(RNG4,Umu);
  //  FieldMetaData header;
  //  NerscIO::readConfiguration(Umu,header,Params.config);

--- a/tests/Test_dwf_mixedcg_prec.cc
+++ b/tests/Test_dwf_mixedcg_prec.cc
@@ -71,7 +71,7 @@ int main (int argc, char ** argv)
  LatticeGaugeFieldD Umu(UGrid);
  LatticeGaugeFieldF Umu_f(UGrid_f); 
  
-  SU3::HotConfiguration(RNG4,Umu);
+  SU<Nc>::HotConfiguration(RNG4,Umu);

  precisionChange(Umu_f,Umu);
  
--- a/tests/Test_dwf_mixedcg_prec_halfcomms.cc
+++ b/tests/Test_dwf_mixedcg_prec_halfcomms.cc
@@ -69,7 +69,7 @@ int main (int argc, char ** argv)
  LatticeGaugeFieldD Umu(UGrid);
  LatticeGaugeFieldF Umu_f(UGrid_f); 
  
-  SU3::HotConfiguration(RNG4,Umu);
+  SU<Nc>::HotConfiguration(RNG4,Umu);

  precisionChange(Umu_f,Umu);
  
--- a/tests/core/Test_cf_coarsen_support.cc
+++ b/tests/core/Test_cf_coarsen_support.cc
@@ -64,7 +64,7 @@ int main (int argc, char ** argv)
  LatticeFermion    ref(FGrid); ref=Zero();
  LatticeFermion    tmp(FGrid);
  LatticeFermion    err(FGrid);
-  LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
+  LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);

  std::vector<LatticeColourMatrix> U(4,UGrid);
  for(int mu=0;mu<Nd;mu++){
--- a/tests/core/Test_checker.cc
+++ b/tests/core/Test_checker.cc
@@ -131,7 +131,7 @@ int main (int argc, char ** argv)
  // LatticeFermion result(FGrid); result=Zero();
  // LatticeGaugeField Umu(UGrid); 

-  // SU3::HotConfiguration(RNG4,Umu);
+  // SU<Nc>::HotConfiguration(RNG4,Umu);

  // std::vector<LatticeColourMatrix> U(4,UGrid);
  // for(int mu=0;mu<Nd;mu++){
--- a/tests/core/Test_contfrac_even_odd.cc
+++ b/tests/core/Test_contfrac_even_odd.cc
@@ -69,7 +69,7 @@ int main (int argc, char ** argv)
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);

-  LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
+  LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
  std::vector<LatticeColourMatrix> U(4,UGrid);

  RealD mass=0.1;
--- a/tests/core/Test_dwf_eofa_even_odd.cc
+++ b/tests/core/Test_dwf_eofa_even_odd.cc
@@ -73,7 +73,7 @@ int main (int argc, char ** argv)
    LatticeFermion    ref   (FGrid); ref = Zero();
    LatticeFermion    tmp   (FGrid); tmp = Zero();
    LatticeFermion    err   (FGrid); err = Zero();
-    LatticeGaugeField Umu   (UGrid); SU3::HotConfiguration(RNG4, Umu);
+    LatticeGaugeField Umu   (UGrid); SU<Nc>::HotConfiguration(RNG4, Umu);
    std::vector<LatticeColourMatrix> U(4,UGrid);

    // Only one non-zero (y)
--- a/tests/core/Test_dwf_even_odd.cc
+++ b/tests/core/Test_dwf_even_odd.cc
@@ -72,7 +72,7 @@ int main (int argc, char ** argv)
  LatticeFermion    ref(FGrid);    ref=Zero();
  LatticeFermion    tmp(FGrid);    tmp=Zero();
  LatticeFermion    err(FGrid);    tmp=Zero();
-  LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
+  LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
  std::vector<LatticeColourMatrix> U(4,UGrid);

  // Only one non-zero (y)
--- a/tests/core/Test_fft.cc
+++ b/tests/core/Test_fft.cc
@@ -138,7 +138,7 @@ int main (int argc, char ** argv)

  LatticeGaugeFieldD Umu(&GRID);

-  SU3::ColdConfiguration(pRNG,Umu); // Unit gauge
+  SU<Nc>::ColdConfiguration(pRNG,Umu); // Unit gauge
  //  Umu=Zero();
  ////////////////////////////////////////////////////
  // Wilson test
--- a/tests/core/Test_fft_gfix.cc
+++ b/tests/core/Test_fft_gfix.cc
@@ -73,11 +73,11 @@ int main (int argc, char ** argv)
  LatticeColourMatrix   xform2(&GRID); // Gauge xform
  LatticeColourMatrix   xform3(&GRID); // Gauge xform
  
-  SU3::ColdConfiguration(pRNG,Umu); // Unit gauge
+  SU<Nc>::ColdConfiguration(pRNG,Umu); // Unit gauge
  Uorg=Umu;
  Urnd=Umu;

-  SU3::RandomGaugeTransform(pRNG,Urnd,g); // Unit gauge
+  SU<Nc>::RandomGaugeTransform(pRNG,Urnd,g); // Unit gauge

  Real plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
  std::cout << " Initial plaquette "<<plaq << std::endl;
@@ -121,7 +121,7 @@ int main (int argc, char ** argv)
  std::cout<< "* Testing non-unit configuration                                *" <<std::endl;
  std::cout<< "*****************************************************************" <<std::endl;

-  SU3::HotConfiguration(pRNG,Umu); // Unit gauge
+  SU<Nc>::HotConfiguration(pRNG,Umu); // Hot start: non-unit gauge

  plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
  std::cout << " Initial plaquette "<<plaq << std::endl;
@@ -136,7 +136,7 @@ int main (int argc, char ** argv)
  std::cout<< "*****************************************************************" <<std::endl;

  Umu=Urnd;
-  SU3::HotConfiguration(pRNG,Umu); // Unit gauge
+  SU<Nc>::HotConfiguration(pRNG,Umu); // Unit gauge

  plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
  std::cout << " Initial plaquette "<<plaq << std::endl;
--- a/tests/core/Test_gparity.cc
+++ b/tests/core/Test_gparity.cc
@@ -114,7 +114,7 @@ int main (int argc, char ** argv)
  GridParallelRNG          RNG4_2f(UGrid_2f);  RNG4_2f.SeedFixedIntegers(seeds4);

  GparityGaugeField Umu_2f(UGrid_2f);
-  SU3::HotConfiguration(RNG4_2f,Umu_2f);
+  SU<Nc>::HotConfiguration(RNG4_2f,Umu_2f);

  StandardFermionField    src   (FGrid_2f); 
  StandardFermionField    tmpsrc(FGrid_2f); 
--- a/tests/core/Test_gpwilson_even_odd.cc
+++ b/tests/core/Test_gpwilson_even_odd.cc
@@ -61,7 +61,7 @@ int main (int argc, char ** argv)
  FermionField    ref(&Grid);    ref=Zero();
  FermionField    tmp(&Grid);    tmp=Zero();
  FermionField    err(&Grid);    tmp=Zero();
-  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
+  LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu);
  std::vector<LatticeColourMatrix> U(4,&Grid);

  double volume=1;
--- a/tests/core/Test_lie_generators.cc
+++ b/tests/core/Test_lie_generators.cc
@@ -66,7 +66,7 @@ int main(int argc, char** argv) {

  std::cout << GridLogMessage << "*********************************************"
            << std::endl;
-  std::cout << GridLogMessage << "* Generators for SU(3)" << std::endl;
+  std::cout << GridLogMessage << "* Generators for SU(Nc)" << std::endl;
  std::cout << GridLogMessage << "*********************************************"
            << std::endl;
  SU3::printGenerators();
@@ -114,8 +114,8 @@ int main(int argc, char** argv) {

  
  LatticeGaugeField U(grid), V(grid);
-  SU<Nc>::HotConfiguration<LatticeGaugeField>(gridRNG, U);
-  SU<Nc>::HotConfiguration<LatticeGaugeField>(gridRNG, V);
+  SU3::HotConfiguration<LatticeGaugeField>(gridRNG, U);
+  SU3::HotConfiguration<LatticeGaugeField>(gridRNG, V);

  // Adjoint representation
  // Test group structure
@@ -123,8 +123,8 @@ int main(int argc, char** argv) {
  LatticeGaugeField UV(grid);
  UV = Zero();
  for (int mu = 0; mu < Nd; mu++) {
-    SU<Nc>::LatticeMatrix Umu = peekLorentz(U,mu);
-    SU<Nc>::LatticeMatrix Vmu = peekLorentz(V,mu);
+    SU3::LatticeMatrix Umu = peekLorentz(U,mu);
+    SU3::LatticeMatrix Vmu = peekLorentz(V,mu);
    pokeLorentz(UV,Umu*Vmu, mu);
  }

@@ -151,16 +151,16 @@ int main(int argc, char** argv) {

  // Check correspondence of algebra and group transformations
  // Create a random vector
-  SU<Nc>::LatticeAlgebraVector h_adj(grid);
+  SU3::LatticeAlgebraVector h_adj(grid);
  typename AdjointRep<Nc>::LatticeMatrix Ar(grid);
  random(gridRNG,h_adj);
  h_adj = real(h_adj);
  SU_Adjoint<Nc>::AdjointLieAlgebraMatrix(h_adj,Ar);

  // Re-extract h_adj
-  SU<Nc>::LatticeAlgebraVector h_adj2(grid);
+  SU3::LatticeAlgebraVector h_adj2(grid);
  SU_Adjoint<Nc>::projectOnAlgebra(h_adj2, Ar);
-  SU<Nc>::LatticeAlgebraVector h_diff = h_adj - h_adj2;
+  SU3::LatticeAlgebraVector h_diff = h_adj - h_adj2;
  std::cout << GridLogMessage << "Projections structure check vector difference (Adjoint representation) : " << norm2(h_diff) << std::endl;

  // Exponentiate
@@ -183,14 +183,14 @@ int main(int argc, char** argv) {
      

  // Construct the fundamental matrix in the group
-  SU<Nc>::LatticeMatrix Af(grid);
-  SU<Nc>::FundamentalLieAlgebraMatrix(h_adj,Af);
-  SU<Nc>::LatticeMatrix Ufund(grid);
+  SU3::LatticeMatrix Af(grid);
+  SU3::FundamentalLieAlgebraMatrix(h_adj,Af);
+  SU3::LatticeMatrix Ufund(grid);
  Ufund  = expMat(Af, 1.0, 16);
  // Check unitarity
-  SU<Nc>::LatticeMatrix uno_f(grid);
+  SU3::LatticeMatrix uno_f(grid);
  uno_f = 1.0;
-  SU<Nc>::LatticeMatrix UnitCheck(grid);
+  SU3::LatticeMatrix UnitCheck(grid);
  UnitCheck = Ufund * adj(Ufund) - uno_f;
  std::cout << GridLogMessage << "unitarity check 1: " << norm2(UnitCheck)
            << std::endl;
@@ -311,14 +311,14 @@ int main(int argc, char** argv) {
  // Test group structure
  // (U_f * V_f)_r = U_r * V_r
  LatticeGaugeField U2(grid), V2(grid);
-  SU<Nc>::HotConfiguration<LatticeGaugeField>(gridRNG, U2);
-  SU<Nc>::HotConfiguration<LatticeGaugeField>(gridRNG, V2);
+  SU3::HotConfiguration<LatticeGaugeField>(gridRNG, U2);
+  SU3::HotConfiguration<LatticeGaugeField>(gridRNG, V2);
  
  LatticeGaugeField UV2(grid);
  UV2 = Zero();
  for (int mu = 0; mu < Nd; mu++) {
-    SU<Nc>::LatticeMatrix Umu2 = peekLorentz(U2,mu);
-    SU<Nc>::LatticeMatrix Vmu2 = peekLorentz(V2,mu);
+    SU3::LatticeMatrix Umu2 = peekLorentz(U2,mu);
+    SU3::LatticeMatrix Vmu2 = peekLorentz(V2,mu);
    pokeLorentz(UV2,Umu2*Vmu2, mu);
  }
  
@@ -345,16 +345,16 @@ int main(int argc, char** argv) {
  
  // Check correspondence of algebra and group transformations
  // Create a random vector
-  SU<Nc>::LatticeAlgebraVector h_sym(grid);
+  SU3::LatticeAlgebraVector h_sym(grid);
  typename TwoIndexRep< Nc, Symmetric>::LatticeMatrix Ar_sym(grid);
  random(gridRNG,h_sym);
  h_sym = real(h_sym);
  SU_TwoIndex<Nc,Symmetric>::TwoIndexLieAlgebraMatrix(h_sym,Ar_sym);
  
  // Re-extract h_sym
-  SU<Nc>::LatticeAlgebraVector h_sym2(grid);
+  SU3::LatticeAlgebraVector h_sym2(grid);
  SU_TwoIndex< Nc, Symmetric>::projectOnAlgebra(h_sym2, Ar_sym);
-  SU<Nc>::LatticeAlgebraVector h_diff_sym = h_sym - h_sym2;
+  SU3::LatticeAlgebraVector h_diff_sym = h_sym - h_sym2;
  std::cout << GridLogMessage << "Projections structure check vector difference (Two Index Symmetric): " << norm2(h_diff_sym) << std::endl;

  
@@ -379,11 +379,11 @@ int main(int argc, char** argv) {
  
  
  // Construct the fundamental matrix in the group
-  SU<Nc>::LatticeMatrix Af_sym(grid);
-  SU<Nc>::FundamentalLieAlgebraMatrix(h_sym,Af_sym);
-  SU<Nc>::LatticeMatrix Ufund2(grid);
+  SU3::LatticeMatrix Af_sym(grid);
+  SU3::FundamentalLieAlgebraMatrix(h_sym,Af_sym);
+  SU3::LatticeMatrix Ufund2(grid);
  Ufund2  = expMat(Af_sym, 1.0, 16);
-  SU<Nc>::LatticeMatrix UnitCheck2(grid);
+  SU3::LatticeMatrix UnitCheck2(grid);
  UnitCheck2 = Ufund2 * adj(Ufund2) - uno_f;
  std::cout << GridLogMessage << "unitarity check 1: " << norm2(UnitCheck2)
      << std::endl;
@@ -421,14 +421,14 @@ int main(int argc, char** argv) {
  // Test group structure
  // (U_f * V_f)_r = U_r * V_r
  LatticeGaugeField U2A(grid), V2A(grid);
-  SU<Nc>::HotConfiguration<LatticeGaugeField>(gridRNG, U2A);
-  SU<Nc>::HotConfiguration<LatticeGaugeField>(gridRNG, V2A);
+  SU3::HotConfiguration<LatticeGaugeField>(gridRNG, U2A);
+  SU3::HotConfiguration<LatticeGaugeField>(gridRNG, V2A);
  
  LatticeGaugeField UV2A(grid);
  UV2A = Zero();
  for (int mu = 0; mu < Nd; mu++) {
-    SU<Nc>::LatticeMatrix Umu2A = peekLorentz(U2,mu);
-    SU<Nc>::LatticeMatrix Vmu2A = peekLorentz(V2,mu);
+    SU3::LatticeMatrix Umu2A = peekLorentz(U2,mu);
+    SU3::LatticeMatrix Vmu2A = peekLorentz(V2,mu);
    pokeLorentz(UV2A,Umu2A*Vmu2A, mu);
  }
  
@@ -455,16 +455,16 @@ int main(int argc, char** argv) {
  
  // Check correspondence of algebra and group transformations
  // Create a random vector
-  SU<Nc>::LatticeAlgebraVector h_Asym(grid);
+  SU3::LatticeAlgebraVector h_Asym(grid);
  typename TwoIndexRep< Nc, AntiSymmetric>::LatticeMatrix Ar_Asym(grid);
  random(gridRNG,h_Asym);
  h_Asym = real(h_Asym);
  SU_TwoIndex< Nc, AntiSymmetric>::TwoIndexLieAlgebraMatrix(h_Asym,Ar_Asym);
  
  // Re-extract h_sym
-  SU<Nc>::LatticeAlgebraVector h_Asym2(grid);
+  SU3::LatticeAlgebraVector h_Asym2(grid);
  SU_TwoIndex< Nc, AntiSymmetric>::projectOnAlgebra(h_Asym2, Ar_Asym);
-  SU<Nc>::LatticeAlgebraVector h_diff_Asym = h_Asym - h_Asym2;
+  SU3::LatticeAlgebraVector h_diff_Asym = h_Asym - h_Asym2;
  std::cout << GridLogMessage << "Projections structure check vector difference (Two Index anti-Symmetric): " << norm2(h_diff_Asym) << std::endl;

  
@@ -489,11 +489,11 @@ int main(int argc, char** argv) {
  
  
  // Construct the fundamental matrix in the group
-  SU<Nc>::LatticeMatrix Af_Asym(grid);
-  SU<Nc>::FundamentalLieAlgebraMatrix(h_Asym,Af_Asym);
-  SU<Nc>::LatticeMatrix Ufund2A(grid);
+  SU3::LatticeMatrix Af_Asym(grid);
+  SU3::FundamentalLieAlgebraMatrix(h_Asym,Af_Asym);
+  SU3::LatticeMatrix Ufund2A(grid);
  Ufund2A  = expMat(Af_Asym, 1.0, 16);
-  SU<Nc>::LatticeMatrix UnitCheck2A(grid);
+  SU3::LatticeMatrix UnitCheck2A(grid);
  UnitCheck2A = Ufund2A * adj(Ufund2A) - uno_f;
  std::cout << GridLogMessage << "unitarity check 1: " << norm2(UnitCheck2A)
      << std::endl;
--- a/tests/core/Test_main.cc
+++ b/tests/core/Test_main.cc
@@ -444,7 +444,7 @@ int main(int argc, char **argv) {
      // Lattice 12x12 GEMM
      scFooBar = scFoo * scBar;

-      // Benchmark some simple operations LatticeSU3 * Lattice SU3.
+      // Benchmark some simple operations LatticeSU<Nc> * Lattice SU<Nc>.
      double t0, t1, flops;
      double bytes;
      int ncall = 5000;
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Peter Boyle	3c23a947cc	Fixed test for very much non-unit det	2021-01-15 09:16:02 -05:00
Peter Boyle	56111bb823	Merge branch 'develop' into feature/conjugate-bc-dirs	2021-01-14 21:01:22 -05:00
Peter Boyle	99445673f6	Gparity fix, and plaquette IO	2021-01-14 21:00:36 -05:00
Peter Boyle	97a59643f7	Red black coarse space	2021-01-14 20:49:13 -05:00
Peter Boyle	579595f547	Red black on coarse space	2021-01-14 20:48:35 -05:00
Peter Boyle	281ac5fc12	Red black support on coars	2021-01-14 20:48:08 -05:00
Peter Boyle	d8fa903b02	G5 on coarse spaces	2021-01-14 20:47:28 -05:00
Peter Boyle	eaff0f3aeb	Gamma5 on coaree spaces	2021-01-14 20:46:58 -05:00
Peter Boyle	e8e20c01b2	Coarsened vector test	2021-01-14 20:46:21 -05:00
Peter Boyle	a4afc3ea2a	Red black coarse space	2021-01-14 20:44:16 -05:00
Peter Boyle	3fe75bc7cb	Merge pull request #329 from nmeyer-ur/feature/a64fx-3 Revised dslash/dwf kernels for A64FX	2020-12-20 08:17:15 -05:00
Nils Meyer	45d49d8648	clean up	2020-12-19 03:35:18 +01:00
Nils Meyer	6013183361	removed Asm impls	2020-12-19 03:25:01 +01:00
Nils Meyer	4b882e8056	fixed lost bracket	2020-12-19 03:09:20 +01:00
Nils Meyer	3f9ae6e7e7	Merge branch 'develop' into feature/a64fx-3	2020-12-19 02:37:11 +01:00
Nils Meyer	909acd55cd	vnum variant for prefetches	2020-12-19 02:00:22 +01:00
Nils Meyer	4dd9e39e0d	up to +36% performance gain for dslash/dwf on QPACE 4 using GCC 10.1.1	2020-12-19 00:54:31 +01:00
Peter Boyle	7adb253e25	Merge pull request #328 from mmphys/feature/mrespatch Enable existing conserved current code for CUDA	2020-12-17 11:10:29 -05:00
Michael Marshall	873519e960	Enable existing conserved current code for CUDA (compiles OK for CUDA 10.1). Add option to Test_cayley_mres to load a configuration	2020-12-14 16:06:10 +00:00
Peter Boyle	9aec4a3c26	SYCL	2020-12-10 02:11:17 -08:00
Peter Boyle	70510d151b	Merge pull request #327 from paboyle/feature/gparity_twist_GPU Feature/gparity twist gpu	2020-12-07 12:02:20 -05:00
Christopher Kelly	9e7bacb5a4	Merge branch 'develop' into feature/gparity_twist_GPU	2020-12-07 11:55:39 -05:00
Christopher Kelly	2ef1fa66a8	Improved performance of G-parity kernel for GPUs by simplifying multLink implementation	2020-12-07 11:53:35 -05:00
Peter Boyle	cf76741ec6	Intel DPCPP Gold happy now (compiles all, runs Benchmark_dwf_fp32 )	2020-12-03 03:47:11 -08:00
Peter Boyle	497e7c1c40	Duplicate code	2020-12-02 17:55:30 -08:00
Peter Boyle	888eacd3b8	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2020-11-24 21:46:33 -05:00
Peter Boyle	321f0f51b5	Project to SU(N)	2020-11-24 21:46:10 -05:00
Peter Boyle	30ad9578a2	Merge branch 'lehner-feature/gpt' into develop	2020-11-24 06:10:24 -05:00
Peter Boyle	9dce101586	Merge branch 'feature/gpt' of https://github.com/lehner/Grid into lehner-feature/gpt	2020-11-24 06:10:16 -05:00
Peter Boyle	97e264d0ff	Christoph's changes	2020-11-23 15:46:11 +00:00
Peter Boyle	683a5e5bf5	Stencil use host vector for integera table on enable-shared=no and mirror it on device	2020-11-23 15:39:51 +00:00
Peter Boyle	d4861a362c	Stencil use non-UVM memory for look up table on enable-shared=no	2020-11-23 15:38:49 +00:00
Peter Boyle	5ff3eae027	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2020-11-20 13:14:44 -05:00
Peter Boyle	147dc15d26	Update	2020-11-20 13:13:59 -05:00
Christoph Lehner	c61ea72949	Merge pull request #19 from paboyle/develop Sync	2020-11-20 17:31:13 +01:00
Peter Boyle	86e8b9fe38	ALLOC_ALIGN removed	2020-11-20 17:07:16 +01:00
Peter Boyle	612e468889	Configurable ALLOC_ALIGN and ALLOC_CACHE	2020-11-20 16:48:28 +01:00
Christoph Lehner	4ea8d128c2	Merge pull request #18 from paboyle/develop Sync	2020-11-20 15:36:50 +01:00
Peter Boyle	e49b7f2f88	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2020-11-19 19:24:41 +01:00
Peter Boyle	aace3d47b9	partial work in progress	2020-11-19 19:24:14 +01:00
Peter Boyle	d5049949a4	Starting to fix reunitarise	2020-11-19 19:23:41 +01:00
Peter Boyle	f1c7480e3c	Warning remove	2020-11-19 19:23:03 +01:00
Peter Boyle	5adae5d6ff	Unused variable remove	2020-11-19 19:22:12 +01:00
Peter Boyle	a8412ace05	Merge pull request #317 from i-kanamori/develop adding an error check for input: Parameters.StartingType	2020-11-18 23:09:40 -05:00
Peter Boyle	9fd1c2ad4b	Merge pull request #325 from DanielRichtmann/feature/threaded-clover-inversion Threaded clover term inversion	2020-11-18 23:08:37 -05:00
Peter Boyle	4cf3575353	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2020-11-18 03:07:36 +00:00
Peter Boyle	804a810d68	Wildcard mismatch	2020-11-18 03:06:53 +00:00
Peter Boyle	8fcb392e24	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2020-11-17 04:51:31 -08:00
Peter Boyle	dd8d70eeff	Build without LIME	2020-11-17 04:41:15 -08:00
Peter Boyle	aa8aba6543	--shm-force-mpi	2020-11-16 20:15:50 -05:00
Peter Boyle	13df14f96e	Switch off SHM paths with --disable-shm	2020-11-16 18:07:15 -05:00
Peter Boyle	3aab983760	Flop count set as in DiRAC-ITT-2020 (mistaken 20% low, but must maintain consistency)	2020-11-16 17:13:58 +01:00
Peter Boyle	9c4dcc5ea3	Merge branch 'master' into develop	2020-11-16 16:34:57 +01:00
Peter Boyle	a1063ddbb9	Update options and simplify	2020-11-13 04:11:03 +01:00
Peter Boyle	18ef8056ec	Hide Shared Memory	2020-11-13 04:10:40 +01:00
Peter Boyle	1c673977fa	Must ask for COMMMS_THREADS	2020-11-13 03:59:36 +01:00
Peter Boyle	e9bc748828	Useful GPU machine benchmark for GDR used to shakeout Booster at Juelich - see slack earlyaccess channel	2020-11-13 03:58:34 +01:00
Peter Boyle	f48156529b	Work on 2,2,2,8 ranks	2020-11-13 03:57:58 +01:00
Peter Boyle	d05ce01809	TOFU behaviour now optional THREAD_MULTIPLE or THREAD_SERIALIZED	2020-11-13 03:52:19 +01:00
Peter Boyle	cf23eff60e	Device to Device, Memset, cannot assume UVM == Communicable	2020-11-13 03:51:08 +01:00
Peter Boyle	6e313575be	Use of default GPU is behaviour, not a system property. Move Summit specific to configure.ac	2020-11-13 03:50:16 +01:00
Peter Boyle	b13d1f7238	TOFU compat flag to help Isaaku	2020-11-13 03:49:44 +01:00
Peter Boyle	b5e7945dd9	Option for host or device Cshift implementation	2020-11-13 01:38:54 +01:00
Peter Boyle	7535566f54	Option for bounce through the SHM buffer	2020-11-12 22:54:27 +01:00
Peter Boyle	50b808ab33	Configure option between host and device	2020-11-12 22:28:12 +01:00
Peter Boyle	f16c2665f5	Host memory explict	2020-11-12 20:29:58 +01:00
Peter Boyle	41e28015ae	Volume divisible guarantee	2020-11-07 13:32:16 +01:00
Peter Boyle	a0ccbb3bd6	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2020-11-01 01:16:35 +00:00
Peter Boyle	5eeabaa2bb	HIP fix	2020-11-01 01:16:01 +00:00
Peter Boyle	00d0d6d008	Hip Free managed	2020-10-31 18:14:31 -04:00
Peter Boyle	537a9f7030	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2020-10-31 18:13:30 -04:00
Peter Boyle	cc9c993f74	Project on group fix on GPU tracked to reciprocal sqrt collision between CUDA and Grid rsqrt	2020-10-31 18:12:47 -04:00
Peter Boyle	d10422ded8	Test project on group	2020-10-31 18:12:30 -04:00
Peter Boyle	f313565a3c	HiP compile	2020-10-31 12:12:40 +00:00
Daniel Richtmann	b3881d2636	Thread inversion of clover term	2020-10-30 16:18:58 +01:00
Antonin Portelli	61d5860b46	Merge pull request #318 from rrhodgson/feature/BaryonSpinMat Added untraced baryon contraction code	2020-10-28 18:39:59 +00:00
Raoul Hodgson	52d17987dc	BaryonUtils.h updated debug output	2020-10-23 11:41:08 +01:00
Raoul Hodgson	19d8bba97d	BaryonUtils function naming change	2020-10-21 11:58:53 +01:00
Raoul Hodgson	463d72d322	Added untraced baryon contraction code	2020-10-19 16:13:28 +01:00
KANAMORI Issaku	d060341168	add an error check for Parameters.StartingType	2020-10-16 21:39:17 +09:00
KANAMORI Issaku	c772bcd514	Merge https://github.com/paboyle/Grid into develop	2020-10-16 20:30:32 +09:00
Peter Boyle	3362f8dfa0	happy compile	2020-10-14 22:59:41 -04:00
Peter Boyle	bf3c9857e0	Closure changes	2020-10-14 21:37:14 -04:00
Peter Boyle	a88b3ceca5	Closure cases	2020-10-14 21:33:51 -04:00
Peter Boyle	aa135412f5	toComplex, toReal	2020-10-13 22:25:01 -04:00
Peter Boyle	9945399e60	Reaality issues fix by drop from ET	2020-10-13 22:24:32 -04:00
Peter Boyle	5eeffa49e8	Reality forced included	2020-10-13 22:23:57 -04:00
Peter Boyle	3f06209720	Pretty print	2020-10-13 22:18:51 -04:00
Peter Boyle	12e239dd9f	Merge branch 'release/dirac-ITT-2020'	2020-10-13 13:38:29 -04:00
Peter Boyle	af2301afbb	Merge pull request #312 from i-kanamori/debug_512 add reordring of random number generators in IO	2020-10-13 11:42:12 -04:00
Peter Boyle	f98856a26f	Merge pull request #314 from smangham/issue_readme_precision Fix for deprecated configure options in documentation (issue #313)	2020-10-13 11:41:38 -04:00
Sam Mangham	d55cc5b380	Fixed typo on --enable-comm, removed all references to --enable-precision except for config options, where it is listed as deprecated. Removed travis test for single precision.	2020-10-12 12:33:13 +01:00
Antonin Portelli	c2b688abc9	Benchmark_IO: reducing max local volume to 32^4	2020-10-10 16:52:56 +01:00
Antonin Portelli	b0d61b9687	Benchmark_IO cleaner output	2020-10-09 21:46:45 +01:00
Antonin Portelli	5f893bf9af	Benchmark_IO procurement sizes	2020-10-09 21:31:59 +01:00
Antonin Portelli	0e17bd6597	I/O benchmark cleanup	2020-10-09 20:29:57 +01:00
Antonin Portelli	22caa158cc	multi-pass I/O benchmark, with statistic and robustness summary	2020-10-09 20:29:40 +01:00
Antonin Portelli	b24a504d7c	hook to access last parallel I/O performance measurement	2020-10-09 20:28:54 +01:00
Peter Boyle	992ef6e9fc	more runtime	2020-10-08 22:19:20 -04:00
Peter Boyle	f32a320bc3	Single prec benchmark in double prec compile	2020-10-08 19:52:08 -04:00
Peter Boyle	5f0fe029d2	Improve meemory benchmarks for GPU (avoid host mem ping pong)	2020-10-08 19:51:28 -04:00
Antonin Portelli	6b1486e89b	fixing number of colours defaulting to 4 in most cases	2020-10-08 16:31:24 +01:00
Peter Boyle	3f9c427a3a	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2020-10-07 13:12:57 -04:00
Peter Boyle	d201277652	Expose Nc as a compile time configure option. Remove precision option	2020-10-07 13:07:00 -04:00
Antonin Portelli	fdda7cf9cf	Merge branch 'feature/benchmark-io-update' into develop	2020-10-07 15:57:53 +01:00
Antonin Portelli	e22d30f715	Merge branch 'develop' into feature/benchmark-io-update	2020-10-07 15:56:39 +01:00
KANAMORI Issaku	467deee46f	Merge branch 'debug_512' into develop	2020-10-07 15:18:44 +09:00
Peter Boyle	35a69a5133	SU4 x SU4	2020-10-06 21:48:35 -04:00
KANAMORI Issaku	97db2b8d20	add reordring of random number generator in IO	2020-10-06 17:25:59 +09:00
Christoph Lehner	80fd6ab407	Merge pull request #17 from paboyle/develop sync upstream	2020-10-06 09:01:39 +02:00
Christoph Lehner	5534921bee	Merge pull request #16 from DanielRichtmann/feature/gpt-coarsenedmatrix Enable checkerboard operations for CoarsenedMatrix	2020-10-01 10:55:13 +02:00
Christoph Lehner	5cffa05c7e	remove slab allocator file	2020-09-13 14:06:25 -04:00
Christoph Lehner	d50a2164d7	remove slab allocator	2020-09-13 14:06:06 -04:00
Christoph Lehner	32ff766dbd	fix evict scheme, slab alloc	2020-09-13 14:02:53 -04:00
Christoph Lehner	01652d8cfe	SlabAllocator	2020-09-13 05:56:02 -04:00
Daniel Richtmann	4d2dc7ba03	Enable even-odd for CoarsenedMatrix	2020-09-11 20:32:02 +02:00
Christoph Lehner	51d1beb1f3	Merge pull request #15 from paboyle/develop Sync with upstream	2020-09-07 14:20:33 +02:00
Christoph Lehner	249e2db87d	Merge pull request #14 from DanielRichtmann/feature/gpt-coarsenedmatrix Expose more functions in CMat	2020-08-27 15:18:56 +02:00
Daniel Richtmann	cf3535d16e	Expose more functions in CMat	2020-08-27 14:06:48 +02:00
Christoph Lehner	d61ee817f4	Merge pull request #13 from DanielRichtmann/feature/gpt-coarsenedmatrix Changes needed for GPT MG	2020-08-27 12:11:06 +02:00
Christoph Lehner	2a75516330	state MPI/SLURM message only on world_rank zero	2020-08-26 12:34:17 -04:00
Daniel Richtmann	b2087f14c4	Fix CoarsenedMatrix regarding illegal memory accesses Need a reference to geom since the lambda copies the this pointer which points to host memory, see - https://docs.nvidia.com/cuda/cuda-c-programming-guide/#star-this-capture - https://devblogs.nvidia.com/new-compiler-features-cuda-8/	2020-08-24 17:46:47 +02:00
Daniel Richtmann	dd1ba266b2	Fix mapping between dir + disp and point in CMat	2020-08-24 17:46:46 +02:00
Daniel Richtmann	1292d59563	Add a typedef + broaden interface of CMat	2020-08-24 17:46:45 +02:00
Christoph Lehner	9877ed9bf8	Merge pull request #12 from paboyle/develop Sync	2020-08-22 16:35:35 +02:00
Christoph Lehner	f0dc0f3621	fix compile issue on Qpace3	2020-08-22 13:57:33 +02:00
Christoph Lehner	63b0a19f37	Merge pull request #11 from paboyle/develop Sync	2020-08-20 20:53:39 +02:00