Merge branch 'release/dirac-ITT-2020'

Merge pull request #312 from i-kanamori/debug_512
add reordering of random number generators in IO
2026-06-25 21:13:30 +01:00 · 2020-10-13 13:38:29 -04:00 · 2020-10-13 11:42:12 -04:00 · 2020-10-13 11:41:38 -04:00 · 2020-10-12 12:33:13 +01:00 · 2020-10-10 16:52:56 +01:00
312 changed files with 19341 additions and 3943 deletions
@@ -9,11 +9,6 @@ matrix:
    - os:        osx
      osx_image: xcode8.3
      compiler: clang
      env: PREC=single
    - os:        osx
      osx_image: xcode8.3
      compiler: clang
      env: PREC=double
 before_install:
    - export GRIDDIR=`pwd`
@@ -55,7 +50,7 @@ script:
    - make -j4
    - make install
    - cd $CWD/build
-    - ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
+    - ../configure --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
    - make -j4 
    - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
    - make check
@@ -47,9 +47,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/perfmon/PerfCount.h>
 #include <Grid/util/Util.h>
 #include <Grid/log/Log.h>
-#include <Grid/allocator/AlignedAllocator.h>
+#include <Grid/allocator/Allocator.h>
 #include <Grid/simd/Simd.h>
-#include <Grid/threads/Threads.h>
+#include <Grid/threads/ThreadReduction.h>
 #include <Grid/serialisation/Serialisation.h>
 #include <Grid/util/Sha.h>
 #include <Grid/communicator/Communicator.h> 
@@ -6,6 +6,7 @@
 ///////////////////
 #include <cassert>
 #include <complex>
 #include <memory>
 #include <vector>
 #include <array>
 #include <string>
@@ -18,21 +18,28 @@
 #pragma push_macro("__CUDA_ARCH__")
 #pragma push_macro("__NVCC__")
 #pragma push_macro("__CUDACC__")
 #undef __CUDA_ARCH__
 #undef __NVCC__
 #undef __CUDACC__
 #undef __CUDA_ARCH__
 #define __NVCC__REDEFINE__
 #endif 
 /* SYCL save and restore compile environment*/
-#ifdef __SYCL_DEVICE_ONLY__  
+#ifdef GRID_SYCL
 #pragma push
 #pragma push_macro("__SYCL_DEVICE_ONLY__")
 #undef __SYCL_DEVICE_ONLY__
 #undef EIGEN_USE_SYCL
 #define EIGEN_DONT_VECTORIZE
 //#undef EIGEN_USE_SYCL
 #define __SYCL__REDEFINE__
 #endif
 /* HIP save and restore compile environment*/
 #ifdef GRID_HIP
 #pragma push
 #pragma push_macro("__HIP_DEVICE_COMPILE__")
 #endif
 #define EIGEN_NO_HIP
 #include <Grid/Eigen/Dense>
 #include <Grid/Eigen/unsupported/CXX11/Tensor>
@@ -51,6 +58,12 @@
 #pragma pop
 #endif
 /*HIP restore*/
 #ifdef __HIP__REDEFINE__
 #pragma pop_macro("__HIP_DEVICE_COMPILE__")
 #pragma pop
 #endif
 #if defined __GNUC__
 #pragma GCC diagnostic pop
 #endif
@@ -21,7 +21,7 @@ if BUILD_HDF5
  extra_headers+=serialisation/Hdf5Type.h
 endif
-all: version-cache
+all: version-cache Version.h
 version-cache:
 	@if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\
@@ -42,7 +42,7 @@ version-cache:
 	fi;\
 	rm -f vertmp
-Version.h:
+Version.h: version-cache
 	cp version-cache Version.h
 .PHONY: version-cache
@@ -29,9 +29,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_ALGORITHMS_H
 #define GRID_ALGORITHMS_H
 NAMESPACE_CHECK(algorithms);
 #include <Grid/algorithms/SparseMatrix.h>
 #include <Grid/algorithms/LinearOperator.h>
 #include <Grid/algorithms/Preconditioner.h>
 NAMESPACE_CHECK(SparseMatrix);
 #include <Grid/algorithms/approx/Zolotarev.h>
 #include <Grid/algorithms/approx/Chebyshev.h>
@@ -41,10 +43,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/algorithms/approx/Forecast.h>
 #include <Grid/algorithms/approx/RemezGeneral.h>
 #include <Grid/algorithms/approx/ZMobius.h>
-
+NAMESPACE_CHECK(approx);
 #include <Grid/algorithms/iterative/Deflation.h>
 #include <Grid/algorithms/iterative/ConjugateGradient.h>
 NAMESPACE_CHECK(ConjGrad);
 #include <Grid/algorithms/iterative/BiCGSTAB.h>
 NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/ConjugateResidual.h>
 #include <Grid/algorithms/iterative/NormalEquations.h>
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
@@ -62,7 +66,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
 #include <Grid/algorithms/iterative/PowerMethod.h>
 NAMESPACE_CHECK(PowerMethod);
 #include <Grid/algorithms/CoarsenedMatrix.h>
 NAMESPACE_CHECK(CoarsendMatrix);
 #include <Grid/algorithms/FFT.h>
 #endif
@@ -1,14 +1,3 @@
    // blockZaxpy in blockPromote - 3s, 5%
    // noncoalesced linalg in Preconditioner ~ 3s 5%
    // Lanczos tuning or replace 10-20s ~ 25%, open ended
    // setup tuning   5s  ~  8%
    //    -- e.g. ordermin, orderstep tunables.
    // MdagM path without norm in LinOp code.     few seconds
    // Mdir calc blocking kernels
    // Fuse kernels in blockMaskedInnerProduct
    // preallocate Vectors in Cayley 5D ~ few percent few seconds
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
@@ -91,35 +80,8 @@ public:
    }
    directions   [2*_d]=0;
    displacements[2*_d]=0;
    //// report back
    std::cout<<GridLogMessage<<"directions    :";
    for(int d=0;d<npoint;d++) std::cout<< directions[d]<< " ";
    std::cout<<std::endl;
    std::cout<<GridLogMessage<<"displacements :";
    for(int d=0;d<npoint;d++) std::cout<< displacements[d]<< " ";
    std::cout<<std::endl;
  }
  /*
  // Original cleaner code
  Geometry(int _d) : dimension(_d), npoint(2*_d+1), directions(npoint), displacements(npoint) {
  for(int d=0;d<dimension;d++){
  directions[2*d  ] = d;
  directions[2*d+1] = d;
  displacements[2*d  ] = +1;
  displacements[2*d+1] = -1;
  }
  directions   [2*dimension]=0;
  displacements[2*dimension]=0;
  }
  std::vector<int> GetDelta(int point) {
  std::vector<int> delta(dimension,0);
  delta[directions[point]] = displacements[point];
  return delta;
  };
  */    
 };
 template<class Fobj,class CComplex,int nbasis>
@@ -149,24 +111,6 @@ public:
    CoarseScalar InnerProd(CoarseGrid); 
    std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
    blockOrthogonalise(InnerProd,subspace);
    //    std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 2"<<std::endl; // Really have to do twice? Yuck
    //    blockOrthogonalise(InnerProd,subspace);
    //      std::cout << GridLogMessage <<" Gramm-Schmidt checking orthogonality"<<std::endl;
    //      CheckOrthogonal();
  } 
  // Diagnostic: for every basis index i, project subspace[i] back onto the
  // subspace and log how far the result is from the coarse vector that has
  // 1 in component i (at every outer site) and zero elsewhere.
  void CheckOrthogonal(void){
    CoarseVector iProj(CoarseGrid); 
    CoarseVector eProj(CoarseGrid); 
    for(int i=0;i<nbasis;i++){
      // iProj = block projection of the i-th basis vector onto the whole subspace
      blockProject(iProj,subspace[i],subspace);
      // eProj = expected result: component i set to one on each outer site
      eProj=Zero(); 
      accelerator_for(ss, CoarseGrid->oSites(),1,{
 	eProj[ss](i)=CComplex(1.0);
      });
      // norm2 of the difference is the reported orthogonality error
      eProj=eProj - iProj;
      std::cout<<GridLogMessage<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl;
    }
    std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl;
  } 
  void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
    blockProject(CoarseVec,FineVec,subspace);
@@ -175,11 +119,6 @@ public:
    FineVec.Checkerboard() = subspace[0].Checkerboard();
    blockPromote(CoarseVec,FineVec,subspace);
  }
  // Overwrite each of the nbasis subspace vectors with random(RNG, .) output,
  // visiting them in index order so the RNG stream is consumed identically.
  void CreateSubspaceRandom(GridParallelRNG &RNG){
    int b = 0;
    while ( b < nbasis ) {
      random(RNG,subspace[b]);
      ++b;
    }
  }
  virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
@@ -218,7 +157,7 @@ public:
  // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
  // and this is the best I found
  ////////////////////////////////////////////////////////////////////////////////////////////////
-#if 1
+
  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
 				       int nn,
 				       double hi,
@@ -280,10 +219,10 @@ public:
 	hermop.HermOp(*Tn,y);
-	auto y_v = y.View();
+	autoView( y_v , y, AcceleratorWrite);
-	auto Tn_v = Tn->View();
+	autoView( Tn_v , (*Tn), AcceleratorWrite);
-	auto Tnp_v = Tnp->View();
+	autoView( Tnp_v , (*Tnp), AcceleratorWrite);
-	auto Tnm_v = Tnm->View();
+	autoView( Tnm_v , (*Tnm), AcceleratorWrite);
 	const int Nsimd = CComplex::Nsimd();
 	accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
 	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
@@ -313,201 +252,6 @@ public:
    }
    assert(b==nn);
  }
 #endif
 #if 0
  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
 				       int nn,
 				       double hi,
 				       double lo,
 				       int orderfilter,
 				       int ordermin,
 				       int orderstep,
 				       double filterlo
 				       ) {
    RealD scale;
    FineField noise(FineGrid);
    FineField Mn(FineGrid);
    FineField tmp(FineGrid);
    FineField combined(FineGrid);
    // New normalised noise
    gaussian(RNG,noise);
    scale = std::pow(norm2(noise),-0.5); 
    noise=noise*scale;
    // Initial matrix element
    hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
    int b =0;
 #define FILTERb(llo,hhi,oorder)						\
    {									\
      Chebyshev<FineField> Cheb(llo,hhi,oorder);			\
      Cheb(hermop,noise,Mn);						\
      scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;			\
      subspace[b]   = Mn;						\
      hermop.Op(Mn,tmp);						\
      std::cout<<GridLogMessage << oorder<< " Cheb filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
      b++;								\
    }									
    //      JacobiPolynomial<FineField> Cheb(0.002,60.0,1500,-0.5,3.5);	\
    RealD alpha=-0.8;
    RealD beta =-0.8;
 #define FILTER(llo,hhi,oorder)						\
    {									\
      Chebyshev<FineField> Cheb(llo,hhi,oorder);			\
      /* JacobiPolynomial<FineField> Cheb(0.0,60.0,oorder,alpha,beta);*/\
      Cheb(hermop,noise,Mn);						\
      scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;			\
      subspace[b]   = Mn;						\
      hermop.Op(Mn,tmp);						\
      std::cout<<GridLogMessage << oorder<< "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
      b++;								\
    }									
 #define FILTERc(llo,hhi,oorder)				\
    {							\
      Chebyshev<FineField> Cheb(llo,hhi,oorder);	\
      Cheb(hermop,noise,combined);			\
    }									
    double node = 0.000;
    FILTERb(lo,hi,orderfilter);// 0
    //    FILTERc(node,hi,51);// 0
    noise = Mn;
    int base = 0;
    int mult = 100;
    FILTER(node,hi,base+1*mult);
    FILTER(node,hi,base+2*mult);
    FILTER(node,hi,base+3*mult);
    FILTER(node,hi,base+4*mult);
    FILTER(node,hi,base+5*mult);
    FILTER(node,hi,base+6*mult);
    FILTER(node,hi,base+7*mult);
    FILTER(node,hi,base+8*mult);
    FILTER(node,hi,base+9*mult);
    FILTER(node,hi,base+10*mult);
    FILTER(node,hi,base+11*mult);
    FILTER(node,hi,base+12*mult);
    FILTER(node,hi,base+13*mult);
    FILTER(node,hi,base+14*mult);
    FILTER(node,hi,base+15*mult);
    assert(b==nn);
  }
 #endif
 #if 0
  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
 				       int nn,
 				       double hi,
 				       double lo,
 				       int orderfilter,
 				       int ordermin,
 				       int orderstep,
 				       double filterlo
 				       ) {
    RealD scale;
    FineField noise(FineGrid);
    FineField Mn(FineGrid);
    FineField tmp(FineGrid);
    FineField combined(FineGrid);
    // New normalised noise
    gaussian(RNG,noise);
    scale = std::pow(norm2(noise),-0.5); 
    noise=noise*scale;
    // Initial matrix element
    hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
    int b =0;
    {						
      Chebyshev<FineField> JacobiPoly(0.005,60.,1500);
      //      JacobiPolynomial<FineField> JacobiPoly(0.002,60.0,1500,-0.5,3.5);
      //JacobiPolynomial<FineField> JacobiPoly(0.03,60.0,500,-0.5,3.5);
      //      JacobiPolynomial<FineField> JacobiPoly(0.00,60.0,1000,-0.5,3.5);
      JacobiPoly(hermop,noise,Mn);
      scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
      subspace[b]   = Mn;
      hermop.Op(Mn,tmp);
      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; 
      b++;
      //      scale = std::pow(norm2(tmp),-0.5);     tmp=tmp*scale;
      //      subspace[b]   = tmp;      b++;
      //    }									
    }									
 #define FILTER(lambda)						\
    {								\
      hermop.HermOp(subspace[0],tmp);				\
      tmp = tmp - lambda *subspace[0];				\
      scale = std::pow(norm2(tmp),-0.5);			\
      tmp=tmp*scale;							\
      subspace[b]   = tmp;						\
      hermop.Op(subspace[b],tmp);					\
      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
      b++;								\
    }									
    //      scale = std::pow(norm2(tmp),-0.5);     tmp=tmp*scale;
    //      subspace[b]   = tmp;      b++;
    //    }									
    FILTER(2.0e-5);
    FILTER(2.0e-4);
    FILTER(4.0e-4);
    FILTER(8.0e-4);
    FILTER(8.0e-4);
    FILTER(2.0e-3);
    FILTER(3.0e-3);
    FILTER(4.0e-3);
    FILTER(5.0e-3);
    FILTER(6.0e-3);
    FILTER(2.5e-3);
    FILTER(3.5e-3);
    FILTER(4.5e-3);
    FILTER(5.5e-3);
    FILTER(6.5e-3);
    //    FILTER(6.0e-5);//6
    //    FILTER(7.0e-5);//8
    //    FILTER(8.0e-5);//9
    //    FILTER(9.0e-5);//3
    /*
    //    FILTER(1.0e-4);//10
    FILTER(2.0e-4);//11
    //   FILTER(3.0e-4);//12
    //    FILTER(4.0e-4);//13
    FILTER(5.0e-4);//14
    FILTER(6.0e-3);//4
    FILTER(7.0e-4);//1
    FILTER(8.0e-4);//7
    FILTER(9.0e-4);//15
    FILTER(1.0e-3);//2
    FILTER(2.0e-3);//2
    FILTER(3.0e-3);//2
    FILTER(4.0e-3);//2
    FILTER(5.0e-3);//2
    FILTER(6.0e-3);//2
    FILTER(7.0e-3);//2
    FILTER(8.0e-3);//2
    FILTER(1.0e-2);//2
    */
    std::cout << GridLogMessage <<"Jacobi filtering done" <<std::endl;
    assert(b==nn);
  }
 #endif
 };
@@ -549,13 +293,13 @@ public:
    SimpleCompressor<siteVector> compressor;
    Stencil.HaloExchange(in,compressor);
-
+    autoView( in_v , in, AcceleratorRead);
-    auto in_v = in.View();
+    autoView( out_v , out, AcceleratorWrite);
    auto out_v = out.View();
    typedef LatticeView<Cobj> Aview;
    Vector<Aview> AcceleratorViewContainer;
-    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View());
+  
    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
    Aview *Aview_p = & AcceleratorViewContainer[0];
    const int Nsimd = CComplex::Nsimd();
@@ -572,24 +316,25 @@ public:
      int ptype;
      StencilEntry *SE;
      int lane=SIMTlane(Nsimd);
      for(int point=0;point<geom.npoint;point++){
 	SE=Stencil.GetEntry(ptype,point,ss);
 	if(SE->_is_local) { 
-	  nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
+	  nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
 	} else {
-	  nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
+	  nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
 	}
-	synchronise();
+	acceleratorSynchronise();
 	for(int bb=0;bb<nbasis;bb++) {
 	  res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
 	}
      }
-      coalescedWrite(out_v[ss](b),res,lane);
+      coalescedWrite(out_v[ss](b),res);
      });
    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
  };
  void Mdag (const CoarseVector &in, CoarseVector &out)
@@ -617,11 +362,11 @@ public:
    typedef LatticeView<Cobj> Aview;
    Vector<Aview> AcceleratorViewContainer;
-    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View());
+    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
    Aview *Aview_p = & AcceleratorViewContainer[0];
-    auto out_v = out.View();
+    autoView( out_v , out, AcceleratorWrite);
-    auto in_v  = in.View();
+    autoView( in_v  , in, AcceleratorRead);
    const int Nsimd = CComplex::Nsimd();
    typedef decltype(coalescedRead(in_v[0])) calcVector;
@@ -635,45 +380,21 @@ public:
      int ptype;
      StencilEntry *SE;
      int lane=SIMTlane(Nsimd);
      SE=Stencil.GetEntry(ptype,point,ss);
      if(SE->_is_local) { 
-	nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
+	nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
      } else {
-	nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
+	nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
      }
-      synchronise();
+      acceleratorSynchronise();
      for(int bb=0;bb<nbasis;bb++) {
 	res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
      }
-      coalescedWrite(out_v[ss](b),res,lane);
+      coalescedWrite(out_v[ss](b),res);
    });
-#if 0
+    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
    accelerator_for(ss,Grid()->oSites(),1,{
      siteVector res = Zero();
      siteVector nbr;
      int ptype;
      StencilEntry *SE;
      SE=Stencil.GetEntry(ptype,point,ss);
      if(SE->_is_local&&SE->_permute) {
 	permute(nbr,in_v[SE->_offset],ptype);
      } else if(SE->_is_local) {
 	nbr = in_v[SE->_offset];
      } else {
 	nbr = Stencil.CommBuf()[SE->_offset];
      }
      synchronise();
      res = res + Aview_p[point][ss]*nbr;
      out_v[ss]=res;
    });
 #endif
  }
  void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
  {
@@ -841,10 +562,10 @@ public:
 	    blockMaskedInnerProduct(oZProj,omask,Subspace.subspace[j],Mphi);
-	    auto iZProj_v = iZProj.View() ;
+	    autoView( iZProj_v , iZProj, AcceleratorRead) ;
-	    auto oZProj_v = oZProj.View() ;
+	    autoView( oZProj_v , oZProj, AcceleratorRead) ;
-	    auto A_p     =  A[p].View();
+	    autoView( A_p     ,  A[p], AcceleratorWrite);
-	    auto A_self  = A[self_stencil].View();
+	    autoView( A_self  , A[self_stencil], AcceleratorWrite);
 	    accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
@@ -860,11 +581,11 @@ public:
 	mult(tmp,phi,oddmask );  linop.Op(tmp,Mphio);
 	{
-	  auto tmp_      = tmp.View();
+	  autoView( tmp_      , tmp, AcceleratorWrite);
-	  auto evenmask_ = evenmask.View();
+	  autoView( evenmask_ , evenmask, AcceleratorRead);
-	  auto oddmask_  =  oddmask.View();
+	  autoView( oddmask_  ,  oddmask, AcceleratorRead);
-	  auto Mphie_    =  Mphie.View();
+	  autoView( Mphie_    ,  Mphie, AcceleratorRead);
-	  auto Mphio_    =  Mphio.View();
+	  autoView( Mphio_    ,  Mphio, AcceleratorRead);
 	  accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{ 
 	      coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss));
 	    });
@@ -872,8 +593,8 @@ public:
 	blockProject(SelfProj,tmp,Subspace.subspace);
-	auto SelfProj_ = SelfProj.View();
+	autoView( SelfProj_ , SelfProj, AcceleratorRead);
-	auto A_self  = A[self_stencil].View();
+	autoView( A_self  , A[self_stencil], AcceleratorWrite);
 	accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{
 	  for(int j=0;j<nbasis;j++){
@@ -887,33 +608,8 @@ public:
      std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
      ForceHermitian();
    }
      // AssertHermitian();
      // ForceDiagonal();
  }
 #if 0
    ///////////////////////////
    // test code worth preserving in if block
    ///////////////////////////
    std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl;
    for(int p=0;p<geom.npoint;p++){
      std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl;
      std::cout<<GridLogMessage<< A[p] << std::endl;
    }
    std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl;
    phi=Subspace.subspace[0];
    std::vector<int> bc(FineGrid->_ndimension,0);
    blockPick(Grid(),phi,tmp,bc);      // Pick out a block
    linop.Op(tmp,Mphi);                // Apply big dop
    blockProject(iProj,Mphi,Subspace.subspace); // project it and print it
    std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl;
    std::cout<<GridLogMessage<< iProj <<std::endl;
    std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
 #endif
  void ForceHermitian(void) {
    CoarseMatrix Diff  (Grid());
    for(int p=0;p<geom.npoint;p++){
@@ -933,27 +629,6 @@ public:
      }
    }
  }
  // Diagnostic: logs how far the stencil matrices are from a Hermitian
  // arrangement. Nothing is asserted; only norms are printed.
  // NOTE(review): the loop bound 4 and the index 8 are hard-coded; presumably
  // they correspond to four hopping directions and the self-stencil entry of
  // a 9-point geometry -- confirm against geom before reuse.
  void AssertHermitian(void) {
    CoarseMatrix AA    (Grid());
    CoarseMatrix AAc   (Grid());
    CoarseMatrix Diff  (Grid());
    for(int d=0;d<4;d++){
      int dd=d+1;
      // Compare A[2*d] with the adjoint of A[2*d+1] shifted by one site along dimension dd
      AAc = Cshift(A[2*d+1],dd,1);
      AA  = A[2*d];
      Diff = AA - adj(AAc);
      std::cout<<GridLogMessage<<"Norm diff dim "<<d<<" "<< norm2(Diff)<<std::endl;
      std::cout<<GridLogMessage<<"Norm dim "<<d<<" "<< norm2(AA)<<std::endl;
    }
    // Entry 8 is compared with its own adjoint
    Diff = A[8] - adj(A[8]);
    std::cout<<GridLogMessage<<"Norm diff local "<< norm2(Diff)<<std::endl;
    std::cout<<GridLogMessage<<"Norm local "<< norm2(A[8])<<std::endl;
  }
 };
 NAMESPACE_END(Grid);
@@ -1,4 +1,3 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
@@ -37,7 +36,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 template<class scalar> struct FFTW { };
@@ -191,7 +189,7 @@ public:
    typedef typename sobj::scalar_type   scalar;
    Lattice<sobj> pgbuf(&pencil_g);
-    auto pgbuf_v = pgbuf.View();
+    autoView(pgbuf_v , pgbuf, CpuWrite);
    typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
    typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;
@@ -232,15 +230,18 @@ public:
    result = source;
    int pc = processor_coor[dim];
    for(int p=0;p<processors[dim];p++) {
      {
 	autoView(r_v,result,CpuRead);
 	autoView(p_v,pgbuf,CpuWrite);
 	thread_for(idx, sgrid->lSites(),{
          Coordinate cbuf(Nd);
          sobj s;
 	  sgrid->LocalIndexToLocalCoor(idx,cbuf);
-	  peekLocalSite(s,result,cbuf);
+	  peekLocalSite(s,r_v,cbuf);
 	  cbuf[dim]+=((pc+p) % processors[dim])*L;
-	  //            cbuf[dim]+=p*L;
+	  pokeLocalSite(s,p_v,cbuf);
 	  pokeLocalSite(s,pgbuf,cbuf);
        });
      }
      if (p != processors[dim] - 1) {
 	result = Cshift(result,dim,L);
      }
@@ -269,15 +270,19 @@ public:
    flops+= flops_call*NN;
    // writing out result
    {
      autoView(pgbuf_v,pgbuf,CpuRead);
      autoView(result_v,result,CpuWrite);
      thread_for(idx,sgrid->lSites(),{
 	Coordinate clbuf(Nd), cgbuf(Nd);
 	sobj s;
 	sgrid->LocalIndexToLocalCoor(idx,clbuf);
 	cgbuf = clbuf;
 	cgbuf[dim] = clbuf[dim]+L*pc;
-	peekLocalSite(s,pgbuf,cgbuf);
+	peekLocalSite(s,pgbuf_v,cgbuf);
-	pokeLocalSite(s,result,clbuf);
+	pokeLocalSite(s,result_v,clbuf);
      });
    }
    result = result*div;
    // destroying plan
@@ -122,12 +122,14 @@ class BiCGSTAB : public OperatorFunction<Field>
        LinearCombTimer.Start();
        bo = beta * omega;
-        auto p_v = p.View();
+	{
-        auto r_v = r.View();
+	  autoView( p_v , p, AcceleratorWrite);
-        auto v_v = v.View();
+	  autoView( r_v , r, AcceleratorRead);
 	  autoView( v_v , v, AcceleratorRead);
 	  accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{
 	      coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss));
 	    });
 	}
        LinearCombTimer.Stop();
        LinalgTimer.Stop();
@@ -142,16 +144,20 @@ class BiCGSTAB : public OperatorFunction<Field>
        alpha = rho / Calpha.real();
        LinearCombTimer.Start();
-        auto h_v = h.View();
+	{
-        auto psi_v = psi.View();
+	  autoView( p_v , p, AcceleratorRead);
 	  autoView( r_v , r, AcceleratorRead);
 	  autoView( v_v , v, AcceleratorRead);
 	  autoView( psi_v,psi, AcceleratorRead);
 	  autoView( h_v  ,  h, AcceleratorWrite);
 	  autoView( s_v  ,  s, AcceleratorWrite);
 	  accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{
 	      coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss));
 	    });
        auto s_v = s.View();
 	  accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{
 	      coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss));
 	  });
        }
        LinearCombTimer.Stop();
        LinalgTimer.Stop();
@@ -166,11 +172,17 @@ class BiCGSTAB : public OperatorFunction<Field>
        omega = Comega.real() / norm2(t);
        LinearCombTimer.Start();
-        auto t_v = t.View();
+	{
 	  autoView( psi_v,psi, AcceleratorWrite);
 	  autoView( r_v , r, AcceleratorWrite);
 	  autoView( h_v , h, AcceleratorRead);
 	  autoView( s_v , s, AcceleratorRead);
 	  autoView( t_v , t, AcceleratorRead);
 	  accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{
 	      coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss));
 	      coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss));
 	    });
 	}
        LinearCombTimer.Stop();
        cp = norm2(r);
@@ -140,13 +140,15 @@ public:
      b = cp / c;
      LinearCombTimer.Start();
-      auto psi_v = psi.View();
+      {
-      auto p_v   = p.View();
+	autoView( psi_v , psi, AcceleratorWrite);
-      auto r_v   = r.View();
+	autoView( p_v   , p,   AcceleratorWrite);
 	autoView( r_v   , r,   AcceleratorWrite);
 	accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{
 	    coalescedWrite(psi_v[ss], a      *  p_v(ss) + psi_v(ss));
 	    coalescedWrite(p_v[ss]  , b      *  p_v(ss) + r_v  (ss));
 	});
      }
      LinearCombTimer.Stop();
      LinalgTimer.Stop();
@@ -0,0 +1,241 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_PREC_GCR_NON_HERM_H
 #define GRID_PREC_GCR_NON_HERM_H
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 //VPGCR Abe and Zhang, 2005.
 //INTERNATIONAL JOURNAL OF NUMERICAL ANALYSIS AND MODELING
 //Computing and Information Volume 2, Number 2, Pages 147-161
 //NB. Likely not original reference since they are focussing on a preconditioner variant.
 //    but VPGCR was nicely written up in their paper
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 NAMESPACE_BEGIN(Grid);
 #define GCRLogLevel std::cout << GridLogMessage <<std::string(level,'\t')<< " Level "<<level<<" " 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // Preconditioned GCR with a bounded history, restarted every nstep steps.
 //
 // operator() zeroes psi and then calls GCRnStep up to MaxIterations times; each call
 // performs at most nstep steps, orthogonalising every new search direction against
 // the stored history (at most mmax-1 previous directions).
 // The solve is declared converged when |r|^2 < Tolerance^2 |src|^2.
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 template<class Field>
 class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
 public:                                                
  RealD   Tolerance;     // relative residual target; compared as Tolerance^2 * |src|^2
  Integer MaxIterations; // maximum number of GCRnStep restart cycles per solve
  int verbose;           // set to 1 by the constructor; not read inside this class
  int mmax;              // number of (p,q) history slots kept for orthogonalisation
  int nstep;             // steps performed by one GCRnStep call before it returns
  int steps;             // running step count of the current / most recent solve
  int level;             // used by GCRLogLevel for tab indentation and the "Level" tag
  GridStopWatch PrecTimer;
  GridStopWatch MatTimer;
  GridStopWatch LinalgTimer;
  LinearFunction<Field>     &Preconditioner;
  LinearOperatorBase<Field> &Linop;
  // Change the level reported in log lines.
  void Level(int lv) { level=lv; };
  // tol    : relative residual target
  // maxit  : maximum number of restart cycles
  // _Linop : operator applied through Linop.Op
  // Prec   : preconditioner, called as Prec(r,z)
  // _mmax  : history depth
  // _nstep : steps per restart cycle
  // The initialiser list follows member declaration order, which is the order
  // in which the members are actually constructed.
  PrecGeneralisedConjugateResidualNonHermitian(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax,int _nstep) : 
    Tolerance(tol), 
    MaxIterations(maxit),
    verbose(1),
    mmax(_mmax),
    nstep(_nstep),
    steps(0),
    level(1),
    Preconditioner(Prec),
    Linop(_Linop)
  { 
  };
  // Solve for psi given src. psi is reset to zero on entry.
  // Returns normally whether or not convergence was reached; failure is only logged.
  void operator() (const Field &src, Field &psi){
    psi=Zero();
    RealD cp, ssq,rsq;
    ssq=norm2(src);
    rsq=Tolerance*Tolerance*ssq;
    Field r(src.Grid());
    PrecTimer.Reset();
    MatTimer.Reset();
    LinalgTimer.Reset();
    GridStopWatch SolverTimer;
    SolverTimer.Start();
    steps=0;
    for(int k=0;k<MaxIterations;k++){
      cp=GCRnStep(src,psi,rsq);
      GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl;
      if(cp<rsq) {
        SolverTimer.Stop();
        // Recompute the residual from scratch so the log shows the true value
        Linop.Op(psi,r);
        axpy(r,-1.0,src,r);
        RealD tr = norm2(r);
        GCRLogLevel<<"PGCR: Converged on iteration " <<steps
                 << " computed residual "<<sqrt(cp/ssq)
                 << " true residual "    <<sqrt(tr/ssq)
                 << " target "           <<Tolerance <<std::endl;
        GCRLogLevel<<"PGCR Time elapsed: Total  "<< SolverTimer.Elapsed() <<std::endl;
        return;
      }
    }
    GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
    //    assert(0);
  }
  // One restart cycle: starting from the current psi, take up to nstep steps.
  // Returns the squared residual norm after the last step taken; returns early
  // as soon as it drops below rsq.
  RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
    RealD cp;
    ComplexD a, b;
    ComplexD rq;
    GridBase *grid = src.Grid();
    Field r(grid);
    Field z(grid);
    Field Az(grid);
    ////////////////////////////////
    // history for flexible orthog
    ////////////////////////////////
    std::vector<Field> q(mmax,grid);
    std::vector<Field> p(mmax,grid);
    std::vector<RealD> qq(mmax);
    GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl;
    //////////////////////////////////
    // psi holds the current guess (zero on the first cycle, generally
    // nonzero after a restart), so r0 = src - A psi
    //////////////////////////////////
    MatTimer.Start();
    Linop.Op(psi,Az);
    MatTimer.Stop();
    LinalgTimer.Start();
    r=src-Az;
    LinalgTimer.Stop();
    GCRLogLevel<< "PGCR true residual r = src - A psi   "<<norm2(r) <<std::endl;
    /////////////////////
    // p = Prec(r)
    /////////////////////
    PrecTimer.Start();
    Preconditioner(r,z);
    PrecTimer.Stop();
    MatTimer.Start();
    Linop.Op(z,Az);
    MatTimer.Stop();
    LinalgTimer.Start();
    //p[0],q[0],qq[0] 
    p[0]= z;
    q[0]= Az;
    qq[0]= norm2(Az);
    cp =norm2(r);
    LinalgTimer.Stop();
    for(int k=0;k<nstep;k++){
      steps++;
      int kp     = k+1;
      int peri_k = k %mmax;
      int peri_kp= kp%mmax;
      LinalgTimer.Start();
      rq= innerProduct(q[peri_k],r); // what if rAr not real?
      a = rq/qq[peri_k];
      axpy(psi,a,p[peri_k],psi);         
      cp = axpy_norm(r,-a,q[peri_k],r);
      LinalgTimer.Stop();
      GCRLogLevel<< "PGCR step["<<steps<<"]  resid " << cp << " target " <<rsq<<std::endl; 
      if((k==nstep-1)||(cp<rsq)){
        return cp;
      }
      PrecTimer.Start();
      Preconditioner(r,z);// solve Az = r
      PrecTimer.Stop();
      MatTimer.Start();
      Linop.Op(z,Az);
      MatTimer.Stop();
      LinalgTimer.Start();
      q[peri_kp]=Az;
      p[peri_kp]=z;
      int northog = ((kp)>(mmax-1))?(mmax-1):(kp);  // if more than mmax done, we orthog all mmax history.
      for(int back=0;back<northog;back++){
        int peri_back=(k-back)%mmax;   	  assert((k-back)>=0);
        // NOTE(review): only the real part of <q_back|Az> is used here although the
        // step coefficient a above is complex -- confirm this is intended for the
        // non-Hermitian variant.
        b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
        p[peri_kp]=p[peri_kp]+b*p[peri_back];
        q[peri_kp]=q[peri_kp]+b*q[peri_back];
      }
      qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
      LinalgTimer.Stop();
    }
    assert(0); // never reached
    return cp;
  }
 };
 NAMESPACE_END(Grid);
 #endif
@@ -6,93 +6,6 @@ NAMESPACE_BEGIN(Grid);
 MemoryStats *MemoryProfiler::stats = nullptr;
 bool         MemoryProfiler::debug = false;
 int PointerCache::NcacheSmall = PointerCache::NcacheSmallMax;
 #ifdef GRID_CUDA
 int PointerCache::Ncache      = 32;
 #else 
 int PointerCache::Ncache      = 8;
 #endif
 int PointerCache::Victim;
 int PointerCache::VictimSmall;
 PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::NcacheMax];
 PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheSmallMax];
 void PointerCache::Init(void)
 {
   // Configure the depth of the large and small pointer caches.
   // Each depth may be overridden through an environment variable; a value
   // that is negative or exceeds the compiled-in maximum is replaced by that maximum.
   const char *largeEnv = getenv("GRID_ALLOC_NCACHE_LARGE");
   if ( largeEnv != NULL ) {
     Ncache = atoi(largeEnv);
   }
   const bool largeOk = (Ncache >= 0) && (Ncache <= NcacheMax);
   if ( !largeOk ) {
     Ncache = NcacheMax;
   }

   const char *smallEnv = getenv("GRID_ALLOC_NCACHE_SMALL");
   if ( smallEnv != NULL ) {
     NcacheSmall = atoi(smallEnv);
   }
   const bool smallOk = (NcacheSmall >= 0) && (NcacheSmall <= NcacheSmallMax);
   if ( !smallOk ) {
     NcacheSmall = NcacheSmallMax;
   }
   //  printf("Aligned allocator cache: large %d/%d small %d/%d\n",Ncache,NcacheMax,NcacheSmall,NcacheSmallMax);
 }
 void *PointerCache::Insert(void *ptr,size_t bytes) 
 {
   // Route the freed pointer to the small or large cache according to its size.
   // Returns whatever the selected cache returns (an evicted pointer or NULL).
   const bool useSmallCache = (bytes < GRID_ALLOC_SMALL_LIMIT);
   if ( !useSmallCache ) {
     return Insert(ptr,bytes,Entries,Ncache,Victim);
   }
   return Insert(ptr,bytes,EntriesSmall,NcacheSmall,VictimSmall);
 }
 void *PointerCache::Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) 
 {
   // Store a freed pointer in the given cache table.
   //   ptr/bytes : the allocation being released and its size
   //   entries   : cache table to use, of logical depth ncache
   //   victim    : round-robin cursor naming the next slot to evict when full
   // Returns a pointer the caller must really free: the entry displaced to make
   // room, ptr itself when the cache has zero depth, or NULL when nothing
   // needs freeing.
 #ifdef GRID_OMP
   assert(omp_in_parallel()==0);
 #endif 
   // A depth of zero is a legal configuration (Init accepts "0" from the
   // environment). Without this guard the eviction path below would evaluate
   // (victim+1)%0 and write into a slot outside the logical cache.
   if ( ncache <= 0 ) return ptr;

   void * ret = NULL;
   int v = -1;
   // Prefer the first unoccupied slot
   for(int e=0;e<ncache;e++) {
     if ( entries[e].valid==0 ) {
       v=e; 
       break;
     }
   }
   // Cache full: take the round-robin victim and advance the cursor
   if ( v==-1 ) {
     v=victim;
     victim = (victim+1)%ncache;
   }
   // Hand the displaced allocation back to the caller
   if ( entries[v].valid ) {
     ret = entries[v].address;
     entries[v].valid = 0;
     entries[v].address = NULL;
     entries[v].bytes = 0;
   }
   entries[v].address=ptr;
   entries[v].bytes  =bytes;
   entries[v].valid  =1;
   return ret;
 }
 void *PointerCache::Lookup(size_t bytes)
 {
   // Search the small or large cache, chosen by the requested size, for a
   // recycled allocation. Returns NULL when the selected cache has no match.
   const bool useSmallCache = (bytes < GRID_ALLOC_SMALL_LIMIT);
   if ( !useSmallCache ) {
     return Lookup(bytes,Entries,Ncache);
   }
   return Lookup(bytes,EntriesSmall,NcacheSmall);
 }
 void *PointerCache::Lookup(size_t bytes,PointerCacheEntry *entries,int ncache) 
 {
 #ifdef GRID_OMP
  assert(omp_in_parallel()==0);
 #endif 
  for(int e=0;e<ncache;e++){
    if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
      entries[e].valid = 0;
      return entries[e].address;
    }
  }
  return NULL;
 }
 void check_huge_pages(void *Buf,uint64_t BYTES)
 {
 #ifdef __linux__
@@ -26,129 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef GRID_ALIGNED_ALLOCATOR_H
+#pragma once
 #define GRID_ALIGNED_ALLOCATOR_H
 #ifdef HAVE_MALLOC_MALLOC_H
 #include <malloc/malloc.h>
 #endif
 #ifdef HAVE_MALLOC_H
 #include <malloc.h>
 #endif
 #ifdef HAVE_MM_MALLOC_H
 #include <mm_malloc.h>
 #endif
 #define POINTER_CACHE
 #define GRID_ALLOC_ALIGN (2*1024*1024)
 #define GRID_ALLOC_SMALL_LIMIT (4096)
 NAMESPACE_BEGIN(Grid);
 // Move control to configure.ac and Config.h?
 // Cache of recently freed allocations, kept so that a later request of exactly
 // the same size can reuse the pointer instead of allocating again.
 // Two independent tables are kept: one for requests below
 // GRID_ALLOC_SMALL_LIMIT bytes ("small") and one for everything else.
 class PointerCache {
 private:
 /*Pinning pages is costly*/
 /*Could maintain separate large and small allocation caches*/
 /* Could make these configurable, perhaps up to a max size*/
  // Compiled-in table sizes (upper bounds on the run-time depths below)
  static const int NcacheSmallMax=128; 
  static const int NcacheMax=16;
  // Run-time depths; Init() may override them from the environment
  static int NcacheSmall;
  static int Ncache;
  // One cache slot: a freed pointer, its size, and an occupied flag
  typedef struct { 
    void *address;
    size_t bytes;
    int valid;
  } PointerCacheEntry;
  // Large-allocation table and its round-robin eviction cursor
  static PointerCacheEntry Entries[NcacheMax];
  static int Victim;
  // Small-allocation table and its round-robin eviction cursor
  static PointerCacheEntry EntriesSmall[NcacheSmallMax];
  static int VictimSmall;
 public:
  // Reads GRID_ALLOC_NCACHE_LARGE / GRID_ALLOC_NCACHE_SMALL and clamps the depths
  static void Init(void);
  // Insert a freed pointer; returns a displaced pointer that must be freed, or NULL
  static void *Insert(void *ptr,size_t bytes) ;
  static void *Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) ;
  // Fetch a cached pointer of exactly `bytes` bytes, or NULL if none is cached
  static void *Lookup(size_t bytes) ;
  static void *Lookup(size_t bytes,PointerCacheEntry *entries,int ncache) ;
 };
 std::string sizeString(size_t bytes);
 // Byte counters maintained by the profilerAllocate / profilerFree macros.
 struct MemoryStats
 {
   size_t totalAllocated{0};      // running sum of all bytes ever allocated
   size_t maxAllocated{0};        // high-water mark of currentlyAllocated
   size_t currentlyAllocated{0};  // bytes allocated and not yet freed
   size_t totalFreed{0};          // running sum of all bytes freed
 };
 // Global switches consulted by the profiler* macros.
 class MemoryProfiler
 {
 public:
  // Counters to update; the macros skip all accounting while this is null
  static MemoryStats *stats;
  // When true, each profilerAllocate/profilerFree also logs a debug message
  static bool        debug;
 };
 #ifdef GRID_NVCC
 #define profilerCudaMeminfo \
  { size_t f, t ; cudaMemGetInfo ( &f,&t); std::cout << GridLogDebug << "[Memory debug] Cuda free "<<f<<"/"<<t << std::endl;}
 #else
 #define profilerCudaMeminfo
 #endif
 #define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
 // Log the current MemoryProfiler counters (only when MemoryProfiler::stats is
 // set), followed by profilerCudaMeminfo.
 // NOTE: expands to a bare `if` block plus a further statement, not a single
 // statement; do not use it as the sole body of an unbraced if/else.
 #define profilerDebugPrint						\
  if (MemoryProfiler::stats)						\
    {									\
      auto s = MemoryProfiler::stats;					\
      std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
      std::cout << GridLogDebug << "[Memory debug] total  : " << memString(s->totalAllocated) \
 		<< std::endl;						\
      std::cout << GridLogDebug << "[Memory debug] max    : " << memString(s->maxAllocated) \
 		<< std::endl;						\
      std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
 		<< std::endl;						\
      std::cout << GridLogDebug << "[Memory debug] freed  : " << memString(s->totalFreed) \
 		<< std::endl;						\
    }									\
  profilerCudaMeminfo;
 // Record an allocation of `bytes` bytes: updates the total, current and
 // high-water counters when MemoryProfiler::stats is set, and logs the event
 // when MemoryProfiler::debug is set.
 // NOTE: expands to two consecutive bare `if` blocks, not a single statement;
 // do not use it as the sole body of an unbraced if/else.
 #define profilerAllocate(bytes)						\
  if (MemoryProfiler::stats)						\
    {									\
      auto s = MemoryProfiler::stats;					\
      s->totalAllocated     += (bytes);					\
      s->currentlyAllocated += (bytes);					\
      s->maxAllocated        = std::max(s->maxAllocated, s->currentlyAllocated); \
    }									\
  if (MemoryProfiler::debug)						\
    {									\
      std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
      profilerDebugPrint;						\
    }
 // Record a deallocation of `bytes` bytes: updates the freed and current
 // counters when MemoryProfiler::stats is set, and logs the event when
 // MemoryProfiler::debug is set.
 // NOTE: expands to two consecutive bare `if` blocks, not a single statement;
 // do not use it as the sole body of an unbraced if/else.
 #define profilerFree(bytes)						\
  if (MemoryProfiler::stats)						\
    {									\
      auto s = MemoryProfiler::stats;					\
      s->totalFreed         += (bytes);					\
      s->currentlyAllocated -= (bytes);					\
    }									\
  if (MemoryProfiler::debug)						\
    {									\
      std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
      profilerDebugPrint;						\
    }
 void check_huge_pages(void *Buf,uint64_t BYTES);
 ////////////////////////////////////////////////////////////////////
 // A lattice of something, but assume the something is SIMDized.
 ////////////////////////////////////////////////////////////////////
 template<typename _Tp>
 class alignedAllocator {
 public: 
@@ -172,89 +53,122 @@ public:
  { 
    size_type bytes = __n*sizeof(_Tp);
    profilerAllocate(bytes);
-
+    _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
-
+    assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
 #ifdef POINTER_CACHE
    _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
 #else
    pointer ptr = nullptr;
 #endif
 #ifdef GRID_NVCC
    ////////////////////////////////////
    // Unified (managed) memory
    ////////////////////////////////////
    if ( ptr == (_Tp *) NULL ) {
      //      printf(" alignedAllocater cache miss %ld bytes ",bytes);      BACKTRACEFP(stdout);
      auto err = cudaMallocManaged((void **)&ptr,bytes);
      if( err != cudaSuccess ) {
 	ptr = (_Tp *) NULL;
 	std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
 	assert(0);
      }
    } 
    assert( ptr != (_Tp *)NULL);
 #else 
    //////////////////////////////////////////////////////////////////////////////////////////
    // 2MB align; could make option probably doesn't need configurability
    //////////////////////////////////////////////////////////////////////////////////////////
  #ifdef HAVE_MM_MALLOC_H
    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
  #else
    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
  #endif
    assert( ptr != (_Tp *)NULL);
    //////////////////////////////////////////////////
    // First touch optimise in threaded loop 
    //////////////////////////////////////////////////
    uint64_t *cp = (uint64_t *)ptr;
    thread_for(n,bytes/sizeof(uint64_t), { // need only one touch per page
      cp[n]=0;
    });
 #endif
    return ptr;
  }
-  void deallocate(pointer __p, size_type __n) { 
+  void deallocate(pointer __p, size_type __n) 
  { 
    size_type bytes = __n * sizeof(_Tp);
    profilerFree(bytes);
-
+    MemoryManager::CpuFree((void *)__p,bytes);
 #ifdef POINTER_CACHE
    pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
 #else 
    pointer __freeme = __p;
 #endif
 #ifdef GRID_NVCC
    if ( __freeme ) cudaFree((void *)__freeme);
 #else 
  #ifdef HAVE_MM_MALLOC_H
    if ( __freeme ) _mm_free((void *)__freeme); 
  #else
    if ( __freeme ) free((void *)__freeme);
  #endif
 #endif
  }
-  // FIXME: hack for the copy constructor, eventually it must be avoided
+  // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
-  void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
+  void construct(pointer __p, const _Tp& __val) { assert(0);};
  //void construct(pointer __p, const _Tp& __val) { };
  void construct(pointer __p) { };
  void destroy(pointer __p) { };
 };
 template<typename _Tp>  inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
 template<typename _Tp>  inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
 //////////////////////////////////////////////////////////////////////////////////////
 // Unified virtual memory
 //////////////////////////////////////////////////////////////////////////////////////
 // Standard-library style allocator whose storage comes from
 // MemoryManager::SharedAllocate and is returned via MemoryManager::SharedFree.
 // The allocator is stateless: all instances compare equal.
 template<typename _Tp>
 class uvmAllocator {
 public: 
   // Allocator member types
   using size_type       = std::size_t;
   using difference_type = std::ptrdiff_t;
   using pointer         = _Tp*;
   using const_pointer   = const _Tp*;
   using reference       = _Tp&;
   using const_reference = const _Tp&;
   using value_type      = _Tp;
   template<typename _Tp1>  struct rebind { using other = uvmAllocator<_Tp1>; };

   uvmAllocator() throw() { }
   uvmAllocator(const uvmAllocator&) throw() { }
   template<typename _Tp1> uvmAllocator(const uvmAllocator<_Tp1>&) throw() { }
   ~uvmAllocator() throw() { }

   pointer address(reference __x) const { return &__x; }
   size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }

   // Obtain storage for __n objects; the hint _p is ignored.
   // Asserts that the memory manager returned a non-null pointer.
   pointer allocate(size_type __n, const void* _p= 0)
   { 
     const size_type nbytes = __n*sizeof(_Tp);
     profilerAllocate(nbytes);
     pointer storage = (pointer) MemoryManager::SharedAllocate(nbytes);
     assert( storage != (pointer)NULL );
     return storage;
   }

   // Return storage for __n objects to the memory manager.
   void deallocate(pointer __p, size_type __n) 
   { 
     const size_type nbytes = __n * sizeof(_Tp);
     profilerFree(nbytes);
     MemoryManager::SharedFree((void *)__p,nbytes);
   }

   // Copy-construct in place; default construction and destruction are no-ops.
   void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); }
   void construct(pointer __p) { }
   void destroy(pointer __p) { }
 };
 template<typename _Tp>  inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; }
 template<typename _Tp>  inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; }
 ////////////////////////////////////////////////////////////////////////////////
 // Device memory
 ////////////////////////////////////////////////////////////////////////////////
 // Standard-library style allocator whose storage comes from
 // MemoryManager::AcceleratorAllocate and is returned via
 // MemoryManager::AcceleratorFree. construct() and destroy() do nothing, so
 // elements are never initialised through this allocator.
 // The allocator is stateless: all instances compare equal.
 template<typename _Tp>
 class devAllocator {
 public: 
   // Allocator member types
   using size_type       = std::size_t;
   using difference_type = std::ptrdiff_t;
   using pointer         = _Tp*;
   using const_pointer   = const _Tp*;
   using reference       = _Tp&;
   using const_reference = const _Tp&;
   using value_type      = _Tp;
   template<typename _Tp1>  struct rebind { using other = devAllocator<_Tp1>; };

   devAllocator() throw() { }
   devAllocator(const devAllocator&) throw() { }
   template<typename _Tp1> devAllocator(const devAllocator<_Tp1>&) throw() { }
   ~devAllocator() throw() { }

   pointer address(reference __x) const { return &__x; }
   size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }

   // Obtain storage for __n objects; the hint _p is ignored.
   // Asserts that the memory manager returned a non-null pointer.
   pointer allocate(size_type __n, const void* _p= 0)
   { 
     const size_type nbytes = __n*sizeof(_Tp);
     profilerAllocate(nbytes);
     pointer storage = (pointer) MemoryManager::AcceleratorAllocate(nbytes);
     assert( storage != (pointer)NULL );
     return storage;
   }

   // Return storage for __n objects to the memory manager.
   void deallocate(pointer __p, size_type __n) 
   { 
     const size_type nbytes = __n * sizeof(_Tp);
     profilerFree(nbytes);
     MemoryManager::AcceleratorFree((void *)__p,nbytes);
   }

   // Deliberately empty: no element construction or destruction is performed.
   void construct(pointer __p, const _Tp& __val) { }
   void construct(pointer __p) { }
   void destroy(pointer __p) { }
 };
 template<typename _Tp>  inline bool operator==(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return true; }
 template<typename _Tp>  inline bool operator!=(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return false; }
 ////////////////////////////////////////////////////////////////////////////////
 // Template typedefs
 ////////////////////////////////////////////////////////////////////////////////
-template<class T> using commAllocator = alignedAllocator<T>;
+//template<class T> using commAllocator = devAllocator<T>;
-template<class T> using Vector     = std::vector<T,alignedAllocator<T> >;           
+template<class T> using Vector     = std::vector<T,uvmAllocator<T> >;           
-template<class T> using commVector = std::vector<T,alignedAllocator<T> >;
+template<class T> using commVector = std::vector<T,devAllocator<T> >;
 template<class T> using Matrix     = std::vector<std::vector<T,alignedAllocator<T> > >;
 NAMESPACE_END(Grid);
-#endif
+
@@ -0,0 +1,4 @@
 #pragma once
 #include <Grid/allocator/MemoryStats.h>
 #include <Grid/allocator/MemoryManager.h>
 #include <Grid/allocator/AlignedAllocator.h>
@@ -0,0 +1,254 @@
 #include <Grid/GridCore.h>
 NAMESPACE_BEGIN(Grid);
 /*
  * Allocation types, saying which pointer cache should be used.
  * Each "Small" index is its base index + 1: the cache selection code adds
  * the boolean (bytes < GRID_ALLOC_SMALL_LIMIT) to the base type.
  */
 #define Cpu      (0)
 #define CpuSmall (1)
 #define Acc      (2)
 #define AccSmall (3)
 #define Shared   (4)
 #define SharedSmall (5)
 /* Running byte totals per memory kind, reported by MemoryManager::PrintBytes */
 uint64_t total_shared;
 uint64_t total_device;
 uint64_t total_host;
 void MemoryManager::PrintBytes(void)
 {
   // Report the running byte totals for each memory kind on stdout.
   const char *prefix = " MemoryManager : ";
   std::cout << prefix << total_shared << " shared      bytes " << std::endl;
   std::cout << prefix << total_device << " accelerator bytes " << std::endl;
   std::cout << prefix << total_host   << " cpu         bytes " << std::endl;
 }
 //////////////////////////////////////////////////////////////////////
 // Data tables for the caches of recently freed pointers
 //////////////////////////////////////////////////////////////////////
 // One table of cache slots per allocation type
 MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
 // Round-robin eviction cursor for each allocation type
 int MemoryManager::Victim[MemoryManager::NallocType];
 // Default cache depth per type, in index order:
 // Cpu, CpuSmall, Acc, AccSmall, Shared, SharedSmall
 int MemoryManager::Ncache[MemoryManager::NallocType] = { 8, 32, 8, 32, 8, 32 };
 //////////////////////////////////////////////////////////////////////
 // Actual allocation and deallocation utils
 //////////////////////////////////////////////////////////////////////
 void *MemoryManager::AcceleratorAllocate(size_t bytes)
 {
   // Reuse a cached device allocation of exactly this size when one exists.
   void *recycled = (void *) Lookup(bytes,Acc);
   if ( recycled != (void *) NULL ) {
     return recycled;
   }
   // Cache miss: make a fresh device allocation and account for it.
   void *fresh = (void *) acceleratorAllocDevice(bytes);
   total_device+=bytes;
   return fresh;
 }
 void  MemoryManager::AcceleratorFree    (void *ptr,size_t bytes)
 {
   // Park the pointer in the device cache; only what the cache hands back
   // (a displaced entry, or ptr itself when caching is disabled) is released.
   void *evicted = Insert(ptr,bytes,Acc);
   if ( evicted == NULL ) {
     return;
   }
   acceleratorFreeDevice(evicted);
   // NOTE(review): this subtracts the size of the pointer being inserted,
   // which need not equal the size of the evicted allocation - confirm.
   total_device-=bytes;
   //    PrintBytes();
 }
 void *MemoryManager::SharedAllocate(size_t bytes)
 {
   // Reuse a cached shared allocation of exactly this size when one exists.
   void *recycled = (void *) Lookup(bytes,Shared);
   if ( recycled != (void *) NULL ) {
     return recycled;
   }
   // Cache miss: make a fresh shared allocation and account for it.
   void *fresh = (void *) acceleratorAllocShared(bytes);
   total_shared+=bytes;
   //    std::cout <<"AcceleratorAllocate: allocated Shared pointer "<<std::hex<<fresh<<std::dec<<std::endl;
   //    PrintBytes();
   return fresh;
 }
 void  MemoryManager::SharedFree    (void *ptr,size_t bytes)
 {
   // Park the pointer in the shared cache; only what the cache hands back
   // (a displaced entry, or ptr itself when caching is disabled) is released.
   void *evicted = Insert(ptr,bytes,Shared);
   if ( evicted == NULL ) {
     return;
   }
   acceleratorFreeShared(evicted);
   // NOTE(review): this subtracts the size of the pointer being inserted,
   // which need not equal the size of the evicted allocation - confirm.
   total_shared-=bytes;
   //    PrintBytes();
 }
 #ifdef GRID_UVM
 void *MemoryManager::CpuAllocate(size_t bytes)
 {
   // GRID_UVM build: "cpu" memory is taken from the shared allocator.
   // Reuse a cached allocation of exactly this size when one exists.
   void *recycled = (void *) Lookup(bytes,Cpu);
   if ( recycled != (void *) NULL ) {
     return recycled;
   }
   void *fresh = (void *) acceleratorAllocShared(bytes);
   total_host+=bytes;
   return fresh;
 }
 void  MemoryManager::CpuFree    (void *_ptr,size_t bytes)
 {
   // GRID_UVM build. NotifyDeletion is always called first, then the pointer
   // is parked in the Cpu cache; only what the cache hands back is released.
   NotifyDeletion(_ptr);
   void *evicted = Insert(_ptr,bytes,Cpu);
   if ( evicted == NULL ) {
     return;
   }
   acceleratorFreeShared(evicted);
   total_host-=bytes;
 }
 #else
 void *MemoryManager::CpuAllocate(size_t bytes)
 {
   // Non-UVM build: "cpu" memory is taken from acceleratorAllocCpu.
   // Reuse a cached allocation of exactly this size when one exists.
   void *recycled = (void *) Lookup(bytes,Cpu);
   if ( recycled != (void *) NULL ) {
     return recycled;
   }
   void *fresh = (void *) acceleratorAllocCpu(bytes);
   total_host+=bytes;
   return fresh;
 }
 void  MemoryManager::CpuFree    (void *_ptr,size_t bytes)
 {
   // Non-UVM build. NotifyDeletion is always called first, then the pointer
   // is parked in the Cpu cache; only what the cache hands back is released.
   NotifyDeletion(_ptr);
   void *evicted = Insert(_ptr,bytes,Cpu);
   if ( evicted == NULL ) {
     return;
   }
   acceleratorFreeCpu(evicted);
   total_host-=bytes;
 }
 #endif
 //////////////////////////////////////////
 // call only once
 //////////////////////////////////////////
 void MemoryManager::Init(void)
 {
   // Optionally override the cache depths from the environment.
   //   GRID_ALLOC_NCACHE_LARGE sets the depth of the Cpu, Acc and Shared caches
   //   GRID_ALLOC_NCACHE_SMALL sets the depth of the *Small caches
   // A value outside [0, NallocCacheMax) is ignored and the default is kept.
   // NOTE(review): the table has NallocCacheMax slots, so a depth equal to
   // NallocCacheMax looks usable yet is rejected here - confirm intent.
   char *str = getenv("GRID_ALLOC_NCACHE_LARGE");
   if ( str ) {
     const int Nc = atoi(str);
     if ( (Nc>=0) && (Nc < NallocCacheMax)) {
       Ncache[Cpu]=Nc;
       Ncache[Acc]=Nc;
       Ncache[Shared]=Nc;
     }
   }
   str = getenv("GRID_ALLOC_NCACHE_SMALL");
   if ( str ) {
     const int Nc = atoi(str);
     if ( (Nc>=0) && (Nc < NallocCacheMax)) {
       Ncache[CpuSmall]=Nc;
       Ncache[AccSmall]=Nc;
       Ncache[SharedSmall]=Nc;
     }
   }
 }
 // Log the memory manager configuration selected at compile time.
 void MemoryManager::InitMessage(void) {
 #ifndef GRID_UVM
  // DeviceMaxBytes is only reported in the non-unified build
  std::cout << GridLogMessage << "MemoryManager Cache "<< MemoryManager::DeviceMaxBytes <<" bytes "<<std::endl;
 #endif
  std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
 #ifdef ALLOCATION_CACHE
  // Only the Cpu depths are printed
  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
 #endif
 #ifdef GRID_UVM
  // Unified memory build: name the shared allocation call of the active backend
  std::cout << GridLogMessage<< "MemoryManager::Init() Unified memory space"<<std::endl;
 #ifdef GRID_CUDA
  std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMallocManaged"<<std::endl;
 #endif
 #ifdef GRID_HIP
  std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMallocManaged"<<std::endl;
 #endif
 #ifdef GRID_SYCL
  std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_shared"<<std::endl;
 #endif
 #else
  // Non-unified build: name the device allocation call of the active backend
  std::cout << GridLogMessage<< "MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory"<<std::endl;
 #ifdef GRID_CUDA
  std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMalloc"<<std::endl;
 #endif
 #ifdef GRID_HIP
  std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMalloc"<<std::endl;
 #endif
 #ifdef GRID_SYCL
  std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_device"<<std::endl;
 #endif
 #endif
 }
 void *MemoryManager::Insert(void *ptr,size_t bytes,int type) 
 {
   // Park a freed pointer in the cache for `type`; allocations below
   // GRID_ALLOC_SMALL_LIMIT use the cache at index type+1 instead.
   // Returns a pointer that must really be freed, or NULL. Without
   // ALLOCATION_CACHE nothing is cached and ptr itself is returned.
 #ifdef ALLOCATION_CACHE
   int cache = type;
   if ( bytes < GRID_ALLOC_SMALL_LIMIT ) {
     cache++;
   }
   return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);  
 #else
   return ptr;
 #endif
 }
 void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) 
 {
   // Store a freed pointer in the given cache table.
   //   ptr/bytes : the allocation being released and its size
   //   entries   : cache table to use, of logical depth ncache
   //   victim    : round-robin cursor naming the next slot to evict when full
   // Returns a pointer the caller must really free: the entry displaced to make
   // room, ptr itself when the cache has zero depth, or NULL when nothing
   // needs freeing.
   assert(ncache>=0);
 #ifdef GRID_OMP
   assert(omp_in_parallel()==0);
 #endif 
   // Init() accepts a depth of 0 from the environment, so a zero-depth cache
   // is a legal configuration: cache nothing and have the caller free ptr.
   // Previously this aborted on the assert, or computed (victim+1)%0 when
   // asserts were compiled out.
   if ( ncache == 0 ) return ptr;

   void * ret = NULL;
   int v = -1;
   // Prefer the first unoccupied slot
   for(int e=0;e<ncache;e++) {
     if ( entries[e].valid==0 ) {
       v=e; 
       break;
     }
   }
   // Cache full: take the round-robin victim and advance the cursor
   if ( v==-1 ) {
     v=victim;
     victim = (victim+1)%ncache;
   }
   // Hand the displaced allocation back to the caller
   if ( entries[v].valid ) {
     ret = entries[v].address;
     entries[v].valid = 0;
     entries[v].address = NULL;
     entries[v].bytes = 0;
   }
   entries[v].address=ptr;
   entries[v].bytes  =bytes;
   entries[v].valid  =1;
   return ret;
 }
 void *MemoryManager::Lookup(size_t bytes,int type)
 {
   // Look for a recycled allocation of exactly `bytes` bytes in the cache for
   // `type`; allocations below GRID_ALLOC_SMALL_LIMIT use index type+1.
   // Always NULL when ALLOCATION_CACHE is not defined.
 #ifdef ALLOCATION_CACHE
   int cache = type;
   if ( bytes < GRID_ALLOC_SMALL_LIMIT ) {
     cache++;
   }
   return Lookup(bytes,Entries[cache],Ncache[cache]);
 #else
   return NULL;
 #endif
 }
 void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) 
 {
   // Claim the first valid cache slot whose size is exactly `bytes`.
   // The slot is marked free and its address returned; NULL means no match.
   // A depth of 0 is accepted (Init() allows it from the environment) and
   // simply yields NULL, so every request falls through to a real allocation.
   assert(ncache>=0);
 #ifdef GRID_OMP
   assert(omp_in_parallel()==0);
 #endif 
   for(int e=0;e<ncache;e++){
     if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
       entries[e].valid = 0;
       return entries[e].address;
     }
   }
   return NULL;
 }
 NAMESPACE_END(Grid);
@@ -0,0 +1,182 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/MemoryManager.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 #include <list> 
 #include <unordered_map>  
 NAMESPACE_BEGIN(Grid);
 // Move control to configure.ac and Config.h?
 #define ALLOCATION_CACHE
 #define GRID_ALLOC_ALIGN (2*1024*1024)
 #define GRID_ALLOC_SMALL_LIMIT (4096)
 /*Pinning pages is costly*/
 ////////////////////////////////////////////////////////////////////////////
 // Advise the LatticeAccelerator class
 ////////////////////////////////////////////////////////////////////////////
 // Usage hint passed alongside a ViewMode when a view is opened.
 // Only the first two values are currently enabled; the rest are commented out.
 enum ViewAdvise {
 AdviseDefault       = 0x0,    // Regular data
 AdviseInfrequentUse = 0x1     // Advise that the data is used infrequently.  This can
                               // significantly influence performance of bulk storage.
 // AdviseTransient      = 0x2,   // Data will mostly be read.  On some architectures
                               // enables read-only copies of memory to be kept on
                               // host and device.
 // AdviseAcceleratorWriteDiscard = 0x4  // Field will be written in entirety on device
 };
 ////////////////////////////////////////////////////////////////////////////
 // View Access Mode
 ////////////////////////////////////////////////////////////////////////////
 // Access mode requested when opening a view. Values are distinct bits, except
 // that CpuWriteDiscard currently shares the value of CpuWrite.
 enum ViewMode {
  AcceleratorRead  = 0x01,
  AcceleratorWrite = 0x02,
  AcceleratorWriteDiscard = 0x04,
  CpuRead  = 0x08,
  CpuWrite = 0x10,
  CpuWriteDiscard = 0x10 // same for now
 };
 // Static-only manager for cpu, accelerator and shared allocations.
 // It keeps a cache of recently freed pointers per allocation type and, in
 // builds without GRID_UVM, declares the tables and helpers for a view cache.
 class MemoryManager {
 private:
  ////////////////////////////////////////////////////////////
  // For caching recently freed allocations
  ////////////////////////////////////////////////////////////
  // One cache slot: a freed pointer, its size, and an occupied flag
  typedef struct { 
    void *address;
    size_t bytes;
    int valid;
  } AllocationCacheEntry;
  static const int NallocCacheMax=128; 
  static const int NallocType=6;
  // Per-type slot table, round-robin eviction cursor and run-time depth
  static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
  static int Victim[NallocType];
  static int Ncache[NallocType];
  /////////////////////////////////////////////////
  // Free pool
  /////////////////////////////////////////////////
  // Insert parks a freed pointer and returns a pointer that must really be
  // freed (or NULL); Lookup returns a cached pointer of exactly `bytes`
  // bytes (or NULL).
  static void *Insert(void *ptr,size_t bytes,int type) ;
  static void *Lookup(size_t bytes,int type) ;
  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
  static void PrintBytes(void);
 public:
  static void Init(void);
  static void InitMessage(void);
  // Allocation entry points; each Free takes the same byte count as its Allocate
  static void *AcceleratorAllocate(size_t bytes);
  static void  AcceleratorFree    (void *ptr,size_t bytes);
  static void *SharedAllocate(size_t bytes);
  static void  SharedFree    (void *ptr,size_t bytes);
  static void *CpuAllocate(size_t bytes);
  static void  CpuFree    (void *ptr,size_t bytes);
  ////////////////////////////////////////////////////////
  // Footprint tracking
  ////////////////////////////////////////////////////////
  static uint64_t     DeviceBytes;
  static uint64_t     DeviceLRUBytes;
  static uint64_t     DeviceMaxBytes;
  static uint64_t     HostToDeviceBytes;
  static uint64_t     DeviceToHostBytes;
  static uint64_t     HostToDeviceXfer;
  static uint64_t     DeviceToHostXfer;
 private:
 #ifndef GRID_UVM
  //////////////////////////////////////////////////////////////////////
  // Data tables for ViewCache
  //////////////////////////////////////////////////////////////////////
  typedef std::list<uint64_t> LRU_t;
  typedef typename LRU_t::iterator LRUiterator;
  // Bookkeeping record for one view-cache entry
  typedef struct { 
    int        LRU_valid;
    LRUiterator LRU_entry;
    uint64_t CpuPtr;
    uint64_t AccPtr;
    size_t   bytes;
    uint32_t transient;
    uint32_t state;
    uint32_t accLock;
    uint32_t cpuLock;
  } AcceleratorViewEntry;
  // Table of view entries, keyed by a uint64_t (the CpuPtr in the Entry* helpers)
  typedef std::unordered_map<uint64_t,AcceleratorViewEntry> AccViewTable_t;
  typedef typename AccViewTable_t::iterator AccViewTableIterator ;
  static AccViewTable_t AccViewTable;
  static LRU_t LRU;
  /////////////////////////////////////////////////
  // Device motion
  /////////////////////////////////////////////////
  static void  Create(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
  static void  EvictVictims(uint64_t bytes); // Frees up <bytes>
  static void  Evict(AcceleratorViewEntry &AccCache);
  static void  Flush(AcceleratorViewEntry &AccCache);
  static void  Clone(AcceleratorViewEntry &AccCache);
  static void  AccDiscard(AcceleratorViewEntry &AccCache);
  static void  CpuDiscard(AcceleratorViewEntry &AccCache);
  //  static void  LRUupdate(AcceleratorViewEntry &AccCache);
  static void  LRUinsert(AcceleratorViewEntry &AccCache);
  static void  LRUremove(AcceleratorViewEntry &AccCache);
  // manage entries in the table
  static int                  EntryPresent(uint64_t CpuPtr);
  static void                 EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
  static void                 EntryErase (uint64_t CpuPtr);
  static AccViewTableIterator EntryLookup(uint64_t CpuPtr);
  static void                 EntrySet   (uint64_t CpuPtr,AcceleratorViewEntry &entry);
  static void     AcceleratorViewClose(uint64_t AccPtr);
  static uint64_t AcceleratorViewOpen(uint64_t  CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
  static void     CpuViewClose(uint64_t Ptr);
  static uint64_t CpuViewOpen(uint64_t  CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
 #endif
  // Called by CpuFree before the pointer is handed to the free-pool cache
  static void NotifyDeletion(void * CpuPtr);
 public:
  static void Print(void);
  static int   isOpen   (void* CpuPtr);
  static void  ViewClose(void* CpuPtr,ViewMode mode);
  static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
 };
 NAMESPACE_END(Grid);
@@ -0,0 +1,468 @@
 #include <Grid/GridCore.h>
 #ifndef GRID_UVM
 #warning "Using explicit device memory copies"
 NAMESPACE_BEGIN(Grid);
 #define dprintf(...)
 ////////////////////////////////////////////////////////////
 // For caching copies of data on device
 ////////////////////////////////////////////////////////////
 MemoryManager::AccViewTable_t MemoryManager::AccViewTable;
 MemoryManager::LRU_t MemoryManager::LRU;
 ////////////////////////////////////////////////////////
 // Footprint tracking
 ////////////////////////////////////////////////////////
 // Bytes currently held in accelerator allocations made by Clone/CpuDiscard
 uint64_t  MemoryManager::DeviceBytes;
 // Bytes belonging to entries that currently sit in the LRU queue (maintained by LRUinsert/LRUremove)
 uint64_t  MemoryManager::DeviceLRUBytes;
 // Threshold used by EvictVictims; initial value is 128 MiB
 uint64_t  MemoryManager::DeviceMaxBytes = 1024*1024*128;
 // Cumulative byte counts of host->device (Clone) and device->host (Flush) copies
 uint64_t  MemoryManager::HostToDeviceBytes;
 uint64_t  MemoryManager::DeviceToHostBytes;
 // Cumulative number of host->device (Clone) and device->host (Flush) copies
 uint64_t  MemoryManager::HostToDeviceXfer;
 uint64_t  MemoryManager::DeviceToHostXfer;
 ////////////////////////////////////
 // Priority ordering for unlocked entries
 //  Empty
 //  CpuDirty 
 //  Consistent
 //  AccDirty
 ////////////////////////////////////
 #define Empty         (0x0)  /*Entry unoccupied  */
 #define CpuDirty      (0x1)  /*CPU copy is golden, Acc buffer MAY not be allocated*/
 #define Consistent    (0x2)  /*ACC copy AND CPU copy are valid */
 #define AccDirty      (0x4)  /*ACC copy is golden */
 #define EvictNext     (0x8)  /*Priority for eviction*/
 /////////////////////////////////////////////////
 // Mechanics of data table maintenance
 /////////////////////////////////////////////////
 // Returns 1 when the view table holds an entry keyed by CpuPtr, 0 otherwise.
 int   MemoryManager::EntryPresent(uint64_t CpuPtr)
 {
  if ( AccViewTable.empty() ) {
    return 0;
  }
  auto count = AccViewTable.count(CpuPtr);
  // The table is keyed uniquely, so any other count indicates corruption
  assert((count==0)||(count==1));
  return count;
 }
 void  MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
 {
  assert(!EntryPresent(CpuPtr));
  AcceleratorViewEntry AccCache;
  AccCache.CpuPtr = CpuPtr;
  AccCache.AccPtr = (uint64_t)NULL;
  AccCache.bytes  = bytes;
  AccCache.state  = CpuDirty;
  AccCache.LRU_valid=0;
  AccCache.transient=0;
  AccCache.accLock=0;
  AccCache.cpuLock=0;
  AccViewTable[CpuPtr] = AccCache;
 }
 // Return the table iterator for CpuPtr; the entry is required to exist.
 MemoryManager::AccViewTableIterator MemoryManager::EntryLookup(uint64_t CpuPtr)
 {
  assert(EntryPresent(CpuPtr));

  AccViewTableIterator AccCacheIterator = AccViewTable.find(CpuPtr);
  assert(AccCacheIterator!=AccViewTable.end());

  return AccCacheIterator;
 }
 // Remove the table entry keyed by CpuPtr.
 void MemoryManager::EntryErase(uint64_t CpuPtr)
 {
  // Lookup is performed only for its presence assertions
  (void)EntryLookup(CpuPtr);
  AccViewTable.erase(CpuPtr);
 }
 // Place an entry that is not currently queued into the LRU queue.
 // Transient entries go to the back, which is where EvictVictims takes its
 // victims from; all other entries go to the front.
 void  MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
 {
  assert(AccCache.LRU_valid==0);

  const bool evictFirst = (AccCache.transient != 0);
  if ( !evictFirst ) {
    LRU.push_front(AccCache.CpuPtr);
    AccCache.LRU_entry = LRU.begin();
  } else {
    LRU.push_back(AccCache.CpuPtr);
    auto tail = LRU.end();
    --tail;                       // iterator to the element just appended
    AccCache.LRU_entry = tail;
  }

  AccCache.LRU_valid = 1;
  DeviceLRUBytes += AccCache.bytes;
 }
 // Take a queued entry out of the LRU queue and update the evictable byte count.
 void  MemoryManager::LRUremove(AcceleratorViewEntry &AccCache)
 {
  assert(AccCache.LRU_valid==1);

  LRU.erase(AccCache.LRU_entry);
  DeviceLRUBytes -= AccCache.bytes;
  AccCache.LRU_valid = 0;
 }
 /////////////////////////////////////////////////
 // Accelerator cache motion & consistency logic
 /////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////
 // Drop an entry without copying device data back to the host:
 // free the device buffer (if any), leave the LRU queue and erase
 // the table entry. The entry must be unlocked; an entry that owns
 // a device buffer is required to be in the LRU queue.
 // AccCache refers to erased storage once this returns.
 ///////////////////////////////////////////////////////////
 void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
 {
  assert(AccCache.state!=Empty);
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);

  const bool onDevice = (AccCache.AccPtr != (uint64_t)NULL);
  if ( onDevice ) {
    void *devPtr = (void *)AccCache.AccPtr;
    AcceleratorFree(devPtr,AccCache.bytes);
    DeviceBytes -= AccCache.bytes;
    LRUremove(AccCache);
  }

  // Key is passed by value, so it survives the erase of AccCache itself
  EntryErase(AccCache.CpuPtr);
 }
 ///////////////////////////////////////////////////////////////////////////
 // Evict an entry: bring the host copy up to date if the device copy is the
 // golden one, then release the device buffer and erase the table entry.
 // The entry must be unlocked; an entry that owns a device buffer is
 // required to be in the LRU queue.
 // AccCache refers to erased storage once this returns.
 ///////////////////////////////////////////////////////////////////////////
 void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
 {
  assert(AccCache.state!=Empty);
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);

  const bool deviceIsGolden = (AccCache.state==AccDirty);
  if ( deviceIsGolden ) {
    Flush(AccCache);
  }

  // Remaining steps (free device buffer, leave LRU, erase entry) are exactly
  // the no-flush discard path.
  AccDiscard(AccCache);
 }
 // Copy the device buffer of an AccDirty, unlocked entry back to the host
 // and mark the entry Consistent. Updates the device->host statistics.
 void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
 {
  assert(AccCache.state==AccDirty);
  assert(AccCache.cpuLock==0);
  assert(AccCache.accLock==0);
  assert(AccCache.AccPtr!=(uint64_t)NULL);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);

  void *deviceSrc = (void *)AccCache.AccPtr;
  void *hostDst   = (void *)AccCache.CpuPtr;
  acceleratorCopyFromDevice(deviceSrc,hostDst,AccCache.bytes);

  DeviceToHostBytes += AccCache.bytes;
  DeviceToHostXfer  += 1;

  AccCache.state = Consistent;
 }
 // Copy the host buffer of a CpuDirty, unlocked entry to the device,
 // allocating the device buffer first if the entry has none, and mark the
 // entry Consistent. Updates the host->device statistics.
 void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
 {
  assert(AccCache.state==CpuDirty);
  assert(AccCache.cpuLock==0);
  assert(AccCache.accLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);

  const bool needsBuffer = (AccCache.AccPtr==(uint64_t)NULL);
  if ( needsBuffer ) {
    AccCache.AccPtr = (uint64_t)AcceleratorAllocate(AccCache.bytes);
    DeviceBytes    += AccCache.bytes;
  }

  void *hostSrc   = (void *)AccCache.CpuPtr;
  void *deviceDst = (void *)AccCache.AccPtr;
  acceleratorCopyToDevice(hostSrc,deviceDst,AccCache.bytes);

  HostToDeviceBytes += AccCache.bytes;
  HostToDeviceXfer  += 1;

  AccCache.state = Consistent;
 }
 // Make the device copy the golden one without transferring host data:
 // allocate a device buffer if the entry has none and mark it AccDirty.
 void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
 {
  assert(AccCache.state!=Empty);
  assert(AccCache.cpuLock==0);
  assert(AccCache.accLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);

  const bool needsBuffer = (AccCache.AccPtr==(uint64_t)NULL);
  if ( needsBuffer ) {
    AccCache.AccPtr = (uint64_t)AcceleratorAllocate(AccCache.bytes);
    DeviceBytes    += AccCache.bytes;
  }

  AccCache.state = AccDirty;
 }
 /////////////////////////////////////////////////////////////////////////////////
 // View management
 /////////////////////////////////////////////////////////////////////////////////
 // Close a view previously opened with the given mode, dispatching to the
 // accelerator or CPU close routine. Any other mode is a programming error.
 void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
 {
  const bool accView = (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard);
  if ( accView ) {
    AcceleratorViewClose((uint64_t)Ptr);
    return;
  }

  const bool cpuView = (mode==CpuRead)||(mode==CpuWrite);
  if ( cpuView ) {
    CpuViewClose((uint64_t)Ptr);
    return;
  }

  assert(0);
 }
 // Open a view of the host buffer _CpuPtr in the requested mode and return
 // the pointer to use for that view (as produced by the accelerator or CPU
 // open routine). Any other mode is a programming error and yields NULL.
 void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
 {
  const uint64_t CpuPtr = (uint64_t)_CpuPtr;

  const bool accView = (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard);
  if ( accView ) {
    return (void *)AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
  }

  const bool cpuView = (mode==CpuRead)||(mode==CpuWrite);
  if ( cpuView ) {
    return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
  }

  assert(0);
  return NULL;
 }
 // Evict entries from the back of the LRU queue until a request of <bytes>
 // plus the evictable footprint fits under DeviceMaxBytes.
 //
 // If the request still does not fit once nothing evictable remains
 // (DeviceLRUBytes == 0), eviction cannot help any further, so return rather
 // than spin forever on an unchanged loop condition.
 void  MemoryManager::EvictVictims(uint64_t bytes)
 {
  while(bytes+DeviceLRUBytes > DeviceMaxBytes){
    if ( DeviceLRUBytes == 0 ) {
      // Nothing left to evict: the loop condition can no longer change
      return;
    }
    assert(LRU.size()>0);
    uint64_t victim = LRU.back();
    auto AccCacheIterator = EntryLookup(victim);
    auto & AccCache = AccCacheIterator->second;
    // Evict flushes if needed, frees the device buffer (which removes the
    // entry from the LRU and lowers DeviceLRUBytes) and erases the entry.
    Evict(AccCache);
  }
 }
 // Open an accelerator view of the host buffer CpuPtr.
 //  CpuPtr : host address, used as the table key
 //  bytes  : buffer size; must match the size recorded for an existing entry
 //  mode   : AcceleratorRead, AcceleratorWrite or AcceleratorWriteDiscard
 //  hint   : non-zero marks the entry transient (queued for early eviction)
 // Returns the device address recorded in the entry. Takes one accelerator
 // lock on the entry and removes it from the LRU queue if it was queued.
 uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
 {
  ////////////////////////////////////////////////////////////////////////////
  // Find if present, otherwise get or force an empty
  ////////////////////////////////////////////////////////////////////////////
  if ( EntryPresent(CpuPtr)==0 ){
    EvictVictims(bytes);
    EntryCreate(CpuPtr,bytes,mode,hint);
  }
  auto AccCacheIterator = EntryLookup(CpuPtr);
  auto & AccCache = AccCacheIterator->second;
  assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
  assert(AccCache.cpuLock==0);  // Programming error
  if(AccCache.state!=Empty) {
    assert(AccCache.CpuPtr == CpuPtr);
    assert(AccCache.bytes  ==bytes);
  }
 /*
 *  State transitions and actions
 *
 *  Action  State   StateNext         Flush    Clone
 *
 *  AccRead  Empty   Consistent        -        Y
 *  AccWrite Empty   AccDirty          -        Y
 *  AccRead  CpuDirty Consistent       -        Y
 *  AccWrite CpuDirty AccDirty         -        Y
 *  AccRead  Consistent Consistent     -        - 
 *  AccWrite Consistent AccDirty       -        - 
 *  AccRead  AccDirty   AccDirty       -        - 
 *  AccWrite AccDirty   AccDirty       -        - 
 *
 *  AcceleratorWriteDiscard follows the AccWrite rows, except that from
 *  Empty/CpuDirty it calls CpuDiscard (allocate only) instead of Clone.
 */
  // NOTE: EntryCreate initialises new entries as CpuDirty, so a freshly
  // created entry takes the CpuDirty branch below rather than this one.
  if(AccCache.state==Empty) {
    assert(AccCache.LRU_valid==0);
    AccCache.CpuPtr = CpuPtr;
    AccCache.AccPtr = (uint64_t)NULL;
    AccCache.bytes  = bytes;
    AccCache.state  = CpuDirty;   // Cpu starts primary
    if(mode==AcceleratorWriteDiscard){
      CpuDiscard(AccCache);
      AccCache.state  = AccDirty;   // Empty + AcceleratorWrite=> AccDirty
    } else if(mode==AcceleratorWrite){
      Clone(AccCache);
      AccCache.state  = AccDirty;   // Empty + AcceleratorWrite=> AccDirty
    } else {
      Clone(AccCache);
      AccCache.state  = Consistent; // Empty + AccRead => Consistent
    }
    AccCache.accLock= 1;
  } else if(AccCache.state==CpuDirty ){
    // Clone/CpuDiscard both assert that the entry is unlocked, so they must
    // run before accLock is incremented.
    if(mode==AcceleratorWriteDiscard) {
      CpuDiscard(AccCache);
      AccCache.state  = AccDirty;   // CpuDirty + AcceleratorWrite=> AccDirty
    } else if(mode==AcceleratorWrite) {
      Clone(AccCache);
      AccCache.state  = AccDirty;   // CpuDirty + AcceleratorWrite=> AccDirty
    } else {
      Clone(AccCache);
      AccCache.state  = Consistent; // CpuDirty + AccRead => Consistent
    }
    AccCache.accLock++;
    //    printf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
  } else if(AccCache.state==Consistent) {
    if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
      AccCache.state  = AccDirty;   // Consistent + AcceleratorWrite=> AccDirty
    else
      AccCache.state  = Consistent; // Consistent + AccRead => Consistent
    AccCache.accLock++;
    //    printf("Consistent entry into device accLock %d\n",AccCache.accLock);
  } else if(AccCache.state==AccDirty) {
    // Both arms leave the state unchanged; kept explicit to mirror the table
    if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
      AccCache.state  = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
    else
      AccCache.state  = AccDirty; // AccDirty + AccRead => AccDirty
    AccCache.accLock++;
    //    printf("AccDirty entry into device accLock %d\n",AccCache.accLock);
  } else {
    assert(0);
  }
  // If view is opened on device remove from LRU
  if(AccCache.LRU_valid==1){
    // must possibly remove from LRU as now locked on GPU
    LRUremove(AccCache);
  }
  // Record the eviction hint; LRUinsert reads it when the last view closes
  int transient =hint;
  AccCache.transient= transient? EvictNext : 0;
  return AccCache.AccPtr;
 }
 ////////////////////////////////////
 // look up & decrement lock count
 ////////////////////////////////////
 // Release one accelerator lock on the entry keyed by CpuPtr. When the last
 // accelerator lock is released the entry becomes evictable and is queued in
 // the LRU.
 void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
 {
  auto & AccCache = EntryLookup(CpuPtr)->second;

  assert(AccCache.cpuLock==0);
  assert(AccCache.accLock>0);

  AccCache.accLock -= 1;
  const bool unlocked = (AccCache.accLock==0);
  if ( unlocked ) {
    LRUinsert(AccCache);
  }
 }
 // Release one CPU lock on the entry keyed by CpuPtr.
 void MemoryManager::CpuViewClose(uint64_t CpuPtr)
 {
  auto & AccCache = EntryLookup(CpuPtr)->second;

  assert(AccCache.cpuLock>0);
  assert(AccCache.accLock==0);

  AccCache.cpuLock -= 1;
 }
 /*
 *  Action  State   StateNext         Flush    Clone
 *
 *  CpuRead  Empty   CpuDirty          -        -
 *  CpuWrite Empty   CpuDirty          -        -
 *  CpuRead  CpuDirty CpuDirty         -        -
 *  CpuWrite CpuDirty CpuDirty         -        - 
 *  CpuRead  Consistent Consistent     -        - 
 *  CpuWrite Consistent CpuDirty       -        - 
 *  CpuRead  AccDirty   Consistent     Y        -
 *  CpuWrite AccDirty   CpuDirty       Y        -
 */
 // Open a CPU view of the host buffer CpuPtr.
 //  CpuPtr    : host address, used as the table key
 //  bytes     : buffer size; must match the size recorded for an existing entry
 //  mode      : CpuRead or CpuWrite
 //  transient : non-zero marks the entry transient (queued for early eviction)
 // Returns the host address recorded in the entry. Takes one CPU lock on the
 // entry; an AccDirty entry is flushed back to the host first.
 uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise transient)
 {
  ////////////////////////////////////////////////////////////////////////////
  // Find if present, otherwise get or force an empty
  ////////////////////////////////////////////////////////////////////////////
  if ( EntryPresent(CpuPtr)==0 ){
    EvictVictims(bytes);
    EntryCreate(CpuPtr,bytes,mode,transient);
  }
  auto AccCacheIterator = EntryLookup(CpuPtr);
  auto & AccCache = AccCacheIterator->second;
  assert((mode==CpuRead)||(mode==CpuWrite));
  assert(AccCache.accLock==0);  // Programming error
  if(AccCache.state!=Empty) {
    assert(AccCache.CpuPtr == CpuPtr);
    assert(AccCache.bytes==bytes);
  }
  // NOTE: EntryCreate initialises new entries as CpuDirty, so a freshly
  // created entry takes the CpuDirty branch below rather than this one.
  if(AccCache.state==Empty) {
    AccCache.CpuPtr = CpuPtr;
    AccCache.AccPtr = (uint64_t)NULL;
    AccCache.bytes  = bytes;
    AccCache.state  = CpuDirty; // Empty + CpuRead/CpuWrite => CpuDirty
    AccCache.accLock= 0;
    AccCache.cpuLock= 1;
  } else if(AccCache.state==CpuDirty ){
    // AccPtr dont care, deferred allocate
    AccCache.state = CpuDirty; // CpuDirty +CpuRead/CpuWrite => CpuDirty
    AccCache.cpuLock++;
  } else if(AccCache.state==Consistent) {
    assert(AccCache.AccPtr != (uint64_t)NULL);
    if(mode==CpuWrite)
      AccCache.state = CpuDirty;   // Consistent +CpuWrite => CpuDirty
    else 
      AccCache.state = Consistent; // Consistent +CpuRead  => Consistent
    AccCache.cpuLock++;
  } else if(AccCache.state==AccDirty) {
    assert(AccCache.AccPtr != (uint64_t)NULL);
    // Flush asserts cpuLock==0, so it must run before the lock is taken
    Flush(AccCache);
    if(mode==CpuWrite) AccCache.state = CpuDirty;   // AccDirty +CpuWrite => CpuDirty, Flush
    else            AccCache.state = Consistent; // AccDirty +CpuRead  => Consistent, Flush
    AccCache.cpuLock++;
  } else {
    assert(0); // should be unreachable
  }
  // Record the eviction hint; LRUinsert reads it when the entry is queued
  AccCache.transient= transient? EvictNext : 0;
  return AccCache.CpuPtr;
 }
 // Called when the host buffer at _ptr is being deleted: if the view table
 // tracks it, drop the entry (and any device buffer) without flushing.
 void  MemoryManager::NotifyDeletion(void *_ptr)
 {
  const uint64_t ptr = (uint64_t)_ptr;
  if ( !EntryPresent(ptr) ) {
    return;   // never opened as a view: nothing to release
  }
  AccDiscard(EntryLookup(ptr)->second);
 }
 // Dump the footprint/transfer counters and one line per table entry to
 // std::cout via the GridLogDebug logger.
 void  MemoryManager::Print(void)
 {
  const char *rule = "--------------------------------------------";

  // Banner
  std::cout << GridLogDebug << rule << std::endl;
  std::cout << GridLogDebug << "Memory Manager                             " << std::endl;
  std::cout << GridLogDebug << rule << std::endl;

  // Footprint and transfer statistics
  std::cout << GridLogDebug << DeviceBytes   << " bytes allocated on device " << std::endl;
  std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
  std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device       " << std::endl;
  std::cout << GridLogDebug << HostToDeviceXfer << " transfers        to   device " << std::endl;
  std::cout << GridLogDebug << DeviceToHostXfer << " transfers        from device " << std::endl;
  std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to   device " << std::endl;
  std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
  std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;

  // Table header
  std::cout << GridLogDebug << rule << std::endl;
  std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
  std::cout << GridLogDebug << rule << std::endl;

  // One row per tracked buffer
  for(auto &keyed : AccViewTable){
    auto &entry = keyed.second;

    std::string label;   // stays empty for an unrecognised state value
    if      ( entry.state==Empty      ) label = std::string("Empty");
    else if ( entry.state==CpuDirty   ) label = std::string("CpuDirty");
    else if ( entry.state==AccDirty   ) label = std::string("AccDirty");
    else if ( entry.state==Consistent ) label = std::string("Consistent");

    std::cout << GridLogDebug
              << "0x"   << std::hex << entry.CpuPtr << std::dec
              << "\t0x" << std::hex << entry.AccPtr << std::dec
              << "\t"   << label
              << "\t"   << entry.cpuLock
              << "\t"   << entry.accLock
              << "\t"   << entry.LRU_valid << std::endl;
  }
  std::cout << GridLogDebug << rule << std::endl;
 }
 // Return the total number of open views (CPU locks plus accelerator locks)
 // on the buffer at _CpuPtr, or 0 if the buffer is not tracked.
 int   MemoryManager::isOpen   (void* _CpuPtr) 
 { 
  const uint64_t CpuPtr = (uint64_t)_CpuPtr;
  if ( !EntryPresent(CpuPtr) ) {
    return 0;
  }
  auto & AccCache = EntryLookup(CpuPtr)->second;
  return AccCache.cpuLock+AccCache.accLock;
 }
 NAMESPACE_END(Grid);
 #endif
@@ -0,0 +1,24 @@
 #include <Grid/GridCore.h>
 #ifdef GRID_UVM
 #warning "Grid is assuming unified virtual memory address space"
 NAMESPACE_BEGIN(Grid);
 /////////////////////////////////////////////////////////////////////////////////
 // View management is 1:1 address space mapping
 /////////////////////////////////////////////////////////////////////////////////
 // Footprint/transfer counters: defined so the class links, never updated in
 // this unified-memory build.
 uint64_t  MemoryManager::DeviceBytes;
 uint64_t  MemoryManager::DeviceLRUBytes;
 uint64_t  MemoryManager::DeviceMaxBytes = 1024*1024*128;
 uint64_t  MemoryManager::HostToDeviceBytes;
 uint64_t  MemoryManager::DeviceToHostBytes;
 uint64_t  MemoryManager::HostToDeviceXfer;
 uint64_t  MemoryManager::DeviceToHostXfer;

 // Closing a view requires no action.
 void  MemoryManager::ViewClose(void* AccPtr,ViewMode mode)
 {
 }

 // The view pointer is the host pointer itself.
 void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
 {
  return CpuPtr;
 }

 // No lock counts are kept, so nothing is ever reported as open.
 int   MemoryManager::isOpen   (void* CpuPtr)
 {
  return 0;
 }

 // Nothing to report.
 void  MemoryManager::Print(void)
 {
 }

 // No cached device copy exists, so deletion needs no bookkeeping.
 void  MemoryManager::NotifyDeletion(void *ptr)
 {
 }
 NAMESPACE_END(Grid);
 #endif
@@ -0,0 +1,67 @@
 #include <Grid/GridCore.h>
 #include <fcntl.h>
 NAMESPACE_BEGIN(Grid);
 MemoryStats *MemoryProfiler::stats = nullptr;
 bool         MemoryProfiler::debug = false;
 // Diagnostic (Linux only; a no-op elsewhere): report how many of the 4k pages
 // backing [Buf, Buf+BYTES) are laid out as contiguous 512-page (2 MiB) runs.
 //
 //  Buf   : start of the buffer to inspect
 //  BYTES : length of the buffer in bytes
 //
 // Reads one 64-bit record per 4k page from /proc/self/pagemap, takes the low
 // 55 bits of each record as the page frame number, and for every complete
 // group of 512 pages counts the pages whose physical address is not
 // base + j*4096. A trailing partial group (fewer than 512 pages) is not
 // inspected. The result is printed with the world rank.
 void check_huge_pages(void *Buf,uint64_t BYTES)
 {
 #ifdef __linux__
  int fd = open("/proc/self/pagemap", O_RDONLY);
  assert(fd >= 0);
  const int page_size = 4096;
  uint64_t virt_pfn = (uint64_t)Buf / page_size;
  off_t offset = sizeof(uint64_t) * virt_pfn;
  uint64_t npages = (BYTES + page_size-1) / page_size;
  // Heap storage: 8 bytes per 4k page is too large for the stack on big
  // buffers, and a runtime-sized array is not standard C++.
  std::vector<uint64_t> pagedata(npages);
  uint64_t ret = lseek(fd, offset, SEEK_SET);
  assert(ret == offset);
  ret = ::read(fd, pagedata.data(), sizeof(uint64_t)*npages);
  assert(ret == sizeof(uint64_t) * npages);
  // Done with the pagemap: release the descriptor so repeated calls do not leak
  ::close(fd);
  int nhugepages = npages / 512;
  int n4ktotal, nnothuge;
  n4ktotal = 0;
  nnothuge = 0;
  for (int i = 0; i < nhugepages; ++i) {
    // Physical address of the first 4k page in this 512-page group
    uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
    for (int j = 0; j < 512; ++j) {
      uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
      ++n4ktotal;
      // Not physically contiguous with the group base => not a huge page
      if (pageaddr != baseaddr + j * page_size)
        ++nnothuge;
    }
  }
  int rank = CartesianCommunicator::RankWorld();
  printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
 #endif
 }
 // Format a byte count as a human-readable string using binary (1024) steps,
 // e.g. 1023 -> "1023 B", 1024 -> "1 KB", 1536 -> "1.5 KB".
 // Whole values print without decimals, others with one decimal place.
 std::string sizeString(const size_t bytes)
 {
  constexpr unsigned int bufSize = 256;
  constexpr size_t       nSuffix = 7;
  const char             *suffixes[nSuffix] = {"", "K", "M", "G", "T", "P", "E"};
  char                   buf[bufSize];
  size_t                 s     = 0;
  double                 count = bytes;

  // Stop at the last suffix: s must remain a valid index into suffixes[]
  while (count >= 1024 && s + 1 < nSuffix)
    {
      s++;
      count /= 1024;
    }
  if (count - floor(count) == 0.0)
    {
      snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
    }
  else
    {
      snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
    }

  return std::string(buf);
 }
 NAMESPACE_END(Grid);
@@ -0,0 +1,95 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/MemoryStats.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 std::string sizeString(size_t bytes);
 // Byte counters updated by the profilerAllocate / profilerFree macros.
 // All counters start at zero.
 struct MemoryStats
 {
  size_t totalAllocated{0};      // cumulative bytes ever allocated
  size_t maxAllocated{0};        // high-water mark of currentlyAllocated
  size_t currentlyAllocated{0};  // bytes allocated and not yet freed
  size_t totalFreed{0};          // cumulative bytes ever freed
 };
 // Global switches read by the profiler macros.
 class MemoryProfiler
 {
 public:
  // When true, the profiler macros print a line for every allocate/free
  static bool        debug;
  // Counter block to update; the profiler macros skip accounting while null
  static MemoryStats *stats;
 };
 // Expands to a std::string of the form "<bytes> (<human readable size>)".
 // The replacement is fully parenthesised so the expansion behaves as one
 // expression next to any neighbouring operator (e.g. memString(x).size()).
 #define memString(bytes) (std::to_string(bytes) + " (" + sizeString(bytes) + ")")
 // Print the current MemoryProfiler::stats counters to the debug log.
 // Does nothing while MemoryProfiler::stats is null.
 // NOTE: expands to a bare `if` statement (not wrapped in do/while(0)), so do
 // not use it as the unbraced body of an if/else.
 #define profilerDebugPrint						\
  if (MemoryProfiler::stats)						\
    {									\
      auto s = MemoryProfiler::stats;					\
      std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
      std::cout << GridLogDebug << "[Memory debug] total  : " << memString(s->totalAllocated) \
 		<< std::endl;						\
      std::cout << GridLogDebug << "[Memory debug] max    : " << memString(s->maxAllocated) \
 		<< std::endl;						\
      std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
 		<< std::endl;						\
      std::cout << GridLogDebug << "[Memory debug] freed  : " << memString(s->totalFreed) \
 		<< std::endl;						\
    }
 // Account for an allocation of `bytes`: bumps total/current and refreshes the
 // high-water mark when stats is set; additionally logs when debug is set.
 // NOTE: expands to two consecutive `if` statements and evaluates `bytes`
 // more than once, so pass a side-effect-free expression and use it only as a
 // full statement inside braces.
 #define profilerAllocate(bytes)						\
  if (MemoryProfiler::stats)						\
    {									\
      auto s = MemoryProfiler::stats;					\
      s->totalAllocated     += (bytes);					\
      s->currentlyAllocated += (bytes);					\
      s->maxAllocated        = std::max(s->maxAllocated, s->currentlyAllocated); \
    }									\
  if (MemoryProfiler::debug)						\
    {									\
      std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
      profilerDebugPrint;						\
    }
 // Account for a release of `bytes`: bumps totalFreed and lowers current when
 // stats is set; additionally logs when debug is set.
 // NOTE: same expansion caveats as profilerAllocate.
 #define profilerFree(bytes)						\
  if (MemoryProfiler::stats)						\
    {									\
      auto s = MemoryProfiler::stats;					\
      s->totalFreed         += (bytes);					\
      s->currentlyAllocated -= (bytes);					\
    }									\
  if (MemoryProfiler::debug)						\
    {									\
      std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
      profilerDebugPrint;						\
    }
 void check_huge_pages(void *Buf,uint64_t BYTES);
 NAMESPACE_END(Grid);
@@ -81,6 +81,7 @@ public:
  bool _isCheckerBoarded; 
  int        LocallyPeriodic;
  Coordinate _checker_dim_mask;
 public:
@@ -38,6 +38,7 @@ class GridCartesian: public GridBase {
 public:
  int dummy;
  Coordinate _checker_dim_mask;
  virtual int  CheckerBoardFromOindexTable (int Oindex) {
    return 0;
  }
@@ -104,6 +105,7 @@ public:
    _ldimensions.resize(_ndimension);
    _rdimensions.resize(_ndimension);
    _simd_layout.resize(_ndimension);
    _checker_dim_mask.resize(_ndimension);;
    _lstart.resize(_ndimension);
    _lend.resize(_ndimension);
@@ -114,6 +116,8 @@ public:
    for (int d = 0; d < _ndimension; d++)
      {
 	_checker_dim_mask[d]=0;
        _fdimensions[d] = dimensions[d];   // Global dimensions
        _gdimensions[d] = _fdimensions[d]; // Global dimensions
        _simd_layout[d] = simd_layout[d];
@@ -36,11 +36,27 @@ static const int CbBlack=1;
 static const int Even   =CbRed;
 static const int Odd    =CbBlack;
 // Checkerboard parity (0 or 1) of the site with lexicographic index oindex
 // in a grid of extents rdim: the low bit of the sum of the coordinates in
 // those dimensions whose chk_dim_msk entry is non-zero.
 accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex, Coordinate &rdim, Coordinate &chk_dim_msk)
 {
  const int nd = rdim.size();

  // Decode the linear index into per-dimension coordinates
  Coordinate coor(nd);
  Lexicographic::CoorFromIndex(coor,oindex,rdim);

  int parity = 0;
  for(int d=0; d<nd; d++){
    parity += chk_dim_msk[d] ? coor[d] : 0;
  }
  return parity & 0x1;
 }
 // Specialise this for red black grids storing half the data like a chess board.
 class GridRedBlackCartesian : public GridBase
 {
 public:
-  Coordinate _checker_dim_mask;
+  //  Coordinate _checker_dim_mask;
  int              _checker_dim;
  std::vector<int> _checker_board;
@@ -138,21 +138,6 @@ public:
 		      int recv_from_rank,
 		      int bytes);
  void SendRecvPacket(void *xmit,
 		      void *recv,
 		      int xmit_to_rank,
 		      int recv_from_rank,
 		      int bytes);
  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 			   void *xmit,
 			   int xmit_to_rank,
 			   void *recv,
 			   int recv_from_rank,
 			   int bytes);
  void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
  double StencilSendToRecvFrom(void *xmit,
 			       int xmit_to_rank,
 			       void *recv,
@@ -43,8 +43,16 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
  if ( !flag ) {
    MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
 #if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori
    nCommThreads=1;
    // wrong results here too
    // For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs
    // other comms schemes are ok
    MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided);
 #else
    MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
 #endif
    //If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
    if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) {
      assert(0);
@@ -294,60 +302,28 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 					   int bytes)
 {
  std::vector<CommsRequest_t> reqs(0);
-  //    unsigned long  xcrc = crc32(0L, Z_NULL, 0);
+  unsigned long  xcrc = crc32(0L, Z_NULL, 0);
-  //    unsigned long  rcrc = crc32(0L, Z_NULL, 0);
+  unsigned long  rcrc = crc32(0L, Z_NULL, 0);
-  //    xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
+
  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
  SendToRecvFromComplete(reqs);
  //    rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
  //    printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc);
 }
 // Blocking point-to-point transfer of `bytes` bytes from rank `sender`
 // (buffer xmit) to rank `receiver` (buffer recv) on `communicator`.
 // Every rank may call this; only the sender and the receiver do any work.
 // The message tag is the sender's rank. sender and receiver must differ.
 void CartesianCommunicator::SendRecvPacket(void *xmit,
 					   void *recv,
 					   int sender,
 					   int receiver,
 					   int bytes)
 {
  MPI_Status stat;
  assert(sender != receiver);
  int tag = sender;
  if ( _processor == sender ) {
    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
  }
  if ( _processor == receiver ) { 
    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
  }
 }
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
 						void *recv,
 						int from,
 						int bytes)
 {
  int myrank = _processor;
  int ierr;
-  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { 
+  // Enforce no UVM in comms, device or host OK
-    MPI_Request xrq;
+  assert(acceleratorIsCommunicable(xmit));
-    MPI_Request rrq;
+  assert(acceleratorIsCommunicable(recv));
    ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
    ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
    assert(ierr==0);
    list.push_back(xrq);
    list.push_back(rrq);
  } else { 
  // Give the CPU to MPI immediately; can use threads to overlap optionally
  //  printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes);
  ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
 		    recv,bytes,MPI_CHAR,from, from,
 		    communicator,MPI_STATUS_IGNORE);
  assert(ierr==0);
  }
 }
  //  xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
  //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
  //  printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
 }
 // Basic Halo comms primitive
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int dest,
 						     void *recv,
@@ -403,15 +379,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  return off_node_bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
  SendToRecvFromComplete(waitall);
 }
 // Barrier across the ranks of the ShmComm communicator.
 void CartesianCommunicator::StencilBarrier(void)
 {
  MPI_Barrier  (ShmComm);
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  int nreq=list.size();
@@ -422,6 +390,13 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
  assert(ierr==0);
  list.resize(0);
 }
 // Barrier across the ranks of the ShmComm communicator.
 void CartesianCommunicator::StencilBarrier(void)
 {
  MPI_Barrier  (ShmComm);
 }
 //void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 //{
 //}
 void CartesianCommunicator::Barrier(void)
 {
  int ierr = MPI_Barrier(communicator);
@@ -483,5 +458,3 @@ void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t
 }
 NAMESPACE_END(Grid);
@@ -77,15 +77,6 @@ void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){}
 void CartesianCommunicator::GlobalXOR(uint32_t &){}
 void CartesianCommunicator::GlobalXOR(uint64_t &){}
 // Stub: calling this is treated as a programming error (asserts).
 void CartesianCommunicator::SendRecvPacket(void *xmit,
 					   void *recv,
 					   int xmit_to_rank,
 					   int recv_from_rank,
 					   int bytes)
 {
  assert(0);
 }
 // Basic Halo comms primitive -- should never call in single node
 void CartesianCommunicator::SendToRecvFrom(void *xmit,
@@ -96,20 +87,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 {
  assert(0);
 }
 // Stub: calling this is treated as a programming error (asserts).
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
 						void *recv,
 						int from,
 						int bytes)
 {
  assert(0);
 }
 // Stub: calling this is treated as a programming error (asserts).
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  assert(0);
 }
 void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes)
 {
  bcopy(in,out,bytes*words);
@@ -137,10 +114,6 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int recv_from_rank,
 						     int bytes, int dir)
 {
  std::vector<CommsRequest_t> list;
  // Discard the "dir"
  SendToRecvFromBegin   (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
  SendToRecvFromComplete(list);
  return 2.0*bytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
@@ -150,13 +123,10 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 							 int recv_from_rank,
 							 int bytes, int dir)
 {
  // Discard the "dir"
  SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
  return 2.0*bytes;
 }
 // Complete the outstanding requests in `waitall` by forwarding to
 // SendToRecvFromComplete; the `dir` argument is not used.
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
  SendToRecvFromComplete(waitall);
 }
 void CartesianCommunicator::StencilBarrier(void){};
@@ -29,9 +29,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridCore.h>
 #include <pwd.h>
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
 #include <cuda_runtime_api.h>
 #endif
 #ifdef GRID_HIP
 #include <hip/hip_runtime_api.h>
 #endif
 NAMESPACE_BEGIN(Grid); 
 #define header "SharedMemoryMpi: "
@@ -47,7 +50,12 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
  /////////////////////////////////////////////////////////////////////
  // Split into groups that can share memory
  /////////////////////////////////////////////////////////////////////
 #ifndef GRID_MPI3_SHM_NONE
  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
 #else
  MPI_Comm_split(comm, WorldRank, 0, &WorldShmComm);
 #endif
  MPI_Comm_rank(WorldShmComm     ,&WorldShmRank);
  MPI_Comm_size(WorldShmComm     ,&WorldShmSize);
@@ -170,17 +178,24 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
  std::vector<int> primes({2,3,5});
  int dim = 0;
  int last_dim = ndimension - 1;
  int AutoShmSize = 1;
  while(AutoShmSize != WorldShmSize) {
-    for(int p=0;p<primes.size();p++) {
+    int p;
    for(p=0;p<primes.size();p++) {
      int prime=primes[p];
      if ( divides(prime,WorldDims[dim]/ShmDims[dim])
        && divides(prime,WorldShmSize/AutoShmSize)  ) {
 	AutoShmSize*=prime;
 	ShmDims[dim]*=prime;
 	last_dim = dim;
 	break;
      }
    }
    if (p == primes.size() && last_dim == dim) {
      std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
      exit(EXIT_FAILURE);
    }
    dim=(dim+1) %ndimension;
  }
 }
@@ -413,7 +428,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Hugetlbfs mapping intended
 ////////////////////////////////////////////////////////////////////////////////////////////
-#ifdef GRID_NVCC
+#if defined(GRID_CUDA) ||defined(GRID_HIP)
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
  void * ShmCommBuf ; 
@@ -433,27 +448,18 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  //  cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
 #ifdef GRID_IBM_SUMMIT
  // IBM Jsrun makes cuda Device numbering screwy and not match rank
    std::cout << "IBM Summit or similar - NOT setting device to WorldShmRank"<<std::endl;
 #else
    std::cout << "setting device to WorldShmRank"<<std::endl;
    cudaSetDevice(WorldShmRank);
 #endif
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Each MPI rank should allocate our own buffer
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-  auto err =  cudaMalloc(&ShmCommBuf, bytes);
+  ShmCommBuf = acceleratorAllocDevice(bytes);
-  if ( err !=  cudaSuccess) {
+
    std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
    exit(EXIT_FAILURE);  
  }
  if (ShmCommBuf == (void *)NULL ) {
-    std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed NULL pointer for " << bytes<<" bytes " << std::endl;
+    std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
    exit(EXIT_FAILURE);  
  }
  if ( WorldRank == 0 ){
-    std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
+    std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes 
 	      << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
  }
  SharedMemoryZero(ShmCommBuf,bytes);
@@ -462,18 +468,30 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  for(int r=0;r<WorldShmSize;r++){
 #ifndef GRID_MPI3_SHM_NONE
    //////////////////////////////////////////////////
    // If it is me, pass around the IPC access key
    //////////////////////////////////////////////////
 #ifdef GRID_CUDA
    cudaIpcMemHandle_t handle;
    if ( r==WorldShmRank ) { 
-      err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
+      auto err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
      if ( err !=  cudaSuccess) {
 	std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
 	exit(EXIT_FAILURE);
      }
    }
 #endif
 #ifdef GRID_HIP
    hipIpcMemHandle_t handle;    
    if ( r==WorldShmRank ) { 
      auto err = hipIpcGetMemHandle(&handle,ShmCommBuf);
      if ( err !=  hipSuccess) {
 	std::cerr << " SharedMemoryMPI.cc hipIpcGetMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
 	exit(EXIT_FAILURE);
      }
    }
 #endif
    //////////////////////////////////////////////////
    // Share this IPC handle across the Shm Comm
    //////////////////////////////////////////////////
@@ -490,17 +508,31 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    // If I am not the source, overwrite thisBuf with remote buffer
    ///////////////////////////////////////////////////////////////
    void * thisBuf = ShmCommBuf;
 #ifdef GRID_CUDA
    if ( r!=WorldShmRank ) { 
-      err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
+      auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
      if ( err !=  cudaSuccess) {
 	std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
 	exit(EXIT_FAILURE);
      }
    }
 #endif
 #ifdef GRID_HIP
    if ( r!=WorldShmRank ) { 
      auto err = hipIpcOpenMemHandle(&thisBuf,handle,hipIpcMemLazyEnablePeerAccess);
      if ( err !=  hipSuccess) {
 	std::cerr << " SharedMemoryMPI.cc hipIpcOpenMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
 	exit(EXIT_FAILURE);
      }
    }
 #endif
    ///////////////////////////////////////////////////////////////
    // Save a copy of the device buffers
    ///////////////////////////////////////////////////////////////
    WorldShmCommBufs[r] = thisBuf;
 #else
    WorldShmCommBufs[r] = ShmCommBuf;
 #endif
  }
  _ShmAllocBytes=bytes;
@@ -677,7 +709,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 /////////////////////////////////////////////////////////////////////////
 void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
 {
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
  cudaMemset(dest,0,bytes);
 #else
  bzero(dest,bytes);
@@ -685,7 +717,7 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
 }
 void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
 {
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
  cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
 #else   
  bcopy(src,dest,bytes);
@@ -705,7 +737,11 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
  /////////////////////////////////////////////////////////////////////
  // Split into groups that can share memory
  /////////////////////////////////////////////////////////////////////
 #ifndef GRID_MPI3_SHM_NONE
  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
 #else
  MPI_Comm_split(comm, rank, 0, &ShmComm);
 #endif
  MPI_Comm_rank(ShmComm     ,&ShmRank);
  MPI_Comm_size(ShmComm     ,&ShmSize);
  ShmCommBufs.resize(ShmSize);
@@ -52,23 +52,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
-template<typename Op, typename T1> 
+template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> 
-auto Cshift(const LatticeUnaryExpression<Op,T1> &expr,int dim,int shift)
+auto Cshift(const Expression &expr,int dim,int shift)  -> decltype(closure(expr)) 
    -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> 
 {
  return Cshift(closure(expr),dim,shift);
 }
 template <class Op, class T1, class T2>
 auto Cshift(const LatticeBinaryExpression<Op,T1,T2> &expr,int dim,int shift)
  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> 
 {
  return Cshift(closure(expr),dim,shift);
 }
 template <class Op, class T1, class T2, class T3>
 auto Cshift(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr,int dim,int shift)
  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
 				   eval(0, expr.arg2),
 				   eval(0, expr.arg3)))> 
 {
  return Cshift(closure(expr),dim,shift);
 }
@@ -29,6 +29,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
 extern Vector<std::pair<int,int> > Cshift_table; 
 ///////////////////////////////////////////////////////////////////
 // Gather for when there is no need to SIMD split 
 ///////////////////////////////////////////////////////////////////
@@ -46,16 +48,16 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
  int e2=rhs.Grid()->_slice_block[dimension];
  int ent = 0;
-  static Vector<std::pair<int,int> > table; table.resize(e1*e2);
+  if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
  int stride=rhs.Grid()->_slice_stride[dimension];
  auto rhs_v = rhs.View();
  if ( cbmask == 0x3 ) { 
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o  = n*stride;
 	int bo = n*e2;
-	table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
+	Cshift_table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
      }
    }
  } else { 
@@ -65,15 +67,20 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
 	 int o  = n*stride;
 	 int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
 	 if ( ocb &cbmask ) {
-	   table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
+	   Cshift_table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
 	 }
       }
     }
  }
-  thread_for(i,ent,{
+  {
-    buffer[table[i].first]=rhs_v[table[i].second];
+    autoView(rhs_v , rhs, AcceleratorRead);
    auto buffer_p = & buffer[0];
    auto table = &Cshift_table[0];
    accelerator_for(i,ent,vobj::Nsimd(),{
 	coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
    });
  }
 }
 ///////////////////////////////////////////////////////////////////
 // Gather for when there *is* need to SIMD split 
@@ -95,35 +102,37 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
  int e2=rhs.Grid()->_slice_block[dimension];
  int n1=rhs.Grid()->_slice_stride[dimension];
  auto rhs_v = rhs.View();
  if ( cbmask ==0x3){
-    thread_for_collapse(2,n,e1,{
+    autoView(rhs_v , rhs, AcceleratorRead);
-      for(int b=0;b<e2;b++){
+    accelerator_for2d(n,e1,b,e2,1,{
 	int o      =   n*n1;
 	int offset = b+n*e2;
 	vobj temp =rhs_v[so+o+b];
 	extract<vobj>(temp,pointers,offset);
      }
      });
  } else { 
    autoView(rhs_v , rhs, AcceleratorRead);
-    // Case of SIMD split AND checker dim cannot currently be hit, except in 
+    Coordinate rdim=rhs.Grid()->_rdimensions;
-    // Test_cshift_red_black code.
+    Coordinate cdm =rhs.Grid()->_checker_dim_mask;
-    std::cout << " Dense packed buffer WARNING " <<std::endl;
+    std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
-    thread_for_collapse(2,n,e1,{
+    accelerator_for2d(n,e1,b,e2,1,{
-      for(int b=0;b<e2;b++){
+
 	Coordinate coor;
 	int o=n*n1;
-	int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
+	int oindex = o+b;
       	int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
 	int ocb=1<<cb;
 	int offset = b+n*e2;
 	if ( ocb & cbmask ) {
 	  vobj temp =rhs_v[so+o+b];
 	  extract<vobj>(temp,pointers,offset);
 	}
      }
      });
  }
 }
@@ -145,7 +154,8 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
  int e2=rhs.Grid()->_slice_block[dimension];
  int stride=rhs.Grid()->_slice_stride[dimension];
-  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
+  if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
  int ent    =0;
  if ( cbmask ==0x3 ) {
@@ -154,7 +164,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
      for(int b=0;b<e2;b++){
 	int o   =n*rhs.Grid()->_slice_stride[dimension];
 	int bo  =n*rhs.Grid()->_slice_block[dimension];
-	table[ent++] = std::pair<int,int>(so+o+b,bo+b);
+	Cshift_table[ent++] = std::pair<int,int>(so+o+b,bo+b);
      }
    }
@@ -165,17 +175,21 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
 	int o   =n*rhs.Grid()->_slice_stride[dimension];
 	int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 	if ( ocb & cbmask ) {
-	  table[ent++]=std::pair<int,int> (so+o+b,bo++);
+	  Cshift_table[ent++]=std::pair<int,int> (so+o+b,bo++);
 	}
      }
    }
  }
-  auto rhs_v = rhs.View();
+  {
-  thread_for(i,ent,{
+    autoView( rhs_v, rhs, AcceleratorWrite);
-    rhs_v[table[i].first]=buffer[table[i].second];
+    auto buffer_p = & buffer[0];
    auto table = &Cshift_table[0];
    accelerator_for(i,ent,vobj::Nsimd(),{
 	coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
    });
  }
 }
 //////////////////////////////////////////////////////
 // Scatter for when there *is* need to SIMD split
@@ -194,13 +208,13 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
  int e2=rhs.Grid()->_slice_block[dimension];
  if(cbmask ==0x3 ) {
-    auto rhs_v = rhs.View();
+    autoView( rhs_v , rhs, AcceleratorWrite);
-    thread_for_collapse(2,n,e1,{
+    int _slice_stride = rhs.Grid()->_slice_stride[dimension];
-      for(int b=0;b<e2;b++){
+    int _slice_block = rhs.Grid()->_slice_block[dimension];
-	int o      = n*rhs.Grid()->_slice_stride[dimension];
+    accelerator_for2d(n,e1,b,e2,1,{
-	int offset = b+n*rhs.Grid()->_slice_block[dimension];
+	int o      = n*_slice_stride;
 	int offset = b+n*_slice_block;
 	merge(rhs_v[so+o+b],pointers,offset);
      }
      });
  } else { 
@@ -208,7 +222,8 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
    // Test_cshift_red_black code.
    //    std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
    std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
-    auto rhs_v = rhs.View();
+    assert(0); // This will fail if hit on GPU
    autoView( rhs_v, rhs, CpuWrite);
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o      = n*rhs.Grid()->_slice_stride[dimension];
@@ -225,6 +240,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
 //////////////////////////////////////////////////////
 // local to node block strided copies
 //////////////////////////////////////////////////////
 template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
 {
  int rd = rhs.Grid()->_rdimensions[dimension];
@@ -239,14 +255,16 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
  int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc
  int e2=rhs.Grid()->_slice_block[dimension];
  int stride = rhs.Grid()->_slice_stride[dimension];
-  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
+
  if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
  int ent=0;
  if(cbmask == 0x3 ){
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int o =n*stride+b;
-	table[ent++] = std::pair<int,int>(lo+o,ro+o);
+	Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
      }
    }
  } else { 
@@ -255,23 +273,24 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
        int o =n*stride+b;
        int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o);
        if ( ocb&cbmask ) {
-	  table[ent++] = std::pair<int,int>(lo+o,ro+o);
+	  Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
 	}
      }
    }
  }
-  auto rhs_v = rhs.View();
+  {
-  auto lhs_v = lhs.View();
+    autoView(rhs_v , rhs, AcceleratorRead);
-  thread_for(i,ent,{
+    autoView(lhs_v , lhs, AcceleratorWrite);
-    lhs_v[table[i].first]=rhs_v[table[i].second];
+    auto table = &Cshift_table[0];
    accelerator_for(i,ent,vobj::Nsimd(),{
      coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
    });
-
+  }
 }
 template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
 {
  int rd = rhs.Grid()->_rdimensions[dimension];
  if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
@@ -285,30 +304,34 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
  int e2=rhs.Grid()->_slice_block [dimension];
  int stride = rhs.Grid()->_slice_stride[dimension];
-  static std::vector<std::pair<int,int> > table;  table.resize(e1*e2);
+  if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
  int ent=0;
  if ( cbmask == 0x3 ) {
    for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
      int o  =n*stride;
-      table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
+      Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
    }}
  } else {
    for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
      int o  =n*stride;
      int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b);
-      if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
+      if ( ocb&cbmask ) Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
    }}
  }
-  auto rhs_v = rhs.View();
+  {
-  auto lhs_v = lhs.View();
+    autoView( rhs_v, rhs, AcceleratorRead);
-  thread_for(i,ent,{
+    autoView( lhs_v, lhs, AcceleratorWrite);
    auto table = &Cshift_table[0];
    accelerator_for(i,ent,1,{
      permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
    });
  }
 }
 //////////////////////////////////////////////////////
 // Local to node Cshift
@@ -0,0 +1,4 @@
 #include <Grid/GridCore.h>       
 NAMESPACE_BEGIN(Grid);
 // Definition of the table of index pairs that the Cshift gather/scatter/copy
 // routines declare as extern. Those routines resize it only when it is smaller
 // than e1*e2 (so it grows to the largest size requested) and then overwrite
 // the leading entries before each kernel launch.
 // NOTE(review): one table is shared by every caller -- presumably callers are
 // serialised; confirm before using these routines concurrently.
 Vector<std::pair<int,int> > Cshift_table; 
 NAMESPACE_END(Grid);
@@ -26,6 +26,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    *************************************************************************************/
    /*  END LEGAL */
 #pragma once
 #include <Grid/lattice/Lattice_view.h>
 #include <Grid/lattice/Lattice_base.h>
 #include <Grid/lattice/Lattice_conformable.h>
 #include <Grid/lattice/Lattice_ET.h>
@@ -36,6 +37,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_reduction.h>
 #include <Grid/lattice/Lattice_peekpoke.h>
 //#include <Grid/lattice/Lattice_reality.h>
 #include <Grid/lattice/Lattice_real_imag.h>
 #include <Grid/lattice/Lattice_comparison_utils.h>
 #include <Grid/lattice/Lattice_comparison.h>
 #include <Grid/lattice/Lattice_coordinate.h>
@@ -42,9 +42,24 @@ NAMESPACE_BEGIN(Grid);
 ////////////////////////////////////////////////////
 // Predicated where support
 ////////////////////////////////////////////////////
 #ifdef GRID_SIMT
 // drop to scalar in SIMT; cleaner in fact
 template <class iobj, class vobj, class robj>
-accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
+accelerator_inline vobj predicatedWhere(const iobj &predicate, 
-                            const robj &iffalse) {
+					const vobj &iftrue, 
 					const robj &iffalse) 
 {
  Integer mask = TensorRemove(predicate);
  typename std::remove_const<vobj>::type ret= iffalse;
  if (mask) ret=iftrue;
  return ret;
 }
 #else
 template <class iobj, class vobj, class robj>
 accelerator_inline vobj predicatedWhere(const iobj &predicate, 
 					const vobj &iftrue, 
 					const robj &iffalse) 
 {
  typename std::remove_const<vobj>::type ret;
  typedef typename vobj::scalar_object scalar_object;
@@ -68,6 +83,7 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftru
  merge(ret, falsevals);
  return ret;
 }
 #endif
 /////////////////////////////////////////////////////
 //Specialization of getVectorType for lattices
@@ -81,26 +97,62 @@ struct getVectorType<Lattice<T> >{
 //--  recursive evaluation of expressions; --
 // handle leaves of syntax tree
 ///////////////////////////////////////////////////
-template<class sobj> accelerator_inline 
+template<class sobj,
  typename std::enable_if<!is_lattice<sobj>::value&&!is_lattice_expr<sobj>::value,sobj>::type * = nullptr> 
 accelerator_inline 
 sobj eval(const uint64_t ss, const sobj &arg)
 {
  return arg;
 }
 template <class lobj> accelerator_inline 
-const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg) 
+auto eval(const uint64_t ss, const LatticeView<lobj> &arg) -> decltype(arg(ss))
 {
-  return arg[ss];
+  return arg(ss);
 }
 ////////////////////////////////////////////
 //--  recursive evaluation of expressions; --
 // whole vector return, used only for expression return type inference
 ///////////////////////////////////////////////////
 // Generic leaf overload of vecEval: returns a copy of the argument unchanged.
 // The site index ss is not used.
 template<class sobj> accelerator_inline 
 sobj vecEval(const uint64_t ss, const sobj &arg)
 {
  return arg;
 }
 template <class lobj> accelerator_inline 
-const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg) 
+const lobj & vecEval(const uint64_t ss, const LatticeView<lobj> &arg) 
 {
-  auto view = arg.AcceleratorView(ViewRead);
+  return arg[ss];
  return view[ss];
 }
 ///////////////////////////////////////////////////
 // handle nodes in syntax tree- eval one operand
 // vecEval needed (but never called as all expressions offloaded) to infer the return type
 // in SIMT contexts of closure.
 ///////////////////////////////////////////////////
 // Unary expression node: evaluates the single operand with vecEval at index ss
 // and applies expr.op.func to the result. The trailing return type is the
 // decltype of that same call expression.
 template <typename Op, typename T1> accelerator_inline 
 auto vecEval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)  
  -> decltype(expr.op.func( vecEval(ss, expr.arg1)))
 {
  return expr.op.func( vecEval(ss, expr.arg1) );
 }
 // vecEval two operands
 // Binary expression node: evaluates both operands with vecEval at index ss and
 // combines them through expr.op.func; the return type is deduced from that call.
 template <typename Op, typename T1, typename T2> accelerator_inline
 auto vecEval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)  
  -> decltype(expr.op.func( vecEval(ss,expr.arg1),vecEval(ss,expr.arg2)))
 {
  return expr.op.func( vecEval(ss,expr.arg1), vecEval(ss,expr.arg2) );
 }
 // vecEval three operands
 // Trinary expression node: evaluates all three operands with vecEval at index
 // ss and passes them, in order arg1, arg2, arg3, to expr.op.func; the return
 // type is deduced from that call.
 template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
 auto vecEval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)  
  -> decltype(expr.op.func(vecEval(ss, expr.arg1), vecEval(ss, expr.arg2), vecEval(ss, expr.arg3)))
 {
  return expr.op.func(vecEval(ss, expr.arg1), vecEval(ss, expr.arg2), vecEval(ss, expr.arg3));
 }
 ///////////////////////////////////////////////////
 // handle nodes in syntax tree- eval one operand coalesced
 ///////////////////////////////////////////////////
 template <typename Op, typename T1> accelerator_inline 
 auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)  
@@ -108,23 +160,41 @@ auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
 {
  return expr.op.func( eval(ss, expr.arg1) );
 }
 ///////////////////////
 // eval two operands
 ///////////////////////
 // Binary expression node: recursively evaluates arg1 and arg2 at index ss with
 // eval and combines the two results through expr.op.func. The return type is
 // whatever that call expression yields.
 template <typename Op, typename T1, typename T2> accelerator_inline
 auto eval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)  
  -> decltype(expr.op.func( eval(ss,expr.arg1),eval(ss,expr.arg2)))
 {
  return expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) );
 }
 ///////////////////////
 // eval three operands
 ///////////////////////
 template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
 auto eval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)  
-  -> decltype(expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3)))
+  -> decltype(expr.op.func(eval(ss, expr.arg1), 
 			   eval(ss, expr.arg2), 
 			   eval(ss, expr.arg3)))
 {
-  return expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3));
+#ifdef GRID_SIMT
  // Handles Nsimd (vInteger) != Nsimd(ComplexD)
  typedef decltype(vecEval(ss, expr.arg2)) rvobj;
  typedef typename std::remove_reference<rvobj>::type vobj;
  const int Nsimd = vobj::vector_type::Nsimd();
  auto vpred = vecEval(ss,expr.arg1);
  ExtractBuffer<Integer> mask(Nsimd);
  extract<vInteger, Integer>(TensorRemove(vpred), mask);
  int s = acceleratorSIMTlane(Nsimd);
  return expr.op.func(mask[s],
 		      eval(ss, expr.arg2), 
 		      eval(ss, expr.arg3));
 #else
  return expr.op.func(eval(ss, expr.arg1),
 		      eval(ss, expr.arg2), 
 		      eval(ss, expr.arg3));
 #endif
 }
 //////////////////////////////////////////////////////////////////////////
@@ -180,16 +250,12 @@ inline void CBFromExpression(int &cb, const T1 &lat)  // Lattice leaf
  cb = lat.Checkerboard();
 }
 template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
-inline void CBFromExpression(int &cb, const T1 &notlat)  // non-lattice leaf
+inline void CBFromExpression(int &cb, const T1 &notlat) {} // non-lattice leaf
 {
 }
 // Unary expression node: forwards the cb reference to the node's single
 // operand. Nothing is written to cb at this level.
 template <typename Op, typename T1> inline 
 void CBFromExpression(int &cb,const LatticeUnaryExpression<Op, T1> &expr) 
 {
  CBFromExpression(cb, expr.arg1);  // recurse AST
 }
 template <typename Op, typename T1, typename T2> inline 
 void CBFromExpression(int &cb,const LatticeBinaryExpression<Op, T1, T2> &expr) 
 {
@@ -204,13 +270,74 @@ inline void CBFromExpression(int &cb, const LatticeTrinaryExpression<Op, T1, T2,
  CBFromExpression(cb, expr.arg3);  // recurse AST
 }
 //////////////////////////////////////////////////////////////////////////
 // ViewOpen
 //////////////////////////////////////////////////////////////////////////
 // Lattice leaf: enabled only when is_lattice<T1>::value is true.
 // Opens a view on the lattice in AcceleratorRead mode.
 template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
 inline void ExpressionViewOpen(T1 &lat)  // Lattice leaf
 {
  lat.ViewOpen(AcceleratorRead);
 }
 // Non-lattice leaf: enabled when is_lattice<T1>::value is false; there is no
 // view to open, so the body is empty.
 template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
  inline void ExpressionViewOpen(T1 &notlat) {}
 // Unary expression node: opens views by recursing into the single operand.
 template <typename Op, typename T1> inline 
 void ExpressionViewOpen(LatticeUnaryExpression<Op, T1> &expr) 
 {  
  ExpressionViewOpen(expr.arg1); // recurse AST
 }
 // Binary expression node: opens views by recursing into arg1, then arg2.
 template <typename Op, typename T1, typename T2> inline 
 void ExpressionViewOpen(LatticeBinaryExpression<Op, T1, T2> &expr) 
 {
  ExpressionViewOpen(expr.arg1);  // recurse AST
  ExpressionViewOpen(expr.arg2);  // recurse AST
 }
 // Trinary expression node: opens views by recursing into arg1, arg2 and arg3,
 // in that order.
 template <typename Op, typename T1, typename T2, typename T3>
 inline void ExpressionViewOpen(LatticeTrinaryExpression<Op, T1, T2, T3> &expr) 
 {
  ExpressionViewOpen(expr.arg1);  // recurse AST
  ExpressionViewOpen(expr.arg2);  // recurse AST
  ExpressionViewOpen(expr.arg3);  // recurse AST
 }
 //////////////////////////////////////////////////////////////////////////
 // ViewClose
 //////////////////////////////////////////////////////////////////////////
 // Lattice leaf: enabled only when is_lattice<T1>::value is true.
 // Closes the lattice's view by calling ViewClose().
 template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
 inline void ExpressionViewClose( T1 &lat)  // Lattice leaf
 {
  lat.ViewClose();
 }
 // Non-lattice leaf: enabled when is_lattice<T1>::value is false; there is no
 // view to close, so the body is empty.
 template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
 inline void ExpressionViewClose(T1 &notlat) {}
 // Unary expression node: closes views by recursing into the single operand.
 template <typename Op, typename T1> inline 
 void ExpressionViewClose(LatticeUnaryExpression<Op, T1> &expr) 
 {  
  ExpressionViewClose(expr.arg1); // recurse AST
 }
 // Binary expression node: closes views by recursing into arg1, then arg2.
 template <typename Op, typename T1, typename T2> inline 
 void ExpressionViewClose(LatticeBinaryExpression<Op, T1, T2> &expr) 
 {
  ExpressionViewClose(expr.arg1);  // recurse AST
  ExpressionViewClose(expr.arg2);  // recurse AST
 }
 // Trinary expression node: closes views by recursing into arg1, arg2 and arg3,
 // in that order.
 template <typename Op, typename T1, typename T2, typename T3>
 inline void ExpressionViewClose(LatticeTrinaryExpression<Op, T1, T2, T3> &expr) 
 {
  ExpressionViewClose(expr.arg1);  // recurse AST
  ExpressionViewClose(expr.arg2);  // recurse AST
  ExpressionViewClose(expr.arg3);  // recurse AST
 }
 ////////////////////////////////////////////
 // Unary operators and funcs
 ////////////////////////////////////////////
 #define GridUnopClass(name, ret)					\
  template <class arg>							\
  struct name {								\
-    static auto accelerator_inline func(const arg a) -> decltype(ret) { return ret; } \
+    template<class _arg> static auto accelerator_inline func(const _arg a) -> decltype(ret) { return ret; } \
  };
 GridUnopClass(UnarySub, -a);
@@ -221,8 +348,6 @@ GridUnopClass(UnaryTrace, trace(a));
 GridUnopClass(UnaryTranspose, transpose(a));
 GridUnopClass(UnaryTa, Ta(a));
 GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
 GridUnopClass(UnaryReal, real(a));
 GridUnopClass(UnaryImag, imag(a));
 GridUnopClass(UnaryToReal, toReal(a));
 GridUnopClass(UnaryToComplex, toComplex(a));
 GridUnopClass(UnaryTimesI, timesI(a));
@@ -241,10 +366,10 @@ GridUnopClass(UnaryExp, exp(a));
 // Binary operators
 ////////////////////////////////////////////
 #define GridBinOpClass(name, combination)			\
  template <class left, class right>				\
  struct name {							\
    template <class _left, class _right>			\
    static auto accelerator_inline				\
-    func(const left &lhs, const right &rhs)			\
+    func(const _left &lhs, const _right &rhs)			\
      -> decltype(combination) const				\
    {								\
      return combination;					\
@@ -264,10 +389,10 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
 // Trinary conditional op
 ////////////////////////////////////////////////////
 #define GridTrinOpClass(name, combination)				\
  template <class predicate, class left, class right>			\
  struct name {								\
    template <class _predicate,class _left, class _right>		\
    static auto accelerator_inline					\
-    func(const predicate &pred, const left &lhs, const right &rhs)	\
+    func(const _predicate &pred, const _left &lhs, const _right &rhs)	\
      -> decltype(combination) const					\
    {									\
      return combination;						\
@@ -275,17 +400,17 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
  };
 GridTrinOpClass(TrinaryWhere,
-		(predicatedWhere<predicate, 
+		(predicatedWhere<
-		 typename std::remove_reference<left>::type,
+		 typename std::remove_reference<_predicate>::type, 
-		 typename std::remove_reference<right>::type>(pred, lhs,rhs)));
+		 typename std::remove_reference<_left>::type,
 		 typename std::remove_reference<_right>::type>(pred, lhs,rhs)));
 ////////////////////////////////////////////
 // Operator syntactical glue
 ////////////////////////////////////////////
-
+#define GRID_UNOP(name)   name
-#define GRID_UNOP(name)   name<decltype(eval(0, arg))>
+#define GRID_BINOP(name)  name
-#define GRID_BINOP(name)  name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
+#define GRID_TRINOP(name) name
 #define GRID_TRINOP(name) name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
 #define GRID_DEF_UNOP(op, name)						\
  template <typename T1, typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
@@ -337,8 +462,6 @@ GRID_DEF_UNOP(trace, UnaryTrace);
 GRID_DEF_UNOP(transpose, UnaryTranspose);
 GRID_DEF_UNOP(Ta, UnaryTa);
 GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
 GRID_DEF_UNOP(real, UnaryReal);
 GRID_DEF_UNOP(imag, UnaryImag);
 GRID_DEF_UNOP(toReal, UnaryToReal);
 GRID_DEF_UNOP(toComplex, UnaryToComplex);
 GRID_DEF_UNOP(timesI, UnaryTimesI);
@@ -371,29 +494,36 @@ GRID_DEF_TRINOP(where, TrinaryWhere);
 /////////////////////////////////////////////////////////////
 // Construct a concrete Lattice from a unary lattice expression.
 // The site type of the returned Lattice is deduced from what expr.op.func
 // yields when applied to the argument evaluated at site 0.
 template <class Op, class T1>
 auto closure(const LatticeUnaryExpression<Op, T1> &expr)
-  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> 
+  -> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1)))> 
 {
-  Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> ret(expr);
+  Lattice<decltype(expr.op.func(vecEval(0, expr.arg1)))> ret(expr);
   return ret;
 }
 // Construct a concrete Lattice from a binary lattice expression.
 // The site type of the returned Lattice is deduced from what expr.op.func
 // yields when applied to both arguments evaluated at site 0.
 template <class Op, class T1, class T2>
 auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
-  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> 
+  -> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))> 
 {
-  Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> ret(expr);
+  Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))> ret(expr);
   return ret;
 }
 // Construct a concrete Lattice from a trinary lattice expression.
 // The site type of the returned Lattice is deduced from what expr.op.func
 // yields when applied to all three arguments evaluated at site 0.
 template <class Op, class T1, class T2, class T3>
 auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
+  -> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),
-				   eval(0, expr.arg2),
+				   vecEval(0, expr.arg2),
-				   eval(0, expr.arg3)))> 
+				   vecEval(0, expr.arg3)))> 
 {
-  Lattice<decltype(expr.op.func(eval(0, expr.arg1),
+  Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),
-				eval(0, expr.arg2),
+				vecEval(0, expr.arg2),
-				eval(0, expr.arg3)))>  ret(expr);
+			        vecEval(0, expr.arg3)))>  ret(expr);
   return ret;
 }
 // EXPRESSION_CLOSURE(function) generates an overload of `function` that is only
 // enabled for lattice expression types (is_lattice_expr). The overload turns the
 // expression into a Lattice via closure() and forwards that Lattice to `function`,
 // returning whatever `function` returns for it.
 #define EXPRESSION_CLOSURE(function)					\
   template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> \
     auto function(Expression &expr) -> decltype(function(closure(expr))) \
   {									\
     return function(closure(expr));					\
   }
 #undef GRID_UNOP
 #undef GRID_BINOP
@@ -37,9 +37,9 @@ NAMESPACE_BEGIN(Grid);
 template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
-  auto ret_v = ret.AcceleratorView(ViewWrite);
+  autoView( ret_v , ret, AcceleratorWrite);
-  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  autoView( lhs_v , lhs, AcceleratorRead);
-  auto rhs_v = rhs.AcceleratorView(ViewRead);
+  autoView( rhs_v , rhs, AcceleratorRead);
  conformable(ret,rhs);
  conformable(lhs,rhs);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
@@ -56,13 +56,13 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
-  auto ret_v = ret.AcceleratorView(ViewWrite);
+  autoView( ret_v , ret, AcceleratorWrite);
-  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  autoView( lhs_v , lhs, AcceleratorRead);
-  auto rhs_v = rhs.AcceleratorView(ViewRead);
+  autoView( rhs_v , rhs, AcceleratorRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto lhs_t=lhs_v(ss);
    auto rhs_t=rhs_v(ss);
    auto tmp  =ret_v(ss);
    mac(&tmp,&lhs_t,&rhs_t);
    coalescedWrite(ret_v[ss],tmp);
  });
@@ -73,9 +73,9 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
-  auto ret_v = ret.AcceleratorView(ViewWrite);
+  autoView( ret_v , ret, AcceleratorWrite);
-  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  autoView( lhs_v , lhs, AcceleratorRead);
-  auto rhs_v = rhs.AcceleratorView(ViewRead);
+  autoView( rhs_v , rhs, AcceleratorRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto lhs_t=lhs_v(ss);
@@ -89,9 +89,9 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
-  auto ret_v = ret.AcceleratorView(ViewWrite);
+  autoView( ret_v , ret, AcceleratorWrite);
-  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  autoView( lhs_v , lhs, AcceleratorRead);
-  auto rhs_v = rhs.AcceleratorView(ViewRead);
+  autoView( rhs_v , rhs, AcceleratorRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto lhs_t=lhs_v(ss);
@@ -108,8 +108,8 @@ template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(lhs,ret);
-  auto ret_v = ret.AcceleratorView(ViewWrite);
+  autoView( ret_v , ret, AcceleratorWrite);
-  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  autoView( lhs_v , lhs, AcceleratorRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    mult(&tmp,&lhs_v(ss),&rhs);
@@ -121,10 +121,10 @@ template<class obj1,class obj2,class obj3> inline
 void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,lhs);
-  auto ret_v = ret.AcceleratorView(ViewWrite);
+  autoView( ret_v , ret, AcceleratorWrite);
-  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  autoView( lhs_v , lhs, AcceleratorRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
-    decltype(coalescedRead(obj1())) tmp;
+    auto tmp  =ret_v(ss);
    auto lhs_t=lhs_v(ss);
    mac(&tmp,&lhs_t,&rhs);
    coalescedWrite(ret_v[ss],tmp);
@@ -135,8 +135,8 @@ template<class obj1,class obj2,class obj3> inline
 void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,lhs);
-  auto ret_v = ret.AcceleratorView(ViewWrite);
+  autoView( ret_v , ret, AcceleratorWrite);
-  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  autoView( lhs_v , lhs, AcceleratorRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto lhs_t=lhs_v(ss);
@@ -148,8 +148,8 @@ template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(lhs,ret);
-  auto ret_v = ret.AcceleratorView(ViewWrite);
+  autoView( ret_v , ret, AcceleratorWrite);
-  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  autoView( lhs_v , lhs, AcceleratorRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto lhs_t=lhs_v(ss);
@@ -165,8 +165,8 @@ template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
-  auto ret_v = ret.AcceleratorView(ViewWrite);
+  autoView( ret_v , ret, AcceleratorWrite);
-  auto rhs_v = lhs.AcceleratorView(ViewRead);
+  autoView( rhs_v , lhs, AcceleratorRead);
  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto rhs_t=rhs_v(ss);
@@ -179,10 +179,10 @@ template<class obj1,class obj2,class obj3> inline
 void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
-  auto ret_v = ret.AcceleratorView(ViewWrite);
+  autoView( ret_v , ret, AcceleratorWrite);
-  auto rhs_v = lhs.AcceleratorView(ViewRead);
+  autoView( rhs_v , lhs, AcceleratorRead);
  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
-    decltype(coalescedRead(obj1())) tmp;
+    auto tmp  =ret_v(ss);
    auto rhs_t=rhs_v(ss);
    mac(&tmp,&lhs,&rhs_t);
    coalescedWrite(ret_v[ss],tmp);
@@ -193,8 +193,8 @@ template<class obj1,class obj2,class obj3> inline
 void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
-  auto ret_v = ret.AcceleratorView(ViewWrite);
+  autoView( ret_v , ret, AcceleratorWrite);
-  auto rhs_v = lhs.AcceleratorView(ViewRead);
+  autoView( rhs_v , lhs, AcceleratorRead);
  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto rhs_t=rhs_v(ss);
@@ -206,8 +206,8 @@ template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
-  auto ret_v = ret.AcceleratorView(ViewWrite);
+  autoView( ret_v , ret, AcceleratorWrite);
-  auto rhs_v = lhs.AcceleratorView(ViewRead);
+  autoView( rhs_v , lhs, AcceleratorRead);
  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto rhs_t=rhs_v(ss);
@@ -221,9 +221,9 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
  ret.Checkerboard() = x.Checkerboard();
  conformable(ret,x);
  conformable(x,y);
-  auto ret_v = ret.AcceleratorView(ViewWrite);
+  autoView( ret_v , ret, AcceleratorWrite);
-  auto x_v = x.AcceleratorView(ViewRead);
+  autoView( x_v , x, AcceleratorRead);
-  auto y_v = y.AcceleratorView(ViewRead);
+  autoView( y_v , y, AcceleratorRead);
  accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
    auto tmp = a*x_v(ss)+y_v(ss);
    coalescedWrite(ret_v[ss],tmp);
@@ -234,9 +234,9 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
  ret.Checkerboard() = x.Checkerboard();
  conformable(ret,x);
  conformable(x,y);
-  auto ret_v = ret.AcceleratorView(ViewWrite);
+  autoView( ret_v , ret, AcceleratorWrite);
-  auto x_v = x.AcceleratorView(ViewRead);
+  autoView( x_v , x, AcceleratorRead);
-  auto y_v = y.AcceleratorView(ViewRead);
+  autoView( y_v , y, AcceleratorRead);
  accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
    auto tmp = a*x_v(ss)+b*y_v(ss);
    coalescedWrite(ret_v[ss],tmp);
@@ -29,6 +29,7 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
 #pragma once 
 #define STREAMING_STORES
@@ -37,180 +38,6 @@ NAMESPACE_BEGIN(Grid);
 extern int GridCshiftPermuteMap[4][16];
 ///////////////////////////////////////////////////////////////////
 // Base class which can be used by traits to pick up behaviour
 ///////////////////////////////////////////////////////////////////
 class LatticeBase {};
 /////////////////////////////////////////////////////////////////////////////////////////
 // Conformable checks; same instance of Grid required
 /////////////////////////////////////////////////////////////////////////////////////////
 // Asserts that lhs and rhs are the very same GridBase instance (pointer
 // equality, not structural equality). Does nothing else.
 void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
 {
   assert(lhs == rhs);
 }
 ////////////////////////////////////////////////////////////////////////////
 // Advise the LatticeAccelerator class
 ////////////////////////////////////////////////////////////////////////////
 // Hint flags accepted by LatticeAccelerator::Advise(). The values are distinct
 // bits (0x1, 0x2) and Advise() tests each one with '&', so hints can be OR-ed.
 enum LatticeAcceleratorAdvise {
   AdviseInfrequentUse = 0x1,    // Advise that the data is used infrequently.  This can
                                 // significantly influence performance of bulk storage.
   AdviseReadMostly = 0x2,       // Data will mostly be read.  On some architectures
                                 // enables read-only copies of memory to be kept on
                                 // host and device.
 };
 ////////////////////////////////////////////////////////////////////////////
 // View Access Mode
 ////////////////////////////////////////////////////////////////////////////
 // Access mode requested when opening a view on a lattice.
 // The values are bit flags: ViewReadWrite (0x3) equals ViewRead | ViewWrite.
 enum ViewMode {
   ViewRead = 0x1,
   ViewWrite = 0x2,
   ViewReadWrite = 0x3
 };
 ////////////////////////////////////////////////////////////////////////////
 // Minimal base class containing only data valid to access from accelerator
 // _odata will be a managed pointer in CUDA
 ////////////////////////////////////////////////////////////////////////////
 // Force access to lattice through a view object.
 // prevents writing of code that will not offload to GPU, but perhaps annoyingly
 // strict, since host code could in principle access the data directly through the lattice object
 // Need to decide programming model.
 #define LATTICE_VIEW_STRICT
 // Holds the fields of a lattice that are accessed from accelerator code: the
 // grid pointer, the checkerboard tag, the site storage pointer and the number
 // of stored sites. Derives from LatticeBase so base-class traits such as
 // is_lattice recognise it.
 template<class vobj> class LatticeAccelerator : public LatticeBase
 {
 protected:
   GridBase *_grid;
   int checkerboard;
   vobj     *_odata;    // A managed pointer
   uint64_t _odata_size;    
 public:
   // Default state: no grid, no storage, checkerboard 0.
   // NOTE(review): the initializer list order differs from the declaration order
   // (_grid is declared first but listed last). C++ initialises members in
   // declaration order, so this is harmless here but triggers -Wreorder.
   accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr) { }; 
   // Number of sites held in _odata.
   accelerator_inline uint64_t oSites(void) const { return _odata_size; };
   // Read-only and assignable accessors for the checkerboard tag.
   accelerator_inline int  Checkerboard(void) const { return checkerboard; };
   accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
   // If 'grid' is non-null, assert that it is this lattice's grid; if it is null,
   // set it to this lattice's grid (passed by reference so the caller sees it).
   accelerator_inline void Conformable(GridBase * &grid) const
   { 
     if (grid) conformable(grid, _grid);
     else      grid = _grid;
   };
   // Pass memory-usage hints (LatticeAcceleratorAdvise bits) to CUDA for the
   // whole _odata allocation. The body is empty unless GRID_NVCC is defined and
   // the code is being compiled for the host.
   accelerator_inline void Advise(int advise) {
 #ifdef GRID_NVCC
 #ifndef __CUDA_ARCH__ // only on host
     // Infrequent use: set the preferred location of the data to the CPU.
     if (advise & AdviseInfrequentUse) {
       cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId);
     }
     // Read mostly: mark the range as read-mostly.
     if (advise & AdviseReadMostly) {
       cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetReadMostly,-1);
     }
 #endif
 #endif
   };
   // Ask CUDA to prefetch the whole _odata allocation to the device reported by
   // cudaGetDevice. accessMode is accepted but not used yet. Empty body unless
   // GRID_NVCC is defined and compiling for the host.
   accelerator_inline void AcceleratorPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
 #ifdef GRID_NVCC
 #ifndef __CUDA_ARCH__ // only on host
     int target;
     cudaGetDevice(&target);
     cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),target);
 #endif
 #endif
   };
   // Ask CUDA to prefetch the whole _odata allocation to the CPU
   // (cudaCpuDeviceId). accessMode is accepted but not used yet. Empty body
   // unless GRID_NVCC is defined and compiling for the host.
   accelerator_inline void HostPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
 #ifdef GRID_NVCC
 #ifndef __CUDA_ARCH__ // only on host
     cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId);
 #endif
 #endif
   };
 };
 /////////////////////////////////////////////////////////////////////////////////////////
 // A View class which provides accessor to the data.
 // This will be safe to call from accelerator_for and is trivially copy constructible
 // The copy constructor for this will need to be used by device lambda functions
 /////////////////////////////////////////////////////////////////////////////////////////
 // Accessor over the data of a LatticeAccelerator. It adds no members of its own;
 // it is copy-constructed from the LatticeAccelerator it refers to and exposes
 // indexed access to the sites.
 template<class vobj> 
 class LatticeView : public LatticeAccelerator<vobj>
 {
 public:
   // Rvalue
   // operator(): when compiling device code (__CUDA_ARCH__ defined) the site is
   // returned by value as a scalar_object obtained through coalescedRead;
   // otherwise a const reference to the stored vobj is returned.
 #ifdef __CUDA_ARCH__
   accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
 #else 
   accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
 #endif
   // operator[]: direct reference to the stored vobj at site i (no bounds check).
   accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
   accelerator_inline vobj       & operator[](size_t i)       { return this->_odata[i]; };
   // Index range of the view: [0, _odata_size).
   accelerator_inline uint64_t begin(void) const { return 0;};
   accelerator_inline uint64_t end(void)   const { return this->_odata_size; };
   accelerator_inline uint64_t size(void)  const { return this->_odata_size; };
   // Copies the base-class fields (pointers, size, checkerboard) of refer_to_me.
   LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me)
   {
   }
 };
 /////////////////////////////////////////////////////////////////////////////////////////
 // Lattice expression types used by ET to assemble the AST
 // 
 // Need to be able to select code paths according to whether an object is a lattice or not,
 // so introduce some type traits
 /////////////////////////////////////////////////////////////////////////////////////////
 // Empty tag base class; the expression node classes derive from it.
 class LatticeExpressionBase {};
 // is_lattice<T>: true when T derives from LatticeBase.
 template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
 // is_lattice_expr<T>: true when T derives from LatticeExpressionBase.
 template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
 // ViewMap<T>::Type is LatticeView<T::vector_object> when T derives from
 // LatticeBase, and T itself otherwise.
 template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
 template<class T>                 struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
 template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
 // Expression node holding an operator and one operand.
 // The operand is stored as ViewMap<_T1>::Type, i.e. lattice operands are held
 // as LatticeView objects while any other operand type is stored as-is.
 template <typename Op, typename _T1>                           
 class LatticeUnaryExpression : public  LatticeExpressionBase 
 {
 public:
   typedef typename ViewMap<_T1>::Type T1;
   Op op;
   T1 arg1;
   // Stores a copy of the operator and converts/copies the operand into T1.
   LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
 };
 // Expression node holding an operator and two operands.
 // Each operand is stored as ViewMap<_Tn>::Type, i.e. lattice operands are held
 // as LatticeView objects while any other operand type is stored as-is.
 template <typename Op, typename _T1, typename _T2>              
 class LatticeBinaryExpression : public LatticeExpressionBase 
 {
 public:
   typedef typename ViewMap<_T1>::Type T1;
   typedef typename ViewMap<_T2>::Type T2;
   Op op;
   T1 arg1;
   T2 arg2;
   // Stores a copy of the operator and converts/copies the operands into T1, T2.
   LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
 };
 // Expression node holding an operator and three operands.
 // Each operand is stored as ViewMap<_Tn>::Type, i.e. lattice operands are held
 // as LatticeView objects while any other operand type is stored as-is.
 template <typename Op, typename _T1, typename _T2, typename _T3> 
 class LatticeTrinaryExpression : public LatticeExpressionBase 
 {
 public:
   typedef typename ViewMap<_T1>::Type T1;
   typedef typename ViewMap<_T2>::Type T2;
   typedef typename ViewMap<_T3>::Type T3;
   Op op;
   T1 arg1;
   T2 arg2;
   T3 arg3;
   // Stores a copy of the operator and converts/copies the operands into T1..T3.
   LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
 };
 /////////////////////////////////////////////////////////////////////////////////////////
 // The real lattice class, with normal copy and assignment semantics.
 // This contains extra (host resident) grid pointer data that may be accessed by host code
@@ -253,28 +80,23 @@ private:
    }
  }
 public:
  /////////////////////////////////////////////////////////////////////////////////
  // Can be used to mark the accelerator copy dirty without a copy from the host; useful for temporaries whose previous contents are "don't care"
  /////////////////////////////////////////////////////////////////////////////////
  // Opens a view on this lattice in the requested mode and closes it again
  // straight away; no site data is read or written by this function itself.
  // NOTE(review): the effect relies on what the LatticeView(…, mode) constructor
  // and ViewClose() do, which is defined elsewhere -- confirm there.
  void SetViewMode(ViewMode mode) {
    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
    accessor.ViewClose();
  }
  /////////////////////////////////////////////////////////////////////////////////
  // Return a view object that may be dereferenced in site loops.
  // The view is trivially copy constructible and may be copied to an accelerator device
  // in device lambdas
  /////////////////////////////////////////////////////////////////////////////////
  // Returns a LatticeView constructed from this object's LatticeAccelerator base.
  // The C-style cast removes the const qualifier of 'this' (the method is const).
  // No prefetch is requested by this overload.
  LatticeView<vobj> View (void) const // deprecated, should pick AcceleratorView for accelerator_for
  {                                   //                     and HostView        for thread_for
    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
    return accessor;
  }
-  LatticeView<vobj> AcceleratorView(int mode = ViewReadWrite) const 
+  LatticeView<vobj> View (ViewMode mode) const 
  {
    // Build a view from this object's LatticeAccelerator base (the cast drops
    // the const qualifier of 'this'), call AcceleratorPrefetch on it with the
    // requested mode, and return it by value.
-    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
+    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
    accessor.AcceleratorPrefetch(mode);
    return accessor;
  }
  // Returns a LatticeView constructed from this object's LatticeAccelerator base
  // after calling HostPrefetch(mode) on it. The C-style cast removes the const
  // qualifier of 'this' (the method is const).
  LatticeView<vobj> HostView(int mode = ViewReadWrite) const 
  {
    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
    accessor.HostPrefetch(mode);
    return accessor;
  }
@@ -298,11 +120,15 @@ public:
    assert( (cb==Odd) || (cb==Even));
    this->checkerboard=cb;
-    auto me  = AcceleratorView(ViewWrite);
+    auto exprCopy = expr;
-    accelerator_for(ss,me.size(),1,{
+    ExpressionViewOpen(exprCopy);
-      auto tmp = eval(ss,expr);
+    auto me  = View(AcceleratorWriteDiscard);
-      vstream(me[ss],tmp);
+    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      auto tmp = eval(ss,exprCopy);
      coalescedWrite(me[ss],tmp);
    });
    me.ViewClose();
    ExpressionViewClose(exprCopy);
    return *this;
  }
  template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
@@ -317,11 +143,15 @@ public:
    assert( (cb==Odd) || (cb==Even));
    this->checkerboard=cb;
-    auto me  = AcceleratorView(ViewWrite);
+    auto exprCopy = expr;
-    accelerator_for(ss,me.size(),1,{
+    ExpressionViewOpen(exprCopy);
-      auto tmp = eval(ss,expr);
+    auto me  = View(AcceleratorWriteDiscard);
-      vstream(me[ss],tmp);
+    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      auto tmp = eval(ss,exprCopy);
      coalescedWrite(me[ss],tmp);
    });
    me.ViewClose();
    ExpressionViewClose(exprCopy);
    return *this;
  }
  template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
@@ -335,11 +165,15 @@ public:
    CBFromExpression(cb,expr);
    assert( (cb==Odd) || (cb==Even));
    this->checkerboard=cb;
-    auto me  = AcceleratorView(ViewWrite);
+    auto exprCopy = expr;
-    accelerator_for(ss,me.size(),1,{
+    ExpressionViewOpen(exprCopy);
-      auto tmp = eval(ss,expr);
+    auto me  = View(AcceleratorWriteDiscard);
-      vstream(me[ss],tmp);
+    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      auto tmp = eval(ss,exprCopy);
      coalescedWrite(me[ss],tmp);
    });
    me.ViewClose();
    ExpressionViewClose(exprCopy);
    return *this;
  }
  //GridFromExpression is tricky to do
@@ -390,10 +224,11 @@ public:
  }
  template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
-    auto me  = View();
+    auto me  = View(CpuWrite);
    thread_for(ss,me.size(),{
 	me[ss]= r;
    });
    me.ViewClose();
    return *this;
  }
@@ -403,11 +238,12 @@ public:
  ///////////////////////////////////////////
  // user defined constructor
  ///////////////////////////////////////////
-  Lattice(GridBase *grid) { 
+  Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) { 
    this->_grid = grid;
    resize(this->_grid->oSites());
    assert((((uint64_t)&this->_odata[0])&0xF) ==0);
    this->checkerboard=0;
    SetViewMode(mode);
  }
  //  virtual ~Lattice(void) = default;
@@ -445,11 +281,12 @@ public:
    typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
    conformable(*this,r);
    this->checkerboard = r.Checkerboard();
-    auto me =   AcceleratorView(ViewWrite);
+    auto me =   View(AcceleratorWriteDiscard);
-    auto him= r.AcceleratorView(ViewRead);
+    auto him= r.View(AcceleratorRead);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      coalescedWrite(me[ss],him(ss));
    });
    me.ViewClose();    him.ViewClose();
    return *this;
  }
@@ -459,11 +296,12 @@ public:
  inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
    this->checkerboard = r.Checkerboard();
    conformable(*this,r);
-    auto me =   AcceleratorView(ViewWrite);
+    auto me =   View(AcceleratorWriteDiscard);
-    auto him= r.AcceleratorView(ViewRead);
+    auto him= r.View(AcceleratorRead);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      coalescedWrite(me[ss],him(ss));
    });
    me.ViewClose();    him.ViewClose();
    return *this;
  }
  ///////////////////////////////////////////
@@ -51,20 +51,23 @@ template<class VField, class Matrix>
 void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) 
 {
  typedef decltype(basis[0]) Field;
-  typedef decltype(basis[0].View()) View;
+  typedef decltype(basis[0].View(AcceleratorRead)) View;
-  auto tmp_v = basis[0].AcceleratorView(ViewReadWrite);
+
-  Vector<View> basis_v(basis.size(),tmp_v);
+  Vector<View> basis_v; basis_v.reserve(basis.size());
-  typedef typename std::remove_reference<decltype(tmp_v[0])>::type vobj;
+  typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
  typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
  GridBase* grid = basis[0].Grid();
  for(int k=0;k<basis.size();k++){
-    basis_v[k] = basis[k].AcceleratorView(ViewReadWrite);
+    basis_v.push_back(basis[k].View(AcceleratorWrite));
  }
-#ifndef GRID_NVCC
+#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) )
  int max_threads = thread_max();
  Vector < vobj > Bt(Nm * max_threads);
  thread_region
    {
-    std::vector < vobj > B(Nm); // Thread private
+      vobj* B = &Bt[Nm * thread_num()];
      thread_for_in_region(ss, grid->oSites(),{
 	  for(int j=j0; j<j1; ++j) B[j]=0.;
@@ -79,6 +82,8 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
 	});
    }
 #else
  View *basis_vp = &basis_v[0];
  int nrot = j1-j0;
  if (!nrot) // edge case not handled gracefully by Cuda
    return;
@@ -90,8 +95,8 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
  auto Bp=&Bt[0];
  // GPU readable copy of matrix
-  Vector<double> Qt_jv(Nm*Nm);
+  Vector<Coeff_t> Qt_jv(Nm*Nm);
-  double *Qt_p = & Qt_jv[0];
+  Coeff_t *Qt_p = & Qt_jv[0];
  thread_for(i,Nm*Nm,{
      int j = i/Nm;
      int k = i%Nm;
@@ -133,26 +138,30 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
      });
  }
 #endif
  for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
 }
 // Extract a single rotated vector
 template<class Field>
 void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) 
 {
-  typedef decltype(basis[0].AcceleratorView()) View;
+  typedef decltype(basis[0].View(AcceleratorRead)) View;
  typedef typename Field::vector_object vobj;
  GridBase* grid = basis[0].Grid();
  result.Checkerboard() = basis[0].Checkerboard();
-  auto result_v=result.AcceleratorView(ViewWrite);
+
-  Vector<View> basis_v(basis.size(),result_v);
+  Vector<View> basis_v; basis_v.reserve(basis.size());
  for(int k=0;k<basis.size();k++){
-    basis_v[k] = basis[k].AcceleratorView(ViewRead);
+    basis_v.push_back(basis[k].View(AcceleratorRead));
  }
  vobj zz=Zero();
  Vector<double> Qt_jv(Nm);
  double * Qt_j = & Qt_jv[0];
  for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
  autoView(result_v,result,AcceleratorWrite);
  accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
    auto B=coalescedRead(zz);
    for(int k=k0; k<k1; ++k){
@@ -160,6 +169,7 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
    }
    coalescedWrite(result_v[ss], B);
  });
  for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
 }
 template<class Field>
@@ -42,34 +42,6 @@ NAMESPACE_BEGIN(Grid);
 typedef iScalar<vInteger> vPredicate ;
 /*
 template <class iobj, class vobj, class robj> accelerator_inline 
 vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, const robj &iffalse) 
 {
  typename std::remove_const<vobj>::type ret;
  typedef typename vobj::scalar_object scalar_object;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  const int Nsimd = vobj::vector_type::Nsimd();
  ExtractBuffer<Integer> mask(Nsimd);
  ExtractBuffer<scalar_object> truevals(Nsimd);
  ExtractBuffer<scalar_object> falsevals(Nsimd);
  extract(iftrue, truevals);
  extract(iffalse, falsevals);
  extract<vInteger, Integer>(TensorRemove(predicate), mask);
  for (int s = 0; s < Nsimd; s++) {
    if (mask[s]) falsevals[s] = truevals[s];
  }
  merge(ret, falsevals);
  return ret;
 }
 */
 //////////////////////////////////////////////////////////////////////////
 // compare lattice to lattice
 //////////////////////////////////////////////////////////////////////////
@@ -78,9 +50,9 @@ template<class vfunctor,class lobj,class robj>
 inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
 {
  Lattice<vPredicate> ret(rhs.Grid());
-  auto lhs_v = lhs.View();
+  autoView( lhs_v, lhs, CpuRead);
-  auto rhs_v = rhs.View();
+  autoView( rhs_v, rhs, CpuRead);
-  auto ret_v = ret.View();
+  autoView( ret_v, ret, CpuWrite);
  thread_for( ss, rhs_v.size(), {
      ret_v[ss]=op(lhs_v[ss],rhs_v[ss]);
  });
@@ -93,8 +65,8 @@ template<class vfunctor,class lobj,class robj>
 inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
 {
  Lattice<vPredicate> ret(lhs.Grid());
-  auto lhs_v = lhs.View();
+  autoView( lhs_v, lhs, CpuRead);
-  auto ret_v = ret.View();
+  autoView( ret_v, ret, CpuWrite);
  thread_for( ss, lhs_v.size(), {
    ret_v[ss]=op(lhs_v[ss],rhs);
  });
@@ -107,8 +79,8 @@ template<class vfunctor,class lobj,class robj>
 inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
 {
  Lattice<vPredicate> ret(rhs.Grid());
-  auto rhs_v = rhs.View();
+  autoView( rhs_v, rhs, CpuRead);
-  auto ret_v = ret.View();
+  autoView( ret_v, ret, CpuWrite);
  thread_for( ss, rhs_v.size(), {
    ret_v[ss]=op(lhs,rhs_v[ss]);
  });
@@ -37,7 +37,7 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
  GridBase *grid = l.Grid();
  int Nsimd = grid->iSites();
-  auto l_v = l.View();
+  autoView(l_v, l, CpuWrite);
  thread_for( o, grid->oSites(), {
    vector_type vI;
    Coordinate gcoor;
@@ -51,23 +51,5 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
  });
 };
 // LatticeCoordinate();
 // FIXME for debug; deprecate this; made obsolete by LatticeCoordinate() (presumably -- the original note was left unfinished)
 // Debug helper: overwrites the storage of lattice l, addressed as a flat array
 // of Real, with the value i + vv*500, where i is the outer-site index and vv the
 // even lane index; each pair of adjacent lanes (vv, vv+1) receives the same value.
 // NOTE(review): the pointer is typed Real* while the element counts are derived
 // from vRealF -- this presumably assumes Real matches vRealF's scalar type; confirm.
 template<class vobj> void lex_sites(Lattice<vobj> &l){
   auto l_v = l.View();
   Real *v_ptr = (Real *)&l_v[0];
   size_t o_len = l.Grid()->oSites();          // number of outer sites
   size_t v_len = sizeof(vobj)/sizeof(vRealF); // vRealF-sized words per site object
   size_t vec_len = vRealF::Nsimd();           // lanes per vRealF word
   for(int i=0;i<o_len;i++){
     for(int j=0;j<v_len;j++){
       // Lanes are filled two at a time; both lanes of a pair get the same value.
       for(int vv=0;vv<vec_len;vv+=2){
 	v_ptr[i*v_len*vec_len+j*vec_len+vv  ]= i+vv*500;
 	v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
       }
     }}
 }
 NAMESPACE_END(Grid);
@@ -43,8 +43,8 @@ template<class vobj>
 inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
 {
  Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
-  auto rhs_v = rhs.View();
+  autoView( rhs_v , rhs, AcceleratorRead);
-  auto ret_v = ret.View();
+  autoView( ret_v , ret, AcceleratorWrite);
  accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
    coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss)));
  });
@@ -56,9 +56,9 @@ template<class vobj>
 inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
 {
  Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
-  auto lhs_v = lhs.View();
+  autoView( lhs_v , lhs, AcceleratorRead);
-  auto rhs_v = rhs.View();
+  autoView( rhs_v , rhs, AcceleratorRead);
-  auto ret_v = ret.View();
+  autoView( ret_v , ret, AcceleratorWrite);
  accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
    coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss)));
  });
@@ -73,9 +73,9 @@ inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Latt
  typedef decltype(coalescedRead(ll())) sll;
  typedef decltype(coalescedRead(rr())) srr;
  Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid());
-  auto lhs_v = lhs.View();
+  autoView( lhs_v , lhs, AcceleratorRead);
-  auto rhs_v = rhs.View();
+  autoView( rhs_v , rhs, AcceleratorRead);
-  auto ret_v = ret.View();
+  autoView( ret_v , ret, AcceleratorWrite);
  accelerator_for(ss,rhs_v.size(),1,{
    // FIXME had issues with scalar version of outer 
    // Use vector [] operator and don't read coalesce this loop
@@ -51,9 +51,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
  int block =FullGrid->_slice_block [Orthog];
  int nblock=FullGrid->_slice_nblock[Orthog];
  int ostride=FullGrid->_ostride[Orthog];
-  auto X_v = X.View();
+  autoView( X_v , X, CpuRead);
-  auto Y_v = Y.View();
+  autoView( Y_v , Y, CpuRead);
-  auto R_v = R.View();
+  autoView( R_v , R, CpuWrite);
  thread_region
  {
    std::vector<vobj> s_x(Nblock);
@@ -97,8 +97,8 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
  int nblock=FullGrid->_slice_nblock[Orthog];
  int ostride=FullGrid->_ostride[Orthog];
-  auto X_v = X.View();
+  autoView( X_v , X, CpuRead);
-  auto R_v = R.View();
+  autoView( R_v , R, CpuWrite);
  thread_region
  {
@@ -156,8 +156,8 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
  int ostride=FullGrid->_ostride[Orthog];
  typedef typename vobj::vector_typeD vector_typeD;
-  auto lhs_v = lhs.View();
+  autoView( lhs_v , lhs, CpuRead);
-  auto rhs_v = rhs.View();
+  autoView( rhs_v , rhs, CpuRead);
  thread_region {
    std::vector<vobj> Left(Nblock);
    std::vector<vobj> Right(Nblock);
@@ -46,9 +46,9 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Ind
 {
  Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid());
  ret.Checkerboard()=lhs.Checkerboard();
-  auto ret_v = ret.View();
+  autoView( ret_v, ret, AcceleratorWrite);
-  auto lhs_v = lhs.View();
+  autoView( lhs_v, lhs, AcceleratorRead);
-  thread_for( ss, lhs_v.size(), {
+  accelerator_for( ss, lhs_v.size(), 1, {
    ret_v[ss] = peekIndex<Index>(lhs_v[ss],i);
  });
  return ret;
@@ -58,9 +58,9 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
 {
  Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid());
  ret.Checkerboard()=lhs.Checkerboard();
-  auto ret_v = ret.View();
+  autoView( ret_v, ret, AcceleratorWrite);
-  auto lhs_v = lhs.View();
+  autoView( lhs_v, lhs, AcceleratorRead);
-  thread_for( ss, lhs_v.size(), {
+  accelerator_for( ss, lhs_v.size(), 1, {
    ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j);
  });
  return ret;
@@ -72,18 +72,18 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
 template<int Index,class vobj>  
 void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i)
 {
-  auto rhs_v = rhs.View();
+  autoView( rhs_v, rhs, AcceleratorRead);
-  auto lhs_v = lhs.View();
+  autoView( lhs_v, lhs, AcceleratorWrite);
-  thread_for( ss, lhs_v.size(), {
+  accelerator_for( ss, lhs_v.size(), 1, {
    pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i);
  });
 }
 template<int Index,class vobj> 
 void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j)
 {
-  auto rhs_v = rhs.View();
+  autoView( rhs_v, rhs, AcceleratorRead);
-  auto lhs_v = lhs.View();
+  autoView( lhs_v, lhs, AcceleratorWrite);
-  thread_for( ss, lhs_v.size(), {
+  accelerator_for( ss, lhs_v.size(), 1, {
    pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j);
  });
 }
@@ -111,7 +111,7 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
  // extract-modify-merge cycle is easiest way and this is not perf critical
  ExtractBuffer<sobj> buf(Nsimd);
-  auto l_v = l.View();
+  autoView( l_v , l, CpuWrite);
  if ( rank == grid->ThisRank() ) {
    extract(l_v[odx],buf);
    buf[idx] = s;
@@ -141,7 +141,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
  grid->GlobalCoorToRankIndex(rank,odx,idx,site);
  ExtractBuffer<sobj> buf(Nsimd);
-  auto l_v = l.View();
+  autoView( l_v , l, CpuWrite);
  extract(l_v[odx],buf);
  s = buf[idx];
@@ -151,21 +151,21 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
  return;
 };
 //////////////////////////////////////////////////////////
 // Peek a scalar object from the SIMD array
 //////////////////////////////////////////////////////////
 // Must be CPU read view
 template<class vobj,class sobj>
-inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
+inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
-        
+{
-  GridBase *grid = l.Grid();
+  GridBase *grid = l.getGrid();
-
+  assert(l.mode==CpuRead);
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nsimd = grid->Nsimd();
-  assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
+  assert( l.Checkerboard()== grid->CheckerBoard(site));
  assert( sizeof(sobj)*Nsimd == sizeof(vobj));
  static const int words=sizeof(vobj)/sizeof(vector_type);
@@ -173,8 +173,7 @@ inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
  idx= grid->iIndex(site);
  odx= grid->oIndex(site);
-  auto l_v = l.View();
+  scalar_type * vp = (scalar_type *)&l[odx];
  scalar_type * vp = (scalar_type *)&l_v[odx];
  scalar_type * pt = (scalar_type *)&s;
  for(int w=0;w<words;w++){
@@ -183,18 +182,27 @@ inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
  return;
 };
 template<class vobj,class sobj>
-inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
+inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site)
 {
  autoView(lv,l,CpuRead);
  peekLocalSite(s,lv,site);
  return;
 };
-  GridBase *grid=l.Grid();
+// Must be CPU write view
 template<class vobj,class sobj>
 inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
 {
  GridBase *grid=l.getGrid();
  assert(l.mode==CpuWrite);
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nsimd = grid->Nsimd();
-  assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
+  assert( l.Checkerboard()== grid->CheckerBoard(site));
  assert( sizeof(sobj)*Nsimd == sizeof(vobj));
  static const int words=sizeof(vobj)/sizeof(vector_type);
@@ -202,13 +210,19 @@ inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
  idx= grid->iIndex(site);
  odx= grid->oIndex(site);
-  auto l_v = l.View();
+  scalar_type * vp = (scalar_type *)&l[odx];
  scalar_type * vp = (scalar_type *)&l_v[odx];
  scalar_type * pt = (scalar_type *)&s;
  for(int w=0;w<words;w++){
    vp[idx+w*Nsimd] = pt[w];
  }
  return;
 };
 template<class vobj,class sobj>
 inline void pokeLocalSite(const sobj &s, Lattice<vobj> &l,Coordinate &site)
 {
  autoView(lv,l,CpuWrite);
  pokeLocalSite(s,lv,site);
  return;
 };
@@ -0,0 +1,79 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_reality.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_LATTICE_REAL_IMAG_H
 #define GRID_LATTICE_REAL_IMAG_H
 // FIXME .. this is the sector of the code 
 // I am most worried about the directions
 // The choice of burying complex in the SIMD
 // is making the use of "real" and "imag" very cumbersome
 NAMESPACE_BEGIN(Grid);
 // Site-wise real part of a lattice.
 // Builds a new lattice on lhs.Grid(), copies the checkerboard tag from lhs,
 // and assigns to every outer site the site-level real() of the matching
 // lhs site.
 template<class vobj>
 inline Lattice<vobj> real(const Lattice<vobj> &lhs)
 {
  Lattice<vobj> ret(lhs.Grid());
  autoView( in_v , lhs, AcceleratorRead);
  autoView( out_v, ret, AcceleratorWrite);
  ret.Checkerboard() = lhs.Checkerboard();
  accelerator_for( site, in_v.size(), 1, {
    out_v[site] = real(in_v[site]);
  });
  return ret;
 }
 // Site-wise imaginary part of a lattice.
 // Builds a new lattice on lhs.Grid(), copies the checkerboard tag from lhs,
 // and assigns to every outer site the site-level imag() of the matching
 // lhs site.
 template<class vobj>
 inline Lattice<vobj> imag(const Lattice<vobj> &lhs)
 {
  Lattice<vobj> ret(lhs.Grid());
  autoView( src_v, lhs, AcceleratorRead);
  autoView( dst_v, ret, AcceleratorWrite);
  ret.Checkerboard() = lhs.Checkerboard();
  accelerator_for( osite, src_v.size(), 1, {
    dst_v[osite] = imag(src_v[osite]);
  });
  return ret;
 }
 // real() for lattice expression templates. Participates in overload
 // resolution only when is_lattice_expr<Expression>::value is true; passes the
 // expression through closure() and applies real() to the result.
 template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> 
  auto real(const Expression &expr) -> decltype(real(closure(expr)))		
 {									
  return real(closure(expr));					
 }
 // imag() for lattice expression templates. Participates in overload
 // resolution only when is_lattice_expr<Expression>::value is true; passes the
 // expression through closure() and applies imag() to the result.
 template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> 
  auto imag(const Expression &expr) -> decltype(imag(closure(expr)))		
 {									
  return imag(closure(expr));					
 }
 NAMESPACE_END(Grid);
 #endif
@@ -40,9 +40,11 @@ NAMESPACE_BEGIN(Grid);
 template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
  Lattice<vobj> ret(lhs.Grid());
  autoView( lhs_v, lhs, AcceleratorRead);
  autoView( ret_v, ret, AcceleratorWrite);
  ret.Checkerboard()=lhs.Checkerboard();
  auto lhs_v = lhs.View();
  auto ret_v = ret.View();
  accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
    coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
  });
@@ -51,9 +53,11 @@ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
 template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
  Lattice<vobj> ret(lhs.Grid());
  autoView( lhs_v, lhs, AcceleratorRead);
  autoView( ret_v, ret, AcceleratorWrite);
  ret.Checkerboard() = lhs.Checkerboard();
  auto lhs_v = lhs.View();
  auto ret_v = ret.View();
  accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
    coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss)));
  });
@@ -25,7 +25,7 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #include <Grid/Grid_Eigen_Dense.h>
-#ifdef GRID_NVCC
+#if defined(GRID_CUDA)||defined(GRID_HIP)
 #include <Grid/lattice/Lattice_reduction_gpu.h>
 #endif
@@ -39,7 +39,36 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
 {
  typedef typename vobj::scalar_object  sobj;
-  const int Nsimd = vobj::Nsimd();
+  //  const int Nsimd = vobj::Nsimd();
  const int nthread = GridThread::GetThreads();
  Vector<sobj> sumarray(nthread);
  for(int i=0;i<nthread;i++){
    sumarray[i]=Zero();
  }
  thread_for(thr,nthread, {
    int nwork, mywork, myoff;
    nwork = osites;
    GridThread::GetWork(nwork,thr,mywork,myoff);
    vobj vvsum=Zero();
    for(int ss=myoff;ss<mywork+myoff; ss++){
      vvsum = vvsum + arg[ss];
    }
    sumarray[thr]=Reduce(vvsum);
  });
  sobj ssum=Zero();  // sum across threads
  for(int i=0;i<nthread;i++){
    ssum = ssum+sumarray[i];
  } 
  return ssum;
 }
 template<class vobj>
 inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
 {
  typedef typename vobj::scalar_objectD  sobj;
  const int nthread = GridThread::GetThreads();
  Vector<sobj> sumarray(nthread);
@@ -63,23 +92,43 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
    ssum = ssum+sumarray[i];
  } 
-  return ssum;
+  typedef typename vobj::scalar_object ssobj;
  ssobj ret = ssum;
  return ret;
 }
 template<class vobj>
 inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 {
-#ifdef GRID_NVCC
+#if defined(GRID_CUDA)||defined(GRID_HIP)
  return sum_gpu(arg,osites);
 #else
  return sum_cpu(arg,osites);
 #endif  
 }
 // Sum of `osites` objects starting at `arg`, returned as a
 // vobj::scalar_objectD (presumably the double-precision scalar object --
 // confirm against the tensor type definitions).
 // The implementation is chosen at compile time: sumD_gpu when GRID_CUDA or
 // GRID_HIP is defined, sumD_cpu otherwise.
 template<class vobj>
 inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
 {
 #if defined(GRID_CUDA)||defined(GRID_HIP)
  return sumD_gpu(arg,osites);
 #else
  return sumD_cpu(arg,osites);
 #endif  
 }
 template<class vobj>
 inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 {
-  auto arg_v = arg.View();
+#if defined(GRID_CUDA)||defined(GRID_HIP)
  autoView( arg_v, arg, AcceleratorRead);
  Integer osites = arg.Grid()->oSites();
-  auto ssum= sum(&arg_v[0],osites);
+  auto ssum= sum_gpu(&arg_v[0],osites);
 #else
  autoView(arg_v, arg, CpuRead);
  Integer osites = arg.Grid()->oSites();
  auto ssum= sum_cpu(&arg_v[0],osites);
 #endif  
  arg.Grid()->GlobalSum(ssum);
  return ssum;
 }
@@ -102,42 +151,29 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
  GridBase *grid = left.Grid();
  // Might make all code paths go this way.
  auto left_v = left.AcceleratorView(ViewRead);
  auto right_v=right.AcceleratorView(ViewRead);
  const uint64_t nsimd = grid->Nsimd();
  const uint64_t sites = grid->oSites();
-#ifdef GRID_NVCC
+  // Might make all code paths go this way.
  typedef decltype(innerProductD(vobj(),vobj())) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  {
    autoView( left_v , left, AcceleratorRead);
    autoView( right_v,right, AcceleratorRead);
    // GPU - SIMT lane compliance...
-  typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
+    accelerator_for( ss, sites, 1,{
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  accelerator_for( ss, sites, nsimd,{
      auto x_l = left_v(ss);
      auto y_l = right_v(ss);
      coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
  })
  // This is in single precision and fails some tests
  // Need a sumD that sums in double
  nrm = TensorRemove(sumD_gpu(inner_tmp_v,sites));  
 #else
  // CPU 
  typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  accelerator_for( ss, sites, nsimd,{
 	auto x_l = left_v[ss];
 	auto y_l = right_v[ss];
 	inner_tmp_v[ss]=innerProductD(x_l,y_l);
-  })
+    });
-  nrm = TensorRemove(sum(inner_tmp_v,sites));
+  }
-#endif
+
  // This is in single precision and fails some tests
  auto anrm = sum(inner_tmp_v,sites);  
  nrm = anrm;
  return nrm;
 }
@@ -175,40 +211,24 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
  GridBase *grid = x.Grid();
  auto x_v=x.AcceleratorView(ViewRead);
  auto y_v=y.AcceleratorView(ViewRead);
  auto z_v=z.AcceleratorView(ViewWrite);
  const uint64_t nsimd = grid->Nsimd();
  const uint64_t sites = grid->oSites();
 #ifdef GRID_NVCC
  // GPU
-  typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
+  autoView( x_v, x, AcceleratorRead);
-  Vector<inner_t> inner_tmp(sites);
+  autoView( y_v, y, AcceleratorRead);
-  auto inner_tmp_v = &inner_tmp[0];
+  autoView( z_v, z, AcceleratorWrite);
  accelerator_for( ss, sites, nsimd,{
      auto tmp = a*x_v(ss)+b*y_v(ss);
      coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
      coalescedWrite(z_v[ss],tmp);
  });
  nrm = real(TensorRemove(sumD_gpu(inner_tmp_v,sites)));
 #else
  // CPU 
  typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
-  accelerator_for( ss, sites, nsimd,{
+  accelerator_for( ss, sites, 1,{
-      auto tmp = a*x_v(ss)+b*y_v(ss);
+      auto tmp = a*x_v[ss]+b*y_v[ss];
      inner_tmp_v[ss]=innerProductD(tmp,tmp);
      z_v[ss]=tmp;
  });
  // Already promoted to double
  nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
 #endif
  grid->GlobalSum(nrm);
  return nrm; 
 }
@@ -224,47 +244,29 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
  GridBase *grid = left.Grid();
  auto left_v=left.AcceleratorView(ViewRead);
  auto right_v=right.AcceleratorView(ViewRead);
  const uint64_t nsimd = grid->Nsimd();
  const uint64_t sites = grid->oSites();
 #ifdef GRID_NVCC
  // GPU
-  typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
+  typedef decltype(innerProductD(vobj(),vobj())) inner_t;
-  typedef decltype(innerProduct(left_v[0],left_v[0])) norm_t;
+  typedef decltype(innerProductD(vobj(),vobj())) norm_t;
  Vector<inner_t> inner_tmp(sites);
  Vector<norm_t>  norm_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  auto norm_tmp_v = &norm_tmp[0];
-
+  {
-  accelerator_for( ss, sites, nsimd,{
+    autoView(left_v,left, AcceleratorRead);
-      auto left_tmp = left_v(ss);
+    autoView(right_v,right,AcceleratorRead);
-      coalescedWrite(inner_tmp_v[ss],innerProduct(left_tmp,right_v(ss)));
+    accelerator_for( ss, sites, 1,{
-      coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp));
+	auto left_tmp = left_v[ss];
-  });
+	inner_tmp_v[ss]=innerProductD(left_tmp,right_v[ss]);
  tmp[0] = TensorRemove(sumD_gpu(inner_tmp_v,sites));
  tmp[1] = TensorRemove(sumD_gpu(norm_tmp_v,sites));
 #else
  // CPU
  typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
  typedef decltype(innerProductD(left_v[0],left_v[0])) norm_t;
  Vector<inner_t> inner_tmp(sites);
  Vector<norm_t> norm_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  auto norm_tmp_v = &norm_tmp[0];
  accelerator_for( ss, sites, nsimd,{
      auto left_tmp = left_v(ss);
      inner_tmp_v[ss] = innerProductD(left_tmp,right_v(ss));
        norm_tmp_v [ss]=innerProductD(left_tmp,left_tmp);
      });
-  // Already promoted to double
+  }
  tmp[0] = TensorRemove(sum(inner_tmp_v,sites));
  tmp[1] = TensorRemove(sum(norm_tmp_v,sites));
-#endif
+
  grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector
  ip = tmp[0];
  nrm = real(tmp[1]);
@@ -335,7 +337,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  // sum over reduced dimension planes, breaking out orthog dir
  // Parallel over orthog direction
-  auto Data_v=Data.View();
+  autoView( Data_v, Data, CpuRead);
  thread_for( r,rd, {
    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
    for(int n=0;n<e1;n++){
@@ -413,8 +415,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
  int e2=    grid->_slice_block [orthogdim];
  int stride=grid->_slice_stride[orthogdim];
-  auto lhv=lhs.View();
+  autoView( lhv, lhs, CpuRead);
-  auto rhv=rhs.View();
+  autoView( rhv, rhs, CpuRead);
  thread_for( r,rd,{
    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
@@ -521,14 +523,12 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
    tensor_reduced at; at=av;
-    auto Rv=R.View();
+    autoView( Rv, R, CpuWrite);
-    auto Xv=X.View();
+    autoView( Xv, X, CpuRead);
-    auto Yv=Y.View();
+    autoView( Yv, Y, CpuRead);
-    thread_for_collapse(2, n, e1, {
+    thread_for2d( n, e1, b,e2, {
      for(int b=0;b<e2;b++){
 	int ss= so+n*stride+b;
 	Rv[ss] = at*Xv[ss]+Yv[ss];
      }
    });
  }
 };
@@ -581,9 +581,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
  int nblock=FullGrid->_slice_nblock[Orthog];
  int ostride=FullGrid->_ostride[Orthog];
-  auto X_v=X.View();
+  autoView( X_v, X, CpuRead);
-  auto Y_v=Y.View();
+  autoView( Y_v, Y, CpuRead);
-  auto R_v=R.View();
+  autoView( R_v, R, CpuWrite);
  thread_region
  {
    Vector<vobj> s_x(Nblock);
@@ -628,13 +628,14 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
  //  int nl=1;
  //FIXME package in a convenient iterator
  // thread_for2d_in_region
  //Should loop over a plane orthogonal to direction "Orthog"
  int stride=FullGrid->_slice_stride[Orthog];
  int block =FullGrid->_slice_block [Orthog];
  int nblock=FullGrid->_slice_nblock[Orthog];
  int ostride=FullGrid->_ostride[Orthog];
-  auto R_v = R.View();
+  autoView( R_v, R, CpuWrite);
-  auto X_v = X.View();
+  autoView( X_v, X, CpuRead);
  thread_region
  {
    std::vector<vobj> s_x(Nblock);
@@ -692,8 +693,8 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
  typedef typename vobj::vector_typeD vector_typeD;
-  auto lhs_v=lhs.View();
+  autoView( lhs_v, lhs, CpuRead);
-  auto rhs_v=rhs.View();
+  autoView( rhs_v, rhs, CpuRead);
  thread_region
  {
    std::vector<vobj> Left(Nblock);
@@ -1,7 +1,14 @@
 NAMESPACE_BEGIN(Grid);
-#define WARP_SIZE 32
+#ifdef GRID_HIP
 extern hipDeviceProp_t *gpu_props;
 #define WARP_SIZE 64
 #endif
 #ifdef GRID_CUDA
 extern cudaDeviceProp *gpu_props;
 #define WARP_SIZE 32
 #endif
 __device__ unsigned int retirementCount = 0;
 template <class Iterator>
@@ -19,7 +26,12 @@ template <class Iterator>
 void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
  int device;
 #ifdef GRID_CUDA
  cudaGetDevice(&device);
 #endif
 #ifdef GRID_HIP
  hipGetDevice(&device);
 #endif
  Iterator warpSize            = gpu_props[device].warpSize;
  Iterator sharedMemPerBlock   = gpu_props[device].sharedMemPerBlock;
@@ -53,7 +65,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
  // cannot use overloaded operators for sobj as they are not volatile-qualified
  memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
-  __syncwarp();
+  acceleratorSynchronise();
  const Iterator VEC = WARP_SIZE;
  const Iterator vid = tid & (VEC-1);
@@ -67,9 +79,9 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
      beta += temp;
      memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
    }
-    __syncwarp();
+    acceleratorSynchronise();
  }
-  __syncthreads();
+  acceleratorSynchroniseAll();
  if (threadIdx.x == 0) {
    beta  = Zero();
@@ -79,7 +91,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
    }
    memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
  }
-  __syncthreads();
+  acceleratorSynchroniseAll();
 }
@@ -147,7 +159,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
    sobj *smem = (sobj *)shmem_pointer;
    // wait until all outstanding memory instructions in this thread are finished
-    __threadfence();
+    acceleratorFence();
    if (tid==0) {
      unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
@@ -156,7 +168,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
    }
    // each thread must read the correct value of amLast
-    __syncthreads();
+    acceleratorSynchroniseAll();
    if (amLast) {
      // reduce buffer[0], ..., buffer[gridDim.x-1]
@@ -199,13 +211,7 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
  sobj *buffer_v = &buffer[0];
  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
-  cudaDeviceSynchronize();
+  accelerator_barrier();
  cudaError err = cudaGetLastError();
  if ( cudaSuccess != err ) {
    printf("Cuda error %s\n",cudaGetErrorString( err ));
    exit(0);
  }
  auto result = buffer_v[0];
  return result;
 }
@@ -375,7 +375,7 @@ public:
    int osites = _grid->oSites();  // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity
    int words  = sizeof(scalar_object) / sizeof(scalar_type);
-    auto l_v = l.View();
+    autoView(l_v, l, CpuWrite);
    thread_for( ss, osites, {
      ExtractBuffer<scalar_object> buf(Nsimd);
      for (int m = 0; m < multiplicity; m++) {  // Draw from same generator multiplicity times
@@ -462,7 +462,7 @@ public:
    {
      // Obtain one reseeded generator per thread      
-      int Nthread = GridThread::GetThreads();
+      int Nthread = 32; // Hardwire a good level of parallelism
      std::vector<RngEngine> seeders(Nthread);
      for(int t=0;t<Nthread;t++){
 	seeders[t] = Reseed(master_engine);
@@ -42,8 +42,8 @@ template<class vobj>
 inline auto trace(const Lattice<vobj> &lhs)  -> Lattice<decltype(trace(vobj()))>
 {
  Lattice<decltype(trace(vobj()))> ret(lhs.Grid());
-  auto ret_v = ret.View();
+  autoView(ret_v , ret, AcceleratorWrite);
-  auto lhs_v = lhs.View();
+  autoView(lhs_v , lhs, AcceleratorRead);
  accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
    coalescedWrite(ret_v[ss], trace(lhs_v(ss)));
  });
@@ -58,8 +58,8 @@ template<int Index,class vobj>
 inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))>
 {
  Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid());
-  auto ret_v = ret.View();
+  autoView( ret_v , ret, AcceleratorWrite);
-  auto lhs_v = lhs.View();
+  autoView( lhs_v , lhs, AcceleratorRead);
  accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
    coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss)));
  });
@@ -47,11 +47,12 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
 ////////////////////////////////////////////////////////////////////////////////////////////
 // remove and insert a half checkerboard
 ////////////////////////////////////////////////////////////////////////////////////////////
-template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
+template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full)
 {
  half.Checkerboard() = cb;
-  auto half_v = half.View();
+  autoView( half_v, half, CpuWrite);
-  auto full_v = full.View();
+  autoView( full_v, full, CpuRead);
  thread_for(ss, full.Grid()->oSites(),{
    int cbos;
    Coordinate coor;
@@ -64,11 +65,11 @@ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,con
    }
  });
 }
-
+template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half)
-template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
+{
  int cb = half.Checkerboard();
-  auto half_v = half.View();
+  autoView( half_v , half, CpuRead);
-  auto full_v = full.View();
+  autoView( full_v , full, CpuWrite);
  thread_for(ss,full.Grid()->oSites(),{
    Coordinate coor;
@@ -96,15 +97,15 @@ accelerator_inline void convertType(ComplexF & out, const std::complex<float> &
  out = in;
 }
-#ifdef __CUDA_ARCH__
+#ifdef GRID_SIMT
 accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
-  ((ComplexF*)&out)[SIMTlane(vComplexF::Nsimd())] = in;
+  ((ComplexF*)&out)[acceleratorSIMTlane(vComplexF::Nsimd())] = in;
 }
 accelerator_inline void convertType(vComplexD & out, const ComplexD & in) {
-  ((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd())] = in;
+  ((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd())] = in;
 }
 accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
-  ((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd()*2)] = in;
+  ((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd()*2)] = in;
 }
 #endif
@@ -151,9 +152,8 @@ accelerator_inline void convertType(T & out, const T & in) {
 template<typename T1,typename T2>
 accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
-  auto out_v = out.AcceleratorView(ViewWrite);
+  autoView( out_v , out,AcceleratorWrite);
-  auto in_v  = in.AcceleratorView(ViewRead);
+  autoView( in_v  , in ,AcceleratorRead);
  accelerator_for(ss,out_v.size(),T1::Nsimd(),{
      convertType(out_v[ss],in_v(ss));
  });
@@ -164,19 +164,20 @@ accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
 ////////////////////////////////////////////////////////////////////////////////////////////
 template<class vobj>
 inline auto localInnerProductD(const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
-> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View()[0],rhs.View()[0])))>>
+-> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View(CpuRead)[0],rhs.View(CpuRead)[0])))>>
 {
-  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  autoView( lhs_v , lhs, AcceleratorRead);
-  auto rhs_v = rhs.AcceleratorView(ViewRead);
+  autoView( rhs_v , rhs, AcceleratorRead);
  typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner;
  Lattice<iScalar<t_inner>> ret(lhs.Grid());
  auto ret_v = ret.AcceleratorView(ViewWrite);
  {
    autoView(ret_v, ret,AcceleratorWrite);
    accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
      convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss)));
    });
-
+  }
  return ret;
 }
@@ -194,9 +195,8 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
  Lattice<iScalar<CComplex>> ip(coarse);
  Lattice<vobj>     fineDataRed = fineData;
-  //  auto fineData_   = fineData.View();
+  autoView( coarseData_ , coarseData, AcceleratorWrite);
-  auto coarseData_ = coarseData.AcceleratorView(ViewWrite);
+  autoView( ip_         , ip,         AcceleratorWrite);
  auto ip_         = ip.AcceleratorView(ViewReadWrite);
  for(int v=0;v<nbasis;v++) {
    blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine>
    accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
@@ -210,68 +210,6 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
  }
 }
 // Projects fine-grid data onto a coarse grid using a set of basis vectors.
 // For every coarse outer site sc and basis index i, accumulates
 // innerProduct(Basis[i], fineData) over the fine outer sites belonging to the
 // block of sc, and writes the result into component i of coarseData at sc.
 //
 //   coarseData : output; zeroed first, then filled component by component.
 //   fineData   : fine-grid input; must be conformable with every Basis[i].
 //   Basis      : exactly nbasis fine-grid lattices (asserted).
 //
 // The fine reduced dimensions must be an exact multiple of the coarse ones
 // in every direction (asserted).
 // NOTE(review): this routine takes views with the mode-less .View() call and
 // opens Basis[i].View() inside the accelerator_for body, indexing a host
 // std::vector from the kernel -- presumably not valid when the loop is
 // offloaded to a device; confirm before reusing.
 template<class vobj,class CComplex,int nbasis>
 inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
 			 const             Lattice<vobj>   &fineData,
 			 const std::vector<Lattice<vobj> > &Basis)
 {
  typedef iVector<CComplex,nbasis > coarseSiteData;
  coarseSiteData elide;
  // ScalarComplex is not referenced again in this function body.
  typedef decltype(coalescedRead(elide)) ScalarComplex;
  GridBase * fine  = fineData.Grid();
  GridBase * coarse= coarseData.Grid();
  int  _ndimension = coarse->_ndimension;
  // checks
  assert( nbasis == Basis.size() );
  subdivides(coarse,fine); 
  for(int i=0;i<nbasis;i++){
    conformable(Basis[i],fineData);
  }
  // block_r[d]: number of fine reduced sites per coarse reduced site along d.
  Coordinate block_r      (_ndimension);
  for(int d=0 ; d<_ndimension;d++){
    block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
    assert(block_r[d]*coarse->_rdimensions[d] == fine->_rdimensions[d]);
  }
  // Number of fine outer sites contained in one coarse outer site.
  int blockVol = fine->oSites()/coarse->oSites();
  coarseData=Zero();
  auto fineData_   = fineData.View();
  auto coarseData_ = coarseData.View();
  ////////////////////////////////////////////////////////////////////////////////////////////////////////
  // To make this lock free, loop over the coarse sites in parallel, and then loop over the fine sites
  // associated with each coarse site.
  // Otherwise do fine inner product per site, and make the update atomic
  ////////////////////////////////////////////////////////////////////////////////////////////////////////
  // One work item per (coarse site, basis index) pair; sci encodes both.
  accelerator_for( sci, nbasis*coarse->oSites(), vobj::Nsimd(), {
    auto sc=sci/nbasis;
    auto i=sci%nbasis;
    auto Basis_      = Basis[i].View();
    Coordinate coor_c(_ndimension);
    Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions);  // Block coordinate
    // sf is only named inside decltype (unevaluated) on the next line; it is
    // assigned by IndexFromCoor before its first real use in the loop below.
    int sf;
    decltype(innerProduct(Basis_(sf),fineData_(sf))) reduce=Zero();
    for(int sb=0;sb<blockVol;sb++){
      Coordinate coor_b(_ndimension);
      Coordinate coor_f(_ndimension);
      Lexicographic::CoorFromIndex(coor_b,sb,block_r);
      // Fine coordinate = block origin (coarse coordinate * block size) + offset within the block.
      for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d]+coor_b[d];
      Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
      reduce=reduce+innerProduct(Basis_(sf),fineData_(sf));
    }
    coalescedWrite(coarseData_[sc](i),reduce);
  });
  return;
 }
 template<class vobj,class vobj2,class CComplex>
  inline void blockZAXPY(Lattice<vobj> &fineZ,
@@ -298,10 +236,12 @@ template<class vobj,class vobj2,class CComplex>
    assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
  }
-  auto fineZ_  = fineZ.AcceleratorView(ViewWrite);
+  autoView( fineZ_  , fineZ, AcceleratorWrite);
-  auto fineX_  = fineX.AcceleratorView(ViewRead);
+  autoView( fineX_  , fineX, AcceleratorRead);
-  auto fineY_  = fineY.AcceleratorView(ViewRead);
+  autoView( fineY_  , fineY, AcceleratorRead);
-  auto coarseA_= coarseA.AcceleratorView(ViewRead);
+  autoView( coarseA_, coarseA, AcceleratorRead);
  Coordinate fine_rdimensions = fine->_rdimensions;
  Coordinate coarse_rdimensions = coarse->_rdimensions;
  accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
@@ -309,12 +249,12 @@ template<class vobj,class vobj2,class CComplex>
      Coordinate coor_c(_ndimension);
      Coordinate coor_f(_ndimension);
-      Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
+      Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions);
      for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
-      Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
+      Lexicographic::IndexFromCoor(coor_c,sc,coarse_rdimensions);
      // z = A x + y
-#ifdef __CUDA_ARCH__
+#ifdef GRID_SIMT
      typename vobj2::tensor_reduced::scalar_object cA;
      typename vobj::scalar_object cAx;
 #else
@@ -344,15 +284,16 @@ template<class vobj,class CComplex>
  Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
  Lattice<dotp> coarse_inner(coarse);
  auto CoarseInner_  = CoarseInner.AcceleratorView(ViewWrite);
  auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite);
  // Precision promotion
-  fine_inner = localInnerProductD(fineX,fineY);
+  fine_inner = localInnerProductD<vobj>(fineX,fineY);
  blockSum(coarse_inner,fine_inner);
  {
    autoView( CoarseInner_  , CoarseInner,AcceleratorWrite);
    autoView( coarse_inner_ , coarse_inner,AcceleratorRead);
    accelerator_for(ss, coarse->oSites(), 1, {
      convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
    });
  }
 }
@@ -370,15 +311,16 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
  Lattice<dotp> coarse_inner(coarse);
  // Precision promotion?
  auto CoarseInner_  = CoarseInner.AcceleratorView(ViewWrite);
  auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite);
  fine_inner = localInnerProduct(fineX,fineY);
  blockSum(coarse_inner,fine_inner);
  {
    autoView( CoarseInner_  , CoarseInner, AcceleratorWrite);
    autoView( coarse_inner_ , coarse_inner, AcceleratorRead);
    accelerator_for(ss, coarse->oSites(), 1, {
 	CoarseInner_[ss] = coarse_inner_[ss];
    });
  }
 }
 template<class vobj,class CComplex>
 inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
@@ -408,14 +350,19 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
  }
  int blockVol = fine->oSites()/coarse->oSites();
-  auto coarseData_ = coarseData.AcceleratorView(ViewReadWrite);
+  // Turn this around to loop threaded over sc and interior loop 
-  auto fineData_   = fineData.AcceleratorView(ViewRead);
+  // over sf would thread better
  autoView( coarseData_ , coarseData, AcceleratorWrite);
  autoView( fineData_   , fineData, AcceleratorRead);
  Coordinate fine_rdimensions = fine->_rdimensions;
  Coordinate coarse_rdimensions = coarse->_rdimensions;
  accelerator_for(sc,coarse->oSites(),1,{
      // One thread per sub block
      Coordinate coor_c(_ndimension);
-      Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions);  // Block coordinate
+      Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions);  // Block coordinate
      coarseData_[sc]=Zero();
      for(int sb=0;sb<blockVol;sb++){
@@ -425,7 +372,7 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
 	Coordinate coor_f(_ndimension);
 	Lexicographic::CoorFromIndex(coor_b,sb,block_r);               // Block sub coordinate
 	for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
-	Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
+	Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
 	coarseData_[sc]=coarseData_[sc]+fineData_[sf];
      }
@@ -510,8 +457,8 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
  for(int d=0 ; d<_ndimension;d++){
    block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
  }
-  auto fineData_   = fineData.View();
+  autoView( fineData_   , fineData, AcceleratorWrite);
-  auto coarseData_ = coarseData.View();
+  autoView( coarseData_ , coarseData, AcceleratorRead);
  // Loop with a cache friendly loop ordering
  accelerator_for(sf,fine->oSites(),1,{
@@ -524,7 +471,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
    for(int i=0;i<nbasis;i++) {
-      auto basis_ = Basis[i].View();
+      /*      auto basis_ = Basis[i],  );*/
      if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]);
      else     fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
    }
@@ -543,7 +490,14 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
  fineData=Zero();
  for(int i=0;i<nbasis;i++) {
    Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
-    auto  ip_ =  ip.AcceleratorView(ViewRead);
+
    //Lattice<CComplex> cip(coarse);
    //autoView( cip_ , cip, AcceleratorWrite);
    //autoView(  ip_ ,  ip, AcceleratorRead);
    //accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
    //	coalescedWrite(cip_[sc], ip_(sc)());
    //  });
    //blockZAXPY<vobj,CComplex >(fineData,cip,Basis[i],fineData);
    blockZAXPY(fineData,ip,Basis[i],fineData);
  }
 }
@@ -571,15 +525,17 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
    assert(ig->lSites() == og->lSites());
  }
  autoView(in_v,in,CpuRead);
  autoView(out_v,out,CpuWrite);
  thread_for(idx, ig->lSites(),{
    sobj s;
    ssobj ss;
    Coordinate lcoor(ni);
    ig->LocalIndexToLocalCoor(idx,lcoor);
-    peekLocalSite(s,in,lcoor);
+    peekLocalSite(s,in_v,lcoor);
    ss=s;
-    pokeLocalSite(ss,out,lcoor);
+    pokeLocalSite(ss,out_v,lcoor);
  });
 }
@@ -614,8 +570,9 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
  Coordinate rdt = Tg->_rdimensions;
  Coordinate ist = Tg->_istride;
  Coordinate ost = Tg->_ostride;
-  auto t_v = To.AcceleratorView(ViewWrite);
+
-  auto f_v = From.AcceleratorView(ViewRead);
+  autoView( t_v , To, AcceleratorWrite);
  autoView( f_v , From, AcceleratorRead);
  accelerator_for(idx,Fg->lSites(),1,{
    sobj s;
    Coordinate Fcoor(nd);
@@ -638,8 +595,6 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
      for(int w=0;w<words;w++){
 	tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd];  // FIXME IF RRII layout, type pun no worke
      }
      //      peekLocalSite(s,From,Fcoor);
      //      pokeLocalSite(s,To  ,Tcoor);
    }
  });
 }
@@ -670,6 +625,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
  }
  // the above should guarantee that the operations are local
  autoView(lowDimv,lowDim,CpuRead);
  autoView(higherDimv,higherDim,CpuWrite);
  thread_for(idx,lg->lSites(),{
    sobj s;
    Coordinate lcoor(nl);
@@ -682,8 +639,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
 	hcoor[d]=lcoor[ddl++];
      }
    }
-    peekLocalSite(s,lowDim,lcoor);
+    peekLocalSite(s,lowDimv,lcoor);
-    pokeLocalSite(s,higherDim,hcoor);
+    pokeLocalSite(s,higherDimv,hcoor);
  });
 }
@@ -711,6 +668,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
    }
  }
  // the above should guarantee that the operations are local
  autoView(lowDimv,lowDim,CpuWrite);
  autoView(higherDimv,higherDim,CpuRead);
  thread_for(idx,lg->lSites(),{
    sobj s;
    Coordinate lcoor(nl);
@@ -723,8 +682,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
 	hcoor[d]=lcoor[ddl++];
      }
    }
-    peekLocalSite(s,higherDim,hcoor);
+    peekLocalSite(s,higherDimv,hcoor);
-    pokeLocalSite(s,lowDim,lcoor);
+    pokeLocalSite(s,lowDimv,lcoor);
  });
 }
@@ -752,6 +711,8 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
  }
  // the above should guarantee that the operations are local
  autoView(lowDimv,lowDim,CpuRead);
  autoView(higherDimv,higherDim,CpuWrite);
  thread_for(idx,lg->lSites(),{
    sobj s;
    Coordinate lcoor(nl);
@@ -760,8 +721,8 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
    if( lcoor[orthog] == slice_lo ) { 
      hcoor=lcoor;
      hcoor[orthog] = slice_hi;
-      peekLocalSite(s,lowDim,lcoor);
+      peekLocalSite(s,lowDimv,lcoor);
-      pokeLocalSite(s,higherDim,hcoor);
+      pokeLocalSite(s,higherDimv,hcoor);
    }
  });
 }
@@ -789,6 +750,8 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
  }
  // the above should guarantee that the operations are local
  autoView(lowDimv,lowDim,CpuWrite);
  autoView(higherDimv,higherDim,CpuRead);
  thread_for(idx,lg->lSites(),{
    sobj s;
    Coordinate lcoor(nl);
@@ -797,8 +760,8 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
    if( lcoor[orthog] == slice_lo ) { 
      hcoor=lcoor;
      hcoor[orthog] = slice_hi;
-      peekLocalSite(s,higherDim,hcoor);
+      peekLocalSite(s,higherDimv,hcoor);
-      pokeLocalSite(s,lowDim,lcoor);
+      pokeLocalSite(s,lowDimv,lcoor);
    }
  });
 }
@@ -862,7 +825,7 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
  }
  //loop over outer index
-  auto in_v  = in.View();
+  autoView( in_v  , in, CpuRead);
  thread_for(in_oidx,in_grid->oSites(),{
    //Assemble vector of pointers to output elements
    ExtractPointerArray<sobj> out_ptrs(in_nsimd);
@@ -955,7 +918,7 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
    icoor[lane].resize(ndim);
    grid->iCoorFromIindex(icoor[lane],lane);
  }
-  auto out_v = out.View();
+  autoView( out_v , out, CpuWrite);
  thread_for(oidx, grid->oSites(),{
    //Assemble vector of pointers to output elements
    ExtractPointerArray<sobj> ptrs(nsimd);
@@ -1058,7 +1021,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
  std::vector<SobjOut> in_slex_conv(in_grid->lSites());
  unvectorizeToLexOrdArray(in_slex_conv, in);
-  auto out_v = out.View();
+  autoView( out_v , out, CpuWrite);
  thread_for(out_oidx,out_grid->oSites(),{
    Coordinate out_ocoor(ndim);
    out_grid->oCoorFromOindex(out_ocoor, out_oidx);
@@ -42,8 +42,8 @@ NAMESPACE_BEGIN(Grid);
 template<class vobj>
 inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
  Lattice<vobj> ret(lhs.Grid());
-  auto ret_v = ret.View();
+  autoView( ret_v, ret, AcceleratorWrite);
-  auto lhs_v = lhs.View();
+  autoView( lhs_v, lhs, AcceleratorRead);
  accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
    coalescedWrite(ret_v[ss], transpose(lhs_v(ss)));
  });
@@ -58,8 +58,8 @@ template<int Index,class vobj>
 inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))>
 {
  Lattice<decltype(transposeIndex<Index>(vobj()))> ret(lhs.Grid());
-  auto ret_v = ret.View();
+  autoView( ret_v, ret, AcceleratorWrite);
-  auto lhs_v = lhs.View();
+  autoView( lhs_v, lhs, AcceleratorRead);
  accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
    coalescedWrite(ret_v[ss] , transposeIndex<Index>(lhs_v(ss)));
  });
@@ -35,8 +35,8 @@ NAMESPACE_BEGIN(Grid);
 template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
  Lattice<obj> ret_i(rhs_i.Grid());
-  auto rhs = rhs_i.View();
+  autoView( rhs, rhs_i, AcceleratorRead);
-  auto ret = ret_i.View();
+  autoView( ret, ret_i, AcceleratorWrite);
  ret.Checkerboard() = rhs.Checkerboard();
  accelerator_for(ss,rhs.size(),1,{
      ret[ss]=pow(rhs[ss],y);
@@ -45,8 +45,8 @@ template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
 }
 template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
  Lattice<obj> ret_i(rhs_i.Grid());
-  auto rhs = rhs_i.View();
+  autoView( rhs , rhs_i, AcceleratorRead);
-  auto ret = ret_i.View();
+  autoView( ret , ret_i, AcceleratorWrite);
  ret.Checkerboard() = rhs.Checkerboard();
  accelerator_for(ss,rhs.size(),obj::Nsimd(),{
    coalescedWrite(ret[ss],mod(rhs(ss),y));
@@ -56,8 +56,8 @@ template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
 template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
  Lattice<obj> ret_i(rhs_i.Grid());
-  auto ret = ret_i.View();
+  autoView( ret , ret_i, AcceleratorWrite);
-  auto rhs = rhs_i.View();
+  autoView( rhs , rhs_i, AcceleratorRead);
  ret.Checkerboard() = rhs_i.Checkerboard();
  accelerator_for(ss,rhs.size(),obj::Nsimd(),{
    coalescedWrite(ret[ss],div(rhs(ss),y));
@@ -67,8 +67,8 @@ template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
 template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
  Lattice<obj> ret_i(rhs_i.Grid());
-  auto rhs = rhs_i.View();
+  autoView( rhs , rhs_i, AcceleratorRead);
-  auto ret = ret_i.View();
+  autoView( ret , ret_i, AcceleratorWrite);
  ret.Checkerboard() = rhs.Checkerboard();
  accelerator_for(ss,rhs.size(),obj::Nsimd(),{
    coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp));
@@ -0,0 +1,168 @@
 #pragma once
 NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////////////////////////////////
 // Base class which can be used by traits to pick up behaviour
 ///////////////////////////////////////////////////////////////////
 // Empty tag type; the is_lattice<T> trait tests for it with std::is_base_of<LatticeBase,T>.
 class LatticeBase {};
 /////////////////////////////////////////////////////////////////////////////////////////
 // Conformable checks; same instance of Grid required
 /////////////////////////////////////////////////////////////////////////////////////////
 // Assert that lhs and rhs point at the very same GridBase object.
 // The comparison is on the pointers themselves, not on the contents of the grids.
 void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
 {
  assert(lhs == rhs);
 }
 ////////////////////////////////////////////////////////////////////////////
 // Minimal base class containing only data valid to access from accelerator
 // _odata will be a managed pointer in CUDA
 ////////////////////////////////////////////////////////////////////////////
 // Force access to lattice through a view object.
 // This prevents writing of code that will not offload to GPU, but is perhaps annoyingly
 // strict, since the host could in principle access data directly through the lattice object.
 // Need to decide programming model.
 #define LATTICE_VIEW_STRICT
 // Minimal lattice state: grid pointer, checkerboard tag, site-data pointer,
 // number of sites and view advice. LatticeView derives from this class and
 // copies it, so it carries data members and trivial accessors only.
 template<class vobj> class LatticeAccelerator : public LatticeBase
 {
 protected:
  //public:
  GridBase *_grid;
  int checkerboard;
  vobj     *_odata;    // A managed pointer
  uint64_t _odata_size;
  ViewAdvise advise;
 public:
  // Initialisers are listed in member declaration order. Members are always
  // initialised in that order regardless of how the list is written, so keeping
  // the two in step avoids -Wreorder warnings and a misleading reading order.
  accelerator_inline LatticeAccelerator() : _grid(nullptr), checkerboard(0), _odata(nullptr), _odata_size(0), advise(AdviseDefault) { };
  // Number of entries reachable through _odata.
  accelerator_inline uint64_t oSites(void) const { return _odata_size; };
  accelerator_inline int  Checkerboard(void) const { return checkerboard; };
  accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
  accelerator_inline ViewAdvise Advise(void) const { return advise; };
  accelerator_inline ViewAdvise &Advise(void) { return this->advise; }; // can assign advise on a container, not a view
  // If grid is non-null, assert it is the same object as this lattice's grid;
  // if grid is null, overwrite it with this lattice's grid (grid is an in/out parameter).
  accelerator_inline void Conformable(GridBase * &grid) const
  {
    if (grid) conformable(grid, _grid);
    else      grid = _grid;
  };
  // Host only
  GridBase * getGrid(void) const { return _grid; };
 };
 /////////////////////////////////////////////////////////////////////////////////////////
 // A View class which provides accessor to the data.
 // This will be safe to call from accelerator_for and is trivially copy constructible
 // The copy constructor for this will need to be used by device lambda functions
 /////////////////////////////////////////////////////////////////////////////////////////
 // Accessor object for lattice site data, derived from LatticeAccelerator<vobj>.
 // ViewOpen() replaces _odata with the pointer returned by MemoryManager::ViewOpen and
 // keeps the previous value in cpu_ptr, which ViewClose() hands back to the manager.
 template<class vobj> 
 class LatticeView : public LatticeAccelerator<vobj>
 {
 public:
  // Rvalue
  // Access mode recorded by ViewOpen() and passed to MemoryManager::ViewClose().
  // NOTE(review): mode and cpu_ptr are not set by the single-argument constructor
  // below -- confirm ViewClose() is never called on a view constructed that way.
  ViewMode mode;
  // Value of _odata before ViewOpen() replaced it.
  void * cpu_ptr;
 #ifdef GRID_SIMT
  // GRID_SIMT build: read access returns the result of coalescedRead by value.
  accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { 
    return coalescedRead(this->_odata[i]); 
  }
 #else 
  // Non-SIMT build: read access returns a const reference to the stored element.
  accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
 #endif
  // Direct element access; no bounds checking is performed.
  accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
  accelerator_inline vobj       & operator[](size_t i)       { return this->_odata[i]; };
  // Index-style range: begin() is always 0, end() and size() are both _odata_size.
  accelerator_inline uint64_t begin(void) const { return 0;};
  accelerator_inline uint64_t end(void)   const { return this->_odata_size; };
  accelerator_inline uint64_t size(void)  const { return this->_odata_size; };
  // Copies the base-class state only; does not call ViewOpen().
  LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){}
  LatticeView(const LatticeView<vobj> &refer_to_me) = default; // Trivially copyable
  // Copies the base-class state and immediately opens the view in the given mode.
  LatticeView(const LatticeAccelerator<vobj> &refer_to_me,ViewMode mode) : LatticeAccelerator<vobj> (refer_to_me)
  {
    this->ViewOpen(mode);
  }
  // Host functions
  void ViewOpen(ViewMode mode)
  { // Translate the pointer, could save a copy. Could use a "Handle" and not save _odata originally in base
    //    std::cout << "View Open"<<std::hex<<this->_odata<<std::dec <<std::endl;
    // Order matters: cpu_ptr must capture _odata before _odata is overwritten below.
    this->cpu_ptr = (void *)this->_odata;
    this->mode    = mode;
    this->_odata  =(vobj *)
      MemoryManager::ViewOpen(this->cpu_ptr,
 				this->_odata_size*sizeof(vobj),
 				mode,
 				this->advise);    
  }
  void ViewClose(void)
  { // Inform the manager
    //    std::cout << "View Close"<<std::hex<<this->cpu_ptr<<std::dec <<std::endl;
    MemoryManager::ViewClose(this->cpu_ptr,this->mode);    
  }
 };
 // Little autoscope assister
 // Scope guard for a view: stores a copy of the view passed in and calls
 // ViewClose() on that copy when the guard is destroyed.
 // Copying the guard is disabled, because a second guard holding the same view
 // would call ViewClose() a second time for a single open.
 template<class View> 
 class ViewCloser
 {
  View v;  // Copy of the view; closed automatically when this guard goes out of scope
 public:
  explicit ViewCloser(View &_v) : v(_v) {}
  ViewCloser(const ViewCloser &) = delete;
  ViewCloser &operator=(const ViewCloser &) = delete;
  ~ViewCloser() { v.ViewClose(); }
 };
 // autoView(l_v, l, mode): declares `l_v` as the result of l.View(mode) and, next to it,
 // a ViewCloser named _autoView<l_v> that calls l_v's ViewClose() at the end of the scope.
 // The expansion is two declarations, so it must be used inside a braced scope and
 // cannot serve as the unbraced body of an if/for.
 #define autoView(l_v,l,mode)				\
 	  auto l_v = l.View(mode);			\
 	  ViewCloser<decltype(l_v)> _autoView##l_v(l_v);
 /////////////////////////////////////////////////////////////////////////////////////////
 // Lattice expression types used by ET to assemble the AST
 // 
 // Need to be able to detect code paths according to the whether a lattice object or not
 // so introduce some trait type things
 /////////////////////////////////////////////////////////////////////////////////////////
 // Empty tag type inherited by the expression node classes; detected by is_lattice_expr.
 class LatticeExpressionBase {};
 // True when T derives from LatticeBase.
 template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
 // True when T derives from LatticeExpressionBase.
 template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
 // Maps an operand type to the type stored inside an expression node:
 // non-lattice types map to themselves ...
 template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
 // ... while lattice types map to LatticeView of their vector_object.
 template<class T>                 struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
 // Selects the specialisation according to whether T derives from LatticeBase.
 template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
 // Expression node for a unary operation: holds the functor and its single operand.
 // The operand is stored as ViewMap<_T1>::Type, i.e. a LatticeView when _T1 derives
 // from LatticeBase and _T1 itself otherwise.
 template <typename Op, typename _T1>                           
 class LatticeUnaryExpression : public  LatticeExpressionBase 
 {
 public:
  typedef typename ViewMap<_T1>::Type T1;
  Op op;
  T1 arg1;
  // Copies the functor and constructs the stored operand from _arg1.
  LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
 };
 // Expression node for a binary operation: holds the functor and its two operands.
 // Each operand is stored as ViewMap<_Tn>::Type, i.e. a LatticeView when _Tn derives
 // from LatticeBase and _Tn itself otherwise.
 template <typename Op, typename _T1, typename _T2>              
 class LatticeBinaryExpression : public LatticeExpressionBase 
 {
 public:
  typedef typename ViewMap<_T1>::Type T1;
  typedef typename ViewMap<_T2>::Type T2;
  Op op;
  T1 arg1;
  T2 arg2;
  // Copies the functor and constructs the stored operands from _arg1 and _arg2.
  LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
 };
 // Expression node for a three-operand operation: holds the functor and its operands.
 // Each operand is stored as ViewMap<_Tn>::Type, i.e. a LatticeView when _Tn derives
 // from LatticeBase and _Tn itself otherwise.
 template <typename Op, typename _T1, typename _T2, typename _T3> 
 class LatticeTrinaryExpression : public LatticeExpressionBase 
 {
 public:
  typedef typename ViewMap<_T1>::Type T1;
  typedef typename ViewMap<_T2>::Type T2;
  typedef typename ViewMap<_T3>::Type T3;
  Op op;
  T1 arg1;
  T2 arg2;
  T3 arg3;
  // Copies the functor and constructs the stored operands from _arg1, _arg2 and _arg3.
  LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
 };
 NAMESPACE_END(Grid);
@@ -130,6 +130,8 @@ public:
  friend std::ostream& operator<< (std::ostream& stream, Logger& log){
    if ( log.active ) {
      std::ios_base::fmtflags f(stream.flags());
      stream << log.background()<<  std::left;
      if (log.topWidth > 0)
      {
@@ -152,6 +154,8 @@ public:
 	       << now	       << log.background() << " : " ;
      }
      stream << log.colour();
      stream.flags(f);
      return stream;
    } else { 
      return devnull;
@@ -1,3 +1,4 @@
 #include <Grid/GridCore.h>
 int                    Grid::BinaryIO::latticeWriteMaxRetry = -1;
 Grid::BinaryIO::IoPerf Grid::BinaryIO::lastPerf;
@@ -79,6 +79,13 @@ inline void removeWhitespace(std::string &key)
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 class BinaryIO {
 public:
  // Figures describing one I/O transfer; every field starts at zero.
  // The code that fills lastPerf stores a byte count in size, timer.useconds()
  // in time, and size/1024/1024 divided by time/1e6 in mbytesPerSecond.
  struct IoPerf
  {
    uint64_t size = 0;
    uint64_t time = 0;
    double   mbytesPerSecond = 0.;
  };
  static IoPerf lastPerf;
  static int latticeWriteMaxRetry;
  /////////////////////////////////////////////////////////////////////////////
@@ -502,12 +509,15 @@ class BinaryIO {
      timer.Stop();
    }
    lastPerf.size            = sizeof(fobj)*iodata.size()*nrank;
    lastPerf.time            = timer.useconds();
    lastPerf.mbytesPerSecond = lastPerf.size/1024./1024./(lastPerf.time/1.0e6);
    std::cout<<GridLogMessage<<"IOobject: ";
    if ( control & BINARYIO_READ) std::cout << " read  ";
    else                          std::cout << " write ";
    uint64_t bytes = sizeof(fobj)*iodata.size()*nrank;
-    std::cout<< bytes <<" bytes in "<<timer.Elapsed() <<" "
+    std::cout<< lastPerf.size <<" bytes in "<< timer.Elapsed() <<" "
-	     << (double)bytes/ (double)timer.useconds() <<" MB/s "<<std::endl;
+	     << lastPerf.mbytesPerSecond <<" MB/s "<<std::endl;
    std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed()  <<std::endl;
@@ -663,10 +673,15 @@ class BinaryIO {
 	     nersc_csum,scidac_csuma,scidac_csumb);
    timer.Start();
-    thread_for(lidx,lsites,{
+    thread_for(lidx,lsites,{  // FIX ME, suboptimal implementation
      std::vector<RngStateType> tmp(RngStateCount);
      std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
-      parallel_rng.SetState(tmp,lidx);
+      Coordinate lcoor;
      grid->LocalIndexToLocalCoor(lidx, lcoor);
      int o_idx=grid->oIndex(lcoor);
      int i_idx=grid->iIndex(lcoor);
      int gidx=parallel_rng.generator_idx(o_idx,i_idx);
      parallel_rng.SetState(tmp,gidx);
      });
    timer.Stop();
@@ -723,7 +738,12 @@ class BinaryIO {
    std::vector<RNGstate> iodata(lsites);
    thread_for(lidx,lsites,{
      std::vector<RngStateType> tmp(RngStateCount);
-      parallel_rng.GetState(tmp,lidx);
+      Coordinate lcoor;
      grid->LocalIndexToLocalCoor(lidx, lcoor);
      int o_idx=grid->oIndex(lcoor);
      int i_idx=grid->iIndex(lcoor);
      int gidx=parallel_rng.generator_idx(o_idx,i_idx);
      parallel_rng.GetState(tmp,gidx);
      std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
    });
    timer.Stop();
@@ -44,7 +44,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <sys/syscall.h>
 #endif
 #ifdef __x86_64__
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
 accelerator_inline uint64_t __rdtsc(void) {  return 0; }
 accelerator_inline uint64_t __rdpmc(int ) {  return 0; }
 #else
@@ -112,7 +112,6 @@ class PerformanceCounter {
 private:
  typedef struct { 
  public:
    uint32_t type;
    uint64_t config;
    const char *name;
@@ -12773,7 +12773,7 @@ namespace pugi
 #undef PUGI__THROW_ERROR
 #undef PUGI__CHECK_ERROR
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
 #pragma pop
 #endif
@@ -47,7 +47,7 @@ static constexpr int Ym = 5;
 static constexpr int Zm = 6;
 static constexpr int Tm = 7;
-static constexpr int Nc=3;
+static constexpr int Nc=Config_Nc;
 static constexpr int Ns=4;
 static constexpr int Nd=4;
 static constexpr int Nhs=2; // half spinor
@@ -115,18 +115,21 @@ public:
      PokeIndex<LorentzIndex>(Uadj, U, mu);
    }
-    for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
+    autoView(Umu_v,Umu,CpuRead);
    autoView(Uadj_v,Uadj,CpuRead);
    autoView(Uds_v,Uds,CpuWrite);
    thread_for( lidx, GaugeGrid->lSites(), {
      Coordinate lcoor;
      GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
-      peekLocalSite(ScalarUmu, Umu, lcoor);
+      peekLocalSite(ScalarUmu, Umu_v, lcoor);
      for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);
-      peekLocalSite(ScalarUmu, Uadj, lcoor);
+      peekLocalSite(ScalarUmu, Uadj_v, lcoor);
      for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);
-      pokeLocalSite(ScalarUds, Uds, lcoor);
+      pokeLocalSite(ScalarUds, Uds_v, lcoor);
-    }
+    });
  }
  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu) 
@@ -57,6 +57,7 @@ NAMESPACE_CHECK(WilsonClover);
 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
 NAMESPACE_CHECK(Wilson5D);
 #include <Grid/qcd/action/fermion/NaiveStaggeredFermion.h>
 #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
 #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
 NAMESPACE_CHECK(Staggered);
@@ -282,11 +283,15 @@ typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
 typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
 typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;
 typedef NaiveStaggeredFermion<StaggeredImplR> NaiveStaggeredFermionR;
 typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
 typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
 typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
 typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
 typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
 typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
 typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
 typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;
@@ -96,11 +96,11 @@ public:
    int sl        = St._simd_layout[direction];
    Coordinate icoor;
-#ifdef __CUDA_ARCH__
+#ifdef GRID_SIMT
    _Spinor tmp;
    const int Nsimd =SiteDoubledGaugeField::Nsimd();
-    int s = SIMTlane(Nsimd);
+    int s = acceleratorSIMTlane(Nsimd);
    St.iCoorFromIindex(icoor,s);
    int mmu = mu % Nd;
@@ -233,14 +233,16 @@ public:
 	Uconj = where(coor==neglink,-Uconj,Uconj);
      }
-      auto U_v = U.View();
+      {
-      auto Uds_v = Uds.View();
+	autoView( U_v , U, CpuRead);
-      auto Uconj_v = Uconj.View();
+	autoView( Uconj_v , Uconj, CpuRead);
-      auto Utmp_v= Utmp.View();
+	autoView( Uds_v , Uds, CpuWrite);
 	autoView( Utmp_v, Utmp, CpuWrite);
 	thread_foreach(ss,U_v,{
 	    Uds_v[ss](0)(mu) = U_v[ss]();
 	    Uds_v[ss](1)(mu) = Uconj_v[ss]();
 	  });
      }
      U     = adj(Cshift(U    ,mu,-1));      // correct except for spanning the boundary
      Uconj = adj(Cshift(Uconj,mu,-1));
@@ -250,19 +252,25 @@ public:
 	Utmp = where(coor==0,Uconj,Utmp);
      }
      {
 	autoView( Uds_v , Uds, CpuWrite);
 	autoView( Utmp_v, Utmp, CpuWrite);
 	thread_foreach(ss,Utmp_v,{
 	    Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
 	  });
-          
+      }
      Utmp = Uconj;
      if ( Params.twists[mu] ) { 
 	Utmp = where(coor==0,U,Utmp);
      }
      {	  
 	autoView( Uds_v , Uds, CpuWrite);
 	autoView( Utmp_v, Utmp, CpuWrite);
 	thread_foreach(ss,Utmp_v,{
 	    Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
        });
-          
+      }
    }
  }
@@ -272,11 +280,14 @@ public:
    GaugeLinkField link(mat.Grid());
    // use lorentz for flavour as hack.
    auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
-    auto link_v = link.View();
+
-    auto tmp_v = tmp.View();
+    {
      autoView( link_v , link, CpuWrite);
      autoView( tmp_v , tmp, CpuRead);
      thread_foreach(ss,tmp_v,{
 	  link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1));
 	});
    }
    PokeIndex<LorentzIndex>(mat, link, mu);
    return;
  }
@@ -306,9 +317,10 @@ public:
    GaugeLinkField tmp(mat.Grid());
    tmp = Zero();
-    auto tmp_v = tmp.View();
+    {
-    auto Atilde_v = Atilde.View();
+      autoView( tmp_v , tmp, CpuWrite);
-    auto Btilde_v = Btilde.View();
+      autoView( Atilde_v , Atilde, CpuRead);
      autoView( Btilde_v , Btilde, CpuRead);
      thread_for(ss,tmp.Grid()->oSites(),{
 	  for (int s = 0; s < Ls; s++) {
 	    int sF = s + Ls * ss;
@@ -316,6 +328,7 @@ public:
 	    tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
 	  }
 	});
    }
    PokeIndex<LorentzIndex>(mat, tmp, mu);
    return;
  }
@@ -208,7 +208,7 @@ public:
  LebesgueOrder LebesgueEvenOdd;
  // Comms buffer
-  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
+  //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
  ///////////////////////////////////////////////////////////////
  // Conserved current utilities
@@ -0,0 +1,194 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/NaiveStaggeredFermion.h
 Copyright (C) 2015
 Author: Azusa Yamaguchi, Peter Boyle
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
 #ifndef GRID_QCD_NAIVE_STAG_FERMION_H
 #define GRID_QCD_NAIVE_STAG_FERMION_H
 NAMESPACE_BEGIN(Grid);
 // Non-template base holding the static data shared by all NaiveStaggeredFermion<Impl>
 // instantiations. directions and displacements are only declared here; they must be
 // defined in a source file elsewhere.
 // NOTE(review): presumably each vector has npoint (8) entries -- confirm at the definition.
 class NaiveStaggeredFermionStatic {
 public:
  static const std::vector<int> directions;
  static const std::vector<int> displacements;
  static const int npoint = 8;
 };
 // Declaration of the naive staggered fermion operator for implementation policy Impl.
 // It derives from StaggeredKernels<Impl> and NaiveStaggeredFermionStatic and declares the
 // M/Mdag, checkerboarded, hopping-term, derivative and conserved-current entry points.
 // Only inline accessors are defined here; everything else is defined elsewhere.
 template <class Impl>
 class NaiveStaggeredFermion : public StaggeredKernels<Impl>, public NaiveStaggeredFermionStatic {
 public:
  INHERIT_IMPL_TYPES(Impl);
  typedef StaggeredKernels<Impl> Kernels;
  // Scratch fermion field exposed through tmp().
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }
  ////////////////////////////////////////
  // Performance monitoring
  ////////////////////////////////////////
  void Report(void);
  void ZeroCounters(void);
  double DhopTotalTime;
  double DhopCalls;
  double DhopCommTime;
  double DhopComputeTime;
  double DhopComputeTime2;
  double DhopFaceTime;
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
  ///////////////////////////////////////////////////////////////
  // Gauge and fermion fields share the same full grid (_grid) and
  // the same checkerboarded grid (_cbgrid).
  GridBase *GaugeGrid(void) { return _grid; }
  GridBase *GaugeRedBlackGrid(void) { return _cbgrid; }
  GridBase *FermionGrid(void) { return _grid; }
  GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
  //////////////////////////////////////////////////////////////////
  // override multiply; cut number routines if pass dagger argument
  // and also make interface more uniformly consistent
  //////////////////////////////////////////////////////////////////
  void M(const FermionField &in, FermionField &out);
  void Mdag(const FermionField &in, FermionField &out);
  /////////////////////////////////////////////////////////
  // half checkerboard operations
  /////////////////////////////////////////////////////////
  void Meooe(const FermionField &in, FermionField &out);
  void MeooeDag(const FermionField &in, FermionField &out);
  void Mooee(const FermionField &in, FermionField &out);
  void MooeeDag(const FermionField &in, FermionField &out);
  void MooeeInv(const FermionField &in, FermionField &out);
  void MooeeInvDag(const FermionField &in, FermionField &out);
  ////////////////////////
  // Derivative interface
  ////////////////////////
  // Interface calls an internal routine
  void DhopDeriv  (GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
  void DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
  void DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
  ///////////////////////////////////////////////////////////////
  // non-hermitian hopping term; half cb or both
  ///////////////////////////////////////////////////////////////
  void Dhop  (const FermionField &in, FermionField &out, int dag);
  void DhopOE(const FermionField &in, FermionField &out, int dag);
  void DhopEO(const FermionField &in, FermionField &out, int dag);
  ///////////////////////////////////////////////////////////////
  // Multigrid assistance; force term uses too
  ///////////////////////////////////////////////////////////////
  void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
  void MdirAll(const FermionField &in, std::vector<FermionField> &out);
  void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
  ///////////////////////////////////////////////////////////////
  // Extra methods added by derived
  ///////////////////////////////////////////////////////////////
  void DerivInternal(StencilImpl &st, 
 		     DoubledGaugeField &U,
 		     GaugeField &mat, 
 		     const FermionField &A, const FermionField &B, int dag);
  void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
                    const FermionField &in, FermionField &out, int dag);
  void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
 			       const FermionField &in, FermionField &out, int dag);
  void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
 				   const FermionField &in, FermionField &out, int dag);
  //////////////////////////////////////////////////////////////////////////
  // Grid own interface Constructor
  //////////////////////////////////////////////////////////////////////////
  // Two overloads: the first takes a gauge field, the second does not.
  // NOTE(review): presumably the second form expects ImportGauge() to be called
  // afterwards -- confirm against the implementation.
  NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid,
 			GridRedBlackCartesian &Hgrid, RealD _mass,
 			RealD _c1, RealD _u0,
 			const ImplParams &p = ImplParams());
  NaiveStaggeredFermion(GridCartesian &Fgrid,
 			GridRedBlackCartesian &Hgrid, RealD _mass,
 			RealD _c1, RealD _u0,
 			const ImplParams &p = ImplParams());
  // DoubleStore impl dependent
  void ImportGauge      (const GaugeField &_U );
  DoubledGaugeField &GetU(void)   { return Umu ; } ;
  void CopyGaugeCheckerboards(void);
  ///////////////////////////////////////////////////////////////
  // Data members required to support the functionality
  ///////////////////////////////////////////////////////////////
  //    protected:
 public:
  // any other parameters of action ???
  virtual int   isTrivialEE(void) { return 1; };
  virtual RealD Mass(void) { return mass; }
  RealD mass;
  RealD u0;
  RealD c1;
  GridBase *_grid;
  GridBase *_cbgrid;
  // Defines the stencils for even and odd
  StencilImpl Stencil;
  StencilImpl StencilEven;
  StencilImpl StencilOdd;
  // Copy of the gauge field , with even and odd subsets
  DoubledGaugeField Umu;
  DoubledGaugeField UmuEven;
  DoubledGaugeField UmuOdd;
  LebesgueOrder Lebesgue;
  LebesgueOrder LebesgueEvenOdd;
  ///////////////////////////////////////////////////////////////
  // Conserved current utilities
  ///////////////////////////////////////////////////////////////
  void ContractConservedCurrent(PropagatorField &q_in_1,
                                PropagatorField &q_in_2,
                                PropagatorField &q_out,
                                PropagatorField &src,
                                Current curr_type,
                                unsigned int mu);
  void SeqConservedCurrent(PropagatorField &q_in,
                           PropagatorField &q_out,
                           PropagatorField &srct,
                           Current curr_type,
                           unsigned int mu, 
                           unsigned int tmin,
                           unsigned int tmax,
 			   ComplexField &lattice_cmplx);
 };
 typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
 typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
 NAMESPACE_END(Grid);
 #endif
@@ -49,21 +49,35 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
 public:
  // Field-level hopping-term entry point that takes two doubled gauge fields (U and UUU).
  // NOTE(review): interior/exterior look like selectors for bulk vs. halo-dependent work;
  // the overlapped-comms callers in this diff pass (1,0) before the comms merge and (0,1)
  // afterwards, and the serial-comms callers pass (1,1) -- confirm against the implementation.
  void DhopImproved(StencilImpl &st, LebesgueOrder &lo, 
 		    DoubledGaugeField &U, DoubledGaugeField &UUU, 
 		    const FermionField &in, FermionField &out, int dag, int interior,int exterior);
  // Field-level hopping-term entry point that takes a single doubled gauge field U
  // (no UUU argument, unlike DhopImproved).
  // NOTE(review): dag/interior/exterior are presumably interpreted as in DhopImproved -- confirm.
  void DhopNaive(StencilImpl &st, LebesgueOrder &lo, 
 		 DoubledGaugeField &U,
 		 const FermionField &in, FermionField &out, int dag, int interior,int exterior);
  void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
 		     int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp);
 protected:    
   ///////////////////////////////////////////////////////////////////////////////////////
   // Generic Nc kernels
   ///////////////////////////////////////////////////////////////////////////////////////
-   void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, 
+   template<int Naik> 
   static accelerator_inline
   void DhopSiteGeneric(StencilView &st, 
 			DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
 			const FermionFieldView &in, FermionFieldView &out,int dag);
-   void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, 
+   
   template<int Naik> static accelerator_inline
   void DhopSiteGenericInt(StencilView &st, 
 			   DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 			   SiteSpinor * buf, int LLs, int sU, 
 			   const FermionFieldView &in, FermionFieldView &out,int dag);
-   void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, 
+   
   template<int Naik> static accelerator_inline
   void DhopSiteGenericExt(StencilView &st, 
 			   DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 			   SiteSpinor * buf, int LLs, int sU, 
 			   const FermionFieldView &in, FermionFieldView &out,int dag);
@@ -71,15 +85,21 @@ public:
   ///////////////////////////////////////////////////////////////////////////////////////
   // Nc=3 specific kernels
   ///////////////////////////////////////////////////////////////////////////////////////
-   void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, 
+   
   template<int Naik> static accelerator_inline
   void DhopSiteHand(StencilView &st, 
 		     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 		     SiteSpinor * buf, int LLs, int sU, 
 		     const FermionFieldView &in, FermionFieldView &out,int dag);
-   void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, 
+   
   template<int Naik> static accelerator_inline
   void DhopSiteHandInt(StencilView &st, 
 			DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
 			const FermionFieldView &in, FermionFieldView &out,int dag);
-   void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, 
+   
   template<int Naik> static accelerator_inline
   void DhopSiteHandExt(StencilView &st, 
 			DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
 			const FermionFieldView &in, FermionFieldView &out,int dag);
@@ -87,27 +107,11 @@ public:
   ///////////////////////////////////////////////////////////////////////////////////////
   // Asm Nc=3 specific kernels
   ///////////////////////////////////////////////////////////////////////////////////////
-   void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
+   
   void DhopSiteAsm(StencilView &st, 
 		    DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 		    SiteSpinor * buf, int LLs, int sU, 
 		    const FermionFieldView &in, FermionFieldView &out,int dag);
   ///////////////////////////////////////////////////////////////////////////////////////////////////
   // Generic interface; fan out to right routine
   ///////////////////////////////////////////////////////////////////////////////////////////////////
   void DhopSite(StencilImpl &st, LebesgueOrder &lo, 
 		 DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 		 SiteSpinor * buf, int LLs, int sU,
 		 const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
   void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, 
 		    DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 		    SiteSpinor * buf, int LLs, int sU,
 		    const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
   void DhopSite(StencilImpl &st, LebesgueOrder &lo, 
 		 DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 		 SiteSpinor * buf, int LLs, int sU,
 		 const FermionFieldView &in, FermionFieldView &out, int dag, int interior,int exterior);
 public:
@@ -113,20 +113,7 @@ public:
  // Loops over the local sites of U_ds's grid; at each site it peeks the scalar site
  // objects of U_ds and U, then assigns ScalarU() to component mu of the local ScalarUds.
  // NOTE(review): no pokeLocalSite is visible in this hunk, so the modified ScalarUds does
  // not appear to be written back into U_ds -- confirm.
  // NOTE(review): as rendered here the GaugeGrid declaration sits on a removed ('-') line and
  // assert(0) is added, yet the loop below still uses GaugeGrid; check the original patch to
  // see whether the loop is removed as well.
  inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu)
  {
-    GridBase *GaugeGrid = U_ds.Grid();
+    assert(0);
    thread_for(lidx, GaugeGrid->lSites(),{
 	SiteScalarGaugeLink   ScalarU;
 	SiteDoubledGaugeField ScalarUds;
 	Coordinate lcoor;
 	GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
 	peekLocalSite(ScalarUds, U_ds, lcoor);
 	peekLocalSite(ScalarU, U, lcoor);
 	ScalarUds(mu) = ScalarU();
    });
  }
  inline void DoubleStore(GridBase *GaugeGrid,
 			  DoubledGaugeField &UUUds, // for Naik term
@@ -257,15 +257,16 @@ private:
  CloverFieldType CloverTermDagEven, CloverTermDagOdd;       // Clover term Dag EO
  CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
 public:
  // eventually these can be compressed into 6x6 blocks instead of the 12x12
  // using the DeGrand-Rossi basis for the gamma matrices
  CloverFieldType fillCloverYZ(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
-    auto T_v = T.View();
+    autoView(T_v,T,AcceleratorWrite);
-    auto F_v = F.View();
+    autoView(F_v,F,AcceleratorRead);
-    thread_for(i, CloverTerm.Grid()->oSites(),
+    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
      T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
@@ -281,9 +282,9 @@ private:
    CloverFieldType T(F.Grid());
    T = Zero();
-    auto T_v = T.View();
+    autoView(T_v, T,AcceleratorWrite);
-    auto F_v = F.View();
+    autoView(F_v, F,AcceleratorRead);
-    thread_for(i, CloverTerm.Grid()->oSites(),
+    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = -F_v[i]()();
      T_v[i]()(1, 0) = F_v[i]()();
@@ -299,9 +300,9 @@ private:
    CloverFieldType T(F.Grid());
    T = Zero();
-    auto T_v = T.View();
+    autoView(T_v,T,AcceleratorWrite);
-    auto F_v = F.View();
+    autoView(F_v,F,AcceleratorRead);
-    thread_for(i, CloverTerm.Grid()->oSites(),
+    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
      T_v[i]()(1, 1) = timesI(F_v[i]()());
@@ -317,9 +318,9 @@ private:
    CloverFieldType T(F.Grid());
    T = Zero();
-    auto T_v = T.View();
+    autoView( T_v , T, AcceleratorWrite);
-    auto F_v = F.View();
+    autoView( F_v , F, AcceleratorRead);
-    thread_for(i, CloverTerm.Grid()->oSites(),
+    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = timesI(F_v[i]()());
      T_v[i]()(1, 0) = timesI(F_v[i]()());
@@ -335,9 +336,9 @@ private:
    CloverFieldType T(F.Grid());
    T = Zero();
-    auto T_v = T.View();
+    autoView( T_v ,T,AcceleratorWrite);
-    auto F_v = F.View();
+    autoView( F_v ,F,AcceleratorRead);
-    thread_for(i, CloverTerm.Grid()->oSites(),
+    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = -(F_v[i]()());
      T_v[i]()(1, 0) = (F_v[i]()());
@@ -354,9 +355,9 @@ private:
    T = Zero();
-    auto T_v = T.View();
+    autoView( T_v , T,AcceleratorWrite);
-    auto F_v = F.View();
+    autoView( F_v , F,AcceleratorRead);
-    thread_for(i, CloverTerm.Grid()->oSites(),
+    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 0) = timesI(F_v[i]()());
      T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
@@ -74,6 +74,20 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }
  void Report(void);
  void ZeroCounters(void);
  double DhopCalls;
  double DhopCommTime;
  double DhopComputeTime;
  double DhopComputeTime2;
  double DhopFaceTime;
  double DhopTotalTime;
  double DerivCalls;
  double DerivCommTime;
  double DerivComputeTime;
  double DerivDhopComputeTime;
  //////////////////////////////////////////////////////////////////
  // override multiply; cut the number of routines by passing a dagger argument
  // and also make the interface more uniformly consistent
@@ -196,5 +210,3 @@ typedef WilsonFermion<WilsonImplF> WilsonFermionF;
 typedef WilsonFermion<WilsonImplD> WilsonFermionD;
 NAMESPACE_END(Grid);
@@ -215,7 +215,7 @@ public:
  LebesgueOrder LebesgueEvenOdd;
  // Comms buffer
-  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
+  //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
 };
@@ -106,10 +106,10 @@ public:
 			    const _SpinorField & phi,
 			    int mu)
  {
-    auto out_v= out.View();
+    autoView( out_v, out, AcceleratorWrite);
-    auto phi_v= phi.View();
+    autoView( phi_v, phi, AcceleratorRead);
-    auto Umu_v= Umu.View();
+    autoView( Umu_v, Umu, AcceleratorRead);
-    thread_for(sss,out.Grid()->oSites(),{
+    accelerator_for(sss,out.Grid()->oSites(),1,{
 	multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu);
    });
  }
@@ -191,18 +191,19 @@ public:
    int Ls=Btilde.Grid()->_fdimensions[0];
    GaugeLinkField tmp(mat.Grid());
    tmp = Zero();
-    auto tmp_v = tmp.View();
+    {
-    auto Btilde_v = Btilde.View();
+      autoView( tmp_v , tmp, AcceleratorWrite);
-    auto Atilde_v = Atilde.View();
+      autoView( Btilde_v , Btilde, AcceleratorRead);
-    thread_for(sss,tmp.Grid()->oSites(),{
+      autoView( Atilde_v , Atilde, AcceleratorRead);
      accelerator_for(sss,tmp.Grid()->oSites(),1,{
 	  int sU=sss;
 	  for(int s=0;s<Ls;s++){
 	    int sF = s+Ls*sU;
 	    tmp_v[sU] = tmp_v[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here
 	  }
 	});
    }
    PokeIndex<LorentzIndex>(mat,tmp,mu);
  }
 };
@@ -180,7 +180,7 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
    std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls     : " << MooeeInvCalls   << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls            : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
-#ifdef GRID_NVCC
+#ifdef GRID_CUDA
    RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
@@ -642,7 +642,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
 						      Current curr_type,
 						      unsigned int mu)
 {
-#ifndef GRID_NVCC
+#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
@@ -799,7 +799,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  PropagatorField tmp(UGrid);
  PropagatorField Utmp(UGrid);
-  LatticeInteger zz (UGrid);   zz=0.0;
+  PropagatorField zz (UGrid);   zz=0.0;
  LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
  for (int s=0;s<Ls;s++) {
@@ -826,7 +826,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  }
 #endif
-#ifndef GRID_NVCC
+#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
  int tshift = (mu == Nd-1) ? 1 : 0;
  ////////////////////////////////////////////////
  // GENERAL CAYLEY CASE
@@ -850,7 +850,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  PropagatorField tmp(UGrid);
  PropagatorField Utmp(UGrid);
-  LatticeInteger zz (UGrid);   zz=0.0;
+  PropagatorField  zz (UGrid);   zz=0.0;
  LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
  for(int s=0;s<Ls;s++){
@@ -50,9 +50,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
  chi_i.Checkerboard()=psi_i.Checkerboard();
  GridBase *grid=psi_i.Grid();
-  auto psi = psi_i.View();
+  autoView(psi , psi_i,AcceleratorRead);
-  auto phi = phi_i.View();
+  autoView(phi , phi_i,AcceleratorRead);
-  auto chi = chi_i.View();
+  autoView(chi , chi_i,AcceleratorWrite);
  assert(phi.Checkerboard() == psi.Checkerboard());
  auto pdiag = &diag[0];
@@ -93,9 +93,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
 {
  chi_i.Checkerboard()=psi_i.Checkerboard();
  GridBase *grid=psi_i.Grid();
-  auto psi = psi_i.View();
+  autoView(psi , psi_i,AcceleratorRead);
-  auto phi = phi_i.View();
+  autoView(phi , phi_i,AcceleratorRead);
-  auto chi = chi_i.View();
+  autoView(chi , chi_i,AcceleratorWrite);
  assert(phi.Checkerboard() == psi.Checkerboard());
  auto pdiag = &diag[0];
@@ -131,8 +131,8 @@ CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi
  chi_i.Checkerboard()=psi_i.Checkerboard();
  GridBase *grid=psi_i.Grid();
-  auto psi = psi_i.View();
+  autoView(psi , psi_i,AcceleratorRead);
-  auto chi = chi_i.View();
+  autoView(chi , chi_i,AcceleratorWrite);
  int Ls=this->Ls;
@@ -193,8 +193,8 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
  GridBase *grid=psi_i.Grid();
  int Ls=this->Ls;
-  auto psi = psi_i.View();
+  autoView(psi , psi_i,AcceleratorRead);
-  auto chi = chi_i.View();
+  autoView(chi , chi_i,AcceleratorWrite);
  auto plee  = & lee [0];
  auto pdee  = & dee [0];
@@ -65,9 +65,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
  EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
  chi_i.Checkerboard()=psi_i.Checkerboard();
  GridBase *grid=psi_i.Grid();
-  auto psi = psi_i.View();
+  autoView(psi, psi_i,CpuRead);
-  auto phi = phi_i.View();
+  autoView(phi, phi_i,CpuRead);
-  auto chi = chi_i.View();
+  autoView(chi, chi_i,CpuWrite);
  int Ls   = this->Ls;
  int LLs  = grid->_rdimensions[0];
  const int nsimd= Simd::Nsimd();
@@ -213,9 +213,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
  EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
  chi_i.Checkerboard()=psi_i.Checkerboard();
  GridBase *grid=psi_i.Grid();
-  auto psi=psi_i.View();
+  autoView(psi,psi_i,CpuRead);
-  auto phi=phi_i.View();
+  autoView(phi,phi_i,CpuRead);
-  auto chi=chi_i.View();
+  autoView(chi,chi_i,CpuWrite);
  int Ls   = this->Ls;
  int LLs  = grid->_rdimensions[0];
  int nsimd= Simd::Nsimd();
@@ -357,8 +357,8 @@ CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi_i, FermionField
 					Vector<iSinglet<Simd> > &Matm)
 {
  EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
-  auto psi = psi_i.View();
+  autoView(psi , psi_i,CpuRead);
-  auto chi = chi_i.View();
+  autoView(chi , chi_i,CpuWrite);
 #ifndef AVX512
  {
    SiteHalfSpinor BcastP;
@@ -535,8 +535,8 @@ CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField
  EnableIf<Impl::LsVectorised,int> sfinae=0;
 #ifndef AVX512
  {
-    auto psi = psi_i.View();
+    autoView(psi , psi_i,CpuRead);
-    auto chi = chi_i.View();
+    autoView(chi , chi_i,CpuWrite);
    SiteHalfSpinor BcastP;
    SiteHalfSpinor BcastM;
@@ -586,8 +586,8 @@ CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField
  }
 #else
  {
-    auto psi = psi_i.View();
+    autoView(psi , psi_i,CpuRead);
-    auto chi = chi_i.View();
+    autoView(chi , chi_i,CpuWrite);
    // pointers
    //  MASK_REGS;
 #define Chi_00 %zmm0
@@ -46,9 +46,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
  chi_i.Checkerboard() = psi_i.Checkerboard();
  int Ls = this->Ls;
  GridBase* grid = psi_i.Grid();
-  auto phi = phi_i.View();
+  autoView( phi , phi_i, AcceleratorRead);
-  auto psi = psi_i.View();
+  autoView( psi , psi_i, AcceleratorRead);
-  auto chi = chi_i.View();
+  autoView( chi , chi_i, AcceleratorWrite);
  assert(phi.Checkerboard() == psi.Checkerboard());
  auto pdiag = &diag[0];
  auto pupper = &upper[0];
@@ -82,9 +82,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
  GridBase* grid = psi_i.Grid();
  int Ls = this->Ls;
-  auto psi = psi_i.View();
+  autoView( psi , psi_i, AcceleratorRead);
-  auto phi = phi_i.View();
+  autoView( phi , phi_i, AcceleratorRead);
-  auto chi = chi_i.View();
+  autoView( chi , chi_i, AcceleratorWrite);
  assert(phi.Checkerboard() == psi.Checkerboard());
  auto pdiag = &diag[0];
  auto pupper = &upper[0];
@@ -116,8 +116,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
 {
  chi_i.Checkerboard() = psi_i.Checkerboard();
  GridBase* grid = psi_i.Grid();
-  auto psi=psi_i.View();
+  autoView( psi, psi_i, AcceleratorRead);
-  auto chi=chi_i.View();
+  autoView( chi, chi_i, AcceleratorWrite);
  int Ls = this->Ls;
  auto plee  = & this->lee[0];
@@ -172,8 +172,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
 {
  chi_i.Checkerboard() = psi_i.Checkerboard();
  GridBase* grid = psi_i.Grid();
-  auto psi = psi_i.View();
+  autoView( psi, psi_i, AcceleratorRead);
-  auto chi = chi_i.View();
+  autoView( chi, chi_i, AcceleratorWrite);
  int Ls = this->Ls;
  auto plee  = & this->lee[0];
@@ -221,10 +221,10 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionFi
  Compressor compressor;
  Stencil.HaloExchange(in,compressor);
-  auto Umu_v   = Umu.View();
+  autoView( Umu_v   ,   Umu, CpuRead);
-  auto UUUmu_v = UUUmu.View();
+  autoView( UUUmu_v , UUUmu, CpuRead);
-  auto in_v    = in.View();
+  autoView( in_v    ,  in, CpuRead);
-  auto out_v   = out.View();
+  autoView( out_v   , out, CpuWrite);
  thread_for( ss,Umu.Grid()->oSites(),{
    for(int s=0;s<Ls;s++){
      int sU=ss;
@@ -281,11 +281,9 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
 						    DoubledGaugeField & U,DoubledGaugeField & UUU,
 						    const FermionField &in, FermionField &out,int dag)
 {
 #ifdef GRID_OMP
  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
  else
 #endif
    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
 }
@@ -294,9 +292,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
 								   DoubledGaugeField & U,DoubledGaugeField & UUU,
 								   const FermionField &in, FermionField &out,int dag)
 {
 #ifdef GRID_OMP
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
  Compressor compressor; 
  int LLs = in.Grid()->_rdimensions[0];
@@ -305,99 +301,42 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
  DhopFaceTime-=usecond();
  st.Prepare();
  st.HaloGather(in,compressor);
  DhopFaceTime+=usecond();
  DhopCommTime -=usecond();
  std::vector<std::vector<CommsRequest_t> > requests;
  st.CommunicateBegin(requests);
  //  st.HaloExchangeOptGather(in,compressor); // Wilson compressor
  DhopFaceTime-=usecond();
  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
  DhopFaceTime+=usecond();
  double ctime=0;
  double ptime=0;
  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Ugly explicit thread mapping introduced for OPA reasons.
+  // Remove explicit thread mapping introduced for OPA reasons.
  //////////////////////////////////////////////////////////////////////////////////////////////////////
-#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
+  DhopComputeTime-=usecond();
  {
-    int tid = omp_get_thread_num();
+    int interior=1;
-    int nthreads = omp_get_num_threads();
+    int exterior=0;
-    int ncomms = CartesianCommunicator::nCommThreads;
+    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
    if (ncomms == -1) ncomms = 1;
    assert(nthreads > ncomms);
    if (tid >= ncomms) {
      double start = usecond();
      nthreads -= ncomms;
      int ttid  = tid - ncomms;
      int n     = U.Grid()->oSites(); // 4d vol
      int chunk = n / nthreads;
      int rem   = n % nthreads;
      int myblock, myn;
      if (ttid < rem) {
        myblock = ttid * chunk + ttid;
        myn = chunk+1;
      } else {
        myblock = ttid*chunk + rem;
        myn = chunk;
  }
-
+  DhopComputeTime+=usecond();
      // do the compute
      auto   U_v  =   U.View();
      auto UUU_v  = UUU.View();
      auto  in_v  =  in.View();
      auto out_v  = out.View();
      if (dag == DaggerYes) {
        for (int ss = myblock; ss < myblock+myn; ++ss) {
          int sU = ss;
 	  // Interior = 1; Exterior = 0; must implement for staggered
          Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<---------
        }
      } else {
        for (int ss = myblock; ss < myblock+myn; ++ss) {
 	  // Interior = 1; Exterior = 0;
          int sU = ss;
          Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<------------
        }
      }
        ptime = usecond() - start;
    } else {
      double start = usecond();
      st.CommunicateThreaded();
      ctime = usecond() - start;
    }
  }
  DhopCommTime += ctime;
  DhopComputeTime+=ptime;
  // First to enter, last to leave timing
  st.CollateThreads();
  DhopFaceTime-=usecond();
  st.CommsMerge(compressor);
  DhopFaceTime+=usecond();
-  DhopComputeTime2-=usecond();
+  st.CommunicateComplete(requests);
  DhopCommTime +=usecond();
-  auto   U_v  =   U.View();
+  DhopComputeTime2-=usecond();
-  auto UUU_v  = UUU.View();
+  {
-  auto  in_v  =  in.View();
+    int interior=0;
-  auto out_v  = out.View();
+    int exterior=1;
-  if (dag == DaggerYes) {
+    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
    int sz=st.surface_list.size();
    thread_for( ss,sz,{
      int sU = st.surface_list[ss];
      Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1); //<----------
    });
  } else {
    int sz=st.surface_list.size();
    thread_for( ss,sz,{
      int sU = st.surface_list[ss];
      Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1);//<----------
    });
  }
  DhopComputeTime2+=usecond();
 #else
  assert(0);
 #endif
 }
 template<class Impl>
@@ -408,8 +347,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
  Compressor compressor;
  int LLs = in.Grid()->_rdimensions[0];
 //double t1=usecond();
  DhopTotalTime -= usecond();
  DhopCommTime -= usecond();
@@ -418,28 +355,13 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
  DhopComputeTime -= usecond();
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
-  auto   U_v  =   U.View();
+  {
-  auto UUU_v  = UUU.View();
+    int interior=1;
-  auto  in_v  =  in.View();
+    int exterior=1;
-  auto out_v  = out.View();
+    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  if (dag == DaggerYes) {
    thread_for( ss,U.Grid()->oSites(),{
      int sU=ss;
      Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), LLs, sU,in_v, out_v);
    });
  } else {
    thread_for( ss,U.Grid()->oSites(),{
      int sU=ss;
      Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v);
    });
  }
  DhopComputeTime += usecond();
  DhopTotalTime   += usecond();
 //double t2=usecond();
 //std::cout << __FILE__ << " " << __func__  << " Total Time " << DhopTotalTime << std::endl;
 //std::cout << __FILE__ << " " << __func__  << " Total Time Org " << t2-t1 << std::endl;
 //std::cout << __FILE__ << " " << __func__  << " Comml Time " << DhopCommTime << std::endl;
 //std::cout << __FILE__ << " " << __func__  << " Compute Time " << DhopComputeTime << std::endl;
 }
 /*CHANGE END*/
@@ -258,10 +258,10 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge
    ////////////////////////
    // Call the single hop
    ////////////////////////
-    auto U_v   = U.View();
+    autoView( U_v   , U, CpuRead);
-    auto UUU_v = UUU.View();
+    autoView( UUU_v , UUU, CpuRead);
-    auto B_v   = B.View();
+    autoView( B_v      , B, CpuWrite);
-    auto Btilde_v   = Btilde.View();
+    autoView( Btilde_v , Btilde, CpuWrite);
    thread_for(sss,B.Grid()->oSites(),{
      Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
    });
@@ -386,10 +386,10 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel
  Compressor compressor;
  Stencil.HaloExchange(in, compressor);
-  auto Umu_v   =   Umu.View();
+  autoView( Umu_v   ,   Umu, CpuRead);
-  auto UUUmu_v = UUUmu.View();
+  autoView( UUUmu_v , UUUmu, CpuRead);
-  auto in_v    =  in.View();
+  autoView( in_v    ,  in, CpuRead);
-  auto out_v   = out.View();
+  autoView( out_v   , out, CpuWrite);
  thread_for( sss, in.Grid()->oSites(),{
    Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
  });
@@ -403,11 +403,9 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder
 						  const FermionField &in,
 						  FermionField &out, int dag) 
 {
 #ifdef GRID_OMP
  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
  else
 #endif
    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
 }
 template <class Impl>
@@ -417,7 +415,6 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
 								 const FermionField &in,
 								 FermionField &out, int dag) 
 {
 #ifdef GRID_OMP
  Compressor compressor; 
  int len =  U.Grid()->oSites();
@@ -426,60 +423,30 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
  DhopFaceTime    -= usecond();
  st.Prepare();
  st.HaloGather(in,compressor);
  DhopFaceTime    += usecond();
  DhopCommTime -=usecond();
  std::vector<std::vector<CommsRequest_t> > requests;
  st.CommunicateBegin(requests);
  DhopFaceTime-=usecond();
  st.CommsMergeSHM(compressor);
  DhopFaceTime+= usecond();
  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Ugly explicit thread mapping introduced for OPA reasons.
+  // Removed explicit thread comms
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  DhopComputeTime    -= usecond();
 #pragma omp parallel 
  {
-    int tid = omp_get_thread_num();
+    int interior=1;
-    int nthreads = omp_get_num_threads();
+    int exterior=0;
-    int ncomms = CartesianCommunicator::nCommThreads;
+    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
    if (ncomms == -1) ncomms = 1;
    assert(nthreads > ncomms);
    if (tid >= ncomms) {
      nthreads -= ncomms;
      int ttid  = tid - ncomms;
      int n     = len;
      int chunk = n / nthreads;
      int rem   = n % nthreads;
      int myblock, myn;
      if (ttid < rem) {
        myblock = ttid * chunk + ttid;
        myn = chunk+1;
      } else {
        myblock = ttid*chunk + rem;
        myn = chunk;
      }
      // do the compute
      auto U_v   = U.View();
      auto UUU_v = UUU.View();
      auto in_v  = in.View();
      auto out_v = out.View();
      if (dag == DaggerYes) {
        for (int ss = myblock; ss < myblock+myn; ++ss) {
          int sU = ss;
 	  // Interior = 1; Exterior = 0; must implement for staggered
          Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0); 
        }
      } else {
        for (int ss = myblock; ss < myblock+myn; ++ss) {
 	  // Interior = 1; Exterior = 0;
          int sU = ss;
          Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
        }
      }
    } else {
      st.CommunicateThreaded();
    }
  }
  DhopComputeTime    += usecond();
  st.CommunicateComplete(requests);
  DhopCommTime +=usecond();
  // First to enter, last to leave timing
  DhopFaceTime    -= usecond();
  st.CommsMerge(compressor);
@@ -487,28 +454,11 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
  DhopComputeTime2    -= usecond();
  {
-    auto U_v   = U.View();
+    int interior=0;
-    auto UUU_v = UUU.View();
+    int exterior=1;
-    auto in_v  = in.View();
+    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
    auto out_v = out.View();
    if (dag == DaggerYes) {
      int sz=st.surface_list.size();
      thread_for(ss,sz,{
 	int sU = st.surface_list[ss];
 	Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
      });
    } else {
      int sz=st.surface_list.size();
      thread_for(ss,sz,{
 	int sU = st.surface_list[ss];
 	Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
      });
    }
  }
  DhopComputeTime2    += usecond();
 #else
  assert(0);
 #endif
 }
@@ -528,19 +478,11 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
  st.HaloExchange(in, compressor);
  DhopCommTime    += usecond();
  auto U_v   =   U.View();
  auto UUU_v = UUU.View();
  auto in_v  =  in.View();
  auto out_v = out.View();
  DhopComputeTime -= usecond();
-  if (dag == DaggerYes) {
+  {
-    thread_for(sss, in.Grid()->oSites(),{
+    int interior=1;
-      Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
+    int exterior=1;
-    });
+    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  } else {
    thread_for(sss, in.Grid()->oSites(),{
      Kernels::DhopSite(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
    });
  }
  DhopComputeTime += usecond();
  DhopTotalTime   += usecond();
@@ -44,9 +44,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
  chi_i.Checkerboard() = psi_i.Checkerboard();
  GridBase *grid = psi_i.Grid();
  int Ls = this->Ls;
-  auto psi = psi_i.View();
+  autoView(psi , psi_i, AcceleratorRead);
-  auto phi = phi_i.View();
+  autoView(phi , phi_i, AcceleratorRead);
-  auto chi = chi_i.View();
+  autoView(chi , chi_i, AcceleratorWrite);
  assert(phi.Checkerboard() == psi.Checkerboard());
@@ -84,9 +84,9 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
  chi_i.Checkerboard() = psi_i.Checkerboard();
  GridBase *grid = psi_i.Grid();
  int Ls = this->Ls;
-  auto psi = psi_i.View();
+  autoView(psi , psi_i, AcceleratorRead);
-  auto phi = phi_i.View();
+  autoView(phi , phi_i, AcceleratorRead);
-  auto chi = chi_i.View();
+  autoView(chi , chi_i, AcceleratorWrite);
  auto pm  = this->pm;
  int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
@@ -132,9 +132,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
  chi_i.Checkerboard() = psi_i.Checkerboard();
  GridBase *grid = psi_i.Grid();
  int Ls = this->Ls;
-  auto psi = psi_i.View();
+  autoView(psi , psi_i, AcceleratorRead);
-  auto phi = phi_i.View();
+  autoView(phi , phi_i, AcceleratorRead);
-  auto chi = chi_i.View();
+  autoView(chi , chi_i, AcceleratorWrite);
  assert(phi.Checkerboard() == psi.Checkerboard());
@@ -174,9 +174,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
  GridBase *grid = psi_i.Grid();
  int Ls = this->Ls;
  int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
-  auto psi = psi_i.View();
+  autoView(psi , psi_i, AcceleratorRead);
-  auto phi = phi_i.View();
+  autoView(phi , phi_i, AcceleratorRead);
-  auto chi = chi_i.View();
+  autoView(chi , chi_i, AcceleratorWrite);
  assert(phi.Checkerboard() == psi.Checkerboard());
@@ -226,8 +226,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
  chi_i.Checkerboard() = psi_i.Checkerboard();
  GridBase *grid = psi_i.Grid();
  int Ls = this->Ls;
-  auto psi = psi_i.View();
+  autoView(psi , psi_i, AcceleratorRead);
-  auto chi = chi_i.View();
+  autoView(chi , chi_i, AcceleratorWrite);
  auto plee = & this->lee [0];
  auto pdee = & this->dee [0];
@@ -286,8 +286,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
  chi_i.Checkerboard() = psi_i.Checkerboard();
  GridBase *grid = psi_i.Grid();
  int Ls = this->Ls;
-  auto psi = psi_i.View();
+  autoView(psi , psi_i, AcceleratorRead);
-  auto chi = chi_i.View();
+  autoView(chi , chi_i, AcceleratorWrite);
  auto pm = this->pm;
  auto plee = & this->lee [0];
@@ -354,8 +354,8 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
  chi_i.Checkerboard() = psi_i.Checkerboard();
  GridBase *grid = psi_i.Grid();
  int Ls = this->Ls;
-  auto psi = psi_i.View();
+  autoView(psi , psi_i, AcceleratorRead);
-  auto chi = chi_i.View();
+  autoView(chi , chi_i, AcceleratorWrite);
  auto plee = & this->lee [0];
  auto pdee = & this->dee [0];
@@ -410,8 +410,8 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
 {
  chi_i.Checkerboard() = psi_i.Checkerboard();
  GridBase *grid = psi_i.Grid();
-  auto psi = psi_i.View();
+  autoView(psi , psi_i, AcceleratorRead);
-  auto chi = chi_i.View();
+  autoView(chi , chi_i, AcceleratorWrite);
  int Ls = this->Ls;
  auto pm = this->pm;
@@ -0,0 +1,499 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
 Copyright (C) 2015
 Author: Azusa Yamaguchi, Peter Boyle
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #pragma once 
 NAMESPACE_BEGIN(Grid);
 /////////////////////////////////
 // Constructor and gauge import
 /////////////////////////////////
 // Construct the operator on a full grid (Fgrid) and its red-black partner (Hgrid).
 // Stores the mass and the c1/u0 coefficients and calls BuildSurfaceList on each of
 // the three stencils. No gauge field is imported by this overload.
 template <class Impl>
 NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid,
                                                    RealD _mass,
                                                    RealD _c1, RealD _u0,
                                                    const ImplParams &p)
   : Kernels(p),
     _grid(&Fgrid),
     _cbgrid(&Hgrid),
     Stencil(&Fgrid, npoint, Even, directions, displacements,p),
     StencilEven(&Hgrid, npoint, Even, directions, displacements,p),  // source is Even
     StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p),    // source is Odd
     mass(_mass),
     Lebesgue(_grid),
     LebesgueEvenOdd(_cbgrid),
     Umu(&Fgrid),
     UmuEven(&Hgrid),
     UmuOdd(&Hgrid),
     _tmp(&Hgrid)
 {
   int LLs = 1;

   c1 = _c1;
   u0 = _u0;

   // Full-grid stencil uses the outer-site count of the full grid.
   int full_sites = _grid->oSites();
   Stencil.BuildSurfaceList(LLs, full_sites);

   // Both checkerboarded stencils use the outer-site count of the half grid.
   int half_sites = _cbgrid->oSites();
   StencilEven.BuildSurfaceList(LLs, half_sites);
   StencilOdd.BuildSurfaceList(LLs, half_sites);
 }
 // Convenience constructor: delegates to the grid-only constructor and then
 // imports the supplied gauge field via ImportGauge.
 template <class Impl>
 NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GaugeField &_U,
                                                    GridCartesian &Fgrid,
                                                    GridRedBlackCartesian &Hgrid,
                                                    RealD _mass,
                                                    RealD _c1, RealD _u0,
                                                    const ImplParams &p)
   : NaiveStaggeredFermion(Fgrid, Hgrid, _mass, _c1, _u0, p)
 {
   ImportGauge(_U);
 }
 ////////////////////////////////////////////////////////////
 // Momentum space propagator should be 
 // https://arxiv.org/pdf/hep-lat/9712010.pdf
 //
 // mom space action.
 //   gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m
 //
 // must track through staggered flavour/spin reduction in literature to 
 // turn to free propagator for the one component chi field, a la page 4/5
 // of above link to implement fourier based solver.
 ////////////////////////////////////////////////////////////
 // Refresh the checkerboarded gauge copies (UmuEven / UmuOdd) from the full field Umu.
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::CopyGaugeCheckerboards(void)
 {
   pickCheckerboard(Even, UmuEven, Umu);  // even sites of Umu -> UmuEven
   pickCheckerboard(Odd,  UmuOdd,  Umu);  // odd  sites of Umu -> UmuOdd
 }
 // Import a gauge field: double-store it into Umu, rescale the links with the
 // hopping coefficient, and refresh the even/odd checkerboard copies.
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::ImportGauge(const GaugeField &_U)
 {
   GaugeLinkField link(GaugeGrid());
   DoubledGaugeField _UUU(GaugeGrid());

   ////////////////////////////////////////////////////////
   // Double Store should take two fields for Naik and one hop separately.
   // Discard the Naik as Naive: _UUU is filled by DoubleStore but not used afterwards.
   ////////////////////////////////////////////////////////
   Impl::DoubleStore(GaugeGrid(), _UUU, Umu, _U, _U );

   ////////////////////////////////////////////////////////
   // Apply scale factors to get the right fermion Kinetic term
   // Could pass coeffs into the double store to save work.
   // 0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) )
   ////////////////////////////////////////////////////////
   const auto hop = 0.5*c1/u0;  // entries mu get +hop, entries mu+4 get -hop
   for (int mu = 0; mu < Nd; mu++) {
     const int fwd = mu;
     const int bwd = mu+4;

     link = PeekIndex<LorentzIndex>(Umu, fwd);
     PokeIndex<LorentzIndex>(Umu, link*hop, fwd);

     link = PeekIndex<LorentzIndex>(Umu, bwd);
     PokeIndex<LorentzIndex>(Umu, link*(-hop), bwd);
   }

   CopyGaugeCheckerboards();
 }
 /////////////////////////////
 // Implement the interface
 /////////////////////////////
 // out = Dhop(in) + mass * in   (no dagger)
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out)
 {
   out.Checkerboard() = in.Checkerboard();

   // Hopping term first; the mass term is then added via axpy(out, mass, in, out).
   Dhop(in, out, DaggerNo);
   axpy(out, mass, in, out);
 }
 // Same as M but with the hopping term applied with DaggerYes.
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
 {
   out.Checkerboard() = in.Checkerboard();

   // Daggered hopping term first; the mass term is then added via axpy.
   Dhop(in, out, DaggerYes);
   axpy(out, mass, in, out);
 }
 // Off-diagonal (checkerboard-changing) part: dispatch on the input checkerboard.
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out)
 {
   const bool source_is_odd = (in.Checkerboard() == Odd);
   if (!source_is_odd) {
     DhopOE(in, out, DaggerNo);  // any non-Odd source goes through DhopOE
     return;
   }
   DhopEO(in, out, DaggerNo);    // Odd source goes through DhopEO
 }
 // Daggered off-diagonal part: same dispatch as Meooe, with DaggerYes.
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out)
 {
   const bool source_is_odd = (in.Checkerboard() == Odd);
   if (!source_is_odd) {
     DhopOE(in, out, DaggerYes);  // any non-Odd source goes through DhopOE
     return;
   }
   DhopEO(in, out, DaggerYes);    // Odd source goes through DhopEO
 }
 // Diagonal (checkerboard-preserving) part: out = mass * in.
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
 {
   out.Checkerboard() = in.Checkerboard();

   // Convert the mass to the field's scalar type before multiplying.
   typename FermionField::scalar_type mass_scalar(mass);
   out = mass_scalar * in;
 }
 // Daggered diagonal part: forwards unchanged to Mooee.
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
 {
   out.Checkerboard() = in.Checkerboard();
   Mooee(in, out);
 }
 // Inverse of the diagonal part: out = (1/mass) * in.
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
 {
   out.Checkerboard() = in.Checkerboard();

   const auto inverse_mass = 1.0 / (mass);
   out = inverse_mass * in;
 }
 // Daggered inverse of the diagonal part: forwards unchanged to MooeeInv.
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
 {
   out.Checkerboard() = in.Checkerboard();
   MooeeInv(in, out);
 }
 ///////////////////////////////////
 // Internal
 ///////////////////////////////////
 // Force (derivative) helper shared by DhopDeriv / DhopDerivOE / DhopDerivEO.
 //
 // NOT IMPLEMENTED: after the first single-hop sweep the routine hits assert(0),
 // and `mat` is never written. The code below is scaffolding only.
 //
 //   st   stencil used for the halo exchange and the per-site kernel
 //   U    doubled gauge field; passed for both gauge arguments of DhopDirKernel
 //   mat  output force field (currently untouched)
 //   A,B  fermion fields; B is halo-exchanged and read, A is only copied
 //   dag  must be DaggerNo or DaggerYes
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
                                                 GaugeField & mat,
                                                 const FermionField &A, const FermionField &B, int dag)
 {
   assert((dag == DaggerNo) || (dag == DaggerYes));

   Compressor compressor;

   FermionField Btilde(B.Grid());
   FermionField Atilde(B.Grid());
   Atilde = A;

   st.HaloExchange(B, compressor);

   for (int mu = 0; mu < Nd; mu++) {

     ////////////////////////
     // Call the single hop
     ////////////////////////
     autoView( U_v      , U, CpuRead);
     // B is a const input that the kernel only reads (it is bound to the
     // `const FermionFieldView &in` parameter), so open it read-only rather
     // than with a write view.
     autoView( B_v      , B, CpuRead);
     autoView( Btilde_v , Btilde, CpuWrite);
     thread_for(sss,B.Grid()->oSites(),{
       Kernels::DhopDirKernel(st, U_v, U_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
     });

     assert(0);// need to figure out the force interface with a blasted three link term.
   }
 }
 // Full-grid force entry point: checks the grids, tags the output checkerboard
 // and forwards to DerivInternal with the full-grid stencil and gauge field.
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
 {
   // U must live on the full grid, and V / mat on the same grid as U.
   conformable(U.Grid(), _grid);
   conformable(U.Grid(), V.Grid());
   conformable(U.Grid(), mat.Grid());

   mat.Checkerboard() = U.Checkerboard();

   DerivInternal(Stencil, Umu, mat, U, V, dag);
 }
 // Half-grid force entry point with V on Even sites and U on Odd sites;
 // the result is tagged Odd.
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
 {
   // U must live on the checkerboarded grid, and V / mat on the same grid as U.
   conformable(U.Grid(), _cbgrid);
   conformable(U.Grid(), V.Grid());
   conformable(U.Grid(), mat.Grid());

   // Required parities of the two fermion fields.
   assert(V.Checkerboard() == Even);
   assert(U.Checkerboard() == Odd);

   mat.Checkerboard() = Odd;

   DerivInternal(StencilEven, UmuOdd, mat, U, V, dag);
 }
 // Half-grid force entry point with V on Odd sites and U on Even sites;
 // the result is tagged Even.
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
 {
   // U must live on the checkerboarded grid, and V / mat on the same grid as U.
   conformable(U.Grid(), _cbgrid);
   conformable(U.Grid(), V.Grid());
   conformable(U.Grid(), mat.Grid());

   // Required parities of the two fermion fields.
   assert(V.Checkerboard() == Odd);
   assert(U.Checkerboard() == Even);

   mat.Checkerboard() = Even;

   DerivInternal(StencilOdd, UmuEven, mat, U, V, dag);
 }
 // Full-grid hopping term. Counts as two calls in the DhopCalls counter
 // (the half-grid variants DhopOE / DhopEO count as one each).
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
 {
   DhopCalls += 2;

   conformable(in.Grid(), _grid);       // verifies full grid
   conformable(in.Grid(), out.Grid());

   out.Checkerboard() = in.Checkerboard();

   DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
 }
 // Half-grid hopping term taking an Even source to an Odd result.
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
 {
   DhopCalls += 1;

   conformable(in.Grid(), _cbgrid);     // verifies half grid
   conformable(in.Grid(), out.Grid());  // drops the cb check

   assert(in.Checkerboard() == Even);
   out.Checkerboard() = Odd;

   DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
 }
 // Half-grid hopping term taking an Odd source to an Even result.
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag)
 {
   DhopCalls += 1;

   conformable(in.Grid(), _cbgrid);     // verifies half grid
   conformable(in.Grid(), out.Grid());  // drops the cb check

   assert(in.Checkerboard() == Odd);
   out.Checkerboard() = Even;

   DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
 }
 // Single-direction operator: a plain forward to DhopDir
 // (which currently terminates in assert(0)).
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out,
                                        int dir, int disp)
 {
   DhopDir(in, out, dir, disp);
 }
 // All-directions operator: placeholder that always asserts.
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::MdirAll(const FermionField &in,
                                           std::vector<FermionField> &out)
 {
   assert(0); // Not implemented yet
 }
 // Single-direction hopping term. NOT IMPLEMENTED: the halo exchange is performed
 // and the views are opened, but the kernel call is commented out and the routine
 // ends in assert(0); `out` is never written.
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out,
                                           int dir, int disp)
 {
   Compressor compressor;
   Stencil.HaloExchange(in, compressor);

   autoView( Umu_v   ,  Umu, CpuRead);
   autoView( in_v    ,  in, CpuRead);
   autoView( out_v   , out, CpuWrite);

   // Disabled kernel call, kept for reference:
   //  thread_for( sss, in.Grid()->oSites(),{
   //    Kernels::DhopDirKernel(Stencil, Umu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
   //  });
   assert(0);
 }
 // Select the comms strategy for the hopping term from the static
 // StaggeredKernelsStatic::Comms setting.
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
                                                DoubledGaugeField &U,
                                                const FermionField &in,
                                                FermionField &out, int dag)
 {
   const bool overlap = ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute );

   if ( overlap ) {
     DhopInternalOverlappedComms(st,lo,U,in,out,dag);
     return;
   }

   // Every other setting uses the serial (exchange, then compute) path.
   DhopInternalSerialComms(st,lo,U,in,out,dag);
 }
 // Hopping term with communication overlapped with computation.
 //
 // Sequence: gather halos -> begin comms -> merge shared-memory buffers ->
 // interior kernel -> complete comms -> merge received buffers -> exterior kernel.
 //
 // Timers (all accumulate usecond() differences):
 //   DhopTotalTime    whole routine
 //   DhopFaceTime     Prepare/HaloGather, CommsMergeSHM and CommsMerge
 //   DhopCommTime     CommunicateBegin .. CommunicateComplete (encloses the interior kernel)
 //   DhopComputeTime  interior kernel
 //   DhopComputeTime2 exterior kernel
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
                                                               DoubledGaugeField &U,
                                                               const FermionField &in,
                                                               FermionField &out, int dag)
 {
   Compressor compressor;

   DhopTotalTime   -= usecond();

   DhopFaceTime    -= usecond();
   st.Prepare();
   st.HaloGather(in,compressor);
   DhopFaceTime    += usecond();

   DhopCommTime -=usecond();
   std::vector<std::vector<CommsRequest_t> > requests;
   st.CommunicateBegin(requests);

   DhopFaceTime-=usecond();
   st.CommsMergeSHM(compressor);
   DhopFaceTime+= usecond();

   //////////////////////////////////////////////////////////////////////////////////////////////////////
   // Removed explicit thread comms
   //////////////////////////////////////////////////////////////////////////////////////////////////////
   DhopComputeTime    -= usecond();
   {
     int interior=1;
     int exterior=0;
     Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
   }
   DhopComputeTime    += usecond();

   st.CommunicateComplete(requests);
   DhopCommTime +=usecond();

   // First to enter, last to leave timing
   DhopFaceTime    -= usecond();
   st.CommsMerge(compressor);
   DhopFaceTime    += usecond();  // close the interval (was '-=', which corrupted the counter)

   DhopComputeTime2    -= usecond();
   {
     int interior=0;
     int exterior=1;
     Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
   }
   DhopComputeTime2    += usecond();

   // Close the total-time interval opened at the top, as the serial path does.
   DhopTotalTime   += usecond();
 }
 // Hopping term without comms overlap: a blocking halo exchange followed by one
 // kernel sweep that handles interior and exterior contributions together.
 // DhopCommTime covers the exchange, DhopComputeTime the kernel, and
 // DhopTotalTime the whole routine.
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
                                                           DoubledGaugeField &U,
                                                           const FermionField &in,
                                                           FermionField &out, int dag)
 {
   assert((dag == DaggerNo) || (dag == DaggerYes));

   DhopTotalTime   -= usecond();

   // --- communication phase ---
   DhopCommTime    -= usecond();
   Compressor compressor;
   st.HaloExchange(in, compressor);
   DhopCommTime    += usecond();

   // --- compute phase ---
   DhopComputeTime -= usecond();
   {
     int interior=1;
     int exterior=1;
     Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
   }
   DhopComputeTime += usecond();

   DhopTotalTime   += usecond();
 }
  ////////////////////////////////////////////////////////////////
  // Reporting
  ////////////////////////////////////////////////////////////////
 // Print the accumulated Dhop call count, per-call timings and derived flop
 // rates, then the reports of the three stencils.
 //
 // The compute time is combined across ranks through _grid->GlobalSum on a local
 // copy, so the DhopComputeTime member is left untouched; Report() can be called
 // repeatedly, or followed by more Dhop calls, without corrupting the counter.
 // If DhopCalls is zero the per-call figures are the result of a division by zero.
 template<class Impl>
 void NaiveStaggeredFermion<Impl>::Report(void)
 {
   Coordinate latt = _grid->GlobalDimensions();
   RealD volume = 1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
   RealD NP = _grid->_Nprocessors;
   RealD NN = _grid->NodeCount();

   std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;

   std::cout << GridLogMessage << "NaiveStaggeredFermion Number of DhopEO Calls   : "
             << DhopCalls   << std::endl;
   std::cout << GridLogMessage << "NaiveStaggeredFermion TotalTime   /Calls       : "
             << DhopTotalTime   / DhopCalls << " us" << std::endl;
   std::cout << GridLogMessage << "NaiveStaggeredFermion CommTime    /Calls       : "
             << DhopCommTime    / DhopCalls << " us" << std::endl;
   std::cout << GridLogMessage << "NaiveStaggeredFermion ComputeTime/Calls        : "
             << DhopComputeTime / DhopCalls << " us" << std::endl;

   // Average the compute time over ranks, working on a copy of the counter.
   RealD avgComputeTime = DhopComputeTime;
   _grid->GlobalSum(avgComputeTime);
   avgComputeTime/=NP;

   RealD mflops = 1154*volume*DhopCalls/avgComputeTime/2; // 2 for red black counting
   std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
   std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
   std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;

   RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
   std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
   std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
   std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;

   std::cout << GridLogMessage << "NaiveStaggeredFermion Stencil"    <<std::endl;  Stencil.Report();
   std::cout << GridLogMessage << "NaiveStaggeredFermion StencilEven"<<std::endl;  StencilEven.Report();
   std::cout << GridLogMessage << "NaiveStaggeredFermion StencilOdd" <<std::endl;  StencilOdd.Report();
 }
 // Reset every Dhop performance counter of this operator and of its three stencils.
 template<class Impl>
 void NaiveStaggeredFermion<Impl>::ZeroCounters(void)
 {
   DhopCalls        = 0;
   DhopTotalTime    = 0;
   DhopCommTime     = 0;
   DhopComputeTime  = 0;
   DhopComputeTime2 = 0;  // exterior-kernel timer accumulated in DhopInternalOverlappedComms
   DhopFaceTime     = 0;

   Stencil.ZeroCounters();
   StencilEven.ZeroCounters();
   StencilOdd.ZeroCounters();
 }
 //////////////////////////////////////////////////////// 
 // Conserved current - not yet implemented.
 ////////////////////////////////////////////////////////
 // Conserved-current contraction: placeholder that always asserts; no argument is used.
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
                                                            PropagatorField &q_in_2,
                                                            PropagatorField &q_out,
                                                            PropagatorField &src,
                                                            Current curr_type,
                                                            unsigned int mu)
 {
   assert(0);  // not yet implemented
 }
 // Sequential conserved current: placeholder that always asserts; no argument is used.
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
                                                       PropagatorField &q_out,
                                                       PropagatorField &src,
                                                       Current curr_type,
                                                       unsigned int mu,
                                                       unsigned int tmin,
                                                       unsigned int tmax,
                                                       ComplexField &lattice_cmplx)
 {
   assert(0);  // not yet implemented
 }
 NAMESPACE_END(Grid);
@@ -618,10 +618,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
+void StaggeredKernels<Impl>::DhopSiteAsm(StencilView &st,
 					 DoubledGaugeFieldView &U,
 					 DoubledGaugeFieldView &UUU,
-					 SiteSpinor *buf, int LLs,
+					 SiteSpinor *buf, int sF,
 					 int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
  assert(0);
@@ -680,12 +680,13 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
  gauge2 =(uint64_t)&UU[sU]( Z );				\
  gauge3 =(uint64_t)&UU[sU]( T ); 
  // This is the single precision 5th direction vectorised kernel
 #include <Grid/simd/Intel512single.h>
-template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
+template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView &st,
 								    DoubledGaugeFieldView &U,
 								    DoubledGaugeFieldView &UUU,
-								    SiteSpinor *buf, int LLs,
+								    SiteSpinor *buf, int sF,
 								    int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
 #ifdef AVX512
@@ -702,9 +703,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
  StencilEntry *SE2;
  StencilEntry *SE3;
-   for(int s=0;s<LLs;s++){
+  //   for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
+  //    int sF=s+LLs*sU;
  {
    // Xp, Yp, Zp, Tp
    PREPARE(Xp,Yp,Zp,Tp,0,U);
    LOAD_CHI(addr0,addr1,addr2,addr3);
@@ -736,10 +738,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
 }
 #include <Grid/simd/Intel512double.h>
-template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
+template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView &st, 
 								    DoubledGaugeFieldView &U,
 								    DoubledGaugeFieldView &UUU,
-								    SiteSpinor *buf, int LLs,
+								    SiteSpinor *buf, int sF,
 								    int sU, const FermionFieldView &in, FermionFieldView &out, int dag) 
 {
 #ifdef AVX512
@@ -756,8 +758,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
  StencilEntry *SE2;
  StencilEntry *SE3;
-  for(int s=0;s<LLs;s++){
+  //  for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
+  //    int sF=s+LLs*sU;
  {
    // Xp, Yp, Zp, Tp
    PREPARE(Xp,Yp,Zp,Tp,0,U);
    LOAD_CHI(addr0,addr1,addr2,addr3);
@@ -821,10 +824,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
  // This is the single precision 5th direction vectorised kernel
 #include <Grid/simd/Intel512single.h>
-template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
+template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilView &st, 
 							       DoubledGaugeFieldView &U,
 							       DoubledGaugeFieldView &UUU,
-							       SiteSpinor *buf, int LLs,
+							       SiteSpinor *buf, int sF,
 							       int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
 #ifdef AVX512
@@ -841,9 +844,9 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
  StencilEntry *SE2;
  StencilEntry *SE3;
-  for(int s=0;s<LLs;s++){
+  //  for(int s=0;s<LLs;s++){
-    
+  //    int sF=s+LLs*sU;
-    int sF=s+LLs*sU;
+  {
    // Xp, Yp, Zp, Tp
    PREPARE(Xp,Yp,Zp,Tp,0,U);
    LOAD_CHIa(addr0,addr1);
@@ -890,10 +893,10 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
 }
 #include <Grid/simd/Intel512double.h>
-template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
+template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilView &st, 
 							       DoubledGaugeFieldView &U,
 							       DoubledGaugeFieldView &UUU,
-							       SiteSpinor *buf, int LLs,
+							       SiteSpinor *buf, int sF,
 							       int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
 #ifdef AVX512
@@ -910,9 +913,9 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
  StencilEntry *SE2;
  StencilEntry *SE3;
-  for(int s=0;s<LLs;s++){
+  //  for(int s=0;s<LLs;s++){
-    
+  //    int sF=s+LLs*sU;
-    int sF=s+LLs*sU;
+  {
    // Xp, Yp, Zp, Tp
    PREPARE(Xp,Yp,Zp,Tp,0,U);
    LOAD_CHIa(addr0,addr1);
@@ -146,9 +146,10 @@ NAMESPACE_BEGIN(Grid);
 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, 
+template <int Naik> accelerator_inline
 void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
 					  DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
-					  SiteSpinor *buf, int LLs, int sU, 
+					  SiteSpinor *buf, int sF, int sU, 
 					  const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
  typedef typename Simd::scalar_type S;
@@ -181,8 +182,9 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
  StencilEntry *SE;
  int skew;
-  for(int s=0;s<LLs;s++){
+  //  for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
+  //    int sF=s+LLs*sU;
  {
    skew = 0;
    HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);  
@@ -193,6 +195,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
    HAND_STENCIL_LEG      (U,Ym,2,skew,odd);   
    HAND_STENCIL_LEG      (U,Zm,1,skew,even);  
    HAND_STENCIL_LEG      (U,Tm,0,skew,odd);  
    if (Naik) {
    skew = 8;
    HAND_STENCIL_LEG(UUU,Xp,3,skew,even);  
    HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);   
@@ -202,7 +205,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
    HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);   
    HAND_STENCIL_LEG(UUU,Zm,1,skew,even);  
    HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);  
-    
+    }    
    if ( dag ) {
      result()()(0) = - even_0 - odd_0;
      result()()(1) = - even_1 - odd_1;
@@ -218,9 +221,10 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, 
+template <int Naik> accelerator_inline
 void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st, 
 					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-					     SiteSpinor *buf, int LLs, int sU, 
+					     SiteSpinor *buf, int sF, int sU, 
 					     const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
  typedef typename Simd::scalar_type S;
@@ -253,8 +257,9 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
  StencilEntry *SE;
  int skew;
-  for(int s=0;s<LLs;s++){
+  //  for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
+  //    int sF=s+LLs*sU;
  {
    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
@@ -268,6 +273,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
    HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);   
    HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);  
    HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);  
    if (Naik) {
    skew = 8;
    HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);  
    HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);   
@@ -277,7 +283,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
    HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);   
    HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);  
    HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);  
-
+    }
    // Assume every site must be connected to at least one interior point. No 1^4 subvols.
    if ( dag ) {
      result()()(0) = - even_0 - odd_0;
@@ -294,9 +300,10 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, 
+template <int Naik> accelerator_inline
 void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
 					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-					     SiteSpinor *buf, int LLs, int sU, 
+					     SiteSpinor *buf, int sF, int sU, 
 					     const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
  typedef typename Simd::scalar_type S;
@@ -329,8 +336,9 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
  StencilEntry *SE;
  int skew;
-  for(int s=0;s<LLs;s++){
+  //  for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
+  //    int sF=s+LLs*sU;
  {
    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
@@ -344,6 +352,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
    HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);   
    HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);  
    HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);  
    if (Naik) {
    skew = 8;
    HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);  
    HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);   
@@ -353,7 +362,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
    HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);   
    HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);  
    HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);  
-
+    }
    // Add sum of all exterior connected stencil legs
    if ( nmu ) { 
      if ( dag ) {
@@ -370,6 +379,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
  }
 }
 /*
 #define DHOP_SITE_HAND_INSTANTIATE(IMPL)				\
  template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
 						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
@@ -385,7 +395,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
 						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
 						     SiteSpinor *buf, int LLs, int sU, \
 						     const FermionFieldView &in, FermionFieldView &out, int dag); \
-
+*/
 #undef LOAD_CHI
 NAMESPACE_END(Grid);
@@ -78,10 +78,12 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
 // Int, Ext, Int+Ext cases for comms overlap
 ////////////////////////////////////////////////////////////////////////////////////
 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, 
+template <int Naik> accelerator_inline
 void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st, 
 					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-					     SiteSpinor *buf, int LLs, int sU, 
+					     SiteSpinor *buf, int sF, int sU, 
-					     const FermionFieldView &in, FermionFieldView &out, int dag) {
+					     const FermionFieldView &in, FermionFieldView &out, int dag) 
 {
  const SiteSpinor *chi_p;
  SiteSpinor chi;
  SiteSpinor Uchi;
@@ -89,8 +91,10 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
  int ptype;
  int skew;
-  for(int s=0;s<LLs;s++){
+  //  for(int s=0;s<LLs;s++){
-    int sF=LLs*sU+s;
+  //
  //    int sF=LLs*sU+s;
  {
    skew = 0;
    GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
    GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
@@ -100,6 +104,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
    GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
    if ( Naik ) {
    skew=8;
    GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
@@ -109,6 +114,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
    GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
    }
    if ( dag ) { 
      Uchi = - Uchi;
    } 
@@ -120,9 +126,10 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
  // Only contributions from interior of our node
  ///////////////////////////////////////////////////
 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, 
+template <int Naik> accelerator_inline
 void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st, 
 						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-						SiteSpinor *buf, int LLs, int sU, 
+						SiteSpinor *buf, int sF, int sU, 
 						const FermionFieldView &in, FermionFieldView &out,int dag) {
  const SiteSpinor *chi_p;
  SiteSpinor chi;
@@ -131,8 +138,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
  int ptype;
  int skew ;
-  for(int s=0;s<LLs;s++){
+  //  for(int s=0;s<LLs;s++){
-    int sF=LLs*sU+s;
+  //    int sF=LLs*sU+s;
  {
    skew = 0;
    Uchi=Zero();
    GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
@@ -143,6 +151,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
    GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
    if ( Naik ) {
    skew=8;
    GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
@@ -152,6 +161,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
    GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
    }
    if ( dag ) {
      Uchi = - Uchi;
    }
@@ -164,9 +174,10 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
  // Only contributions from exterior of our node
  ///////////////////////////////////////////////////
 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, 
+template <int Naik> accelerator_inline
 void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st, 
 						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-						SiteSpinor *buf, int LLs, int sU,
+						SiteSpinor *buf, int sF, int sU,
 						const FermionFieldView &in, FermionFieldView &out,int dag) {
  const SiteSpinor *chi_p;
  //  SiteSpinor chi;
@@ -176,8 +187,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
  int nmu=0;
  int skew ;
-  for(int s=0;s<LLs;s++){
+  //  for(int s=0;s<LLs;s++){
-    int sF=LLs*sU+s;
+  //    int sF=LLs*sU+s;
  {
    skew = 0;
    Uchi=Zero();
    GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
@@ -188,6 +200,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
    GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
    if ( Naik ) {
    skew=8;
    GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
@@ -197,7 +210,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
    GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
-
+    }
    if ( nmu ) { 
      if ( dag ) { 
 	out[sF] = out[sF] - Uchi;
@@ -211,72 +224,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
 ////////////////////////////////////////////////////////////////////////////////////
 // Driving / wrapping routine to select right kernel
 ////////////////////////////////////////////////////////////////////////////////////
 template <class Impl> 
-void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
+void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
-					 SiteSpinor *buf, int LLs, int sU,
+					   int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp)
 					 const FermionFieldView &in, FermionFieldView &out,
 					 int interior,int exterior)
 {
  int dag=1;
  DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
 };
 // Non-daggered convenience overload: forwards every argument unchanged to the
 // twelve-argument DhopSite overload with the dagger flag fixed at 0.
 template <class Impl>
 void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo,
 				      DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 				      SiteSpinor *buf, int LLs, int sU,
 				      const FermionFieldView &in, FermionFieldView &out,
 				      int interior, int exterior)
 {
   const int notDaggered = 0;
   DhopSite(st, lo, U, UUU, buf, LLs, sU, in, out, notDaggered, interior, exterior);
 };
 // Per-site kernel dispatcher: the Opt setting picks the implementation family
 // (inline assembly, hand-unrolled or generic) and the interior/exterior flags
 // pick the variant inside that family. An unrecognised Opt value is reported on
 // stdout and trips an assert.
 template <class Impl>
 void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo,
 				      DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 				      SiteSpinor *buf, int LLs,
 				      int sU, const FermionFieldView &in, FermionFieldView &out,
 				      int dag, int interior, int exterior)
 {
   // Reduce the two flags to three mutually exclusive cases. When both flags are
   // zero none of them holds and the hand-unrolled / generic branches do nothing.
   const bool wholeSite    = interior && exterior;
   const bool interiorOnly = interior && !exterior;
   const bool exteriorOnly = exterior && !interior;

   switch(Opt) {
 #ifdef AVX512
   case OptInlineAsm:
     // Only the combined interior+exterior form is dispatched to assembly.
     if ( !wholeSite ) {
       std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
       assert(0);
     } else {
       DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
     }
     break;
 #endif
   case OptHandUnroll:
     if ( wholeSite ) {
       DhopSiteHand   (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
     } else if ( interiorOnly ) {
       DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
     } else if ( exteriorOnly ) {
       DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
     }
     break;
   case OptGeneric:
     if ( wholeSite ) {
       DhopSiteGeneric   (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
     } else if ( interiorOnly ) {
       DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
     } else if ( exteriorOnly ) {
       DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
     }
     break;
   default:
     std::cout<<"Oops Opt = "<<Opt<<std::endl;
     assert(0);
     break;
   }
 };
 template <class Impl>
 void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldView &U,  DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF,
 					    int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int disp) 
 {
  // Disp should be either +1,-1,+3,-3
  // What about "dag" ?
@@ -285,6 +235,108 @@ void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldVi
  assert(0);
 }
 // KERNEL_CALLNB(A,improved): launches the member template A<improved> of
 // ThisKernel once per index ss in an accelerator_forNB loop of extent
 // NN = Nsite*Ls, passing sF = ss and sU = ss/Ls.
 // The macro is not hygienic: it expects Nsite, Ls, st_v, U_v, UUU_v, buf,
 // in_v, out_v, dag and the ThisKernel typedef to be visible where it expands,
 // and it declares a local NN, so it can be expanded only once per scope.
 // NOTE(review): "NB" presumably means the launch does not wait for completion;
 // confirm against the definition of accelerator_forNB.
 #define KERNEL_CALLNB(A,improved)					\
   const uint64_t    NN = Nsite*Ls;					\
   accelerator_forNB( ss, NN, Simd::Nsimd(), {				\
       int sF = ss;							\
       int sU = ss/Ls;							\
       ThisKernel:: template A<improved>(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
     });
 // KERNEL_CALL(A,improved): KERNEL_CALLNB followed by accelerator_barrier().
 #define KERNEL_CALL(A,improved) KERNEL_CALLNB(A,improved); accelerator_barrier(); 
 // ASM_CALL(A): runs the non-template member A of ThisKernel once per index ss
 // in a thread_for loop of extent NN = Nsite*Ls, passing sF = ss and sU = ss/Ls.
 // Like the kernel-launch macros it picks up Nsite, Ls, st_v, U_v, UUU_v, buf,
 // in_v, out_v, dag and ThisKernel from the expanding scope and declares a
 // local NN there.
 #define ASM_CALL(A)							\
   const uint64_t    NN = Nsite*Ls;					\
   thread_for( ss, NN, {							\
       int sF = ss;							\
       int sU = ss/Ls;							\
       ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag);		\
   });
 // Host-side driver for the improved staggered hopping term. It opens views of
 // both gauge fields, the fermion fields and the stencil, then launches the
 // per-site kernel chosen by Opt and the interior/exterior flags with the
 // kernel template argument set to 1. If no kernel matches the requested
 // combination the trailing assert fires.
 template <class Impl> 
 void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo, 
 					  DoubledGaugeField &U, DoubledGaugeField &UUU, 
 					  const FermionField &in, FermionField &out, int dag, int interior,int exterior)
 {
   // The launch macros refer to the kernel class through this typedef.
   typedef StaggeredKernels<Impl> ThisKernel;

   GridBase *fermionGrid = in.Grid();
   GridBase *gaugeGrid   = U.Grid();

   // These view and buffer names are captured by name inside the launch macros.
   autoView( UUU_v , UUU, AcceleratorRead);
   autoView( U_v   ,   U, AcceleratorRead);
   autoView( in_v  ,  in, AcceleratorRead);
   autoView( out_v , out, AcceleratorWrite);
   autoView( st_v  ,  st, AcceleratorRead);
   SiteSpinor * buf = st.CommBuf();

   // When the fermion grid has exactly one more dimension than the gauge grid,
   // Ls is the leading reduced dimension of the fermion grid; otherwise it is 1.
   int Ls = ( fermionGrid->Nd()==gaugeGrid->Nd()+1 ) ? fermionGrid->_rdimensions[0] : 1;
   int Nsite = gaugeGrid->oSites();

   // Generic kernels: available in every build.
   if (Opt == OptGeneric) {
     if ( interior && exterior ) { KERNEL_CALL(DhopSiteGeneric,1);    return;}
     if ( interior )             { KERNEL_CALL(DhopSiteGenericInt,1); return;}
     if ( exterior )             { KERNEL_CALL(DhopSiteGenericExt,1); return;}
   }
 #ifndef GRID_CUDA
   // Hand-unrolled kernels: compiled out when GRID_CUDA is defined.
   if (Opt == OptHandUnroll) {
     if ( interior && exterior ) { KERNEL_CALL(DhopSiteHand,1);    return;}
     if ( interior )             { KERNEL_CALL(DhopSiteHandInt,1); return;}
     if ( exterior )             { KERNEL_CALL(DhopSiteHandExt,1); return;}
   }
   // The assembly kernel is only dispatched for the combined interior+exterior case.
   if (Opt == OptInlineAsm) {
     if ( interior && exterior ) { ASM_CALL(DhopSiteAsm); return;}
   }
 #endif
   assert(0 && " Kernel optimisation case not covered ");
 }
 // Host-side driver for the naive (one-link only) staggered hopping term. It
 // opens views of the gauge field, the fermion fields and the stencil, then
 // launches the per-site kernel chosen by Opt and the interior/exterior flags
 // with the kernel template argument set to 0.
 //
 // There is no separate three-link field here: UUU_v is a second view of U so
 // that the launch macros, which name UUU_v, still expand.
 //
 // If no kernel matches the requested combination (for example
 // Opt == OptInlineAsm, which has no branch below) the function used to return
 // silently without writing `out`. It now asserts, like DhopImproved, so the
 // unsupported configuration is caught instead of producing stale output.
 template <class Impl> 
 void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo, 
 				       DoubledGaugeField &U,
 				       const FermionField &in, FermionField &out, int dag, int interior,int exterior)
 {
   GridBase *FGrid=in.Grid();  
   GridBase *UGrid=U.Grid();  
   // The launch macros refer to the kernel class through this typedef.
   typedef StaggeredKernels<Impl> ThisKernel;
   // These view and buffer names are captured by name inside the launch macros.
   autoView( UUU_v ,   U, AcceleratorRead);
   autoView( U_v   ,   U, AcceleratorRead);
   autoView( in_v  ,  in, AcceleratorRead);
   autoView( out_v , out, AcceleratorWrite);
   autoView( st_v  ,  st, AcceleratorRead);
   SiteSpinor * buf = st.CommBuf();
   // When the fermion grid has exactly one more dimension than the gauge grid,
   // Ls is the leading reduced dimension of the fermion grid; otherwise it is 1.
   int Ls=1;
   if(FGrid->Nd()==UGrid->Nd()+1){
     Ls    = FGrid->_rdimensions[0];
   }
   int Nsite = UGrid->oSites();
   if( interior && exterior ) { 
     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
 #ifndef GRID_CUDA
     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0);    return;}
 #endif
   } else if( interior ) {
     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
 #ifndef GRID_CUDA
     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0);    return;}
 #endif
   } else if( exterior ) { 
     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
 #ifndef GRID_CUDA
     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0);    return;}
 #endif
   }
   // Reaching this point means nothing was launched for the requested case.
   assert(0 && " Kernel optimisation case not covered ");
 }
 #undef KERNEL_CALLNB
 #undef KERNEL_CALL
 #undef ASM_CALL
 NAMESPACE_END(Grid);
@@ -98,11 +98,13 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
  Coordinate lcoor;
  typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
  for (int site = 0; site < lvol; site++)
  {
    autoView(CTv,CloverTerm,CpuRead);
    autoView(CTIv,CloverTermInv,CpuWrite);
    for (int site = 0; site < lvol; site++) {
      grid->LocalIndexToLocalCoor(site, lcoor);
      EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
-    peekLocalSite(Qx, CloverTerm, lcoor);
+      peekLocalSite(Qx, CTv, lcoor);
      Qxinv = Zero();
      //if (csw!=0){
      for (int j = 0; j < Ns; j++)
@@ -123,7 +125,8 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
 	      Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
      //    if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
      //  }
-    pokeLocalSite(Qxinv, CloverTermInv, lcoor);
+      pokeLocalSite(Qxinv, CTIv, lcoor);
    }
  }
  // Separate the even and odd parts
@@ -580,16 +580,21 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
  cosha = (one + W*W + sk) / (abs(W)*2.0);
  // FIXME Need a Lattice acosh
  {
    autoView(cosha_v,cosha,CpuRead);
    autoView(a_v,a,CpuWrite);
    for(int idx=0;idx<_grid->lSites();idx++){
      Coordinate lcoor(Nd);
      Tcomplex cc;
      //    RealD sgn;
      _grid->LocalIndexToLocalCoor(idx,lcoor);
-    peekLocalSite(cc,cosha,lcoor);
+      peekLocalSite(cc,cosha_v,lcoor);
      assert((double)real(cc)>=1.0);
      assert(fabs((double)imag(cc))<=1.0e-15);
      cc = ScalComplex(::acosh(real(cc)),0.0);
-    pokeLocalSite(cc,a,lcoor);
+      pokeLocalSite(cc,a_v,lcoor);
    }
  }
  Wea = ( exp( a) * abs(W)  );
@@ -775,17 +780,20 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
  cosha =  (one + W*W + sk) / (abs(W)*2.0);
  // FIXME Need a Lattice acosh
  {
  autoView(cosha_v,cosha,CpuRead);
  autoView(a_v,a,CpuWrite);
  for(int idx=0;idx<_grid->lSites();idx++){
    Coordinate lcoor(Nd);
    Tcomplex cc;
    //    RealD sgn;
    _grid->LocalIndexToLocalCoor(idx,lcoor);
-    peekLocalSite(cc,cosha,lcoor);
+    peekLocalSite(cc,cosha_v,lcoor);
    assert((double)real(cc)>=1.0);
    assert(fabs((double)imag(cc))<=1.0e-15);
    cc = ScalComplex(::acosh(real(cc)),0.0);
-    pokeLocalSite(cc,a,lcoor);
+    pokeLocalSite(cc,a_v,lcoor);
-  }
+  }}
  Wea = ( exp( a) * abs(W)  );
  Wema= ( exp(-a) * abs(W)  );
@@ -67,9 +67,99 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
    diag_mass = 4.0 + mass;
  }
  int vol4;
  vol4=Fgrid.oSites();
  Stencil.BuildSurfaceList(1,vol4);
  vol4=Hgrid.oSites();
  StencilEven.BuildSurfaceList(1,vol4);
  StencilOdd.BuildSurfaceList(1,vol4);
 }
 // Prints the accumulated Dhop and Deriv call counts, per-call timings and
 // derived flop rates through GridLogMessage, followed by the reports of the
 // three stencils. Sections whose call counter is zero are omitted.
 //
 // NP is the processor (rank) count and NN the node count of the grid, so
 // "per rank" figures divide by NP and "per node" figures divide by NN.
 //
 // The rank-averaged compute time is formed in a local variable so that the
 // DhopComputeTime counter itself is left untouched by reporting.
 template<class Impl>
 void WilsonFermion<Impl>::Report(void)
 {
   RealD NP = _grid->_Nprocessors;
   RealD NN = _grid->NodeCount();
   // Global lattice volume: product of the global dimensions.
   RealD volume = 1;
   Coordinate latt = _grid->GlobalDimensions();
   for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
   if ( DhopCalls > 0 ) {
     std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
     std::cout << GridLogMessage << "WilsonFermion Number of DhopEO Calls   : " << DhopCalls   << std::endl;
     std::cout << GridLogMessage << "WilsonFermion TotalTime   /Calls        : " << DhopTotalTime   / DhopCalls << " us" << std::endl;
     std::cout << GridLogMessage << "WilsonFermion CommTime    /Calls        : " << DhopCommTime    / DhopCalls << " us" << std::endl;
     std::cout << GridLogMessage << "WilsonFermion FaceTime    /Calls        : " << DhopFaceTime    / DhopCalls << " us" << std::endl;
     std::cout << GridLogMessage << "WilsonFermion ComputeTime1/Calls        : " << DhopComputeTime / DhopCalls << " us" << std::endl;
     std::cout << GridLogMessage << "WilsonFermion ComputeTime2/Calls        : " << DhopComputeTime2/ DhopCalls << " us" << std::endl;
     // Average the compute time over ranks, working on a copy of the counter.
     RealD AvgComputeTime = DhopComputeTime;
     _grid->GlobalSum(AvgComputeTime);
     AvgComputeTime/=NP;
     RealD mflops = 1320*volume*DhopCalls/AvgComputeTime/2; // 2 for red black counting
     std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
     std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
     std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
     RealD Fullmflops = 1320*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
     std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
     std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
     std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
    }
   if ( DerivCalls > 0 ) {
     std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
     std::cout << GridLogMessage << "WilsonFermion Number of Deriv Calls    : " <<DerivCalls <<std::endl;
     std::cout << GridLogMessage << "WilsonFermion CommTime/Calls           : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
     std::cout << GridLogMessage << "WilsonFermion ComputeTime/Calls        : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
     std::cout << GridLogMessage << "WilsonFermion Dhop ComputeTime/Calls   : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
     // how to count flops here?
     RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
     std::cout << GridLogMessage << "Average mflops/s per call               ? : " << mflops << std::endl;
     // Per-node figure: divide by the node count, as in the Dhop section.
     std::cout << GridLogMessage << "Average mflops/s per call per node      ? : " << mflops/NN << std::endl;
     // how to count flops here?
     RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
     std::cout << GridLogMessage << "Average mflops/s per call (full)        ? : " << Fullmflops << std::endl;
     std::cout << GridLogMessage << "Average mflops/s per call per node (full) ? : " << Fullmflops/NN << std::endl;  }
   if (DerivCalls > 0 || DhopCalls > 0){
     std::cout << GridLogMessage << "WilsonFermion Stencil"    <<std::endl;  Stencil.Report();
     std::cout << GridLogMessage << "WilsonFermion StencilEven"<<std::endl;  StencilEven.Report();
     std::cout << GridLogMessage << "WilsonFermion StencilOdd" <<std::endl;  StencilOdd.Report();
   }
   if ( DhopCalls > 0){
     std::cout << GridLogMessage << "WilsonFermion Stencil     Reporti()"    <<std::endl;  Stencil.Reporti(DhopCalls);
     std::cout << GridLogMessage << "WilsonFermion StencilEven Reporti()"<<std::endl;  StencilEven.Reporti(DhopCalls);
     std::cout << GridLogMessage << "WilsonFermion StencilOdd  Reporti()" <<std::endl;  StencilOdd.Reporti(DhopCalls);
   }
 }
 // Resets every Dhop and Deriv profiling counter of this operator to zero and
 // then clears the counters kept by the full, even and odd stencils.
 template<class Impl>
 void WilsonFermion<Impl>::ZeroCounters(void)
 {
   // Call counts.
   DhopCalls  = 0;
   DerivCalls = 0;

   // Dhop timers.
   DhopTotalTime    = 0;
   DhopCommTime     = 0;
   DhopFaceTime     = 0;
   DhopComputeTime  = 0;
   DhopComputeTime2 = 0;

   // Deriv timers.
   DerivCommTime        = 0;
   DerivComputeTime     = 0;
   DerivDhopComputeTime = 0;

   // Stencil counters: first the ZeroCounters() set, then the ZeroCountersi() set.
   Stencil.ZeroCounters();
   StencilEven.ZeroCounters();
   StencilOdd.ZeroCounters();

   Stencil.ZeroCountersi();
   StencilEven.ZeroCountersi();
   StencilOdd.ZeroCountersi();
 }
 template <class Impl>
 void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
 {
@@ -229,6 +319,7 @@ template <class Impl>
 void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
                                        GaugeField &mat, const FermionField &A,
                                        const FermionField &B, int dag) {
  DerivCalls++;
  assert((dag == DaggerNo) || (dag == DaggerYes));
  Compressor compressor(dag);
@@ -237,8 +328,11 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
  FermionField Atilde(B.Grid());
  Atilde = A;
  DerivCommTime-=usecond();
  st.HaloExchange(B, compressor);
  DerivCommTime+=usecond();
  DerivComputeTime-=usecond();
  for (int mu = 0; mu < Nd; mu++) {
    ////////////////////////////////////////////////////////////////////////
    // Flip gamma (1+g)<->(1-g) if dag
@@ -246,6 +340,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
    int gamma = mu;
    if (!dag) gamma += Nd;
    DerivDhopComputeTime -= usecond();
    int Ls=1;
    Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma);
@@ -253,7 +348,9 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
    // spin trace outer product
    //////////////////////////////////////////////////
    Impl::InsertForce4D(mat, Btilde, Atilde, mu);
    DerivDhopComputeTime += usecond();
  }
  DerivComputeTime += usecond();
 }
 template <class Impl>
@@ -387,13 +484,14 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
                                       const FermionField &in,
                                       FermionField &out, int dag)
 {
  DhopTotalTime-=usecond();
 #ifdef GRID_OMP
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
  else
 #endif
    DhopInternalSerial(st,lo,U,in,out,dag);
-
+  DhopTotalTime+=usecond();
 }
 template <class Impl>
@@ -412,38 +510,53 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
  /////////////////////////////
  std::vector<std::vector<CommsRequest_t> > requests;
  st.Prepare();
  DhopFaceTime-=usecond();
  st.HaloGather(in,compressor);
  DhopFaceTime+=usecond();
  DhopCommTime -=usecond();
  st.CommunicateBegin(requests);
  /////////////////////////////
  // Overlap with comms
  /////////////////////////////
  DhopFaceTime-=usecond();
  st.CommsMergeSHM(compressor);
  DhopFaceTime+=usecond();
  /////////////////////////////
  // do the compute interior
  /////////////////////////////
  int Opt = WilsonKernelsStatic::Opt;
  DhopComputeTime-=usecond();
  if (dag == DaggerYes) {
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
  } else {
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
  }
  DhopComputeTime+=usecond();
  /////////////////////////////
  // Complete comms
  /////////////////////////////
  st.CommunicateComplete(requests);
  DhopCommTime   +=usecond();
  DhopFaceTime-=usecond();
  st.CommsMerge(compressor);
  DhopFaceTime+=usecond();
  /////////////////////////////
  // do the compute exterior
  /////////////////////////////
  DhopComputeTime2-=usecond();
  if (dag == DaggerYes) {
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
  } else {
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
  }
  DhopComputeTime2+=usecond();
 };
@@ -455,14 +568,18 @@ void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
 {
  assert((dag == DaggerNo) || (dag == DaggerYes));
  Compressor compressor(dag);
  DhopCommTime-=usecond();
  st.HaloExchange(in, compressor);
  DhopCommTime+=usecond();
  DhopComputeTime-=usecond();
  int Opt = WilsonKernelsStatic::Opt;
  if (dag == DaggerYes) {
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
  } else {
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
  }
  DhopComputeTime+=usecond();
 };
 /*Change ends */
@@ -483,32 +600,7 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
  conformable(_grid, q_in_1.Grid());
  conformable(_grid, q_in_2.Grid());
  conformable(_grid, q_out.Grid());
-#if 0
+  assert(0);
  PropagatorField tmp1(_grid), tmp2(_grid);
  q_out = Zero();
  // Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
  // Inefficient comms method but not performance critical.
  tmp1 = Cshift(q_in_1, mu, 1);
  tmp2 = Cshift(q_in_2, mu, 1);
  auto tmp1_v  =  tmp1.View();
  auto tmp2_v  =  tmp2.View();
  auto q_in_1_v=q_in_1.View();
  auto q_in_2_v=q_in_2.View();
  auto q_out_v = q_out.View();
  auto Umu_v   =   Umu.View();
  thread_for(sU, Umu.Grid()->oSites(),{
      Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU],
 					       q_in_2_v[sU],
 					       q_out_v[sU],
 					       Umu_v, sU, mu);
      Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sU],
 					       tmp2_v[sU],
 					       q_out_v[sU],
 					       Umu_v, sU, mu);
  });
 #else
 #endif
 }
@@ -524,62 +616,7 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 {
  conformable(_grid, q_in.Grid());
  conformable(_grid, q_out.Grid());
-#if 0
+  assert(0);
  //  Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
  Complex i(0.0,1.0);
  PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
  unsigned int tshift = (mu == Tp) ? 1 : 0;
  unsigned int LLt    = GridDefaultLatt()[Tp];
  q_out = Zero();
  LatticeInteger coords(_grid);
  LatticeCoordinate(coords, Tp);
  // Need q(x + mu) and q(x - mu).
  tmp    = Cshift(q_in, mu, 1);
  tmpFwd = tmp*lattice_cmplx;
  tmp    = lattice_cmplx*q_in;
  tmpBwd = Cshift(tmp, mu, -1);
  auto coords_v = coords.View();
  auto tmpFwd_v = tmpFwd.View();
  auto tmpBwd_v = tmpBwd.View();
  auto Umu_v    = Umu.View();
  auto q_out_v  = q_out.View();
  thread_for(sU, Umu.Grid()->oSites(), {
    // Compute the sequential conserved current insertion only if our simd
    // object contains a timeslice we need.
    vPredicate t_mask;
    t_mask() = ((coords_v[sU] >= tmin) && (coords_v[sU] <= tmax));
    Integer timeSlices = Reduce(t_mask());
    if (timeSlices > 0) {
      Kernels::SeqConservedCurrentSiteFwd(tmpFwd_v[sU], 
 					  q_out_v[sU], 
 					  Umu_v, sU, mu, t_mask);
    }
    // Repeat for backward direction.
    t_mask()     = ((coords_v[sU] >= (tmin + tshift)) && 
 		    (coords_v[sU] <= (tmax + tshift)));
    //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)	
    unsigned int t0 = 0;
    if((tmax==LLt-1) && (tshift==1)) t_mask() = (t_mask() || (coords_v[sU] == t0 ));
    timeSlices = Reduce(t_mask());
    if (timeSlices > 0) {
      Kernels::SeqConservedCurrentSiteBwd(tmpBwd_v[sU], 
 					  q_out_v[sU], 
 					  Umu_v, sU, mu, t_mask);
    }
  });
 #else
 #endif
 }
 NAMESPACE_END(Grid);
@@ -0,0 +1,574 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmA64FX.h
    Copyright (C) 2020
 Author: Nils Meyer  <nils.meyer@ur.de>  Regensburg University
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 //#if defined(A64FXASM)
 #if defined(A64FX)
 // safety include
 #include <arm_sve.h>
 // undefine everything related to kernels
 #include <simd/Fujitsu_A64FX_undef.h>
 // enable A64FX body
 #define WILSONKERNELSASMBODYA64FX
 //#pragma message("A64FX Dslash: WilsonKernelsAsmBodyA64FX.h")
    ///////////////////////////////////////////////////////////
    // If we are A64FX specialise the single precision routine
    ///////////////////////////////////////////////////////////
 #if defined(DSLASHINTRIN)
 //#pragma message ("A64FX Dslash: intrin")
 #include <simd/Fujitsu_A64FX_intrin_single.h>
 #else
 #pragma message ("A64FX Dslash: asm")
 #include <simd/Fujitsu_A64FX_asm_single.h>
 #endif
 /// Switch off the 5d vectorised code optimisations
 #undef DWFVEC5D
 /////////////////////////////////////////////////////////////////
 // XYZT vectorised, undag Kernel, single
 /////////////////////////////////////////////////////////////////
 // Preprocessor state for the four AsmDhopSite specialisations below:
 // KERNEL_DAG undefined, INTERIOR_AND_EXTERIOR defined, INTERIOR and EXTERIOR
 // undefined. NOTE(review): the included body header presumably keys on these
 // macros to choose the generated code; confirm in that header.
 #undef KERNEL_DAG
 #define INTERIOR_AND_EXTERIOR
 #undef INTERIOR
 #undef EXTERIOR
 // Each specialisation gives only the signature; the function body is supplied
 // by the header included directly after it. WILSONKERNELSASMBODYA64FX is
 // defined earlier in this header, so the A64FX body is the one included.
 // Specialisation for WilsonImplF.
 template<> void
 WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 // Specialisation for ZWilsonImplF.
 template<> void
 WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 // Specialisation for WilsonImplFH.
 template<> void
 WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 // Specialisation for ZWilsonImplFH.
 template<> void
 WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 // Preprocessor state for the four AsmDhopSiteInt specialisations below:
 // only INTERIOR is defined; INTERIOR_AND_EXTERIOR and EXTERIOR are undefined.
 // NOTE(review): the included body header presumably keys on these macros;
 // confirm in that header.
 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
 #undef EXTERIOR
 // Each specialisation gives only the signature; the function body is supplied
 // by the header included directly after it.
 // Specialisation for WilsonImplF.
 template<> void
 WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 // Specialisation for ZWilsonImplF.
 template<> void
 WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 // Specialisation for WilsonImplFH.
 template<> void
 WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 // Specialisation for ZWilsonImplFH.
 template<> void
 WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 // Preprocessor state for the four AsmDhopSiteExt specialisations below:
 // only EXTERIOR is defined; INTERIOR_AND_EXTERIOR and INTERIOR are undefined.
 // NOTE(review): the included body header presumably keys on these macros;
 // confirm in that header.
 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
 #define EXTERIOR
 // Each specialisation gives only the signature; the function body is supplied
 // by the header included directly after it.
 // Specialisation for WilsonImplF.
 template<> void
 WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 // Specialisation for ZWilsonImplF.
 template<> void
 WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 // Specialisation for WilsonImplFH.
 template<> void
 WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 // Specialisation for ZWilsonImplFH.
 template<> void
 WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 /////////////////////////////////////////////////////////////////
 // XYZT vectorised, dag Kernel, single
 /////////////////////////////////////////////////////////////////
 // Preprocessor state for the four AsmDhopSiteDag specialisations below:
 // KERNEL_DAG and INTERIOR_AND_EXTERIOR defined, INTERIOR and EXTERIOR
 // undefined. KERNEL_DAG stays defined for the sections that follow until it is
 // changed again. NOTE(review): the included body header presumably keys on
 // these macros; confirm in that header.
 #define KERNEL_DAG
 #define INTERIOR_AND_EXTERIOR
 #undef INTERIOR
 #undef EXTERIOR
 // Each specialisation gives only the signature; the function body is supplied
 // by the header included directly after it.
 // Specialisation for WilsonImplF.
 template<> void
 WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 // Specialisation for ZWilsonImplF.
 template<> void
 WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 // Specialisation for WilsonImplFH.
 template<> void
 WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 // Specialisation for ZWilsonImplFH.
 template<> void
 WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 // Preprocessor state for the four AsmDhopSiteDagInt specialisations below:
 // only INTERIOR is defined among the three region macros; INTERIOR_AND_EXTERIOR
 // and EXTERIOR are undefined. NOTE(review): the included body header presumably
 // keys on these macros; confirm in that header.
 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
 #undef EXTERIOR
 // Each specialisation gives only the signature; the function body is supplied
 // by the header included directly after it.
 // Specialisation for WilsonImplF.
 template<> void
 WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 // Specialisation for ZWilsonImplF.
 template<> void
 WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 // Specialisation for WilsonImplFH.
 template<> void
 WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 // Specialisation for ZWilsonImplFH.
 template<> void
 WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 // AsmDhopSiteDagExt specialisations (WilsonImplF, ZWilsonImplF, WilsonImplFH,
 // ZWilsonImplFH).  Only EXTERIOR is left defined; WilsonKernelsAsmBodyA64FX.h
 // tests that macro to pick its "post comms" ASM_LEG / ASM_LEG_XP / RESULT
 // definitions.  As elsewhere in this file, each body is the text of the
 // header included right after the signature, chosen by
 // WILSONKERNELSASMBODYA64FX.
 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
 #define EXTERIOR
 template<> void
 WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 template<> void
 WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 template<> void
 WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 template<> void
 WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 // Undefine the macro set used by the preceding kernels before the double
 // precision headers are pulled in (the contents of Fujitsu_A64FX_undef.h are
 // not visible from this file).
 #include <simd/Fujitsu_A64FX_undef.h>
 ///////////////////////////////////////////////////////////
 // If we are A64FX specialise the double precision routine.
 // DSLASHINTRIN selects Fujitsu_A64FX_intrin_double.h; without it
 // Fujitsu_A64FX_asm_double.h is used instead.
 ///////////////////////////////////////////////////////////
 #if defined(DSLASHINTRIN)
 #include <simd/Fujitsu_A64FX_intrin_double.h>
 #else
 #include <simd/Fujitsu_A64FX_asm_double.h>
 #endif
 // Former KNL helper macros, kept commented out for reference only.
 //#define MAYBEPERM(A,perm) if (perm) { A ; }
 //#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 //#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0];
 // Reset the mode macros to the combined interior+exterior variant.
 #define INTERIOR_AND_EXTERIOR
 #undef  INTERIOR
 #undef  EXTERIOR
 /////////////////////////////////////////////////////////////////
 // XYZT vectorised, undag Kernel, double
 //
 // AsmDhopSite specialisations for WilsonImplD, ZWilsonImplD, WilsonImplDF and
 // ZWilsonImplDF.  KERNEL_DAG is undefined and INTERIOR_AND_EXTERIOR is the
 // only mode macro defined, so WilsonKernelsAsmBodyA64FX.h uses its non-dagger
 // DIR*_PROJ / DIR*_RECON mapping and its "comms then compute" leg macros.
 // Each body is the text of the header included after the signature, chosen
 // by WILSONKERNELSASMBODYA64FX.
 /////////////////////////////////////////////////////////////////
 #undef KERNEL_DAG
 #define INTERIOR_AND_EXTERIOR
 #undef INTERIOR
 #undef EXTERIOR
 template<> void
 WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 template<> void
 WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 template<> void
 WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 template<> void
 WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 // AsmDhopSiteInt specialisations (WilsonImplD, ZWilsonImplD, WilsonImplDF,
 // ZWilsonImplDF).  Only INTERIOR is left defined, which makes
 // WilsonKernelsAsmBodyA64FX.h use its "pre comms" leg macros.  Each body is
 // the text of the header included after the signature, chosen by
 // WILSONKERNELSASMBODYA64FX.
 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
 #undef EXTERIOR
 template<> void
 WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 template<> void
 WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 template<> void
 WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 template<> void
 WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 // AsmDhopSiteExt specialisations (WilsonImplD, ZWilsonImplD, WilsonImplDF,
 // ZWilsonImplDF).  Only EXTERIOR is left defined, which makes
 // WilsonKernelsAsmBodyA64FX.h use its "post comms" leg macros.  Each body is
 // the text of the header included after the signature, chosen by
 // WILSONKERNELSASMBODYA64FX.
 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
 #define EXTERIOR
 template<> void
 WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 template<> void
 WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 template<> void
 WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 template<> void
 WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 /////////////////////////////////////////////////////////////////
 // XYZT vectorised, dag Kernel, double
 //
 // AsmDhopSiteDag specialisations for WilsonImplD, ZWilsonImplD, WilsonImplDF
 // and ZWilsonImplDF.  KERNEL_DAG is defined here and stays defined for the
 // groups that follow, so WilsonKernelsAsmBodyA64FX.h uses its dagger
 // DIR*_PROJ / DIR*_RECON mapping; INTERIOR_AND_EXTERIOR selects the
 // "comms then compute" leg macros.  Each body is the text of the header
 // included after the signature, chosen by WILSONKERNELSASMBODYA64FX.
 /////////////////////////////////////////////////////////////////
 #define KERNEL_DAG
 #define INTERIOR_AND_EXTERIOR
 #undef INTERIOR
 #undef EXTERIOR
 template<> void
 WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 template<> void
 WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 template<> void
 WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 template<> void
 WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 // AsmDhopSiteDagInt specialisations (WilsonImplD, ZWilsonImplD, WilsonImplDF,
 // ZWilsonImplDF).  Only INTERIOR is left defined, which makes
 // WilsonKernelsAsmBodyA64FX.h use its "pre comms" leg macros.  Each body is
 // the text of the header included after the signature, chosen by
 // WILSONKERNELSASMBODYA64FX.
 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
 #undef EXTERIOR
 template<> void
 WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 template<> void
 WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 template<> void
 WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 template<> void
 WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 // AsmDhopSiteDagExt specialisations (WilsonImplD, ZWilsonImplD, WilsonImplDF,
 // ZWilsonImplDF).  Only EXTERIOR is left defined, which makes
 // WilsonKernelsAsmBodyA64FX.h use its "post comms" leg macros.  Each body is
 // the text of the header included after the signature, chosen by
 // WILSONKERNELSASMBODYA64FX.
 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
 #define EXTERIOR
 template<> void
 WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 template<> void
 WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 template<> void
 WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 template<> void
 WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #if defined (WILSONKERNELSASMBODYA64FX)
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 #else
 #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 #endif
 // Clean up: drop the A64FX body selector, then include the A64FX undef header
 #undef WILSONKERNELSASMBODYA64FX
 #include <simd/Fujitsu_A64FX_undef.h>
 #endif //A64FXASM
@@ -0,0 +1,380 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: WilsonKernelsAsmBodyA64FX.h
    Copyright (C) 2020
 Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 // Leg-to-operator table.  DIRn_PROJ / DIRn_RECON name the projector and the
 // reconstructor applied on stencil leg n (0..7).  Leg 0 uses the plain
 // *_RECON form; legs 1..7 use the *_RECON_ACCUM forms.  Defining KERNEL_DAG
 // swaps the XP/YP/ZP/TP and XM/YM/ZM/TM families between legs 0-3 and 4-7.
 #ifndef KERNEL_DAG
 // Non-dagger ordering: minus family on legs 0-3, plus family on legs 4-7.
 #define DIR0_PROJ    XM_PROJ
 #define DIR0_RECON   XM_RECON
 #define DIR1_PROJ    YM_PROJ
 #define DIR1_RECON   YM_RECON_ACCUM
 #define DIR2_PROJ    ZM_PROJ
 #define DIR2_RECON   ZM_RECON_ACCUM
 #define DIR3_PROJ    TM_PROJ
 #define DIR3_RECON   TM_RECON_ACCUM
 #define DIR4_PROJ    XP_PROJ
 #define DIR4_RECON   XP_RECON_ACCUM
 #define DIR5_PROJ    YP_PROJ
 #define DIR5_RECON   YP_RECON_ACCUM
 #define DIR6_PROJ    ZP_PROJ
 #define DIR6_RECON   ZP_RECON_ACCUM
 #define DIR7_PROJ    TP_PROJ
 #define DIR7_RECON   TP_RECON_ACCUM
 #else
 // Dagger ordering: plus family on legs 0-3, minus family on legs 4-7.
 #define DIR0_PROJ    XP_PROJ
 #define DIR0_RECON   XP_RECON
 #define DIR1_PROJ    YP_PROJ
 #define DIR1_RECON   YP_RECON_ACCUM
 #define DIR2_PROJ    ZP_PROJ
 #define DIR2_RECON   ZP_RECON_ACCUM
 #define DIR3_PROJ    TP_PROJ
 #define DIR3_RECON   TP_RECON_ACCUM
 #define DIR4_PROJ    XM_PROJ
 #define DIR4_RECON   XM_RECON_ACCUM
 #define DIR5_PROJ    YM_PROJ
 #define DIR5_RECON   YM_RECON_ACCUM
 #define DIR6_PROJ    ZM_PROJ
 #define DIR6_RECON   ZM_RECON_ACCUM
 #define DIR7_PROJ    TM_PROJ
 #define DIR7_RECON   TM_RECON_ACCUM
 #endif
 //using namespace std;
 // Debug tracing switch: SHOW is forced off here; enable the commented
 // #define below to compile the std::cout traces guarded by #ifdef SHOW.
 #undef SHOW
 //#define SHOW
 // WHERE is the label printed by those traces; it names the mode macro that
 // is currently defined.
 #undef WHERE
 #if defined(INTERIOR_AND_EXTERIOR)
 #define WHERE "INT_AND_EXT"
 #endif
 #if defined(INTERIOR)
 #define WHERE "INT"
 #endif
 #if defined(EXTERIOR)
 #define WHERE "EXT"
 #endif
 //#pragma message("here")
 ////////////////////////////////////////////////////////////////////////////////
 // Comms then compute kernel
 //
 // ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) processes one stencil leg Dir.
 // On entry base/local/perm describe leg Dir (set by the previous leg, or by
 // ASM_LEG_XP for the first one).  The macro
 //   - reads st.GetPFInfo(nent,plocal) into basep and advances nent;
 //   - loads the leg input: LOAD_CHIMU + LOAD_TABLE + PROJ + MAYBEPERM when
 //     `local` is set, LOAD_CHI otherwise;
 //   - looks up leg NxtDir with st.GetInfo (overwriting base/local/perm) so it
 //     can be prefetched between MULT_2SPIN_1 and MULT_2SPIN_2 of leg Dir;
 //   - issues PREFETCH_GAUGE_L2(Dir) when s == 0 and Dir is 0 or 4;
 //   - finishes with RECON.
 // ASM_LEG_XP is the first-leg form: it does the initial st.GetInfo lookup for
 // Dir and PREFETCH1_CHIMU, then delegates to ASM_LEG.
 // RESULT forwards to SAVE_RESULT.
 //
 // The final line of every macro carries no trailing backslash, so each
 // definition ends on its own last line and cannot absorb the directive that
 // follows it.
 ////////////////////////////////////////////////////////////////////////////////
 #ifdef INTERIOR_AND_EXTERIOR
 #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)                     \
   basep = st.GetPFInfo(nent,plocal); nent++;                           \
   if ( local ) {                                                       \
     LOAD_CHIMU(base);                                                  \
     LOAD_TABLE(PERMUTE_DIR);                                           \
     PROJ;                                                              \
     MAYBEPERM(PERMUTE_DIR,perm);                                       \
   } else {                                                             \
     LOAD_CHI(base);                                                    \
   }                                                                    \
   base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++;        \
   MULT_2SPIN_1(Dir);                                                   \
   PREFETCH_CHIMU(base);                                                \
   PREFETCH_CHIMU_L2(basep);                                            \
   /* PREFETCH_GAUGE_L1(NxtDir); */                                     \
   MULT_2SPIN_2;                                                        \
   if (s == 0) {                                                        \
     if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); }          \
   }                                                                    \
   RECON;
 #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)                  \
   base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;           \
   PREFETCH1_CHIMU(base);                                               \
   ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
 #define RESULT(base,basep) SAVE_RESULT(base,basep);
 #endif
 ////////////////////////////////////////////////////////////////////////////////
 // Pre comms kernel -- prefetch like normal because it is mostly right
 //
 // ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) processes leg Dir only when it
 // is available before communication, i.e. when `local` is set for Dir or
 // st.same_node[Dir] is true; otherwise it only prefetches the next leg.
 //
 // st.GetInfo writes ptype/local/perm for the leg it is asked about, so after
 // the NxtDir lookup `local` no longer refers to Dir.  The "is this leg
 // available" decision is therefore captured in asm_leg_on_node *before* that
 // lookup and reused for the compute step, keeping the load and the compute
 // gated on the same leg.
 //
 // ASM_LEG_XP is the first-leg form: it does the initial st.GetInfo lookup for
 // Dir and PREFETCH1_CHIMU, then delegates to ASM_LEG.
 // RESULT forwards to SAVE_RESULT.
 //
 // The final line of every macro carries no trailing backslash, so each
 // definition ends on its own last line and cannot absorb the directive that
 // follows it.
 ////////////////////////////////////////////////////////////////////////////////
 #ifdef INTERIOR
 #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)                     \
   basep = st.GetPFInfo(nent,plocal); nent++;                           \
   {                                                                    \
     /* decide with leg Dir's `local`, before GetInfo(NxtDir) replaces it */ \
     const bool asm_leg_on_node = ( local || st.same_node[Dir] );       \
     if ( local ) {                                                     \
       LOAD_CHIMU(base);                                                \
       LOAD_TABLE(PERMUTE_DIR);                                         \
       PROJ;                                                            \
       MAYBEPERM(PERMUTE_DIR,perm);                                     \
     } else if ( st.same_node[Dir] ) {                                  \
       LOAD_CHI(base);                                                  \
     }                                                                  \
     base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++;      \
     if ( asm_leg_on_node ) {                                           \
       MULT_2SPIN_1(Dir);                                               \
       PREFETCH_CHIMU(base);                                            \
       /* PREFETCH_GAUGE_L1(NxtDir); */                                 \
       MULT_2SPIN_2;                                                    \
       if (s == 0) {                                                    \
         if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); }      \
       }                                                                \
       RECON;                                                           \
       PREFETCH_CHIMU_L2(basep);                                        \
     } else {                                                           \
       PREFETCH_CHIMU(base);                                            \
     }                                                                  \
   }
 #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)                  \
   base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;           \
   PREFETCH1_CHIMU(base);                                               \
   ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
 #define RESULT(base,basep) SAVE_RESULT(base,basep);
 #endif
 ////////////////////////////////////////////////////////////////////////////////
 // Post comms kernel
 //
 // ASM_LEG looks up leg Dir with st.GetInfo and processes it only when the
 // leg is neither `local` nor on the same node (st.same_node[Dir]); every leg
 // processed bumps nmu.  ASM_LEG_XP is the first-leg form: it zeroes nmu and
 // then runs the ordinary ASM_LEG sequence.  RESULT calls ADD_RESULT only when
 // at least one leg was processed (nmu != 0).
 ////////////////////////////////////////////////////////////////////////////////
 #ifdef EXTERIOR
 #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)                     \
   base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;           \
   if ( !( local || st.same_node[Dir] ) ) {                             \
     LOAD_CHI(base);                                                    \
     MULT_2SPIN_1(Dir);                                                 \
     PREFETCH_CHIMU(base);                                              \
     /* PREFETCH_GAUGE_L1(NxtDir); */                                   \
     MULT_2SPIN_2;                                                      \
     if (s == 0) {                                                      \
       if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); }        \
     }                                                                  \
     RECON;                                                             \
     nmu++;                                                             \
   }
 #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)                  \
   nmu=0;                                                               \
   ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
 #define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);}
 #endif
 // Function body shared by the AsmDhopSite* specialisations that #include this
 // header right after their signature.  It uses the parameters of that
 // signature (st, U, ss, ssU, Ls, Ns, in, out) and the ASM_LEG / ASM_LEG_XP /
 // RESULT macros selected above by INTERIOR_AND_EXTERIOR, INTERIOR or EXTERIOR.
 {
   // nmu is only reset/incremented by the EXTERIOR forms of the leg macros.
   int nmu;
   // Filled in by st.GetInfo for the leg being looked up.
   int local,perm, ptype;
   uint64_t base;
   uint64_t basep;
   // Address of the first input element, passed to GetInfo / GetPFInfo.
   const uint64_t plocal =(uint64_t) & in[0];
   MASK_REGS;
   int nmax=U.oSites();
   // Ns iterations starting at ssU; ssU is advanced at the end of each one.
   for(int site=0;site<Ns;site++) {
 #ifndef EXTERIOR
     //    int sU =lo.Reorder(ssU);
     int sU =ssU;
     // Next site index, wrapping to 0 at nmax; used for the prefetch entries.
     int ssn=ssU+1;     if(ssn>=nmax) ssn=0;
     //    int sUn=lo.Reorder(ssn);
     int sUn=ssn;
     LOCK_GAUGE(0);
 #else
     int sU =ssU;
     int ssn=ssU+1;     if(ssn>=nmax) ssn=0;
     int sUn=ssn;
 #endif
     for(int s=0;s<Ls;s++) {
       // ss / ssn are reused here as the indices (site*Ls + s) of the current
       // and the next element.
       ss =sU*Ls+s;
       ssn=sUn*Ls+s;
       // Eight stencil entries per element; ent / nent are advanced by the leg macros.
       int  ent=ss*8;// 2*Ndim
       int nent=ssn*8;
       uint64_t delta_base, delta_base_p;
       // Legs are visited in the order Xp,Yp,Zp,Tp,Xm,Ym,Zm,Tm; each call names
       // the following leg as NxtDir (the last one wraps back to Xp).
    ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJ,DIR0_RECON);
 #ifdef SHOW
       float rescale = 64. * 12.;
       std::cout << "=================================================================" << std::endl;
       std::cout << "ss = " << ss << "   ssn = " << ssn << std::endl;
       std::cout << "sU = " << sU << "   ssU = " << ssU << std::endl;
       std::cout << " " << std::endl;
       std::cout << "Dir = " << Xp << "        "  << WHERE<< std::endl;
       std::cout << "ent  nent  local  perm       = " << ent << "  " << nent << "  " << local << "  "  << perm << std::endl;
       std::cout << "st.same_node[Dir] = " << st.same_node[Xp] << std::endl;
       std::cout << "base              = " << (base - plocal)/rescale << std::endl;
       std::cout << "Basep             = " << (basep - plocal)/rescale << std::endl;
       //printf("U                 = %llu\n", (uint64_t)&[sU](Dir));
       std::cout << "----------------------------------------------------" << std::endl;
 #endif
       ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJ,DIR1_RECON);
 #ifdef SHOW
       std::cout << "Dir = " << Yp << "        "  << WHERE<< std::endl;
       std::cout << "ent  nent  local  perm       = " << ent << "  " << nent << "  " << local << "  "  << perm << std::endl;
       std::cout << "st.same_node[Dir] = " << st.same_node[Yp] << std::endl;
       std::cout << "base              = " << (base - plocal)/rescale << std::endl;
       std::cout << "Basep             = " << (basep - plocal)/rescale << std::endl;
       //printf("U                 = %llu\n", (uint64_t)&[sU](Dir));
       std::cout << "----------------------------------------------------" << std::endl;
 #endif
       ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJ,DIR2_RECON);
 #ifdef SHOW
       std::cout << "Dir = " << Zp << "        "  << WHERE<< std::endl;
       std::cout << "ent  nent  local  perm       = " << ent << "  " << nent << "  " << local << "  "  << perm << std::endl;
       std::cout << "st.same_node[Dir] = " << st.same_node[Zp] << std::endl;
       std::cout << "base              = " << (base - plocal)/rescale << std::endl;
       std::cout << "Basep             = " << (basep - plocal)/rescale << std::endl;
       //printf("U                 = %llu\n", (uint64_t)&[sU](Dir));
       std::cout << "----------------------------------------------------" << std::endl;
 #endif
       ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJ,DIR3_RECON);
 #ifdef SHOW
       std::cout << "Dir = " << Tp << "        "  << WHERE<< std::endl;
       std::cout << "ent  nent  local  perm       = " << ent << "  " << nent << "  " << local << "  "  << perm << std::endl;
       std::cout << "st.same_node[Dir] = " << st.same_node[Tp] << std::endl;
       std::cout << "base              = " << (base - plocal)/rescale << std::endl;
       std::cout << "Basep             = " << (basep - plocal)/rescale << std::endl;
       //printf("U                 = %llu\n", (uint64_t)&[sU](Dir));
       std::cout << "----------------------------------------------------" << std::endl;
 #endif
       ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJ,DIR4_RECON);
 #ifdef SHOW
       std::cout << "Dir = " << Xm << "        "  << WHERE<< std::endl;
       std::cout << "ent  nent  local  perm       = " << ent << "  " << nent << "  " << local << "  "  << perm << std::endl;
       std::cout << "st.same_node[Dir] = " << st.same_node[Xm] << std::endl;
       std::cout << "base              = " << (base - plocal)/rescale << std::endl;
       std::cout << "Basep             = " << (basep - plocal)/rescale << std::endl;
       //printf("U                 = %llu\n", (uint64_t)&[sU](Dir));
       std::cout << "----------------------------------------------------" << std::endl;
 #endif
       ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON);
 #ifdef SHOW
       std::cout << "Dir = " << Ym << "        "  << WHERE<< std::endl;
       std::cout << "ent  nent  local  perm       = " << ent << "  " << nent << "  " << local << "  "  << perm << std::endl;
       std::cout << "st.same_node[Dir] = " << st.same_node[Ym] << std::endl;
       std::cout << "base              = " << (base - plocal)/rescale << std::endl;
       std::cout << "Basep             = " << (basep - plocal)/rescale << std::endl;
       //printf("U                 = %llu\n", (uint64_t)&[sU](Dir));
       std::cout << "----------------------------------------------------" << std::endl;
 #endif
       ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON);
 #ifdef SHOW
       std::cout << "Dir = " << Zm << "        "  << WHERE<< std::endl;
       std::cout << "ent  nent  local  perm       = " << ent << "  " << nent << "  " << local << "  "  << perm << std::endl;
       std::cout << "st.same_node[Dir] = " << st.same_node[Zm] << std::endl;
       std::cout << "base              = " << (base - plocal)/rescale << std::endl;
       std::cout << "Basep             = " << (basep - plocal)/rescale << std::endl;
       //printf("U                 = %llu\n", (uint64_t)&[sU](Dir));
       std::cout << "----------------------------------------------------" << std::endl;
 #endif
       ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON);
 #ifdef SHOW
       std::cout << "Dir = " << Tm << "        "  << WHERE<< std::endl;
       std::cout << "ent  nent  local  perm       = " << ent << "  " << nent << "  " << local << "  "  << perm << std::endl;
       std::cout << "st.same_node[Dir] = " << st.same_node[Tm] << std::endl;
       std::cout << "base              = " << (base - plocal)/rescale << std::endl;
       std::cout << "Basep             = " << (basep - plocal)/rescale << std::endl;
       //printf("U                 = %llu\n", (uint64_t)&[sU](Dir));
       std::cout << "----------------------------------------------------" << std::endl;
 #endif
 #ifdef EXTERIOR
       // No leg was processed for this element: leave the s loop early.
       if (nmu==0) break;
       //      if (nmu!=0) std::cout << "EXT "<<sU<<std::endl;
 #endif
       // Store / accumulate the result for element ss; basep points at the
       // output element of the next site.
       base = (uint64_t) &out[ss];
       // NOTE(review): basep is overwritten on the next line and this increments
       // ent rather than nent -- looks like a leftover; confirm before removing.
       basep= st.GetPFInfo(nent,plocal); ent++;
       basep = (uint64_t) &out[ssn];
       RESULT(base,basep);
 #ifdef SHOW
       std::cout << "Dir = FINAL        " <<  WHERE<< std::endl;;
       // NOTE(review): base_ss is not declared anywhere in this header; unless a
       // macro such as MASK_REGS declares it, enabling SHOW will not compile.
       base_ss = base;
       std::cout << "base              = " << (base - (uint64_t) &out[0])/rescale << std::endl;
       std::cout << "Basep             = " << (basep - plocal)/rescale << std::endl;
       //printf("U                 = %llu\n", (uint64_t)&[sU](Dir));
       std::cout << "----------------------------------------------------" << std::endl;
 #endif
     }
     ssU++;
     // NOTE(review): LOCK_GAUGE(0) above is only issued when EXTERIOR is not
     // defined, while this UNLOCK_GAUGE(0) is unconditional -- confirm that is
     // intended for the EXTERIOR build.
     UNLOCK_GAUGE(0);
   }
 }
 // Remove every helper macro defined by this header so that it can be
 // included again with a different KERNEL_DAG / INTERIOR / EXTERIOR setting.
 #undef ASM_LEG_XP
 #undef ASM_LEG
 #undef RESULT
 #undef DIR0_PROJ
 #undef DIR0_RECON
 #undef DIR1_PROJ
 #undef DIR1_RECON
 #undef DIR2_PROJ
 #undef DIR2_RECON
 #undef DIR3_PROJ
 #undef DIR3_RECON
 #undef DIR4_PROJ
 #undef DIR4_RECON
 #undef DIR5_PROJ
 #undef DIR5_RECON
 #undef DIR6_PROJ
 #undef DIR6_RECON
 #undef DIR7_PROJ
 #undef DIR7_RECON
@@ -646,7 +646,7 @@ NAMESPACE_BEGIN(Grid);
  HAND_RESULT_EXT(ss,F)
 #define HAND_SPECIALISE_GPARITY(IMPL)					\
-  template<> void						\
+  template<> accelerator_inline void						\
  WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
 				    int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
@@ -662,7 +662,7 @@ NAMESPACE_BEGIN(Grid);
    HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
 									\
-  template<> void						\
+  template<> accelerator_inline void						\
  WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
 				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
@@ -678,7 +678,7 @@ NAMESPACE_BEGIN(Grid);
    HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
 									\
-  template<> void						\
+  template<> accelerator_inline void						\
  WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
 				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
@@ -694,7 +694,7 @@ NAMESPACE_BEGIN(Grid);
    HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
 									\
-  template<> void						\
+  template<> accelerator_inline void						\
  WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
@@ -710,7 +710,7 @@ NAMESPACE_BEGIN(Grid);
    HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
 									\
-  template<> void							\
+  template<> accelerator_inline void							\
  WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
 				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
@@ -727,7 +727,7 @@ NAMESPACE_BEGIN(Grid);
    nmu = 0;								\
    HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
-  template<> void						\
+  template<> accelerator_inline void						\
  WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
@@ -495,7 +495,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
-template<class Impl> void 
+template<class Impl> accelerator_inline void 
 WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 				  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -519,7 +519,7 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
  HAND_RESULT(ss);
 }
-template<class Impl>
+template<class Impl>  accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -542,7 +542,7 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
  HAND_RESULT(ss);
 }
-template<class Impl> void 
+template<class Impl>  accelerator_inline void 
 WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -566,7 +566,7 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si
  HAND_RESULT(ss);
 }
-template<class Impl>
+template<class Impl> accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -589,7 +589,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
  HAND_RESULT(ss);
 }
-template<class Impl> void 
+template<class Impl>  accelerator_inline void 
 WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -614,7 +614,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si
  HAND_RESULT_EXT(ss);
 }
-template<class Impl>
+template<class Impl>  accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -0,0 +1,943 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #pragma once
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #undef LOAD_CHIMU
 #undef LOAD_CHI
 #undef MULT_2SPIN
 #undef PERMUTE_DIR
 #undef XP_PROJ
 #undef YP_PROJ
 #undef ZP_PROJ
 #undef TP_PROJ
 #undef XM_PROJ
 #undef YM_PROJ
 #undef ZM_PROJ
 #undef TM_PROJ
 #undef XP_RECON
 #undef XP_RECON_ACCUM
 #undef XM_RECON
 #undef XM_RECON_ACCUM
 #undef YP_RECON_ACCUM
 #undef YM_RECON_ACCUM
 #undef ZP_RECON_ACCUM
 #undef ZM_RECON_ACCUM
 #undef TP_RECON_ACCUM
 #undef TM_RECON_ACCUM
 #undef ZERO_RESULT
 #undef Chimu_00
 #undef Chimu_01
 #undef Chimu_02
 #undef Chimu_10
 #undef Chimu_11
 #undef Chimu_12
 #undef Chimu_20
 #undef Chimu_21
 #undef Chimu_22
 #undef Chimu_30
 #undef Chimu_31
 #undef Chimu_32
 #undef HAND_STENCIL_LEG
 #undef HAND_STENCIL_LEG_INT
 #undef HAND_STENCIL_LEG_EXT
 #undef HAND_RESULT
 #undef HAND_RESULT_INT
 #undef HAND_RESULT_EXT
 // REGISTER is defined as an empty macro. It is not referenced by any macro
 // or function visible in this file and is not #undef'd in the clean-up list
 // at the end, so the definition remains visible after this header.
 #define REGISTER
 // LOAD_CHIMU: copy the twelve components ref()(s)(c), s=0..3, c=0..2, of
 // in[offset] into Chimu_sc, then print every loaded value to std::cout as a
 // debug trace. Requires `in` and `offset` to be visible at the expansion
 // site. The Chimu_* names are macros defined later in this file that alias
 // the Chi_* / UChi_* temporaries, so this load overwrites those variables.
 #define LOAD_CHIMU \
  {const SiteSpinor & ref (in[offset]);	\
    Chimu_00=ref()(0)(0);\
    Chimu_01=ref()(0)(1);\
    Chimu_02=ref()(0)(2);\
    Chimu_10=ref()(1)(0);\
    Chimu_11=ref()(1)(1);\
    Chimu_12=ref()(1)(2);\
    Chimu_20=ref()(2)(0);\
    Chimu_21=ref()(2)(1);\
    Chimu_22=ref()(2)(2);\
    Chimu_30=ref()(3)(0);\
    Chimu_31=ref()(3)(1);\
    Chimu_32=ref()(3)(2);\
    std::cout << std::endl << "DEBUG -- LOAD_CHIMU" << std::endl; \
    std::cout << "Chimu_00 -- " <<  Chimu_00 << std::endl; \
    std::cout << "Chimu_01 -- " <<  Chimu_01 << std::endl; \
    std::cout << "Chimu_02 -- " <<  Chimu_02 << std::endl; \
    std::cout << "Chimu_10 -- " <<  Chimu_10 << std::endl; \
    std::cout << "Chimu_11 -- " <<  Chimu_11 << std::endl; \
    std::cout << "Chimu_12 -- " <<  Chimu_12 << std::endl; \
    std::cout << "Chimu_20 -- " <<  Chimu_20 << std::endl; \
    std::cout << "Chimu_21 -- " <<  Chimu_21 << std::endl; \
    std::cout << "Chimu_22 -- " <<  Chimu_22 << std::endl; \
    std::cout << "Chimu_30 -- " <<  Chimu_30 << std::endl; \
    std::cout << "Chimu_31 -- " <<  Chimu_31 << std::endl; \
    std::cout << "Chimu_32 -- " <<  Chimu_32 << std::endl; \
 }
 // LOAD_CHI: copy the six components ref()(s)(c), s=0..1, c=0..2, of
 // buf[offset] into Chi_sc, then print each value to std::cout as a debug
 // trace. Requires `buf` and `offset` to be visible at the expansion site.
 #define LOAD_CHI\
  {const SiteHalfSpinor &ref(buf[offset]);	\
    Chi_00 = ref()(0)(0);\
    Chi_01 = ref()(0)(1);\
    Chi_02 = ref()(0)(2);\
    Chi_10 = ref()(1)(0);\
    Chi_11 = ref()(1)(1);\
    Chi_12 = ref()(1)(2);\
    std::cout << std::endl << "DEBUG -- LOAD_CHI" << std::endl; \
    std::cout << "Chi_00 -- " <<  Chi_00 << std::endl; \
    std::cout << "Chi_01 -- " <<  Chi_01 << std::endl; \
    std::cout << "Chi_02 -- " <<  Chi_02 << std::endl; \
    std::cout << "Chi_10 -- " <<  Chi_10 << std::endl; \
    std::cout << "Chi_11 -- " <<  Chi_11 << std::endl; \
    std::cout << "Chi_12 -- " <<  Chi_12 << std::endl; \
  }
 // To splat or not to splat depends on the implementation
 //
 // MULT_2SPIN(A): with ref = U[sU](A), compute for s=0..1 and c=0..2
 //     UChi_sc = sum over k=0..2 of ref()(c,k) * Chi_sk
 // Link elements are fetched through Impl::loadLinkElement. Columns k=0 and
 // k=1 are loaded into U_c0 / U_c1 first; column k=2 is then loaded into the
 // U_00/U_10/U_20 temporaries again, so the statement order must be kept.
 // The six UChi_* results are printed to std::cout as a debug trace.
 // Requires `U`, `sU`, `Impl` and the Chi_*/UChi_*/U_* locals in scope.
 #define MULT_2SPIN(A)\
  {auto & ref(U[sU](A));			\
   Impl::loadLinkElement(U_00,ref()(0,0));	\
   Impl::loadLinkElement(U_10,ref()(1,0));	\
   Impl::loadLinkElement(U_20,ref()(2,0));	\
   Impl::loadLinkElement(U_01,ref()(0,1));	\
   Impl::loadLinkElement(U_11,ref()(1,1));	\
   Impl::loadLinkElement(U_21,ref()(2,1));	\
    UChi_00 = U_00*Chi_00;\
    UChi_10 = U_00*Chi_10;\
    UChi_01 = U_10*Chi_00;\
    UChi_11 = U_10*Chi_10;\
    UChi_02 = U_20*Chi_00;\
    UChi_12 = U_20*Chi_10;\
    UChi_00+= U_01*Chi_01;\
    UChi_10+= U_01*Chi_11;\
    UChi_01+= U_11*Chi_01;\
    UChi_11+= U_11*Chi_11;\
    UChi_02+= U_21*Chi_01;\
    UChi_12+= U_21*Chi_11;\
    Impl::loadLinkElement(U_00,ref()(0,2));	\
    Impl::loadLinkElement(U_10,ref()(1,2));	\
    Impl::loadLinkElement(U_20,ref()(2,2));	\
    UChi_00+= U_00*Chi_02;\
    UChi_10+= U_00*Chi_12;\
    UChi_01+= U_10*Chi_02;\
    UChi_11+= U_10*Chi_12;\
    UChi_02+= U_20*Chi_02;\
    UChi_12+= U_20*Chi_12;\
    std::cout << std::endl << "DEBUG -- MULT_2SPIN" << std::endl; \
    std::cout << "UChi_00 -- " <<  UChi_00 << std::endl; \
    std::cout << "UChi_01 -- " <<  UChi_01 << std::endl; \
    std::cout << "UChi_02 -- " <<  UChi_02 << std::endl; \
    std::cout << "UChi_10 -- " <<  UChi_10 << std::endl; \
    std::cout << "UChi_11 -- " <<  UChi_11 << std::endl; \
    std::cout << "UChi_12 -- " <<  UChi_12 << std::endl; \
    }
 // PERMUTE_DIR(dir): print the six Chi_* values ("PERM PRE"), apply
 // permute<dir>(x,x) in place to each of them (the function name is formed by
 // token pasting, e.g. dir=3 -> permute3), then print them again ("PERM POST").
 // NOTE: the expansion is a bare statement sequence, not wrapped in braces;
 // the stencil-leg macros in this file only invoke it inside `if (perm) { }`.
 #define PERMUTE_DIR(dir)			\
 std::cout << std::endl << "DEBUG -- PERM PRE" << std::endl; \
 std::cout << "Chi_00 -- " <<  Chi_00 << std::endl; \
 std::cout << "Chi_01 -- " <<  Chi_01 << std::endl; \
 std::cout << "Chi_02 -- " <<  Chi_02 << std::endl; \
 std::cout << "Chi_10 -- " <<  Chi_10 << std::endl; \
 std::cout << "Chi_11 -- " <<  Chi_11 << std::endl; \
 std::cout << "Chi_12 -- " <<  Chi_12 << std::endl; \
      permute##dir(Chi_00,Chi_00);\
      permute##dir(Chi_01,Chi_01);\
      permute##dir(Chi_02,Chi_02);\
      permute##dir(Chi_10,Chi_10);\
      permute##dir(Chi_11,Chi_11);\
      permute##dir(Chi_12,Chi_12);\
  std::cout << std::endl << "DEBUG -- PERM POST" << std::endl; \
  std::cout << "Chi_00 -- " <<  Chi_00 << std::endl; \
  std::cout << "Chi_01 -- " <<  Chi_01 << std::endl; \
  std::cout << "Chi_02 -- " <<  Chi_02 << std::endl; \
  std::cout << "Chi_10 -- " <<  Chi_10 << std::endl; \
  std::cout << "Chi_11 -- " <<  Chi_11 << std::endl; \
  std::cout << "Chi_12 -- " <<  Chi_12 << std::endl;
 // XP_PROJ: project the loaded four-spinor Chimu_* to the two-spinor Chi_*:
 //      hspin(0)=fspin(0)+timesI(fspin(3));
 //      hspin(1)=fspin(1)+timesI(fspin(2));
 // applied per colour index, followed by a debug print of the six Chi_*.
 // Chimu_0c/1c alias Chi_0c/1c (see the Chimu_* #defines in this file), so
 // each assignment updates its own left-hand side in place.
 #define XP_PROJ \
    Chi_00 = Chimu_00+timesI(Chimu_30);\
    Chi_01 = Chimu_01+timesI(Chimu_31);\
    Chi_02 = Chimu_02+timesI(Chimu_32);\
    Chi_10 = Chimu_10+timesI(Chimu_20);\
    Chi_11 = Chimu_11+timesI(Chimu_21);\
    Chi_12 = Chimu_12+timesI(Chimu_22);\
    std::cout << std::endl << "DEBUG -- XP_PROJ" << std::endl; \
    std::cout << "Chi_00 -- " <<  Chi_00 << std::endl; \
    std::cout << "Chi_01 -- " <<  Chi_01 << std::endl; \
    std::cout << "Chi_02 -- " <<  Chi_02 << std::endl; \
    std::cout << "Chi_10 -- " <<  Chi_10 << std::endl; \
    std::cout << "Chi_11 -- " <<  Chi_11 << std::endl; \
    std::cout << "Chi_12 -- " <<  Chi_12 << std::endl;
 // YP_PROJ: per colour c, Chi_0c = Chimu_0c - Chimu_3c and
 // Chi_1c = Chimu_1c + Chimu_2c; then debug-print the six Chi_* values.
 #define YP_PROJ \
    Chi_00 = Chimu_00-Chimu_30;\
    Chi_01 = Chimu_01-Chimu_31;\
    Chi_02 = Chimu_02-Chimu_32;\
    Chi_10 = Chimu_10+Chimu_20;\
    Chi_11 = Chimu_11+Chimu_21;\
    Chi_12 = Chimu_12+Chimu_22;\
    std::cout << std::endl << "DEBUG -- YP_PROJ" << std::endl; \
    std::cout << "Chi_00 -- " <<  Chi_00 << std::endl; \
    std::cout << "Chi_01 -- " <<  Chi_01 << std::endl; \
    std::cout << "Chi_02 -- " <<  Chi_02 << std::endl; \
    std::cout << "Chi_10 -- " <<  Chi_10 << std::endl; \
    std::cout << "Chi_11 -- " <<  Chi_11 << std::endl; \
    std::cout << "Chi_12 -- " <<  Chi_12 << std::endl;
 // ZP_PROJ: per colour c, Chi_0c = Chimu_0c + timesI(Chimu_2c) and
 // Chi_1c = Chimu_1c - timesI(Chimu_3c); then debug-print the six Chi_*.
 #define ZP_PROJ \
  Chi_00 = Chimu_00+timesI(Chimu_20);		\
  Chi_01 = Chimu_01+timesI(Chimu_21);		\
  Chi_02 = Chimu_02+timesI(Chimu_22);		\
  Chi_10 = Chimu_10-timesI(Chimu_30);		\
  Chi_11 = Chimu_11-timesI(Chimu_31);		\
  Chi_12 = Chimu_12-timesI(Chimu_32);\
  std::cout << std::endl << "DEBUG -- ZP_PROJ" << std::endl; \
  std::cout << "Chi_00 -- " <<  Chi_00 << std::endl; \
  std::cout << "Chi_01 -- " <<  Chi_01 << std::endl; \
  std::cout << "Chi_02 -- " <<  Chi_02 << std::endl; \
  std::cout << "Chi_10 -- " <<  Chi_10 << std::endl; \
  std::cout << "Chi_11 -- " <<  Chi_11 << std::endl; \
  std::cout << "Chi_12 -- " <<  Chi_12 << std::endl;
 // TP_PROJ: per colour c, Chi_0c = Chimu_0c + Chimu_2c and
 // Chi_1c = Chimu_1c + Chimu_3c; then debug-print the six Chi_* values.
 #define TP_PROJ \
  Chi_00 = Chimu_00+Chimu_20;		\
  Chi_01 = Chimu_01+Chimu_21;		\
  Chi_02 = Chimu_02+Chimu_22;		\
  Chi_10 = Chimu_10+Chimu_30;		\
  Chi_11 = Chimu_11+Chimu_31;		\
  Chi_12 = Chimu_12+Chimu_32;\
  std::cout << std::endl << "DEBUG -- TP_PROJ" << std::endl; \
  std::cout << "Chi_00 -- " <<  Chi_00 << std::endl; \
  std::cout << "Chi_01 -- " <<  Chi_01 << std::endl; \
  std::cout << "Chi_02 -- " <<  Chi_02 << std::endl; \
  std::cout << "Chi_10 -- " <<  Chi_10 << std::endl; \
  std::cout << "Chi_11 -- " <<  Chi_11 << std::endl; \
  std::cout << "Chi_12 -- " <<  Chi_12 << std::endl;
 // XM_PROJ: same structure as XP_PROJ with the opposite sign:
 //      hspin(0)=fspin(0)-timesI(fspin(3));
 //      hspin(1)=fspin(1)-timesI(fspin(2));
 // applied per colour index, followed by a debug print of the six Chi_*.
 #define XM_PROJ \
    Chi_00 = Chimu_00-timesI(Chimu_30);\
    Chi_01 = Chimu_01-timesI(Chimu_31);\
    Chi_02 = Chimu_02-timesI(Chimu_32);\
    Chi_10 = Chimu_10-timesI(Chimu_20);\
    Chi_11 = Chimu_11-timesI(Chimu_21);\
    Chi_12 = Chimu_12-timesI(Chimu_22);\
    std::cout << std::endl << "DEBUG -- XM_PROJ" << std::endl; \
    std::cout << "Chi_00 -- " <<  Chi_00 << std::endl; \
    std::cout << "Chi_01 -- " <<  Chi_01 << std::endl; \
    std::cout << "Chi_02 -- " <<  Chi_02 << std::endl; \
    std::cout << "Chi_10 -- " <<  Chi_10 << std::endl; \
    std::cout << "Chi_11 -- " <<  Chi_11 << std::endl; \
    std::cout << "Chi_12 -- " <<  Chi_12 << std::endl;
 // YM_PROJ: per colour c, Chi_0c = Chimu_0c + Chimu_3c and
 // Chi_1c = Chimu_1c - Chimu_2c; then debug-print the six Chi_* values.
 #define YM_PROJ \
    Chi_00 = Chimu_00+Chimu_30;\
    Chi_01 = Chimu_01+Chimu_31;\
    Chi_02 = Chimu_02+Chimu_32;\
    Chi_10 = Chimu_10-Chimu_20;\
    Chi_11 = Chimu_11-Chimu_21;\
    Chi_12 = Chimu_12-Chimu_22;\
    std::cout << std::endl << "DEBUG -- YM_PROJ" << std::endl; \
    std::cout << "Chi_00 -- " <<  Chi_00 << std::endl; \
    std::cout << "Chi_01 -- " <<  Chi_01 << std::endl; \
    std::cout << "Chi_02 -- " <<  Chi_02 << std::endl; \
    std::cout << "Chi_10 -- " <<  Chi_10 << std::endl; \
    std::cout << "Chi_11 -- " <<  Chi_11 << std::endl; \
    std::cout << "Chi_12 -- " <<  Chi_12 << std::endl;
 // ZM_PROJ: per colour c, Chi_0c = Chimu_0c - timesI(Chimu_2c) and
 // Chi_1c = Chimu_1c + timesI(Chimu_3c); then debug-print the six Chi_*.
 #define ZM_PROJ \
  Chi_00 = Chimu_00-timesI(Chimu_20);		\
  Chi_01 = Chimu_01-timesI(Chimu_21);		\
  Chi_02 = Chimu_02-timesI(Chimu_22);		\
  Chi_10 = Chimu_10+timesI(Chimu_30);		\
  Chi_11 = Chimu_11+timesI(Chimu_31);		\
  Chi_12 = Chimu_12+timesI(Chimu_32);\
  std::cout << std::endl << "DEBUG -- ZM_PROJ" << std::endl; \
  std::cout << "Chi_00 -- " <<  Chi_00 << std::endl; \
  std::cout << "Chi_01 -- " <<  Chi_01 << std::endl; \
  std::cout << "Chi_02 -- " <<  Chi_02 << std::endl; \
  std::cout << "Chi_10 -- " <<  Chi_10 << std::endl; \
  std::cout << "Chi_11 -- " <<  Chi_11 << std::endl; \
  std::cout << "Chi_12 -- " <<  Chi_12 << std::endl;
 // TM_PROJ: per colour c, Chi_0c = Chimu_0c - Chimu_2c and
 // Chi_1c = Chimu_1c - Chimu_3c; then debug-print the six Chi_* values.
 #define TM_PROJ \
  Chi_00 = Chimu_00-Chimu_20;		\
  Chi_01 = Chimu_01-Chimu_21;		\
  Chi_02 = Chimu_02-Chimu_22;		\
  Chi_10 = Chimu_10-Chimu_30;		\
  Chi_11 = Chimu_11-Chimu_31;		\
  Chi_12 = Chimu_12-Chimu_32;\
  std::cout << std::endl << "DEBUG -- TM_PROJ" << std::endl; \
  std::cout << "Chi_00 -- " <<  Chi_00 << std::endl; \
  std::cout << "Chi_01 -- " <<  Chi_01 << std::endl; \
  std::cout << "Chi_02 -- " <<  Chi_02 << std::endl; \
  std::cout << "Chi_10 -- " <<  Chi_10 << std::endl; \
  std::cout << "Chi_11 -- " <<  Chi_11 << std::endl; \
  std::cout << "Chi_12 -- " <<  Chi_12 << std::endl;
 // XP_RECON: rebuild the four-spinor result_* from the two-spinor UChi_*,
 // ASSIGNING (not accumulating) all twelve components:
 //      fspin(0)=hspin(0);
 //      fspin(1)=hspin(1);
 //      fspin(2)=timesMinusI(hspin(1));
 //      fspin(3)=timesMinusI(hspin(0));
 // applied per colour index, followed by a debug print of all result_*.
 #define XP_RECON\
  result_00 = UChi_00;\
  result_01 = UChi_01;\
  result_02 = UChi_02;\
  result_10 = UChi_10;\
  result_11 = UChi_11;\
  result_12 = UChi_12;\
  result_20 = timesMinusI(UChi_10);\
  result_21 = timesMinusI(UChi_11);\
  result_22 = timesMinusI(UChi_12);\
  result_30 = timesMinusI(UChi_00);\
  result_31 = timesMinusI(UChi_01);\
  result_32 = timesMinusI(UChi_02);\
  std::cout << std::endl << "DEBUG -- XP_RECON" << std::endl; \
  std::cout << "result_00 -- " <<  result_00 << std::endl; \
  std::cout << "result_01 -- " <<  result_01 << std::endl; \
  std::cout << "result_02 -- " <<  result_02 << std::endl; \
  std::cout << "result_10 -- " <<  result_10 << std::endl; \
  std::cout << "result_11 -- " <<  result_11 << std::endl; \
  std::cout << "result_12 -- " <<  result_12 << std::endl; \
  std::cout << "result_20 -- " <<  result_20 << std::endl; \
  std::cout << "result_21 -- " <<  result_21 << std::endl; \
  std::cout << "result_22 -- " <<  result_22 << std::endl; \
  std::cout << "result_30 -- " <<  result_30 << std::endl; \
  std::cout << "result_31 -- " <<  result_31 << std::endl; \
  std::cout << "result_32 -- " <<  result_32 << std::endl;
 // XP_RECON_ACCUM: accumulating form of XP_RECON. Per colour c:
 //   result_0c += UChi_0c;  result_1c += UChi_1c;
 //   result_2c -= timesI(UChi_1c);  result_3c -= timesI(UChi_0c);
 // followed by a debug print of all twelve result_* values.
 #define XP_RECON_ACCUM\
  result_00+=UChi_00;\
  result_01+=UChi_01;\
  result_02+=UChi_02;\
  result_10+=UChi_10;\
  result_11+=UChi_11;\
  result_12+=UChi_12;\
  result_20-=timesI(UChi_10);\
  result_21-=timesI(UChi_11);\
  result_22-=timesI(UChi_12);\
  result_30-=timesI(UChi_00);\
  result_31-=timesI(UChi_01);\
  result_32-=timesI(UChi_02);\
  std::cout << std::endl << "DEBUG -- XP_RECON_ACCUM" << std::endl; \
  std::cout << "result_00 -- " <<  result_00 << std::endl; \
  std::cout << "result_01 -- " <<  result_01 << std::endl; \
  std::cout << "result_02 -- " <<  result_02 << std::endl; \
  std::cout << "result_10 -- " <<  result_10 << std::endl; \
  std::cout << "result_11 -- " <<  result_11 << std::endl; \
  std::cout << "result_12 -- " <<  result_12 << std::endl; \
  std::cout << "result_20 -- " <<  result_20 << std::endl; \
  std::cout << "result_21 -- " <<  result_21 << std::endl; \
  std::cout << "result_22 -- " <<  result_22 << std::endl; \
  std::cout << "result_30 -- " <<  result_30 << std::endl; \
  std::cout << "result_31 -- " <<  result_31 << std::endl; \
  std::cout << "result_32 -- " <<  result_32 << std::endl;
 // XM_RECON: ASSIGN (not accumulate) all twelve result_* components from
 // UChi_*. Per colour c:
 //   result_0c = UChi_0c;  result_1c = UChi_1c;
 //   result_2c = timesI(UChi_1c);  result_3c = timesI(UChi_0c);
 // followed by a debug print of all twelve result_* values.
 #define XM_RECON\
  result_00 = UChi_00;\
  result_01 = UChi_01;\
  result_02 = UChi_02;\
  result_10 = UChi_10;\
  result_11 = UChi_11;\
  result_12 = UChi_12;\
  result_20 = timesI(UChi_10);\
  result_21 = timesI(UChi_11);\
  result_22 = timesI(UChi_12);\
  result_30 = timesI(UChi_00);\
  result_31 = timesI(UChi_01);\
  result_32 = timesI(UChi_02);\
  std::cout << std::endl << "DEBUG -- XM_RECON" << std::endl; \
  std::cout << "result_00 -- " <<  result_00 << std::endl; \
  std::cout << "result_01 -- " <<  result_01 << std::endl; \
  std::cout << "result_02 -- " <<  result_02 << std::endl; \
  std::cout << "result_10 -- " <<  result_10 << std::endl; \
  std::cout << "result_11 -- " <<  result_11 << std::endl; \
  std::cout << "result_12 -- " <<  result_12 << std::endl; \
  std::cout << "result_20 -- " <<  result_20 << std::endl; \
  std::cout << "result_21 -- " <<  result_21 << std::endl; \
  std::cout << "result_22 -- " <<  result_22 << std::endl; \
  std::cout << "result_30 -- " <<  result_30 << std::endl; \
  std::cout << "result_31 -- " <<  result_31 << std::endl; \
  std::cout << "result_32 -- " <<  result_32 << std::endl;
 // XM_RECON_ACCUM: accumulating form of XM_RECON. Per colour c:
 //   result_0c += UChi_0c;  result_1c += UChi_1c;
 //   result_2c += timesI(UChi_1c);  result_3c += timesI(UChi_0c);
 // followed by a debug print of all twelve result_* values.
 #define XM_RECON_ACCUM\
  result_00+= UChi_00;\
  result_01+= UChi_01;\
  result_02+= UChi_02;\
  result_10+= UChi_10;\
  result_11+= UChi_11;\
  result_12+= UChi_12;\
  result_20+= timesI(UChi_10);\
  result_21+= timesI(UChi_11);\
  result_22+= timesI(UChi_12);\
  result_30+= timesI(UChi_00);\
  result_31+= timesI(UChi_01);\
  result_32+= timesI(UChi_02);\
  std::cout << std::endl << "DEBUG -- XM_RECON_ACCUM" << std::endl; \
  std::cout << "result_00 -- " <<  result_00 << std::endl; \
  std::cout << "result_01 -- " <<  result_01 << std::endl; \
  std::cout << "result_02 -- " <<  result_02 << std::endl; \
  std::cout << "result_10 -- " <<  result_10 << std::endl; \
  std::cout << "result_11 -- " <<  result_11 << std::endl; \
  std::cout << "result_12 -- " <<  result_12 << std::endl; \
  std::cout << "result_20 -- " <<  result_20 << std::endl; \
  std::cout << "result_21 -- " <<  result_21 << std::endl; \
  std::cout << "result_22 -- " <<  result_22 << std::endl; \
  std::cout << "result_30 -- " <<  result_30 << std::endl; \
  std::cout << "result_31 -- " <<  result_31 << std::endl; \
  std::cout << "result_32 -- " <<  result_32 << std::endl;
 // YP_RECON_ACCUM: accumulate UChi_* into result_*. Per colour c:
 //   result_0c += UChi_0c;  result_1c += UChi_1c;
 //   result_2c += UChi_1c;  result_3c -= UChi_0c;
 // followed by a debug print of all twelve result_* values.
 #define YP_RECON_ACCUM\
  result_00+= UChi_00;\
  result_01+= UChi_01;\
  result_02+= UChi_02;\
  result_10+= UChi_10;\
  result_11+= UChi_11;\
  result_12+= UChi_12;\
  result_20+= UChi_10;\
  result_21+= UChi_11;\
  result_22+= UChi_12;\
  result_30-= UChi_00;\
  result_31-= UChi_01;\
  result_32-= UChi_02;\
  std::cout << std::endl << "DEBUG -- YP_RECON_ACCUM" << std::endl; \
  std::cout << "result_00 -- " <<  result_00 << std::endl; \
  std::cout << "result_01 -- " <<  result_01 << std::endl; \
  std::cout << "result_02 -- " <<  result_02 << std::endl; \
  std::cout << "result_10 -- " <<  result_10 << std::endl; \
  std::cout << "result_11 -- " <<  result_11 << std::endl; \
  std::cout << "result_12 -- " <<  result_12 << std::endl; \
  std::cout << "result_20 -- " <<  result_20 << std::endl; \
  std::cout << "result_21 -- " <<  result_21 << std::endl; \
  std::cout << "result_22 -- " <<  result_22 << std::endl; \
  std::cout << "result_30 -- " <<  result_30 << std::endl; \
  std::cout << "result_31 -- " <<  result_31 << std::endl; \
  std::cout << "result_32 -- " <<  result_32 << std::endl;
 // YM_RECON_ACCUM: accumulate UChi_* into result_*. Per colour c:
 //   result_0c += UChi_0c;  result_1c += UChi_1c;
 //   result_2c -= UChi_1c;  result_3c += UChi_0c;
 // followed by a debug print of all twelve result_* values.
 #define YM_RECON_ACCUM\
  result_00+= UChi_00;\
  result_01+= UChi_01;\
  result_02+= UChi_02;\
  result_10+= UChi_10;\
  result_11+= UChi_11;\
  result_12+= UChi_12;\
  result_20-= UChi_10;\
  result_21-= UChi_11;\
  result_22-= UChi_12;\
  result_30+= UChi_00;\
  result_31+= UChi_01;\
  result_32+= UChi_02;\
  std::cout << std::endl << "DEBUG -- YM_RECON_ACCUM" << std::endl; \
  std::cout << "result_00 -- " <<  result_00 << std::endl; \
  std::cout << "result_01 -- " <<  result_01 << std::endl; \
  std::cout << "result_02 -- " <<  result_02 << std::endl; \
  std::cout << "result_10 -- " <<  result_10 << std::endl; \
  std::cout << "result_11 -- " <<  result_11 << std::endl; \
  std::cout << "result_12 -- " <<  result_12 << std::endl; \
  std::cout << "result_20 -- " <<  result_20 << std::endl; \
  std::cout << "result_21 -- " <<  result_21 << std::endl; \
  std::cout << "result_22 -- " <<  result_22 << std::endl; \
  std::cout << "result_30 -- " <<  result_30 << std::endl; \
  std::cout << "result_31 -- " <<  result_31 << std::endl; \
  std::cout << "result_32 -- " <<  result_32 << std::endl;
 // ZP_RECON_ACCUM: accumulate UChi_* into result_*. Per colour c:
 //   result_0c += UChi_0c;  result_1c += UChi_1c;
 //   result_2c -= timesI(UChi_0c);  result_3c += timesI(UChi_1c);
 // followed by a debug print of all twelve result_* values.
 #define ZP_RECON_ACCUM\
  result_00+= UChi_00;\
  result_01+= UChi_01;\
  result_02+= UChi_02;\
  result_10+= UChi_10;\
  result_11+= UChi_11;\
  result_12+= UChi_12;\
  result_20-= timesI(UChi_00);			\
  result_21-= timesI(UChi_01);			\
  result_22-= timesI(UChi_02);			\
  result_30+= timesI(UChi_10);			\
  result_31+= timesI(UChi_11);			\
  result_32+= timesI(UChi_12);\
  std::cout << std::endl << "DEBUG -- ZP_RECON_ACCUM" << std::endl; \
  std::cout << "result_00 -- " <<  result_00 << std::endl; \
  std::cout << "result_01 -- " <<  result_01 << std::endl; \
  std::cout << "result_02 -- " <<  result_02 << std::endl; \
  std::cout << "result_10 -- " <<  result_10 << std::endl; \
  std::cout << "result_11 -- " <<  result_11 << std::endl; \
  std::cout << "result_12 -- " <<  result_12 << std::endl; \
  std::cout << "result_20 -- " <<  result_20 << std::endl; \
  std::cout << "result_21 -- " <<  result_21 << std::endl; \
  std::cout << "result_22 -- " <<  result_22 << std::endl; \
  std::cout << "result_30 -- " <<  result_30 << std::endl; \
  std::cout << "result_31 -- " <<  result_31 << std::endl; \
  std::cout << "result_32 -- " <<  result_32 << std::endl;
 // ZM_RECON_ACCUM: accumulate UChi_* into result_*. Per colour c:
 //   result_0c += UChi_0c;  result_1c += UChi_1c;
 //   result_2c += timesI(UChi_0c);  result_3c -= timesI(UChi_1c);
 // followed by a debug print of all twelve result_* values.
 #define ZM_RECON_ACCUM\
  result_00+= UChi_00;\
  result_01+= UChi_01;\
  result_02+= UChi_02;\
  result_10+= UChi_10;\
  result_11+= UChi_11;\
  result_12+= UChi_12;\
  result_20+= timesI(UChi_00);			\
  result_21+= timesI(UChi_01);			\
  result_22+= timesI(UChi_02);			\
  result_30-= timesI(UChi_10);			\
  result_31-= timesI(UChi_11);			\
  result_32-= timesI(UChi_12);\
  std::cout << std::endl << "DEBUG -- ZM_RECON_ACCUM" << std::endl; \
  std::cout << "result_00 -- " <<  result_00 << std::endl; \
  std::cout << "result_01 -- " <<  result_01 << std::endl; \
  std::cout << "result_02 -- " <<  result_02 << std::endl; \
  std::cout << "result_10 -- " <<  result_10 << std::endl; \
  std::cout << "result_11 -- " <<  result_11 << std::endl; \
  std::cout << "result_12 -- " <<  result_12 << std::endl; \
  std::cout << "result_20 -- " <<  result_20 << std::endl; \
  std::cout << "result_21 -- " <<  result_21 << std::endl; \
  std::cout << "result_22 -- " <<  result_22 << std::endl; \
  std::cout << "result_30 -- " <<  result_30 << std::endl; \
  std::cout << "result_31 -- " <<  result_31 << std::endl; \
  std::cout << "result_32 -- " <<  result_32 << std::endl;
 // TP_RECON_ACCUM: accumulate UChi_* into result_*. Per colour c:
 //   result_0c += UChi_0c;  result_1c += UChi_1c;
 //   result_2c += UChi_0c;  result_3c += UChi_1c;
 // followed by a debug print of all twelve result_* values.
 #define TP_RECON_ACCUM\
  result_00+= UChi_00;\
  result_01+= UChi_01;\
  result_02+= UChi_02;\
  result_10+= UChi_10;\
  result_11+= UChi_11;\
  result_12+= UChi_12;\
  result_20+= UChi_00;			\
  result_21+= UChi_01;			\
  result_22+= UChi_02;			\
  result_30+= UChi_10;			\
  result_31+= UChi_11;			\
  result_32+= UChi_12;\
  std::cout << std::endl << "DEBUG -- TP_RECON_ACCUM" << std::endl; \
  std::cout << "result_00 -- " <<  result_00 << std::endl; \
  std::cout << "result_01 -- " <<  result_01 << std::endl; \
  std::cout << "result_02 -- " <<  result_02 << std::endl; \
  std::cout << "result_10 -- " <<  result_10 << std::endl; \
  std::cout << "result_11 -- " <<  result_11 << std::endl; \
  std::cout << "result_12 -- " <<  result_12 << std::endl; \
  std::cout << "result_20 -- " <<  result_20 << std::endl; \
  std::cout << "result_21 -- " <<  result_21 << std::endl; \
  std::cout << "result_22 -- " <<  result_22 << std::endl; \
  std::cout << "result_30 -- " <<  result_30 << std::endl; \
  std::cout << "result_31 -- " <<  result_31 << std::endl; \
  std::cout << "result_32 -- " <<  result_32 << std::endl;
 // TM_RECON_ACCUM: accumulate UChi_* into result_*. Per colour c:
 //   result_0c += UChi_0c;  result_1c += UChi_1c;
 //   result_2c -= UChi_0c;  result_3c -= UChi_1c;
 // followed by a debug print of all twelve result_* values.
 #define TM_RECON_ACCUM\
  result_00+= UChi_00;\
  result_01+= UChi_01;\
  result_02+= UChi_02;\
  result_10+= UChi_10;\
  result_11+= UChi_11;\
  result_12+= UChi_12;\
  result_20-= UChi_00;	\
  result_21-= UChi_01;	\
  result_22-= UChi_02;	\
  result_30-= UChi_10;	\
  result_31-= UChi_11;	\
  result_32-= UChi_12;\
  std::cout << std::endl << "DEBUG -- TM_RECON_ACCUM" << std::endl; \
  std::cout << "result_00 -- " <<  result_00 << std::endl; \
  std::cout << "result_01 -- " <<  result_01 << std::endl; \
  std::cout << "result_02 -- " <<  result_02 << std::endl; \
  std::cout << "result_10 -- " <<  result_10 << std::endl; \
  std::cout << "result_11 -- " <<  result_11 << std::endl; \
  std::cout << "result_12 -- " <<  result_12 << std::endl; \
  std::cout << "result_20 -- " <<  result_20 << std::endl; \
  std::cout << "result_21 -- " <<  result_21 << std::endl; \
  std::cout << "result_22 -- " <<  result_22 << std::endl; \
  std::cout << "result_30 -- " <<  result_30 << std::endl; \
  std::cout << "result_31 -- " <<  result_31 << std::endl; \
  std::cout << "result_32 -- " <<  result_32 << std::endl;
 // HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON): process one stencil direction.
 //   1. Fetch the stencil entry for (DIR, ss) via st.GetEntry, which is also
 //      passed `ptype`, and read its _offset, _is_local and _permute fields.
 //   2. If the entry is local: LOAD_CHIMU from in[offset], apply PROJ, and
 //      if _permute is set apply PERMUTE_DIR(PERM). Otherwise LOAD_CHI from
 //      buf[offset].
 //   3. MULT_2SPIN(DIR) and then RECON (always executed).
 // Requires SE, offset, local, perm, ptype, st and ss in the expanding scope.
 #define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)	\
  SE=st.GetEntry(ptype,DIR,ss);			\
  offset = SE->_offset;				\
  local  = SE->_is_local;			\
  perm   = SE->_permute;			\
  if ( local ) {				\
    LOAD_CHIMU;					\
    PROJ;					\
    if ( perm) {				\
      PERMUTE_DIR(PERM);			\
    }						\
  } else {					\
    LOAD_CHI;					\
  }						\
  MULT_2SPIN(DIR);				\
  RECON;
 // HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON): like HAND_STENCIL_LEG, but the
 // non-local branch only loads from buf[offset] when st.same_node[DIR] is
 // set, and MULT_2SPIN/RECON run only when the entry is local or
 // st.same_node[DIR] is set. Any other leg contributes nothing here.
 #define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON)	\
  SE=st.GetEntry(ptype,DIR,ss);			\
  offset = SE->_offset;				\
  local  = SE->_is_local;			\
  perm   = SE->_permute;			\
  if ( local ) {				\
    LOAD_CHIMU;					\
    PROJ;					\
    if ( perm) {				\
      PERMUTE_DIR(PERM);			\
    }						\
  } else if ( st.same_node[DIR] ) {		\
    LOAD_CHI;					\
  }						\
  if (local || st.same_node[DIR] ) {		\
    MULT_2SPIN(DIR);				\
    RECON;					\
  }
 // HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON): complement of the _INT form.
 // Only when the entry is neither local nor on the same node
 // (st.same_node[DIR] false) does it LOAD_CHI from buf[offset], run
 // MULT_2SPIN(DIR) and RECON, and increment `nmu`. The PROJ and PERM
 // arguments are accepted but not used in the expansion.
 #define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON)	\
  SE=st.GetEntry(ptype,DIR,ss);			\
  offset = SE->_offset;				\
  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
    LOAD_CHI;					\
    MULT_2SPIN(DIR);				\
    RECON;					\
    nmu++;					\
  }
 // HAND_RESULT(ss): write the twelve result_sc values into out[ss] with
 // vstream (overwriting component ref()(s)(c)), then debug-print them.
 #define HAND_RESULT(ss)				\
  {						\
    SiteSpinor & ref (out[ss]);		\
    vstream(ref()(0)(0),result_00);		\
    vstream(ref()(0)(1),result_01);		\
    vstream(ref()(0)(2),result_02);		\
    vstream(ref()(1)(0),result_10);		\
    vstream(ref()(1)(1),result_11);		\
    vstream(ref()(1)(2),result_12);		\
    vstream(ref()(2)(0),result_20);		\
    vstream(ref()(2)(1),result_21);		\
    vstream(ref()(2)(2),result_22);		\
    vstream(ref()(3)(0),result_30);		\
    vstream(ref()(3)(1),result_31);		\
    vstream(ref()(3)(2),result_32);		\
    std::cout << std::endl << "DEBUG -- RESULT" << std::endl; \
    std::cout << "result_00 -- " <<  result_00 << std::endl; \
    std::cout << "result_01 -- " <<  result_01 << std::endl; \
    std::cout << "result_02 -- " <<  result_02 << std::endl; \
    std::cout << "result_10 -- " <<  result_10 << std::endl; \
    std::cout << "result_11 -- " <<  result_11 << std::endl; \
    std::cout << "result_12 -- " <<  result_12 << std::endl; \
    std::cout << "result_20 -- " <<  result_20 << std::endl; \
    std::cout << "result_21 -- " <<  result_21 << std::endl; \
    std::cout << "result_22 -- " <<  result_22 << std::endl; \
    std::cout << "result_30 -- " <<  result_30 << std::endl; \
    std::cout << "result_31 -- " <<  result_31 << std::endl; \
    std::cout << "result_32 -- " <<  result_32 << std::endl;\
  }
 // HAND_RESULT_EXT(ss): if `nmu` is non-zero (at least one exterior leg was
 // processed), ADD the twelve result_sc values onto the existing contents of
 // out[ss] and debug-print them; otherwise out[ss] is left untouched.
 #define HAND_RESULT_EXT(ss)			\
  if (nmu){					\
    SiteSpinor & ref (out[ss]);		\
    ref()(0)(0)+=result_00;		\
    ref()(0)(1)+=result_01;		\
    ref()(0)(2)+=result_02;		\
    ref()(1)(0)+=result_10;		\
    ref()(1)(1)+=result_11;		\
    ref()(1)(2)+=result_12;		\
    ref()(2)(0)+=result_20;		\
    ref()(2)(1)+=result_21;		\
    ref()(2)(2)+=result_22;		\
    ref()(3)(0)+=result_30;		\
    ref()(3)(1)+=result_31;		\
    ref()(3)(2)+=result_32;		\
    std::cout << std::endl << "DEBUG -- RESULT EXT" << std::endl; \
    std::cout << "result_00 -- " <<  result_00 << std::endl; \
    std::cout << "result_01 -- " <<  result_01 << std::endl; \
    std::cout << "result_02 -- " <<  result_02 << std::endl; \
    std::cout << "result_10 -- " <<  result_10 << std::endl; \
    std::cout << "result_11 -- " <<  result_11 << std::endl; \
    std::cout << "result_12 -- " <<  result_12 << std::endl; \
    std::cout << "result_20 -- " <<  result_20 << std::endl; \
    std::cout << "result_21 -- " <<  result_21 << std::endl; \
    std::cout << "result_22 -- " <<  result_22 << std::endl; \
    std::cout << "result_30 -- " <<  result_30 << std::endl; \
    std::cout << "result_31 -- " <<  result_31 << std::endl; \
    std::cout << "result_32 -- " <<  result_32 << std::endl;\
  }
 // HAND_DECLARATIONS(a): declare every local temporary used by the hand
 // kernels: the twelve result_* accumulators, the six Chi_* and six UChi_*
 // half-spinor temporaries, six U_* link-element temporaries, plus the
 // debug-only `debugreg` and `pg1`. The argument `a` is not used.
 // `pg1` is set from svptrue_b64() -- NOTE(review): presumably the Arm SVE
 // all-true predicate for 64-bit lanes; it is not read anywhere else in this
 // file, confirm it is still needed.
 // The final line deliberately carries NO trailing backslash: a continuation
 // there splices the following #define into this macro's replacement list.
 #define HAND_DECLARATIONS(a)			\
  Simd result_00;				\
  Simd result_01;				\
  Simd result_02;				\
  Simd result_10;				\
  Simd result_11;				\
  Simd result_12;				\
  Simd result_20;				\
  Simd result_21;				\
  Simd result_22;				\
  Simd result_30;				\
  Simd result_31;				\
  Simd result_32;				\
  Simd Chi_00;					\
  Simd Chi_01;					\
  Simd Chi_02;					\
  Simd Chi_10;					\
  Simd Chi_11;					\
  Simd Chi_12;					\
  Simd UChi_00;					\
  Simd UChi_01;					\
  Simd UChi_02;					\
  Simd UChi_10;					\
  Simd UChi_11;					\
  Simd UChi_12;					\
  Simd U_00;					\
  Simd U_10;					\
  Simd U_20;					\
  Simd U_01;					\
  Simd U_11;					\
  Simd U_21;\
  Simd debugreg;\
  svbool_t pg1;        \
  pg1 = svptrue_b64();
 // ZERO_RESULT: assign Zero() to each of the twelve result_* accumulators.
 // Used by the Int/Ext kernels, whose legs all accumulate into result_*.
 #define ZERO_RESULT				\
  result_00=Zero();				\
  result_01=Zero();				\
  result_02=Zero();				\
  result_10=Zero();				\
  result_11=Zero();				\
  result_12=Zero();				\
  result_20=Zero();				\
  result_21=Zero();				\
  result_22=Zero();				\
  result_30=Zero();				\
  result_31=Zero();				\
  result_32=Zero();
 // Chimu_sc aliases: the four-spinor names map onto existing temporaries.
 // Spin rows 0 and 1 alias Chi_0c / Chi_1c; spin rows 2 and 3 alias
 // UChi_0c / UChi_1c. LOAD_CHIMU therefore writes into Chi_* and UChi_*, and
 // the *_PROJ macros read both sets while overwriting Chi_* in place.
 #define Chimu_00 Chi_00
 #define Chimu_01 Chi_01
 #define Chimu_02 Chi_02
 #define Chimu_10 Chi_10
 #define Chimu_11 Chi_11
 #define Chimu_12 Chi_12
 #define Chimu_20 UChi_00
 #define Chimu_21 UChi_01
 #define Chimu_22 UChi_02
 #define Chimu_30 UChi_10
 #define Chimu_31 UChi_11
 #define Chimu_32 UChi_12
 NAMESPACE_BEGIN(Grid);
 // HandDhopSite: hand-unrolled kernel for a single site `ss` (gauge index
 // `sU`). Runs the eight stencil legs Xp,Yp,Zp,Tp,Xm,Ym,Zm,Tm with the
 // "minus" projectors on the +direction legs and the "plus" projectors on
 // the -direction legs, then stores the result to out[ss] via HAND_RESULT.
 // The first leg uses the assigning XM_RECON, so result_* needs no prior
 // zeroing; every later leg accumulates. Each macro step also prints a debug
 // trace to std::cout.
 template<class Impl> void
 WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 				  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
 // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  HAND_DECLARATIONS(ignore);
  // ptype is handed to st.GetEntry inside every leg macro
  int offset,local,perm, ptype;
  StencilEntry *SE;
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
  HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
  HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
  HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
  HAND_RESULT(ss);
 }
 // HandDhopSiteDag: same leg structure as HandDhopSite with the projector
 // signs swapped: "plus" projectors on the +direction legs and "minus"
 // projectors on the -direction legs. The first leg uses the assigning
 // XP_RECON; the remaining seven accumulate. The result is stored to out[ss]
 // via HAND_RESULT.
 template<class Impl>
 void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  HAND_DECLARATIONS(ignore);
  StencilEntry *SE;
  // ptype is handed to st.GetEntry inside every leg macro
  int offset,local,perm, ptype;
  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
  HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM);
  HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM);
  HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM);
  HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM);
  HAND_RESULT(ss);
 }
 // HandDhopSiteInt: "interior" variant of HandDhopSite. result_* is zeroed
 // first because every leg uses an accumulating RECON and a leg may be
 // skipped entirely (HAND_STENCIL_LEG_INT only contributes when the entry is
 // local or st.same_node[DIR] is set). The result is stored to out[ss] via
 // HAND_RESULT.
 template<class Impl> void
 WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
 // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  HAND_DECLARATIONS(ignore);
  // ptype is handed to st.GetEntry inside every leg macro
  int offset,local,perm, ptype;
  StencilEntry *SE;
  ZERO_RESULT;
  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
  HAND_RESULT(ss);
 }
 // HandDhopSiteDagInt: "interior" variant of HandDhopSiteDag. result_* is
 // zeroed, the eight legs are processed with HAND_STENCIL_LEG_INT (all
 // accumulating, projector signs as in HandDhopSiteDag), and the result is
 // stored to out[ss] via HAND_RESULT.
 template<class Impl>
 void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  HAND_DECLARATIONS(ignore);
  StencilEntry *SE;
  // ptype is handed to st.GetEntry inside every leg macro
  int offset,local,perm, ptype;
  ZERO_RESULT;
  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
  HAND_RESULT(ss);
 }
 // HandDhopSiteExt: "exterior" variant of HandDhopSite. Only legs whose
 // stencil entry is neither local nor on the same node contribute (see
 // HAND_STENCIL_LEG_EXT); `nmu` counts them. HAND_RESULT_EXT then adds
 // result_* onto the existing out[ss], and only if nmu is non-zero.
 // `local` and `perm` are not declared here because the EXT leg macro does
 // not use them.
 template<class Impl> void
 WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
 // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  HAND_DECLARATIONS(ignore);
  int offset, ptype;
  StencilEntry *SE;
  // number of exterior legs that actually contributed
  int nmu=0;
  ZERO_RESULT;
  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
  HAND_RESULT_EXT(ss);
 }
 // HandDhopSiteDagExt: "exterior" variant of HandDhopSiteDag. Only legs
 // whose stencil entry is neither local nor on the same node contribute;
 // `nmu` counts them. HAND_RESULT_EXT adds result_* onto the existing
 // out[ss] only when nmu is non-zero.
 template<class Impl>
 void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  HAND_DECLARATIONS(ignore);
  StencilEntry *SE;
  int offset, ptype;
  // number of exterior legs that actually contributed
  int nmu=0;
  ZERO_RESULT;
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
  HAND_RESULT_EXT(ss);
 }
 ////////////// Wilson ; uses this implementation /////////////////////
 NAMESPACE_END(Grid);
 #undef LOAD_CHIMU
 #undef LOAD_CHI
 #undef MULT_2SPIN
 #undef PERMUTE_DIR
 #undef XP_PROJ
 #undef YP_PROJ
 #undef ZP_PROJ
 #undef TP_PROJ
 #undef XM_PROJ
 #undef YM_PROJ
 #undef ZM_PROJ
 #undef TM_PROJ
 #undef XP_RECON
 #undef XP_RECON_ACCUM
 #undef XM_RECON
 #undef XM_RECON_ACCUM
 #undef YP_RECON_ACCUM
 #undef YM_RECON_ACCUM
 #undef ZP_RECON_ACCUM
 #undef ZM_RECON_ACCUM
 #undef TP_RECON_ACCUM
 #undef TM_RECON_ACCUM
 #undef ZERO_RESULT
 #undef Chimu_00
 #undef Chimu_01
 #undef Chimu_02
 #undef Chimu_10
 #undef Chimu_11
 #undef Chimu_12
 #undef Chimu_20
 #undef Chimu_21
 #undef Chimu_22
 #undef Chimu_30
 #undef Chimu_31
 #undef Chimu_32
 #undef HAND_STENCIL_LEG
 #undef HAND_STENCIL_LEG_INT
 #undef HAND_STENCIL_LEG_EXT
 #undef HAND_RESULT
 #undef HAND_RESULT_INT
 #undef HAND_RESULT_EXT
@@ -39,9 +39,10 @@ NAMESPACE_BEGIN(Grid);
 // Generic implementation; move to different file?
 ////////////////////////////////////////////
 /*
 accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
 {
-#ifdef __CUDA_ARCH__
+#ifdef GRID_SIMT
  static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size");
  uint4 * mem_pun  = (uint4 *)mem; // force 128 bit loads
  uint4 * chip_pun = (uint4 *)&chip;
@@ -51,6 +52,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
 #endif
  return;
 }
 */
 #define GENERIC_STENCIL_LEG(Dir,spProj,Recon)			\
  SE = st.GetEntry(ptype, Dir, sF);				\
@@ -61,7 +63,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
  } else {							\
    chi = coalescedRead(buf[SE->_offset],lane);			\
  }								\
-  synchronise();						\
+  acceleratorSynchronise();						\
  Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
  Recon(result, Uchi);
@@ -74,12 +76,12 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
  } else if ( st.same_node[Dir] ) {				\
    chi = coalescedRead(buf[SE->_offset],lane);			\
  }								\
-  synchronise();						\
+  acceleratorSynchronise();						\
  if (SE->_is_local || st.same_node[Dir] ) {			\
    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
    Recon(result, Uchi);					\
  }								\
-  synchronise();						
+  acceleratorSynchronise();
 #define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon)		\
  SE = st.GetEntry(ptype, Dir, sF);				\
@@ -89,7 +91,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
    Recon(result, Uchi);					\
    nmu++;							\
  }								\
-  synchronise();						
+  acceleratorSynchronise();
 #define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon)		\
    if (SE->_is_local ) {					\
@@ -99,7 +101,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
    } else {							\
      chi = coalescedRead(buf[SE->_offset],lane);		\
    }								\
-    synchronise();						\
+    acceleratorSynchronise();					\
    Impl::multLink(Uchi, U[sU], chi, dir, SE, st);		\
    Recon(result, Uchi);
@@ -112,7 +114,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
  ////////////////////////////////////////////////////////////////////
  // All legs kernels ; comms then compute
  ////////////////////////////////////////////////////////////////////
-template <class Impl>
+template <class Impl> accelerator_inline
 void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
 					     SiteHalfSpinor *buf, int sF,
 					     int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -126,7 +128,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV
  StencilEntry *SE;
  int ptype;
  const int Nsimd = SiteHalfSpinor::Nsimd();
-  const int lane=SIMTlane(Nsimd);
+  const int lane=acceleratorSIMTlane(Nsimd);
  GENERIC_STENCIL_LEG(Xp,spProjXp,spReconXp);
  GENERIC_STENCIL_LEG(Yp,spProjYp,accumReconYp);
  GENERIC_STENCIL_LEG(Zp,spProjZp,accumReconZp);
@@ -138,7 +140,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV
  coalescedWrite(out[sF],result,lane);
 };
-template <class Impl>
+template <class Impl> accelerator_inline
 void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
 					  SiteHalfSpinor *buf, int sF,
 					  int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -153,7 +155,7 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView
  int ptype;
  const int Nsimd = SiteHalfSpinor::Nsimd();
-  const int lane=SIMTlane(Nsimd);
+  const int lane=acceleratorSIMTlane(Nsimd);
  GENERIC_STENCIL_LEG(Xm,spProjXp,spReconXp);
  GENERIC_STENCIL_LEG(Ym,spProjYp,accumReconYp);
  GENERIC_STENCIL_LEG(Zm,spProjZp,accumReconZp);
@@ -167,7 +169,7 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView
  ////////////////////////////////////////////////////////////////////
  // Interior kernels
  ////////////////////////////////////////////////////////////////////
-template <class Impl>
+template <class Impl> accelerator_inline
 void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st,  DoubledGaugeFieldView &U,
 						SiteHalfSpinor *buf, int sF,
 						int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -181,7 +183,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st,  DoubledGaugeFi
  StencilEntry *SE;
  int ptype;
  const int Nsimd = SiteHalfSpinor::Nsimd();
-  const int lane=SIMTlane(Nsimd);
+  const int lane=acceleratorSIMTlane(Nsimd);
  result=Zero();
  GENERIC_STENCIL_LEG_INT(Xp,spProjXp,accumReconXp);
@@ -195,7 +197,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st,  DoubledGaugeFi
  coalescedWrite(out[sF], result,lane);
 };
-template <class Impl>
+template <class Impl> accelerator_inline
 void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st,  DoubledGaugeFieldView &U,
 							 SiteHalfSpinor *buf, int sF,
 							 int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -203,7 +205,7 @@ void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st,  DoubledGaugeField
  typedef decltype(coalescedRead(buf[0])) calcHalfSpinor;
  typedef decltype(coalescedRead(in[0]))  calcSpinor;
  const int Nsimd = SiteHalfSpinor::Nsimd();
-  const int lane=SIMTlane(Nsimd);
+  const int lane=acceleratorSIMTlane(Nsimd);
  calcHalfSpinor chi;
  //  calcHalfSpinor *chi_p;
@@ -225,7 +227,7 @@ void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st,  DoubledGaugeField
 ////////////////////////////////////////////////////////////////////
 // Exterior kernels
 ////////////////////////////////////////////////////////////////////
-template <class Impl>
+template <class Impl> accelerator_inline
 void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st,  DoubledGaugeFieldView &U,
 						SiteHalfSpinor *buf, int sF,
 						int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -239,7 +241,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st,  DoubledGaugeFi
  int ptype;
  int nmu=0;
  const int Nsimd = SiteHalfSpinor::Nsimd();
-  const int lane=SIMTlane(Nsimd);
+  const int lane=acceleratorSIMTlane(Nsimd);
  result=Zero();
  GENERIC_STENCIL_LEG_EXT(Xp,spProjXp,accumReconXp);
  GENERIC_STENCIL_LEG_EXT(Yp,spProjYp,accumReconYp);
@@ -256,7 +258,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st,  DoubledGaugeFi
  }
 };
-template <class Impl>
+template <class Impl> accelerator_inline
 void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  DoubledGaugeFieldView &U,
 					     SiteHalfSpinor *buf, int sF,
 					     int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -270,7 +272,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  DoubledGaugeField
  int ptype;
  int nmu=0;
  const int Nsimd = SiteHalfSpinor::Nsimd();
-  const int lane=SIMTlane(Nsimd);
+  const int lane=acceleratorSIMTlane(Nsimd);
  result=Zero();
  GENERIC_STENCIL_LEG_EXT(Xm,spProjXp,accumReconXp);
  GENERIC_STENCIL_LEG_EXT(Ym,spProjYp,accumReconYp);
@@ -288,7 +290,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  DoubledGaugeField
 };
 #define DhopDirMacro(Dir,spProj,spRecon)	\
-  template <class Impl>							\
+  template <class Impl> accelerator_inline				\
  void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \
 					 int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
  {									\
@@ -300,7 +302,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  DoubledGaugeField
  StencilEntry *SE;							\
  int ptype;								\
  const int Nsimd = SiteHalfSpinor::Nsimd();				\
-  const int lane=SIMTlane(Nsimd);					\
+  const int lane=acceleratorSIMTlane(Nsimd);					\
 									\
  SE = st.GetEntry(ptype, dir, sF);					\
  GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon);				\
@@ -316,7 +318,7 @@ DhopDirMacro(Ym,spProjYm,spReconYm);
 DhopDirMacro(Zm,spProjZm,spReconZm);
 DhopDirMacro(Tm,spProjTm,spReconTm);
-template <class Impl> 
+template <class Impl> accelerator_inline
 void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
 				    int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
 {
@@ -328,7 +330,7 @@ void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si
  StencilEntry *SE;
  int ptype;
  const int Nsimd = SiteHalfSpinor::Nsimd();
-  const int lane=SIMTlane(Nsimd);
+  const int lane=acceleratorSIMTlane(Nsimd);
  SE = st.GetEntry(ptype, dir, sF);
  GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp);
@@ -346,30 +348,30 @@ template <class Impl>
 void WilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
 				      int Nsite, const FermionField &in, std::vector<FermionField> &out)
 {
-   auto U_v   = U.View();
+   autoView(U_v  ,U,AcceleratorRead);
-   auto in_v  = in.View();
+   autoView(in_v ,in,AcceleratorRead);
-   auto st_v  = st.View();
+   autoView(st_v ,st,AcceleratorRead);
-   auto out_Xm = out[0].View();
+   autoView(out_Xm,out[0],AcceleratorWrite);
-   auto out_Ym = out[1].View();
+   autoView(out_Ym,out[1],AcceleratorWrite);
-   auto out_Zm = out[2].View();
+   autoView(out_Zm,out[2],AcceleratorWrite);
-   auto out_Tm = out[3].View();
+   autoView(out_Tm,out[3],AcceleratorWrite);
-   auto out_Xp = out[4].View();
+   autoView(out_Xp,out[4],AcceleratorWrite);
-   auto out_Yp = out[5].View();
+   autoView(out_Yp,out[5],AcceleratorWrite);
-   auto out_Zp = out[6].View();
+   autoView(out_Zp,out[6],AcceleratorWrite);
-   auto out_Tp = out[7].View();
+   autoView(out_Tp,out[7],AcceleratorWrite);
-
+   auto CBp=st.CommBuf();
-   accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{
+   accelerator_for(sss,Nsite*Ls,Simd::Nsimd(),{
      int sU=sss/Ls;
      int sF =sss;
-      DhopDirXm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xm,0);
+      DhopDirXm(st_v,U_v,CBp,sF,sU,in_v,out_Xm,0);
-      DhopDirYm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Ym,1);
+      DhopDirYm(st_v,U_v,CBp,sF,sU,in_v,out_Ym,1);
-      DhopDirZm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zm,2);
+      DhopDirZm(st_v,U_v,CBp,sF,sU,in_v,out_Zm,2);
-      DhopDirTm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tm,3);
+      DhopDirTm(st_v,U_v,CBp,sF,sU,in_v,out_Tm,3);
-      DhopDirXp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xp,4);
+      DhopDirXp(st_v,U_v,CBp,sF,sU,in_v,out_Xp,4);
-      DhopDirYp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Yp,5);
+      DhopDirYp(st_v,U_v,CBp,sF,sU,in_v,out_Yp,5);
-      DhopDirZp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zp,6);
+      DhopDirZp(st_v,U_v,CBp,sF,sU,in_v,out_Zp,6);
-      DhopDirTp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tp,7);
+      DhopDirTp(st_v,U_v,CBp,sF,sU,in_v,out_Tp,7);
   });
 }
@@ -381,17 +383,18 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
  assert(dirdisp<=7);
  assert(dirdisp>=0);
-   auto U_v   = U.View();
+   autoView(U_v  ,U  ,AcceleratorRead);
-   auto in_v  = in.View();
+   autoView(in_v ,in ,AcceleratorRead);
-   auto out_v = out.View();
+   autoView(out_v,out,AcceleratorWrite);
-   auto st_v  = st.View();
+   autoView(st_v ,st ,AcceleratorRead);
   auto CBp=st.CommBuf();
 #define LoopBody(Dir)				\
   case Dir :					\
-     accelerator_forNB(ss,Nsite,Simd::Nsimd(),{	\
+     accelerator_for(ss,Nsite,Simd::Nsimd(),{	\
       for(int s=0;s<Ls;s++){			\
 	 int sU=ss;				\
 	 int sF = s+Ls*sU;						\
-	 DhopDir##Dir(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_v,dirdisp);\
+	 DhopDir##Dir(st_v,U_v,CBp,sF,sU,in_v,out_v,dirdisp);\
       }							       \
       });							       \
     break;
@@ -435,26 +438,26 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
 				     int Ls, int Nsite, const FermionField &in, FermionField &out,
 				     int interior,int exterior)
 {
-    auto U_v   =   U.View();
+    autoView(U_v  ,  U,AcceleratorRead);
-    auto in_v  =  in.View();
+    autoView(in_v , in,AcceleratorRead);
-    auto out_v = out.View();
+    autoView(out_v,out,AcceleratorWrite);
-    auto st_v  =  st.View();
+    autoView(st_v , st,AcceleratorRead);
   if( interior && exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSite);    return;}
 #endif
   } else if( interior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt);    return;}
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);    return;}
 #endif
   } else if( exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteExt); return;}
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt);    return;}
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteExt);    return;}
 #endif
@@ -466,26 +469,26 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
 					  int Ls, int Nsite, const FermionField &in, FermionField &out,
 					  int interior,int exterior)
  {
-    auto U_v   = U.View();
+    autoView(U_v  ,U,AcceleratorRead);
-    auto in_v  = in.View();
+    autoView(in_v ,in,AcceleratorRead);
-    auto out_v = out.View();
+    autoView(out_v,out,AcceleratorWrite);
-    auto st_v  = st.View();
+    autoView(st_v ,st,AcceleratorRead);
   if( interior && exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDag); return;}
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag);    return;}
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDag);     return;}
 #endif
   } else if( interior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt);    return;}
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagInt);     return;}
 #endif
   } else if( exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
-#ifndef GRID_NVCC
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt);    return;}
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagExt);     return;}
 #endif
@@ -493,5 +496,8 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
   assert(0 && " Kernel optimisation case not covered ");
  }
-NAMESPACE_END(Grid);
+#undef KERNEL_CALLNB
 #undef KERNEL_CALL
 #undef ASM_CALL
 NAMESPACE_END(Grid);
@@ -0,0 +1,36 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/NaiveStaggeredFermion.cc
 Copyright (C) 2015
 Author: Azusa Yamaguchi, Peter Boyle
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
 #include <Grid/Grid.h>
 NAMESPACE_BEGIN(Grid);
 // Static stencil tables for the naive staggered operator: eight entries each.
 // Entries 0-3 list directions 0..3 with displacement +1, entries 4-7 list the same
 // directions with displacement -1 (presumably the two tables are indexed in parallel).
 const std::vector<int> NaiveStaggeredFermionStatic::directions{0, 1, 2, 3, 0, 1, 2, 3};
 const std::vector<int> NaiveStaggeredFermionStatic::displacements{1, 1, 1, 1, -1, -1, -1, -1};
 NAMESPACE_END(Grid);
@@ -0,0 +1,37 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/NaiveStaggeredFermion.cc
 Copyright (C) 2015
 Author: Azusa Yamaguchi, Peter Boyle
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h>
 NAMESPACE_BEGIN(Grid);
 // impl.h is pulled in inside the Grid namespace.  IMPLEMENTATION is not defined in the
 // lines of this file; presumably it is supplied by the impl.h found next to each
 // copy/symlink of this master file - confirm against the instantiation directories.
 #include "impl.h"
 // Explicit instantiation of the class template for that implementation.
 template class NaiveStaggeredFermion<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../NaiveStaggeredFermionInstantiation.cc.master
@@ -0,0 +1 @@
 ../NaiveStaggeredFermionInstantiation.cc.master
@@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
@@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 // impl.h is pulled in inside the Grid namespace.  IMPLEMENTATION is not defined in the
 // lines of this file; presumably it is supplied by the impl.h of the directory this
 // copy lives in - confirm.
 #include "impl.h"
 // Explicit instantiation of WilsonKernels for that implementation.
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
@@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
@@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 // impl.h is pulled in inside the Grid namespace.  IMPLEMENTATION is not defined in the
 // lines of this file; presumably it is supplied by the impl.h of the directory this
 // copy lives in - confirm.
 #include "impl.h"
 // Explicit instantiation of WilsonKernels for that implementation.
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
@@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
@@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 // impl.h is pulled in inside the Grid namespace.  IMPLEMENTATION is not defined in the
 // lines of this file; presumably it is supplied by the impl.h of the directory this
 // copy lives in - confirm.
 #include "impl.h"
 // Explicit instantiation of WilsonKernels for that implementation.
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
@@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
@@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 // impl.h is pulled in inside the Grid namespace.  IMPLEMENTATION is not defined in the
 // lines of this file; presumably it is supplied by the impl.h of the directory this
 // copy lives in - confirm.
 #include "impl.h"
 // Explicit instantiation of WilsonKernels for that implementation.
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
@@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Show More
+++ b/Show More
		`@@ -0,0 +1 @@`
							`../NaiveStaggeredFermionInstantiation.cc.master`