mirror of https://github.com/paboyle/Grid.git synced 2025-06-15 14:27:06 +01:00

Compare commits


25 Commits

Author SHA1 Message Date
1ba25a0d8c more I/O benchmark code cleaning 2020-10-07 15:38:41 +01:00
9ba3647bdf script to convert I/O benchmark logs to CSV 2020-10-07 15:35:03 +01:00
5ee832f738 I/O benchmark code cleaning 2020-10-07 15:31:51 +01:00
e9c5a271a8 fixing potential issues with log alignment and timer I/O 2020-10-06 17:58:16 +01:00
acac2d6938 standard C/C++ I/O in benchmark 2020-10-06 17:57:00 +01:00
ace9cd64bb dpcpp happy 2020-09-29 08:03:46 -07:00
a3e2aeb603 dpcpp options happiness 2020-09-29 06:50:10 -07:00
049dd25785 Revert accidental commit thanks michael 2020-09-23 04:13:50 -04:00
d43d372294 Merge pull request #311 from mmphys/bugfix/MPIasynch
Asynchronous calls removed - reflect this in Communicator_none.cc
2020-09-22 10:41:48 -04:00
b71a081cba Asynchronous calls removed - reflect this in Communicator_none.cc
(Opportunistic doc update - OpenMP support on Mac OS)
2020-09-21 09:33:23 +01:00
c48909590b MPI asynch call removal 2020-09-17 20:47:32 +01:00
446ef40570 HIP IPC 2020-09-17 20:31:46 +01:00
81441e98f4 HIP runs sensible 2020-09-16 03:35:03 +01:00
ecd3f890f5 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-09-16 02:30:14 +01:00
1c881ce23c HIP does not like half2 visible members x and y so must define own Half2 2020-09-16 02:28:33 +01:00
dacbbdd051 Hip Happy Birthday 2020-09-16 00:37:02 +01:00
2859955a03 HIP requires "inline" 2020-09-16 00:36:13 +01:00
cc220abd1d inline for HIP 2020-09-16 00:35:38 +01:00
d1c0c0197e HipCC requires inline on definition 2020-09-16 00:35:06 +01:00
fd9424ef27 innlines required to make HIP happy 2020-09-16 00:34:32 +01:00
a5c35c4024 Make HIP / Vega happy 2020-09-16 00:33:53 +01:00
e03b64dc06 HIP default flaags to work on ROCM 2020-09-16 00:33:09 +01:00
4677c40195 HIP improvements 2020-09-16 00:32:27 +01:00
288c615782 Hip improvements 2020-09-16 00:31:50 +01:00
48e81cf6f8 Hip Pragmas 2020-09-16 00:31:03 +01:00
50 changed files with 968 additions and 1487 deletions

View File

@ -34,6 +34,12 @@
#define __SYCL__REDEFINE__
#endif
/* HIP save and restore compile environment*/
#ifdef GRID_HIP
#pragma push
#pragma push_macro("__HIP_DEVICE_COMPILE__")
#endif
#define EIGEN_NO_HIP
#include <Grid/Eigen/Dense>
#include <Grid/Eigen/unsupported/CXX11/Tensor>
@ -52,6 +58,12 @@
#pragma pop
#endif
/*HIP restore*/
#ifdef __HIP__REDEFINE__
#pragma pop_macro("__HIP_DEVICE_COMPILE__")
#pragma pop
#endif
#if defined __GNUC__
#pragma GCC diagnostic pop
#endif
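
The hunk above saves and restores the compile environment around the Eigen includes. A minimal sketch of the push_macro/pop_macro idiom it relies on, using a hypothetical THIRD_PARTY_MACRO in place of __HIP_DEVICE_COMPILE__ and a standard header standing in for Eigen:

// Sketch of the save/restore idiom (hypothetical macro name, not Grid code).
// push_macro remembers the current definition, the header is compiled without it,
// and pop_macro restores whatever was defined before the include.
#pragma push_macro("THIRD_PARTY_MACRO")
#undef THIRD_PARTY_MACRO
#include <vector>   // stands in for the guarded third-party header (Eigen above)
#pragma pop_macro("THIRD_PARTY_MACRO")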

View File

@ -49,11 +49,13 @@ inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner,
Lattice<dotp> fine_inner_msk(fine);
// Multiply could be fused with innerProduct
// Single block sum kernel could do both masks.
fine_inner = localInnerProduct(fineX,fineY);
mult(fine_inner_msk, fine_inner,FineMask);
blockSum(CoarseInner,fine_inner_msk);
}
class Geometry {
public:
int npoint;
@ -78,12 +80,8 @@ public:
}
directions [2*_d]=0;
displacements[2*_d]=0;
std::cout <<GridLogMessage << "Geometry "<<std::endl;
for(int p=0;p<npoint;p++){
std::cout <<GridLogMessage << "point " <<p<<" dir "<<directions[p]<<" delta " <<displacements[p]<<std::endl;
}
}
};
template<class Fobj,class CComplex,int nbasis>
@ -104,8 +102,8 @@ public:
Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) :
CoarseGrid(_CoarseGrid),
FineGrid(_FineGrid),
checkerboard(_checkerboard),
subspace(nbasis,_FineGrid)
subspace(nbasis,_FineGrid),
checkerboard(_checkerboard)
{
};
@ -287,8 +285,6 @@ public:
///////////////////////
GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know
virtual std::vector<int> Directions(void) { return geom.directions; };
virtual std::vector<int> Displacements(void){ return geom.displacements; };
void M (const CoarseVector &in, CoarseVector &out)
{
conformable(_grid,in.Grid());
@ -312,9 +308,6 @@ public:
int osites=Grid()->oSites();
autoView(st,Stencil,AcceleratorRead);
siteVector *CBp=Stencil.CommBuf();
accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
int ss = sss/nbasis;
int b = sss%nbasis;
@ -325,12 +318,12 @@ public:
for(int point=0;point<geom.npoint;point++){
SE=st.GetEntry(ptype,point,ss);
SE=Stencil.GetEntry(ptype,point,ss);
if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else {
nbr = coalescedRead(CBp[SE->_offset]);
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
}
acceleratorSynchronise();
@ -339,7 +332,7 @@ public:
}
}
coalescedWrite(out_v[ss](b),res);
});
});
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
};
@ -416,23 +409,38 @@ public:
MdirCalc(in,out[p],p);
}
};
void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp)
{
void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){
this->MdirComms(in);
int ndim = in.Grid()->Nd();
int point=-1;
for(int p=0;p<geom.npoint;p++){
if( (dir==geom.directions[p])&&(disp==geom.displacements[p])) point=p;
}
assert(point!=-1);// Must find
//////////////
// 4D action like wilson
// 0+ => 0
// 0- => 1
// 1+ => 2
// 1- => 3
// etc..
//////////////
// 5D action like DWF
// 1+ => 0
// 1- => 1
// 2+ => 2
// 2- => 3
// etc..
auto point = [dir, disp, ndim](){
if(dir == 0 and disp == 0)
return 8;
else if ( ndim==4 ) {
return (4 * dir + 1 - disp) / 2;
} else {
return (4 * (dir-1) + 1 - disp) / 2;
}
}();
std::cout <<GridLogMessage << "Mdir point "<<point<<" dir "<<dir<<" disp "<<disp <<std::endl;
for(int p=0;p<geom.npoint;p++){
std::cout <<GridLogMessage << "point " <<p<<" dir "<<geom.directions[p]<<" delta " <<geom.displacements[p]<<std::endl;
}
MdirCalc(in,out,point);
};
void Mdiag(const CoarseVector &in, CoarseVector &out)
@ -448,58 +456,10 @@ public:
geom(CoarseGrid._ndimension),
hermitian(hermitian_),
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
A(geom.npoint,&CoarseGrid)
A(geom.npoint,&CoarseGrid)
{
};
void Test(Aggregation<Fobj,CComplex,nbasis> &_Aggregates,GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop)
{
typedef Lattice<Fobj> FineField;
CoarseVector Cin(_grid);
CoarseVector Cout(_grid);
CoarseVector CFout(_grid);
FineField Fin(FineGrid);
FineField Fout(FineGrid);
std::vector<int> seeds({1,2,3,4,5});
GridParallelRNG RNG(_grid); RNG.SeedFixedIntegers(seeds);
gaussian(RNG,Cin);
_Aggregates.PromoteFromSubspace(Cin,Fin);
_Aggregates.ProjectToSubspace(Cin,Fin);
std::cout << GridLogMessage<< "************ "<<std::endl;
std::cout << GridLogMessage<< " Testing M "<<std::endl;
std::cout << GridLogMessage<< "************ "<<std::endl;
// Coarse operator
this->M(Cin,Cout);
// Fine projected operator
_Aggregates.PromoteFromSubspace(Cin,Fin);
linop.Op(Fin,Fout);
_Aggregates.ProjectToSubspace(CFout,Fout);
CFout = CFout-Cout;
RealD diff = norm2(CFout);
std::cout << GridLogMessage<< " diff "<<diff<<std::endl;
assert(diff<1.0e-5);
std::cout << GridLogMessage<< "************ "<<std::endl;
std::cout << GridLogMessage<< " Testing Mdag "<<std::endl;
std::cout << GridLogMessage<< "************ "<<std::endl;
// Coarse operator
Mdag(Cin,Cout);
// Fine operator
linop.AdjOp(Fin,Fout);
_Aggregates.ProjectToSubspace(CFout,Fout);
CFout = CFout-Cout;
diff = norm2(CFout);
std::cout << GridLogMessage<< " diff "<<diff<<std::endl;
assert(diff<1.0e-5);
}
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace)
{
@ -536,19 +496,8 @@ public:
CoarseScalar InnerProd(Grid());
std::cout << GridLogMessage<< "CoarsenMatrix Orthog " << std::endl;
// Orthogonalise the subblocks over the basis
blockOrthogonalise(InnerProd,Subspace.subspace);
std::cout << GridLogMessage<< "CoarsenMatrix Orthog done " << std::endl;
auto OpDirections = linop.Directions();
auto OpDisplacements = linop.Displacements();
std::cout<<" Coarsening an operator with "<< OpDirections.size()<<" terms "<<std::endl;
for(int p=0;p<OpDirections.size();p++) {
assert(OpDirections[p]==geom.directions[p]);
assert(OpDisplacements[p]==geom.displacements[p]);
}
// Compute the matrix elements of linop between this orthonormal
// set of vectors.
@ -584,27 +533,13 @@ public:
evenmask = where(mod(bcb,2)==(Integer)0,one,zero);
oddmask = one-evenmask;
/*
{
phi=Subspace.subspace[0];
linop.OpDirAll(phi,Mphi_p);
for(int p=0;p<geom.npoint-1;p++){
int dir=geom.directions[p];
int disp=geom.displacements[p];
linop.OpDir(phi,Mphi,dir,disp);
Mphi=Mphi-Mphi_p[p];
std::cout << GridLogMessage <<" Direction mapping check " <<norm2(Mphi)<<std::endl;
}
}
*/
assert(self_stencil!=-1);
int lhermitian=hermitian;
for(int i=0;i<nbasis;i++){
phi=Subspace.subspace[i];
std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl;
// std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl;
linop.OpDirAll(phi,Mphi_p);
linop.OpDiag (phi,Mphi_p[geom.npoint-1]);
@ -615,7 +550,7 @@ public:
int dir = geom.directions[p];
int disp = geom.displacements[p];
if ( (disp==-1) || (!lhermitian ) ) {
if ( (disp==-1) || (!hermitian ) ) {
////////////////////////////////////////////////////////////////////////
// Pick out contributions coming from this cell and neighbour cell
@ -633,23 +568,11 @@ public:
autoView( A_self , A[self_stencil], AcceleratorWrite);
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
if ( lhermitian && (disp==-1) ) {
for(int pp=0;pp<geom.npoint;pp++){// Find the opposite link and set <j|A|i> = <i|A|j>*
int dirp = geom.directions[pp];
int dispp = geom.displacements[pp];
if ( (dirp==dir) && (dispp==1) ){
auto sft = conjugate(Cshift(oZProj,dir,1));
autoView( sft_v , sft , AcceleratorWrite);
autoView( A_pp , A[pp], AcceleratorWrite);
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_pp[ss](i,j),sft_v(ss)); });
}
}
}
}
}
}
std::cout << GridLogMessage<< "CoarsenMatrix Diag "<<std::endl;
///////////////////////////////////////////
// Faster alternate self coupling.. use hermiticity to save 2x
///////////////////////////////////////////
@ -681,35 +604,31 @@ public:
}
}
MemoryManager::PrintBytes();
// Auto self test
Test( Subspace,FineGrid,linop);
#if 0
///////////////////////////
// test code worth preserving in if block
///////////////////////////
std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl;
for(int p=0;p<geom.npoint;p++){
std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl;
std::cout<<GridLogMessage<< "\n"<<A[p] << std::endl;
if(hermitian) {
std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
ForceHermitian();
}
std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl;
phi=Subspace.subspace[0];
std::vector<int> bc(FineGrid->_ndimension,0);
blockPick(Grid(),phi,tmp,bc); // Pick out a block
linop.Op(tmp,Mphi); // Apply big dop
blockProject(iProj,Mphi,Subspace.subspace); // project it and print it
std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl;
std::cout<<GridLogMessage<< iProj <<std::endl;
std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
#endif
}
void ForceHermitian(void) {
CoarseMatrix Diff (Grid());
for(int p=0;p<geom.npoint;p++){
int dir = geom.directions[p];
int disp = geom.displacements[p];
if(disp==-1) {
// Find the opposite link
for(int pp=0;pp<geom.npoint;pp++){
int dirp = geom.directions[pp];
int dispp = geom.displacements[pp];
if ( (dirp==dir) && (dispp==1) ){
// Diff = adj(Cshift(A[p],dir,1)) - A[pp];
// std::cout << GridLogMessage<<" Replacing stencil leg "<<pp<<" with leg "<<p<< " diff "<<norm2(Diff) <<std::endl;
A[pp] = adj(Cshift(A[p],dir,1));
}
}
}
}
}
};
NAMESPACE_END(Grid);
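
A standalone sketch (not Grid code) of the dir/disp to stencil-point mapping that the rewritten Mdir above computes inline; the function name stencilPoint and the assertions are illustrative only, following the 4D/5D convention stated in the comments:

// Hypothetical standalone check of the dir/disp -> point convention used in Mdir.
#include <cassert>

static int stencilPoint(int dir, int disp, int ndim) {
  if (dir == 0 && disp == 0) return 8;              // self-coupling / diagonal point
  if (ndim == 4) return (4 * dir + 1 - disp) / 2;   // 4D Wilson-like: 0+ => 0, 0- => 1, ...
  return (4 * (dir - 1) + 1 - disp) / 2;            // 5D DWF-like: dir starts at 1
}

int main() {
  assert(stencilPoint(0, +1, 4) == 0);  // 0+ => 0
  assert(stencilPoint(0, -1, 4) == 1);  // 0- => 1
  assert(stencilPoint(1, -1, 4) == 3);  // 1- => 3
  assert(stencilPoint(1, +1, 5) == 0);  // 5D: 1+ => 0
  return 0;
}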

View File

@ -52,9 +52,6 @@ public:
virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
virtual void HermOp(const Field &in, Field &out)=0;
virtual std::vector<int> Directions(void) =0;
virtual std::vector<int> Displacements(void)=0;
};
@ -79,9 +76,6 @@ class MdagMLinearOperator : public LinearOperatorBase<Field> {
public:
MdagMLinearOperator(Matrix &Mat): _Mat(Mat){};
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
@ -117,8 +111,6 @@ class ShiftedMdagMLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
RealD _shift;
public:
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
ShiftedMdagMLinearOperator(Matrix &Mat,RealD shift): _Mat(Mat), _shift(shift){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
@ -159,8 +151,6 @@ template<class Matrix,class Field>
class HermitianLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
public:
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
HermitianLinearOperator(Matrix &Mat): _Mat(Mat){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
@ -192,8 +182,6 @@ template<class Matrix,class Field>
class NonHermitianLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
public:
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
NonHermitianLinearOperator(Matrix &Mat): _Mat(Mat){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
@ -267,8 +255,6 @@ template<class Matrix,class Field>
class SchurDiagMooeeOperator : public SchurOperatorBase<Field> {
public:
Matrix &_Mat;
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
virtual void Mpc (const Field &in, Field &out) {
Field tmp(in.Grid());
@ -295,8 +281,6 @@ template<class Matrix,class Field>
protected:
Matrix &_Mat;
public:
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){};
virtual void Mpc (const Field &in, Field &out) {
@ -323,8 +307,6 @@ template<class Matrix,class Field>
protected:
Matrix &_Mat;
public:
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
virtual void Mpc (const Field &in, Field &out) {
@ -390,8 +372,6 @@ class NonHermitianSchurDiagMooeeOperator : public NonHermitianSchurOperatorBase
{
public:
Matrix& _Mat;
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
NonHermitianSchurDiagMooeeOperator(Matrix& Mat): _Mat(Mat){};
virtual void Mpc(const Field& in, Field& out) {
Field tmp(in.Grid());
@ -425,8 +405,6 @@ class NonHermitianSchurDiagOneOperator : public NonHermitianSchurOperatorBase<Fi
Matrix &_Mat;
public:
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
NonHermitianSchurDiagOneOperator (Matrix& Mat): _Mat(Mat){};
virtual void Mpc(const Field& in, Field& out) {
Field tmp(in.Grid());
@ -457,8 +435,6 @@ class NonHermitianSchurDiagTwoOperator : public NonHermitianSchurOperatorBase<Fi
Matrix& _Mat;
public:
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
NonHermitianSchurDiagTwoOperator(Matrix& Mat): _Mat(Mat){};
virtual void Mpc(const Field& in, Field& out) {
@ -499,8 +475,6 @@ class SchurStaggeredOperator : public SchurOperatorBase<Field> {
Field tmp;
RealD mass;
public:
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
{
assert( _Mat.isTrivialEE() );

View File

@ -48,8 +48,6 @@ public:
virtual void Mdiag (const Field &in, Field &out)=0;
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;
virtual std::vector<int> Directions(void) =0;
virtual std::vector<int> Displacements(void)=0;
};
/////////////////////////////////////////////////////////////////////////////////////////////
@ -75,8 +73,6 @@ public:
virtual void MooeeDag (const Field &in, Field &out)=0;
virtual void MooeeInvDag (const Field &in, Field &out)=0;
virtual std::vector<int> Directions(void) =0;
virtual std::vector<int> Displacements(void)=0;
};
NAMESPACE_END(Grid);

View File

@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -28,7 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG
#define GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG
NAMESPACE_BEGIN(Grid);
/*
* Compared to Tang-2009: P=Pleft. P^T = PRight Q=MssInv.
* Script A = SolverMatrix
@ -51,54 +50,53 @@ NAMESPACE_BEGIN(Grid);
* Vout = x
*/
template<class Field, class CoarseField, class Aggregates>
// abstract base
template<class Field, class CoarseField>
class TwoLevelFlexiblePcg : public LinearFunction<Field>
{
public:
int verbose;
RealD Tolerance;
Integer MaxIterations;
const int mmax = 4;
GridBase *FineGrid;
GridBase *CoarseGrid;
const int mmax = 5;
GridBase *grid;
GridBase *coarsegrid;
LinearOperatorBase<Field> &_Linop;
LinearFunction<Field> &_Smoother;
LinearFunction<CoarseField> &_CoarseSolver;
Aggregates &_Aggregates;
LinearOperatorBase<Field> *_Linop
OperatorFunction<Field> *_Smoother,
LinearFunction<CoarseField> *_CoarseSolver;
// Need somthing that knows how to get from Coarse to fine and back again
// more most opertor functions
TwoLevelFlexiblePcg(RealD tol,
Integer maxit,
LinearOperatorBase<Field> *Linop,
LinearFunction<Field> *Smoother,
LinearFunction<CoarseField> *CoarseSolver,
Aggregates *AggP
) :
Tolerance(tol),
MaxIterations(maxit),
_Linop(*Linop),
_Smoother(*Smoother),
_CoarseSolver(*CoarseSolver),
_Aggregates(*AggP)
Integer maxit,
LinearOperatorBase<Field> *Linop,
LinearOperatorBase<Field> *SmootherLinop,
OperatorFunction<Field> *Smoother,
OperatorFunction<CoarseField> CoarseLinop
) :
Tolerance(tol),
MaxIterations(maxit),
_Linop(Linop),
_PreconditionerLinop(PrecLinop),
_Preconditioner(Preconditioner)
{
CoarseGrid=_Aggregates.CoarseGrid;
FineGrid=_Aggregates.FineGrid;
verbose=0;
};
// The Pcg routine is common to all, but the various matrices differ from derived
// implementation to derived implmentation
void operator() (const Field &src, Field &psi){
void operator() (const Field &src, Field &psi){
psi.Checkerboard() = src.Checkerboard();
grid = src.Grid();
RealD f;
RealD rtzp,rtz,a,d,b;
// RealD rptzp;
// RealD tn;
RealD rptzp;
RealD tn;
RealD guess = norm2(psi);
RealD ssq = norm2(src);
RealD rsq = ssq*Tolerance*Tolerance;
@ -106,15 +104,15 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
/////////////////////////////
// Set up history vectors
/////////////////////////////
std::vector<Field> p (mmax,FineGrid);
std::vector<Field> mmp(mmax,FineGrid);
std::vector<Field> p (mmax,grid);
std::vector<Field> mmp(mmax,grid);
std::vector<RealD> pAp(mmax);
Field x (FineGrid); x = psi;
Field z (FineGrid);
Field tmp(FineGrid);
Field r (FineGrid);
Field mu (FineGrid);
Field x (grid); x = psi;
Field z (grid);
Field tmp(grid);
Field r (grid);
Field mu (grid);
//////////////////////////
// x0 = Vstart -- possibly modify guess
@ -123,13 +121,13 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
Vstart(x,src);
// r0 = b -A x0
_Linop.HermOp(x,mmp[0]); // Shouldn't this be something else?
HermOp(x,mmp); // Shouldn't this be something else?
axpy (r, -1.0,mmp[0], src); // Recomputes r=src-Ax0
//////////////////////////////////
// Compute z = M1 x
//////////////////////////////////
M1(r,z);
M1(r,z,tmp,mp,SmootherMirs);
rtzp =real(innerProduct(r,z));
///////////////////////////////////////
@ -145,7 +143,7 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
int peri_kp = (k+1) % mmax;
rtz=rtzp;
d= M3(p[peri_k],mmp[peri_k]);
d= M3(p[peri_k],mp,mmp[peri_k],tmp);
a = rtz/d;
// Memorise this
@ -155,13 +153,13 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
RealD rn = axpy_norm(r,-a,mmp[peri_k],r);
// Compute z = M x
M1(r,z);
M1(r,z,tmp,mp);
rtzp =real(innerProduct(r,z));
M2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
p[peri_kp]=mu;
p[peri_kp]=p[peri_k];
// Standard search direction p -> z + b p ; b =
b = (rtzp)/rtz;
@ -183,7 +181,7 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
// Stopping condition
if ( rn <= rsq ) {
_Linop.HermOp(x,mmp[0]); // Shouldn't this be something else?
HermOp(x,mmp); // Shouldn't this be something else?
axpy(tmp,-1.0,src,mmp[0]);
RealD psinorm = sqrt(norm2(x));
@ -192,8 +190,7 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage<<"TwoLevelfPcg: true residual is "<<true_residual<<std::endl;
std::cout<<GridLogMessage<<"TwoLevelfPcg: target residual was"<<Tolerance<<std::endl;
return;
return k;
}
}
// Non-convergence
@ -202,40 +199,48 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
public:
virtual void M1(Field & in, Field & out)
{// the smoother
virtual void M(Field & in,Field & out,Field & tmp) {
}
virtual void M1(Field & in, Field & out) {// the smoother
// [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
Field tmp(FineGrid);
Field Min(FineGrid);
Field tmp(grid);
Field Min(grid);
CoarseField PleftProj(CoarseGrid);
CoarseField PleftMss_proj(CoarseGrid);
PcgM(in,Min); // Smoother call
_Smoother(in,Min); // Smoother call
_Linop.HermOp(Min,out);
HermOp(Min,out);
axpy(tmp,-1.0,out,in); // tmp = in - A Min
_Aggregates.ProjectToSubspace(PleftProj,tmp);
_CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]
ProjectToSubspace(tmp,PleftProj);
ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]
axpy(out,1.0,Min,tmp); // Min+tmp
}
virtual void M2(const Field & in, Field & out)
{
virtual void M2(const Field & in, Field & out) {
out=in;
// Must override for Def2 only
// case PcgDef2:
// Pright(in,out);
// break;
}
virtual RealD M3(const Field & p, Field & mmp)
{
virtual RealD M3(const Field & p, Field & mmp){
double d,dd;
_Linop.HermOpAndNorm(p,mmp,d,dd);
HermOpAndNorm(p,mmp,d,dd);
return dd;
// Must override for Def1 only
// case PcgDef1:
// d=linop_d->Mprec(p,mmp,tmp,0,1);// Dag no
// linop_d->Mprec(mmp,mp,tmp,1);// Dag yes
// Pleft(mp,mmp);
// d=real(linop_d->inner(p,mmp));
}
virtual void Vstart(Field & x,const Field & src)
{
virtual void VstartDef2(Field & xconst Field & src){
//case PcgDef2:
//case PcgAdef2:
//case PcgAdef2f:
@ -251,79 +256,142 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
// = src_s - (A guess)_s - src_s + (A guess)_s
// = 0
///////////////////////////////////
Field r(FineGrid);
Field mmp(FineGrid);
CoarseField PleftProj(CoarseGrid);
CoarseField PleftMss_proj(CoarseGrid);
Field r(grid);
Field mmp(grid);
_Linop.HermOp(x,mmp);
HermOp(x,mmp);
axpy (r, -1.0, mmp, src); // r_{-1} = src - A x
_Aggregates.ProjectToSubspace(PleftProj,r);
_CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} r_s
_Aggregates.PromoteFromSubspace(PleftMss_proj,mmp);
ProjectToSubspace(r,PleftProj);
ApplyInverseCG(PleftProj,PleftMss_proj); // Ass^{-1} r_s
PromoteFromSubspace(PleftMss_proj,mmp);
x=x+mmp;
}
virtual void Vstart(Field & x,const Field & src){
return;
}
/////////////////////////////////////////////////////////////////////
// Only Def1 has non-trivial Vout. Override in Def1
/////////////////////////////////////////////////////////////////////
virtual void Vout (Field & in, Field & out,Field & src){
out = in;
//case PcgDef1:
// //Qb + PT x
// ProjectToSubspace(src,PleftProj);
// ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} r_s
// PromoteFromSubspace(PleftMss_proj,tmp);
//
// Pright(in,out);
//
// linop_d->axpy(out,tmp,out,1.0);
// break;
}
////////////////////////////////////////////////////////////////////////////////////////////////
// Pright and Pleft are common to all implementations
////////////////////////////////////////////////////////////////////////////////////////////////
virtual void Pright(Field & in,Field & out)
{
virtual void Pright(Field & in,Field & out){
// P_R = [ 1 0 ]
// [ -Mss^-1 Msb 0 ]
Field in_sbar(FineGrid);
Field in_sbar(grid);
CoarseField PleftProj(CoarseGrid);
CoarseField PleftMss_proj(CoarseGrid);
_Aggregates.ProjectToSubspace(PleftProj,in);
_Aggregates.PromoteFromSubspace(PleftProj,out);
ProjectToSubspace(in,PleftProj);
PromoteFromSubspace(PleftProj,out);
axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s
_Linop.HermOp(in_sbar,out);
_Aggregates.ProjectToSubspace(PleftProj,out); // Mssbar in_sbar (project)
HermOp(in_sbar,out);
ProjectToSubspace(out,PleftProj); // Mssbar in_sbar (project)
_CoarseSolver(PleftProj,PleftMss_proj); // Mss^{-1} Mssbar
_Aggregates.PromoteFromSubspace(PleftMss_proj,out); //
ApplyInverse (PleftProj,PleftMss_proj); // Mss^{-1} Mssbar
PromoteFromSubspace(PleftMss_proj,out); //
axpy(out,-1.0,out,in_sbar); // in_sbar - Mss^{-1} Mssbar in_sbar
}
virtual void Pleft (Field & in,Field & out)
{
virtual void Pleft (Field & in,Field & out){
// P_L = [ 1 -Mbs Mss^-1]
// [ 0 0 ]
Field in_sbar(FineGrid);
Field tmp2(FineGrid);
Field Mtmp(FineGrid);
Field in_sbar(grid);
Field tmp2(grid);
Field Mtmp(grid);
CoarseField PleftProj(CoarseGrid);
CoarseField PleftMss_proj(CoarseGrid);
_Aggregates.ProjectToSubspace(PleftProj,in);
_Aggregates.PromoteFromSubspace(PleftProj,out);
ProjectToSubspace(in,PleftProj);
PromoteFromSubspace(PleftProj,out);
axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s
_CoarseSolver(PleftProj,PleftMss_proj); // Mss^{-1} in_s
_Aggregates.PromoteFromSubspace(PleftMss_proj,out);
ApplyInverse(PleftProj,PleftMss_proj); // Mss^{-1} in_s
PromoteFromSubspace(PleftMss_proj,out);
_Linop.HermOp(out,Mtmp);
HermOp(out,Mtmp);
_Aggregates.ProjectToSubspace(PleftProj,Mtmp); // Msbar s Mss^{-1}
_Aggregates.PromoteFromSubspace(PleftProj,tmp2);
ProjectToSubspace(Mtmp,PleftProj); // Msbar s Mss^{-1}
PromoteFromSubspace(PleftProj,tmp2);
axpy(out,-1.0,tmp2,Mtmp);
axpy(out,-1.0,out,in_sbar); // in_sbar - Msbars Mss^{-1} in_s
}
};
NAMESPACE_END(Grid);
}
template<class Field>
class TwoLevelFlexiblePcgADef2 : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp){
}
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp){
}
virtual void M2(Field & in, Field & out){
}
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp){
}
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp){
}
}
/*
template<class Field>
class TwoLevelFlexiblePcgAD : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgDef1 : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
virtual void Vout (Field & in, Field & out,Field & src,Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgDef2 : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgV11: public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
*/
#endif
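
A minimal sketch (plain C++, not Grid code) of the additive two-level preconditioner that M1 above implements, out = Min + Q [ in - A Min ] with Min the smoothed input; the names Vec, Op, A, Smoother and Q (the project / coarse-solve / promote chain) are placeholders:

#include <vector>
#include <functional>

using Vec = std::vector<double>;
using Op  = std::function<Vec(const Vec&)>;

Vec axpy(double a, const Vec &x, const Vec &y) {   // returns y + a*x
  Vec r(y);
  for (std::size_t i = 0; i < r.size(); ++i) r[i] += a * x[i];
  return r;
}

Vec M1(const Vec &in, const Op &A, const Op &Smoother, const Op &Q) {
  Vec Min   = Smoother(in);            // smoother applied to the input
  Vec AMin  = A(Min);                  // fine operator on the smoothed vector
  Vec resid = axpy(-1.0, AMin, in);    // in - A Min
  Vec corr  = Q(resid);                // project, coarse solve, promote
  return axpy(1.0, corr, Min);         // Min + Q [ in - A Min ]
}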

View File

@ -60,8 +60,6 @@ public:
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
virtual void operator()(const Field &src,Field &guess) {
RealD t=-usecond();
guess = Zero();
assert(evec.size()==eval.size());
auto N = evec.size();
@ -70,8 +68,6 @@ public:
axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
}
guess.Checkerboard() = src.Checkerboard();
t+=usecond();
std::cout<<GridLogMessage<<"\t\t\t" << "Deflated guess took "<< t/1000.0<< "ms" <<std::endl;
}
};
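
A minimal sketch (plain C++, not Grid code) of the deflated guess assembled above: the source is projected onto each stored eigenvector and weighted by the inverse eigenvalue, guess = sum_i evec_i <evec_i, src> / eval_i; the types and helper names are illustrative:

#include <vector>
#include <complex>

using Vec = std::vector<std::complex<double>>;

std::complex<double> innerProduct(const Vec &a, const Vec &b) {
  std::complex<double> s(0.0, 0.0);
  for (std::size_t i = 0; i < a.size(); ++i) s += std::conj(a[i]) * b[i];
  return s;
}

Vec deflatedGuess(const std::vector<Vec> &evec, const std::vector<double> &eval, const Vec &src) {
  Vec guess(src.size(), std::complex<double>(0.0, 0.0));
  for (std::size_t i = 0; i < evec.size(); ++i) {
    std::complex<double> c = innerProduct(evec[i], src) / eval[i];    // <evec_i, src> / eval_i
    for (std::size_t j = 0; j < guess.size(); ++j) guess[j] += c * evec[i][j];
  }
  return guess;
}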

View File

@ -59,7 +59,7 @@ public:
GridBase *grid = src.Grid();
Field r(grid), p(grid), Ap(grid), Ar(grid), z(grid);
psi=Zero();
psi=zero;
r = src;
Preconditioner(r,p);

View File

@ -53,11 +53,7 @@ public:
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
#ifdef GRID_UVM
_Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
#else
_Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
#endif
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
@ -66,11 +62,7 @@ public:
{
size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes);
#ifdef GRID_UVM
MemoryManager::SharedFree((void *)__p,bytes);
#else
MemoryManager::CpuFree((void *)__p,bytes);
#endif
}
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop

View File

@ -9,13 +9,11 @@ NAMESPACE_BEGIN(Grid);
#define AccSmall (3)
#define Shared (4)
#define SharedSmall (5)
uint64_t total_cache;
uint64_t total_shared;
uint64_t total_device;
uint64_t total_host;;
void MemoryManager::PrintBytes(void)
{
std::cout << " MemoryManager : "<<total_cache <<" cache bytes "<<std::endl;
std::cout << " MemoryManager : "<<total_shared<<" shared bytes "<<std::endl;
std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
std::cout << " MemoryManager : "<<total_host <<" cpu bytes "<<std::endl;
@ -37,8 +35,6 @@ void *MemoryManager::AcceleratorAllocate(size_t bytes)
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocDevice(bytes);
total_device+=bytes;
} else {
// std::cout <<"AcceleratorAllocate: cache hit Device pointer "<<std::hex<<ptr<<std::dec<<" "<<bytes<<std::endl;
}
return ptr;
}
@ -57,10 +53,8 @@ void *MemoryManager::SharedAllocate(size_t bytes)
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes);
total_shared+=bytes;
// std::cout <<"SharedAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
// std::cout <<"AcceleratorAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
// PrintBytes();
} else {
// std::cout <<"SharedAllocate: cache hit Shared pointer "<<std::hex<<ptr<<std::dec<<" "<<bytes<<std::endl;
}
return ptr;
}
@ -80,9 +74,6 @@ void *MemoryManager::CpuAllocate(size_t bytes)
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes);
total_host+=bytes;
// std::cout <<"CpuAllocate: allocated Cpu pointer "<<std::hex<<ptr<<std::dec<<std::endl;
} else {
// std::cout <<"CpufAllocate: cache hit Cpu pointer "<<std::hex<<ptr<<std::dec<<" "<<bytes<<std::endl;
}
return ptr;
}
@ -129,7 +120,7 @@ void MemoryManager::Init(void)
str= getenv("GRID_ALLOC_NCACHE_LARGE");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc <= NallocCacheMax)) {
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[Cpu]=Nc;
Ncache[Acc]=Nc;
Ncache[Shared]=Nc;
@ -139,7 +130,7 @@ void MemoryManager::Init(void)
str= getenv("GRID_ALLOC_NCACHE_SMALL");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc <= NallocCacheMax)) {
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[CpuSmall]=Nc;
Ncache[AccSmall]=Nc;
Ncache[SharedSmall]=Nc;
@ -220,7 +211,6 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
if ( entries[v].valid ) {
ret = entries[v].address;
total_cache-=entries[v].bytes;
entries[v].valid = 0;
entries[v].address = NULL;
entries[v].bytes = 0;
@ -229,7 +219,6 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
entries[v].address=ptr;
entries[v].bytes =bytes;
entries[v].valid =1;
total_cache+=entries[v].bytes;
return ret;
}
@ -254,7 +243,6 @@ void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncach
for(int e=0;e<ncache;e++){
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
entries[e].valid = 0;
total_cache-=bytes;
return entries[e].address;
}
}
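
A minimal sketch (hypothetical, not the Grid implementation) of the size-matched pointer cache that Lookup and Insert maintain above: freed pointers are parked per exact size, a later allocation of the same size reuses one, and a round-robin victim slot is recycled when the cache is full; CacheEntry, CacheLookup and CacheInsert are illustrative names:

#include <cstddef>

struct CacheEntry { void *address; std::size_t bytes; int valid; };

void *CacheLookup(std::size_t bytes, CacheEntry *entries, int ncache) {
  for (int e = 0; e < ncache; e++) {
    if (entries[e].valid && entries[e].bytes == bytes) {
      entries[e].valid = 0;          // hand the cached pointer back to the caller
      return entries[e].address;
    }
  }
  return nullptr;                    // miss: the caller allocates fresh memory
}

void *CacheInsert(void *ptr, std::size_t bytes, CacheEntry *entries, int ncache, int &victim) {
  int v = victim;
  void *evicted = entries[v].valid ? entries[v].address : nullptr;  // caller must really free this
  entries[v].address = ptr;
  entries[v].bytes   = bytes;
  entries[v].valid   = 1;
  victim = (victim + 1) % ncache;    // round-robin victim selection
  return evicted;
}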

View File

@ -93,8 +93,8 @@ private:
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
public:
static void PrintBytes(void);
public:
static void Init(void);
static void InitMessage(void);
static void *AcceleratorAllocate(size_t bytes);

View File

@ -138,21 +138,6 @@ public:
int recv_from_rank,
int bytes);
void SendRecvPacket(void *xmit,
void *recv,
int xmit_to_rank,
int recv_from_rank,
int bytes);
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,
void *recv,

View File

@ -77,15 +77,6 @@ void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){}
void CartesianCommunicator::GlobalXOR(uint32_t &){}
void CartesianCommunicator::GlobalXOR(uint64_t &){}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int xmit_to_rank,
int recv_from_rank,
int bytes)
{
assert(0);
}
// Basic Halo comms primitive -- should never call in single node
void CartesianCommunicator::SendToRecvFrom(void *xmit,
@ -96,20 +87,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
{
assert(0);
}
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
assert(0);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
assert(0);
}
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{
bcopy(in,out,bytes*words);
@ -137,10 +114,6 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int recv_from_rank,
int bytes, int dir)
{
std::vector<CommsRequest_t> list;
// Discard the "dir"
SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
SendToRecvFromComplete(list);
return 2.0*bytes;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
@ -150,13 +123,10 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
int recv_from_rank,
int bytes, int dir)
{
// Discard the "dir"
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
return 2.0*bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
SendToRecvFromComplete(waitall);
}
void CartesianCommunicator::StencilBarrier(void){};

View File

@ -32,6 +32,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_CUDA
#include <cuda_runtime_api.h>
#endif
#ifdef GRID_HIP
#include <hip/hip_runtime_api.h>
#endif
NAMESPACE_BEGIN(Grid);
#define header "SharedMemoryMpi: "
@ -425,7 +428,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_CUDA
#if defined(GRID_CUDA) ||defined(GRID_HIP)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
@ -448,21 +451,15 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef GRID_MPI3_SHM_NONE
auto err = cudaMalloc(&ShmCommBuf, bytes);
#else
auto err = cudaMallocManaged(&ShmCommBuf, bytes);
#endif
if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed NULL pointer for " << bytes<<" bytes " << std::endl;
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
if ( WorldRank == 0 ){
std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
}
SharedMemoryZero(ShmCommBuf,bytes);
@ -475,15 +472,26 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
//////////////////////////////////////////////////
// If it is me, pass around the IPC access key
//////////////////////////////////////////////////
#ifdef GRID_CUDA
cudaIpcMemHandle_t handle;
if ( r==WorldShmRank ) {
err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
auto err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
}
#endif
#ifdef GRID_HIP
hipIpcMemHandle_t handle;
if ( r==WorldShmRank ) {
auto err = hipIpcGetMemHandle(&handle,ShmCommBuf);
if ( err != hipSuccess) {
std::cerr << " SharedMemoryMPI.cc hipIpcGetMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
}
#endif
//////////////////////////////////////////////////
// Share this IPC handle across the Shm Comm
//////////////////////////////////////////////////
@ -500,13 +508,24 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// If I am not the source, overwrite thisBuf with remote buffer
///////////////////////////////////////////////////////////////
void * thisBuf = ShmCommBuf;
#ifdef GRID_CUDA
if ( r!=WorldShmRank ) {
err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
}
#endif
#ifdef GRID_HIP
if ( r!=WorldShmRank ) {
auto err = hipIpcOpenMemHandle(&thisBuf,handle,hipIpcMemLazyEnablePeerAccess);
if ( err != hipSuccess) {
std::cerr << " SharedMemoryMPI.cc hipIpcOpenMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
}
#endif
///////////////////////////////////////////////////////////////
// Save a copy of the device buffers
///////////////////////////////////////////////////////////////
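
A minimal sketch (CUDA path only, hypothetical helper names) of the IPC exchange performed above: the rank owning a device buffer exports a memory handle, and every other rank on the node opens that handle to obtain a directly addressable pointer into the owner's buffer; the HIP branch is the same pattern with the hipIpc* calls:

#include <cuda_runtime_api.h>
#include <cstdio>
#include <cstdlib>

// Owner rank: export an IPC handle for a device allocation.
void exportHandle(void *devBuf, cudaIpcMemHandle_t &handle) {
  cudaError_t err = cudaIpcGetMemHandle(&handle, devBuf);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "cudaIpcGetMemHandle: %s\n", cudaGetErrorString(err));
    std::exit(EXIT_FAILURE);
  }
}

// Peer rank: open the received handle to map the owner's buffer into this process.
void *importHandle(cudaIpcMemHandle_t handle) {
  void *peerBuf = nullptr;
  cudaError_t err = cudaIpcOpenMemHandle(&peerBuf, handle, cudaIpcMemLazyEnablePeerAccess);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "cudaIpcOpenMemHandle: %s\n", cudaGetErrorString(err));
    std::exit(EXIT_FAILURE);
  }
  return peerBuf;
}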

View File

@ -36,7 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_local.h>
#include <Grid/lattice/Lattice_reduction.h>
#include <Grid/lattice/Lattice_peekpoke.h>
#include <Grid/lattice/Lattice_reality.h>
//#include <Grid/lattice/Lattice_reality.h>
#include <Grid/lattice/Lattice_real_imag.h>
#include <Grid/lattice/Lattice_comparison_utils.h>
#include <Grid/lattice/Lattice_comparison.h>

View File

@ -60,9 +60,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
autoView( lhs_v , lhs, AcceleratorRead);
autoView( rhs_v , rhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
auto rhs_t=rhs_v(ss);
auto tmp =ret_v(ss);
mac(&tmp,&lhs_t,&rhs_t);
coalescedWrite(ret_v[ss],tmp);
});
@ -124,7 +124,7 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
autoView( ret_v , ret, AcceleratorWrite);
autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto tmp =ret_v(ss);
auto lhs_t=lhs_v(ss);
mac(&tmp,&lhs_t,&rhs);
coalescedWrite(ret_v[ss],tmp);
@ -182,7 +182,7 @@ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
autoView( ret_v , ret, AcceleratorWrite);
autoView( rhs_v , lhs, AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto tmp =ret_v(ss);
auto rhs_t=rhs_v(ss);
mac(&tmp,&lhs,&rhs_t);
coalescedWrite(ret_v[ss],tmp);
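
The two variants shown in the hunks above differ in whether tmp starts from the existing contents of ret_v(ss); when it does, mac accumulates into the destination. A minimal sketch (plain C++, the vector type is a stand-in for the lattice views) of that multiply-accumulate semantics:

#include <vector>

// ret = ret + lhs * rhs, element by element: the destination is read, updated, written back.
void mac(std::vector<double> &ret, const std::vector<double> &lhs, const std::vector<double> &rhs) {
  for (std::size_t i = 0; i < ret.size(); ++i) ret[i] += lhs[i] * rhs[i];
}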

View File

@ -2,12 +2,13 @@ NAMESPACE_BEGIN(Grid);
#ifdef GRID_HIP
extern hipDeviceProp_t *gpu_props;
#define WARP_SIZE 64
#endif
#ifdef GRID_CUDA
extern cudaDeviceProp *gpu_props;
#define WARP_SIZE 32
#endif
#define WARP_SIZE 32
__device__ unsigned int retirementCount = 0;
template <class Iterator>
@ -64,7 +65,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
// cannot use overloaded operators for sobj as they are not volatile-qualified
memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
__syncwarp();
acceleratorSynchronise();
const Iterator VEC = WARP_SIZE;
const Iterator vid = tid & (VEC-1);
@ -78,9 +79,9 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
beta += temp;
memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
}
__syncwarp();
acceleratorSynchronise();
}
__syncthreads();
acceleratorSynchroniseAll();
if (threadIdx.x == 0) {
beta = Zero();
@ -90,7 +91,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
}
memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
}
__syncthreads();
acceleratorSynchroniseAll();
}

View File

@ -52,7 +52,6 @@ public:
// This will be safe to call from accelerator_for and is trivially copy constructible
// The copy constructor for this will need to be used by device lambda functions
/////////////////////////////////////////////////////////////////////////////////////////
#undef LATTICE_BOUNDS_CHECK
template<class vobj>
class LatticeView : public LatticeAccelerator<vobj>
{
@ -62,36 +61,14 @@ public:
void * cpu_ptr;
#ifdef GRID_SIMT
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const {
#ifdef LATTICE_BOUNDS_CHECK
assert(i<this->_odata_size);
assert(i>=0);
#endif
return coalescedRead(this->_odata[i]);
}
#else
accelerator_inline const vobj & operator()(size_t i) const {
#ifdef LATTICE_BOUNDS_CHECK
assert(i<this->_odata_size);
assert(i>=0);
#endif
return this->_odata[i];
}
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
#endif
accelerator_inline const vobj & operator[](size_t i) const {
#ifdef LATTICE_BOUNDS_CHECK
assert(i<this->_odata_size);
assert(i>=0);
#endif
return this->_odata[i];
};
accelerator_inline vobj & operator[](size_t i) {
#ifdef LATTICE_BOUNDS_CHECK
assert(i<this->_odata_size);
assert(i>=0);
#endif
return this->_odata[i];
};
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
accelerator_inline uint64_t begin(void) const { return 0;};
accelerator_inline uint64_t end(void) const { return this->_odata_size; };

View File

@ -130,6 +130,8 @@ public:
friend std::ostream& operator<< (std::ostream& stream, Logger& log){
if ( log.active ) {
std::ios_base::fmtflags f(stream.flags());
stream << log.background()<< std::left;
if (log.topWidth > 0)
{
@ -152,6 +154,8 @@ public:
<< now << log.background() << " : " ;
}
stream << log.colour();
stream.flags(f);
return stream;
} else {
return devnull;

View File

@ -77,16 +77,9 @@ const int SpinorIndex = 2;
template<typename T> struct isSpinor {
static constexpr bool value = (SpinorIndex==T::TensorLevel);
};
const int CoarseIndex = 4;
template<typename T> struct isCoarsened {
static constexpr bool value = (CoarseIndex<=T::TensorLevel);
};
template <typename T> using IfSpinor = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;
template <typename T> using IfCoarsened = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;
// ChrisK very keen to add extra space for Gparity doubling.
//
// Also add domain wall index, in a way where Wilson operator

View File

@ -89,8 +89,7 @@ public:
virtual void Mdiag (const FermionField &in, FermionField &out) { Mooee(in,out);}; // Same as Mooee applied to both CB's
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
virtual std::vector<int> Directions(void) =0;
virtual std::vector<int> Displacements(void)=0;
virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);};

View File

@ -44,9 +44,6 @@ public:
INHERIT_IMPL_TYPES(Impl);
typedef StaggeredKernels<Impl> Kernels;
virtual std::vector<int> Directions(void) { return this->directions; };
virtual std::vector<int> Displacements(void){ return this->displacements;};
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }

View File

@ -49,9 +49,6 @@ public:
INHERIT_IMPL_TYPES(Impl);
typedef StaggeredKernels<Impl> Kernels;
virtual std::vector<int> Directions(void) { return this->directions; };
virtual std::vector<int> Displacements(void){ return this->displacements;};
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }

View File

@ -47,9 +47,6 @@ public:
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
virtual std::vector<int> Directions(void) { return this->directions; };
virtual std::vector<int> Displacements(void){ return this->displacements;};
////////////////////////////////////////
// Performance monitoring
////////////////////////////////////////

View File

@ -63,17 +63,20 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
///////////////////////////////////////////////////////////////////////////////////////
// Generic Nc kernels
///////////////////////////////////////////////////////////////////////////////////////
template<int Naik> accelerator_inline
template<int Naik>
static accelerator_inline
void DhopSiteGeneric(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
template<int Naik> accelerator_inline
template<int Naik> static accelerator_inline
void DhopSiteGenericInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
template<int Naik> accelerator_inline
template<int Naik> static accelerator_inline
void DhopSiteGenericExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
@ -82,17 +85,20 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
///////////////////////////////////////////////////////////////////////////////////////
// Nc=3 specific kernels
///////////////////////////////////////////////////////////////////////////////////////
template<int Naik> accelerator_inline
template<int Naik> static accelerator_inline
void DhopSiteHand(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
template<int Naik> accelerator_inline
template<int Naik> static accelerator_inline
void DhopSiteHandInt(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
template<int Naik> accelerator_inline
template<int Naik> static accelerator_inline
void DhopSiteHandExt(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
@ -101,6 +107,7 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
///////////////////////////////////////////////////////////////////////////////////////
// Asm Nc=3 specific kernels
///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,

View File

@ -63,9 +63,6 @@ public:
INHERIT_IMPL_TYPES(Impl);
typedef WilsonKernels<Impl> Kernels;
virtual std::vector<int> Directions(void) { return this->directions; };
virtual std::vector<int> Displacements(void){ return this->displacements;};
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////

View File

@ -72,9 +72,6 @@ public:
typedef WilsonKernels<Impl> Kernels;
PmuStat stat;
virtual std::vector<int> Directions(void) { return this->directions; };
virtual std::vector<int> Displacements(void){ return this->displacements;};
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }

View File

@ -79,8 +79,6 @@ public:
_Mat.M(in,tmp);
G5R5(out,tmp);
}
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
};
@ -129,8 +127,6 @@ public:
_Mat.M(in,tmp);
out=g5*tmp;
}
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
};
NAMESPACE_END(Grid);

View File

@ -799,7 +799,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField tmp(UGrid);
PropagatorField Utmp(UGrid);
LatticeInteger zz (UGrid); zz=0.0;
PropagatorField zz (UGrid); zz=0.0;
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
for (int s=0;s<Ls;s++) {
@ -850,7 +850,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField tmp(UGrid);
PropagatorField Utmp(UGrid);
LatticeInteger zz (UGrid); zz=0.0;
PropagatorField zz (UGrid); zz=0.0;
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
for(int s=0;s<Ls;s++){

View File

@ -146,7 +146,7 @@ NAMESPACE_BEGIN(Grid);
template <class Impl>
template <int Naik>
template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int sF, int sU,
@ -221,7 +221,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
template <class Impl>
template <int Naik>
template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int sF, int sU,
@ -300,7 +300,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
template <class Impl>
template <int Naik>
template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int sF, int sU,

View File

@ -78,7 +78,7 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
// Int, Ext, Int+Ext cases for comms overlap
////////////////////////////////////////////////////////////////////////////////////
template <class Impl>
template <int Naik>
template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int sF, int sU,
@ -126,7 +126,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
// Only contributions from interior of our node
///////////////////////////////////////////////////
template <class Impl>
template <int Naik>
template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int sF, int sU,
@ -174,7 +174,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
// Only contributions from exterior of our node
///////////////////////////////////////////////////
template <class Impl>
template <int Naik>
template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int sF, int sU,
@ -224,7 +224,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
////////////////////////////////////////////////////////////////////////////////////
// Driving / wrapping routine to select right kernel
////////////////////////////////////////////////////////////////////////////////////
template <class Impl>
template <class Impl>
void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp)
{
@ -253,7 +253,7 @@ void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldVie
ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
});
template <class Impl>
template <class Impl>
void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
@ -293,7 +293,7 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
}
assert(0 && " Kernel optimisation case not covered ");
}
template <class Impl>
template <class Impl>
void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag, int interior,int exterior)

View File

@ -133,14 +133,14 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
pickCheckerboard(Even, CloverTermEven, CloverTerm);
pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
pickCheckerboard(Even, CloverTermDagEven, closure(adj(CloverTerm)));
pickCheckerboard(Odd, CloverTermDagOdd, closure(adj(CloverTerm)));
pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
pickCheckerboard(Even, CloverTermInvDagEven, closure(adj(CloverTermInv)));
pickCheckerboard(Odd, CloverTermInvDagOdd, closure(adj(CloverTermInv)));
}
template <class Impl>
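
The change above wraps adj(...) in closure(), presumably so the lazy expression is materialised into a concrete lattice before being passed to pickCheckerboard. A minimal sketch (not Grid code; Field, AdjExpr and the trivial bodies are illustrative) of that expression-template pattern:

#include <vector>

struct Field { std::vector<double> data; };

struct AdjExpr { const Field &arg; };                    // lazy "adj(x)" node
AdjExpr adj(const Field &x) { return AdjExpr{x}; }

Field closure(const AdjExpr &e) {                        // force evaluation into a concrete Field
  Field out;
  out.data = e.arg.data;                                 // a real implementation applies the adjoint here
  return out;
}

void pickCheckerboard(int cb, Field &half, const Field &full) {
  (void)cb; half = full;                                 // placeholder: copy instead of selecting a parity
}

int main() {
  Field CloverTerm{std::vector<double>(8, 1.0)};
  Field even;
  pickCheckerboard(0, even, closure(adj(CloverTerm)));   // materialise before passing
  return 0;
}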

View File

@ -646,7 +646,7 @@ NAMESPACE_BEGIN(Grid);
HAND_RESULT_EXT(ss,F)
#define HAND_SPECIALISE_GPARITY(IMPL) \
template<> void \
template<> accelerator_inline void \
WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \
@ -662,7 +662,7 @@ NAMESPACE_BEGIN(Grid);
HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> void \
template<> accelerator_inline void \
WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \
@ -678,7 +678,7 @@ NAMESPACE_BEGIN(Grid);
HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> void \
template<> accelerator_inline void \
WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \
@ -694,7 +694,7 @@ NAMESPACE_BEGIN(Grid);
HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> void \
template<> accelerator_inline void \
WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \
@ -710,7 +710,7 @@ NAMESPACE_BEGIN(Grid);
HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> void \
template<> accelerator_inline void \
WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \
@ -727,7 +727,7 @@ NAMESPACE_BEGIN(Grid);
nmu = 0; \
HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
template<> void \
template<> accelerator_inline void \
WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \
View File
@ -495,7 +495,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
template<class Impl> void
template<class Impl> accelerator_inline void
WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
@ -519,7 +519,7 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
HAND_RESULT(ss);
}
template<class Impl>
template<class Impl> accelerator_inline
void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
@ -542,7 +542,7 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
HAND_RESULT(ss);
}
template<class Impl> void
template<class Impl> accelerator_inline void
WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
@ -566,7 +566,7 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si
HAND_RESULT(ss);
}
template<class Impl>
template<class Impl> accelerator_inline
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
@ -589,7 +589,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
HAND_RESULT(ss);
}
template<class Impl> void
template<class Impl> accelerator_inline void
WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
@ -614,7 +614,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si
HAND_RESULT_EXT(ss);
}
template<class Impl>
template<class Impl> accelerator_inline
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
View File
@ -114,7 +114,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
////////////////////////////////////////////////////////////////////
// All legs kernels ; comms then compute
////////////////////////////////////////////////////////////////////
template <class Impl>
template <class Impl> accelerator_inline
void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out)
@ -140,7 +140,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV
coalescedWrite(out[sF],result,lane);
};
template <class Impl>
template <class Impl> accelerator_inline
void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out)
@ -169,7 +169,7 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView
////////////////////////////////////////////////////////////////////
// Interior kernels
////////////////////////////////////////////////////////////////////
template <class Impl>
template <class Impl> accelerator_inline
void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out)
@ -197,7 +197,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi
coalescedWrite(out[sF], result,lane);
};
template <class Impl>
template <class Impl> accelerator_inline
void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out)
@ -227,7 +227,7 @@ void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeField
////////////////////////////////////////////////////////////////////
// Exterior kernels
////////////////////////////////////////////////////////////////////
template <class Impl>
template <class Impl> accelerator_inline
void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out)
@ -258,7 +258,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi
}
};
template <class Impl>
template <class Impl> accelerator_inline
void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out)
@ -290,7 +290,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeField
};
#define DhopDirMacro(Dir,spProj,spRecon) \
template <class Impl> \
template <class Impl> accelerator_inline \
void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \
int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
{ \
@ -318,7 +318,7 @@ DhopDirMacro(Ym,spProjYm,spReconYm);
DhopDirMacro(Zm,spProjZm,spReconZm);
DhopDirMacro(Tm,spProjTm,spReconTm);
template <class Impl>
template <class Impl> accelerator_inline
void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
{
View File
@ -128,6 +128,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
}
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProjTm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
hspin(0)=fspin(0)-fspin(2);
hspin(1)=fspin(1)-fspin(3);
}
@ -137,50 +138,40 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
* 0 0 -1 0
* 0 0 0 -1
*/
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5p (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
hspin(0)=fspin(0);
hspin(1)=fspin(1);
}
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5m (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
hspin(0)=fspin(2);
hspin(1)=fspin(3);
}
// template<class vtype> accelerator_inline void fspProj5p (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin)
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5p (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
rfspin(0)=fspin(0);
rfspin(1)=fspin(1);
rfspin(2)=Zero();
rfspin(3)=Zero();
}
// template<class vtype> accelerator_inline void fspProj5m (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin)
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5m (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
rfspin(0)=Zero();
rfspin(1)=Zero();
rfspin(2)=fspin(2);
rfspin(3)=fspin(3);
}
template<class vtype,int N,IfCoarsened<iVector<vtype,N> > = 0> accelerator_inline void spProj5p (iVector<vtype,N> &rfspin,const iVector<vtype,N> &fspin)
{
const int hN = N>>1;
View File
}
}
template<class vtype,int N,IfCoarsened<iVector<vtype,N> > = 0> accelerator_inline void spProj5m (iVector<vtype,N> &rfspin,const iVector<vtype,N> &fspin)
{
const int hN = N>>1;
for(int s=0;s<hN;s++){
rfspin(s)=Zero();
rfspin(s+hN)=fspin(s+hN);
}
}
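A minimal stand-alone sketch of what the new coarsened spProj5p/spProj5m overloads above amount to (std::array is only an illustrative stand-in for iVector, not a Grid type): in the coarse basis, chirality simply splits the N components in half, with P5p keeping [0, N/2) and zeroing the rest, and P5m doing the opposite.
#include <array>
#include <cstddef>
// Keep the lower half [0, N/2), zero the upper half.
template <typename T, std::size_t N>
std::array<T, N> proj5p(const std::array<T, N> &f) {
  std::array<T, N> r{};                      // value-initialised, upper half stays zero
  for (std::size_t s = 0; s < N / 2; ++s) r[s] = f[s];
  return r;
}
// Keep the upper half [N/2, N), zero the lower half.
template <typename T, std::size_t N>
std::array<T, N> proj5m(const std::array<T, N> &f) {
  std::array<T, N> r{};                      // lower half stays zero
  for (std::size_t s = N / 2; s < N; ++s) r[s] = f[s];
  return r;
}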
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Reconstruction routines to move back again to four spin
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -192,6 +183,7 @@ template<class vtype,int N,IfCoarsened<iVector<vtype,N> > = 0> accelerator_inlin
*/
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconXp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)=hspin(0);
fspin(1)=hspin(1);
fspin(2)=timesMinusI(hspin(1));
@ -199,6 +191,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
}
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconXm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)=hspin(0);
fspin(1)=hspin(1);
fspin(2)=timesI(hspin(1));
@ -206,6 +199,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
}
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconXp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)+=hspin(0);
fspin(1)+=hspin(1);
fspin(2)-=timesI(hspin(1));
@ -213,6 +207,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
}
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconXm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)+=hspin(0);
fspin(1)+=hspin(1);
fspin(2)+=timesI(hspin(1));
@ -226,6 +221,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconYp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)=hspin(0);
fspin(1)=hspin(1);
fspin(2)= hspin(1);
@ -233,6 +229,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
}
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconYm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)=hspin(0);
fspin(1)=hspin(1);
fspin(2)=-hspin(1);
@ -240,6 +237,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
}
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconYp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)+=hspin(0);
fspin(1)+=hspin(1);
fspin(2)+=hspin(1);
@ -247,6 +245,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
}
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconYm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)+=hspin(0);
fspin(1)+=hspin(1);
fspin(2)-=hspin(1);
@ -261,6 +260,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
*/
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconZp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)=hspin(0);
fspin(1)=hspin(1);
fspin(2)=timesMinusI(hspin(0));
@ -268,6 +268,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
}
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconZm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)=hspin(0);
fspin(1)=hspin(1);
fspin(2)= timesI(hspin(0));
@ -275,6 +276,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
}
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconZp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)+=hspin(0);
fspin(1)+=hspin(1);
fspin(2)-=timesI(hspin(0));
@ -282,6 +284,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
}
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconZm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)+=hspin(0);
fspin(1)+=hspin(1);
fspin(2)+=timesI(hspin(0));
@ -295,6 +298,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
*/
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconTp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)=hspin(0);
fspin(1)=hspin(1);
fspin(2)=hspin(0);
@ -302,6 +306,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
}
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconTm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)=hspin(0);
fspin(1)=hspin(1);
fspin(2)=-hspin(0);
@ -309,6 +314,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
}
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconTp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)+=hspin(0);
fspin(1)+=hspin(1);
fspin(2)+=hspin(0);
@ -316,6 +322,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
}
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconTm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)+=hspin(0);
fspin(1)+=hspin(1);
fspin(2)-=hspin(0);
@ -329,6 +336,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
*/
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spRecon5p (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)=hspin(0)+hspin(0); // add is lower latency than mul
fspin(1)=hspin(1)+hspin(1); // probably no measurable difference though
fspin(2)=Zero();
@ -336,6 +344,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
}
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spRecon5m (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)=Zero();
fspin(1)=Zero();
fspin(2)=hspin(0)+hspin(0);
@ -343,6 +352,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
}
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumRecon5p (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)+=hspin(0)+hspin(0);
fspin(1)+=hspin(1)+hspin(1);
}
@ -362,6 +372,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
//////////
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjXp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
spProjXp(hspin._internal[i],fspin._internal[i]);
}
@ -415,21 +426,26 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconXp (iM
}}
}
////////
// Xm
////////
template<class rtype,class vtype> accelerator_inline void spProjXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spProjXm(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
spProjXm(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N> accelerator_inline void spProjXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
spProjXm(hspin._internal[i][j],fspin._internal[i][j]);
@ -439,16 +455,19 @@ template<class rtype,class vtype,int N> accelerator_inline void spProjXm (iMatri
template<class rtype,class vtype> accelerator_inline void spReconXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spReconXm(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
spReconXm(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N> accelerator_inline void spReconXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
spReconXm(hspin._internal[i][j],fspin._internal[i][j]);
@ -457,37 +476,45 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconXm (iMatr
template<class rtype,class vtype> accelerator_inline void accumReconXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
accumReconXm(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
accumReconXm(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N> accelerator_inline void accumReconXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
accumReconXm(hspin._internal[i][j],fspin._internal[i][j]);
}}
}
////////
// Yp
////////
template<class rtype,class vtype> accelerator_inline void spProjYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spProjYp(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
spProjYp(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N> accelerator_inline void spProjYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
spProjYp(hspin._internal[i][j],fspin._internal[i][j]);
@ -497,16 +524,19 @@ template<class rtype,class vtype,int N> accelerator_inline void spProjYp (iMatri
template<class rtype,class vtype> accelerator_inline void spReconYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spReconYp(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
spReconYp(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N> accelerator_inline void spReconYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
spReconYp(hspin._internal[i][j],fspin._internal[i][j]);
@ -515,55 +545,66 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconYp (iMatr
template<class rtype,class vtype> accelerator_inline void accumReconYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
accumReconYp(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
accumReconYp(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N> accelerator_inline void accumReconYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
accumReconYp(hspin._internal[i][j],fspin._internal[i][j]);
}}
}
////////
// Ym
////////
template<class rtype,class vtype> accelerator_inline void spProjYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spProjYm(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
spProjYm(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N> accelerator_inline void spProjYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
spProjYm(hspin._internal[i][j],fspin._internal[i][j]);
}}
}
template<class rtype,class vtype> accelerator_inline void spReconYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spReconYm(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,const iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
spReconYm(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N> accelerator_inline void spReconYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
spReconYm(hspin._internal[i][j],fspin._internal[i][j]);
@ -572,16 +613,19 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconYm (iMatr
template<class rtype,class vtype> accelerator_inline void accumReconYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
accumReconYm(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
accumReconYm(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N> accelerator_inline void accumReconYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
accumReconYm(hspin._internal[i][j],fspin._internal[i][j]);
@ -594,57 +638,66 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconYm (iM
////////
template<class rtype,class vtype> accelerator_inline void spProjZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spProjZp(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
spProjZp(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N> accelerator_inline void spProjZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
spProjZp(hspin._internal[i][j],fspin._internal[i][j]);
}}
}}
}
template<class rtype,class vtype> accelerator_inline void spReconZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spReconZp(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
spReconZp(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N> accelerator_inline void spReconZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
spReconZp(hspin._internal[i][j],fspin._internal[i][j]);
}}
}}
}
template<class rtype,class vtype> accelerator_inline void accumReconZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
accumReconZp(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
accumReconZp(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N> accelerator_inline void accumReconZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
accumReconZp(hspin._internal[i][j],fspin._internal[i][j]);
}}
}}
}
@ -653,53 +706,62 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconZp (iM
////////
template<class rtype,class vtype> accelerator_inline void spProjZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spProjZm(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
spProjZm(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N> accelerator_inline void spProjZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
spProjZm(hspin._internal[i][j],fspin._internal[i][j]);
}}
}}
}
template<class rtype,class vtype> accelerator_inline void spReconZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spReconZm(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
spReconZm(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N> accelerator_inline void spReconZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
spReconZm(hspin._internal[i][j],fspin._internal[i][j]);
}}
}}
}
template<class rtype,class vtype> accelerator_inline void accumReconZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
accumReconZm(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
accumReconZm(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N> accelerator_inline void accumReconZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
accumReconZm(hspin._internal[i][j],fspin._internal[i][j]);
@ -712,35 +774,41 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconZm (iM
////////
template<class rtype,class vtype> accelerator_inline void spProjTp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spProjTp(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjTp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
spProjTp(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N> accelerator_inline void spProjTp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
spProjTp(hspin._internal[i][j],fspin._internal[i][j]);
}}
}}
}
template<class rtype,class vtype> accelerator_inline void spReconTp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spReconTp(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconTp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
spReconTp(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N> accelerator_inline void spReconTp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
spReconTp(hspin._internal[i][j],fspin._internal[i][j]);
@ -749,37 +817,44 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconTp (iMatr
template<class rtype,class vtype> accelerator_inline void accumReconTp (iScalar<rtype> &hspin, iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
accumReconTp(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconTp (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
accumReconTp(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N> accelerator_inline void accumReconTp (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
accumReconTp(hspin._internal[i][j],fspin._internal[i][j]);
}}
}
////////
// Tm
////////
template<class rtype,class vtype> accelerator_inline void spProjTm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spProjTm(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjTm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
spProjTm(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N> accelerator_inline void spProjTm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
spProjTm(hspin._internal[i][j],fspin._internal[i][j]);
@ -789,16 +864,19 @@ template<class rtype,class vtype,int N> accelerator_inline void spProjTm (iMatri
template<class rtype,class vtype> accelerator_inline void spReconTm (iScalar<rtype> &hspin, const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spReconTm(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconTm (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
spReconTm(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N> accelerator_inline void spReconTm (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
spReconTm(hspin._internal[i][j],fspin._internal[i][j]);
@ -807,37 +885,44 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconTm (iMatr
template<class rtype,class vtype> accelerator_inline void accumReconTm (iScalar<rtype> &hspin, const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
accumReconTm(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconTm (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
accumReconTm(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N> accelerator_inline void accumReconTm (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
accumReconTm(hspin._internal[i][j],fspin._internal[i][j]);
}}
}
////////
// 5p
////////
template<class rtype,class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
template<class rtype,class vtype> accelerator_inline void spProj5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spProj5p(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProj5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
spProj5p(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
template<class rtype,class vtype,int N> accelerator_inline void spProj5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
spProj5p(hspin._internal[i][j],fspin._internal[i][j]);
@ -846,16 +931,19 @@ template<class rtype,class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> acce
template<class rtype,class vtype> accelerator_inline void spRecon5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spRecon5p(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spRecon5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
spRecon5p(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N> accelerator_inline void spRecon5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
spRecon5p(hspin._internal[i][j],fspin._internal[i][j]);
@ -864,16 +952,19 @@ template<class rtype,class vtype,int N> accelerator_inline void spRecon5p (iMatr
template<class rtype,class vtype> accelerator_inline void accumRecon5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
accumRecon5p(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumRecon5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
accumRecon5p(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N> accelerator_inline void accumRecon5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
accumRecon5p(hspin._internal[i][j],fspin._internal[i][j]);
@ -881,18 +972,24 @@ template<class rtype,class vtype,int N> accelerator_inline void accumRecon5p (iM
}
// four spinor projectors for chiral proj
template<class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
// template<class vtype> accelerator_inline void fspProj5p (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
template<class vtype> accelerator_inline void spProj5p (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spProj5p(hspin._internal,fspin._internal);
}
template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin)
// template<class vtype,int N> accelerator_inline void fspProj5p (iVector<vtype,N> &hspin,iVector<vtype,N> &fspin)
template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProj5p (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
spProj5p(hspin._internal[i],fspin._internal[i]);
}
}
template<class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin)
// template<class vtype,int N> accelerator_inline void fspProj5p (iMatrix<vtype,N> &hspin,iMatrix<vtype,N> &fspin)
template<class vtype,int N> accelerator_inline void spProj5p (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
spProj5p(hspin._internal[i][j],fspin._internal[i][j]);
@ -904,17 +1001,17 @@ template<class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inli
// 5m
////////
template<class rtype,class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
template<class rtype,class vtype> accelerator_inline void spProj5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{
spProj5m(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<rtype,N> > = 0,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
template<class rtype,class vtype,int N,IfNotSpinor<iVector<rtype,N> > = 0> accelerator_inline void spProj5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{
for(int i=0;i<N;i++) {
spProj5m(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
template<class rtype,class vtype,int N> accelerator_inline void spProj5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
@ -924,34 +1021,40 @@ template<class rtype,class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> acce
template<class rtype,class vtype> accelerator_inline void spRecon5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spRecon5m(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spRecon5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
spRecon5m(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N> accelerator_inline void spRecon5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
spRecon5m(hspin._internal[i][j],fspin._internal[i][j]);
}}
}}
}
template<class rtype,class vtype> accelerator_inline void accumRecon5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
accumRecon5m(hspin._internal,fspin._internal);
}
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumRecon5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
accumRecon5m(hspin._internal[i],fspin._internal[i]);
}
}
template<class rtype,class vtype,int N> accelerator_inline void accumRecon5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
accumRecon5m(hspin._internal[i][j],fspin._internal[i][j]);
@ -960,18 +1063,24 @@ template<class rtype,class vtype,int N> accelerator_inline void accumRecon5m (iM
// four spinor projectors for chiral proj
template<class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
// template<class vtype> accelerator_inline void fspProj5m (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
template<class vtype> accelerator_inline void spProj5m (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
spProj5m(hspin._internal,fspin._internal);
}
template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin)
// template<class vtype,int N> accelerator_inline void fspProj5m (iVector<vtype,N> &hspin,iVector<vtype,N> &fspin)
template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProj5m (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
for(int i=0;i<N;i++) {
spProj5m(hspin._internal[i],fspin._internal[i]);
}
}
template<class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin)
// template<class vtype,int N> accelerator_inline void fspProj5m (iMatrix<vtype,N> &hspin,iMatrix<vtype,N> &fspin)
template<class vtype,int N> accelerator_inline void spProj5m (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
spProj5m(hspin._internal[i][j],fspin._internal[i][j]);
View File
@ -154,8 +154,8 @@ void axpby_ssp_pminus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,co
accelerator_for(sss,nloop,vobj::Nsimd(),{
uint64_t ss = sss*Ls;
decltype(coalescedRead(y_v[ss+sp])) tmp;
spProj5m(tmp,y_v(ss+sp));
tmp = a*x_v(ss+s)+b*tmp;
spProj5m(tmp,y_v(ss+sp));
tmp = a*x_v(ss+s)+b*tmp;
coalescedWrite(z_v[ss+s],tmp);
});
}
@ -188,6 +188,7 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
z.Checkerboard() = x.Checkerboard();
conformable(x,z);
int Ls = grid->_rdimensions[0];
Gamma G5(Gamma::Algebra::Gamma5);
autoView( x_v, x, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite);
uint64_t nloop = grid->oSites()/Ls;
@ -195,13 +196,7 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
uint64_t ss = sss*Ls;
for(int s=0;s<Ls;s++){
int sp = Ls-1-s;
auto tmp = x_v(ss+s);
decltype(tmp) tmp_p;
decltype(tmp) tmp_m;
spProj5p(tmp_p,tmp);
spProj5m(tmp_m,tmp);
// Use of spProj5m, 5p captures the coarse space too
coalescedWrite(z_v[ss+sp],tmp_p - tmp_m);
coalescedWrite(z_v[ss+sp],G5*x_v(ss+s));
}
});
}
@ -213,20 +208,10 @@ void G5C(Lattice<vobj> &z, const Lattice<vobj> &x)
z.Checkerboard() = x.Checkerboard();
conformable(x, z);
autoView( x_v, x, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite);
uint64_t nloop = grid->oSites();
accelerator_for(ss,nloop,vobj::Nsimd(),{
auto tmp = x_v(ss);
decltype(tmp) tmp_p;
decltype(tmp) tmp_m;
spProj5p(tmp_p,tmp);
spProj5m(tmp_m,tmp);
coalescedWrite(z_v[ss],tmp_p - tmp_m);
});
Gamma G5(Gamma::Algebra::Gamma5);
z = G5 * x;
}
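A short aside on why the simplified G5C above is equivalent to the old explicit construction (standard chiral-projector conventions assumed):
    P_\pm = \tfrac{1}{2}(1 \pm \gamma_5), \qquad P_+ x - P_- x = \gamma_5 x,
so a single multiply by Gamma5 reproduces the previous tmp_p - tmp_m result in one pass.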
/*
template<class CComplex, int nbasis>
void G5C(Lattice<iVector<CComplex, nbasis>> &z, const Lattice<iVector<CComplex, nbasis>> &x)
{
@ -249,7 +234,6 @@ void G5C(Lattice<iVector<CComplex, nbasis>> &z, const Lattice<iVector<CComplex,
}
});
}
*/
NAMESPACE_END(Grid);
View File
@ -41,6 +41,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
namespace Grid {
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
typedef struct { uint16_t x;} half;
#endif
typedef struct Half2_t { half x; half y; } Half2;
#define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH )
template<class pair>
@ -125,14 +130,14 @@ inline accelerator GpuVector<N,datum> operator/(const GpuVector<N,datum> l,const
}
constexpr int NSIMD_RealH = COALESCE_GRANULARITY / sizeof(half);
constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(half2);
constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(Half2);
constexpr int NSIMD_RealF = COALESCE_GRANULARITY / sizeof(float);
constexpr int NSIMD_ComplexF = COALESCE_GRANULARITY / sizeof(float2);
constexpr int NSIMD_RealD = COALESCE_GRANULARITY / sizeof(double);
constexpr int NSIMD_ComplexD = COALESCE_GRANULARITY / sizeof(double2);
constexpr int NSIMD_Integer = COALESCE_GRANULARITY / sizeof(Integer);
typedef GpuComplex<half2 > GpuComplexH;
typedef GpuComplex<Half2 > GpuComplexH;
typedef GpuComplex<float2 > GpuComplexF;
typedef GpuComplex<double2> GpuComplexD;
@ -147,11 +152,9 @@ typedef GpuVector<NSIMD_Integer, Integer > GpuVectorI;
accelerator_inline float half2float(half h)
{
float f;
#ifdef GRID_SIMT
#if defined(GRID_CUDA) || defined(GRID_HIP)
f = __half2float(h);
#else
//f = __half2float(h);
__half_raw hr(h);
Grid_half hh;
hh.x = hr.x;
f= sfw_half_to_float(hh);
@ -161,13 +164,11 @@ accelerator_inline float half2float(half h)
accelerator_inline half float2half(float f)
{
half h;
#ifdef GRID_SIMT
#if defined(GRID_CUDA) || defined(GRID_HIP)
h = __float2half(f);
#else
Grid_half hh = sfw_float_to_half(f);
__half_raw hr;
hr.x = hh.x;
h = __half(hr);
h.x = hh.x;
#endif
return h;
}
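For reference, a simplified sketch of what a software float-to-half conversion such as sfw_float_to_half has to do when the __float2half intrinsic is unavailable. This is only an illustration (truncating rounding, subnormals flushed to zero, NaN folded into infinity), not the actual Grid implementation.
#include <cstdint>
#include <cstring>
static inline uint16_t float_to_half_bits(float f) {
  uint32_t u; std::memcpy(&u, &f, sizeof(u));            // reinterpret the float bits
  uint16_t sign = (u >> 16) & 0x8000u;                   // sign bit moves to bit 15
  int32_t  exp  = int32_t((u >> 23) & 0xffu) - 127 + 15; // rebias 8-bit -> 5-bit exponent
  uint32_t mant = (u >> 13) & 0x3ffu;                    // top 10 mantissa bits survive
  if (exp <= 0)  return sign;                            // underflow: flush to signed zero
  if (exp >= 31) return sign | 0x7c00u;                  // overflow: signed infinity
  return sign | (uint16_t(exp) << 10) | uint16_t(mant);
}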
@ -523,7 +524,7 @@ namespace Optimization {
////////////////////////////////////////////////////////////////////////////////////
// Single / Half
////////////////////////////////////////////////////////////////////////////////////
static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) {
static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) {
int N = GpuVectorCF::N;
GpuVectorCH h;
for(int i=0;i<N;i++) {
View File
@ -55,6 +55,7 @@ void acceleratorInit(void)
printf("AcceleratorCudaInit[%d]: ========================\n",rank);
printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name);
GPU_PROP_FMT(totalGlobalMem,"%lld");
GPU_PROP(managedMemory);
GPU_PROP(isMultiGpuBoard);
@ -109,20 +110,24 @@ void acceleratorInit(void)
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
printf("world_rank %d has %d devices\n",world_rank,nDevices);
size_t totalDeviceMem=0;
for (int i = 0; i < nDevices; i++) {
#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorHipInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
hipGetDeviceProperties(&gpu_props[i], i);
hipDeviceProp_t prop;
prop = gpu_props[i];
totalDeviceMem = prop.totalGlobalMem;
if ( world_rank == 0) {
hipDeviceProp_t prop;
prop = gpu_props[i];
printf("AcceleratorHipInit: ========================\n");
printf("AcceleratorHipInit: Device Number : %d\n", i);
printf("AcceleratorHipInit: ========================\n");
printf("AcceleratorHipInit: Device identifier: %s\n", prop.name);
GPU_PROP_FMT(totalGlobalMem,"%lu");
// GPU_PROP(managedMemory);
GPU_PROP(isMultiGpuBoard);
GPU_PROP(warpSize);
@ -131,6 +136,7 @@ void acceleratorInit(void)
// GPU_PROP(singleToDoublePrecisionPerfRatio);
}
}
MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
#undef GPU_PROP_FMT
#undef GPU_PROP
#ifdef GRID_IBM_SUMMIT
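
The added lines above bring the HIP path in line with the CUDA one: query every visible device, report its properties from rank 0, and hand roughly 80% of device memory to MemoryManager::DeviceMaxBytes. A condensed standalone sketch of that flow is shown below; it is not Grid's acceleratorInit, and the OMPI_COMM_WORLD_RANK spelling is only an assumption for what the ENV_RANK_OMPI macro expands to:

```cpp
#include <hip/hip_runtime.h>
#include <cstdio>
#include <cstdlib>

int main() {
  int nDevices = 0;
  hipGetDeviceCount(&nDevices);

  // Rank from the launcher environment (Open MPI spelling assumed here).
  int world_rank = 0;
  if (const char *s = std::getenv("OMPI_COMM_WORLD_RANK")) world_rank = std::atoi(s);

  size_t totalDeviceMem = 0;
  for (int i = 0; i < nDevices; i++) {
    hipDeviceProp_t prop;
    hipGetDeviceProperties(&prop, i);
    totalDeviceMem = prop.totalGlobalMem;              // remember the device size
    if (world_rank == 0)
      std::printf("device %d: %s, %lu bytes\n", i, prop.name,
                  (unsigned long)prop.totalGlobalMem);
  }

  // Budget ~80% of device memory, as the patch does for MemoryManager::DeviceMaxBytes.
  size_t deviceMaxBytes = (8 * totalDeviceMem) / 10;
  std::printf("rank %d: device memory budget %lu bytes\n",
              world_rank, (unsigned long)deviceMaxBytes);
  return 0;
}
```
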

View File

@ -151,9 +151,6 @@ inline void *acceleratorAllocShared(size_t bytes)
ptr = (void *) NULL;
printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
}
// size_t free,total;
// cudaMemGetInfo(&free,&total);
// std::cout << "Malloc managed "<<bytes<<" "<<free<<"/"<<total<<std::endl;
return ptr;
};
inline void *acceleratorAllocDevice(size_t bytes)
@ -164,9 +161,6 @@ inline void *acceleratorAllocDevice(size_t bytes)
ptr = (void *) NULL;
printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
}
// size_t free,total;
// cudaMemGetInfo(&free,&total);
// std::cout << "Malloc device "<<bytes<<" "<<free<<"/"<<total<<std::endl;
return ptr;
};
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
@ -313,17 +307,13 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
inline void *acceleratorAllocShared(size_t bytes)
{
#if 0
void *ptr=NULL;
auto err = hipMallocManaged((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (void *) NULL;
printf(" hipMallocManaged failed for %d %s \n",bytes,hipGetErrorString(err));
printf(" hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err));
}
return ptr;
#else
return malloc(bytes);
#endif
};
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
@ -333,7 +323,7 @@ inline void *acceleratorAllocDevice(size_t bytes)
auto err = hipMalloc((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (void *) NULL;
printf(" hipMalloc failed for %d %s \n",bytes,hipGetErrorString(err));
printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err));
}
return ptr;
};

View File

@ -14,35 +14,62 @@ std::string filestem(const int l)
int main (int argc, char ** argv)
{
#ifdef HAVE_LIME
Grid_init(&argc,&argv);
int64_t threads = GridThread::GetThreads();
int64_t threads = GridThread::GetThreads();
auto mpi = GridDefaultMpi();
std::vector<int> latt;
MSG << "Grid is setup to use " << threads << " threads" << std::endl;
MSG << "MPI partition " << mpi << std::endl;
MSG << SEP << std::endl;
MSG << "Benchmark Lime write" << std::endl;
MSG << "Benchmark std write" << std::endl;
MSG << SEP << std::endl;
for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
{
auto mpi = GridDefaultMpi();
std::vector<int> latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
std::cout << "-- Local volume " << l << "^4" << std::endl;
MSG << "-- Local volume " << l << "^4" << std::endl;
writeBenchmark<LatticeFermion>(latt, filestem(l), stdWrite<LatticeFermion>);
}
MSG << SEP << std::endl;
MSG << "Benchmark std read" << std::endl;
MSG << SEP << std::endl;
for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
{
latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
MSG << "-- Local volume " << l << "^4" << std::endl;
readBenchmark<LatticeFermion>(latt, filestem(l), stdRead<LatticeFermion>);
}
#ifdef HAVE_LIME
MSG << SEP << std::endl;
MSG << "Benchmark Grid C-Lime write" << std::endl;
MSG << SEP << std::endl;
for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
{
latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
MSG << "-- Local volume " << l << "^4" << std::endl;
writeBenchmark<LatticeFermion>(latt, filestem(l), limeWrite<LatticeFermion>);
}
MSG << "Benchmark Lime read" << std::endl;
MSG << SEP << std::endl;
MSG << "Benchmark Grid C-Lime read" << std::endl;
MSG << SEP << std::endl;
for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
{
auto mpi = GridDefaultMpi();
std::vector<int> latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
std::cout << "-- Local volume " << l << "^4" << std::endl;
MSG << "-- Local volume " << l << "^4" << std::endl;
readBenchmark<LatticeFermion>(latt, filestem(l), limeRead<LatticeFermion>);
}
#endif
Grid_finalize();
#endif
return EXIT_SUCCESS;
}

View File

@ -14,13 +14,140 @@ using WriterFn = std::function<void(const std::string, Field &)> ;
template <typename Field>
using ReaderFn = std::function<void(Field &, const std::string)>;
// AP 06/10/2020: Standard C version in case one is suspicious of the C++ API
//
// template <typename Field>
// void stdWrite(const std::string filestem, Field &vec)
// {
// std::string rankStr = std::to_string(vec.Grid()->ThisRank());
// std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "wb");
// size_t size;
// uint32_t crc;
// GridStopWatch ioWatch, crcWatch;
// size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
// autoView(vec_v, vec, CpuRead);
// crcWatch.Start();
// crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
// std::fwrite(&crc, sizeof(uint32_t), 1, file);
// crcWatch.Stop();
// MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
// ioWatch.Start();
// std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file);
// ioWatch.Stop();
// std::fclose(file);
// size *= vec.Grid()->ProcessorCount();
// MSG << "Std I/O write: Wrote " << size << " bytes in " << ioWatch.Elapsed()
// << ", performance " << size/1024./1024./(ioWatch.useconds()/1.e6)
// << " MB/s" << std::endl;
// MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
// }
//
// template <typename Field>
// void stdRead(Field &vec, const std::string filestem)
// {
// std::string rankStr = std::to_string(vec.Grid()->ThisRank());
// std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "rb");
// size_t size;
// uint32_t crcRead, crcData;
// GridStopWatch ioWatch, crcWatch;
// size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
// crcWatch.Start();
// std::fread(&crcRead, sizeof(uint32_t), 1, file);
// crcWatch.Stop();
// {
// autoView(vec_v, vec, CpuWrite);
// ioWatch.Start();
// std::fread(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file);
// ioWatch.Stop();
// std::fclose(file);
// }
// {
// autoView(vec_v, vec, CpuRead);
// crcWatch.Start();
// crcData = GridChecksum::crc32(vec_v.cpu_ptr, size);
// crcWatch.Stop();
// }
// MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
// assert(crcData == crcRead);
// size *= vec.Grid()->ProcessorCount();
// MSG << "Std I/O read: Read " << size << " bytes in " << ioWatch.Elapsed()
// << ", performance " << size/1024./1024./(ioWatch.useconds()/1.e6)
// << " MB/s" << std::endl;
// MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
// }
template <typename Field>
void stdWrite(const std::string filestem, Field &vec)
{
std::string rankStr = std::to_string(vec.Grid()->ThisRank());
std::ofstream file(filestem + "." + rankStr + ".bin", std::ios::out | std::ios::binary);
size_t size, sizec;
uint32_t crc;
GridStopWatch ioWatch, crcWatch;
size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
sizec = size/sizeof(char); // just in case of...
autoView(vec_v, vec, CpuRead);
crcWatch.Start();
crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
file.write(reinterpret_cast<char *>(&crc), sizeof(uint32_t)/sizeof(char));
crcWatch.Stop();
MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
ioWatch.Start();
file.write(reinterpret_cast<char *>(vec_v.cpu_ptr), sizec);
file.flush();
ioWatch.Stop();
size *= vec.Grid()->ProcessorCount();
MSG << "Std I/O write: Wrote " << size << " bytes in " << ioWatch.Elapsed()
<< ", " << size/1024./1024./(ioWatch.useconds()/1.e6)
<< " MB/s" << std::endl;
MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
}
template <typename Field>
void stdRead(Field &vec, const std::string filestem)
{
std::string rankStr = std::to_string(vec.Grid()->ThisRank());
std::ifstream file(filestem + "." + rankStr + ".bin", std::ios::in | std::ios::binary);
size_t size, sizec;
uint32_t crcRead, crcData;
GridStopWatch ioWatch, crcWatch;
size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
sizec = size/sizeof(char); // just in case of...
crcWatch.Start();
file.read(reinterpret_cast<char *>(&crcRead), sizeof(uint32_t)/sizeof(char));
crcWatch.Stop();
{
autoView(vec_v, vec, CpuWrite);
ioWatch.Start();
file.read(reinterpret_cast<char *>(vec_v.cpu_ptr), sizec);
ioWatch.Stop();
}
{
autoView(vec_v, vec, CpuRead);
crcWatch.Start();
crcData = GridChecksum::crc32(vec_v.cpu_ptr, size);
crcWatch.Stop();
}
MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
assert(crcData == crcRead);
size *= vec.Grid()->ProcessorCount();
MSG << "Std I/O read: Read " << size << " bytes in " << ioWatch.Elapsed()
<< ", " << size/1024./1024./(ioWatch.useconds()/1.e6)
<< " MB/s" << std::endl;
MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
}
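
With the functions above, each rank's file is simply a 4-byte native-endian CRC32 followed by the raw local lattice payload. A hypothetical standalone checker for that layout is sketched below; it is not part of the benchmark, assumes zlib is available for the CRC32 and a same-endian machine, and must be linked with -lz:

```cpp
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <iterator>
#include <vector>
#include <zlib.h>   // crc32()

int main(int argc, char *argv[]) {
  if (argc != 2) { std::fprintf(stderr, "usage: %s <file.bin>\n", argv[0]); return 1; }

  std::ifstream file(argv[1], std::ios::in | std::ios::binary);
  uint32_t crcStored = 0;
  file.read(reinterpret_cast<char *>(&crcStored), sizeof(crcStored));

  // Everything after the 4-byte header is the raw local lattice data.
  std::vector<char> payload{std::istreambuf_iterator<char>(file),
                            std::istreambuf_iterator<char>()};
  uint32_t crcData = crc32(0L, reinterpret_cast<const Bytef *>(payload.data()),
                           (uInt)payload.size());

  std::printf("stored  CRC32 %08x\n", crcStored);
  std::printf("payload CRC32 %08x (%zu bytes)\n", crcData, payload.size());
  return (crcStored == crcData) ? 0 : 1;
}
```
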
template <typename Field>
void limeWrite(const std::string filestem, Field &vec)
{
emptyUserRecord record;
ScidacWriter binWriter(vec.Grid()->IsBoss());
binWriter.open(filestem + ".bin");
binWriter.open(filestem + ".lime.bin");
binWriter.writeScidacFieldRecord(vec, record);
binWriter.close();
}
@ -31,7 +158,7 @@ void limeRead(Field &vec, const std::string filestem)
emptyUserRecord record;
ScidacReader binReader;
binReader.open(filestem + ".bin");
binReader.open(filestem + ".lime.bin");
binReader.readScidacFieldRecord(vec, record);
binReader.close();
}

View File

@ -8,7 +8,6 @@ using namespace Grid;
int main (int argc, char ** argv)
{
#ifdef HAVE_LIME
std::vector<std::string> dir;
unsigned int Ls;
bool rb;
@ -34,46 +33,71 @@ int main (int argc, char ** argv)
}
Grid_init(&argc,&argv);
int64_t threads = GridThread::GetThreads();
auto mpi = GridDefaultMpi();
MSG << "Grid is setup to use " << threads << " threads" << std::endl;
MSG << SEP << std::endl;
MSG << "Benchmark double precision Lime write" << std::endl;
MSG << SEP << std::endl;
for (auto &d: dir)
{
MSG << "-- Directory " << d << std::endl;
writeBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench", limeWrite<LatticeFermion>, Ls, rb);
}
MSG << "MPI partition " << mpi << std::endl;
MSG << SEP << std::endl;
MSG << "Benchmark double precision Lime read" << std::endl;
MSG << "Benchmark Grid std write" << std::endl;
MSG << SEP << std::endl;
for (auto &d: dir)
{
MSG << "-- Directory " << d << std::endl;
readBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench", limeRead<LatticeFermion>, Ls, rb);
writeBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench",
stdWrite<LatticeFermion>, Ls, rb);
}
MSG << SEP << std::endl;
MSG << "Benchmark Grid std read" << std::endl;
MSG << SEP << std::endl;
for (auto &d: dir)
{
MSG << "-- Directory " << d << std::endl;
readBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench",
stdRead<LatticeFermion>, Ls, rb);
}
#ifdef HAVE_LIME
MSG << SEP << std::endl;
MSG << "Benchmark single precision Lime write" << std::endl;
MSG << "Benchmark Grid C-Lime write" << std::endl;
MSG << SEP << std::endl;
for (auto &d: dir)
{
MSG << "-- Directory " << d << std::endl;
writeBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeWrite<LatticeFermionF>, Ls, rb);
writeBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench",
limeWrite<LatticeFermion>, Ls, rb);
}
MSG << SEP << std::endl;
MSG << "Benchmark Grid C-Lime read" << std::endl;
MSG << SEP << std::endl;
for (auto &d: dir)
{
MSG << "-- Directory " << d << std::endl;
readBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench",
limeRead<LatticeFermion>, Ls, rb);
}
#endif
MSG << SEP << std::endl;
MSG << "Benchmark single precision Lime read" << std::endl;
MSG << SEP << std::endl;
for (auto &d: dir)
{
MSG << "-- Directory " << d << std::endl;
readBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeRead<LatticeFermionF>, Ls, rb);
}
// MSG << SEP << std::endl;
// MSG << "Benchmark single precision Lime write" << std::endl;
// MSG << SEP << std::endl;
// for (auto &d: dir)
// {
// MSG << "-- Directory " << d << std::endl;
// writeBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeWrite<LatticeFermionF>, Ls, rb);
// }
// MSG << SEP << std::endl;
// MSG << "Benchmark single precision Lime read" << std::endl;
// MSG << SEP << std::endl;
// for (auto &d: dir)
// {
// MSG << "-- Directory " << d << std::endl;
// readBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeRead<LatticeFermionF>, Ls, rb);
// }
Grid_finalize();
#endif
return EXIT_SUCCESS;
}

View File

@ -36,12 +36,12 @@ int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
#define LMAX (48)
#define LMAX (40)
#define LMIN (8)
#define LADD (8)
int64_t Nwarm=50;
int64_t Nloop=500;
int64_t Nwarm=10;
int64_t Nloop=100;
Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
@ -118,6 +118,41 @@ int main (int argc, char ** argv)
}
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking SU3xSU3 z=z+ x*y"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
for(int lat=LMIN;lat<=LMAX;lat+=LADD){
Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
LatticeColourMatrix z(&Grid); random(pRNG,z);
LatticeColourMatrix x(&Grid); random(pRNG,x);
LatticeColourMatrix y(&Grid); random(pRNG,y);
for(int64_t i=0;i<Nwarm;i++){
z=z+x*y;
}
double start=usecond();
for(int64_t i=0;i<Nloop;i++){
z=z+x*y;
}
double stop=usecond();
double time = (stop-start)/Nloop*1000.0;
double bytes=4*vol*Nc*Nc*sizeof(Complex);
double flops=Nc*Nc*(6+8+8)*vol;
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
}
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking SU3xSU3 mult(z,x,y)"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
@ -143,7 +178,6 @@ int main (int argc, char ** argv)
double start=usecond();
for(int64_t i=0;i<Nloop;i++){
mult(z,x,y);
// mac(z,x,y);
}
double stop=usecond();
double time = (stop-start)/Nloop*1000.0;
@ -191,7 +225,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking SU3xSU3 CovShiftForward(z,x,y)"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GB/s (incl Cshift)\t\t GFlop/s"<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
for(int lat=LMIN;lat<=LMAX;lat+=LADD){
@ -216,16 +250,15 @@ int main (int argc, char ** argv)
double bytes=3*vol*Nc*Nc*sizeof(Complex);
double ncbytes=5*vol*Nc*Nc*sizeof(Complex);
double flops=Nc*Nc*(6+8+8)*vol;
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<ncbytes/time<<"\t\t" << flops/time<<std::endl;
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
}
}
#if 1
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking SU3xSU3 z= x * Cshift(y)"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GB/s (incl Cshift)\t\t GFlop/s"<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
for(int lat=LMIN;lat<=LMAX;lat+=LADD){
@ -259,11 +292,10 @@ int main (int argc, char ** argv)
tmult = tmult /Nloop;
double bytes=3*vol*Nc*Nc*sizeof(Complex);
double ncbytes=5*vol*Nc*Nc*sizeof(Complex);
double flops=Nc*Nc*(6+8+8)*vol;
std::cout<<GridLogMessage<<std::setprecision(3) << "total us "<<time<<" shift "<<tshift <<" mult "<<tmult<<std::endl;
time = time * 1000; // convert to NS for GB/s
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t" <<ncbytes/time<<"\t\t" << flops/time<<std::endl;
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
}
}
#endif

View File

@ -187,7 +187,8 @@ int main (int argc, char ** argv)
auto xx = coalescedRead(x_v[ss]);
auto yy = coalescedRead(y_v[ss]);
auto zz = coalescedRead(z_v[ss]);
zz = zz+xx*yy;
//zz = zz+xx*yy;
mac(&zz,&xx,&yy);
coalescedWrite(z_v[ss],zz);
});
}

benchmarks/benchmark-io-csv.sh (new executable file, 76 lines)
View File

@ -0,0 +1,76 @@
#!/usr/bin/env bash
awkscript='
BEGIN{
i = 0;
print "local L,std read (MB/s),std write (MB/s),Grid Lime read (MB/s),Grid Lime write (MB/s)"
}
/Benchmark std write/{
i = 0;
mode = "stdWrite";
}
/Benchmark std read/{
i = 0;
mode = "stdRead"
}
/Benchmark Grid C-Lime write/{
i = 0;
mode = "gridWrite";
}
/Benchmark Grid C-Lime read/{
i = 0;
mode = "gridRead";
}
/Local volume/{
match($0, "[0-9]+\\^4");
l[i] = substr($0, RSTART, RLENGTH-2);
}
/MB\/s/{
match($0, "[0-9.eE]+ MB/s");
p = substr($0, RSTART, RLENGTH-5);
if (mode == "stdWrite")
{
sw[i] = p;
}
else if (mode == "stdRead")
{
sr[i] = p;
}
else if (mode == "gridWrite")
{
gw[i] = p;
}
else if (mode == "gridRead")
{
gr[i] = p;
}
i++;
}
END{
s = 0
for (a in l)
{
s++;
}
for (j = 0; j < s; j++)
{
printf("%s,%s,%s,%s,%s\n", l[j], sr[j], sw[j], gr[j], gw[j]);
}
printf("\n");
}
'
if (( $# != 1 )); then
echo "usage: `basename $0` <log file>" 1>&2
exit 1
fi
LOG=$1
awk "${awkscript}" ${LOG}

View File

@ -330,12 +330,18 @@ case ${CXXTEST} in
fi
;;
hipcc)
CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
# CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
CXXFLAGS="$CXXFLAGS -fno-strict-aliasing"
CXXLD=${CXX}
if test $ac_openmp = yes; then
CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp"
fi
;;
dpcpp)
LDFLAGS="$LDFLAGS"
CXXFLAGS="$CXXFLAGS"
CXXLD=${CXX}
;;
*)
CXXLD=${CXX}
CXXFLAGS="$CXXFLAGS -fno-strict-aliasing"

View File

@ -184,19 +184,19 @@ Below are shown the `configure` script invocations for three recommended configu
This is the build for every day developing and debugging with Xcode. It uses the Xcode clang c++ compiler, without MPI, and defaults to double-precision. Xcode builds the `Debug` configuration with debug symbols for full debugging:
../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=double --prefix=$GridPre/GridDebug --enable-comms=none
../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --enable-precision=double --prefix=$GridPre/Debug
#### 2. `Release`
Since Grid itself doesn't really have debug configurations, the release build is recommended to be the same as `Debug`, except using single-precision (handy for validation):
../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=single --prefix=$GridPre/GridRelease --enable-comms=none
../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --enable-precision=single --prefix=$GridPre/Release
#### 3. `MPIDebug`
Debug configuration with MPI:
../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=double --prefix=$GridPre/GridMPIDebug --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx
../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx --enable-precision=double --prefix=$GridPre/MPIDebug
### 5.3 Build Grid

View File

@ -29,91 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/Grid.h>
using namespace Grid;
void MomentumSpacePropagatorTest(RealD mass,RealD M5, LatticePropagator &prop)
{
// what type LatticeComplex
GridBase *_grid = prop.Grid();
typedef LatticeFermion FermionField;
typedef LatticePropagator PropagatorField;
typedef typename FermionField::vector_type vector_type;
typedef typename FermionField::scalar_type ScalComplex;
typedef iSinglet<ScalComplex> Tcomplex;
typedef Lattice<iSinglet<vector_type> > LatComplex;
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
Coordinate latt_size = _grid->_fdimensions;
PropagatorField num (_grid); num = Zero();
LatComplex sk(_grid); sk = Zero();
LatComplex sk2(_grid); sk2= Zero();
LatComplex W(_grid); W= Zero();
LatComplex a(_grid); a= Zero();
LatComplex one (_grid); one = ScalComplex(1.0,0.0);
LatComplex denom(_grid); denom= Zero();
LatComplex cosha(_grid);
LatComplex kmu(_grid);
LatComplex Wea(_grid);
LatComplex Wema(_grid);
ScalComplex ci(0.0,1.0);
SpinColourMatrixD identity = ComplexD(1.0);
for(int mu=0;mu<Nd;mu++) {
LatticeCoordinate(kmu,mu);
RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
kmu = TwoPiL * kmu;
// kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions
sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5);
sk = sk + sin(kmu) *sin(kmu);
num = num - sin(kmu)*ci*(Gamma(Gmu[mu])*identity);
}
W = one - M5 + sk2;
////////////////////////////////////////////
// Cosh alpha -> alpha
////////////////////////////////////////////
cosha = (one + W*W + sk) / (abs(W)*2.0);
// FIXME Need a Lattice acosh
{
autoView(cosha_v,cosha,CpuRead);
autoView(a_v,a,CpuWrite);
for(int idx=0;idx<_grid->lSites();idx++){
Coordinate lcoor(Nd);
Tcomplex cc;
// RealD sgn;
_grid->LocalIndexToLocalCoor(idx,lcoor);
peekLocalSite(cc,cosha_v,lcoor);
assert((double)real(cc)>=1.0);
assert(fabs((double)imag(cc))<=1.0e-15);
cc = ScalComplex(::acosh(real(cc)),0.0);
pokeLocalSite(cc,a_v,lcoor);
}}
Wea = ( exp( a) * abs(W) );
Wema= ( exp(-a) * abs(W) );
num = num + ( one - Wema ) * mass * identity;
denom= ( Wea - one ) + mass*mass * (one - Wema);
prop = num/denom;
}
;
int main (int argc, char ** argv)
{
@ -391,17 +307,6 @@ int main (int argc, char ** argv)
RealD M5 =0.8;
DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5);
/////////////////// Test code for (1-m)^2 ///////////////
LatticePropagatorD prop1(&GRID);
LatticePropagatorD prop2(&GRID);
LatticeComplexD ratio(&GRID);
MomentumSpacePropagatorTest(0.0,M5,prop1);
MomentumSpacePropagatorTest(0.3,M5,prop2);
ratio=localNorm2(prop2);
ratio=ratio/localNorm2(prop1);
std::cout << ratio;
/////////////////// Test code for (1-m)^2 factor ///////////////
// Momentum space prop
std::cout << " Solving by FFT and Feynman rules" <<std::endl;
bool fiveD = false; //calculate 4d free propagator

View File

@ -565,8 +565,8 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << " Running Multigrid "<< std::endl;
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
BiCGSTAB<CoarseVector> CoarseMgridBiCGSTAB(0.01,1000,false);
BiCGSTAB<LatticeFermion> FineMgridBiCGSTAB(0.0,24,false);
BiCGSTAB<CoarseVector> CoarseMgridBiCGSTAB(0.01,1000);
BiCGSTAB<LatticeFermion> FineMgridBiCGSTAB(0.0,24);
ZeroGuesser<CoarseVector> CoarseZeroGuesser;
ZeroGuesser<LatticeFermion> FineZeroGuesser;
@ -590,5 +590,5 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Done "<< std::endl;
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
Grid_finalize();
}

File diff suppressed because it is too large.