Improve the matching of stencil coarsening

Improve the coarse matrix calc
Merge branch 'develop' into feature/hw-multigrid
2026-02-24 07:36:13 +00:00 · 2020-09-08 15:36:58 -04:00 · 2020-09-08 15:36:33 -04:00 · 2020-09-03 22:16:50 -04:00 · 2020-09-03 22:11:17 -04:00 · 2020-09-03 22:10:40 -04:00
50 changed files with 1477 additions and 958 deletions
--- a/Grid/Grid_Eigen_Dense.h
+++ b/Grid/Grid_Eigen_Dense.h
@@ -34,12 +34,6 @@
 #define __SYCL__REDEFINE__
 #endif

-/* HIP save and restore compile environment*/
-#ifdef GRID_HIP
-#pragma push
-#pragma push_macro("__HIP_DEVICE_COMPILE__")
-#endif
-#define EIGEN_NO_HIP

 #include <Grid/Eigen/Dense>
 #include <Grid/Eigen/unsupported/CXX11/Tensor>
@@ -58,12 +52,6 @@
 #pragma pop
 #endif

-/*HIP restore*/
-#ifdef __HIP__REDEFINE__
-#pragma pop_macro("__HIP_DEVICE_COMPILE__")
-#pragma pop
-#endif
-
 #if defined __GNUC__
 #pragma GCC diagnostic pop
 #endif
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -49,13 +49,11 @@ inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner,
  Lattice<dotp> fine_inner_msk(fine);

  // Multiply could be fused with innerProduct
-  // Single block sum kernel could do both masks.
  fine_inner = localInnerProduct(fineX,fineY);
  mult(fine_inner_msk, fine_inner,FineMask);
  blockSum(CoarseInner,fine_inner_msk);
 }

-
 class Geometry {
 public:
  int npoint;
@@ -80,8 +78,12 @@ public:
    }
    directions   [2*_d]=0;
    displacements[2*_d]=0;
-  }

+    std::cout <<GridLogMessage << "Geometry "<<std::endl;
+    for(int p=0;p<npoint;p++){
+      std::cout <<GridLogMessage << "point " <<p<<" dir "<<directions[p]<<" delta " <<displacements[p]<<std::endl;
+    }
+  }
 };
  
 template<class Fobj,class CComplex,int nbasis>
@@ -102,8 +104,8 @@ public:
  Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) : 
    CoarseGrid(_CoarseGrid),
    FineGrid(_FineGrid),
-    subspace(nbasis,_FineGrid),
-    checkerboard(_checkerboard)
+    checkerboard(_checkerboard),
+    subspace(nbasis,_FineGrid)
  {
  };
  
@@ -285,6 +287,8 @@ public:
  ///////////////////////
  GridBase * Grid(void)         { return _grid; };   // this is all the linalg routines need to know

+  virtual std::vector<int> Directions(void)   { return geom.directions; };
+  virtual std::vector<int> Displacements(void){ return geom.displacements; };
  void M (const CoarseVector &in, CoarseVector &out)
  {
    conformable(_grid,in.Grid());
@@ -308,6 +312,9 @@ public:

    int osites=Grid()->oSites();

+    autoView(st,Stencil,AcceleratorRead);
+    siteVector *CBp=Stencil.CommBuf();
+
    accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
      int ss = sss/nbasis;
      int b  = sss%nbasis;
@@ -318,12 +325,12 @@ public:

      for(int point=0;point<geom.npoint;point++){

-	SE=Stencil.GetEntry(ptype,point,ss);
+	SE=st.GetEntry(ptype,point,ss);
 	  
 	if(SE->_is_local) { 
 	  nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
 	} else {
-	  nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
+	  nbr = coalescedRead(CBp[SE->_offset]);
 	}
 	acceleratorSynchronise();

@@ -332,7 +339,7 @@ public:
 	}
      }
      coalescedWrite(out_v[ss](b),res);
-      });
+    });

    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
  };
@@ -409,38 +416,23 @@ public:
      MdirCalc(in,out[p],p);
    }
  };
-  void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){
-
+  void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp)
+  {
    this->MdirComms(in);

    int ndim = in.Grid()->Nd();

-    //////////////
-    // 4D action like wilson
-    // 0+ => 0 
-    // 0- => 1
-    // 1+ => 2 
-    // 1- => 3
-    // etc..
-    //////////////
-    // 5D action like DWF
-    // 1+ => 0 
-    // 1- => 1
-    // 2+ => 2 
-    // 2- => 3
-    // etc..
-    auto point = [dir, disp, ndim](){
-      if(dir == 0 and disp == 0)
-	return 8;
-      else if ( ndim==4 ) { 
-	return (4 * dir + 1 - disp) / 2;
-      } else { 
-	return (4 * (dir-1) + 1 - disp) / 2;
-      }
-    }();
+    int point=-1;
+    for(int p=0;p<geom.npoint;p++){
+      if( (dir==geom.directions[p])&&(disp==geom.displacements[p])) point=p;
+    }
+    assert(point!=-1);// Must find

+    std::cout <<GridLogMessage << "Mdir point "<<point<<" dir "<<dir<<" disp "<<disp  <<std::endl;
+    for(int p=0;p<geom.npoint;p++){
+      std::cout <<GridLogMessage << "point " <<p<<" dir "<<geom.directions[p]<<" delta " <<geom.displacements[p]<<std::endl;
+    }
    MdirCalc(in,out,point);
-
  };

  void Mdiag(const CoarseVector &in, CoarseVector &out)
@@ -456,10 +448,58 @@ public:
    geom(CoarseGrid._ndimension),
    hermitian(hermitian_),
    Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
-      A(geom.npoint,&CoarseGrid)
+    A(geom.npoint,&CoarseGrid)
  {
  };

+  void Test(Aggregation<Fobj,CComplex,nbasis> &_Aggregates,GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop)
+  {
+    typedef Lattice<Fobj> FineField;
+    CoarseVector Cin(_grid);
+    CoarseVector Cout(_grid);
+    CoarseVector CFout(_grid);
+
+    FineField Fin(FineGrid);
+    FineField Fout(FineGrid);
+
+
+    std::vector<int> seeds({1,2,3,4,5});
+    GridParallelRNG RNG(_grid);  RNG.SeedFixedIntegers(seeds);
+    gaussian(RNG,Cin);
+
+    _Aggregates.PromoteFromSubspace(Cin,Fin);
+    _Aggregates.ProjectToSubspace(Cin,Fin);
+
+    std::cout << GridLogMessage<< "************  "<<std::endl;
+    std::cout << GridLogMessage<< " Testing M  "<<std::endl;
+    std::cout << GridLogMessage<< "************  "<<std::endl;
+    // Coarse operator
+    this->M(Cin,Cout);
+    // Fine projected operator
+    _Aggregates.PromoteFromSubspace(Cin,Fin);
+    linop.Op(Fin,Fout);
+    _Aggregates.ProjectToSubspace(CFout,Fout);
+
+    CFout = CFout-Cout;
+    RealD diff = norm2(CFout);
+    std::cout << GridLogMessage<< " diff  "<<diff<<std::endl;
+    assert(diff<1.0e-5);
+
+    std::cout << GridLogMessage<< "************  "<<std::endl;
+    std::cout << GridLogMessage<< " Testing Mdag  "<<std::endl;
+    std::cout << GridLogMessage<< "************  "<<std::endl;
+    // Coarse operator
+    Mdag(Cin,Cout);
+    // Fine operator
+    linop.AdjOp(Fin,Fout);
+    _Aggregates.ProjectToSubspace(CFout,Fout);
+
+    CFout = CFout-Cout;
+    diff = norm2(CFout);
+    std::cout << GridLogMessage<< " diff  "<<diff<<std::endl; 
+    assert(diff<1.0e-5);
+  }
+
  void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
 		       Aggregation<Fobj,CComplex,nbasis> & Subspace)
  {
@@ -496,8 +536,19 @@ public:

    CoarseScalar InnerProd(Grid()); 

+    std::cout << GridLogMessage<< "CoarsenMatrix Orthog " << std::endl;
    // Orthogonalise the subblocks over the basis
    blockOrthogonalise(InnerProd,Subspace.subspace);
+    std::cout << GridLogMessage<< "CoarsenMatrix Orthog done " << std::endl;
+
+    auto OpDirections    = linop.Directions();
+    auto OpDisplacements = linop.Displacements();
+
+    std::cout<<" Coarsening an operator with "<< OpDirections.size()<<" terms "<<std::endl;
+    for(int p=0;p<OpDirections.size();p++) {
+      assert(OpDirections[p]==geom.directions[p]);
+      assert(OpDisplacements[p]==geom.displacements[p]);
+    }

    // Compute the matrix elements of linop between this orthonormal
    // set of vectors.
@@ -533,13 +584,27 @@ public:
    evenmask = where(mod(bcb,2)==(Integer)0,one,zero);
    oddmask  = one-evenmask;

+    /*
+    {
+      phi=Subspace.subspace[0];
+      linop.OpDirAll(phi,Mphi_p);
+      for(int p=0;p<geom.npoint-1;p++){
+	int dir=geom.directions[p];
+	int disp=geom.displacements[p];
+	linop.OpDir(phi,Mphi,dir,disp);
+	Mphi=Mphi-Mphi_p[p];
+	std::cout << GridLogMessage <<" Direction mapping check " <<norm2(Mphi)<<std::endl;
+      }
+    }
+*/
    assert(self_stencil!=-1);
+    int lhermitian=hermitian;

    for(int i=0;i<nbasis;i++){

      phi=Subspace.subspace[i];

-      //      std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl;
+      std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl;
      linop.OpDirAll(phi,Mphi_p);
      linop.OpDiag  (phi,Mphi_p[geom.npoint-1]);

@@ -550,7 +615,7 @@ public:
 	int dir   = geom.directions[p];
 	int disp  = geom.displacements[p];

-	if ( (disp==-1) || (!hermitian ) ) {
+	if ( (disp==-1) || (!lhermitian ) ) {

 	  ////////////////////////////////////////////////////////////////////////
 	  // Pick out contributions coming from this cell and neighbour cell
@@ -568,11 +633,23 @@ public:
 	    autoView( A_self  , A[self_stencil], AcceleratorWrite);

 	    accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
-
+	    if ( lhermitian && (disp==-1) ) {
+	      for(int pp=0;pp<geom.npoint;pp++){// Find the opposite link and set <j|A|i> = <i|A|j>*
+		int dirp   = geom.directions[pp];
+		int dispp  = geom.displacements[pp];
+		if ( (dirp==dir) && (dispp==1) ){
+		  auto sft = conjugate(Cshift(oZProj,dir,1));
+		  autoView( sft_v    ,  sft  , AcceleratorWrite);
+		  autoView( A_pp     ,  A[pp], AcceleratorWrite);
+		  accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_pp[ss](i,j),sft_v(ss)); });
+		}
+	      }
+	    }
 	  }
 	}
      }

+      std::cout << GridLogMessage<< "CoarsenMatrix Diag "<<std::endl;
      ///////////////////////////////////////////
      // Faster alternate self coupling.. use hermiticity to save 2x
      ///////////////////////////////////////////
@@ -604,31 +681,35 @@ public:

      }
    }
-    if(hermitian) {
-      std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
-      ForceHermitian();
+
+    MemoryManager::PrintBytes();
+
+    // Auto self test
+    Test( Subspace,FineGrid,linop);
+
+#if 0
+    ///////////////////////////
+    // test code worth preserving in if block
+    ///////////////////////////
+    std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl;
+    for(int p=0;p<geom.npoint;p++){
+      std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl;
+      std::cout<<GridLogMessage<< "\n"<<A[p] << std::endl;
    }
+    std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl;
+
+    phi=Subspace.subspace[0];
+    std::vector<int> bc(FineGrid->_ndimension,0);
+    blockPick(Grid(),phi,tmp,bc);      // Pick out a block
+    linop.Op(tmp,Mphi);                // Apply big dop
+    blockProject(iProj,Mphi,Subspace.subspace); // project it and print it
+    std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl;
+    std::cout<<GridLogMessage<< iProj <<std::endl;
+    std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
+#endif
+
  }

-  void ForceHermitian(void) {
-    CoarseMatrix Diff  (Grid());
-    for(int p=0;p<geom.npoint;p++){
-      int dir   = geom.directions[p];
-      int disp  = geom.displacements[p];
-      if(disp==-1) {
-	// Find the opposite link
-	for(int pp=0;pp<geom.npoint;pp++){
-	  int dirp   = geom.directions[pp];
-	  int dispp  = geom.displacements[pp];
-	  if ( (dirp==dir) && (dispp==1) ){
-	    //	    Diff = adj(Cshift(A[p],dir,1)) - A[pp]; 
-	    //	    std::cout << GridLogMessage<<" Replacing stencil leg "<<pp<<" with leg "<<p<< " diff "<<norm2(Diff) <<std::endl;
-	    A[pp] = adj(Cshift(A[p],dir,1));
-	  }
-	}
-      }
-    }
-  }
 };

 NAMESPACE_END(Grid);
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@@ -52,6 +52,9 @@ public:
  virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
  virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
  virtual void HermOp(const Field &in, Field &out)=0;
+
+  virtual std::vector<int> Directions(void)   =0;
+  virtual std::vector<int> Displacements(void)=0;
 };


@@ -76,6 +79,9 @@ class MdagMLinearOperator : public LinearOperatorBase<Field> {
 public:
  MdagMLinearOperator(Matrix &Mat): _Mat(Mat){};

+  virtual std::vector<int> Directions(void)   { return _Mat.Directions();};
+  virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
+
  // Support for coarsening to a multigrid
  void OpDiag (const Field &in, Field &out) {
    _Mat.Mdiag(in,out);
@@ -111,6 +117,8 @@ class ShiftedMdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;
  RealD _shift;
 public:
+  virtual std::vector<int> Directions(void)   { return _Mat.Directions();};
+  virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
  ShiftedMdagMLinearOperator(Matrix &Mat,RealD shift): _Mat(Mat), _shift(shift){};
  // Support for coarsening to a multigrid
  void OpDiag (const Field &in, Field &out) {
@@ -151,6 +159,8 @@ template<class Matrix,class Field>
 class HermitianLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;
 public:
+  virtual std::vector<int> Directions(void)   { return _Mat.Directions();};
+  virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
  HermitianLinearOperator(Matrix &Mat): _Mat(Mat){};
  // Support for coarsening to a multigrid
  void OpDiag (const Field &in, Field &out) {
@@ -182,6 +192,8 @@ template<class Matrix,class Field>
 class NonHermitianLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;
 public:
+  virtual std::vector<int> Directions(void)   { return _Mat.Directions();};
+  virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
  NonHermitianLinearOperator(Matrix &Mat): _Mat(Mat){};
  // Support for coarsening to a multigrid
  void OpDiag (const Field &in, Field &out) {
@@ -255,6 +267,8 @@ template<class Matrix,class Field>
  class SchurDiagMooeeOperator :  public SchurOperatorBase<Field> {
 public:
    Matrix &_Mat;
+    virtual std::vector<int> Directions(void)   { return _Mat.Directions();};
+    virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
    SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
    virtual  void Mpc      (const Field &in, Field &out) {
      Field tmp(in.Grid());
@@ -281,6 +295,8 @@ template<class Matrix,class Field>
 protected:
    Matrix &_Mat;
 public:
+    virtual std::vector<int> Directions(void)   { return _Mat.Directions();};
+    virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
    SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){};
    
    virtual void Mpc      (const Field &in, Field &out) {
@@ -307,6 +323,8 @@ template<class Matrix,class Field>
 protected:
    Matrix &_Mat;
 public:
+    virtual std::vector<int> Directions(void)   { return _Mat.Directions();};
+    virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
    SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
    
    virtual void Mpc      (const Field &in, Field &out) {
@@ -372,6 +390,8 @@ class NonHermitianSchurDiagMooeeOperator :  public NonHermitianSchurOperatorBase
 {
 public:
  Matrix& _Mat;
+  virtual std::vector<int> Directions(void)   { return _Mat.Directions();};
+  virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
 NonHermitianSchurDiagMooeeOperator(Matrix& Mat): _Mat(Mat){};
  virtual void Mpc(const Field& in, Field& out) {
    Field tmp(in.Grid());
@@ -405,6 +425,8 @@ class NonHermitianSchurDiagOneOperator : public NonHermitianSchurOperatorBase<Fi
  Matrix &_Mat;
  
 public:
+  virtual std::vector<int> Directions(void)   { return _Mat.Directions();};
+  virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
  NonHermitianSchurDiagOneOperator (Matrix& Mat): _Mat(Mat){};
  virtual void Mpc(const Field& in, Field& out) {
    Field tmp(in.Grid());
@@ -435,6 +457,8 @@ class NonHermitianSchurDiagTwoOperator : public NonHermitianSchurOperatorBase<Fi
  Matrix& _Mat;
  
 public:
+  virtual std::vector<int> Directions(void)   { return _Mat.Directions();};
+  virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
 NonHermitianSchurDiagTwoOperator(Matrix& Mat): _Mat(Mat){};

  virtual void Mpc(const Field& in, Field& out) {
@@ -475,6 +499,8 @@ class SchurStaggeredOperator :  public SchurOperatorBase<Field> {
  Field tmp;
  RealD mass;
 public:
+  virtual std::vector<int> Directions(void)   { return _Mat.Directions();};
+  virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
  SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid()) 
  { 
    assert( _Mat.isTrivialEE() );
--- a/Grid/algorithms/SparseMatrix.h
+++ b/Grid/algorithms/SparseMatrix.h
@@ -48,6 +48,8 @@ public:
  virtual  void Mdiag    (const Field &in, Field &out)=0;
  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0;
  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)=0;
+  virtual std::vector<int> Directions(void)   =0;
+  virtual std::vector<int> Displacements(void)=0;
 };

 /////////////////////////////////////////////////////////////////////////////////////////////
@@ -73,6 +75,8 @@ public:
  virtual  void MooeeDag    (const Field &in, Field &out)=0;
  virtual  void MooeeInvDag (const Field &in, Field &out)=0;

+  virtual std::vector<int> Directions(void)   =0;
+  virtual std::vector<int> Displacements(void)=0;
 };

 NAMESPACE_END(Grid);
--- a/Grid/algorithms/iterative/AdefGeneric.h
+++ b/Grid/algorithms/iterative/AdefGeneric.h
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+   /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -28,6 +28,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG
 #define GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG

+NAMESPACE_BEGIN(Grid);
  /*
   * Compared to Tang-2009:  P=Pleft. P^T = PRight Q=MssInv. 
   * Script A = SolverMatrix 
@@ -50,53 +51,54 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
   * Vout = x
   */

-// abstract base
-template<class Field, class CoarseField>
+
+template<class Field, class CoarseField, class Aggregates>
 class TwoLevelFlexiblePcg : public LinearFunction<Field>
 {
 public:
+
  int verbose;
+
  RealD   Tolerance;
  Integer MaxIterations;
-  const int mmax = 5;
-  GridBase *grid;
-  GridBase *coarsegrid;
+  const int mmax = 4;
+  GridBase *FineGrid;
+  GridBase *CoarseGrid;

-  LinearOperatorBase<Field>   *_Linop
-  OperatorFunction<Field>     *_Smoother,
-  LinearFunction<CoarseField> *_CoarseSolver;
-
-  // Need somthing that knows how to get from Coarse to fine and back again
+  LinearOperatorBase<Field>   &_Linop;
+  LinearFunction<Field>     &_Smoother;
+  LinearFunction<CoarseField> &_CoarseSolver;
+  Aggregates                  &_Aggregates;
  
  // more most opertor functions
  TwoLevelFlexiblePcg(RealD tol,
-		     Integer maxit,
-		     LinearOperatorBase<Field> *Linop,
-		     LinearOperatorBase<Field> *SmootherLinop,
-		     OperatorFunction<Field>   *Smoother,
-		     OperatorFunction<CoarseField>  CoarseLinop
-		     ) : 
-      Tolerance(tol), 
-      MaxIterations(maxit),
-      _Linop(Linop),
-      _PreconditionerLinop(PrecLinop),
-      _Preconditioner(Preconditioner)
+		      Integer maxit,
+		      LinearOperatorBase<Field> *Linop,
+		      LinearFunction<Field>   *Smoother,
+		      LinearFunction<CoarseField> *CoarseSolver,
+		      Aggregates *AggP
+		      ) : 
+  Tolerance(tol), 
+    MaxIterations(maxit),
+    _Linop(*Linop),
+    _Smoother(*Smoother),
+    _CoarseSolver(*CoarseSolver),
+    _Aggregates(*AggP)
  { 
+    CoarseGrid=_Aggregates.CoarseGrid;
+    FineGrid=_Aggregates.FineGrid;
    verbose=0;
  };

  // The Pcg routine is common to all, but the various matrices differ from derived 
  // implementation to derived implmentation
-  void operator() (const Field &src, Field &psi){
  void operator() (const Field &src, Field &psi){

    psi.Checkerboard() = src.Checkerboard();
-    grid             = src.Grid();

-    RealD f;
    RealD rtzp,rtz,a,d,b;
-    RealD rptzp;
-    RealD tn;
+    //    RealD rptzp;
+    //    RealD tn;
    RealD guess = norm2(psi);
    RealD ssq   = norm2(src);
    RealD rsq   = ssq*Tolerance*Tolerance;
@@ -104,15 +106,15 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
    /////////////////////////////
    // Set up history vectors
    /////////////////////////////
-    std::vector<Field> p  (mmax,grid);
-    std::vector<Field> mmp(mmax,grid);
+    std::vector<Field> p  (mmax,FineGrid);
+    std::vector<Field> mmp(mmax,FineGrid);
    std::vector<RealD> pAp(mmax);

-    Field x  (grid); x = psi;
-    Field z  (grid);
-    Field tmp(grid);
-    Field r  (grid);
-    Field mu (grid);
+    Field x  (FineGrid); x = psi;
+    Field z  (FineGrid);
+    Field tmp(FineGrid);
+    Field r  (FineGrid);
+    Field mu (FineGrid);
  
    //////////////////////////
    // x0 = Vstart -- possibly modify guess
@@ -121,13 +123,13 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
    Vstart(x,src);

    // r0 = b -A x0
-    HermOp(x,mmp); // Shouldn't this be something else?
+    _Linop.HermOp(x,mmp[0]); // Shouldn't this be something else?
    axpy (r, -1.0,mmp[0], src);    // Recomputes r=src-Ax0

    //////////////////////////////////
    // Compute z = M1 x
    //////////////////////////////////
-    M1(r,z,tmp,mp,SmootherMirs);
+    M1(r,z);
    rtzp =real(innerProduct(r,z));

    ///////////////////////////////////////
@@ -143,7 +145,7 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
      int peri_kp = (k+1) % mmax;

      rtz=rtzp;
-      d= M3(p[peri_k],mp,mmp[peri_k],tmp);
+      d= M3(p[peri_k],mmp[peri_k]);
      a = rtz/d;
    
      // Memorise this
@@ -153,13 +155,13 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
      RealD rn = axpy_norm(r,-a,mmp[peri_k],r);

      // Compute z = M x
-      M1(r,z,tmp,mp);
+      M1(r,z);

      rtzp =real(innerProduct(r,z));

      M2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate

-      p[peri_kp]=p[peri_k];
+      p[peri_kp]=mu;

      // Standard search direction  p -> z + b p    ; b = 
      b = (rtzp)/rtz;
@@ -181,7 +183,7 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
      // Stopping condition
      if ( rn <= rsq ) { 

-	HermOp(x,mmp); // Shouldn't this be something else?
+	_Linop.HermOp(x,mmp[0]); // Shouldn't this be something else?
 	axpy(tmp,-1.0,src,mmp[0]);
 	
 	RealD psinorm = sqrt(norm2(x));
@@ -190,7 +192,8 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
 	RealD true_residual = tmpnorm/srcnorm;
 	std::cout<<GridLogMessage<<"TwoLevelfPcg:   true residual is "<<true_residual<<std::endl;
 	std::cout<<GridLogMessage<<"TwoLevelfPcg: target residual was"<<Tolerance<<std::endl;
-	return k;
+
+	return;
      }
    }
    // Non-convergence
@@ -199,48 +202,40 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>

 public:

-  virtual void M(Field & in,Field & out,Field & tmp) {
-
-  }
-
-  virtual void M1(Field & in, Field & out) {// the smoother
-
+  virtual void M1(Field & in, Field & out) 
+  {// the smoother
    // [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
-    Field tmp(grid);
-    Field Min(grid);
+    Field tmp(FineGrid);
+    Field Min(FineGrid);

-    PcgM(in,Min); // Smoother call
+    CoarseField PleftProj(CoarseGrid);
+    CoarseField PleftMss_proj(CoarseGrid);

-    HermOp(Min,out);
+    _Smoother(in,Min); // Smoother call
+
+    _Linop.HermOp(Min,out);
    axpy(tmp,-1.0,out,in);          // tmp  = in - A Min

-    ProjectToSubspace(tmp,PleftProj);     
-    ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
-    PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]  
+    _Aggregates.ProjectToSubspace(PleftProj,tmp);     
+    _CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
+    _Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]  
    axpy(out,1.0,Min,tmp); // Min+tmp
  }

-  virtual void M2(const Field & in, Field & out) {
+  virtual void M2(const Field & in, Field & out) 
+  {
    out=in;
-    // Must override for Def2 only
-    //  case PcgDef2:
-    //    Pright(in,out);
-    //    break;
  }

-  virtual RealD M3(const Field & p, Field & mmp){
+  virtual RealD M3(const Field & p, Field & mmp)
+  {
    double d,dd;
-    HermOpAndNorm(p,mmp,d,dd);
+    _Linop.HermOpAndNorm(p,mmp,d,dd);
    return dd;
-    // Must override for Def1 only
-    //  case PcgDef1:
-    //    d=linop_d->Mprec(p,mmp,tmp,0,1);// Dag no
-    //      linop_d->Mprec(mmp,mp,tmp,1);// Dag yes
-    //    Pleft(mp,mmp);
-    //    d=real(linop_d->inner(p,mmp));
  }

-  virtual void VstartDef2(Field & xconst Field & src){
+  virtual void Vstart(Field & x,const Field & src)
+  {
    //case PcgDef2:
    //case PcgAdef2: 
    //case PcgAdef2f:
@@ -256,142 +251,79 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
    //                   = src_s - (A guess)_s - src_s  + (A guess)_s 
    //                   = 0 
    ///////////////////////////////////
-    Field r(grid);
-    Field mmp(grid);
+    Field r(FineGrid);
+    Field mmp(FineGrid);
+
+    CoarseField PleftProj(CoarseGrid);
+    CoarseField PleftMss_proj(CoarseGrid);
    
-    HermOp(x,mmp);
+    _Linop.HermOp(x,mmp);
    axpy (r, -1.0, mmp, src);        // r_{-1} = src - A x
-    ProjectToSubspace(r,PleftProj);     
-    ApplyInverseCG(PleftProj,PleftMss_proj); // Ass^{-1} r_s
-    PromoteFromSubspace(PleftMss_proj,mmp);  
+    _Aggregates.ProjectToSubspace(PleftProj,r);     
+    _CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} r_s
+    _Aggregates.PromoteFromSubspace(PleftMss_proj,mmp);  
    x=x+mmp;

  }

-  virtual void Vstart(Field & x,const Field & src){
-    return;
-  }
-
  /////////////////////////////////////////////////////////////////////
  // Only Def1 has non-trivial Vout. Override in Def1
  /////////////////////////////////////////////////////////////////////
  virtual void   Vout  (Field & in, Field & out,Field & src){
    out = in;
-    //case PcgDef1:
-    //    //Qb + PT x
-    //    ProjectToSubspace(src,PleftProj);     
-    //    ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} r_s
-    //    PromoteFromSubspace(PleftMss_proj,tmp);  
-    //    
-    //    Pright(in,out);
-    //    
-    //    linop_d->axpy(out,tmp,out,1.0);
-    //    break;
  }

  ////////////////////////////////////////////////////////////////////////////////////////////////
  // Pright and Pleft are common to all implementations
  ////////////////////////////////////////////////////////////////////////////////////////////////
-  virtual void Pright(Field & in,Field & out){
+  virtual void Pright(Field & in,Field & out)
+  {
    // P_R  = [ 1              0 ] 
    //        [ -Mss^-1 Msb    0 ] 
-    Field in_sbar(grid);
+    Field in_sbar(FineGrid);

-    ProjectToSubspace(in,PleftProj);     
-    PromoteFromSubspace(PleftProj,out);  
+    CoarseField PleftProj(CoarseGrid);
+    CoarseField PleftMss_proj(CoarseGrid);
+
+    _Aggregates.ProjectToSubspace(PleftProj,in);     
+    _Aggregates.PromoteFromSubspace(PleftProj,out);  
    axpy(in_sbar,-1.0,out,in);       // in_sbar = in - in_s 

-    HermOp(in_sbar,out);
-    ProjectToSubspace(out,PleftProj);           // Mssbar in_sbar  (project)
+    _Linop.HermOp(in_sbar,out);
+    _Aggregates.ProjectToSubspace(PleftProj,out);           // Mssbar in_sbar  (project)

-    ApplyInverse     (PleftProj,PleftMss_proj); // Mss^{-1} Mssbar 
-    PromoteFromSubspace(PleftMss_proj,out);     // 
+    _CoarseSolver(PleftProj,PleftMss_proj); // Mss^{-1} Mssbar 
+    _Aggregates.PromoteFromSubspace(PleftMss_proj,out);     // 

    axpy(out,-1.0,out,in_sbar);     // in_sbar - Mss^{-1} Mssbar in_sbar
  }
-  virtual void Pleft (Field & in,Field & out){
+  virtual void Pleft (Field & in,Field & out)
+  {
    // P_L  = [ 1  -Mbs Mss^-1] 
    //        [ 0   0         ] 
-    Field in_sbar(grid);
-    Field    tmp2(grid);
-    Field    Mtmp(grid);
+    Field in_sbar(FineGrid);
+    Field    tmp2(FineGrid);
+    Field    Mtmp(FineGrid);

-    ProjectToSubspace(in,PleftProj);     
-    PromoteFromSubspace(PleftProj,out);  
+    CoarseField PleftProj(CoarseGrid);
+    CoarseField PleftMss_proj(CoarseGrid);
+
+    _Aggregates.ProjectToSubspace(PleftProj,in);     
+    _Aggregates.PromoteFromSubspace(PleftProj,out);  
    axpy(in_sbar,-1.0,out,in);      // in_sbar = in - in_s

-    ApplyInverse(PleftProj,PleftMss_proj); // Mss^{-1} in_s
-    PromoteFromSubspace(PleftMss_proj,out);
+    _CoarseSolver(PleftProj,PleftMss_proj); // Mss^{-1} in_s
+    _Aggregates.PromoteFromSubspace(PleftMss_proj,out);

-    HermOp(out,Mtmp);
+    _Linop.HermOp(out,Mtmp);

-    ProjectToSubspace(Mtmp,PleftProj);      // Msbar s Mss^{-1}
-    PromoteFromSubspace(PleftProj,tmp2);
+    _Aggregates.ProjectToSubspace(PleftProj,Mtmp);      // Msbar s Mss^{-1}
+    _Aggregates.PromoteFromSubspace(PleftProj,tmp2);

    axpy(out,-1.0,tmp2,Mtmp);
    axpy(out,-1.0,out,in_sbar);     // in_sbar - Msbars Mss^{-1} in_s
  }
-}
+};
+NAMESPACE_END(Grid);

-template<class Field>
-class TwoLevelFlexiblePcgADef2 : public TwoLevelFlexiblePcg<Field> {
- public:
-  virtual void M(Field & in,Field & out,Field & tmp){
-
-  } 
-  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp){
-
-  }
-  virtual void M2(Field & in, Field & out){
-
-  }
-  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp){
-
-  }
-  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp){
-
-  }
-}
-/*
-template<class Field>
-class TwoLevelFlexiblePcgAD : public TwoLevelFlexiblePcg<Field> {
- public:
-  virtual void M(Field & in,Field & out,Field & tmp); 
-  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
-  virtual void M2(Field & in, Field & out);
-  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
-  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
-}
-
-template<class Field>
-class TwoLevelFlexiblePcgDef1 : public TwoLevelFlexiblePcg<Field> {
- public:
-  virtual void M(Field & in,Field & out,Field & tmp); 
-  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
-  virtual void M2(Field & in, Field & out);
-  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
-  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
-  virtual void   Vout  (Field & in, Field & out,Field & src,Field & tmp);
-}
-
-template<class Field>
-class TwoLevelFlexiblePcgDef2 : public TwoLevelFlexiblePcg<Field> {
- public:
-  virtual void M(Field & in,Field & out,Field & tmp); 
-  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
-  virtual void M2(Field & in, Field & out);
-  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
-  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
-}
-
-template<class Field>
-class TwoLevelFlexiblePcgV11: public TwoLevelFlexiblePcg<Field> {
- public:
-  virtual void M(Field & in,Field & out,Field & tmp); 
-  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
-  virtual void M2(Field & in, Field & out);
-  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
-  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
-}
-*/
 #endif
--- a/Grid/algorithms/iterative/Deflation.h
+++ b/Grid/algorithms/iterative/Deflation.h
@@ -60,6 +60,8 @@ public:
  DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};

  virtual void operator()(const Field &src,Field &guess) {
+    RealD t=-usecond();
+    
    guess = Zero();
    assert(evec.size()==eval.size());
    auto N = evec.size();
@@ -68,6 +70,8 @@ public:
      axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
    }
    guess.Checkerboard() = src.Checkerboard();
+    t+=usecond();
+    std::cout<<GridLogMessage<<"\t\t\t" << "Deflated guess took "<< t/1000.0<< "ms" <<std::endl;
  }
 };

--- a/Grid/algorithms/iterative/PrecConjugateResidual.h
+++ b/Grid/algorithms/iterative/PrecConjugateResidual.h
@@ -59,7 +59,7 @@ public:
    GridBase *grid = src.Grid();
    Field r(grid),  p(grid), Ap(grid), Ar(grid), z(grid);
      
-    psi=zero;
+    psi=Zero();
    r  = src;
    Preconditioner(r,p);

--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -53,7 +53,11 @@ public:
  { 
    size_type bytes = __n*sizeof(_Tp);
    profilerAllocate(bytes);
+#ifdef GRID_UVM
+    _Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
+#else 
    _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
+#endif
    assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
    return ptr;
  }
@@ -62,7 +66,11 @@ public:
  { 
    size_type bytes = __n * sizeof(_Tp);
    profilerFree(bytes);
+#ifdef GRID_UVM
+    MemoryManager::SharedFree((void *)__p,bytes);
+#else
    MemoryManager::CpuFree((void *)__p,bytes);
+#endif
  }

  // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
--- a/Grid/allocator/MemoryManager.cc
+++ b/Grid/allocator/MemoryManager.cc
@@ -9,11 +9,13 @@ NAMESPACE_BEGIN(Grid);
 #define AccSmall (3)
 #define Shared   (4)
 #define SharedSmall (5)
+uint64_t total_cache;
 uint64_t total_shared;
 uint64_t total_device;
 uint64_t total_host;;
 void MemoryManager::PrintBytes(void)
 {
+  std::cout << " MemoryManager : "<<total_cache <<" cache       bytes "<<std::endl;
  std::cout << " MemoryManager : "<<total_shared<<" shared      bytes "<<std::endl;
  std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
  std::cout << " MemoryManager : "<<total_host  <<" cpu         bytes "<<std::endl;
@@ -35,6 +37,8 @@ void *MemoryManager::AcceleratorAllocate(size_t bytes)
  if ( ptr == (void *) NULL ) {
    ptr = (void *) acceleratorAllocDevice(bytes);
    total_device+=bytes;
+  } else {
+    //    std::cout <<"AcceleratorAllocate: cache hit Device pointer "<<std::hex<<ptr<<std::dec<<" "<<bytes<<std::endl;
  }
  return ptr;
 }
@@ -53,8 +57,10 @@ void *MemoryManager::SharedAllocate(size_t bytes)
  if ( ptr == (void *) NULL ) {
    ptr = (void *) acceleratorAllocShared(bytes);
    total_shared+=bytes;
-    //    std::cout <<"AcceleratorAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
+    //    std::cout <<"SharedAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
    //    PrintBytes();
+  } else {
+    //    std::cout <<"SharedAllocate: cache hit Shared pointer "<<std::hex<<ptr<<std::dec<<" "<<bytes<<std::endl;
  }
  return ptr;
 }
@@ -74,6 +80,9 @@ void *MemoryManager::CpuAllocate(size_t bytes)
  if ( ptr == (void *) NULL ) {
    ptr = (void *) acceleratorAllocShared(bytes);
    total_host+=bytes;
+    //    std::cout <<"CpuAllocate: allocated Cpu pointer "<<std::hex<<ptr<<std::dec<<std::endl;
+  } else {
+    //    std::cout <<"CpufAllocate: cache hit Cpu pointer "<<std::hex<<ptr<<std::dec<<" "<<bytes<<std::endl;
  }
  return ptr;
 }
@@ -120,7 +129,7 @@ void MemoryManager::Init(void)
  str= getenv("GRID_ALLOC_NCACHE_LARGE");
  if ( str ) {
    Nc = atoi(str);
-    if ( (Nc>=0) && (Nc < NallocCacheMax)) {
+    if ( (Nc>=0) && (Nc <= NallocCacheMax)) {
      Ncache[Cpu]=Nc;
      Ncache[Acc]=Nc;
      Ncache[Shared]=Nc;
@@ -130,7 +139,7 @@ void MemoryManager::Init(void)
  str= getenv("GRID_ALLOC_NCACHE_SMALL");
  if ( str ) {
    Nc = atoi(str);
-    if ( (Nc>=0) && (Nc < NallocCacheMax)) {
+    if ( (Nc>=0) && (Nc <= NallocCacheMax)) {
      Ncache[CpuSmall]=Nc;
      Ncache[AccSmall]=Nc;
      Ncache[SharedSmall]=Nc;
@@ -211,6 +220,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries

  if ( entries[v].valid ) {
    ret = entries[v].address;
+    total_cache-=entries[v].bytes;
    entries[v].valid = 0;
    entries[v].address = NULL;
    entries[v].bytes = 0;
@@ -219,6 +229,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
  entries[v].address=ptr;
  entries[v].bytes  =bytes;
  entries[v].valid  =1;
+  total_cache+=entries[v].bytes;

  return ret;
 }
@@ -243,6 +254,7 @@ void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncach
  for(int e=0;e<ncache;e++){
    if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
      entries[e].valid = 0;
+      total_cache-=bytes;
      return entries[e].address;
    }
  }
--- a/Grid/allocator/MemoryManager.h
+++ b/Grid/allocator/MemoryManager.h
@@ -93,8 +93,8 @@ private:
  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;

-  static void PrintBytes(void);
 public:
+  static void PrintBytes(void);
  static void Init(void);
  static void InitMessage(void);
  static void *AcceleratorAllocate(size_t bytes);
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@@ -138,6 +138,21 @@ public:
 		      int recv_from_rank,
 		      int bytes);
  
+  void SendRecvPacket(void *xmit,
+		      void *recv,
+		      int xmit_to_rank,
+		      int recv_from_rank,
+		      int bytes);
+  
+  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+			   void *xmit,
+			   int xmit_to_rank,
+			   void *recv,
+			   int recv_from_rank,
+			   int bytes);
+  
+  void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
+
  double StencilSendToRecvFrom(void *xmit,
 			       int xmit_to_rank,
 			       void *recv,
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@@ -77,6 +77,15 @@ void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){}
 void CartesianCommunicator::GlobalXOR(uint32_t &){}
 void CartesianCommunicator::GlobalXOR(uint64_t &){}

+void CartesianCommunicator::SendRecvPacket(void *xmit,
+					   void *recv,
+					   int xmit_to_rank,
+					   int recv_from_rank,
+					   int bytes)
+{
+  assert(0);
+}
+

 // Basic Halo comms primitive -- should never call in single node
 void CartesianCommunicator::SendToRecvFrom(void *xmit,
@@ -87,6 +96,20 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 {
  assert(0);
 }
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+						void *xmit,
+						int dest,
+						void *recv,
+						int from,
+						int bytes)
+{
+  assert(0);
+}
+
+void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
+{
+  assert(0);
+}
 void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes)
 {
  bcopy(in,out,bytes*words);
@@ -114,6 +137,10 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int recv_from_rank,
 						     int bytes, int dir)
 {
+  std::vector<CommsRequest_t> list;
+  // Discard the "dir"
+  SendToRecvFromBegin   (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
+  SendToRecvFromComplete(list);
  return 2.0*bytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
@@ -123,10 +150,13 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 							 int recv_from_rank,
 							 int bytes, int dir)
 {
+  // Discard the "dir"
+  SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
  return 2.0*bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
+  SendToRecvFromComplete(waitall);
 }

 void CartesianCommunicator::StencilBarrier(void){};
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -32,9 +32,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_CUDA
 #include <cuda_runtime_api.h>
 #endif
-#ifdef GRID_HIP
-#include <hip/hip_runtime_api.h>
-#endif

 NAMESPACE_BEGIN(Grid); 
 #define header "SharedMemoryMpi: "
@@ -428,7 +425,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Hugetlbfs mapping intended
 ////////////////////////////////////////////////////////////////////////////////////////////
-#if defined(GRID_CUDA) ||defined(GRID_HIP)
+#ifdef GRID_CUDA
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
  void * ShmCommBuf ; 
@@ -451,15 +448,21 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Each MPI rank should allocate our own buffer
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-  ShmCommBuf = acceleratorAllocDevice(bytes);
-
+#ifndef GRID_MPI3_SHM_NONE
+  auto err =  cudaMalloc(&ShmCommBuf, bytes);
+#else
+  auto err =  cudaMallocManaged(&ShmCommBuf, bytes);
+#endif
+  if ( err !=  cudaSuccess) {
+    std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
+    exit(EXIT_FAILURE);  
+  }
  if (ShmCommBuf == (void *)NULL ) {
-    std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
+    std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed NULL pointer for " << bytes<<" bytes " << std::endl;
    exit(EXIT_FAILURE);  
  }
  if ( WorldRank == 0 ){
-    std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes 
-	      << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
+    std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
  }
  SharedMemoryZero(ShmCommBuf,bytes);

@@ -472,26 +475,15 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    //////////////////////////////////////////////////
    // If it is me, pass around the IPC access key
    //////////////////////////////////////////////////
-#ifdef GRID_CUDA
    cudaIpcMemHandle_t handle;
+    
    if ( r==WorldShmRank ) { 
-      auto err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
+      err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
      if ( err !=  cudaSuccess) {
 	std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
 	exit(EXIT_FAILURE);
      }
    }
-#endif
-#ifdef GRID_HIP
-    hipIpcMemHandle_t handle;    
-    if ( r==WorldShmRank ) { 
-      auto err = hipIpcGetMemHandle(&handle,ShmCommBuf);
-      if ( err !=  hipSuccess) {
-	std::cerr << " SharedMemoryMPI.cc hipIpcGetMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
-	exit(EXIT_FAILURE);
-      }
-    }
-#endif
    //////////////////////////////////////////////////
    // Share this IPC handle across the Shm Comm
    //////////////////////////////////////////////////
@@ -508,24 +500,13 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    // If I am not the source, overwrite thisBuf with remote buffer
    ///////////////////////////////////////////////////////////////
    void * thisBuf = ShmCommBuf;
-#ifdef GRID_CUDA
    if ( r!=WorldShmRank ) { 
-      auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
+      err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
      if ( err !=  cudaSuccess) {
 	std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
 	exit(EXIT_FAILURE);
      }
    }
-#endif
-#ifdef GRID_HIP
-    if ( r!=WorldShmRank ) { 
-      auto err = hipIpcOpenMemHandle(&thisBuf,handle,hipIpcMemLazyEnablePeerAccess);
-      if ( err !=  hipSuccess) {
-	std::cerr << " SharedMemoryMPI.cc hipIpcOpenMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
-	exit(EXIT_FAILURE);
-      }
-    }
-#endif
    ///////////////////////////////////////////////////////////////
    // Save a copy of the device buffers
    ///////////////////////////////////////////////////////////////
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@@ -36,7 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_local.h>
 #include <Grid/lattice/Lattice_reduction.h>
 #include <Grid/lattice/Lattice_peekpoke.h>
-//#include <Grid/lattice/Lattice_reality.h>
+#include <Grid/lattice/Lattice_reality.h>
 #include <Grid/lattice/Lattice_real_imag.h>
 #include <Grid/lattice/Lattice_comparison_utils.h>
 #include <Grid/lattice/Lattice_comparison.h>
--- a/Grid/lattice/Lattice_arith.h
+++ b/Grid/lattice/Lattice_arith.h
@@ -60,9 +60,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  autoView( lhs_v , lhs, AcceleratorRead);
  autoView( rhs_v , rhs, AcceleratorRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
+    decltype(coalescedRead(obj1())) tmp;
    auto lhs_t=lhs_v(ss);
    auto rhs_t=rhs_v(ss);
-    auto tmp  =ret_v(ss);
    mac(&tmp,&lhs_t,&rhs_t);
    coalescedWrite(ret_v[ss],tmp);
  });
@@ -124,7 +124,7 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  autoView( ret_v , ret, AcceleratorWrite);
  autoView( lhs_v , lhs, AcceleratorRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
-    auto tmp  =ret_v(ss);
+    decltype(coalescedRead(obj1())) tmp;
    auto lhs_t=lhs_v(ss);
    mac(&tmp,&lhs_t,&rhs);
    coalescedWrite(ret_v[ss],tmp);
@@ -182,7 +182,7 @@ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  autoView( ret_v , ret, AcceleratorWrite);
  autoView( rhs_v , lhs, AcceleratorRead);
  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
-    auto tmp  =ret_v(ss);
+    decltype(coalescedRead(obj1())) tmp;
    auto rhs_t=rhs_v(ss);
    mac(&tmp,&lhs,&rhs_t);
    coalescedWrite(ret_v[ss],tmp);
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -2,13 +2,12 @@ NAMESPACE_BEGIN(Grid);

 #ifdef GRID_HIP
 extern hipDeviceProp_t *gpu_props;
-#define WARP_SIZE 64
 #endif
 #ifdef GRID_CUDA
 extern cudaDeviceProp *gpu_props;
-#define WARP_SIZE 32
 #endif

+#define WARP_SIZE 32
 __device__ unsigned int retirementCount = 0;

 template <class Iterator>
@@ -65,7 +64,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
  
  // cannot use overloaded operators for sobj as they are not volatile-qualified
  memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
-  acceleratorSynchronise();
+  __syncwarp();
  
  const Iterator VEC = WARP_SIZE;
  const Iterator vid = tid & (VEC-1);
@@ -79,9 +78,9 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
      beta += temp;
      memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
    }
-    acceleratorSynchronise();
+    __syncwarp();
  }
-  acceleratorSynchroniseAll();
+  __syncthreads();
  
  if (threadIdx.x == 0) {
    beta  = Zero();
@@ -91,7 +90,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
    }
    memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
  }
-  acceleratorSynchroniseAll();
+  __syncthreads();
 }


--- a/Grid/lattice/Lattice_view.h
+++ b/Grid/lattice/Lattice_view.h
@@ -52,6 +52,7 @@ public:
 // This will be safe to call from accelerator_for and is trivially copy constructible
 // The copy constructor for this will need to be used by device lambda functions
 /////////////////////////////////////////////////////////////////////////////////////////
+#undef LATTICE_BOUNDS_CHECK
 template<class vobj> 
 class LatticeView : public LatticeAccelerator<vobj>
 {
@@ -61,14 +62,36 @@ public:
  void * cpu_ptr;
 #ifdef GRID_SIMT
  accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { 
+#ifdef LATTICE_BOUNDS_CHECK
+    assert(i<this->_odata_size);
+    assert(i>=0);
+#endif
    return coalescedRead(this->_odata[i]); 
  }
 #else 
-  accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
+  accelerator_inline const vobj & operator()(size_t i) const {
+#ifdef LATTICE_BOUNDS_CHECK
+    assert(i<this->_odata_size);
+    assert(i>=0);
+#endif
+    return this->_odata[i];
+  }
 #endif

-  accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
-  accelerator_inline vobj       & operator[](size_t i)       { return this->_odata[i]; };
+  accelerator_inline const vobj & operator[](size_t i) const { 
+#ifdef LATTICE_BOUNDS_CHECK
+    assert(i<this->_odata_size);
+    assert(i>=0);
+#endif
+    return this->_odata[i]; 
+  };
+  accelerator_inline vobj       & operator[](size_t i)       { 
+#ifdef LATTICE_BOUNDS_CHECK
+    assert(i<this->_odata_size);
+    assert(i>=0);
+#endif
+    return this->_odata[i]; 
+  };

  accelerator_inline uint64_t begin(void) const { return 0;};
  accelerator_inline uint64_t end(void)   const { return this->_odata_size; };
--- a/Grid/log/Log.h
+++ b/Grid/log/Log.h
@@ -130,8 +130,6 @@ public:
  friend std::ostream& operator<< (std::ostream& stream, Logger& log){

    if ( log.active ) {
-      std::ios_base::fmtflags f(stream.flags());
-
      stream << log.background()<<  std::left;
      if (log.topWidth > 0)
      {
@@ -154,8 +152,6 @@ public:
 	       << now	       << log.background() << " : " ;
      }
      stream << log.colour();
-      stream.flags(f);
-
      return stream;
    } else { 
      return devnull;
--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@@ -77,9 +77,16 @@ const int SpinorIndex = 2;
 template<typename T> struct isSpinor {
  static constexpr bool value = (SpinorIndex==T::TensorLevel);
 };
+const int CoarseIndex = 4;
+template<typename T> struct isCoarsened {
+  static constexpr bool value = (CoarseIndex<=T::TensorLevel);
+};
 template <typename T> using IfSpinor    = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
 template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;

+template <typename T> using IfCoarsened    = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
+template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;
+
 // ChrisK very keen to add extra space for Gparity doubling.
 //
 // Also add domain wall index, in a way where Wilson operator 
--- a/Grid/qcd/action/fermion/FermionOperator.h
+++ b/Grid/qcd/action/fermion/FermionOperator.h
@@ -89,7 +89,8 @@ public:
  virtual void  Mdiag  (const FermionField &in, FermionField &out) { Mooee(in,out);};   // Same as Mooee applied to both CB's
  virtual void  Mdir   (const FermionField &in, FermionField &out,int dir,int disp)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
-
+  virtual std::vector<int> Directions(void)   =0;
+  virtual std::vector<int> Displacements(void)=0;

  virtual void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);};

--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
@@ -44,6 +44,9 @@ public:
  INHERIT_IMPL_TYPES(Impl);
  typedef StaggeredKernels<Impl> Kernels;

+  virtual std::vector<int> Directions(void)   { return this->directions; };
+  virtual std::vector<int> Displacements(void){ return this->displacements;};
+
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }

--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@@ -49,6 +49,9 @@ public:
  INHERIT_IMPL_TYPES(Impl);
  typedef StaggeredKernels<Impl> Kernels;

+  virtual std::vector<int> Directions(void)   { return this->directions; };
+  virtual std::vector<int> Displacements(void){ return this->displacements;};
+
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }

--- a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
@@ -47,6 +47,9 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }

+  virtual std::vector<int> Directions(void)   { return this->directions; };
+  virtual std::vector<int> Displacements(void){ return this->displacements;};
+
  ////////////////////////////////////////
  // Performance monitoring
  ////////////////////////////////////////
--- a/Grid/qcd/action/fermion/StaggeredKernels.h
+++ b/Grid/qcd/action/fermion/StaggeredKernels.h
@@ -63,20 +63,17 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
   ///////////////////////////////////////////////////////////////////////////////////////
   // Generic Nc kernels
   ///////////////////////////////////////////////////////////////////////////////////////
-   template<int Naik> 
-   static accelerator_inline
+   template<int Naik> accelerator_inline
   void DhopSiteGeneric(StencilView &st, 
 			DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
 			const FermionFieldView &in, FermionFieldView &out,int dag);
-   
-   template<int Naik> static accelerator_inline
+   template<int Naik> accelerator_inline
   void DhopSiteGenericInt(StencilView &st, 
 			   DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 			   SiteSpinor * buf, int LLs, int sU, 
 			   const FermionFieldView &in, FermionFieldView &out,int dag);
-   
-   template<int Naik> static accelerator_inline
+   template<int Naik> accelerator_inline
   void DhopSiteGenericExt(StencilView &st, 
 			   DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 			   SiteSpinor * buf, int LLs, int sU, 
@@ -85,20 +82,17 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
   ///////////////////////////////////////////////////////////////////////////////////////
   // Nc=3 specific kernels
   ///////////////////////////////////////////////////////////////////////////////////////
-   
-   template<int Naik> static accelerator_inline
+   template<int Naik> accelerator_inline
   void DhopSiteHand(StencilView &st, 
 		     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 		     SiteSpinor * buf, int LLs, int sU, 
 		     const FermionFieldView &in, FermionFieldView &out,int dag);
-   
-   template<int Naik> static accelerator_inline
+   template<int Naik> accelerator_inline
   void DhopSiteHandInt(StencilView &st, 
 			DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
 			const FermionFieldView &in, FermionFieldView &out,int dag);
-   
-   template<int Naik> static accelerator_inline
+   template<int Naik> accelerator_inline
   void DhopSiteHandExt(StencilView &st, 
 			DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
@@ -107,7 +101,6 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
   ///////////////////////////////////////////////////////////////////////////////////////
   // Asm Nc=3 specific kernels
   ///////////////////////////////////////////////////////////////////////////////////////
-   
   void DhopSiteAsm(StencilView &st, 
 		    DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 		    SiteSpinor * buf, int LLs, int sU, 
--- a/Grid/qcd/action/fermion/WilsonFermion.h
+++ b/Grid/qcd/action/fermion/WilsonFermion.h
@@ -63,6 +63,9 @@ public:
  INHERIT_IMPL_TYPES(Impl);
  typedef WilsonKernels<Impl> Kernels;

+  virtual std::vector<int> Directions(void)   { return this->directions; };
+  virtual std::vector<int> Displacements(void){ return this->displacements;};
+
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
  ///////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@@ -72,6 +72,9 @@ public:
  typedef WilsonKernels<Impl> Kernels;
  PmuStat stat;

+  virtual std::vector<int> Directions(void)   { return this->directions; };
+  virtual std::vector<int> Displacements(void){ return this->displacements;};
+
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }

--- a/Grid/qcd/action/fermion/g5HermitianLinop.h
+++ b/Grid/qcd/action/fermion/g5HermitianLinop.h
@@ -79,6 +79,8 @@ public:
    _Mat.M(in,tmp);
    G5R5(out,tmp);
  }
+  virtual std::vector<int> Directions(void)   { return _Mat.Directions();};
+  virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
 };


@@ -127,6 +129,8 @@ public:
    _Mat.M(in,tmp);
    out=g5*tmp;
  }
+  virtual std::vector<int> Directions(void)   { return _Mat.Directions();};
+  virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
 };

 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -799,7 +799,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,

  PropagatorField tmp(UGrid);
  PropagatorField Utmp(UGrid);
-  PropagatorField zz (UGrid);   zz=0.0;
+  LatticeInteger zz (UGrid);   zz=0.0;
  LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
  for (int s=0;s<Ls;s++) {

@@ -850,7 +850,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  PropagatorField tmp(UGrid);
  PropagatorField Utmp(UGrid);

-  PropagatorField  zz (UGrid);   zz=0.0;
+  LatticeInteger zz (UGrid);   zz=0.0;
  LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);

  for(int s=0;s<Ls;s++){
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h
@@ -146,7 +146,7 @@ NAMESPACE_BEGIN(Grid);


 template <class Impl>
-template <int Naik> accelerator_inline
+template <int Naik>
 void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
 					  DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
 					  SiteSpinor *buf, int sF, int sU, 
@@ -221,7 +221,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,


 template <class Impl>
-template <int Naik> accelerator_inline
+template <int Naik>
 void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st, 
 					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 					     SiteSpinor *buf, int sF, int sU, 
@@ -300,7 +300,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,


 template <class Impl>
-template <int Naik> accelerator_inline
+template <int Naik>
 void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
 					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 					     SiteSpinor *buf, int sF, int sU, 
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
@@ -78,7 +78,7 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
 // Int, Ext, Int+Ext cases for comms overlap
 ////////////////////////////////////////////////////////////////////////////////////
 template <class Impl>
-template <int Naik> accelerator_inline
+template <int Naik>
 void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st, 
 					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 					     SiteSpinor *buf, int sF, int sU, 
@@ -126,7 +126,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
  // Only contributions from interior of our node
  ///////////////////////////////////////////////////
 template <class Impl>
-template <int Naik> accelerator_inline
+template <int Naik>
 void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st, 
 						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 						SiteSpinor *buf, int sF, int sU, 
@@ -174,7 +174,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
  // Only contributions from exterior of our node
  ///////////////////////////////////////////////////
 template <class Impl>
-template <int Naik> accelerator_inline
+template <int Naik>
 void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st, 
 						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 						SiteSpinor *buf, int sF, int sU,
@@ -224,7 +224,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
 ////////////////////////////////////////////////////////////////////////////////////
 // Driving / wrapping routine to select right kernel
 ////////////////////////////////////////////////////////////////////////////////////
-template <class Impl> 
+template <class Impl>
 void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
 					   int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp)
 {
@@ -253,7 +253,7 @@ void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldVie
      ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag);		\
  });

-template <class Impl> 
+template <class Impl>
 void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo, 
 					  DoubledGaugeField &U, DoubledGaugeField &UUU, 
 					  const FermionField &in, FermionField &out, int dag, int interior,int exterior)
@@ -293,7 +293,7 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
  }
  assert(0 && " Kernel optimisation case not covered ");
 }
-template <class Impl> 
+template <class Impl>
 void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo, 
 				       DoubledGaugeField &U,
 				       const FermionField &in, FermionField &out, int dag, int interior,int exterior)
--- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
@@ -133,14 +133,14 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
  pickCheckerboard(Even, CloverTermEven, CloverTerm);
  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);

-  pickCheckerboard(Even, CloverTermDagEven, closure(adj(CloverTerm)));
-  pickCheckerboard(Odd, CloverTermDagOdd, closure(adj(CloverTerm)));
+  pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
+  pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));

  pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
  pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);

-  pickCheckerboard(Even, CloverTermInvDagEven, closure(adj(CloverTermInv)));
-  pickCheckerboard(Odd, CloverTermInvDagOdd, closure(adj(CloverTermInv)));
+  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
+  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
 }

 template <class Impl>
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h
@@ -646,7 +646,7 @@ NAMESPACE_BEGIN(Grid);
  HAND_RESULT_EXT(ss,F)

 #define HAND_SPECIALISE_GPARITY(IMPL)					\
-  template<> accelerator_inline void						\
+  template<> void						\
  WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
 				    int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
@@ -662,7 +662,7 @@ NAMESPACE_BEGIN(Grid);
    HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
 									\
-  template<> accelerator_inline void						\
+  template<> void						\
  WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
 				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
@@ -678,7 +678,7 @@ NAMESPACE_BEGIN(Grid);
    HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
 									\
-  template<> accelerator_inline void						\
+  template<> void						\
  WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
 				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
@@ -694,7 +694,7 @@ NAMESPACE_BEGIN(Grid);
    HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
 									\
-  template<> accelerator_inline void						\
+  template<> void						\
  WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
@@ -710,7 +710,7 @@ NAMESPACE_BEGIN(Grid);
    HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
 									\
-  template<> accelerator_inline void							\
+  template<> void							\
  WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
 				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
@@ -727,7 +727,7 @@ NAMESPACE_BEGIN(Grid);
    nmu = 0;								\
    HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
-  template<> accelerator_inline void						\
+  template<> void						\
  WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h
@@ -495,7 +495,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-template<class Impl> accelerator_inline void 
+template<class Impl> void 
 WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 				  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -519,7 +519,7 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
  HAND_RESULT(ss);
 }

-template<class Impl>  accelerator_inline
+template<class Impl>
 void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -542,7 +542,7 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
  HAND_RESULT(ss);
 }

-template<class Impl>  accelerator_inline void 
+template<class Impl> void 
 WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -566,7 +566,7 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si
  HAND_RESULT(ss);
 }

-template<class Impl> accelerator_inline
+template<class Impl>
 void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -589,7 +589,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
  HAND_RESULT(ss);
 }

-template<class Impl>  accelerator_inline void 
+template<class Impl> void 
 WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
@@ -614,7 +614,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si
  HAND_RESULT_EXT(ss);
 }

-template<class Impl>  accelerator_inline
+template<class Impl>
 void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -114,7 +114,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
  ////////////////////////////////////////////////////////////////////
  // All legs kernels ; comms then compute
  ////////////////////////////////////////////////////////////////////
-template <class Impl> accelerator_inline
+template <class Impl>
 void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
 					     SiteHalfSpinor *buf, int sF,
 					     int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -140,7 +140,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV
  coalescedWrite(out[sF],result,lane);
 };

-template <class Impl> accelerator_inline
+template <class Impl>
 void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
 					  SiteHalfSpinor *buf, int sF,
 					  int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -169,7 +169,7 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView
  ////////////////////////////////////////////////////////////////////
  // Interior kernels
  ////////////////////////////////////////////////////////////////////
-template <class Impl> accelerator_inline
+template <class Impl>
 void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st,  DoubledGaugeFieldView &U,
 						SiteHalfSpinor *buf, int sF,
 						int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -197,7 +197,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st,  DoubledGaugeFi
  coalescedWrite(out[sF], result,lane);
 };

-template <class Impl> accelerator_inline
+template <class Impl>
 void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st,  DoubledGaugeFieldView &U,
 							 SiteHalfSpinor *buf, int sF,
 							 int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -227,7 +227,7 @@ void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st,  DoubledGaugeField
 ////////////////////////////////////////////////////////////////////
 // Exterior kernels
 ////////////////////////////////////////////////////////////////////
-template <class Impl> accelerator_inline
+template <class Impl>
 void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st,  DoubledGaugeFieldView &U,
 						SiteHalfSpinor *buf, int sF,
 						int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -258,7 +258,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st,  DoubledGaugeFi
  }
 };

-template <class Impl> accelerator_inline
+template <class Impl>
 void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  DoubledGaugeFieldView &U,
 					     SiteHalfSpinor *buf, int sF,
 					     int sU, const FermionFieldView &in, FermionFieldView &out)
@@ -290,7 +290,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  DoubledGaugeField
 };

 #define DhopDirMacro(Dir,spProj,spRecon)	\
-  template <class Impl> accelerator_inline				\
+  template <class Impl>							\
  void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \
 					 int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
  {									\
@@ -318,7 +318,7 @@ DhopDirMacro(Ym,spProjYm,spReconYm);
 DhopDirMacro(Zm,spProjZm,spReconZm);
 DhopDirMacro(Tm,spProjTm,spReconTm);

-template <class Impl> accelerator_inline
+template <class Impl>
 void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
 				    int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
 {
--- a/Grid/qcd/spin/TwoSpinor.h
+++ b/Grid/qcd/spin/TwoSpinor.h
@@ -128,7 +128,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProjTm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  hspin(0)=fspin(0)-fspin(2);
  hspin(1)=fspin(1)-fspin(3);
 }
@@ -138,40 +137,50 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 *  0 0 -1  0
 *  0 0  0 -1
 */
-
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5p (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  hspin(0)=fspin(0);
  hspin(1)=fspin(1);
 }

 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5m (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  hspin(0)=fspin(2);
  hspin(1)=fspin(3);
 }
  
-//  template<class vtype> accelerator_inline void fspProj5p (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin)
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5p (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  rfspin(0)=fspin(0);
  rfspin(1)=fspin(1);
  rfspin(2)=Zero();
  rfspin(3)=Zero();
 }
-//  template<class vtype> accelerator_inline void fspProj5m (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin)
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5m (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  rfspin(0)=Zero();
  rfspin(1)=Zero();
  rfspin(2)=fspin(2);
  rfspin(3)=fspin(3);
 }

+template<class vtype,int N,IfCoarsened<iVector<vtype,N> > = 0> accelerator_inline void spProj5p (iVector<vtype,N> &rfspin,const iVector<vtype,N> &fspin)
+{
+  const int hN = N>>1;
+  for(int s=0;s<hN;s++){
+    rfspin(s)=fspin(s);
+    rfspin(s+hN)=Zero();
+  }
+}
+template<class vtype,int N,IfCoarsened<iVector<vtype,N> > = 0> accelerator_inline void spProj5m (iVector<vtype,N> &rfspin,const iVector<vtype,N> &fspin)
+{
+  const int hN = N>>1;
+  for(int s=0;s<hN;s++){
+    rfspin(s)=Zero();
+    rfspin(s+hN)=fspin(s+hN);
+  }
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Reconstruction routines to move back again to four spin
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -183,7 +192,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 */
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconXp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)=timesMinusI(hspin(1));
@@ -191,7 +199,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconXm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)=timesI(hspin(1));
@@ -199,7 +206,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconXp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)-=timesI(hspin(1));
@@ -207,7 +213,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconXm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)+=timesI(hspin(1));
@@ -221,7 +226,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a

 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconYp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)= hspin(1);
@@ -229,7 +233,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconYm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)=-hspin(1);
@@ -237,7 +240,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconYp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)+=hspin(1);
@@ -245,7 +247,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconYm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)-=hspin(1);
@@ -260,7 +261,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
 */
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconZp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)=timesMinusI(hspin(0));
@@ -268,7 +268,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconZm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)=     timesI(hspin(0));
@@ -276,7 +275,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconZp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)-=timesI(hspin(0));
@@ -284,7 +282,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconZm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)+=timesI(hspin(0));
@@ -298,7 +295,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
 */
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconTp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)=hspin(0);
@@ -306,7 +302,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconTm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)=-hspin(0);
@@ -314,7 +309,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconTp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)+=hspin(0);
@@ -322,7 +316,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconTm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)-=hspin(0);
@@ -336,7 +329,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
 */
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spRecon5p (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0)+hspin(0); // add is lower latency than mul
  fspin(1)=hspin(1)+hspin(1); // probably no measurable diffence though
  fspin(2)=Zero();
@@ -344,7 +336,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spRecon5m (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=Zero();
  fspin(1)=Zero();
  fspin(2)=hspin(0)+hspin(0);
@@ -352,7 +343,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumRecon5p (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0)+hspin(0);
  fspin(1)+=hspin(1)+hspin(1);
 }
@@ -372,7 +362,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
 //////////
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjXp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProjXp(hspin._internal[i],fspin._internal[i]);
  }
@@ -426,26 +415,21 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconXp (iM
    }}
 }

-
-
 ////////
 // Xm
 ////////
 template<class rtype,class vtype> accelerator_inline void spProjXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProjXm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProjXm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spProjXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProjXm(hspin._internal[i][j],fspin._internal[i][j]);
@@ -455,19 +439,16 @@ template<class rtype,class vtype,int N> accelerator_inline void spProjXm (iMatri

 template<class rtype,class vtype> accelerator_inline void spReconXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spReconXm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spReconXm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spReconXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spReconXm(hspin._internal[i][j],fspin._internal[i][j]);
@@ -476,45 +457,37 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconXm (iMatr

 template<class rtype,class vtype> accelerator_inline void accumReconXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumReconXm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumReconXm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumReconXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumReconXm(hspin._internal[i][j],fspin._internal[i][j]);
    }}
 }

-
-
 ////////
 // Yp
 ////////
 template<class rtype,class vtype> accelerator_inline void spProjYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProjYp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProjYp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spProjYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProjYp(hspin._internal[i][j],fspin._internal[i][j]);
@@ -524,19 +497,16 @@ template<class rtype,class vtype,int N> accelerator_inline void spProjYp (iMatri

 template<class rtype,class vtype> accelerator_inline void spReconYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spReconYp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spReconYp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spReconYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spReconYp(hspin._internal[i][j],fspin._internal[i][j]);
@@ -545,66 +515,55 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconYp (iMatr

 template<class rtype,class vtype> accelerator_inline void accumReconYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumReconYp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumReconYp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumReconYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumReconYp(hspin._internal[i][j],fspin._internal[i][j]);
    }}
 }

-
 ////////
 // Ym
 ////////
 template<class rtype,class vtype> accelerator_inline void spProjYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProjYm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProjYm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spProjYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProjYm(hspin._internal[i][j],fspin._internal[i][j]);
    }}
 }

-
 template<class rtype,class vtype> accelerator_inline void spReconYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spReconYm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,const iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spReconYm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spReconYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spReconYm(hspin._internal[i][j],fspin._internal[i][j]);
@@ -613,19 +572,16 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconYm (iMatr

 template<class rtype,class vtype> accelerator_inline void accumReconYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumReconYm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumReconYm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumReconYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumReconYm(hspin._internal[i][j],fspin._internal[i][j]);
@@ -638,66 +594,57 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconYm (iM
 ////////
 template<class rtype,class vtype> accelerator_inline void spProjZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProjZp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProjZp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spProjZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProjZp(hspin._internal[i][j],fspin._internal[i][j]);
-    }}
+  }}
 }


 template<class rtype,class vtype> accelerator_inline void spReconZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spReconZp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spReconZp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spReconZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spReconZp(hspin._internal[i][j],fspin._internal[i][j]);
-    }}
+  }}
 }

 template<class rtype,class vtype> accelerator_inline void accumReconZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumReconZp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumReconZp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumReconZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumReconZp(hspin._internal[i][j],fspin._internal[i][j]);
-    }}
+  }}
 }


@@ -706,62 +653,53 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconZp (iM
 ////////
 template<class rtype,class vtype> accelerator_inline void spProjZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProjZm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProjZm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spProjZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProjZm(hspin._internal[i][j],fspin._internal[i][j]);
-    }}
+  }}
 }


 template<class rtype,class vtype> accelerator_inline void spReconZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spReconZm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spReconZm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spReconZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spReconZm(hspin._internal[i][j],fspin._internal[i][j]);
-    }}
+  }}
 }

 template<class rtype,class vtype> accelerator_inline void accumReconZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumReconZm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumReconZm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumReconZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumReconZm(hspin._internal[i][j],fspin._internal[i][j]);
@@ -774,41 +712,35 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconZm (iM
 ////////
 template<class rtype,class vtype> accelerator_inline void spProjTp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProjTp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjTp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProjTp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spProjTp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProjTp(hspin._internal[i][j],fspin._internal[i][j]);
-    }}
+  }}
 }


 template<class rtype,class vtype> accelerator_inline void spReconTp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spReconTp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconTp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spReconTp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spReconTp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spReconTp(hspin._internal[i][j],fspin._internal[i][j]);
@@ -817,44 +749,37 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconTp (iMatr

 template<class rtype,class vtype> accelerator_inline void accumReconTp (iScalar<rtype> &hspin, iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumReconTp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconTp (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumReconTp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumReconTp (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumReconTp(hspin._internal[i][j],fspin._internal[i][j]);
    }}
 }

-
 ////////
 // Tm
 ////////
 template<class rtype,class vtype> accelerator_inline void spProjTm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProjTm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjTm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProjTm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spProjTm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProjTm(hspin._internal[i][j],fspin._internal[i][j]);
@@ -864,19 +789,16 @@ template<class rtype,class vtype,int N> accelerator_inline void spProjTm (iMatri

 template<class rtype,class vtype> accelerator_inline void spReconTm (iScalar<rtype> &hspin, const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spReconTm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconTm (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spReconTm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spReconTm (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spReconTm(hspin._internal[i][j],fspin._internal[i][j]);
@@ -885,44 +807,37 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconTm (iMatr

 template<class rtype,class vtype> accelerator_inline void accumReconTm (iScalar<rtype> &hspin, const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumReconTm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconTm (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumReconTm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumReconTm (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumReconTm(hspin._internal[i][j],fspin._internal[i][j]);
    }}
 }

-
 ////////
 // 5p
 ////////
-template<class rtype,class vtype> accelerator_inline void spProj5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
+template<class rtype,class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProj5p(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProj5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProj5p(hspin._internal[i],fspin._internal[i]);
  }
 }
-template<class rtype,class vtype,int N> accelerator_inline void spProj5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
+template<class rtype,class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProj5p(hspin._internal[i][j],fspin._internal[i][j]);
@@ -931,19 +846,16 @@ template<class rtype,class vtype,int N> accelerator_inline void spProj5p (iMatri

 template<class rtype,class vtype> accelerator_inline void spRecon5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spRecon5p(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spRecon5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spRecon5p(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spRecon5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spRecon5p(hspin._internal[i][j],fspin._internal[i][j]);
@@ -952,19 +864,16 @@ template<class rtype,class vtype,int N> accelerator_inline void spRecon5p (iMatr

 template<class rtype,class vtype> accelerator_inline void accumRecon5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumRecon5p(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumRecon5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumRecon5p(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumRecon5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumRecon5p(hspin._internal[i][j],fspin._internal[i][j]);
@@ -972,24 +881,18 @@ template<class rtype,class vtype,int N> accelerator_inline void accumRecon5p (iM
 }

 // four spinor projectors for chiral proj
-//  template<class vtype> accelerator_inline void fspProj5p (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
-template<class vtype> accelerator_inline void spProj5p (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
+template<class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProj5p(hspin._internal,fspin._internal);
 }
-//  template<class vtype,int N> accelerator_inline void fspProj5p (iVector<vtype,N> &hspin,iVector<vtype,N> &fspin)
-template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProj5p (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin)
+template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProj5p(hspin._internal[i],fspin._internal[i]);
  }
 }
-//  template<class vtype,int N> accelerator_inline void fspProj5p (iMatrix<vtype,N> &hspin,iMatrix<vtype,N> &fspin)
-template<class vtype,int N> accelerator_inline void spProj5p (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin)
+template<class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProj5p(hspin._internal[i][j],fspin._internal[i][j]);
@@ -1001,17 +904,17 @@ template<class vtype,int N> accelerator_inline void spProj5p (iMatrix<vtype,N> &
 // 5m
 ////////

-template<class rtype,class vtype> accelerator_inline void spProj5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
+template<class rtype,class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
  spProj5m(hspin._internal,fspin._internal);
 }
-template<class rtype,class vtype,int N,IfNotSpinor<iVector<rtype,N> > = 0> accelerator_inline void spProj5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
+template<class rtype,class vtype,int N,IfNotSpinor<iVector<rtype,N> > = 0,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
  for(int i=0;i<N;i++) {
    spProj5m(hspin._internal[i],fspin._internal[i]);
  }
 }
-template<class rtype,class vtype,int N> accelerator_inline void spProj5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
+template<class rtype,class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
@@ -1021,40 +924,34 @@ template<class rtype,class vtype,int N> accelerator_inline void spProj5m (iMatri

 template<class rtype,class vtype> accelerator_inline void spRecon5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spRecon5m(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spRecon5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spRecon5m(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spRecon5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spRecon5m(hspin._internal[i][j],fspin._internal[i][j]);
-    }}
+  }}
 }

 template<class rtype,class vtype> accelerator_inline void accumRecon5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumRecon5m(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumRecon5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumRecon5m(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumRecon5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumRecon5m(hspin._internal[i][j],fspin._internal[i][j]);
@@ -1063,24 +960,18 @@ template<class rtype,class vtype,int N> accelerator_inline void accumRecon5m (iM


 // four spinor projectors for chiral proj
-//  template<class vtype> accelerator_inline void fspProj5m (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
-template<class vtype> accelerator_inline void spProj5m (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
+template<class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProj5m(hspin._internal,fspin._internal);
 }
-//  template<class vtype,int N> accelerator_inline void fspProj5m (iVector<vtype,N> &hspin,iVector<vtype,N> &fspin)
-template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProj5m (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin)
+template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProj5m(hspin._internal[i],fspin._internal[i]);
  }
 }
-//  template<class vtype,int N> accelerator_inline void fspProj5m (iMatrix<vtype,N> &hspin,iMatrix<vtype,N> &fspin)
-template<class vtype,int N> accelerator_inline void spProj5m (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin)
+template<class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProj5m(hspin._internal[i][j],fspin._internal[i][j]);
--- a/Grid/qcd/utils/LinalgUtils.h
+++ b/Grid/qcd/utils/LinalgUtils.h
@@ -154,8 +154,8 @@ void axpby_ssp_pminus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,co
  accelerator_for(sss,nloop,vobj::Nsimd(),{
    uint64_t ss = sss*Ls;
    decltype(coalescedRead(y_v[ss+sp])) tmp;
-    spProj5m(tmp,y_v(ss+sp));
-    tmp = a*x_v(ss+s)+b*tmp;
+    spProj5m(tmp,y_v(ss+sp)); 
+   tmp = a*x_v(ss+s)+b*tmp;
    coalescedWrite(z_v[ss+s],tmp);
  });
 }
@@ -188,7 +188,6 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
  z.Checkerboard() = x.Checkerboard();
  conformable(x,z);
  int Ls = grid->_rdimensions[0];
-  Gamma G5(Gamma::Algebra::Gamma5);
  autoView( x_v, x, AcceleratorRead);
  autoView( z_v, z, AcceleratorWrite);
  uint64_t nloop = grid->oSites()/Ls;
@@ -196,7 +195,13 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
    uint64_t ss = sss*Ls;
    for(int s=0;s<Ls;s++){
      int sp = Ls-1-s;
-      coalescedWrite(z_v[ss+sp],G5*x_v(ss+s));
+      auto tmp = x_v(ss+s);
+      decltype(tmp) tmp_p;
+      decltype(tmp) tmp_m;
+      spProj5p(tmp_p,tmp);
+      spProj5m(tmp_m,tmp);
+      // Use of spProj5m, 5p captures the coarse space too
+      coalescedWrite(z_v[ss+sp],tmp_p - tmp_m);
    }
  });
 }
@@ -208,10 +213,20 @@ void G5C(Lattice<vobj> &z, const Lattice<vobj> &x)
  z.Checkerboard() = x.Checkerboard();
  conformable(x, z);

-  Gamma G5(Gamma::Algebra::Gamma5);
-  z = G5 * x;
+  autoView( x_v, x, AcceleratorRead);
+  autoView( z_v, z, AcceleratorWrite);
+  uint64_t nloop = grid->oSites();
+  accelerator_for(ss,nloop,vobj::Nsimd(),{
+    auto tmp = x_v(ss);
+    decltype(tmp) tmp_p;
+    decltype(tmp) tmp_m;
+    spProj5p(tmp_p,tmp);
+    spProj5m(tmp_m,tmp);
+    coalescedWrite(z_v[ss],tmp_p - tmp_m);
+  });
 }

+/*
 template<class CComplex, int nbasis>
 void G5C(Lattice<iVector<CComplex, nbasis>> &z, const Lattice<iVector<CComplex, nbasis>> &x)
 {
@@ -234,6 +249,7 @@ void G5C(Lattice<iVector<CComplex, nbasis>> &z, const Lattice<iVector<CComplex,
    }
  });
 }
+*/

 NAMESPACE_END(Grid);

--- a/Grid/simd/Grid_gpu_vec.h
+++ b/Grid/simd/Grid_gpu_vec.h
@@ -41,11 +41,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 namespace Grid {

-#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
-typedef struct { uint16_t x;} half;
-#endif
-typedef struct Half2_t { half x; half y; } Half2;
-
 #define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH )

 template<class pair>
@@ -130,14 +125,14 @@ inline accelerator GpuVector<N,datum> operator/(const GpuVector<N,datum> l,const
 }

 constexpr int NSIMD_RealH    = COALESCE_GRANULARITY / sizeof(half);
-constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(Half2);
+constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(half2);
 constexpr int NSIMD_RealF    = COALESCE_GRANULARITY / sizeof(float);
 constexpr int NSIMD_ComplexF = COALESCE_GRANULARITY / sizeof(float2);
 constexpr int NSIMD_RealD    = COALESCE_GRANULARITY / sizeof(double);
 constexpr int NSIMD_ComplexD = COALESCE_GRANULARITY / sizeof(double2);
 constexpr int NSIMD_Integer  = COALESCE_GRANULARITY / sizeof(Integer);

-typedef GpuComplex<Half2  > GpuComplexH;
+typedef GpuComplex<half2  > GpuComplexH;
 typedef GpuComplex<float2 > GpuComplexF;
 typedef GpuComplex<double2> GpuComplexD;

@@ -152,9 +147,11 @@ typedef GpuVector<NSIMD_Integer,  Integer     > GpuVectorI;
 accelerator_inline float half2float(half h)
 {
  float f;
-#if defined(GRID_CUDA) || defined(GRID_HIP)
+#ifdef GRID_SIMT
  f = __half2float(h);
 #else 
+  //f = __half2float(h);
+  __half_raw hr(h);
  Grid_half hh; 
  hh.x = hr.x;
  f=  sfw_half_to_float(hh);
@@ -164,11 +161,13 @@ accelerator_inline float half2float(half h)
 accelerator_inline half float2half(float f)
 {
  half h;
-#if defined(GRID_CUDA) || defined(GRID_HIP)
+#ifdef GRID_SIMT
  h = __float2half(f);
 #else
  Grid_half hh = sfw_float_to_half(f);
-  h.x = hh.x;
+  __half_raw hr;  
+  hr.x = hh.x;
+  h = __half(hr);
 #endif
  return h;
 }
@@ -524,7 +523,7 @@ namespace Optimization {
    ////////////////////////////////////////////////////////////////////////////////////
    // Single / Half
    ////////////////////////////////////////////////////////////////////////////////////
-     static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) {
+    static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) {
      int N = GpuVectorCF::N;
      GpuVectorCH h;
      for(int i=0;i<N;i++) {
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -55,7 +55,6 @@ void acceleratorInit(void)
 	printf("AcceleratorCudaInit[%d]: ========================\n",rank);
 	printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name);

-
 	GPU_PROP_FMT(totalGlobalMem,"%lld");
 	GPU_PROP(managedMemory);
 	GPU_PROP(isMultiGpuBoard);
@@ -110,24 +109,20 @@ void acceleratorInit(void)
  if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);}
  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}

-  printf("world_rank %d has %d devices\n",world_rank,nDevices);
-  size_t totalDeviceMem=0;
  for (int i = 0; i < nDevices; i++) {

 #define GPU_PROP_FMT(canMapHostMemory,FMT)     printf("AcceleratorHipInit:   " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
 #define GPU_PROP(canMapHostMemory)             GPU_PROP_FMT(canMapHostMemory,"%d");
    
    hipGetDeviceProperties(&gpu_props[i], i);
-    hipDeviceProp_t prop; 
-    prop = gpu_props[i];
-    totalDeviceMem = prop.totalGlobalMem;
    if ( world_rank == 0) {
+      hipDeviceProp_t prop; 
+      prop = gpu_props[i];
      printf("AcceleratorHipInit: ========================\n");
      printf("AcceleratorHipInit: Device Number    : %d\n", i);
      printf("AcceleratorHipInit: ========================\n");
      printf("AcceleratorHipInit: Device identifier: %s\n", prop.name);

-      GPU_PROP_FMT(totalGlobalMem,"%lu");
      //      GPU_PROP(managedMemory);
      GPU_PROP(isMultiGpuBoard);
      GPU_PROP(warpSize);
@@ -136,7 +131,6 @@ void acceleratorInit(void)
      //      GPU_PROP(singleToDoublePrecisionPerfRatio);
    }
  }
-  MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
 #undef GPU_PROP_FMT    
 #undef GPU_PROP
 #ifdef GRID_IBM_SUMMIT
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -151,6 +151,9 @@ inline void *acceleratorAllocShared(size_t bytes)
    ptr = (void *) NULL;
    printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
  }
+  //  size_t free,total;
+  //  cudaMemGetInfo(&free,&total);
+  //  std::cout << "Malloc managed "<<bytes<<" "<<free<<"/"<<total<<std::endl;
  return ptr;
 };
 inline void *acceleratorAllocDevice(size_t bytes)
@@ -161,6 +164,9 @@ inline void *acceleratorAllocDevice(size_t bytes)
    ptr = (void *) NULL;
    printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
  }
+  //  size_t free,total;
+  //  cudaMemGetInfo(&free,&total);
+  //  std::cout << "Malloc device "<<bytes<<" "<<free<<"/"<<total<<std::endl;
  return ptr;
 };
 inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
@@ -307,13 +313,17 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)

 inline void *acceleratorAllocShared(size_t bytes)
 {
+#if 0
  void *ptr=NULL;
  auto err = hipMallocManaged((void **)&ptr,bytes);
  if( err != hipSuccess ) {
    ptr = (void *) NULL;
-    printf(" hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err));
+    printf(" hipMallocManaged failed for %d %s \n",bytes,hipGetErrorString(err));
  }
  return ptr;
+#else
+  return malloc(bytes);
+#endif
 };
 inline int  acceleratorIsCommunicable(void *ptr){ return 1; }

@@ -323,7 +333,7 @@ inline void *acceleratorAllocDevice(size_t bytes)
  auto err = hipMalloc((void **)&ptr,bytes);
  if( err != hipSuccess ) {
    ptr = (void *) NULL;
-    printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err));
+    printf(" hipMalloc failed for %d %s \n",bytes,hipGetErrorString(err));
  }
  return ptr;
 };
--- a/benchmarks/Benchmark_IO.cc
+++ b/benchmarks/Benchmark_IO.cc
@@ -14,62 +14,35 @@ std::string filestem(const int l)

 int main (int argc, char ** argv)
 {
+#ifdef HAVE_LIME
  Grid_init(&argc,&argv);

-  int64_t          threads = GridThread::GetThreads();
-  auto             mpi     = GridDefaultMpi();
-  std::vector<int> latt;
-
+  int64_t threads = GridThread::GetThreads();
  MSG << "Grid is setup to use " << threads << " threads" << std::endl;
-  MSG << "MPI partition " << mpi << std::endl;
-
  MSG << SEP << std::endl;
-  MSG << "Benchmark std write" << std::endl;
+  MSG << "Benchmark Lime write" << std::endl;
  MSG << SEP << std::endl;
  for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
  {
-    latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
+    auto             mpi  = GridDefaultMpi();
+    std::vector<int> latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};

-    MSG << "-- Local volume " << l << "^4" << std::endl;
-    writeBenchmark<LatticeFermion>(latt, filestem(l), stdWrite<LatticeFermion>);
-  }
-
-  MSG << SEP << std::endl;
-  MSG << "Benchmark std read" << std::endl;
-  MSG << SEP << std::endl;
-  for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
-  {
-    latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
-
-    MSG << "-- Local volume " << l << "^4" << std::endl;
-    readBenchmark<LatticeFermion>(latt, filestem(l), stdRead<LatticeFermion>);
-  }
-
-#ifdef HAVE_LIME
-  MSG << SEP << std::endl;
-  MSG << "Benchmark Grid C-Lime write" << std::endl;
-  MSG << SEP << std::endl;
-  for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
-  {
-    latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
-
-    MSG << "-- Local volume " << l << "^4" << std::endl;
+    std::cout << "-- Local volume " << l << "^4" << std::endl;
    writeBenchmark<LatticeFermion>(latt, filestem(l), limeWrite<LatticeFermion>);
  }

-  MSG << SEP << std::endl;
-  MSG << "Benchmark Grid C-Lime read" << std::endl;
+  MSG << "Benchmark Lime read" << std::endl;
  MSG << SEP << std::endl;
  for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
  {
-    latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
+    auto             mpi  = GridDefaultMpi();
+    std::vector<int> latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};

-    MSG << "-- Local volume " << l << "^4" << std::endl;
+    std::cout << "-- Local volume " << l << "^4" << std::endl;
    readBenchmark<LatticeFermion>(latt, filestem(l), limeRead<LatticeFermion>);
  }
-#endif

  Grid_finalize();
-
+#endif
  return EXIT_SUCCESS;
 }
--- a/benchmarks/Benchmark_IO.hpp
+++ b/benchmarks/Benchmark_IO.hpp
@@ -14,140 +14,13 @@ using WriterFn = std::function<void(const std::string, Field &)> ;
 template <typename Field>
 using ReaderFn = std::function<void(Field &, const std::string)>;

-// AP 06/10/2020: Standard C version in case one is suspicious of the C++ API
-// 
-// template <typename Field>
-// void stdWrite(const std::string filestem, Field &vec)
-// {
-//   std::string   rankStr = std::to_string(vec.Grid()->ThisRank());
-//   std::FILE     *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "wb");
-//   size_t        size;
-//   uint32_t      crc;
-//   GridStopWatch ioWatch, crcWatch;
-
-//   size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
-//   autoView(vec_v, vec, CpuRead);
-//   crcWatch.Start();
-//   crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
-//   std::fwrite(&crc, sizeof(uint32_t), 1, file);
-//   crcWatch.Stop();
-//   MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
-//   ioWatch.Start();
-//   std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file);
-//   ioWatch.Stop();
-//   std::fclose(file);
-//   size *= vec.Grid()->ProcessorCount();
-//   MSG << "Std I/O write: Wrote " << size << " bytes in " << ioWatch.Elapsed() 
-//       << ", performance " << size/1024./1024./(ioWatch.useconds()/1.e6) 
-//       << " MB/s" << std::endl;
-//   MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
-// }
-//
-// template <typename Field>
-// void stdRead(Field &vec, const std::string filestem)
-// {
-//   std::string   rankStr = std::to_string(vec.Grid()->ThisRank());
-//   std::FILE     *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "rb");
-//   size_t        size;
-//   uint32_t      crcRead, crcData;
-//   GridStopWatch ioWatch, crcWatch;
-
-//   size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
-//   crcWatch.Start();
-//   std::fread(&crcRead, sizeof(uint32_t), 1, file);
-//   crcWatch.Stop();
-//   {
-//     autoView(vec_v, vec, CpuWrite);
-//     ioWatch.Start();
-//     std::fread(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file);
-//     ioWatch.Stop();
-//     std::fclose(file);
-//   }
-//   {
-//     autoView(vec_v, vec, CpuRead);
-//     crcWatch.Start();
-//     crcData = GridChecksum::crc32(vec_v.cpu_ptr, size);
-//     crcWatch.Stop();
-//   }
-//   MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
-//   assert(crcData == crcRead);
-//   size *= vec.Grid()->ProcessorCount();
-//   MSG << "Std I/O read: Read " << size << " bytes in " << ioWatch.Elapsed() 
-//       << ", performance " << size/1024./1024./(ioWatch.useconds()/1.e6) 
-//       << " MB/s" << std::endl;
-//   MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
-// }
-
-template <typename Field>
-void stdWrite(const std::string filestem, Field &vec)
-{
-  std::string   rankStr = std::to_string(vec.Grid()->ThisRank());
-  std::ofstream file(filestem + "." + rankStr + ".bin", std::ios::out | std::ios::binary);
-  size_t        size, sizec;
-  uint32_t      crc;
-  GridStopWatch ioWatch, crcWatch;
-
-  size  = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
-  sizec = size/sizeof(char); // just in case of...
-  autoView(vec_v, vec, CpuRead);
-  crcWatch.Start();
-  crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
-  file.write(reinterpret_cast<char *>(&crc), sizeof(uint32_t)/sizeof(char));
-  crcWatch.Stop();
-  MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
-  ioWatch.Start();
-  file.write(reinterpret_cast<char *>(vec_v.cpu_ptr), sizec);
-  file.flush();
-  ioWatch.Stop();
-  size *= vec.Grid()->ProcessorCount();
-  MSG << "Std I/O write: Wrote " << size << " bytes in " << ioWatch.Elapsed() 
-      << ", " << size/1024./1024./(ioWatch.useconds()/1.e6) 
-      << " MB/s" << std::endl;
-  MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
-}
-
-template <typename Field>
-void stdRead(Field &vec, const std::string filestem)
-{
-  std::string   rankStr = std::to_string(vec.Grid()->ThisRank());
-  std::ifstream file(filestem + "." + rankStr + ".bin", std::ios::in | std::ios::binary);
-  size_t        size, sizec;
-  uint32_t      crcRead, crcData;
-  GridStopWatch ioWatch, crcWatch;
-
-  size  = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
-  sizec = size/sizeof(char); // just in case of...
-  crcWatch.Start();
-  file.read(reinterpret_cast<char *>(&crcRead), sizeof(uint32_t)/sizeof(char));
-  crcWatch.Stop();
-  {
-    autoView(vec_v, vec, CpuWrite);
-    ioWatch.Start();
-    file.read(reinterpret_cast<char *>(vec_v.cpu_ptr), sizec);
-    ioWatch.Stop();
-  }
-  {
-    autoView(vec_v, vec, CpuRead);
-    crcWatch.Start();
-    crcData = GridChecksum::crc32(vec_v.cpu_ptr, size);
-    crcWatch.Stop();
-  }
-  MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl;
-  assert(crcData == crcRead);
-  size *= vec.Grid()->ProcessorCount();
-  MSG << "Std I/O read: Read " << size << " bytes in " << ioWatch.Elapsed() 
-      << ", " << size/1024./1024./(ioWatch.useconds()/1.e6) 
-      << " MB/s" << std::endl;
-  MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
-}
-
 template <typename Field>
 void limeWrite(const std::string filestem, Field &vec)
 {
  emptyUserRecord   record;
  ScidacWriter binWriter(vec.Grid()->IsBoss());

-  binWriter.open(filestem + ".lime.bin");
+  binWriter.open(filestem + ".bin");
  binWriter.writeScidacFieldRecord(vec, record);
  binWriter.close();
 }
@@ -158,7 +31,7 @@ void limeRead(Field &vec, const std::string filestem)
  emptyUserRecord   record;
  ScidacReader binReader;

-  binReader.open(filestem + ".lime.bin");
+  binReader.open(filestem + ".bin");
  binReader.readScidacFieldRecord(vec, record);
  binReader.close();
 }
--- a/benchmarks/Benchmark_IO_vs_dir.cc
+++ b/benchmarks/Benchmark_IO_vs_dir.cc
@@ -8,6 +8,7 @@ using namespace Grid;

 int main (int argc, char ** argv)
 {
+#ifdef HAVE_LIME
  std::vector<std::string> dir;
  unsigned int             Ls;
  bool                     rb;
@@ -33,71 +34,46 @@ int main (int argc, char ** argv)
  }
  Grid_init(&argc,&argv);

+
  int64_t threads = GridThread::GetThreads();
-  auto    mpi     = GridDefaultMpi();
-
  MSG << "Grid is setup to use " << threads << " threads" << std::endl;
-  MSG << "MPI partition " << mpi << std::endl;
-
  MSG << SEP << std::endl;
-  MSG << "Benchmark Grid std write" << std::endl;
+  MSG << "Benchmark double precision Lime write" << std::endl;
  MSG << SEP << std::endl;
  for (auto &d: dir)
  {
    MSG << "-- Directory " << d << std::endl;
-    writeBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench", 
-                                   stdWrite<LatticeFermion>, Ls, rb);
+    writeBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench", limeWrite<LatticeFermion>, Ls, rb);
  }
+
  MSG << SEP << std::endl;
-  MSG << "Benchmark Grid std read" << std::endl;
+  MSG << "Benchmark double precision Lime read" << std::endl;
  MSG << SEP << std::endl;
  for (auto &d: dir)
  {
    MSG << "-- Directory " << d << std::endl;
-    readBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench", 
-                                  stdRead<LatticeFermion>, Ls, rb);
+    readBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench", limeRead<LatticeFermion>, Ls, rb);
  }

-#ifdef HAVE_LIME
  MSG << SEP << std::endl;
-  MSG << "Benchmark Grid C-Lime write" << std::endl;
+  MSG << "Benchmark single precision Lime write" << std::endl;
  MSG << SEP << std::endl;
  for (auto &d: dir)
  {
    MSG << "-- Directory " << d << std::endl;
-    writeBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench", 
-                                   limeWrite<LatticeFermion>, Ls, rb);
+    writeBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeWrite<LatticeFermionF>, Ls, rb);
  }
+
  MSG << SEP << std::endl;
-  MSG << "Benchmark Grid C-Lime read" << std::endl;
+  MSG << "Benchmark single precision Lime read" << std::endl;
  MSG << SEP << std::endl;
  for (auto &d: dir)
  {
    MSG << "-- Directory " << d << std::endl;
-    readBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench", 
-                                  limeRead<LatticeFermion>, Ls, rb);
+    readBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeRead<LatticeFermionF>, Ls, rb);
  }
-#endif
-
-  // MSG << SEP << std::endl;
-  // MSG << "Benchmark single precision Lime write" << std::endl;
-  // MSG << SEP << std::endl;
-  // for (auto &d: dir)
-  // {
-  //   MSG << "-- Directory " << d << std::endl;
-  //   writeBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeWrite<LatticeFermionF>, Ls, rb);
-  // }
-
-  // MSG << SEP << std::endl;
-  // MSG << "Benchmark single precision Lime read" << std::endl;
-  // MSG << SEP << std::endl;
-  // for (auto &d: dir)
-  // {
-  //   MSG << "-- Directory " << d << std::endl;
-  //   readBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeRead<LatticeFermionF>, Ls, rb);
-  // }

  Grid_finalize();
-
+#endif
  return EXIT_SUCCESS;
 }
--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@@ -36,12 +36,12 @@ int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);

-#define LMAX (40)
+#define LMAX (48)
 #define LMIN (8)
 #define LADD (8)

-  int64_t Nwarm=10;
-  int64_t Nloop=100;
+  int64_t Nwarm=50;
+  int64_t Nloop=500;

  Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  Coordinate mpi_layout  = GridDefaultMpi();
@@ -118,41 +118,6 @@ int main (int argc, char ** argv)

    }

-
-  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  z=z+ x*y"<<std::endl;
-  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
-  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-
-  for(int lat=LMIN;lat<=LMAX;lat+=LADD){
-
-      Coordinate latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-      int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
-
-      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-
-      LatticeColourMatrix z(&Grid); random(pRNG,z);
-      LatticeColourMatrix x(&Grid); random(pRNG,x);
-      LatticeColourMatrix y(&Grid); random(pRNG,y);
-
-      for(int64_t i=0;i<Nwarm;i++){
-	z=z+x*y;
-      }
-      double start=usecond();
-      for(int64_t i=0;i<Nloop;i++){
-	z=z+x*y;
-      }
-      double stop=usecond();
-      double time = (stop-start)/Nloop*1000.0;
-      
-      double bytes=4*vol*Nc*Nc*sizeof(Complex);
-      double flops=Nc*Nc*(6+8+8)*vol;
-      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
-
-    }
-
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  mult(z,x,y)"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
@@ -178,6 +143,7 @@ int main (int argc, char ** argv)
      double start=usecond();
      for(int64_t i=0;i<Nloop;i++){
 	mult(z,x,y);
+	//	mac(z,x,y);
      }
      double stop=usecond();
      double time = (stop-start)/Nloop*1000.0;
@@ -225,7 +191,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  CovShiftForward(z,x,y)"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GB/s (incl Cshift)\t\t GFlop/s"<<std::endl;
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

  for(int lat=LMIN;lat<=LMAX;lat+=LADD){
@@ -250,15 +216,16 @@ int main (int argc, char ** argv)
 	
 	
 	    double bytes=3*vol*Nc*Nc*sizeof(Complex);
+	    double ncbytes=5*vol*Nc*Nc*sizeof(Complex);
 	    double flops=Nc*Nc*(6+8+8)*vol;
-	    std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
+	    std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<ncbytes/time<<"\t\t" << flops/time<<std::endl;
      }
  }
 #if 1
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  z= x * Cshift(y)"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GB/s (incl Cshift)\t\t GFlop/s"<<std::endl;
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

  for(int lat=LMIN;lat<=LMAX;lat+=LADD){
@@ -292,10 +259,11 @@ int main (int argc, char ** argv)
 	tmult  = tmult /Nloop;
 	
 	double bytes=3*vol*Nc*Nc*sizeof(Complex);
+	double ncbytes=5*vol*Nc*Nc*sizeof(Complex);
 	double flops=Nc*Nc*(6+8+8)*vol;
 	std::cout<<GridLogMessage<<std::setprecision(3) << "total us "<<time<<" shift "<<tshift <<" mult "<<tmult<<std::endl;
 	time = time * 1000; // convert to NS for GB/s
-	std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
+	std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" <<ncbytes/time<<"\t\t" << flops/time<<std::endl;
      }
    }
 #endif
--- a/benchmarks/Benchmark_su3_gpu.cc
+++ b/benchmarks/Benchmark_su3_gpu.cc
@@ -187,8 +187,7 @@ int main (int argc, char ** argv)
 	  auto xx = coalescedRead(x_v[ss]);
 	  auto yy = coalescedRead(y_v[ss]);
 	  auto zz = coalescedRead(z_v[ss]);
-	  //zz = zz+xx*yy;
-	  mac(&zz,&xx,&yy);
+	  zz = zz+xx*yy;
 	  coalescedWrite(z_v[ss],zz);
        });
      }
--- a/benchmarks/benchmark-io-csv.sh
+++ b/benchmarks/benchmark-io-csv.sh
@@ -1,76 +0,0 @@
-#!/usr/bin/env bash
-
-awkscript='
-BEGIN{
-  i = 0;
-  print "local L,std read (MB/s),std write (MB/s),Grid Lime read (MB/s),Grid Lime write (MB/s)"
-}
-
-/Benchmark std write/{
-  i    = 0; 
-  mode = "stdWrite";
-} 
-
-/Benchmark std read/{
-  i    = 0; 
-  mode = "stdRead"
-} 
-
-/Benchmark Grid C-Lime write/{
-  i    = 0; 
-  mode = "gridWrite";
-} 
-
-/Benchmark Grid C-Lime read/{
-  i    = 0; 
-  mode = "gridRead";
-}
-
-/Local volume/{
-  match($0, "[0-9]+\\^4");
-  l[i] = substr($0, RSTART, RLENGTH-2);
-}
-
-/MB\/s/{
-  match($0, "[0-9.eE]+ MB/s");
-  p = substr($0, RSTART, RLENGTH-5);
-  if (mode == "stdWrite")
-  {
-    sw[i] = p;
-  }
-  else if (mode == "stdRead")
-  {
-    sr[i] = p;
-  }
-  else if (mode == "gridWrite")
-  {
-    gw[i] = p;
-  }
-  else if (mode == "gridRead")
-  {
-    gr[i] = p;
-  }
-  i++;
-}
-
-END{
-  s = 0
-  for (a in l)
-  {
-    s++;
-  }
-  for (j = 0; j < s; j++)
-  {
-    printf("%s,%s,%s,%s,%s\n", l[j], sr[j], sw[j], gr[j], gw[j]);
-  }
-  printf("\n");
-}
-'
-
-if (( $# != 1 )); then
-    echo "usage: `basename $0` <log file>" 1>&2
-    exit 1
-fi
-LOG=$1
-
-awk "${awkscript}" ${LOG} 
--- a/configure.ac
+++ b/configure.ac
@@ -330,18 +330,12 @@ case ${CXXTEST} in
    fi
    ;;
  hipcc)
-#    CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
-    CXXFLAGS="$CXXFLAGS -fno-strict-aliasing"
+    CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
    CXXLD=${CXX}
    if test $ac_openmp = yes; then
       CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp"
    fi
    ;;
-  dpcpp)
-    LDFLAGS="$LDFLAGS"
-    CXXFLAGS="$CXXFLAGS"
-    CXXLD=${CXX}
-    ;;
  *)
    CXXLD=${CXX}
    CXXFLAGS="$CXXFLAGS -fno-strict-aliasing"
--- a/documentation/GridXcode/readme.md
+++ b/documentation/GridXcode/readme.md
@@ -184,19 +184,19 @@ Below are shown the `configure` script invocations for three recommended configu

 This is the build for every day developing and debugging with Xcode. It uses the Xcode clang c++ compiler, without MPI, and defaults to double-precision. Xcode builds the `Debug` configuration with debug symbols for full debugging:

-    ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --enable-precision=double --prefix=$GridPre/Debug
+    ../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=double --prefix=$GridPre/GridDebug --enable-comms=none

 #### 2. `Release`

 Since Grid itself doesn't really have debug configurations, the release build is recommended to be the same as `Debug`, except using single-precision (handy for validation):

-    ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --enable-precision=single --prefix=$GridPre/Release
+    ../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=single --prefix=$GridPre/GridRelease --enable-comms=none

 #### 3. `MPIDebug`

 Debug configuration with MPI:

-    ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx --enable-precision=double --prefix=$GridPre/MPIDebug
+    ../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=double --prefix=$GridPre/GridMPIDebug --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx

 ### 5.3 Build Grid

--- a/tests/core/Test_fft.cc
+++ b/tests/core/Test_fft.cc
@@ -29,7 +29,91 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/Grid.h>

 using namespace Grid;
- ;
+
+void MomentumSpacePropagatorTest(RealD mass,RealD M5, LatticePropagator &prop)
+{
+  // what type LatticeComplex 
+  GridBase *_grid = prop.Grid();
+  
+  typedef LatticeFermion FermionField;
+  typedef LatticePropagator PropagatorField;
+  typedef typename FermionField::vector_type vector_type;
+  typedef typename FermionField::scalar_type ScalComplex;
+  typedef iSinglet<ScalComplex> Tcomplex;
+  typedef Lattice<iSinglet<vector_type> > LatComplex;
+  
+  Gamma::Algebra Gmu [] = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ,
+    Gamma::Algebra::GammaT
+  };
+
+  Coordinate latt_size   = _grid->_fdimensions;
+  
+  PropagatorField   num  (_grid); num  = Zero();
+
+  LatComplex    sk(_grid);  sk = Zero();
+  LatComplex    sk2(_grid); sk2= Zero();
+  LatComplex    W(_grid); W= Zero();
+  LatComplex    a(_grid); a= Zero();
+  LatComplex    one  (_grid); one = ScalComplex(1.0,0.0);
+  LatComplex denom(_grid); denom= Zero();
+  LatComplex cosha(_grid); 
+  LatComplex kmu(_grid); 
+  LatComplex Wea(_grid); 
+  LatComplex Wema(_grid); 
+
+  ScalComplex ci(0.0,1.0);
+  SpinColourMatrixD identity = ComplexD(1.0);
+
+  for(int mu=0;mu<Nd;mu++) {
+    
+    LatticeCoordinate(kmu,mu);
+    
+    RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
+    
+    kmu = TwoPiL * kmu;
+    //    kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions
+    
+    sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5);
+    sk  = sk  +     sin(kmu)    *sin(kmu); 
+    
+    num = num - sin(kmu)*ci*(Gamma(Gmu[mu])*identity);
+    
+  }
+  
+  W = one - M5 + sk2;
+
+  ////////////////////////////////////////////
+  // Cosh alpha -> alpha
+  ////////////////////////////////////////////
+  cosha =  (one + W*W + sk) / (abs(W)*2.0);
+
+  // FIXME Need a Lattice acosh
+  {
+  autoView(cosha_v,cosha,CpuRead);
+  autoView(a_v,a,CpuWrite);
+  for(int idx=0;idx<_grid->lSites();idx++){
+    Coordinate lcoor(Nd);
+    Tcomplex cc;
+    //    RealD sgn;
+    _grid->LocalIndexToLocalCoor(idx,lcoor);
+    peekLocalSite(cc,cosha_v,lcoor);
+    assert((double)real(cc)>=1.0);
+    assert(fabs((double)imag(cc))<=1.0e-15);
+    cc = ScalComplex(::acosh(real(cc)),0.0);
+    pokeLocalSite(cc,a_v,lcoor);
+  }}
+  
+  Wea = ( exp( a) * abs(W)  );
+  Wema= ( exp(-a) * abs(W)  );
+  
+  num   = num + ( one - Wema ) * mass * identity;
+  denom= ( Wea - one ) + mass*mass * (one - Wema); 
+  prop = num/denom;
+}
+

 int main (int argc, char ** argv)
 {
@@ -307,6 +391,17 @@ int main (int argc, char ** argv)
    RealD M5  =0.8;
    DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5);

+    /////////////////// Test code for (1-m)^2 ///////////////
+    LatticePropagatorD prop1(&GRID);
+    LatticePropagatorD prop2(&GRID);
+    LatticeComplexD ratio(&GRID);
+    MomentumSpacePropagatorTest(0.0,M5,prop1);
+    MomentumSpacePropagatorTest(0.3,M5,prop2);
+    ratio=localNorm2(prop2);
+    ratio=ratio/localNorm2(prop1);
+    std::cout << ratio;
+    /////////////////// Test code for (1-m)^2 factor ///////////////
+
    // Momentum space prop
    std::cout << " Solving by FFT and Feynman rules" <<std::endl;
    bool fiveD = false; //calculate 4d free propagator
--- a/tests/solver/Test_dwf_multigrid.cc
+++ b/tests/solver/Test_dwf_multigrid.cc
@@ -565,8 +565,8 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << " Running Multigrid                                "<< std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;

-  BiCGSTAB<CoarseVector>    CoarseMgridBiCGSTAB(0.01,1000);     
-  BiCGSTAB<LatticeFermion>  FineMgridBiCGSTAB(0.0,24);
+  BiCGSTAB<CoarseVector>    CoarseMgridBiCGSTAB(0.01,1000,false);     
+  BiCGSTAB<LatticeFermion>  FineMgridBiCGSTAB(0.0,24,false);
  ZeroGuesser<CoarseVector> CoarseZeroGuesser;
  ZeroGuesser<LatticeFermion> FineZeroGuesser;

@@ -590,5 +590,5 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "Done "<< std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  Grid_finalize();
-  
+
 }
--- a/tests/solver/Test_hw_multigrid.cc
+++ b/tests/solver/Test_hw_multigrid.cc
Author	SHA1	Message	Date
Peter Boyle	3064c9a6e2	Improve the matching of stencil coarsening	2020-09-08 15:36:58 -04:00
Peter Boyle	729882827c	Improve the coarse matrix calc	2020-09-08 15:36:33 -04:00
Peter Boyle	baa668d3ac	Merge branch 'develop' into feature/hw-multigrid Conflicts: Grid/allocator/MemoryManager.h	2020-09-03 22:16:50 -04:00
Peter Boyle	3c82d16ed8	4D multigrid	2020-09-03 22:11:17 -04:00
Peter Boyle	5c8c0c2d7c	BiCG	2020-09-03 22:10:40 -04:00
Peter Boyle	e5a100846c	Useful tthing to preserve	2020-09-03 22:09:57 -04:00
Peter Boyle	a74e2dc12e	Printing mem info	2020-09-03 22:08:44 -04:00
Peter Boyle	595f512a6e	G5 for coarse space too	2020-09-03 22:07:40 -04:00
Peter Boyle	a6499b22ff	Stats printing	2020-09-03 22:00:46 -04:00
Peter Boyle	b4e42a59c6	Stats traacking improvement	2020-09-03 22:00:14 -04:00
Peter Boyle	8c913e0edd	Clearer UVM ttreatment	2020-09-03 21:59:05 -04:00
Peter Boyle	fd3f93d8d3	Zero changes	2020-09-03 21:57:11 -04:00
Peter Boyle	e9543cdacd	Time deflation	2020-09-03 21:56:02 -04:00
Peter Boyle	98f7b3d298	Pcg	2020-09-03 21:55:05 -04:00
Peter Boyle	b7b164ea24	Test operator and deebug code	2020-09-03 21:54:20 -04:00
Peter Boyle	77124d99d5	Merge branch 'develop' into feature/hw-multigrid	2020-09-03 21:52:04 -04:00
Peter Boyle	e1327e7ea0	Optional bounds check debug code	2020-07-16 16:57:46 -04:00
Peter Boyle	569f78c2cf	Stenccil improvement	2020-07-16 16:57:13 -04:00
Peter Boyle	488c79d5a1	Bound improvement	2020-07-15 19:58:08 -04:00
Peter Boyle	dc6b0f20b2	Fixed array bounds	2020-07-02 12:20:20 -04:00
Peter Boyle	c0badc3e16	Summit bounce back to git	2020-07-02 10:48:39 -04:00
Peter Boyle	58f6529b55	Slowly piecing together	2020-06-30 16:42:03 -04:00
Peter Boyle	e3f056dfbb	Hw multigrid operator	2020-06-30 16:10:16 -04:00
Peter Boyle	da0ffa7a79	Two spin update defer commit to repository	2020-06-30 16:09:48 -04:00
Peter Boyle	fcc7640b9c	Detect a coarsened vector	2020-06-30 14:17:45 -04:00
Peter Boyle	0cbe2859e0	Making progress on Hw based 5d coarse matrix	2020-06-30 14:17:20 -04:00