Merge pull request #23 from paboyle/develop

Sync
2025-09-19 09:41:05 +01:00 · 2021-03-01 12:33:51 +01:00
parent 4705aa541d 2e61556389
commit 9c9566b9c9
51 changed files with 6081 additions and 1091 deletions
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -775,7 +775,26 @@ public:
    for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose();
  }
  
- CoarsenedMatrix(GridCartesian &CoarseGrid, GridRedBlackCartesian &CoarseRBGrid, int hermitian_=0) 	:
+  CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) 	:
+    _grid(&CoarseGrid),
+    _cbgrid(new GridRedBlackCartesian(&CoarseGrid)),
+    geom(CoarseGrid._ndimension),
+    hermitian(hermitian_),
+    Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
+    StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements,0),
+    StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
+    A(geom.npoint,&CoarseGrid),
+    Aeven(geom.npoint,_cbgrid),
+    Aodd(geom.npoint,_cbgrid),
+    AselfInv(&CoarseGrid),
+    AselfInvEven(_cbgrid),
+    AselfInvOdd(_cbgrid),
+    dag_factor(nbasis*nbasis)
+  {
+    fillFactor();
+  };
+
+  CoarsenedMatrix(GridCartesian &CoarseGrid, GridRedBlackCartesian &CoarseRBGrid, int hermitian_=0) 	:

    _grid(&CoarseGrid),
    _cbgrid(&CoarseRBGrid),
@@ -817,6 +836,8 @@ public:
    typedef Lattice<typename Fobj::tensor_reduced> FineComplexField;
    typedef typename Fobj::scalar_type scalar_type;

+    std::cout << GridLogMessage<< "CoarsenMatrix "<< std::endl;
+
    FineComplexField one(FineGrid); one=scalar_type(1.0,0.0);
    FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0);

@@ -847,11 +868,13 @@ public:

    CoarseScalar InnerProd(Grid()); 

+    std::cout << GridLogMessage<< "CoarsenMatrix Orthog "<< std::endl;
    // Orthogonalise the subblocks over the basis
    blockOrthogonalise(InnerProd,Subspace.subspace);

    // Compute the matrix elements of linop between this orthonormal
    // set of vectors.
+    std::cout << GridLogMessage<< "CoarsenMatrix masks "<< std::endl;
    int self_stencil=-1;
    for(int p=0;p<geom.npoint;p++)
    { 
@@ -890,7 +913,7 @@ public:

      phi=Subspace.subspace[i];

-      //      std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl;
+      std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl;
      linop.OpDirAll(phi,Mphi_p);
      linop.OpDiag  (phi,Mphi_p[geom.npoint-1]);

@@ -919,6 +942,18 @@ public:
 	    autoView( A_self  , A[self_stencil], AcceleratorWrite);

 	    accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
+	    if ( hermitian && (disp==-1) ) {
+	      for(int pp=0;pp<geom.npoint;pp++){// Find the opposite link and set <j|A|i> = <i|A|j>*
+		int dirp   = geom.directions[pp];
+		int dispp  = geom.displacements[pp];
+		if ( (dirp==dir) && (dispp==1) ){
+		  auto sft = conjugate(Cshift(oZProj,dir,1));
+		  autoView( sft_v    ,  sft  , AcceleratorWrite);
+		  autoView( A_pp     ,  A[pp], AcceleratorWrite);
+		  accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_pp[ss](i,j),sft_v(ss)); });
+		}
+	      }
+	    }

 	  }
 	}
@@ -957,33 +992,12 @@ public:
    }
    if(hermitian) {
      std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
-      ForceHermitian();
    }

    InvertSelfStencilLink(); std::cout << GridLogMessage << "Coarse self link inverted" << std::endl;
    FillHalfCbs(); std::cout << GridLogMessage << "Coarse half checkerboards filled" << std::endl;
  }

-  void ForceHermitian(void) {
-    CoarseMatrix Diff  (Grid());
-    for(int p=0;p<geom.npoint;p++){
-      int dir   = geom.directions[p];
-      int disp  = geom.displacements[p];
-      if(disp==-1) {
-	// Find the opposite link
-	for(int pp=0;pp<geom.npoint;pp++){
-	  int dirp   = geom.directions[pp];
-	  int dispp  = geom.displacements[pp];
-	  if ( (dirp==dir) && (dispp==1) ){
-	    //	    Diff = adj(Cshift(A[p],dir,1)) - A[pp]; 
-	    //	    std::cout << GridLogMessage<<" Replacing stencil leg "<<pp<<" with leg "<<p<< " diff "<<norm2(Diff) <<std::endl;
-	    A[pp] = adj(Cshift(A[p],dir,1));
-	  }
-	}
-      }
-    }
-  }
-
  void InvertSelfStencilLink() {
    std::cout << GridLogDebug << "CoarsenedMatrix::InvertSelfStencilLink" << std::endl;
    int localVolume = Grid()->lSites();
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@@ -1,4 +1,3 @@
-
 /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 
@@ -108,6 +107,8 @@ public:
  ////////////////////////////////////////////////////////////
  // Reduction
  ////////////////////////////////////////////////////////////
+  void GlobalMax(RealD &);
+  void GlobalMax(RealF &);
  void GlobalSum(RealF &);
  void GlobalSumVector(RealF *,int N);
  void GlobalSum(RealD &);
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -275,6 +275,16 @@ void CartesianCommunicator::GlobalXOR(uint64_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
  assert(ierr==0);
 }
+void CartesianCommunicator::GlobalMax(float &f)
+{
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
+  assert(ierr==0);
+}
+void CartesianCommunicator::GlobalMax(double &d)
+{
+  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
+  assert(ierr==0);
+}
 void CartesianCommunicator::GlobalSum(float &f){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@@ -67,6 +67,8 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)

 CartesianCommunicator::~CartesianCommunicator(){}

+void CartesianCommunicator::GlobalMax(float &){}
+void CartesianCommunicator::GlobalMax(double &){}
 void CartesianCommunicator::GlobalSum(float &){}
 void CartesianCommunicator::GlobalSumVector(float *,int N){}
 void CartesianCommunicator::GlobalSum(double &){}
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -96,8 +96,34 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
  ssobj ret = ssum;
  return ret;
 }
+/*
+Threaded max, don't use for now
+template<class Double>
+inline Double max(const Double *arg, Integer osites)
+{
+  //  const int Nsimd = vobj::Nsimd();
+  const int nthread = GridThread::GetThreads();

-
+  std::vector<Double> maxarray(nthread);
+  
+  thread_for(thr,nthread, {
+    int nwork, mywork, myoff;
+    nwork = osites;
+    GridThread::GetWork(nwork,thr,mywork,myoff);
+    Double max=arg[0];
+    for(int ss=myoff;ss<mywork+myoff; ss++){
+      if( arg[ss] > max ) max = arg[ss];
+    }
+    maxarray[thr]=max;
+  });
+  
+  Double tmax=maxarray[0];
+  for(int i=0;i<nthread;i++){
+    if (maxarray[i]>tmax) tmax = maxarray[i];
+  } 
+  return tmax;
+}
+*/
 template<class vobj>
 inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 {
@@ -141,6 +167,32 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
  return real(nrm); 
 }

+//The global maximum of the site norm2
+template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
+{
+  typedef typename vobj::tensor_reduced vscalar;  //iScalar<iScalar<.... <vPODtype> > >
+  typedef typename vscalar::scalar_object  scalar;   //iScalar<iScalar<.... <PODtype> > >
+
+  Lattice<vscalar> inner = localNorm2(arg);
+
+  auto grid = arg.Grid();
+
+  RealD max;
+  for(int l=0;l<grid->lSites();l++){
+    Coordinate coor;
+    scalar val;
+    RealD r;
+    grid->LocalIndexToLocalCoor(l,coor);
+    peekLocalSite(val,inner,coor);
+    r=real(TensorRemove(val));
+    if( (l==0) || (r>max)){
+      max=r;
+    }
+  }
+  grid->GlobalMax(max);
+  return max;
+}
+
 // Double inner product
 template<class vobj>
 inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
--- a/Grid/lattice/Lattice_view.h
+++ b/Grid/lattice/Lattice_view.h
@@ -67,9 +67,14 @@ public:
  accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
 #endif

+#if 1
+  //  accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
+  accelerator_inline vobj       & operator[](size_t i) const { return this->_odata[i]; };
+#else
  accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
  accelerator_inline vobj       & operator[](size_t i)       { return this->_odata[i]; };
-
+#endif
+  
  accelerator_inline uint64_t begin(void) const { return 0;};
  accelerator_inline uint64_t end(void)   const { return this->_odata_size; };
  accelerator_inline uint64_t size(void)  const { return this->_odata_size; };
--- a/Grid/parallelIO/IldgIO.h
+++ b/Grid/parallelIO/IldgIO.h
@@ -123,7 +123,7 @@ assert(GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) < 1.0e-5);
 ////////////////////////////////////////////////////////////
 // Helper to fill out metadata
 ////////////////////////////////////////////////////////////
- template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
+template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
 					  FieldMetaData &header,
 					  scidacRecord & _scidacRecord,
 					  scidacFile   & _scidacFile) 
@@ -619,12 +619,12 @@ class IldgWriter : public ScidacWriter {
  // Don't require scidac records EXCEPT checksum
  // Use Grid MetaData object if present.
  ////////////////////////////////////////////////////////////////
-  template <class vsimd>
-  void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description) 
+  template <class stats = PeriodicGaugeStatistics>
+  void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,int sequence,std::string LFN,std::string description) 
  {
    GridBase * grid = Umu.Grid();
-    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
-    typedef iLorentzColourMatrix<vsimd> vobj;
+    typedef Lattice<vLorentzColourMatrixD> GaugeField;
+    typedef vLorentzColourMatrixD vobj;
    typedef typename vobj::scalar_object sobj;

    ////////////////////////////////////////
@@ -636,6 +636,9 @@ class IldgWriter : public ScidacWriter {

    ScidacMetaData(Umu,header,_scidacRecord,_scidacFile);

+    stats Stats;
+    Stats(Umu,header);
+    
    std::string format = header.floating_point;
    header.ensemble_id    = description;
    header.ensemble_label = description;
@@ -705,10 +708,10 @@ class IldgReader : public GridLimeReader {
  // Else use ILDG MetaData object if present.
  // Else use SciDAC MetaData object if present.
  ////////////////////////////////////////////////////////////////
-  template <class vsimd>
-  void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, FieldMetaData &FieldMetaData_) {
+  template <class stats = PeriodicGaugeStatistics>
+  void readConfiguration(Lattice<vLorentzColourMatrixD> &Umu, FieldMetaData &FieldMetaData_) {

-    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
+    typedef Lattice<vLorentzColourMatrixD > GaugeField;
    typedef typename GaugeField::vector_object  vobj;
    typedef typename vobj::scalar_object sobj;

@@ -921,7 +924,8 @@ class IldgReader : public GridLimeReader {

    if ( found_FieldMetaData || found_usqcdInfo ) {
      FieldMetaData checker;
-      GaugeStatistics(Umu,checker);
+      stats Stats;
+      Stats(Umu,checker);
      assert(fabs(checker.plaquette  - FieldMetaData_.plaquette )<1.0e-5);
      assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
      std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl;
--- a/Grid/parallelIO/MetaData.h
+++ b/Grid/parallelIO/MetaData.h
@@ -176,29 +176,18 @@ template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMet
  GridMetaData(grid,header); 
  MachineCharacteristics(header);
 }
-inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header)
+template<class Impl>
+class GaugeStatistics
 {
-  // How to convert data precision etc...
-  header.link_trace=WilsonLoops<PeriodicGimplF>::linkTrace(data);
-  header.plaquette =WilsonLoops<PeriodicGimplF>::avgPlaquette(data);
-}
-inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
-{
-  // How to convert data precision etc...
-  header.link_trace=WilsonLoops<PeriodicGimplD>::linkTrace(data);
-  header.plaquette =WilsonLoops<PeriodicGimplD>::avgPlaquette(data);
-}
-template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header)
-{
-   
-  GridBase *grid = field.Grid();
-  std::string format = getFormatString<vLorentzColourMatrixF>();
-  header.floating_point = format;
-  header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
-  GridMetaData(grid,header); 
-  GaugeStatistics(field,header);
-  MachineCharacteristics(header);
-}
+public:
+  void operator()(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
+  {
+    header.link_trace=WilsonLoops<Impl>::linkTrace(data);
+    header.plaquette =WilsonLoops<Impl>::avgPlaquette(data);
+  }
+};
+typedef GaugeStatistics<PeriodicGimplD> PeriodicGaugeStatistics;
+typedef GaugeStatistics<ConjugateGimplD> ConjugateGaugeStatistics;
 template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header)
 {
  GridBase *grid = field.Grid();
@@ -206,7 +195,6 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
  header.floating_point = format;
  header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
  GridMetaData(grid,header); 
-  GaugeStatistics(field,header);
  MachineCharacteristics(header);
 }

--- a/Grid/parallelIO/NerscIO.h
+++ b/Grid/parallelIO/NerscIO.h
@@ -40,6 +40,8 @@ using namespace Grid;
 class NerscIO : public BinaryIO { 
 public:

+  typedef Lattice<vLorentzColourMatrixD> GaugeField;
+
  static inline void truncate(std::string file){
    std::ofstream fout(file,std::ios::out);
  }
@@ -129,12 +131,12 @@ public:
  // Now the meat: the object readers
  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

-  template<class vsimd>
-  static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
+  template<class GaugeStats=PeriodicGaugeStatistics>
+  static inline void readConfiguration(GaugeField &Umu,
 				       FieldMetaData& header,
-				       std::string file)
+				       std::string file,
+				       GaugeStats GaugeStatisticsCalculator=GaugeStats())
  {
-    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;

    GridBase *grid = Umu.Grid();
    uint64_t offset = readHeader(file,Umu.Grid(),header);
@@ -153,23 +155,23 @@ public:
    // munger is a function of <floating point, Real, data_type>
    if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
      if ( ieee32 || ieee32big ) {
-	BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F> 
+	BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F> 
 	  (Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
 	   nersc_csum,scidac_csuma,scidac_csumb);
      }
      if ( ieee64 || ieee64big ) {
-	BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> 
+	BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3D> 
 	  (Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
 	   nersc_csum,scidac_csuma,scidac_csumb);
      }
    } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
      if ( ieee32 || ieee32big ) {
-	BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
+	BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF>
 	  (Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
 	   nersc_csum,scidac_csuma,scidac_csumb);
      }
      if ( ieee64 || ieee64big ) {
-	BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
+	BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixD>
 	  (Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
 	   nersc_csum,scidac_csuma,scidac_csumb);
      }
@@ -177,7 +179,7 @@ public:
      assert(0);
    }

-    GaugeStatistics(Umu,clone);
+    GaugeStats Stats; Stats(Umu,clone);

    std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
 	     <<" header   "<<std::hex<<header.checksum<<std::dec <<std::endl;
@@ -203,15 +205,13 @@ public:
    std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
  }

-  template<class vsimd>
-  static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
+  template<class GaugeStats=PeriodicGaugeStatistics>
+  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
 					int two_row,
 					int bits32)
  {
-    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
-
-    typedef iLorentzColourMatrix<vsimd> vobj;
+    typedef vLorentzColourMatrixD vobj;
    typedef typename vobj::scalar_object sobj;

    FieldMetaData header;
@@ -229,7 +229,7 @@ public:

    GridMetaData(grid,header);
    assert(header.nd==4);
-    GaugeStatistics(Umu,header);
+    GaugeStats Stats; Stats(Umu,header);
    MachineCharacteristics(header);

 	uint64_t offset;
@@ -238,19 +238,19 @@ public:
    header.floating_point = std::string("IEEE64BIG");
    header.data_type      = std::string("4D_SU3_GAUGE_3x3");
    GaugeSimpleUnmunger<fobj3D,sobj> munge;
-	if ( grid->IsBoss() ) { 
-	  truncate(file);
-    offset = writeHeader(header,file);
-	}
-	grid->Broadcast(0,(void *)&offset,sizeof(offset));
+    if ( grid->IsBoss() ) { 
+      truncate(file);
+      offset = writeHeader(header,file);
+    }
+    grid->Broadcast(0,(void *)&offset,sizeof(offset));

    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
 					      nersc_csum,scidac_csuma,scidac_csumb);
    header.checksum = nersc_csum;
-	if ( grid->IsBoss() ) { 
-    writeHeader(header,file);
-	}
+    if ( grid->IsBoss() ) { 
+      writeHeader(header,file);
+    }

    std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "
 	     <<std::hex<<header.checksum
--- a/Grid/parallelIO/OpenQcdIO.h
+++ b/Grid/parallelIO/OpenQcdIO.h
@@ -154,7 +154,7 @@ public:
    grid->Barrier(); timer.Stop();
    std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl;

-    GaugeStatistics(Umu, clone);
+    PeriodicGaugeStatistics Stats; Stats(Umu, clone);

    RealD plaq_diff = fabs(clone.plaquette - header.plaquette);

--- a/Grid/parallelIO/OpenQcdIOChromaReference.h
+++ b/Grid/parallelIO/OpenQcdIOChromaReference.h
@@ -208,7 +208,7 @@ public:

    FieldMetaData clone(header);

-    GaugeStatistics(Umu, clone);
+    PeriodicGaugeStatistics Stats; Stats(Umu, clone);

    RealD plaq_diff = fabs(clone.plaquette - header.plaquette);

--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@@ -80,6 +80,13 @@ template<typename T> struct isSpinor {
 template <typename T> using IfSpinor    = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
 template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;

+const int CoarseIndex = 4;
+template<typename T> struct isCoarsened {
+   static constexpr bool value = (CoarseIndex<=T::TensorLevel);
+};
+template <typename T> using IfCoarsened    = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
+template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;
+
 // ChrisK very keen to add extra space for Gparity doubling.
 //
 // Also add domain wall index, in a way where Wilson operator 
--- a/Grid/qcd/action/fermion/MADWF.h
+++ b/Grid/qcd/action/fermion/MADWF.h
@@ -85,7 +85,7 @@ class MADWF
      maxiter     =_maxiter;
    };
   
-  void operator() (const FermionFieldo &src4,FermionFieldo &sol5)
+  void operator() (const FermionFieldo &src,FermionFieldo &sol5)
  {
    std::cout << GridLogMessage<< " ************************************************" << std::endl;
    std::cout << GridLogMessage<< "  MADWF-like algorithm                           " << std::endl;
@@ -114,8 +114,16 @@ class MADWF
    ///////////////////////////////////////
    //Import source, include Dminus factors
    ///////////////////////////////////////
-    Mato.ImportPhysicalFermionSource(src4,b); 
-    std::cout << GridLogMessage << " src4 " <<norm2(src4)<<std::endl;
+    GridBase *src_grid = src.Grid();
+
+    assert( (src_grid == Mato.GaugeGrid()) || (src_grid == Mato.FermionGrid()));
+
+    if ( src_grid == Mato.GaugeGrid() ) {
+      Mato.ImportPhysicalFermionSource(src,b);
+    } else {
+      b=src;
+    }
+    std::cout << GridLogMessage << " src " <<norm2(src)<<std::endl;
    std::cout << GridLogMessage << " b    " <<norm2(b)<<std::endl;

    defect = b;
--- a/Grid/qcd/action/fermion/WilsonImpl.h
+++ b/Grid/qcd/action/fermion/WilsonImpl.h
@@ -106,11 +106,15 @@ public:
 			    const _SpinorField & phi,
 			    int mu)
  {
+    const int Nsimd = SiteHalfSpinor::Nsimd();
    autoView( out_v, out, AcceleratorWrite);
    autoView( phi_v, phi, AcceleratorRead);
    autoView( Umu_v, Umu, AcceleratorRead);
-    accelerator_for(sss,out.Grid()->oSites(),1,{
-	multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu);
+    typedef decltype(coalescedRead(out_v[0]))   calcSpinor;
+    accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
+	calcSpinor tmp;
+	multLink(tmp,Umu_v[sss],phi_v(sss),mu);
+	coalescedWrite(out_v[sss],tmp);
    });
  }
 					   
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@@ -397,6 +397,7 @@ void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, co
 template <class Impl>
 void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
 {
+  DhopCalls+=2;
  conformable(in.Grid(), _grid);  // verifies full grid
  conformable(in.Grid(), out.Grid());

@@ -408,6 +409,7 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
 template <class Impl>
 void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
 {
+  DhopCalls++;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check

@@ -420,6 +422,7 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
 template <class Impl>
 void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls++;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check

--- a/Grid/qcd/action/gauge/Gauge.cc
+++ b/Grid/qcd/action/gauge/Gauge.cc
@@ -0,0 +1,38 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/gauge/Gauge.cc
+
+Copyright (C) 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+NAMESPACE_BEGIN(Grid);
+
+std::vector<int> ConjugateGaugeImplBase::_conjDirs;
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/gauge/GaugeImplementations.h
+++ b/Grid/qcd/action/gauge/GaugeImplementations.h
@@ -59,14 +59,14 @@ public:
  }
  static inline GaugeLinkField
  CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
-    return Cshift(adj(Link), mu, -1);
+    return PeriodicBC::CovShiftIdentityBackward(Link, mu);
  }
  static inline GaugeLinkField
  CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
-    return Link;
+    return PeriodicBC::CovShiftIdentityForward(Link,mu);
  }
  static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
-    return Cshift(Link, mu, 1);
+    return PeriodicBC::ShiftStaple(Link,mu);
  }

  static inline bool isPeriodicGaugeField(void) { return true; }
@@ -74,7 +74,13 @@ public:

 // Composition with smeared link, bc's etc.. probably need multiple inheritance
 // Variable precision "S" and variable Nc
-template <class GimplTypes> class ConjugateGaugeImpl : public GimplTypes {
+class ConjugateGaugeImplBase {
+protected:
+  static std::vector<int> _conjDirs;
+};
+
+  template <class GimplTypes> class ConjugateGaugeImpl : public GimplTypes, ConjugateGaugeImplBase {
+private:
 public:
  INHERIT_GIMPL_TYPES(GimplTypes);

@@ -84,47 +90,56 @@ public:
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////
  template <class covariant>
  static Lattice<covariant> CovShiftForward(const GaugeLinkField &Link, int mu,
-                                            const Lattice<covariant> &field) {
-    return ConjugateBC::CovShiftForward(Link, mu, field);
+                                            const Lattice<covariant> &field)
+  {
+    assert(_conjDirs.size() == Nd);
+    if(_conjDirs[mu]) 
+      return ConjugateBC::CovShiftForward(Link, mu, field);
+    else
+      return PeriodicBC::CovShiftForward(Link, mu, field);
  }

  template <class covariant>
  static Lattice<covariant> CovShiftBackward(const GaugeLinkField &Link, int mu,
-                                             const Lattice<covariant> &field) {
-    return ConjugateBC::CovShiftBackward(Link, mu, field);
+                                             const Lattice<covariant> &field)
+  {
+    assert(_conjDirs.size() == Nd);
+    if(_conjDirs[mu]) 
+      return ConjugateBC::CovShiftBackward(Link, mu, field);
+    else 
+      return PeriodicBC::CovShiftBackward(Link, mu, field);
  }

  static inline GaugeLinkField
-  CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
-    GridBase *grid = Link.Grid();
-    int Lmu = grid->GlobalDimensions()[mu] - 1;
-
-    Lattice<iScalar<vInteger>> coor(grid);
-    LatticeCoordinate(coor, mu);
-
-    GaugeLinkField tmp(grid);
-    tmp = adj(Link);
-    tmp = where(coor == Lmu, conjugate(tmp), tmp);
-    return Cshift(tmp, mu, -1); // moves towards positive mu
+  CovShiftIdentityBackward(const GaugeLinkField &Link, int mu)
+  {
+    assert(_conjDirs.size() == Nd);
+    if(_conjDirs[mu]) 
+      return ConjugateBC::CovShiftIdentityBackward(Link, mu);
+    else 
+      return PeriodicBC::CovShiftIdentityBackward(Link, mu);
  }
  static inline GaugeLinkField
-  CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
-    return Link;
+  CovShiftIdentityForward(const GaugeLinkField &Link, int mu)
+  {
+    assert(_conjDirs.size() == Nd);
+    if(_conjDirs[mu]) 
+      return ConjugateBC::CovShiftIdentityForward(Link,mu);
+    else
+      return PeriodicBC::CovShiftIdentityForward(Link,mu);
  }

-  static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
-    GridBase *grid = Link.Grid();
-    int Lmu = grid->GlobalDimensions()[mu] - 1;
-
-    Lattice<iScalar<vInteger>> coor(grid);
-    LatticeCoordinate(coor, mu);
-
-    GaugeLinkField tmp(grid);
-    tmp = Cshift(Link, mu, 1);
-    tmp = where(coor == Lmu, conjugate(tmp), tmp);
-    return tmp;
+  static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu)
+  {
+    assert(_conjDirs.size() == Nd);
+    if(_conjDirs[mu]) 
+      return ConjugateBC::ShiftStaple(Link,mu);
+    else     
+      return PeriodicBC::ShiftStaple(Link,mu);
  }

+  static inline void       setDirections(std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
+  static inline std::vector<int> getDirections(void) { return _conjDirs; }
  static inline bool isPeriodicGaugeField(void) { return false; }
 };

--- a/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h
@@ -74,7 +74,7 @@ public:
      conf_file = os.str();
    }
  } 
-
+  virtual ~BaseHmcCheckpointer(){};
  void check_filename(const std::string &filename){
    std::ifstream f(filename.c_str());
    if(!f.good()){
@@ -82,7 +82,6 @@ public:
      abort();
    };
  }
-
  virtual void initialize(const CheckpointerParameters &Params) = 0;

  virtual void CheckpointRestore(int traj, typename Impl::Field &U,
--- a/Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h
@@ -45,6 +45,7 @@ private:

 public:
  INHERIT_GIMPL_TYPES(Implementation);
+  typedef GaugeStatistics<Implementation> GaugeStats;

  ILDGHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); }

@@ -78,7 +79,7 @@ public:
      BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
      IldgWriter _IldgWriter(grid->IsBoss());
      _IldgWriter.open(config);
-      _IldgWriter.writeConfiguration(U, traj, config, config);
+      _IldgWriter.writeConfiguration<GaugeStats>(U, traj, config, config);
      _IldgWriter.close();

      std::cout << GridLogMessage << "Written ILDG Configuration on " << config
@@ -105,7 +106,7 @@ public:
    FieldMetaData header;
    IldgReader _IldgReader;
    _IldgReader.open(config);
-    _IldgReader.readConfiguration(U,header);  // format from the header
+    _IldgReader.readConfiguration<GaugeStats>(U,header);  // format from the header
    _IldgReader.close();

    std::cout << GridLogMessage << "Read ILDG Configuration from " << config
--- a/Grid/qcd/hmc/checkpointers/NerscCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/NerscCheckpointer.h
@@ -43,7 +43,8 @@ private:

 public:
  INHERIT_GIMPL_TYPES(Gimpl);  // only for gauge configurations
-
+  typedef GaugeStatistics<Gimpl> GaugeStats;
+  
  NerscHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); }

  void initialize(const CheckpointerParameters &Params_) {
@@ -60,7 +61,7 @@ public:
      int precision32 = 1;
      int tworow = 0;
      NerscIO::writeRNGState(sRNG, pRNG, rng);
-      NerscIO::writeConfiguration(U, config, tworow, precision32);
+      NerscIO::writeConfiguration<GaugeStats>(U, config, tworow, precision32);
    }
  };

@@ -74,7 +75,7 @@ public:

    FieldMetaData header;
    NerscIO::readRNGState(sRNG, pRNG, header, rng);
-    NerscIO::readConfiguration(U, header, config);
+    NerscIO::readConfiguration<GaugeStats>(U, header, config);
  };
 };

--- a/Grid/qcd/hmc/integrators/Integrator.h
+++ b/Grid/qcd/hmc/integrators/Integrator.h
@@ -33,6 +33,7 @@ directory
 #define INTEGRATOR_INCLUDED

 #include <memory>
+#include "MomentumFilter.h"

 NAMESPACE_BEGIN(Grid);

@@ -78,8 +79,19 @@ protected:
  RepresentationPolicy Representations;
  IntegratorParameters Params;

+  //Filters allow the user to manipulate the conjugate momentum, for example to freeze links in DDHMC
+  //It is applied whenever the momentum is updated / refreshed
+  //The default filter does nothing
+  MomentumFilterBase<MomentaField> const* MomFilter;
+
  const ActionSet<Field, RepresentationPolicy> as;

+  //Get a pointer to a shared static instance of the "do-nothing" momentum filter to serve as a default
+  static MomentumFilterBase<MomentaField> const* getDefaultMomFilter(){ 
+    static MomentumFilterNone<MomentaField> filter;
+    return &filter;
+  }
+
  void update_P(Field& U, int level, double ep) 
  {
    t_P[level] += ep;
@@ -135,6 +147,8 @@ protected:

    // Force from the other representations
    as[level].apply(update_P_hireps, Representations, Mom, U, ep);
+
+    MomFilter->applyFilter(Mom);
  }

  void update_U(Field& U, double ep) 
@@ -174,11 +188,23 @@ public:
    t_P.resize(levels, 0.0);
    t_U = 0.0;
    // initialization of smearer delegated outside of Integrator
+
+    //Default the momentum filter to "do-nothing"
+    MomFilter = getDefaultMomFilter();
  };

  virtual ~Integrator() {}

  virtual std::string integrator_name() = 0;
+  
+  //Set the momentum filter allowing for manipulation of the conjugate momentum
+  void setMomentumFilter(const MomentumFilterBase<MomentaField> &filter){
+    MomFilter = &filter;
+  }
+
+  //Access the conjugate momentum
+  const MomentaField & getMomentum() const{ return P; }
+  

  void print_parameters()
  {
@@ -249,6 +275,8 @@ public:
      // Refresh the higher representation actions
      as[level].apply(refresh_hireps, Representations, pRNG);
    }
+
+    MomFilter->applyFilter(P);
  }

  // to be used by the actionlevel class to iterate
--- a/Grid/qcd/hmc/integrators/MomentumFilter.h
+++ b/Grid/qcd/hmc/integrators/MomentumFilter.h
@@ -0,0 +1,94 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/hmc/integrators/MomentumFilter.h
+
+Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+//--------------------------------------------------------------------
+#ifndef MOMENTUM_FILTER
+#define MOMENTUM_FILTER
+
+NAMESPACE_BEGIN(Grid);
+
+//These filter objects allow the user to manipulate the conjugate momentum as part of the update / refresh
+
+template<typename MomentaField>
+struct MomentumFilterBase{
+  virtual void applyFilter(MomentaField &P) const;
+};
+
+//Do nothing
+template<typename MomentaField>
+struct MomentumFilterNone: public MomentumFilterBase<MomentaField>{
+  void applyFilter(MomentaField &P) const override{}
+};
+
+//Multiply each site/direction by a Lorentz vector complex number field
+//Can be used to implement a mask, zeroing out sites
+template<typename MomentaField>
+struct MomentumFilterApplyPhase: public MomentumFilterBase<MomentaField>{
+  typedef typename MomentaField::vector_type vector_type; //SIMD-vectorized complex type
+  typedef typename MomentaField::scalar_type scalar_type; //scalar complex type
+  typedef iVector<iScalar<iScalar<vector_type> >, Nd > LorentzScalarType; //complex phase for each site/direction
+  typedef Lattice<LorentzScalarType> LatticeLorentzScalarType;
+  
+  LatticeLorentzScalarType phase;
+ 
+  MomentumFilterApplyPhase(const LatticeLorentzScalarType _phase): phase(_phase){}
+
+  //Default to uniform field of (1,0)
+  MomentumFilterApplyPhase(GridBase* _grid): phase(_grid){
+    LorentzScalarType one;
+    for(int mu=0;mu<Nd;mu++)
+      one(mu)()() = scalar_type(1.);
+    
+    phase = one;
+  }
+
+  void applyFilter(MomentaField &P) const override{
+    conformable(P,phase);
+    autoView( P_v , P, AcceleratorWrite);
+    autoView( phase_v , phase, AcceleratorRead);
+
+    accelerator_for(ss,P_v.size(),MomentaField::vector_type::Nsimd(),{
+    	auto site_mom = P_v(ss);
+    	auto site_phase = phase_v(ss);
+	for(int mu=0;mu<Nd;mu++)
+	  site_mom(mu) = site_mom(mu) * site_phase(mu);
+    	coalescedWrite(P_v[ss], site_mom);
+      });
+    
+  }
+
+
+};
+
+
+
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/qcd/modules/Modules.h
+++ b/Grid/qcd/modules/Modules.h
@@ -99,7 +99,7 @@ public:
  virtual Prod* getPtr() = 0;

  // add a getReference? 
-  
+  virtual ~HMCModuleBase(){};
  virtual void print_parameters(){};  // default to nothing
 };

--- a/Grid/qcd/spin/TwoSpinor.h
+++ b/Grid/qcd/spin/TwoSpinor.h
@@ -128,7 +128,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProjTm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  hspin(0)=fspin(0)-fspin(2);
  hspin(1)=fspin(1)-fspin(3);
 }
@@ -138,40 +137,50 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 *  0 0 -1  0
 *  0 0  0 -1
 */
-
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5p (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  hspin(0)=fspin(0);
  hspin(1)=fspin(1);
 }

 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5m (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  hspin(0)=fspin(2);
  hspin(1)=fspin(3);
 }
  
-//  template<class vtype> accelerator_inline void fspProj5p (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin)
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5p (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  rfspin(0)=fspin(0);
  rfspin(1)=fspin(1);
  rfspin(2)=Zero();
  rfspin(3)=Zero();
 }
-//  template<class vtype> accelerator_inline void fspProj5m (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin)
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5m (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  rfspin(0)=Zero();
  rfspin(1)=Zero();
  rfspin(2)=fspin(2);
  rfspin(3)=fspin(3);
 }

+template<class vtype,int N,IfCoarsened<iVector<vtype,N> > = 0> accelerator_inline void spProj5p (iVector<vtype,N> &rfspin,const iVector<vtype,N> &fspin)
+{
+  const int hN = N>>1;
+  for(int s=0;s<hN;s++){
+    rfspin(s)=fspin(s);
+    rfspin(s+hN)=Zero();
+  }
+}
+template<class vtype,int N,IfCoarsened<iVector<vtype,N> > = 0> accelerator_inline void spProj5m (iVector<vtype,N> &rfspin,const iVector<vtype,N> &fspin)
+{
+  const int hN = N>>1;
+  for(int s=0;s<hN;s++){
+    rfspin(s)=Zero();
+    rfspin(s+hN)=fspin(s+hN);
+  }
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Reconstruction routines to move back again to four spin
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -183,7 +192,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 */
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconXp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)=timesMinusI(hspin(1));
@@ -191,7 +199,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconXm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)=timesI(hspin(1));
@@ -199,7 +206,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconXp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)-=timesI(hspin(1));
@@ -207,7 +213,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconXm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)+=timesI(hspin(1));
@@ -221,7 +226,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a

 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconYp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)= hspin(1);
@@ -229,7 +233,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconYm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)=-hspin(1);
@@ -237,7 +240,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconYp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)+=hspin(1);
@@ -245,7 +247,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconYm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)-=hspin(1);
@@ -260,7 +261,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
 */
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconZp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)=timesMinusI(hspin(0));
@@ -268,7 +268,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconZm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)=     timesI(hspin(0));
@@ -276,7 +275,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconZp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)-=timesI(hspin(0));
@@ -284,7 +282,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconZm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)+=timesI(hspin(0));
@@ -298,7 +295,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
 */
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconTp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)=hspin(0);
@@ -306,7 +302,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconTm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0);
  fspin(1)=hspin(1);
  fspin(2)=-hspin(0);
@@ -314,7 +309,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconTp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)+=hspin(0);
@@ -322,7 +316,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconTm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0);
  fspin(1)+=hspin(1);
  fspin(2)-=hspin(0);
@@ -336,7 +329,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
 */
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spRecon5p (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=hspin(0)+hspin(0); // add is lower latency than mul
  fspin(1)=hspin(1)+hspin(1); // probably no measurable diffence though
  fspin(2)=Zero();
@@ -344,7 +336,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spRecon5m (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)=Zero();
  fspin(1)=Zero();
  fspin(2)=hspin(0)+hspin(0);
@@ -352,7 +343,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
 }
 template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumRecon5p (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
  fspin(0)+=hspin(0)+hspin(0);
  fspin(1)+=hspin(1)+hspin(1);
 }
@@ -372,7 +362,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
 //////////
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjXp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProjXp(hspin._internal[i],fspin._internal[i]);
  }
@@ -426,26 +415,21 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconXp (iM
    }}
 }

-
-
 ////////
 // Xm
 ////////
 template<class rtype,class vtype> accelerator_inline void spProjXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProjXm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProjXm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spProjXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProjXm(hspin._internal[i][j],fspin._internal[i][j]);
@@ -455,19 +439,16 @@ template<class rtype,class vtype,int N> accelerator_inline void spProjXm (iMatri

 template<class rtype,class vtype> accelerator_inline void spReconXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spReconXm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spReconXm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spReconXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spReconXm(hspin._internal[i][j],fspin._internal[i][j]);
@@ -476,45 +457,37 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconXm (iMatr

 template<class rtype,class vtype> accelerator_inline void accumReconXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumReconXm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumReconXm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumReconXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumReconXm(hspin._internal[i][j],fspin._internal[i][j]);
    }}
 }

-
-
 ////////
 // Yp
 ////////
 template<class rtype,class vtype> accelerator_inline void spProjYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProjYp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProjYp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spProjYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProjYp(hspin._internal[i][j],fspin._internal[i][j]);
@@ -524,19 +497,16 @@ template<class rtype,class vtype,int N> accelerator_inline void spProjYp (iMatri

 template<class rtype,class vtype> accelerator_inline void spReconYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spReconYp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spReconYp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spReconYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spReconYp(hspin._internal[i][j],fspin._internal[i][j]);
@@ -545,66 +515,55 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconYp (iMatr

 template<class rtype,class vtype> accelerator_inline void accumReconYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumReconYp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumReconYp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumReconYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumReconYp(hspin._internal[i][j],fspin._internal[i][j]);
    }}
 }

-
 ////////
 // Ym
 ////////
 template<class rtype,class vtype> accelerator_inline void spProjYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProjYm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProjYm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spProjYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProjYm(hspin._internal[i][j],fspin._internal[i][j]);
    }}
 }

-
 template<class rtype,class vtype> accelerator_inline void spReconYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spReconYm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,const iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spReconYm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spReconYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spReconYm(hspin._internal[i][j],fspin._internal[i][j]);
@@ -613,19 +572,16 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconYm (iMatr

 template<class rtype,class vtype> accelerator_inline void accumReconYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumReconYm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumReconYm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumReconYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumReconYm(hspin._internal[i][j],fspin._internal[i][j]);
@@ -638,66 +594,57 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconYm (iM
 ////////
 template<class rtype,class vtype> accelerator_inline void spProjZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProjZp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProjZp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spProjZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProjZp(hspin._internal[i][j],fspin._internal[i][j]);
-    }}
+  }}
 }


 template<class rtype,class vtype> accelerator_inline void spReconZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spReconZp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spReconZp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spReconZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spReconZp(hspin._internal[i][j],fspin._internal[i][j]);
-    }}
+  }}
 }

 template<class rtype,class vtype> accelerator_inline void accumReconZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumReconZp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumReconZp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumReconZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumReconZp(hspin._internal[i][j],fspin._internal[i][j]);
-    }}
+  }}
 }


@@ -706,62 +653,53 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconZp (iM
 ////////
 template<class rtype,class vtype> accelerator_inline void spProjZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProjZm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProjZm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spProjZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProjZm(hspin._internal[i][j],fspin._internal[i][j]);
-    }}
+  }}
 }


 template<class rtype,class vtype> accelerator_inline void spReconZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spReconZm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spReconZm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spReconZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spReconZm(hspin._internal[i][j],fspin._internal[i][j]);
-    }}
+  }}
 }

 template<class rtype,class vtype> accelerator_inline void accumReconZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumReconZm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumReconZm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumReconZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumReconZm(hspin._internal[i][j],fspin._internal[i][j]);
@@ -774,41 +712,35 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconZm (iM
 ////////
 template<class rtype,class vtype> accelerator_inline void spProjTp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProjTp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjTp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProjTp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spProjTp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProjTp(hspin._internal[i][j],fspin._internal[i][j]);
-    }}
+  }}
 }


 template<class rtype,class vtype> accelerator_inline void spReconTp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spReconTp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconTp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spReconTp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spReconTp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spReconTp(hspin._internal[i][j],fspin._internal[i][j]);
@@ -817,44 +749,37 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconTp (iMatr

 template<class rtype,class vtype> accelerator_inline void accumReconTp (iScalar<rtype> &hspin, iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumReconTp(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconTp (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumReconTp(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumReconTp (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumReconTp(hspin._internal[i][j],fspin._internal[i][j]);
    }}
 }

-
 ////////
 // Tm
 ////////
 template<class rtype,class vtype> accelerator_inline void spProjTm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProjTm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjTm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProjTm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spProjTm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProjTm(hspin._internal[i][j],fspin._internal[i][j]);
@@ -864,19 +789,16 @@ template<class rtype,class vtype,int N> accelerator_inline void spProjTm (iMatri

 template<class rtype,class vtype> accelerator_inline void spReconTm (iScalar<rtype> &hspin, const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spReconTm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconTm (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spReconTm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spReconTm (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spReconTm(hspin._internal[i][j],fspin._internal[i][j]);
@@ -885,44 +807,37 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconTm (iMatr

 template<class rtype,class vtype> accelerator_inline void accumReconTm (iScalar<rtype> &hspin, const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumReconTm(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconTm (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumReconTm(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumReconTm (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumReconTm(hspin._internal[i][j],fspin._internal[i][j]);
    }}
 }

-
 ////////
 // 5p
 ////////
-template<class rtype,class vtype> accelerator_inline void spProj5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
+template<class rtype,class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProj5p(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProj5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProj5p(hspin._internal[i],fspin._internal[i]);
  }
 }
-template<class rtype,class vtype,int N> accelerator_inline void spProj5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
+template<class rtype,class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProj5p(hspin._internal[i][j],fspin._internal[i][j]);
@@ -931,19 +846,16 @@ template<class rtype,class vtype,int N> accelerator_inline void spProj5p (iMatri

 template<class rtype,class vtype> accelerator_inline void spRecon5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spRecon5p(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spRecon5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spRecon5p(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spRecon5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spRecon5p(hspin._internal[i][j],fspin._internal[i][j]);
@@ -952,19 +864,16 @@ template<class rtype,class vtype,int N> accelerator_inline void spRecon5p (iMatr

 template<class rtype,class vtype> accelerator_inline void accumRecon5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumRecon5p(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumRecon5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumRecon5p(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumRecon5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumRecon5p(hspin._internal[i][j],fspin._internal[i][j]);
@@ -972,24 +881,18 @@ template<class rtype,class vtype,int N> accelerator_inline void accumRecon5p (iM
 }

 // four spinor projectors for chiral proj
-//  template<class vtype> accelerator_inline void fspProj5p (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
-template<class vtype> accelerator_inline void spProj5p (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
+template<class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProj5p(hspin._internal,fspin._internal);
 }
-//  template<class vtype,int N> accelerator_inline void fspProj5p (iVector<vtype,N> &hspin,iVector<vtype,N> &fspin)
-template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProj5p (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin)
+template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProj5p(hspin._internal[i],fspin._internal[i]);
  }
 }
-//  template<class vtype,int N> accelerator_inline void fspProj5p (iMatrix<vtype,N> &hspin,iMatrix<vtype,N> &fspin)
-template<class vtype,int N> accelerator_inline void spProj5p (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin)
+template<class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProj5p(hspin._internal[i][j],fspin._internal[i][j]);
@@ -1001,17 +904,17 @@ template<class vtype,int N> accelerator_inline void spProj5p (iMatrix<vtype,N> &
 // 5m
 ////////

-template<class rtype,class vtype> accelerator_inline void spProj5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
+template<class rtype,class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
  spProj5m(hspin._internal,fspin._internal);
 }
-template<class rtype,class vtype,int N,IfNotSpinor<iVector<rtype,N> > = 0> accelerator_inline void spProj5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
+template<class rtype,class vtype,int N,IfNotSpinor<iVector<rtype,N> > = 0,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
  for(int i=0;i<N;i++) {
    spProj5m(hspin._internal[i],fspin._internal[i]);
  }
 }
-template<class rtype,class vtype,int N> accelerator_inline void spProj5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
+template<class rtype,class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
@@ -1021,40 +924,34 @@ template<class rtype,class vtype,int N> accelerator_inline void spProj5m (iMatri

 template<class rtype,class vtype> accelerator_inline void spRecon5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spRecon5m(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spRecon5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spRecon5m(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void spRecon5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spRecon5m(hspin._internal[i][j],fspin._internal[i][j]);
-    }}
+  }}
 }

 template<class rtype,class vtype> accelerator_inline void accumRecon5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  accumRecon5m(hspin._internal,fspin._internal);
 }
 template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumRecon5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    accumRecon5m(hspin._internal[i],fspin._internal[i]);
  }
 }
 template<class rtype,class vtype,int N> accelerator_inline void accumRecon5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      accumRecon5m(hspin._internal[i][j],fspin._internal[i][j]);
@@ -1063,24 +960,18 @@ template<class rtype,class vtype,int N> accelerator_inline void accumRecon5m (iM


 // four spinor projectors for chiral proj
-//  template<class vtype> accelerator_inline void fspProj5m (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
-template<class vtype> accelerator_inline void spProj5m (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
+template<class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
  spProj5m(hspin._internal,fspin._internal);
 }
-//  template<class vtype,int N> accelerator_inline void fspProj5m (iVector<vtype,N> &hspin,iVector<vtype,N> &fspin)
-template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProj5m (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin)
+template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
  for(int i=0;i<N;i++) {
    spProj5m(hspin._internal[i],fspin._internal[i]);
  }
 }
-//  template<class vtype,int N> accelerator_inline void fspProj5m (iMatrix<vtype,N> &hspin,iMatrix<vtype,N> &fspin)
-template<class vtype,int N> accelerator_inline void spProj5m (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin)
+template<class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin)
 {
-  //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
  for(int i=0;i<N;i++){ 
    for(int j=0;j<N;j++){
      spProj5m(hspin._internal[i][j],fspin._internal[i][j]);
--- a/Grid/qcd/utils/BaryonUtils.h
+++ b/Grid/qcd/utils/BaryonUtils.h
--- a/Grid/qcd/utils/CovariantCshift.h
+++ b/Grid/qcd/utils/CovariantCshift.h
@@ -53,6 +53,24 @@ namespace PeriodicBC {
    return Cshift(tmp,mu,-1);// moves towards positive mu
  }

+  template<class gauge> Lattice<gauge>
+  CovShiftIdentityBackward(const Lattice<gauge> &Link, int mu) 
+  {
+    return Cshift(adj(Link), mu, -1);
+  }
+
+  template<class gauge> Lattice<gauge>
+  CovShiftIdentityForward(const Lattice<gauge> &Link, int mu)
+  {
+    return Link;
+  }
+
+  template<class gauge> Lattice<gauge>
+  ShiftStaple(const Lattice<gauge> &Link, int mu)
+  {
+    return Cshift(Link, mu, 1);
+  }
+  
  template<class gauge,class Expr,typename std::enable_if<is_lattice_expr<Expr>::value,void>::type * = nullptr>
    auto  CovShiftForward(const Lattice<gauge> &Link, 
 			  int mu,
@@ -70,6 +88,7 @@ namespace PeriodicBC {
    return CovShiftBackward(Link,mu,arg);
  }

+
 }


@@ -139,6 +158,38 @@ namespace ConjugateBC {
    //    std::cout<<"Gparity::CovCshiftBackward mu="<<mu<<std::endl;
    return Cshift(tmp,mu,-1);// moves towards positive mu
  }
+  template<class gauge> Lattice<gauge>
+  CovShiftIdentityBackward(const Lattice<gauge> &Link, int mu) {
+    GridBase *grid = Link.Grid();
+    int Lmu = grid->GlobalDimensions()[mu] - 1;
+
+    Lattice<iScalar<vInteger>> coor(grid);
+    LatticeCoordinate(coor, mu);
+
+    Lattice<gauge> tmp(grid);
+    tmp = adj(Link);
+    tmp = where(coor == Lmu, conjugate(tmp), tmp);
+    return Cshift(tmp, mu, -1); // moves towards positive mu
+  }
+  template<class gauge> Lattice<gauge>
+  CovShiftIdentityForward(const Lattice<gauge> &Link, int mu) {
+    return Link;
+  }
+
+  template<class gauge> Lattice<gauge>
+  ShiftStaple(const Lattice<gauge> &Link, int mu)
+  {
+    GridBase *grid = Link.Grid();
+    int Lmu = grid->GlobalDimensions()[mu] - 1;
+
+    Lattice<iScalar<vInteger>> coor(grid);
+    LatticeCoordinate(coor, mu);
+
+    Lattice<gauge> tmp(grid);
+    tmp = Cshift(Link, mu, 1);
+    tmp = where(coor == Lmu, conjugate(tmp), tmp);
+    return tmp;
+  }

  template<class gauge,class Expr,typename std::enable_if<is_lattice_expr<Expr>::value,void>::type * = nullptr>
    auto  CovShiftForward(const Lattice<gauge> &Link, 
--- a/Grid/qcd/utils/LinalgUtils.h
+++ b/Grid/qcd/utils/LinalgUtils.h
@@ -154,8 +154,8 @@ void axpby_ssp_pminus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,co
  accelerator_for(sss,nloop,vobj::Nsimd(),{
    uint64_t ss = sss*Ls;
    decltype(coalescedRead(y_v[ss+sp])) tmp;
-    spProj5m(tmp,y_v(ss+sp));
-    tmp = a*x_v(ss+s)+b*tmp;
+    spProj5m(tmp,y_v(ss+sp)); 
+   tmp = a*x_v(ss+s)+b*tmp;
    coalescedWrite(z_v[ss+s],tmp);
  });
 }
@@ -188,7 +188,6 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
  z.Checkerboard() = x.Checkerboard();
  conformable(x,z);
  int Ls = grid->_rdimensions[0];
-  Gamma G5(Gamma::Algebra::Gamma5);
  autoView( x_v, x, AcceleratorRead);
  autoView( z_v, z, AcceleratorWrite);
  uint64_t nloop = grid->oSites()/Ls;
@@ -196,7 +195,13 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
    uint64_t ss = sss*Ls;
    for(int s=0;s<Ls;s++){
      int sp = Ls-1-s;
-      coalescedWrite(z_v[ss+sp],G5*x_v(ss+s));
+      auto tmp = x_v(ss+s);
+      decltype(tmp) tmp_p;
+      decltype(tmp) tmp_m;
+      spProj5p(tmp_p,tmp);
+      spProj5m(tmp_m,tmp);
+      // Use of spProj5m, 5p captures the coarse space too
+      coalescedWrite(z_v[ss+sp],tmp_p - tmp_m);
    }
  });
 }
@@ -208,10 +213,20 @@ void G5C(Lattice<vobj> &z, const Lattice<vobj> &x)
  z.Checkerboard() = x.Checkerboard();
  conformable(x, z);

-  Gamma G5(Gamma::Algebra::Gamma5);
-  z = G5 * x;
+  autoView( x_v, x, AcceleratorRead);
+  autoView( z_v, z, AcceleratorWrite);
+  uint64_t nloop = grid->oSites();
+  accelerator_for(ss,nloop,vobj::Nsimd(),{
+    auto tmp = x_v(ss);
+    decltype(tmp) tmp_p;
+    decltype(tmp) tmp_m;
+    spProj5p(tmp_p,tmp);
+    spProj5m(tmp_m,tmp);
+    coalescedWrite(z_v[ss],tmp_p - tmp_m);
+  });
 }

+/*
 template<class CComplex, int nbasis>
 void G5C(Lattice<iVector<CComplex, nbasis>> &z, const Lattice<iVector<CComplex, nbasis>> &x)
 {
@@ -234,6 +249,7 @@ void G5C(Lattice<iVector<CComplex, nbasis>> &z, const Lattice<iVector<CComplex,
    }
  });
 }
+*/

 NAMESPACE_END(Grid);

--- a/Grid/simd/Grid_gpu_vec.h
+++ b/Grid/simd/Grid_gpu_vec.h
@@ -60,11 +60,25 @@ template<class pair>
 class GpuComplex {
 public:
  pair z;
-  typedef decltype(z.x) real;
+  typedef decltype(z.x) Real;
 public: 
  accelerator_inline GpuComplex() = default;
-  accelerator_inline GpuComplex(real re,real im) { z.x=re; z.y=im; };
+  accelerator_inline GpuComplex(Real re,Real im) { z.x=re; z.y=im; };
  accelerator_inline GpuComplex(const GpuComplex &zz) { z = zz.z;};
+  accelerator_inline Real real(void) const { return z.x; };
+  accelerator_inline Real imag(void) const { return z.y; };
+  accelerator_inline GpuComplex &operator*=(const GpuComplex &r) {
+    *this = (*this) * r;
+    return *this;
+  }
+  accelerator_inline GpuComplex &operator+=(const GpuComplex &r) {
+    *this = (*this) + r;
+    return *this;
+  }
+  accelerator_inline GpuComplex &operator-=(const GpuComplex &r) {
+    *this = (*this) - r;
+    return *this;
+  }
  friend accelerator_inline  GpuComplex operator+(const GpuComplex &lhs,const GpuComplex &rhs) { 
    GpuComplex r ; 
    r.z.x = lhs.z.x + rhs.z.x; 
@@ -157,6 +171,11 @@ typedef GpuVector<NSIMD_RealD,    double      > GpuVectorRD;
 typedef GpuVector<NSIMD_ComplexD, GpuComplexD > GpuVectorCD;
 typedef GpuVector<NSIMD_Integer,  Integer     > GpuVectorI;

+accelerator_inline GpuComplexF timesI(const GpuComplexF &r)     { return(GpuComplexF(-r.imag(),r.real()));}
+accelerator_inline GpuComplexD timesI(const GpuComplexD &r)     { return(GpuComplexD(-r.imag(),r.real()));}
+accelerator_inline GpuComplexF timesMinusI(const GpuComplexF &r){ return(GpuComplexF(r.imag(),-r.real()));}
+accelerator_inline GpuComplexD timesMinusI(const GpuComplexD &r){ return(GpuComplexD(r.imag(),-r.real()));}
+
 accelerator_inline float half2float(half h)
 {
  float f;
--- a/Grid/simd/Simd.h
+++ b/Grid/simd/Simd.h
@@ -148,10 +148,14 @@ accelerator_inline void sub (ComplexF * __restrict__ y,const ComplexF * __restri
 accelerator_inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
  
 //conjugate already supported for complex
-accelerator_inline ComplexF timesI(const ComplexF &r)     { return(r*ComplexF(0.0,1.0));}
-accelerator_inline ComplexD timesI(const ComplexD &r)     { return(r*ComplexD(0.0,1.0));}
-accelerator_inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
-accelerator_inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
+accelerator_inline ComplexF timesI(const ComplexF &r)     { return(ComplexF(-r.imag(),r.real()));}
+accelerator_inline ComplexD timesI(const ComplexD &r)     { return(ComplexD(-r.imag(),r.real()));}
+accelerator_inline ComplexF timesMinusI(const ComplexF &r){ return(ComplexF(r.imag(),-r.real()));}
+accelerator_inline ComplexD timesMinusI(const ComplexD &r){ return(ComplexD(r.imag(),-r.real()));}
+//accelerator_inline ComplexF timesI(const ComplexF &r)     { return(r*ComplexF(0.0,1.0));}
+//accelerator_inline ComplexD timesI(const ComplexD &r)     { return(r*ComplexD(0.0,1.0));}
+//accelerator_inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
+//accelerator_inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}

 // define projections to real and imaginay parts
 accelerator_inline ComplexF projReal(const ComplexF &r){return( ComplexF(r.real(), 0.0));}
--- a/Grid/tensors/Tensor_SIMT.h
+++ b/Grid/tensors/Tensor_SIMT.h
@@ -64,6 +64,68 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
 }
 #else

+
+#ifndef GRID_SYCL
+// Use the scalar as our own complex on GPU
+template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+typename vsimd::scalar_type
+coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{
+  typedef typename vsimd::scalar_type S;
+  S * __restrict__ p=(S *)&vec;
+  return p[lane];
+}
+template<int ptype,class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+typename vsimd::scalar_type
+coalescedReadPermute(const vsimd & __restrict__ vec,int doperm,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{
+  typedef typename vsimd::scalar_type S;
+
+  S * __restrict__ p=(S *)&vec;
+  int mask = vsimd::Nsimd() >> (ptype + 1);
+  int plane= doperm ? lane ^ mask : lane;
+  return p[plane];
+}
+template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+void coalescedWrite(vsimd & __restrict__ vec,
+		    const typename vsimd::scalar_type & __restrict__ extracted,
+		    int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{
+  typedef typename vsimd::scalar_type S;
+  S * __restrict__ p=(S *)&vec;
+  p[lane]=extracted;
+}
+#else
+template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+typename vsimd::vector_type::datum
+coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{
+  typedef typename vsimd::vector_type::datum S;
+  S * __restrict__ p=(S *)&vec;
+  return p[lane];
+}
+template<int ptype,class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+typename vsimd::vector_type::datum
+coalescedReadPermute(const vsimd & __restrict__ vec,int doperm,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{
+  typedef typename vsimd::vector_type::datum S;
+
+  S * __restrict__ p=(S *)&vec;
+  int mask = vsimd::Nsimd() >> (ptype + 1);
+  int plane= doperm ? lane ^ mask : lane;
+  return p[plane];
+}
+template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+void coalescedWrite(vsimd & __restrict__ vec,
+		    const typename vsimd::vector_type::datum & __restrict__ extracted,
+		    int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{
+  typedef typename vsimd::vector_type::datum S;
+  S * __restrict__ p=(S *)&vec;
+  p[lane]=extracted;
+}
+#endif
+
 //////////////////////////////////////////
 // Extract and insert slices on the GPU
 //////////////////////////////////////////
--- a/Grid/tensors/Tensor_Ta.h
+++ b/Grid/tensors/Tensor_Ta.h
@@ -117,7 +117,19 @@ accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
 	ret._internal[b][c] -= pr * ret._internal[c1][c];
      }
    }
-	  
+  }
+
+  // Normalise last row
+  {
+    int c1 = N-1;
+    zeroit(inner);	
+    for(int c2=0;c2<N;c2++)
+      inner += innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]);
+
+    nrm = sqrt(inner);
+    nrm = 1.0/nrm;
+    for(int c2=0;c2<N;c2++)
+      ret._internal[c1][c2]*= nrm;
  }
  // assuming the determinant is ok
  return ret;
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -1,6 +1,7 @@
 #include <Grid/GridCore.h>

 NAMESPACE_BEGIN(Grid);
+int      acceleratorAbortOnGpuError=1;
 uint32_t accelerator_threads=2;
 uint32_t acceleratorThreads(void)       {return accelerator_threads;};
 void     acceleratorThreads(uint32_t t) {accelerator_threads = t;};
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -100,9 +100,11 @@ void     acceleratorInit(void);
 #define accelerator        __host__ __device__
 #define accelerator_inline __host__ __device__ inline

+extern int acceleratorAbortOnGpuError;
+
 accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #ifdef GRID_SIMT
-  return threadIdx.z; 
+  return threadIdx.x; 
 #else
  return 0;
 #endif
@@ -110,28 +112,67 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {

 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  {									\
+    int nt=acceleratorThreads();					\
    typedef uint64_t Iterator;						\
    auto lambda = [=] accelerator					\
      (Iterator iter1,Iterator iter2,Iterator lane) mutable {		\
      __VA_ARGS__;							\
    };									\
-    int nt=acceleratorThreads();					\
-    dim3 cu_threads(acceleratorThreads(),1,nsimd);			\
+    dim3 cu_threads(nsimd,acceleratorThreads(),1);			\
    dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\
    LambdaApply<<<cu_blocks,cu_threads>>>(num1,num2,nsimd,lambda);	\
  }

+#define accelerator_for6dNB(iter1, num1,				\
+                            iter2, num2,				\
+                            iter3, num3,				\
+                            iter4, num4,				\
+                            iter5, num5,				\
+			    iter6, num6, ... )				\
+  {									\
+    typedef uint64_t Iterator;						\
+    auto lambda = [=] accelerator					\
+      (Iterator iter1,Iterator iter2,					\
+       Iterator iter3,Iterator iter4,					\
+       Iterator iter5,Iterator iter6) mutable {				\
+      __VA_ARGS__;							\
+    };									\
+    dim3 cu_blocks (num1,num2,num3);					\
+    dim3 cu_threads(num4,num5,num6);					\
+    Lambda6Apply<<<cu_blocks,cu_threads>>>(num1,num2,num3,num4,num5,num6,lambda); \
+  }
+
 template<typename lambda>  __global__
 void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
 {
-  uint64_t x = threadIdx.x + blockDim.x*blockIdx.x;
-  uint64_t y = threadIdx.y + blockDim.y*blockIdx.y;
-  uint64_t z = threadIdx.z;
+  // Weird permute is to make lane coalesce for large blocks
+  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
+  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
+  uint64_t z = threadIdx.x;
  if ( (x < num1) && (y<num2) && (z<num3) ) {
    Lambda(x,y,z);
  }
 }

+template<typename lambda>  __global__
+void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
+		  uint64_t num4, uint64_t num5, uint64_t num6,
+		  lambda Lambda)
+{
+  uint64_t iter1 = blockIdx.x;
+  uint64_t iter2 = blockIdx.y;
+  uint64_t iter3 = blockIdx.z;
+  uint64_t iter4 = threadIdx.x;
+  uint64_t iter5 = threadIdx.y;
+  uint64_t iter6 = threadIdx.z;
+
+  if ( (iter1 < num1) && (iter2<num2) && (iter3<num3)
+    && (iter4 < num4) && (iter5<num5) && (iter6<num6) )
+  {
+    Lambda(iter1,iter2,iter3,iter4,iter5,iter6);
+  }
+}
+
 #define accelerator_barrier(dummy)					\
  {									\
    cudaDeviceSynchronize();						\
@@ -140,6 +181,7 @@ void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
      printf("Cuda error %s \n", cudaGetErrorString( err ));		\
      puts(__FILE__);							\
      printf("Line %d\n",__LINE__);					\
+      if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);		\
    }									\
  }

@@ -218,7 +260,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
      cl::sycl::range<3> global{unum1,unum2,nsimd};			\
      cgh.parallel_for<class dslash>(					\
      cl::sycl::nd_range<3>(global,local), \
-      [=] (cl::sycl::nd_item<3> item) mutable {       \
+      [=] (cl::sycl::nd_item<3> item) /*mutable*/ {   \
      auto iter1    = item.get_global_id(0);	      \
      auto iter2    = item.get_global_id(1);	      \
      auto lane     = item.get_global_id(2);	      \