FermToProp

Faster fermtoprop case
Open MP on host enabled
2025-06-23 10:12:02 +01:00 · 2022-11-30 15:36:35 -05:00 · 2022-11-30 15:11:29 -05:00 · 2022-11-08 13:38:54 -08:00 · 2022-11-08 13:23:40 -08:00 · 2022-11-08 13:22:57 -08:00
140 changed files with 16884 additions and 15393 deletions
--- a/Grid/DisableWarnings.h
+++ b/Grid/DisableWarnings.h
@ -44,14 +44,22 @@ directory
 #ifdef __NVCC__
 //disables nvcc specific warning in json.hpp
 #pragma clang diagnostic ignored "-Wdeprecated-register"
 #if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
 //disables nvcc specific warning in json.hpp
 #pragma nv_diag_suppress unsigned_compare_with_zero
 #pragma nv_diag_suppress cast_to_qualified_type
 //disables nvcc specific warning in many files
 #pragma nv_diag_suppress esa_on_defaulted_function_ignored
 #pragma nv_diag_suppress extra_semicolon
 #else
 //disables nvcc specific warning in json.hpp
 #pragma diag_suppress unsigned_compare_with_zero
 #pragma diag_suppress cast_to_qualified_type
 //disables nvcc specific warning in many files
 #pragma diag_suppress esa_on_defaulted_function_ignored
 #pragma diag_suppress extra_semicolon
-
+#endif
 //Eigen only
 #endif
 // Disable vectorisation in Eigen on the Power8/9 and PowerPC
--- a/Grid/GridQCDcore.h
+++ b/Grid/GridQCDcore.h
@ -36,7 +36,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridCore.h>
 #include <Grid/qcd/QCD.h>
 #include <Grid/qcd/spin/Spin.h>
 #include <Grid/qcd/gparity/Gparity.h>
 #include <Grid/qcd/utils/Utils.h>
 #include <Grid/qcd/representations/Representations.h>
 NAMESPACE_CHECK(GridQCDCore);
--- a/Grid/Grid_Eigen_Dense.h
+++ b/Grid/Grid_Eigen_Dense.h
@ -14,7 +14,11 @@
 /* NVCC save and restore compile environment*/
 #ifdef __NVCC__
 #pragma push
 #if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
 #pragma nv_diag_suppress code_is_unreachable
 #else
 #pragma diag_suppress code_is_unreachable
 #endif
 #pragma push_macro("__CUDA_ARCH__")
 #pragma push_macro("__NVCC__")
 #pragma push_macro("__CUDACC__")
--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@ -54,7 +54,6 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
 #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@ -262,7 +262,7 @@ public:
 	autoView( Tnp_v , (*Tnp), AcceleratorWrite);
 	autoView( Tnm_v , (*Tnm), AcceleratorWrite);
 	const int Nsimd = CComplex::Nsimd();
-	accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
+	accelerator_for(ss, FineGrid->oSites(), Nsimd, {
 	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
 	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
        });
--- a/Grid/algorithms/approx/Chebyshev.h
+++ b/Grid/algorithms/approx/Chebyshev.h
@ -264,7 +264,7 @@ public:
      auto Tnp_v = Tnp->View();
      auto Tnm_v = Tnm->View();
      constexpr int Nsimd = vector_type::Nsimd();
-      accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
+      accelerator_for(ss, in.Grid()->oSites(), Nsimd, {
 	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
 	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
      });
--- a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
@ -49,7 +49,6 @@ NAMESPACE_BEGIN(Grid);
    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
    RealD TrueResidual;
    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
    LinearFunction<FieldF> *guesser;
@ -69,7 +68,6 @@ NAMESPACE_BEGIN(Grid);
    }
  void operator() (const FieldD &src_d_in, FieldD &sol_d){
    std::cout << GridLogMessage << "MixedPrecisionConjugateGradient: Starting mixed precision CG with outer tolerance " << Tolerance << " and inner tolerance " << InnerTolerance << std::endl;
    TotalInnerIterations = 0;
    GridStopWatch TotalTimer;
@ -99,7 +97,6 @@ NAMESPACE_BEGIN(Grid);
    FieldF sol_f(SinglePrecGrid);
    sol_f.Checkerboard() = cb;
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting initial inner CG with tolerance " << inner_tol << std::endl;
    ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
    CG_f.ErrorOnNoConverge = false;
@ -133,7 +130,6 @@ NAMESPACE_BEGIN(Grid);
 	(*guesser)(src_f, sol_f);
      //Inner CG
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " << outer_iter << " starting inner CG with tolerance " << inner_tol << std::endl;
      CG_f.Tolerance = inner_tol;
      InnerCGtimer.Start();
      CG_f(Linop_f, src_f, sol_f);
@ -154,7 +150,6 @@ NAMESPACE_BEGIN(Grid);
    ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
    CG_d(Linop_d, src_d_in, sol_d);
    TotalFinalStepIterations = CG_d.IterationsToComplete;
    TrueResidual = CG_d.TrueResidual;
    TotalTimer.Stop();
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
@ -52,7 +52,7 @@ public:
  MultiShiftFunction shifts;
  std::vector<RealD> TrueResidualShift;
-  ConjugateGradientMultiShift(Integer maxit, const MultiShiftFunction &_shifts) : 
+  ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : 
    MaxIterations(maxit),
    shifts(_shifts)
  { 
@ -182,9 +182,6 @@ public:
    for(int s=0;s<nshift;s++) {
      axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
    }
    std::cout << GridLogIterative << "ConjugateGradientMultiShift: initial rn (|src|^2) =" << rn << " qq (|MdagM src|^2) =" << qq << " d ( dot(src, [MdagM + m_0]src) ) =" << d << " c=" << c << std::endl;
  ///////////////////////////////////////
  // Timers
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
@ -1,409 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Christopher Kelly <ckelly@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
 #define GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
 NAMESPACE_BEGIN(Grid);
 //CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision. 
 //The residual is stored in single precision, but the search directions and solution are stored in double precision. 
 //Every update_freq iterations the residual is corrected in double precision. 
 //For safety, a final regular CG is applied to clean up if necessary
 //Linop to add shift to input linop, used in cleanup CG
 namespace ConjugateGradientMultiShiftMixedPrecSupport {

 // Adapter around an existing linear operator whose HermOp additionally adds
 // 'shift' times the input to the result. Only HermOp and HermOpAndNorm are
 // implemented; every other entry point hits assert(0).
 template<typename Field>
 class ShiftedLinop : public LinearOperatorBase<Field>
 {
 public:
   LinearOperatorBase<Field> &linop_base; // operator being wrapped (held by reference, not owned)
   RealD shift;                           // shift applied on top of linop_base.HermOp

   ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift)
     : linop_base(_linop_base),
       shift(_shift)
   {}

   // Entry points that the shifted operator does not provide
   void OpDiag(const Field &in, Field &out)
   {
     assert(0);
   }
   void OpDir(const Field &in, Field &out, int dir, int disp)
   {
     assert(0);
   }
   void OpDirAll(const Field &in, std::vector<Field> &out)
   {
     assert(0);
   }
   void Op(const Field &in, Field &out)
   {
     assert(0);
   }
   void AdjOp(const Field &in, Field &out)
   {
     assert(0);
   }

   // Apply the wrapped HermOp, then fold the shift term into 'out' with axpy
   void HermOp(const Field &in, Field &out)
   {
     linop_base.HermOp(in, out);
     axpy(out, shift, in, out);
   }

   // Shifted HermOp plus two reductions:
   //   n1 = real part of innerProduct(in,out),  n2 = norm2(out)
   void HermOpAndNorm(const Field &in, Field &out, RealD &n1, RealD &n2)
   {
     this->HermOp(in, out);
     const ComplexD overlap = innerProduct(in, out);
     n1 = real(overlap);
     n2 = norm2(out);
   }
 };

 }
 // Multi-shift conjugate gradient with the matrix multiplication performed in single precision.
 //
 // The residual is kept in single precision while the search directions and the solutions are
 // kept in double precision. Every ReliableUpdateFreq iterations the residual of the primary
 // shift (index 0) is recomputed in double precision. After all shifts report convergence the
 // true residual of every shift is measured, and any shift that misses its target is cleaned up
 // with a MixedPrecisionConjugateGradient solve on the shifted operator.
 //
 // Template parameters:
 //   FieldD  field type with getPrecision<FieldD>::value == 2
 //   FieldF  field type with getPrecision<FieldF>::value == 1
 template<class FieldD, class FieldF,
          typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
          typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
 class ConjugateGradientMultiShiftMixedPrec : public OperatorMultiFunction<FieldD>,
                                              public OperatorFunction<FieldD>
 {
 public:

   using OperatorFunction<FieldD>::operator();

   RealD   Tolerance;            // Not read by this solver; per-shift targets come from shifts.tolerances
   Integer MaxIterations;        // Upper bound on the number of multi-shift iterations
   Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
   std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
   int verbose;
   MultiShiftFunction shifts;    // Copy of the shift description (order, poles, tolerances, residues, norm)
   std::vector<RealD> TrueResidualShift; // Measured true residual per shift
   int ReliableUpdateFreq; //number of iterations between reliable updates (0 disables them)
   GridBase* SinglePrecGrid; //Grid for single-precision fields
   LinearOperatorBase<FieldF> &Linop_f; //single precision

   // maxit:              iteration cap, also passed to the cleanup solver
   // _shifts:            shift description, copied into 'shifts'
   // _SinglePrecGrid:    grid on which the single precision work fields are allocated
   // _Linop_f:           single precision operator (held by reference)
   // _ReliableUpdateFreq: iterations between double precision residual replacements
   //
   // The initializers are listed in member declaration order, which is the order in which
   // they are actually run.
   ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
                                        GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
                                        int _ReliableUpdateFreq
                                        ) :
     Tolerance(0.),
     MaxIterations(maxit),
     IterationsToComplete(0),
     verbose(1),
     shifts(_shifts),
     ReliableUpdateFreq(_ReliableUpdateFreq),
     SinglePrecGrid(_SinglePrecGrid),
     Linop_f(_Linop_f)
   {
     IterationsToCompleteShift.resize(_shifts.order);
     TrueResidualShift.resize(_shifts.order);
   }

   // Single-output form: solves all shifts into temporaries and returns their combination in psi.
   void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
   {
     GridBase *grid = src.Grid();
     int nshift = shifts.order;
     std::vector<FieldD> results(nshift,grid);
     (*this)(Linop,src,results,psi);
   }

   // Solves all shifts into 'results' and accumulates
   //   psi = shifts.norm*src + sum_i shifts.residues[i]*results[i]
   void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
   {
     int nshift = shifts.order;

     (*this)(Linop,src,results);

     psi = shifts.norm*src;
     for(int i=0;i<nshift;i++){
       psi = psi + shifts.residues[i]*results[i];
     }
   }

   // Core solver. psi_d must already hold shifts.order fields; psi_d[s] receives the solution
   // for shift s. Shift 0 must be the smallest pole (asserted).
   // On return IterationsToComplete, IterationsToCompleteShift and (when converged)
   // TrueResidualShift are filled in. If MaxIterations is exhausted the function logs a message
   // and returns with the latest iterates in psi_d; it does not assert.
   void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
   {
     GridBase *DoublePrecGrid = src_d.Grid();

     ////////////////////////////////////////////////////////////////////////
     // Convenience references to the info stored in "MultiShiftFunction"
     ////////////////////////////////////////////////////////////////////////
     int nshift = shifts.order;
     assert(nshift>0); // mass[0] is read unconditionally below

     std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
     std::vector<RealD> &mresidual(shifts.tolerances);
     std::vector<RealD> alpha(nshift,1.0);

     //Double precision search directions
     FieldD p_d(DoublePrecGrid);
     std::vector<FieldD> ps_d(nshift, DoublePrecGrid);// Search directions (double precision)

     FieldD tmp_d(DoublePrecGrid);
     FieldD r_d(DoublePrecGrid);
     FieldD mmp_d(DoublePrecGrid);

     assert(psi_d.size()==nshift);
     assert(mass.size()==nshift);
     assert(mresidual.size()==nshift);

     // Per-shift recurrence state. std::vector is used instead of variable-length arrays,
     // which are a compiler extension in C++; the indexing syntax is unchanged.
     std::vector<RealD> bs(nshift);
     std::vector<RealD> rsq(nshift);
     std::vector<std::vector<RealD> > z(nshift, std::vector<RealD>(2));
     std::vector<int> converged(nshift);

     const int       primary =0;

     //Primary shift fields CG iteration
     RealD a,b,c,d;
     RealD cp,bp,qq; //prev

     // Matrix mult fields
     FieldF r_f(SinglePrecGrid);
     FieldF p_f(SinglePrecGrid);
     FieldF tmp_f(SinglePrecGrid);
     FieldF mmp_f(SinglePrecGrid);
     FieldF src_f(SinglePrecGrid);
     precisionChange(src_f, src_d);

     // Check lightest mass
     for(int s=0;s<nshift;s++){
       assert( mass[s]>= mass[primary] );
       converged[s]=0;
     }

     // Wire guess to zero
     // Residuals "r" are src
     // First search direction "p" is also src
     cp = norm2(src_d);

     // Handle trivial case of zero src.
     if( cp == 0. ){
       for(int s=0;s<nshift;s++){
         psi_d[s] = Zero();
         IterationsToCompleteShift[s] = 1;
         TrueResidualShift[s] = 0.;
       }
       IterationsToComplete = 1; // keep the aggregate counter consistent with the per-shift ones
       return;
     }

     for(int s=0;s<nshift;s++){
       rsq[s] = cp * mresidual[s] * mresidual[s];
       std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
       ps_d[s] = src_d;
     }
     // r and p for primary
     r_f=src_f; //residual maintained in single
     p_f=src_f;
     p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys

     //MdagM+m[0]
     Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
     axpy(mmp_f,mass[0],p_f,mmp_f);
     RealD rn = norm2(p_f);
     d += rn*mass[0];

     b = -cp /d;

     // Set up the various shift variables
     int       iz=0;
     z[0][1-iz] = 1.0;
     z[0][iz]   = 1.0;
     bs[0]      = b;
     for(int s=1;s<nshift;s++){
       z[s][1-iz] = 1.0;
       z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0]));
       bs[s]      = b*z[s][iz];
     }

     // r += b[0] A.p[0]
     // c= norm(r)
     c=axpy_norm(r_f,b,mmp_f,r_f);

     for(int s=0;s<nshift;s++) {
       axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
     }

     ///////////////////////////////////////
     // Timers
     ///////////////////////////////////////
     GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;

     SolverTimer.Start();

     // Iteration loop
     int k;

     for (k=1;k<=MaxIterations;k++){

       a = c /cp;

       //Update double precision search direction by residual
       PrecChangeTimer.Start();
       precisionChange(r_d, r_f);
       PrecChangeTimer.Stop();

       AXPYTimer.Start();
       axpy(p_d,a,p_d,r_d);

       for(int s=0;s<nshift;s++){
         if ( ! converged[s] ) {
           if (s==0){
             axpy(ps_d[s],a,ps_d[s],r_d);
           } else{
             RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
             axpby(ps_d[s],z[s][iz],as,r_d,ps_d[s]);
           }
         }
       }
       AXPYTimer.Stop();

       PrecChangeTimer.Start();
       precisionChange(p_f, p_d); //get back single prec search direction for linop
       PrecChangeTimer.Stop();

       cp=c;
       MatrixTimer.Start();
       Linop_f.HermOp(p_f,mmp_f);
       d=real(innerProduct(p_f,mmp_f));
       MatrixTimer.Stop();

       AXPYTimer.Start();
       axpy(mmp_f,mass[0],p_f,mmp_f);
       AXPYTimer.Stop();
       RealD rn = norm2(p_f);
       d += rn*mass[0];

       bp=b;
       b=-cp/d;

       // Toggle the recurrence history
       bs[0] = b;
       iz = 1-iz;
       ShiftTimer.Start();
       for(int s=1;s<nshift;s++){
         if((!converged[s])){
           RealD z0 = z[s][1-iz];
           RealD z1 = z[s][iz];
           z[s][iz] = z0*z1*bp
             / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
           bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
         }
       }
       ShiftTimer.Stop();

       //Update double precision solutions
       AXPYTimer.Start();
       for(int s=0;s<nshift;s++){
         if( (!converged[s]) ) {
           axpy(psi_d[s],-bs[s]*alpha[s],ps_d[s],psi_d[s]);
         }
       }

       //Perform reliable update if necessary; otherwise update residual from single-prec mmp
       RealD c_f = axpy_norm(r_f,b,mmp_f,r_f);
       AXPYTimer.Stop();

       c = c_f;

       // A frequency of zero would make the modulus undefined; treat it as "no reliable updates".
       if(ReliableUpdateFreq != 0 && k % ReliableUpdateFreq == 0){
         //Replace r with true residual
         MatrixTimer.Start();
         Linop_d.HermOp(psi_d[0],mmp_d);
         MatrixTimer.Stop();

         AXPYTimer.Start();
         axpy(mmp_d,mass[0],psi_d[0],mmp_d);

         RealD c_d = axpy_norm(r_d, -1.0, mmp_d, src_d);
         AXPYTimer.Stop();

         std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_f <<" with |r|^2 = "<<c_d<<std::endl;

         PrecChangeTimer.Start();
         precisionChange(r_f, r_d);
         PrecChangeTimer.Stop();
         c = c_d;
       }

       // Convergence checks
       int all_converged = 1;
       for(int s=0;s<nshift;s++){

         if ( (!converged[s]) ){
           IterationsToCompleteShift[s] = k;

           RealD css  = c * z[s][iz]* z[s][iz];

           if(css<rsq[s]){
             std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
             converged[s]=1;
           } else {
             all_converged=0;
           }

         }
       }

       if ( all_converged ){

         SolverTimer.Stop();
         std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
         std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;

         // Check answers
         // |src|^2 does not depend on the shift, so it is reduced once for the whole loop
         const RealD cn = norm2(src_d);
         for(int s=0; s < nshift; s++) {
           Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
           axpy(tmp_d,mass[s],psi_d[s],mmp_d);
           axpy(r_d,-alpha[s],src_d,tmp_d);
           RealD rn = norm2(r_d);
           TrueResidualShift[s] = std::sqrt(rn/cn);
           std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;

           //If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
           if(rn >= rsq[s]){
             CleanupTimer.Start();
             std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: performing cleanup step for shift " << s << std::endl;

             //Setup linear operators for final cleanup
             ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
             ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);

             MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d);
             cg(src_d, psi_d[s]);

             TrueResidualShift[s] = cg.TrueResidual;
             CleanupTimer.Stop();
           }
         }

         std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrec: Time Breakdown for body"<<std::endl;
         std::cout << GridLogMessage << "\tSolver    " << SolverTimer.Elapsed()     <<std::endl;
         std::cout << GridLogMessage << "\t\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
         std::cout << GridLogMessage << "\t\tMatrix    " << MatrixTimer.Elapsed()     <<std::endl;
         std::cout << GridLogMessage << "\t\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
         std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed()     <<std::endl;
         std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed()     <<std::endl;
         std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;

         IterationsToComplete = k;

         return;
       }

     }
     // ugly hack
     std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
     // Record how many iterations were actually run so the counter is never left stale
     IterationsToComplete = k-1;
     //  assert(0);
   }

 };
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/algorithms/iterative/Deflation.h
+++ b/Grid/algorithms/iterative/Deflation.h
@ -113,7 +113,43 @@ public:
    blockPromote(guess_coarse,guess,subspace);
    guess.Checkerboard() = src.Checkerboard();
  };
-};
+
  // Batched deflation guess: builds guess[j] for every source src[j].
  //
  // Steps, as performed below:
  //   1. each src[j] is block-projected onto the coarse grid through 'subspace';
  //   2. for every stored coarse eigenvector i and every source j the term
  //      innerProduct(evec_coarse[i],src_coarse[j]) / eval_coarse[i] times evec_coarse[i]
  //      is accumulated into guess_coarse[j] via axpy;
  //   3. each guess_coarse[j] is block-promoted into guess[j], which then takes the
  //      checkerboard of src[j].
  //
  // Preconditions (asserted): at least one coarse eigenvector is stored, and 'guess'
  // already holds at least src.size() fields.
  void operator()(const std::vector<FineField> &src,std::vector<FineField> &guess) {
    int Nevec = (int)evec_coarse.size();
    int Nsrc = (int)src.size();

    // evec_coarse[0] supplies the coarse grid below and guess[j] is written for all j<Nsrc
    assert(Nevec > 0);
    assert(guess.size() >= src.size());

    // make temp variables
    std::vector<CoarseField> src_coarse(Nsrc,evec_coarse[0].Grid());
    std::vector<CoarseField> guess_coarse(Nsrc,evec_coarse[0].Grid());

    //Preprocessing
    std::cout << GridLogMessage << "Start BlockProject for loop" << std::endl;
    for (int j=0;j<Nsrc;j++)
    {
      guess_coarse[j] = Zero();
      std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
      blockProject(src_coarse[j],src[j],subspace);
    }

    //deflation set up for eigen vector batchsize 1 and source batch size equal number of sources
    std::cout << GridLogMessage << "Start ProjectAccum for loop" << std::endl;
    for (int i=0;i<Nevec;i++)
    {
      std::cout << GridLogMessage << "ProjectAccum Nvec: " << i << std::endl;
      const CoarseField & tmp = evec_coarse[i];
      for (int j=0;j<Nsrc;j++)
      {
        axpy(guess_coarse[j],TensorRemove(innerProduct(tmp,src_coarse[j])) / eval_coarse[i],tmp,guess_coarse[j]);
      }
    }

    //postprocessing
    std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl;
    for (int j=0;j<Nsrc;j++)
    {
      // This loop promotes, so the progress line is labelled accordingly
      std::cout << GridLogMessage << "BlockPromote iter: " << j << std::endl;
      blockPromote(guess_coarse[j],guess[j],subspace);
      guess[j].Checkerboard() = src[j].Checkerboard();
    }
  };
  };
--- a/Grid/algorithms/iterative/LocalCoherenceLanczos.h
+++ b/Grid/algorithms/iterative/LocalCoherenceLanczos.h
@ -44,7 +44,6 @@ public:
 				  int, MinRes);    // Must restart
 };
 //This class is the input parameter class for some testing programs
 struct LocalCoherenceLanczosParams : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
@ -156,7 +155,6 @@ public:
      _coarse_relax_tol(coarse_relax_tol)  
  {    };
  //evalMaxApprox: approximation of largest eval of the fine Chebyshev operator (suitably wrapped by block projection)
  int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  {
    CoarseField v(B);
@ -183,16 +181,8 @@ public:
    if( (vv<eresid*eresid) ) conv = 1;
    return conv;
  }
-
+  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  //This function is called at the end of the coarse grid Lanczos. It promotes the coarse eigenvector 'B' to the fine grid,
  //applies a smoother to the result, then computes the *fine grid* eigenvalue (output as 'eval').
  //evalMaxApprox should be the approximation of the largest eval of the fine Hermop. However when this function is called by IRL it actually passes the largest eval of the *Chebyshev* operator (as this is the max approx used for the TestConvergence above)
  //As the largest eval of the Chebyshev is typically several orders of magnitude larger this makes the convergence test pass even when it should not.
  //We therefore ignore evalMaxApprox here and use a value of 1.0 (note this value is already used by TestCoarse)
  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)  
  {
    evalMaxApprox = 1.0; //cf above
    GridBase *FineGrid = _subspace[0].Grid();    
    int checkerboard   = _subspace[0].Checkerboard();
    FineField fB(FineGrid);fB.Checkerboard() =checkerboard;
@ -211,13 +201,13 @@ public:
    eval   = vnum/vden;
    fv -= eval*fB;
    RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);
-    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
+
    std::cout.precision(13);
    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
 	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
-	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv << " target " << eresid*eresid
+	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
 	     <<std::endl;
    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
    if( (vv<eresid*eresid) ) return 1;
    return 0;
  }
@ -295,10 +285,6 @@ public:
    evals_coarse.resize(0);
  };
  //The block inner product is the inner product on the fine grid locally summed over the blocks
  //to give a Lattice<Scalar> on the coarse grid. This function orthonormalizes the fine-grid subspace
  //vectors under the block inner product. This step must be performed after computing the fine grid
  //eigenvectors and before computing the coarse grid eigenvectors.    
  void Orthogonalise(void ) {
    CoarseScalar InnerProd(_CoarseGrid);
    std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
@ -342,8 +328,6 @@ public:
    }
  }
  //While this method serves to check the coarse eigenvectors, it also recomputes the eigenvalues from the smoothed reconstructed eigenvectors
  //hence the smoother can be tuned after running the coarse Lanczos by using a different smoother here
  void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) 
  {
    assert(evals_fine.size() == nbasis);
@ -392,31 +376,25 @@ public:
    evals_fine.resize(nbasis);
    subspace.resize(nbasis,_FineGrid);
  }
  //cheby_op: Parameters of the fine grid Chebyshev polynomial used for the Lanczos acceleration
  //cheby_smooth: Parameters of a separate Chebyshev polynomial used after the Lanczos has completed to smooth out high frequency noise in the reconstructed fine grid eigenvectors prior to computing the eigenvalue
  //relax: Reconstructed eigenvectors (post smoothing) are naturally not as precise as true eigenvectors. This factor acts as a multiplier on the stopping condition when determining whether the results satisfy the user provided stopping condition
  void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
 		  int Nstop, int Nk, int Nm,RealD resid, 
 		  RealD MaxIt, RealD betastp, int MinRes)
  {
-    Chebyshev<FineField>                          Cheby(cheby_op); //Chebyshev of fine operator on fine grid
+    Chebyshev<FineField>                          Cheby(cheby_op);
-    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace); //Fine operator on coarse grid with intermediate fine grid conversion
+    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace);
-    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace); //Chebyshev of fine operator on coarse grid with intermediate fine grid conversion
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
    //////////////////////////////////////////////////////////////////////////////////////////////////
-    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth); //lower order Chebyshev of fine operator on fine grid used to smooth regenerated eigenvectors
+    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth);
-    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax); 
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
    evals_coarse.resize(Nm);
    evec_coarse.resize(Nm,_CoarseGrid);
    CoarseField src(_CoarseGrid);     src=1.0; 
    //Note the "tester" here is also responsible for generating the fine grid eigenvalues which are output into the "evals_coarse" array
    ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
    int Nconv=0;
    IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
@ -427,14 +405,6 @@ public:
      std::cout << i << " Coarse eval = " << evals_coarse[i]  << std::endl;
    }
  }
  //Get the fine eigenvector 'i' by reconstruction
  void getFineEvecEval(FineField &evec, RealD &eval, const int i) const
  {
    // Hand back the eigenvalue stored at index i
    eval = evals_coarse[i];

    // Rebuild the fine-grid vector: promote coarse eigenvector i through the block basis 'subspace'
    blockPromote(evec_coarse[i], evec, subspace);
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/algorithms/iterative/PowerMethod.h
+++ b/Grid/algorithms/iterative/PowerMethod.h
@ -29,8 +29,6 @@ template<class Field> class PowerMethod
      RealD vnum = real(innerProduct(src_n,tmp)); // HermOp. 
      RealD vden = norm2(src_n); 
      RealD na = vnum/vden; 
      std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
      if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) { 
 	evalMaxApprox = na; 
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@ -53,11 +53,10 @@ public:
  // Communicator should know nothing of the physics grid, only processor grid.
  ////////////////////////////////////////////
  int              _Nprocessors;     // How many in all
  int              _processor;       // linear processor rank
  unsigned long    _ndimension;
  Coordinate _shm_processors;  // Which dimensions get relayed out over processors lanes.
  Coordinate _processors;      // Which dimensions get relayed out over processors lanes.
  int              _processor;       // linear processor rank
  Coordinate _processor_coor;  // linear processor coordinate
  unsigned long    _ndimension;
  static Grid_MPI_Comm      communicator_world;
  Grid_MPI_Comm             communicator;
  std::vector<Grid_MPI_Comm> communicator_halo;
@ -98,9 +97,8 @@ public:
  int                      BossRank(void)          ;
  int                      ThisRank(void)          ;
  const Coordinate & ThisProcessorCoor(void) ;
  const Coordinate & ShmGrid(void)  { return _shm_processors; }  ;
  const Coordinate & ProcessorGrid(void)     ;
-  int                ProcessorCount(void)    ;
+  int                      ProcessorCount(void)    ;
  ////////////////////////////////////////////////////////////////////////////////
  // very VERY rarely (Log, serial RNG) we need world without a grid
@ -144,16 +142,16 @@ public:
 		      int bytes);
  double StencilSendToRecvFrom(void *xmit,
-			       int xmit_to_rank,int do_xmit,
+			       int xmit_to_rank,
 			       void *recv,
-			       int recv_from_rank,int do_recv,
+			       int recv_from_rank,
 			       int bytes,int dir);
  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 				    void *xmit,
-				    int xmit_to_rank,int do_xmit,
+				    int xmit_to_rank,
 				    void *recv,
-				    int recv_from_rank,int do_recv,
+				    int recv_from_rank,
 				    int bytes,int dir);
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@ -106,7 +106,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
  // Remap using the shared memory optimising routine
  // The remap creates a comm which must be freed
  ////////////////////////////////////////////////////
-  GlobalSharedMemory::OptimalCommunicator    (processors,optimal_comm,_shm_processors);
+  GlobalSharedMemory::OptimalCommunicator    (processors,optimal_comm);
  InitFromMPICommunicator(processors,optimal_comm);
  SetCommunicator(optimal_comm);
  ///////////////////////////////////////////////////
@ -124,13 +124,12 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
  int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
  Coordinate parent_processor_coor(_ndimension,0);
  Coordinate parent_processors    (_ndimension,1);
-  Coordinate shm_processors       (_ndimension,1);
+
  // Can make 5d grid from 4d etc...
  int pad = _ndimension-parent_ndimension;
  for(int d=0;d<parent_ndimension;d++){
    parent_processor_coor[pad+d]=parent._processor_coor[d];
    parent_processors    [pad+d]=parent._processors[d];
    shm_processors       [pad+d]=parent._shm_processors[d];
  }
  //////////////////////////////////////////////////////////////////////////////////////////////////////
@ -155,7 +154,6 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
    ccoor[d] = parent_processor_coor[d] % processors[d];
    scoor[d] = parent_processor_coor[d] / processors[d];
    ssize[d] = parent_processors[d]     / processors[d];
    if ( processors[d] < shm_processors[d] ) shm_processors[d] = processors[d]; // subnode splitting.
  }
  // rank within subcomm ; srank is rank of subcomm within blocks of subcomms
@ -337,22 +335,22 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 }
 // Basic Halo comms primitive
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
-						     int dest, int dox,
+						     int dest,
 						     void *recv,
-						     int from, int dor,
+						     int from,
 						     int bytes,int dir)
 {
  std::vector<CommsRequest_t> list;
-  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,dir);
+  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
  StencilSendToRecvFromComplete(list,dir);
  return offbytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
-							 int dest,int dox,
+							 int dest,
 							 void *recv,
-							 int from,int dor,
+							 int from,
 							 int bytes,int dir)
 {
  int ncomm  =communicator_halo.size();
@ -372,36 +370,32 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  double off_node_bytes=0.0;
  int tag;
-  if ( dox ) {
+  if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
-    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
+    tag= dir+from*32;
-      tag= dir+from*32;
+    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
-      ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+    assert(ierr==0);
-      assert(ierr==0);
+    list.push_back(rrq);
-      list.push_back(rrq);
+    off_node_bytes+=bytes;
      off_node_bytes+=bytes;
    }
  }
-  
+
-  if (dor) {
+  if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
-    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
+    tag= dir+_processor*32;
-      tag= dir+_processor*32;
+    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
-      ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+    assert(ierr==0);
-      assert(ierr==0);
+    list.push_back(xrq);
-      list.push_back(xrq);
+    off_node_bytes+=bytes;
-      off_node_bytes+=bytes;
+  } else {
    } else {
    // TODO : make a OMP loop on CPU, call threaded bcopy
-      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
+    void *shm = (void *) this->ShmBufferTranslate(dest,recv);
-      assert(shm!=NULL);
+    assert(shm!=NULL);
-      //    std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl;
+    //    std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl;
-      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
+    acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
    }
  }
  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
    this->StencilSendToRecvFromComplete(list,dir);
  }
  //  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
  //    this->StencilSendToRecvFromComplete(list,dir);
  //  }
  return off_node_bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@ -45,14 +45,12 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
 // Sub-communicator constructor for this comms implementation.
 // Delegates to the single-argument constructor, then sets every entry of
 // _shm_processors to 1, reports srank = 0 through the reference argument and
 // selects communicator_world. The parent argument is not read here.
 CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank) 
  : CartesianCommunicator(processors) 
 {
  _shm_processors = Coordinate(processors.size(),1);
  srank=0;
  SetCommunicator(communicator_world);
 }
 CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
 {
  _shm_processors = Coordinate(processors.size(),1);
  _processors = processors;
  _ndimension = processors.size();  assert(_ndimension>=1);
  _processor_coor.resize(_ndimension);
@ -113,18 +111,18 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
 }
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
-						     int xmit_to_rank,int dox,
+						     int xmit_to_rank,
 						     void *recv,
-						     int recv_from_rank,int dor,
+						     int recv_from_rank,
 						     int bytes, int dir)
 {
  return 2.0*bytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
-							 int xmit_to_rank,int dox,
+							 int xmit_to_rank,
 							 void *recv,
-							 int recv_from_rank,int dor,
+							 int recv_from_rank,
 							 int bytes, int dir)
 {
  return 2.0*bytes;
--- a/Grid/communicator/SharedMemory.h
+++ b/Grid/communicator/SharedMemory.h
@ -93,10 +93,9 @@ public:
  // Create an optimal reordered communicator that makes MPI_Cart_create get it right
  //////////////////////////////////////////////////////////////////////////////////////
  static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
-  // Turns MPI_COMM_WORLD into right layout for Cartesian
+  static void OptimalCommunicator            (const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
-  static void OptimalCommunicator            (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims); 
+  static void OptimalCommunicatorHypercube   (const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
-  static void OptimalCommunicatorHypercube   (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims); 
+  static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
  static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims); 
  static void GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims);
  ///////////////////////////////////////////////////
  // Provide shared memory facilities off comm world
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@ -152,7 +152,7 @@ int Log2Size(int TwoToPower,int MAXLOG2)
  }
  return log2size;
 }
-void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
+void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
 {
  //////////////////////////////////////////////////////////////////////////////
  // Look and see if it looks like an HPE 8600 based on hostname conventions
@ -165,8 +165,8 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
  gethostname(name,namelen);
  int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
-  if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM);
+  if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm);
-  else                          OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM);
+  else                          OptimalCommunicatorSharedMemory(processors,optimal_comm);
 }
 static inline int divides(int a,int b)
 {
@ -221,7 +221,7 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
    dim=(dim+1) %ndimension;
  }
 }
-void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
+void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
 {
  ////////////////////////////////////////////////////////////////
  // Assert power of two shm_size.
@ -294,8 +294,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
  Coordinate HyperCoor(ndimension);
  GetShmDims(WorldDims,ShmDims);
-  SHM = ShmDims;
+
  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
  ////////////////////////////////////////////////////////////////
@ -342,7 +341,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
  int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
  assert(ierr==0);
 }
-void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
+void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
 {
  ////////////////////////////////////////////////////////////////
  // Identify subblock of ranks on node spreading across dims
@ -354,8 +353,6 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
  Coordinate ShmCoor(ndimension);    Coordinate NodeCoor(ndimension);   Coordinate WorldCoor(ndimension);
  GetShmDims(WorldDims,ShmDims);
  SHM=ShmDims;
  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
  ////////////////////////////////////////////////////////////////
--- a/Grid/communicator/SharedMemoryNone.cc
+++ b/Grid/communicator/SharedMemoryNone.cc
@ -48,10 +48,9 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
  _ShmSetup=1;
 }
-void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
+void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
 {
  optimal_comm = WorldComm;
  SHM = Coordinate(processors.size(),1);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////
--- a/Grid/json/json.hpp
+++ b/Grid/json/json.hpp
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@ -46,4 +46,3 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_unary.h>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
 #include <Grid/lattice/Lattice_crc.h>
--- a/Grid/lattice/Lattice_crc.h
+++ b/Grid/lattice/Lattice_crc.h
@ -1,55 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_crc.h
    Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1)
 {
  auto ff = localNorm2(f);
  if ( mu==-1 ) mu = f.Grid()->Nd()-1;
  typedef typename vobj::tensor_reduced normtype;
  typedef typename normtype::scalar_object scalar;
  std::vector<scalar> sff;
  sliceSum(ff,sff,mu);
  for(int t=0;t<sff.size();t++){
    std::cout << s<<" "<<t<<" "<<sff[t]<<std::endl;
  }
 }
 // CRC32 over the raw bytes of the lattice's storage as seen through a
 // CpuRead view: sizeof(vobj)*buf.oSites() bytes starting at element 0.
 template<class vobj> uint32_t crc(Lattice<vobj> & buf)
 {
  autoView( buf_v , buf, CpuRead);
  unsigned char *first_byte = (unsigned char *)&buf_v[0];
  size_t         byte_count = (size_t)sizeof(vobj)*buf.oSites();
  return ::crc32(0L,first_byte,byte_count);
 }
 // Debug aid: writes "FingerPrint <file> <line> <text of U> <crc(U)>" to std::cout.
 // The expansion already ends in a semicolon.
 #define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
 NAMESPACE_END(Grid);
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@ -28,6 +28,9 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #if defined(GRID_CUDA)||defined(GRID_HIP)
 #include <Grid/lattice/Lattice_reduction_gpu.h>
 #endif
 #if defined(GRID_SYCL)
 #include <Grid/lattice/Lattice_reduction_sycl.h>
 #endif
 NAMESPACE_BEGIN(Grid);
@ -127,7 +130,7 @@ inline Double max(const Double *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  return sum_gpu(arg,osites);
 #else
  return sum_cpu(arg,osites);
@ -136,7 +139,7 @@ inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  return sumD_gpu(arg,osites);
 #else
  return sumD_cpu(arg,osites);
@ -145,7 +148,7 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  return sumD_gpu_large(arg,osites);
 #else
  return sumD_cpu(arg,osites);
@ -155,13 +158,13 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 {
 #if defined(GRID_CUDA)||defined(GRID_HIP)
  autoView( arg_v, arg, AcceleratorRead);
  Integer osites = arg.Grid()->oSites();
-  auto ssum= sum_gpu(&arg_v[0],osites);
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  typename vobj::scalar_object ssum;
  autoView( arg_v, arg, AcceleratorRead);
  ssum= sum_gpu(&arg_v[0],osites);
 #else
  autoView(arg_v, arg, CpuRead);
  Integer osites = arg.Grid()->oSites();
  auto ssum= sum_cpu(&arg_v[0],osites);
 #endif  
  arg.Grid()->GlobalSum(ssum);
@ -171,7 +174,7 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 template<class vobj>
 inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  autoView( arg_v, arg, AcceleratorRead);
  Integer osites = arg.Grid()->oSites();
  auto ssum= sum_gpu_large(&arg_v[0],osites);
@ -235,11 +238,10 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
  typedef decltype(innerProductD(vobj(),vobj())) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  {
    autoView( left_v , left, AcceleratorRead);
    autoView( right_v,right, AcceleratorRead);
-
+    // This code could read coalesce
    // GPU - SIMT lane compliance...
    accelerator_for( ss, sites, 1,{
 	auto x_l = left_v[ss];
--- a/Grid/lattice/Lattice_reduction_sycl.h
+++ b/Grid/lattice/Lattice_reduction_sycl.h
@ -0,0 +1,125 @@
 NAMESPACE_BEGIN(Grid);
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Possibly promote to double and sum
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Sum Reduce(lat[i]) for i in [0,osites) on the SYCL queue theGridAccelerator
 // and return the total converted to vobj::scalar_objectD.
 //
 //   lat    : pointer to osites objects, dereferenced inside the device kernel
 //   osites : number of elements to reduce
 //
 // The accumulation itself is carried out in vobj::scalar_object; conversion to
 // scalar_objectD happens once, on the final value.
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites) 
 {
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_objectD sobjD;

  sobj identity; zeroit(identity);

  // USM shared scalar that receives the reduction result.
  sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator);
  // The allocation is uninitialised and a SYCL reduction combines into the
  // value already held by the reduction variable, so it must start at the
  // identity or the result begins from whatever was in that memory.
  mysum[0] = identity;

  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
     auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>());
     cgh.parallel_for(cl::sycl::range<1>{osites},
                      Reduction,
                      [=] (cl::sycl::id<1> item, auto &sum) {
      auto osite   = item[0];
      sum +=Reduce(lat[osite]);
     });
   });
  // Block until the kernel has finished before reading the result on the host.
  theGridAccelerator->wait();

  sobj ret ;
  ret = mysum[0];
  free(mysum,*theGridAccelerator);

  sobjD dret; convertType(dret,ret);
  return dret;
 }
 // Forwards unchanged to sumD_gpu_tensor and returns its result.
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
 {
  typedef typename vobj::scalar_objectD sobjD;
  sobjD total = sumD_gpu_tensor(lat,osites);
  return total;
 }
 // Forwards unchanged to sumD_gpu_large and returns its result.
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
 {
  typedef typename vobj::scalar_objectD sobjD;
  sobjD total = sumD_gpu_large(lat,osites);
  return total;
 }
 // Default scalar_objectD sum: forwards unchanged to sumD_gpu_large.
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
 {
  typedef typename vobj::scalar_objectD sobjD;
  sobjD total = sumD_gpu_large(lat,osites);
  return total;
 }
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Return in the same precision as the input; the sum is obtained from the sumD_gpu* (scalar_objectD) routines and narrowed on assignment
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Sum osites elements via sumD_gpu and hand the total back as
 // vobj::scalar_object.
 template <class vobj>
 inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites) 
 {
  using scalar_t = typename vobj::scalar_object;
  // Default-construct first, then assign: the scalar_objectD -> scalar_object
  // conversion goes through the assignment operator.
  scalar_t narrowed;
  narrowed = sumD_gpu(lat,osites);
  return narrowed;
 }
 // Sum osites elements via sumD_gpu_large and hand the total back as
 // vobj::scalar_object.
 template <class vobj>
 inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
 {
  using scalar_t = typename vobj::scalar_object;
  // Default-construct first, then assign: the scalar_objectD -> scalar_object
  // conversion goes through the assignment operator.
  scalar_t narrowed;
  narrowed = sumD_gpu_large(lat,osites);
  return narrowed;
 }
 NAMESPACE_END(Grid);
 /*
 template<class Double> Double svm_reduce(Double *vec,uint64_t L)
 {
  Double sumResult; zeroit(sumResult);
  Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator);
  Double identity;  zeroit(identity);
  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
     auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>());
     cgh.parallel_for(cl::sycl::range<1>{L},
 		      Reduction,
 		      [=] (cl::sycl::id<1> index, auto &sum) {
 	 sum +=vec[index];
     });
   });
  theGridAccelerator->wait();
  Double ret = d_sum[0];
  free(d_sum,*theGridAccelerator);
  std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl;
  return ret;
 }
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
 {
  typedef typename vobj::vector_type  vector;
  typedef typename vobj::scalar_type  scalar;
  typedef typename vobj::scalar_typeD scalarD;
  typedef typename vobj::scalar_objectD sobjD;
  sobjD ret;
  scalarD *ret_p = (scalarD *)&ret;
  const int nsimd = vobj::Nsimd();
  const int words = sizeof(vobj)/sizeof(vector);
  Vector<scalar> buffer(osites*nsimd);
  scalar *buf = &buffer[0];
  vector *dat = (vector *)lat;
  for(int w=0;w<words;w++) {
    accelerator_for(ss,osites,nsimd,{
 	int lane = acceleratorSIMTlane(nsimd);
 	buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
    });
     //Precision change at this point is too late to gain precision
    ret_p[w] = svm_reduce(buf,nsimd*osites);
  }
  return ret;
 }
 */
--- a/Grid/log/Log.cc
+++ b/Grid/log/Log.cc
@ -69,7 +69,6 @@ GridLogger GridLogDebug  (1, "Debug", GridLogColours, "PURPLE");
 GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
 GridLogger GridLogIterative  (1, "Iterative", GridLogColours, "BLUE");
 GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
 GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE");
 void GridLogConfigure(std::vector<std::string> &logstreams) {
  GridLogError.Active(0);
@ -80,7 +79,6 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
  GridLogPerformance.Active(0);
  GridLogIntegrator.Active(1);
  GridLogColours.Active(0);
  GridLogHMC.Active(1);
  for (int i = 0; i < logstreams.size(); i++) {
    if (logstreams[i] == std::string("Error"))       GridLogError.Active(1);
@ -89,8 +87,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
    if (logstreams[i] == std::string("Iterative"))   GridLogIterative.Active(1);
    if (logstreams[i] == std::string("Debug"))       GridLogDebug.Active(1);
    if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
-    if (logstreams[i] == std::string("NoIntegrator"))  GridLogIntegrator.Active(0);
+    if (logstreams[i] == std::string("Integrator"))  GridLogIntegrator.Active(1);
    if (logstreams[i] == std::string("NoHMC"))         GridLogHMC.Active(0);
    if (logstreams[i] == std::string("Colours"))     GridLogColours.Active(1);
  }
 }
--- a/Grid/log/Log.h
+++ b/Grid/log/Log.h
@ -182,7 +182,6 @@ extern GridLogger GridLogDebug  ;
 extern GridLogger GridLogPerformance;
 extern GridLogger GridLogIterative  ;
 extern GridLogger GridLogIntegrator  ;
 extern GridLogger GridLogHMC;
 extern Colours    GridLogColours;
 std::string demangle(const char* name) ;
--- a/Grid/parallelIO/IldgIO.h
+++ b/Grid/parallelIO/IldgIO.h
@ -31,6 +31,7 @@ directory
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <string>
 #include <map>
 #include <pwd.h>
@ -654,7 +655,8 @@ class IldgWriter : public ScidacWriter {
    // Fill ILDG header data struct
    //////////////////////////////////////////////////////
    ildgFormat ildgfmt ;
-    ildgfmt.field     = std::string("su3gauge");
+    const std::string stNC = std::to_string( Nc ) ;
    ildgfmt.field          = std::string("su"+stNC+"gauge");
    if ( format == std::string("IEEE32BIG") ) { 
      ildgfmt.precision = 32;
@ -871,7 +873,8 @@ class IldgReader : public GridLimeReader {
    } else { 
      assert(found_ildgFormat);
-      assert ( ildgFormat_.field == std::string("su3gauge") );
+      const std::string stNC = std::to_string( Nc ) ;
      assert ( ildgFormat_.field == std::string("su"+stNC+"gauge") );
      ///////////////////////////////////////////////////////////////////////////////////////
      // Populate our Grid metadata as best we can
@ -879,7 +882,7 @@ class IldgReader : public GridLimeReader {
      std::ostringstream vers; vers << ildgFormat_.version;
      FieldMetaData_.hdr_version = vers.str();
-      FieldMetaData_.data_type = std::string("4D_SU3_GAUGE_3X3");
+      FieldMetaData_.data_type = std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC);
      FieldMetaData_.nd=4;
      FieldMetaData_.dimension.resize(4);
--- a/Grid/parallelIO/MetaData.h
+++ b/Grid/parallelIO/MetaData.h
@ -6,8 +6,8 @@
    Copyright (C) 2015
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@ -182,8 +182,8 @@ class GaugeStatistics
 public:
  void operator()(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
  {
-    header.link_trace=WilsonLoops<Impl>::linkTrace(data);
+    header.link_trace = WilsonLoops<Impl>::linkTrace(data);
-    header.plaquette =WilsonLoops<Impl>::avgPlaquette(data);
+    header.plaquette  = WilsonLoops<Impl>::avgPlaquette(data);
  }
 };
 typedef GaugeStatistics<PeriodicGimplD> PeriodicGaugeStatistics;
@ -203,20 +203,24 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
 //////////////////////////////////////////////////////////////////////
 inline void reconstruct3(LorentzColourMatrix & cm)
 {
-  const int x=0;
+  assert( Nc < 4 && Nc > 1 ) ;
  const int y=1;
  const int z=2;
  for(int mu=0;mu<Nd;mu++){
-    cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
+    #if Nc == 2
-    cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
+      cm(mu)()(1,0) = -adj(cm(mu)()(0,y)) ;
-    cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
+      cm(mu)()(1,1) =  adj(cm(mu)()(0,x)) ;
    #else
      const int x=0 , y=1 , z=2 ; // a little disinenuous labelling
      cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
      cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
      cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
    #endif
  }
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Some data types for intermediate storage
 ////////////////////////////////////////////////////////////////////////////////
-template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;
+template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, Nc-1>, Nd >;
 typedef iLorentzColour2x3<Complex>  LorentzColour2x3;
 typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
@ -278,7 +282,6 @@ struct GaugeSimpleMunger{
 template <class fobj, class sobj>
 struct GaugeSimpleUnmunger {
  void operator()(sobj &in, fobj &out) {
    for (int mu = 0; mu < Nd; mu++) {
      for (int i = 0; i < Nc; i++) {
@ -317,8 +320,8 @@ template<class fobj,class sobj>
 struct Gauge3x2munger{
  void operator() (fobj &in,sobj &out){
    for(int mu=0;mu<Nd;mu++){
-      for(int i=0;i<2;i++){
+      for(int i=0;i<Nc-1;i++){
-	for(int j=0;j<3;j++){
+	for(int j=0;j<Nc;j++){
 	  out(mu)()(i,j) = in(mu)(i)(j);
 	}}
    }
@ -330,8 +333,8 @@ template<class fobj,class sobj>
 struct Gauge3x2unmunger{
  void operator() (sobj &in,fobj &out){
    for(int mu=0;mu<Nd;mu++){
-      for(int i=0;i<2;i++){
+      for(int i=0;i<Nc-1;i++){
-	for(int j=0;j<3;j++){
+	for(int j=0;j<Nc;j++){
 	  out(mu)(i)(j) = in(mu)()(i,j);
 	}}
    }
--- a/Grid/parallelIO/NerscIO.h
+++ b/Grid/parallelIO/NerscIO.h
@ -9,6 +9,7 @@
    Author: Matt Spraggs <matthew.spraggs@gmail.com>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@ -30,6 +31,8 @@
 #ifndef GRID_NERSC_IO_H
 #define GRID_NERSC_IO_H
 #include <string>
 NAMESPACE_BEGIN(Grid);
 using namespace Grid;
@ -39,10 +42,8 @@ using namespace Grid;
 ////////////////////////////////////////////////////////////////////////////////
 class NerscIO : public BinaryIO { 
 public:
  typedef Lattice<vLorentzColourMatrixD> GaugeField;
-  // Enable/disable exiting if the plaquette in the header does not match the value computed (default true)
+  typedef Lattice<vLorentzColourMatrixD> GaugeField;
  static bool & exitOnReadPlaquetteMismatch(){ static bool v=true; return v; }
  static inline void truncate(std::string file){
    std::ofstream fout(file,std::ios::out);
@ -147,15 +148,17 @@ public:
    std::string format(header.floating_point);
-    int ieee32big = (format == std::string("IEEE32BIG"));
+    const int ieee32big = (format == std::string("IEEE32BIG"));
-    int ieee32    = (format == std::string("IEEE32"));
+    const int ieee32    = (format == std::string("IEEE32"));
-    int ieee64big = (format == std::string("IEEE64BIG"));
+    const int ieee64big = (format == std::string("IEEE64BIG"));
-    int ieee64    = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
+    const int ieee64    = (format == std::string("IEEE64") || \
 			   format == std::string("IEEE64LITTLE"));
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    // depending on datatype, set up munger;
    // munger is a function of <floating point, Real, data_type>
-    if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
+    const std::string stNC = std::to_string( Nc ) ;
    if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE") ) {
      if ( ieee32 || ieee32big ) {
 	BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F> 
 	  (Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
@ -166,7 +169,7 @@ public:
 	  (Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
 	   nersc_csum,scidac_csuma,scidac_csumb);
      }
-    } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
+    } else if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC) ) {
      if ( ieee32 || ieee32big ) {
 	BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF>
 	  (Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
@ -200,7 +203,7 @@ public:
      std::cerr << " nersc_csum  " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
      exit(0);
    }
-    if(exitOnReadPlaquetteMismatch()) assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
+    assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
    assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
    assert(nersc_csum == header.checksum );
@ -211,27 +214,29 @@ public:
  template<class GaugeStats=PeriodicGaugeStatistics>
  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
-					std::string ens_label = std::string("DWF"))
+					std::string ens_label = std::string("DWF"),
 					std::string ens_id = std::string("UKQCD"),
 					unsigned int sequence_number = 1)
  {
-    writeConfiguration(Umu,file,0,1,ens_label);
+    writeConfiguration(Umu,file,0,1,ens_label,ens_id,sequence_number);
  }
  template<class GaugeStats=PeriodicGaugeStatistics>
  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
 					int two_row,
 					int bits32,
-					std::string ens_label = std::string("DWF"))
+					std::string ens_label = std::string("DWF"),
 					std::string ens_id = std::string("UKQCD"),
 					unsigned int sequence_number = 1)
  {
    typedef vLorentzColourMatrixD vobj;
    typedef typename vobj::scalar_object sobj;
    FieldMetaData header;
-    ///////////////////////////////////////////
+    header.sequence_number = sequence_number;
-    // Following should become arguments
+    header.ensemble_id     = ens_id;
    ///////////////////////////////////////////
    header.sequence_number = 1;
    header.ensemble_id     = std::string("UKQCD");
    header.ensemble_label  = ens_label;
    header.hdr_version     = "1.0" ;
    typedef LorentzColourMatrixD fobj3D;
    typedef LorentzColour2x3D    fobj2D;
@ -245,10 +250,14 @@ public:
    uint64_t offset;
-    // Sod it -- always write 3x3 double
+    // Sod it -- always write NcxNc double
-    header.floating_point = std::string("IEEE64BIG");
+    header.floating_point  = std::string("IEEE64BIG");
-    header.data_type      = std::string("4D_SU3_GAUGE_3x3");
+    const std::string stNC = std::to_string( Nc ) ;
-    GaugeSimpleUnmunger<fobj3D,sobj> munge;
+    if( two_row ) {
      header.data_type = std::string("4D_SU" + stNC + "_GAUGE" );
    } else {
      header.data_type = std::string("4D_SU" + stNC + "_GAUGE_" + stNC + "x" + stNC );
    }
    if ( grid->IsBoss() ) { 
      truncate(file);
      offset = writeHeader(header,file);
@ -256,8 +265,15 @@ public:
    grid->Broadcast(0,(void *)&offset,sizeof(offset));
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
-    BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
+    if( two_row ) {
-					      nersc_csum,scidac_csuma,scidac_csumb);
+      Gauge3x2unmunger<fobj2D,sobj> munge;
      BinaryIO::writeLatticeObject<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point,
 						nersc_csum,scidac_csuma,scidac_csumb);
    } else {
      GaugeSimpleUnmunger<fobj3D,sobj> munge;
      BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
 						nersc_csum,scidac_csuma,scidac_csumb);
    }
    header.checksum = nersc_csum;
    if ( grid->IsBoss() ) { 
      writeHeader(header,file);
@ -289,8 +305,7 @@ public:
    header.plaquette=0.0;
    MachineCharacteristics(header);
-	uint64_t offset;
+    uint64_t offset;
 #ifdef RNG_RANLUX
    header.floating_point = std::string("UINT64");
    header.data_type      = std::string("RANLUX48");
@ -330,7 +345,7 @@ public:
    GridBase *grid = parallel.Grid();
-	uint64_t offset = readHeader(file,grid,header);
+    uint64_t offset = readHeader(file,grid,header);
    FieldMetaData clone(header);
--- a/Grid/pugixml/pugixml.cc
+++ b/Grid/pugixml/pugixml.cc
@ -16,8 +16,12 @@
 #ifdef __NVCC__
 #pragma push
 #if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
 #pragma nv_diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
 #else
 #pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
 #endif
 #endif
 #include "pugixml.h"
--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@ -63,7 +63,6 @@ static constexpr int Ngp=2; // gparity index range
 #define ColourIndex  (2)
 #define SpinIndex    (1)
 #define LorentzIndex (0)
 #define GparityFlavourIndex (0)
 // Also should make these a named enum type
 static constexpr int DaggerNo=0;
@ -88,8 +87,6 @@ template<typename T> struct isCoarsened {
 template <typename T> using IfCoarsened    = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
 template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;
 const int GparityFlavourTensorIndex = 3; //TensorLevel counts from the bottom!
 // ChrisK very keen to add extra space for Gparity doubling.
 //
 // Also add domain wall index, in a way where Wilson operator 
@ -113,10 +110,8 @@ template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVec
    template<typename vtype> using iSpinColourSpinColourMatrix  = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;
 template<typename vtype> using iGparityFlavourVector                = iVector<iScalar<iScalar<vtype> >, Ngp>;
 template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
 template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
 template<typename vtype> using iGparityFlavourMatrix = iMatrix<iScalar<iScalar<vtype> >, Ngp>;
 // Spin matrix
 typedef iSpinMatrix<Complex  >          SpinMatrix;
@ -181,16 +176,6 @@ typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
 typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
 typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;
 //G-parity flavour matrix
 typedef iGparityFlavourMatrix<Complex> GparityFlavourMatrix;
 typedef iGparityFlavourMatrix<ComplexF> GparityFlavourMatrixF;
 typedef iGparityFlavourMatrix<ComplexD> GparityFlavourMatrixD;
 typedef iGparityFlavourMatrix<vComplex> vGparityFlavourMatrix;
 typedef iGparityFlavourMatrix<vComplexF> vGparityFlavourMatrixF;
 typedef iGparityFlavourMatrix<vComplexD> vGparityFlavourMatrixD;
 // Spin vector
 typedef iSpinVector<Complex >           SpinVector;
 typedef iSpinVector<ComplexF>           SpinVectorF;
@ -235,16 +220,6 @@ typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
 typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
 typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
 typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
 //G-parity flavour vector
 typedef iGparityFlavourVector<Complex >         GparityFlavourVector;
 typedef iGparityFlavourVector<ComplexF>         GparityFlavourVectorF;
 typedef iGparityFlavourVector<ComplexD>         GparityFlavourVectorD;
 typedef iGparityFlavourVector<vComplex >         vGparityFlavourVector;
 typedef iGparityFlavourVector<vComplexF>         vGparityFlavourVectorF;
 typedef iGparityFlavourVector<vComplexD>         vGparityFlavourVectorD;
 // singlets
 typedef iSinglet<Complex >         TComplex;     // FIXME This is painful. Tensor singlet complex type.
@ -476,9 +451,20 @@ template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<Lorentz
 // Fermion <-> propagator assignments
 //////////////////////////////////////////////
 //template <class Prop, class Ferm>
 #define FAST_FERM_TO_PROP
 // Store fermion field f as one column of propagator p.
 // The column is selected by the fixed right-hand spin index s and colour index c:
 //   p(ss,s)(cc,c) = f(ss)(cc)   for every spin ss and colour cc.
 // Entries of p with a different right-hand (s,c) pair are not assigned by this routine.
 // With FAST_FERM_TO_PROP defined the copy is a single accelerator_for kernel over the
 // outer sites of p; otherwise the peek/poke based path below is used.
 template <class Fimpl>
 void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
 {
 #ifdef FAST_FERM_TO_PROP
  // p is the destination (opened for write), f is the source (opened for read).
  autoView(p_v,p,AcceleratorWrite);
  autoView(f_v,f,AcceleratorRead);
  accelerator_for(idx,p_v.oSites(),1,{
      for(int ss = 0; ss < Ns; ++ss) {
      for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
 	p_v[idx]()(ss,s)(cc,c) = f_v[idx]()(ss)(cc); // Propagator sink index is LEFT, suitable for left mult by gauge link (e.g.)
      }}
    });
 #else
  for(int j = 0; j < Ns; ++j)
    {
      auto pjs = peekSpin(p, j, s);
@ -490,12 +476,23 @@ void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::Fermio
 	}
      pokeSpin(p, pjs, j, s);
    }
 #endif
 }
 //template <class Prop, class Ferm>
 // Extract one column of propagator p into fermion field f.
 // The column is selected by the fixed right-hand spin index s and colour index c:
 //   f(ss)(cc) = p(ss,s)(cc,c)   for every spin ss and colour cc.
 // f is the output and p is the const input, so p is opened read-only and f is opened
 // for writing, mirroring the source/destination convention used by FermToProp.
 // With FAST_FERM_TO_PROP defined the copy is a single accelerator_for kernel over the
 // outer sites of p; otherwise the peek/poke based path below is used.
 template <class Fimpl>
 void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
 {
 #ifdef FAST_FERM_TO_PROP
  autoView(p_v,p,AcceleratorRead);   // source: only read inside the kernel
  autoView(f_v,f,AcceleratorWrite);  // destination: assigned inside the kernel
  accelerator_for(idx,p_v.oSites(),1,{
      for(int ss = 0; ss < Ns; ++ss) {
      for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
 	f_v[idx]()(ss)(cc) = p_v[idx]()(ss,s)(cc,c); // LEFT index is copied across for s,c right index
      }}
    });
 #else
  for(int j = 0; j < Ns; ++j)
    {
      auto pjs = peekSpin(p, j, s);
@ -507,6 +504,7 @@ void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::Propagato
 	}
      pokeSpin(f, fj, j);
    }
 #endif
 }
 //////////////////////////////////////////////
--- a/Grid/qcd/action/ActionBase.h
+++ b/Grid/qcd/action/ActionBase.h
@ -40,29 +40,6 @@ class Action
 public:
  bool is_smeared = false;
  RealD deriv_norm_sum;
  RealD deriv_max_sum;
  int   deriv_num;
  RealD deriv_us;
  RealD S_us;
  RealD refresh_us;
  // ---- Instrumentation: force (derivative) statistics and per-phase timers ----

  // Zero every timer accumulator and the accumulated force statistics.
  void  reset_timer(void)        {
    deriv_us = S_us = refresh_us = 0.0;
    deriv_num=0;
    deriv_norm_sum = deriv_max_sum=0.0;
  }
  // Record one force sample: add its norm and max to the running sums and count it.
  void  deriv_log(RealD nrm, RealD max) { deriv_max_sum+=max; deriv_norm_sum+=nrm; deriv_num++;}
  // Averages over the samples recorded by deriv_log(). deriv_num is not checked here,
  // so the result is only meaningful after at least one deriv_log() call.
  RealD deriv_max_average(void)         { return deriv_max_sum/deriv_num; };
  RealD deriv_norm_average(void)        { return deriv_norm_sum/deriv_num; };
  // Accumulated time per phase. Each getter reports its own accumulator
  // (derivative, action evaluation S, refresh respectively).
  RealD deriv_timer(void)        { return deriv_us; };
  RealD S_timer(void)            { return S_us; };
  RealD refresh_timer(void)      { return refresh_us; };
  // start() subtracts usecond() and stop() adds it, so each start/stop pair adds the
  // elapsed interval to the corresponding accumulator.
  void deriv_timer_start(void)   { deriv_us-=usecond(); }
  void deriv_timer_stop(void)    { deriv_us+=usecond(); }
  void refresh_timer_start(void) { refresh_us-=usecond(); }
  void refresh_timer_stop(void)  { refresh_us+=usecond(); }
  void S_timer_start(void)       { S_us-=usecond(); }
  void S_timer_stop(void)        { S_us+=usecond(); }
  // Heatbath?
  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
  virtual RealD S(const GaugeField& U) = 0;                             // evaluate the action
--- a/Grid/qcd/action/ActionCore.h
+++ b/Grid/qcd/action/ActionCore.h
@ -37,10 +37,6 @@ NAMESPACE_CHECK(ActionSet);
 #include <Grid/qcd/action/ActionParams.h>
 NAMESPACE_CHECK(ActionParams);
 #include <Grid/qcd/action/filters/MomentumFilter.h>
 #include <Grid/qcd/action/filters/DirichletFilter.h>
 #include <Grid/qcd/action/filters/DDHMCFilter.h>
 ////////////////////////////////////////////
 // Gauge Actions
 ////////////////////////////////////////////
--- a/Grid/qcd/action/ActionParams.h
+++ b/Grid/qcd/action/ActionParams.h
@ -63,7 +63,6 @@ struct StaggeredImplParams {
 				    RealD, hi, 
 				    int,   MaxIter, 
 				    RealD, tolerance, 
 				    RealD, mdtolerance, 
 				    int,   degree, 
 				    int,   precision,
 				    int,   BoundsCheckFreq);
@ -77,13 +76,11 @@ struct StaggeredImplParams {
 				RealD tol      = 1.0e-8, 
                           	int _degree    = 10,
 				int _precision = 64,
-				int _BoundsCheckFreq=20,
+				int _BoundsCheckFreq=20)
 				RealD mdtol    = 1.0e-6)
      : lo(_lo),
 	hi(_hi),
 	MaxIter(_maxit),
 	tolerance(tol),
        mdtolerance(mdtol),
 	degree(_degree),
        precision(_precision),
        BoundsCheckFreq(_BoundsCheckFreq){};
--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@ -68,9 +68,16 @@ public:
  ///////////////////////////////////////////////////////////////
  // Support for MADWF tricks
  ///////////////////////////////////////////////////////////////
-  virtual RealD Mass(void) { return mass; };
+  RealD Mass(void) { return (mass_plus + mass_minus) / 2.0; };
  RealD MassPlus(void) { return mass_plus; };
  RealD MassMinus(void) { return mass_minus; };
  void  SetMass(RealD _mass) { 
-    mass=_mass; 
+    mass_plus=mass_minus=_mass; 
    SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs
  } ;
  void  SetMass(RealD _mass_plus, RealD _mass_minus) { 
    mass_plus=_mass_plus;
    mass_minus=_mass_minus;
    SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs
  } ;
  void  P(const FermionField &psi, FermionField &chi);
@ -108,7 +115,7 @@ public:
  void   MeooeDag5D    (const FermionField &in, FermionField &out);
  //    protected:
-  RealD mass;
+  RealD mass_plus, mass_minus;
  // Save arguments to SetCoefficientsInternal
  Vector<Coeff_t> _gamma;
--- a/Grid/qcd/action/fermion/CloverHelpers.h
+++ b/Grid/qcd/action/fermion/CloverHelpers.h
@ -0,0 +1,333 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/WilsonCloverFermionImplementation.h
    Copyright (C) 2017 - 2022
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    Author: Mattia Bruno <mattia.bruno@cern.ch>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 #include <Grid/Grid.h>
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
 ////////////////////////////////////////////
 // Standard Clover
 //   (4+m0) + csw * clover_term
 // Exp Clover
 //   (4+m0) * exp(csw/(4+m0) clover_term)
 //   = (4+m0) + csw * clover_term + ...
 ////////////////////////////////////////////
 NAMESPACE_BEGIN(Grid);
 //////////////////////////////////
 // Generic Standard Clover
 //////////////////////////////////
 // Helper policy for the standard clover term, (4+m0) + csw * clover_term, in the generic layout.
 // Instantiate() adds the diagonal mass to the clover term in place and fills a second field
 // with its site-by-site inverse; Cmunu() forwards to WilsonCloverHelpers<Impl>.
 template<class Impl>
 class CloverHelpers: public WilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  typedef WilsonCloverHelpers<Impl> Helpers;
  // CloverTerm    : in/out; diag_mass is added to it in place.
  // CloverTermInv : out; receives, site by site, the inverse of the shifted CloverTerm.
  // csw_t         : not referenced by this implementation.
  // diag_mass     : value added to CloverTerm before inversion.
  static void Instantiate(CloverField& CloverTerm, CloverField& CloverTermInv, RealD csw_t, RealD diag_mass) {
    GridBase *grid = CloverTerm.Grid();
    CloverTerm += diag_mass;
    int lvol = grid->lSites();
    int DimRep = Impl::Dimension;
    {
      autoView(CTv,CloverTerm,CpuRead);
      autoView(CTIv,CloverTermInv,CpuWrite);
      // One iteration per local site: peek the scalar site object, invert it with Eigen, poke the result.
      thread_for(site, lvol, {
        Coordinate lcoor;
        grid->LocalIndexToLocalCoor(site, lcoor);
        Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
        Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
        typename SiteClover::scalar_object Qx = Zero(), Qxinv = Zero();
        peekLocalSite(Qx, CTv, lcoor);
        // Flatten (spin j,k ; colour a,b) into a dense matrix: row = a + j*DimRep, column = b + k*DimRep.
        for (int j = 0; j < Ns; j++)
          for (int k = 0; k < Ns; k++)
            for (int a = 0; a < DimRep; a++)
              for (int b = 0; b < DimRep; b++){
                auto zz =  Qx()(j, k)(a, b);
                EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
              }
        EigenInvCloverOp = EigenCloverOp.inverse();
        // Unflatten the inverse back into the spin/colour site object using the same index map.
        for (int j = 0; j < Ns; j++)
          for (int k = 0; k < Ns; k++)
            for (int a = 0; a < DimRep; a++)
              for (int b = 0; b < DimRep; b++)
                Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
               // The loops above are brace-less, so this poke runs once per site, after the copy-back.
               pokeLocalSite(Qxinv, CTIv, lcoor);
      });
    }
  }
  // Forwards unchanged to WilsonCloverHelpers<Impl>::Cmunu.
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    return Helpers::Cmunu(U, lambda, mu, nu);
  }
 };
 //////////////////////////////////
 // Generic Exp Clover
 //////////////////////////////////
 // Helper policy for the exponential clover term, (4+m0) * exp(csw/(4+m0) clover_term),
 // in the generic layout. The exponential and its inverse are both built from a truncated
 // Taylor series evaluated with a Horner scheme. Cmunu() is not implemented (asserts).
 template<class Impl>
 class ExpCloverHelpers: public WilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
  typedef WilsonCloverHelpers<Impl> Helpers;
  // Can this be avoided?
  // Sets every (spin,colour)-diagonal entry of 'in' to c on all outer sites; off-diagonal
  // entries are left untouched, so callers zero the field first to obtain c * identity.
  // NOTE(review): 'in' is taken by const reference but is written through the view.
  static void IdentityTimesC(const CloverField& in, RealD c) {
    int DimRep = Impl::Dimension;
    autoView(in_v, in, AcceleratorWrite);
    accelerator_for(ss, in.Grid()->oSites(), 1, {
      for (int sa=0; sa<Ns; sa++)
        for (int ca=0; ca<DimRep; ca++)
          in_v[ss]()(sa,sa)(ca,ca) = c;
    });
  }
  // Returns the truncation order of the Taylor series for the requested precision.
  // At each check cond equals R^(NMAX+1)/(NMAX+1)!; NMAX grows until cond*exp(R) <= prec.
  static int getNMAX(RealD prec, RealD R) {
    /* compute stop condition for exponential */
    int NMAX=1;
    RealD cond=R*R/2.;
    while (cond*std::exp(R)>prec) {
      NMAX++;
      cond*=R/(double)(NMAX+1);
    }
    return NMAX;
  }
  // Overloads selecting the target precision from the field's scalar type:
  // 1e-12 for vComplexD fields, 1e-6 for vComplexF fields. The field argument is used for dispatch only.
  static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
  static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
  // Clover    : in/out; on entry the clover term X0, on exit diag_mass * exp(X0/diag_mass).
  // CloverInv : out; exp(-X0/diag_mass) / diag_mass.
  // Both exponentials are truncated at order NMAX, chosen from 3*csw_t/diag_mass.
  static void Instantiate(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
    GridBase* grid = Clover.Grid();
    CloverField ExpClover(grid);
    int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);
    Clover *= (1.0/diag_mass);
    // Taylor expansion, slow but generic
    // Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
    // qN = cN
    // qn = cn + qn+1 X
    // cn[i] = 1/i! are the Taylor coefficients of exp.
    std::vector<RealD> cn(NMAX+1);
    cn[0] = 1.0;
    for (int i=1; i<=NMAX; i++)
      cn[i] = cn[i-1] / RealD(i);
    ExpClover = Zero();
    IdentityTimesC(ExpClover, cn[NMAX]);
    for (int i=NMAX-1; i>=0; i--)
      ExpClover = ExpClover * Clover + cn[i];
    // prepare inverse
    // CloverInv temporarily holds -X so the same Horner loop yields exp(-X); this must
    // happen before Clover is overwritten with the scaled exponential on the next line.
    CloverInv = (-1.0)*Clover;
    Clover = ExpClover * diag_mass;
    ExpClover = Zero();
    IdentityTimesC(ExpClover, cn[NMAX]);
    for (int i=NMAX-1; i>=0; i--)
      ExpClover = ExpClover * CloverInv + cn[i];
    CloverInv = ExpClover * (1.0/diag_mass);
  }
  // Not implemented for the exponential clover term: asserts unconditionally.
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    assert(0);
    return lambda;
  }
 };
 //////////////////////////////////
 // Compact Standard Clover
 //////////////////////////////////
 // Helper policy for the standard clover term in the compact (diagonal + triangle) layout.
 // Inherits both the compact helpers (used for the inversion) and the generic Wilson clover
 // helpers (used for Cmunu).
 template<class Impl>
 class CompactCloverHelpers: public CompactWilsonCloverHelpers<Impl>,
                            public WilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);
  typedef WilsonCloverHelpers<Impl> Helpers;
  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
  // Adds diag_mass to Clover in place. CloverInv and csw_t are not referenced here.
  static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
    Clover += diag_mass;
  }
  // Inverts the compact representation (diagonal, triangle) into (diagonalInv, triangleInv).
  // InvClover and fixedBoundaries are not referenced by this implementation.
  static void InvertClover(CloverField& InvClover,
                            const CloverDiagonalField& diagonal,
                            const CloverTriangleField& triangle,
                            CloverDiagonalField&       diagonalInv,
                            CloverTriangleField&       triangleInv,
                            bool fixedBoundaries) {
    CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
  }
  // TODO: implement Cmunu for better performance with the compact layout; do that in
  // WilsonCloverHelpers.h -> CompactWilsonCloverHelpers rather than here.
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    return Helpers::Cmunu(U, lambda, mu, nu);
  }
 };
 //////////////////////////////////
 // Compact Exp Clover
 //////////////////////////////////
 // Helper policy for the exponential clover term in the compact (diagonal + triangle) layout.
 // InstantiateClover() builds the exponential and its inverse in the generic layout via a
 // truncated Taylor series; InvertClover() then produces the compact inverse.
 // Cmunu() is not implemented (asserts).
 template<class Impl>
 class CompactExpCloverHelpers: public CompactWilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);
  template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
  // Can this be avoided?
  // Sets every (spin,colour)-diagonal entry of 'in' to c on all outer sites; off-diagonal
  // entries are left untouched, so callers zero the field first to obtain c * identity.
  // NOTE(review): 'in' is taken by const reference but is written through the view.
  static void IdentityTimesC(const CloverField& in, RealD c) {
    int DimRep = Impl::Dimension;
    autoView(in_v, in, AcceleratorWrite);
    accelerator_for(ss, in.Grid()->oSites(), 1, {
      for (int sa=0; sa<Ns; sa++)
        for (int ca=0; ca<DimRep; ca++)
          in_v[ss]()(sa,sa)(ca,ca) = c;
    });
  }
  // Returns the truncation order of the Taylor series for the requested precision.
  // At each check cond equals R^(NMAX+1)/(NMAX+1)!; NMAX grows until cond*exp(R) <= prec.
  static int getNMAX(RealD prec, RealD R) {
    /* compute stop condition for exponential */
    int NMAX=1;
    RealD cond=R*R/2.;
    while (cond*std::exp(R)>prec) {
      NMAX++;
      cond*=R/(double)(NMAX+1);
    }
    return NMAX;
  }
  // Overloads selecting the target precision from the field's scalar type:
  // 1e-12 for vComplexD fields, 1e-6 for vComplexF fields. The field argument is used for dispatch only.
  static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
  static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
  // Clover    : in/out; on entry the clover term X0, on exit diag_mass * exp(X0/diag_mass).
  // CloverInv : out; exp(-X0/diag_mass) / diag_mass.
  // Both exponentials are truncated at order NMAX, chosen from 3*csw_t/diag_mass.
  static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
    GridBase* grid = Clover.Grid();
    CloverField ExpClover(grid);
    int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);
    Clover *= (1.0/diag_mass);
    // Taylor expansion, slow but generic
    // Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
    // qN = cN
    // qn = cn + qn+1 X
    // cn[i] = 1/i! are the Taylor coefficients of exp.
    std::vector<RealD> cn(NMAX+1);
    cn[0] = 1.0;
    for (int i=1; i<=NMAX; i++)
      cn[i] = cn[i-1] / RealD(i);
    ExpClover = Zero();
    IdentityTimesC(ExpClover, cn[NMAX]);
    for (int i=NMAX-1; i>=0; i--)
      ExpClover = ExpClover * Clover + cn[i];
    // prepare inverse
    // CloverInv temporarily holds -X so the same Horner loop yields exp(-X); this must
    // happen before Clover is overwritten with the scaled exponential on the next line.
    CloverInv = (-1.0)*Clover;
    Clover = ExpClover * diag_mass;
    ExpClover = Zero();
    IdentityTimesC(ExpClover, cn[NMAX]);
    for (int i=NMAX-1; i>=0; i--)
      ExpClover = ExpClover * CloverInv + cn[i];
    CloverInv = ExpClover * (1.0/diag_mass);
  }
  // Produces the compact inverse (diagonalInv, triangleInv).
  //  - fixedBoundaries == true : invert the compact fields (diagonal, triangle) directly.
  //  - fixedBoundaries == false: convert InvClover into the compact layout instead
  //    (presumably the inverse built by InstantiateClover -- confirm against the caller).
  static void InvertClover(CloverField& InvClover,
                            const CloverDiagonalField& diagonal,
                            const CloverTriangleField& triangle,
                            CloverDiagonalField&       diagonalInv,
                            CloverTriangleField&       triangleInv,
                            bool fixedBoundaries) {
    if (fixedBoundaries)
    {
      CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
    }
    else
    {
      CompactHelpers::ConvertLayout(InvClover, diagonalInv, triangleInv);
    }
  }
  // Not implemented for the exponential clover term: asserts unconditionally.
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    assert(0);
    return lambda;
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h
@ -31,6 +31,7 @@
 #include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
@ -85,7 +86,7 @@ NAMESPACE_BEGIN(Grid);
 //           + (2 * 1 + 4 * 1/2) triangle parts = 4 triangle parts =  60 complex words per site
 //                                                                 =  84 complex words per site
-template<class Impl>
+template<class Impl, class CloverHelpers>
 class CompactWilsonCloverFermion : public WilsonFermion<Impl>,
                                   public WilsonCloverHelpers<Impl>,
                                   public CompactWilsonCloverHelpers<Impl> {
@ -224,7 +225,7 @@ public:
  RealD csw_t;
  RealD cF;
-  bool open_boundaries;
+  bool fixedBoundaries;
  CloverDiagonalField Diagonal,    DiagonalEven,    DiagonalOdd;
  CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
--- a/Grid/qcd/action/fermion/Fermion.h
+++ b/Grid/qcd/action/fermion/Fermion.h
@ -138,38 +138,52 @@ typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
 typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
 // Clover fermions
-typedef WilsonCloverFermion<WilsonImplR> WilsonCloverFermionR;
+template <typename WImpl> using WilsonClover = WilsonCloverFermion<WImpl, CloverHelpers<WImpl>>;
-typedef WilsonCloverFermion<WilsonImplF> WilsonCloverFermionF;
+template <typename WImpl> using WilsonExpClover = WilsonCloverFermion<WImpl, ExpCloverHelpers<WImpl>>;
 typedef WilsonCloverFermion<WilsonImplD> WilsonCloverFermionD;
-typedef WilsonCloverFermion<WilsonAdjImplR> WilsonCloverAdjFermionR;
+typedef WilsonClover<WilsonImplR> WilsonCloverFermionR;
-typedef WilsonCloverFermion<WilsonAdjImplF> WilsonCloverAdjFermionF;
+typedef WilsonClover<WilsonImplF> WilsonCloverFermionF;
-typedef WilsonCloverFermion<WilsonAdjImplD> WilsonCloverAdjFermionD;
+typedef WilsonClover<WilsonImplD> WilsonCloverFermionD;
-typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
+typedef WilsonExpClover<WilsonImplR> WilsonExpCloverFermionR;
-typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
+typedef WilsonExpClover<WilsonImplF> WilsonExpCloverFermionF;
-typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
+typedef WilsonExpClover<WilsonImplD> WilsonExpCloverFermionD;
-typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
+typedef WilsonClover<WilsonAdjImplR> WilsonCloverAdjFermionR;
-typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
+typedef WilsonClover<WilsonAdjImplF> WilsonCloverAdjFermionF;
-typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
+typedef WilsonClover<WilsonAdjImplD> WilsonCloverAdjFermionD;
 typedef WilsonClover<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
 typedef WilsonClover<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
 typedef WilsonClover<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
 typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
 typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
 typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
 // Compact Clover fermions
-typedef CompactWilsonCloverFermion<WilsonImplR> CompactWilsonCloverFermionR;
+template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
-typedef CompactWilsonCloverFermion<WilsonImplF> CompactWilsonCloverFermionF;
+template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
 typedef CompactWilsonCloverFermion<WilsonImplD> CompactWilsonCloverFermionD;
-typedef CompactWilsonCloverFermion<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
+typedef CompactWilsonClover<WilsonImplR> CompactWilsonCloverFermionR;
-typedef CompactWilsonCloverFermion<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
+typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
-typedef CompactWilsonCloverFermion<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
+typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
-typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
+typedef CompactWilsonExpClover<WilsonImplR> CompactWilsonExpCloverFermionR;
-typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
+typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
-typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
+typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
-typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
+typedef CompactWilsonClover<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
-typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
+typedef CompactWilsonClover<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
-typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
+typedef CompactWilsonClover<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
 typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
 typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
 typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
 typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
 typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
 typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
 // Domain Wall fermions
 typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
--- a/Grid/qcd/action/fermion/FermionOperator.h
+++ b/Grid/qcd/action/fermion/FermionOperator.h
@ -49,8 +49,6 @@ public:
  virtual FermionField &tmp(void) = 0;
  virtual void DirichletBlock(Coordinate & _Block) { assert(0); };
  GridBase * Grid(void)   { return FermionGrid(); };   // this is all the linalg routines need to know
  GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };
--- a/Grid/qcd/action/fermion/GparityWilsonImpl.h
+++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h
@ -30,18 +30,6 @@ directory
 NAMESPACE_BEGIN(Grid);
 /*
  Policy implementation for G-parity boundary conditions
  Rather than treating the gauge field as a flavored field, the Grid implementation of G-parity treats the gauge field as a regular
  field with complex conjugate boundary conditions. In order to ensure the second flavor interacts with the conjugate links and the first
  with the regular links we overload the functionality of doubleStore, whose purpose is to store the gauge field and the barrel-shifted gauge field
  to avoid communicating links when applying the Dirac operator, such that the double-stored field contains also a flavor index which maps to
  either the link or the conjugate link. This flavored field is then used by multLink to apply the correct link to a spinor.
  Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
  mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
 */
 template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
 class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
 public:
@ -125,7 +113,7 @@ public:
    || ((distance== 1)&&(icoor[direction]==1))
    || ((distance==-1)&&(icoor[direction]==0));
-    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu] && mmu < Nd-1; //only if we are going around the world in a spatial direction
+    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world
    //Apply the links
    int f_upper = permute_lane ? 1 : 0;
@ -151,10 +139,10 @@ public:
    assert((distance == 1) || (distance == -1));  // nearest neighbour stencil hard code
    assert((sl == 1) || (sl == 2));
-    //If this site is an global boundary site, perform the G-parity flavor twist
+    if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
-    if ( mmu < Nd-1 && SE->_around_the_world && St.parameters.twists[mmu] ) {
+
      if ( sl == 2 ) {
-	//Only do the twist for lanes on the edge of the physical node
+       
 	ExtractBuffer<sobj> vals(Nsimd);
 	extract(chi,vals);
@ -209,19 +197,6 @@ public:
    reg = memory;
  }
  // Fill direction mu of the doubled gauge field Uds from two link fields:
  // flavor slot 0 receives 'poke_f0' and flavor slot 1 receives 'poke_f1'.
  inline void pokeGparityDoubledGaugeField(DoubledGaugeField &Uds, const GaugeLinkField &poke_f0, const GaugeLinkField &poke_f1, const int mu){
    autoView(flav0_v, poke_f0, CpuRead);
    autoView(flav1_v, poke_f1, CpuRead);
    autoView(doubled_v, Uds, CpuWrite);
    // Site-by-site copy; the iteration range is taken from the flavor-0 view.
    thread_foreach(site,flav0_v,{
	doubled_v[site](0)(mu) = flav0_v[site]();
	doubled_v[site](1)(mu) = flav1_v[site]();
      });
  }
  inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
  {
    conformable(Uds.Grid(),GaugeGrid);
@ -232,19 +207,14 @@ public:
    GaugeLinkField Uconj(GaugeGrid);
    Lattice<iScalar<vInteger> > coor(GaugeGrid);
-
+        
-    //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
+    for(int mu=0;mu<Nd;mu++){
-    //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs        
+          
-    for(int mu=0;mu<Nd-1;mu++){
+      LatticeCoordinate(coor,mu);
      if( Params.twists[mu] ){
 	LatticeCoordinate(coor,mu);
      }
      U     = PeekIndex<LorentzIndex>(Umu,mu);
      Uconj = conjugate(U);
      // Implement the isospin rotation sign on the boundary between f=1 and f=0
      // This phase could come from a simple bc 1,1,-1,1 ..
      int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
      if ( Params.twists[mu] ) { 
@ -259,7 +229,7 @@ public:
 	thread_foreach(ss,U_v,{
 	    Uds_v[ss](0)(mu) = U_v[ss]();
 	    Uds_v[ss](1)(mu) = Uconj_v[ss]();
-	});
+	  });
      }
      U     = adj(Cshift(U    ,mu,-1));      // correct except for spanning the boundary
@ -290,38 +260,6 @@ public:
        });
      }
    }
    { //periodic / antiperiodic temporal BCs
      int mu = Nd-1;
      int L   = GaugeGrid->GlobalDimensions()[mu];
      int Lmu = L - 1;
      LatticeCoordinate(coor, mu);
      U = PeekIndex<LorentzIndex>(Umu, mu); //Get t-directed links
      GaugeLinkField *Upoke = &U;
      if(Params.twists[mu]){ //antiperiodic
 	Utmp =  where(coor == Lmu, -U, U);
 	Upoke = &Utmp;
      }
      Uconj = conjugate(*Upoke); //second flavor interacts with conjugate links      
      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu);
      //Get the barrel-shifted field
      Utmp = adj(Cshift(U, mu, -1)); //is a forward shift!
      Upoke = &Utmp;
      if(Params.twists[mu]){
 	U = where(coor == 0, -Utmp, Utmp);  //boundary phase
 	Upoke = &U;
      }
      Uconj = conjugate(*Upoke);
      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4);
    }
  }
  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
@ -360,48 +298,28 @@ public:
  // Stub: the body unconditionally hits assert(0); neither `mat` nor `Uds` is
  // read or written here.
  inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
    assert(0);
  }
  // For every outer site, sums over the Ls slices and the Ns spin components
  //   outerProduct(Btilde, Atilde)             for flavor index 0, plus
  //   conjugate(outerProduct(Btilde, Atilde))  for flavor index 1,
  // and writes the result into component `mu` of `mat` (overwriting, not accumulating).
  //   mat    : written through an AcceleratorWrite view, component mu only
  //   Btilde : read only; its grid's _fdimensions[0] is taken as Ls
  //   Atilde : read only
  //   mu     : component of mat that receives the sum
  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
    int Ls=Btilde.Grid()->_fdimensions[0];
    {
      GridBase *GaugeGrid = mat.Grid();
      Lattice<iScalar<vInteger> > coor(GaugeGrid);
      // NOTE(review): coor is filled when the twist is set but is not referenced
      // again in this function -- confirm whether it is still needed.
      if( Params.twists[mu] ){
 	LatticeCoordinate(coor,mu);
      }
      autoView( mat_v , mat, AcceleratorWrite);
      autoView( Btilde_v , Btilde, AcceleratorRead);
      autoView( Atilde_v , Atilde, AcceleratorRead);
      accelerator_for(sss,mat.Grid()->oSites(), FermionField::vector_type::Nsimd(),{	  
  	  int sU=sss;
  	  typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
  	  ColorMatrixType sum;
  	  zeroit(sum);
  	  for(int s=0;s<Ls;s++){
  	    // index of slice s belonging to outer site sU (Ls entries per site)
  	    int sF = s+Ls*sU;
  	    for(int spn=0;spn<Ns;spn++){ //sum over spin
 	      //Flavor 0
  	      auto bb = coalescedRead(Btilde_v[sF](0)(spn) ); //color vector
  	      auto aa = coalescedRead(Atilde_v[sF](0)(spn) );
  	      sum = sum + outerProduct(bb,aa);
  	      //Flavor 1
  	      bb = coalescedRead(Btilde_v[sF](1)(spn) );
  	      aa = coalescedRead(Atilde_v[sF](1)(spn) );
  	      // the flavor 1 contribution enters complex conjugated
  	      sum = sum + conjugate(outerProduct(bb,aa));
  	    }
  	  }	    
  	  coalescedWrite(mat_v[sU](mu)(), sum);
  	});
    }
  }
  // Host (thread_for) variant: builds a link field `tmp` that, per outer site, is the sum
  // over the Ls slices of the spin-traced outerProduct(Btilde, Atilde), taking the (0,0)
  // flavor entry plus the conjugate of the (1,1) flavor entry, then pokes `tmp` into
  // component `mu` of `mat`.
  //   mat    : component mu is replaced via PokeIndex<LorentzIndex>
  //   Btilde : read only; its grid's _fdimensions[0] is taken as Ls
  //   Atilde : read only
  //   mu     : component of mat that receives the result
  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
    int Ls = Btilde.Grid()->_fdimensions[0];
    GaugeLinkField tmp(mat.Grid());
    // start from zero because the loop below accumulates into tmp
    tmp = Zero();
    {
      autoView( tmp_v , tmp, CpuWrite);
      autoView( Atilde_v , Atilde, CpuRead);
      autoView( Btilde_v , Btilde, CpuRead);
      thread_for(ss,tmp.Grid()->oSites(),{
 	  for (int s = 0; s < Ls; s++) {
 	    // index of slice s belonging to outer site ss (Ls entries per site)
 	    int sF = s + Ls * ss;
 	    auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
 	    // only the two diagonal entries are used; the (1,1) entry enters conjugated
 	    tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
 	  }
 	});
    }
    PokeIndex<LorentzIndex>(mat, tmp, mu);
    return;
  }
 };
--- a/Grid/qcd/action/fermion/WilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h
@ -32,6 +32,7 @@
 #include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
@ -51,7 +52,7 @@ NAMESPACE_BEGIN(Grid);
 // csw_r = csw_t to recover the isotropic version
 //////////////////////////////////////////////////////////////////
-template <class Impl>
+template<class Impl, class CloverHelpers>
 class WilsonCloverFermion : public WilsonFermion<Impl>,
                            public WilsonCloverHelpers<Impl>
 {
--- a/Grid/qcd/action/fermion/WilsonCloverHelpers.h
+++ b/Grid/qcd/action/fermion/WilsonCloverHelpers.h
@ -209,6 +209,8 @@ public:
 };
 ////////////////////////////////////////////////////////
 template<class Impl> class CompactWilsonCloverHelpers {
 public:
--- a/Grid/qcd/action/fermion/WilsonCloverTypes.h
+++ b/Grid/qcd/action/fermion/WilsonCloverTypes.h
@ -47,8 +47,6 @@ class CompactWilsonCloverTypes {
 public:
  INHERIT_IMPL_TYPES(Impl);
  static_assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3, "Wrong dimensions");
  static constexpr int Nred      = Nc * Nhs;        // 6
  static constexpr int Nblock    = Nhs;             // 2
  static constexpr int Ndiagonal = Nred;            // 6
--- a/Grid/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@ -117,19 +117,19 @@ public:
    typedef decltype(coalescedRead(*in))    sobj;
    typedef decltype(coalescedRead(*out0)) hsobj;
-    unsigned int Nsimd = vobj::Nsimd();
+    constexpr unsigned int Nsimd = vobj::Nsimd();
    unsigned int mask = Nsimd >> (type + 1);
    int lane = acceleratorSIMTlane(Nsimd);
    int j0 = lane &(~mask); // inner coor zero
    int j1 = lane |(mask) ; // inner coor one
-    const vobj *vp0 = &in[k];
+    const vobj *vp0 = &in[k];  // out0[j] = merge low bit of type from in[k] and in[m] 
-    const vobj *vp1 = &in[m];
+    const vobj *vp1 = &in[m];  // out1[j] = merge hi  bit of type from in[k] and in[m]
-    const vobj *vp = (lane&mask) ? vp1:vp0;
+    const vobj *vp = (lane&mask) ? vp1:vp0;// if my lane has high bit take vp1, low bit take vp0
-    auto sa = coalescedRead(*vp,j0);
+    auto sa = coalescedRead(*vp,j0); // lane to read for out 0, NB 50% read coalescing
-    auto sb = coalescedRead(*vp,j1);
+    auto sb = coalescedRead(*vp,j1); // lane to read for out 1
    hsobj psa, psb;
-    projector::Proj(psa,sa,mu,dag);
+    projector::Proj(psa,sa,mu,dag);  // spin project the result0
-    projector::Proj(psb,sb,mu,dag);
+    projector::Proj(psb,sb,mu,dag);  // spin project the result1
    coalescedWrite(out0[j],psa);
    coalescedWrite(out1[j],psb);
 #else
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@ -75,10 +75,6 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }
  int Dirichlet;
  Coordinate Block; 
  /********** Deprecate timers **********/
  void Report(void);
  void ZeroCounters(void);
  double DhopCalls;
@ -177,18 +173,7 @@ public:
 		  GridCartesian         &FourDimGrid,
 		  GridRedBlackCartesian &FourDimRedBlackGrid,
 		  double _M5,const ImplParams &p= ImplParams());
-
+    
  // Records a Dirichlet block and forwards it to the three stencils.
  //   block : must have Nd+1 entries (asserted). If at least one of block[0..4] is
  //           non-zero, Dirichlet is set to 1, Block takes a copy of `block`, and
  //           Stencil, StencilEven and StencilOdd each receive DirichletBlock(block).
  //           If all five entries are zero, nothing is changed.
  // NOTE(review): the indices 0..4 are hard-coded, which matches the asserted size
  // only when Nd == 4 -- confirm that is always the case here.
  virtual void DirichletBlock(Coordinate & block)
  {
    assert(block.size()==Nd+1);
    if ( block[0] || block[1] || block[2] || block[3] || block[4] ){
      Dirichlet = 1;
      Block = block;
      Stencil.DirichletBlock(block); 
      StencilEven.DirichletBlock(block); 
      StencilOdd.DirichletBlock(block);
    }
  }
  // Constructors
  /*
    WilsonFermion5D(int simd, 
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@ -47,7 +47,7 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
 			FiveDimRedBlackGrid,
 			FourDimGrid,
 			FourDimRedBlackGrid,_M5,p),
-  mass(_mass)
+  mass_plus(_mass), mass_minus(_mass)
 { 
 }
@ -209,8 +209,8 @@ void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
  Vector<Coeff_t> diag (Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass;
+  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
-  Vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass;
+  Vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass_plus;
  M5D(psi,chi,chi,lower,diag,upper);
 }
 template<class Impl>
@ -220,8 +220,8 @@ void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &D
  Vector<Coeff_t> diag = bs;
  Vector<Coeff_t> upper= cs;
  Vector<Coeff_t> lower= cs; 
-  upper[Ls-1]=-mass*upper[Ls-1];
+  upper[Ls-1]=-mass_minus*upper[Ls-1];
-  lower[0]   =-mass*lower[0];
+  lower[0]   =-mass_plus*lower[0];
  M5D(psi,psi,Din,lower,diag,upper);
 }
 // FIXME Redundant with the above routine; check this and eliminate
@ -235,8 +235,8 @@ template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &
    upper[i]=-ceo[i];
    lower[i]=-ceo[i];
  }
-  upper[Ls-1]=-mass*upper[Ls-1];
+  upper[Ls-1]=-mass_minus*upper[Ls-1];
-  lower[0]   =-mass*lower[0];
+  lower[0]   =-mass_plus*lower[0];
  M5D(psi,psi,chi,lower,diag,upper);
 }
 template<class Impl>
@ -250,8 +250,8 @@ void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &
    upper[i]=-cee[i];
    lower[i]=-cee[i];
  }
-  upper[Ls-1]=-mass*upper[Ls-1];
+  upper[Ls-1]=-mass_minus*upper[Ls-1];
-  lower[0]   =-mass*lower[0];
+  lower[0]   =-mass_plus*lower[0];
  M5D(psi,psi,chi,lower,diag,upper);
 }
 template<class Impl>
@ -266,9 +266,9 @@ void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &
    // Assemble the 5d matrix
    if ( s==0 ) {
      upper[s] = -cee[s+1] ;
-      lower[s] = mass*cee[Ls-1];
+      lower[s] = mass_minus*cee[Ls-1];
    } else if ( s==(Ls-1)) { 
-      upper[s] = mass*cee[0];
+      upper[s] = mass_plus*cee[0];
      lower[s] = -cee[s-1];
    } else {
      upper[s]=-cee[s+1];
@ -291,8 +291,8 @@ void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
  Vector<Coeff_t> diag(Ls,1.0);
  Vector<Coeff_t> upper(Ls,-1.0);
  Vector<Coeff_t> lower(Ls,-1.0);
-  upper[Ls-1]=-mass*upper[Ls-1];
+  upper[Ls-1]=-mass_plus*upper[Ls-1];
-  lower[0]   =-mass*lower[0];
+  lower[0]   =-mass_minus*lower[0];
  M5Ddag(psi,chi,chi,lower,diag,upper);
 }
@ -307,9 +307,9 @@ void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField
  for (int s=0;s<Ls;s++){
    if ( s== 0 ) {
      upper[s] = cs[s+1];
-      lower[s] =-mass*cs[Ls-1];
+      lower[s] =-mass_minus*cs[Ls-1];
    } else if ( s==(Ls-1) ) { 
-      upper[s] =-mass*cs[0];
+      upper[s] =-mass_plus*cs[0];
      lower[s] = cs[s-1];
    } else { 
      upper[s] = cs[s+1];
@ -552,7 +552,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
      lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
-      leem[i]=mass*cee[Ls-1]/bee[0];
+      leem[i]=mass_minus*cee[Ls-1]/bee[0];
      for(int j=0;j<i;j++) {
 	assert(bee[j+1]!=Coeff_t(0.0));
 	leem[i]*= aee[j]/bee[j+1];
@ -560,7 +560,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
      uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
-      ueem[i]=mass;
+      ueem[i]=mass_plus;
      for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
      ueem[i]*= aee[0]/bee[0];
@ -573,7 +573,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
  }
  { 
-    Coeff_t delta_d=mass*cee[Ls-1];
+    Coeff_t delta_d=mass_minus*cee[Ls-1];
    for(int j=0;j<Ls-1;j++) {
      assert(bee[j] != Coeff_t(0.0));
      delta_d *= cee[j]/bee[j];
@ -642,6 +642,10 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
 						      Current curr_type,
 						      unsigned int mu)
 {
  assert(mass_plus == mass_minus);
  RealD mass = mass_plus;
 #if (!defined(GRID_HIP))
  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
@ -777,6 +781,8 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  assert(mu>=0);
  assert(mu<Nd);
  assert(mass_plus == mass_minus);
  RealD mass = mass_plus;
 #if 0
  int tshift = (mu == Nd-1) ? 1 : 0;
--- a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
@ -32,22 +32,23 @@
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
 NAMESPACE_BEGIN(Grid);
-template<class Impl>
+template<class Impl, class CloverHelpers>
-CompactWilsonCloverFermion<Impl>::CompactWilsonCloverFermion(GaugeField& _Umu,
+CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(GaugeField& _Umu,
-                                                             GridCartesian& Fgrid,
+                                                                            GridCartesian& Fgrid,
-                                                             GridRedBlackCartesian& Hgrid,
+                                                                            GridRedBlackCartesian& Hgrid,
-                                                             const RealD _mass,
+                                                                            const RealD _mass,
-                                                             const RealD _csw_r,
+                                                                            const RealD _csw_r,
-                                                             const RealD _csw_t,
+                                                                            const RealD _csw_t,
-                                                             const RealD _cF,
+                                                                            const RealD _cF,
-                                                             const WilsonAnisotropyCoefficients& clover_anisotropy,
+                                                                            const WilsonAnisotropyCoefficients& clover_anisotropy,
-                                                             const ImplParams& impl_p)
+                                                                            const ImplParams& impl_p)
  : WilsonBase(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
  , csw_r(_csw_r)
  , csw_t(_csw_t)
  , cF(_cF)
-  , open_boundaries(impl_p.boundary_phases[Nd-1] == 0.0)
+  , fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0)
  , Diagonal(&Fgrid),        Triangle(&Fgrid)
  , DiagonalEven(&Hgrid),    TriangleEven(&Hgrid)
  , DiagonalOdd(&Hgrid),     TriangleOdd(&Hgrid)
@ -58,80 +59,85 @@ CompactWilsonCloverFermion<Impl>::CompactWilsonCloverFermion(GaugeField& _Umu,
  , BoundaryMask(&Fgrid)
  , BoundaryMaskEven(&Hgrid), BoundaryMaskOdd(&Hgrid)
 {
  assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);
  csw_r *= 0.5;
  csw_t *= 0.5;
  if (clover_anisotropy.isAnisotropic)
    csw_r /= clover_anisotropy.xi_0;
  ImportGauge(_Umu);
-  if (open_boundaries)
+  if (fixedBoundaries) {
    this->BoundaryMaskEven.Checkerboard() = Even;
    this->BoundaryMaskOdd.Checkerboard() = Odd;
    CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
  }
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::Dhop(const FermionField& in, FermionField& out, int dag) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::Dhop(in, out, dag);
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::DhopOE(const FermionField& in, FermionField& out, int dag) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::DhopOE(in, out, dag);
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::DhopEO(const FermionField& in, FermionField& out, int dag) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::DhopEO(in, out, dag);
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
  WilsonBase::DhopDir(in, out, dir, disp);
-  if(this->open_boundaries) ApplyBoundaryMask(out);
+  if(this->fixedBoundaries) ApplyBoundaryMask(out);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
  WilsonBase::DhopDirAll(in, out);
-  if(this->open_boundaries) {
+  if(this->fixedBoundaries) {
    for(auto& o : out) ApplyBoundaryMask(o);
  }
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::M(const FermionField& in, FermionField& out) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField& in, FermionField& out) {
  out.Checkerboard() = in.Checkerboard();
  WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
  Mooee(in, Tmp);
  axpy(out, 1.0, out, Tmp);
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::Mdag(const FermionField& in, FermionField& out) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField& in, FermionField& out) {
  out.Checkerboard() = in.Checkerboard();
  WilsonBase::Dhop(in, out, DaggerYes);  // call base to save applying bc
  MooeeDag(in, Tmp);
  axpy(out, 1.0, out, Tmp);
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::Meooe(const FermionField& in, FermionField& out) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
  WilsonBase::Meooe(in, out);
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::MeooeDag(const FermionField& in, FermionField& out) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
  WilsonBase::MeooeDag(in, out);
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::Mooee(const FermionField& in, FermionField& out) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField& in, FermionField& out) {
  if(in.Grid()->_isCheckerBoarded) {
    if(in.Checkerboard() == Odd) {
      MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
@ -141,16 +147,16 @@ void CompactWilsonCloverFermion<Impl>::Mooee(const FermionField& in, FermionFiel
  } else {
    MooeeInternal(in, out, Diagonal, Triangle);
  }
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::MooeeDag(const FermionField& in, FermionField& out) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeDag(const FermionField& in, FermionField& out) {
  Mooee(in, out); // blocks are hermitian
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::MooeeInv(const FermionField& in, FermionField& out) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionField& in, FermionField& out) {
  if(in.Grid()->_isCheckerBoarded) {
    if(in.Checkerboard() == Odd) {
      MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
@ -160,27 +166,27 @@ void CompactWilsonCloverFermion<Impl>::MooeeInv(const FermionField& in, FermionF
  } else {
    MooeeInternal(in, out, DiagonalInv, TriangleInv);
  }
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::MooeeInvDag(const FermionField& in, FermionField& out) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInvDag(const FermionField& in, FermionField& out) {
  MooeeInv(in, out); // blocks are hermitian
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
  DhopDir(in, out, dir, disp);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
  DhopDirAll(in, out);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
-  assert(!open_boundaries); // TODO check for changes required for open bc
+  assert(!fixedBoundaries); // TODO check for changes required for open bc
  // NOTE: code copied from original clover term
  conformable(X.Grid(), Y.Grid());
@ -251,7 +257,7 @@ void CompactWilsonCloverFermion<Impl>::MDeriv(GaugeField& force, const FermionFi
      }
      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
-      force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu);   // checked
+      force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu);   // checked
      count++;
    }
@ -261,18 +267,18 @@ void CompactWilsonCloverFermion<Impl>::MDeriv(GaugeField& force, const FermionFi
  force += clover_force;
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
  assert(0);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
  assert(0);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::MooeeInternal(const FermionField&        in,
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField&        in,
                    FermionField&              out,
                    const CloverDiagonalField& diagonal,
                    const CloverTriangleField& triangle) {
@ -285,8 +291,8 @@ void CompactWilsonCloverFermion<Impl>::MooeeInternal(const FermionField&
  CompactHelpers::MooeeKernel(diagonal.oSites(), 1, in, out, diagonal, triangle);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::ImportGauge(const GaugeField& _Umu) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField& _Umu) {
  // NOTE: parts copied from original implementation
  // Import gauge into base class
@ -299,6 +305,7 @@ void CompactWilsonCloverFermion<Impl>::ImportGauge(const GaugeField& _Umu) {
  GridBase* grid = _Umu.Grid();
  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
  CloverField TmpOriginal(grid);
  CloverField TmpInverse(grid);
  // Compute the field strength terms mu>nu
  double t2 = usecond();
@ -318,22 +325,30 @@ void CompactWilsonCloverFermion<Impl>::ImportGauge(const GaugeField& _Umu) {
  TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
  TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
  TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
-  TmpOriginal += this->diag_mass;
+
  // Instantiate the clover term
  // - In case of the standard clover the mass term is added
  // - In case of the exponential clover the clover term is exponentiated
  double t4 = usecond();
  CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, this->diag_mass);
  // Convert the data layout of the clover term
-  double t4 = usecond();
+  double t5 = usecond();
  CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
-  // Possible modify the boundary values
+  // Modify the clover term at the temporal boundaries in case of open boundary conditions
  double t5 = usecond();
  if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
  // Invert the clover term in the improved layout
  double t6 = usecond();
-  CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv);
+  if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
  // Invert the Clover term
  // In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved
  // in TmpInverse can be used. In all other cases the clover term has to be explicitly inverted.
  // TODO: For now this inversion is explicitly done on the CPU
  double t7 = usecond();
  CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries);
  // Fill the remaining clover fields
-  double t7 = usecond();
+  double t8 = usecond();
  pickCheckerboard(Even, DiagonalEven,    Diagonal);
  pickCheckerboard(Even, TriangleEven,    Triangle);
  pickCheckerboard(Odd,  DiagonalOdd,     Diagonal);
@ -344,20 +359,19 @@ void CompactWilsonCloverFermion<Impl>::ImportGauge(const GaugeField& _Umu) {
  pickCheckerboard(Odd,  TriangleInvOdd,  TriangleInv);
  // Report timings
-  double t8 = usecond();
+  double t9 = usecond();
-#if 0
+
-  std::cout << GridLogMessage << "CompactWilsonCloverFermion::ImportGauge timings:"
+  std::cout << GridLogDebug << "CompactWilsonCloverFermion::ImportGauge timings:" << std::endl;
-            << " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6
+  std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
-            << ", allocations = "               << (t2 - t1) / 1e6
+  std::cout << GridLogDebug << "allocations =                " << (t2 - t1) / 1e6 << std::endl;
-            << ", field strength = "            << (t3 - t2) / 1e6
+  std::cout << GridLogDebug << "field strength =             " << (t3 - t2) / 1e6 << std::endl;
-            << ", fill clover = "               << (t4 - t3) / 1e6
+  std::cout << GridLogDebug << "fill clover =                " << (t4 - t3) / 1e6 << std::endl;
-            << ", convert = "                   << (t5 - t4) / 1e6
+  std::cout << GridLogDebug << "instantiate clover =         " << (t5 - t4) / 1e6 << std::endl;
-            << ", boundaries = "                << (t6 - t5) / 1e6
+  std::cout << GridLogDebug << "convert layout =             " << (t6 - t5) / 1e6 << std::endl;
-            << ", inversions = "                << (t7 - t6) / 1e6
+  std::cout << GridLogDebug << "modify boundaries =          " << (t7 - t6) / 1e6 << std::endl;
-            << ", pick cbs = "                  << (t8 - t7) / 1e6
+  std::cout << GridLogDebug << "invert clover =              " << (t8 - t7) / 1e6 << std::endl;
-            << ", total = "                     << (t8 - t0) / 1e6
+  std::cout << GridLogDebug << "pick cbs =                   " << (t9 - t8) / 1e6 << std::endl;
-            << std::endl;
+  std::cout << GridLogDebug << "total =                      " << (t9 - t0) / 1e6 << std::endl;
 #endif
 }
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
@ -34,8 +34,8 @@
 NAMESPACE_BEGIN(Grid);
-template<class Impl>
+template<class Impl, class CloverHelpers>
-WilsonCloverFermion<Impl>::WilsonCloverFermion(GaugeField&                         _Umu,
+WilsonCloverFermion<Impl, CloverHelpers>::WilsonCloverFermion(GaugeField&                         _Umu,
                                               GridCartesian&                      Fgrid,
                                               GridRedBlackCartesian&              Hgrid,
                                               const RealD                         _mass,
@ -74,8 +74,8 @@ WilsonCloverFermion<Impl>::WilsonCloverFermion(GaugeField&
 }
 // *NOT* EO
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField &in, FermionField &out)
 {
  FermionField temp(out.Grid());
@ -89,8 +89,8 @@ void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
  out += temp;
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField &in, FermionField &out)
 {
  FermionField temp(out.Grid());
@ -104,8 +104,8 @@ void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
  out += temp;
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
+void WilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField &_Umu)
 {
  double t0 = usecond();
  WilsonFermion<Impl>::ImportGauge(_Umu);
@ -131,47 +131,11 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
  CloverTerm += Helpers::fillCloverXT(Ex) * csw_t;
  CloverTerm += Helpers::fillCloverYT(Ey) * csw_t;
  CloverTerm += Helpers::fillCloverZT(Ez) * csw_t;
-  CloverTerm += diag_mass;
+   
  double t4 = usecond();
-  int lvol = _Umu.Grid()->lSites();
+  CloverHelpers::Instantiate(CloverTerm, CloverTermInv, csw_t, this->diag_mass);
  int DimRep = Impl::Dimension;
  double t5 = usecond();
  {
    autoView(CTv,CloverTerm,CpuRead);
    autoView(CTIv,CloverTermInv,CpuWrite);
    thread_for(site, lvol, {
      Coordinate lcoor;
      grid->LocalIndexToLocalCoor(site, lcoor);
      Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
      Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
      typename SiteClover::scalar_object Qx = Zero(), Qxinv = Zero();
      peekLocalSite(Qx, CTv, lcoor);
      //if (csw!=0){
      for (int j = 0; j < Ns; j++)
 	for (int k = 0; k < Ns; k++)
 	  for (int a = 0; a < DimRep; a++)
 	    for (int b = 0; b < DimRep; b++){
 	      auto zz =  Qx()(j, k)(a, b);
 	      EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
 	    }
      //   if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
      EigenInvCloverOp = EigenCloverOp.inverse();
      //std::cout << EigenInvCloverOp << std::endl;
      for (int j = 0; j < Ns; j++)
 	for (int k = 0; k < Ns; k++)
 	  for (int a = 0; a < DimRep; a++)
 	    for (int b = 0; b < DimRep; b++)
 	      Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
      //    if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
      //  }
      pokeLocalSite(Qxinv, CTIv, lcoor);
    });
  }
  double t6 = usecond();
  // Separate the even and odd parts
  pickCheckerboard(Even, CloverTermEven, CloverTerm);
  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
@ -184,48 +148,44 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
-  double t7 = usecond();
+  double t6 = usecond();
-#if 0
+  std::cout << GridLogDebug << "WilsonCloverFermion::ImportGauge timings:" << std::endl;
-  std::cout << GridLogMessage << "WilsonCloverFermion::ImportGauge timings:"
+  std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
-            << " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6
+  std::cout << GridLogDebug << "allocations =                " << (t2 - t1) / 1e6 << std::endl;
-            << ", allocations = "               << (t2 - t1) / 1e6
+  std::cout << GridLogDebug << "field strength =             " << (t3 - t2) / 1e6 << std::endl;
-            << ", field strength = "            << (t3 - t2) / 1e6
+  std::cout << GridLogDebug << "fill clover =                " << (t4 - t3) / 1e6 << std::endl;
-            << ", fill clover = "               << (t4 - t3) / 1e6
+  std::cout << GridLogDebug << "instantiation =              " << (t5 - t4) / 1e6 << std::endl;
-            << ", misc = "                      << (t5 - t4) / 1e6
+  std::cout << GridLogDebug << "pick cbs =                   " << (t6 - t5) / 1e6 << std::endl;
-            << ", inversions = "                << (t6 - t5) / 1e6
+  std::cout << GridLogDebug << "total =                      " << (t6 - t0) / 1e6 << std::endl;
            << ", pick cbs = "                  << (t7 - t6) / 1e6
            << ", total = "                     << (t7 - t0) / 1e6
            << std::endl;
 #endif
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerNo, InverseNo);
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl, CloverHelpers>::MooeeDag(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerYes, InverseNo);
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerNo, InverseYes);
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInvDag(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerYes, InverseYes);
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
+void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
 {
  out.Checkerboard() = in.Checkerboard();
  CloverField *Clover;
@ -278,8 +238,8 @@ void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionFie
 } // MooeeInternal
 // Derivative parts unpreconditioned pseudofermions
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
+void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
 {
  conformable(X.Grid(), Y.Grid());
  conformable(X.Grid(), force.Grid());
@ -349,7 +309,7 @@ void WilsonCloverFermion<Impl>::MDeriv(GaugeField &force, const FermionField &X,
      }
      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
-      force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu);                   // checked
+      force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu);                   // checked
      count++;
    }
@ -360,15 +320,15 @@ void WilsonCloverFermion<Impl>::MDeriv(GaugeField &force, const FermionField &X,
 }
 // Derivative parts
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
+void WilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
 {
  assert(0);
 }
 // Derivative parts
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
+void WilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
 {
  assert(0); // not implemented yet
 }
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@ -60,8 +60,7 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  UmuOdd (_FourDimRedBlackGrid),
  Lebesgue(_FourDimGrid),
  LebesgueEvenOdd(_FourDimRedBlackGrid),
-  _tmp(&FiveDimRedBlackGrid),
+  _tmp(&FiveDimRedBlackGrid)
  Dirichlet(0)
 {
  // some assertions
  assert(FiveDimGrid._ndimension==5);
@ -219,14 +218,6 @@ void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
 {
  GaugeField HUmu(_Umu.Grid());
  HUmu = _Umu*(-0.5);
  if ( Dirichlet ) {
    std::cout << GridLogMessage << " Dirichlet BCs 5d " <<Block<<std::endl;
    Coordinate GaugeBlock(Nd);
    for(int d=0;d<Nd;d++) GaugeBlock[d] = Block[d+1];
    std::cout << GridLogMessage << " Dirichlet BCs 4d " <<GaugeBlock<<std::endl;
    DirichletFilter<GaugeField> Filter(GaugeBlock);
    Filter.applyFilter(HUmu);
  }
  Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
  pickCheckerboard(Even,UmuEven,Umu);
  pickCheckerboard(Odd ,UmuOdd,Umu);
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@ -4,12 +4,13 @@ Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonFermion.cc
-Copyright (C) 2015
+Copyright (C) 2022
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Fabian Joswig <fabian.joswig@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@ -599,11 +600,47 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
                                                   Current curr_type,
                                                   unsigned int mu)
 {
  if(curr_type != Current::Vector)
  {
    std::cout << GridLogError << "Only the conserved vector current is implemented so far." << std::endl;
    exit(1);
  }
  Gamma g5(Gamma::Algebra::Gamma5);
  conformable(_grid, q_in_1.Grid());
  conformable(_grid, q_in_2.Grid());
  conformable(_grid, q_out.Grid());
-  assert(0);
+  auto UGrid= this->GaugeGrid();
  PropagatorField tmp_shifted(UGrid);
  PropagatorField g5Lg5(UGrid);
  PropagatorField R(UGrid);
  PropagatorField gmuR(UGrid);
    Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT,
  };
  Gamma gmu=Gamma(Gmu[mu]);
  g5Lg5=g5*q_in_1*g5;
  tmp_shifted=Cshift(q_in_2,mu,1);
  Impl::multLinkField(R,this->Umu,tmp_shifted,mu);
  gmuR=gmu*R;
  q_out=adj(g5Lg5)*R;
  q_out-=adj(g5Lg5)*gmuR;
  tmp_shifted=Cshift(q_in_1,mu,1);
  Impl::multLinkField(g5Lg5,this->Umu,tmp_shifted,mu);
  g5Lg5=g5*g5Lg5*g5;
  R=q_in_2;
  gmuR=gmu*R;
  q_out-=adj(g5Lg5)*R;
  q_out-=adj(g5Lg5)*gmuR;
 }
@ -617,9 +654,51 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
                                              unsigned int tmax,
 					      ComplexField &lattice_cmplx)
 {
  if(curr_type != Current::Vector)
  {
    std::cout << GridLogError << "Only the conserved vector current is implemented so far." << std::endl;
    exit(1);
  }
  int tshift = (mu == Nd-1) ? 1 : 0;
  unsigned int LLt    = GridDefaultLatt()[Tp];
  conformable(_grid, q_in.Grid());
  conformable(_grid, q_out.Grid());
-  assert(0);
+  auto UGrid= this->GaugeGrid();
  PropagatorField tmp(UGrid);
  PropagatorField Utmp(UGrid);
  PropagatorField L(UGrid);
  PropagatorField zz (UGrid);
  zz=Zero();
  LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
    Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT,
  };
  Gamma gmu=Gamma(Gmu[mu]);
  tmp = Cshift(q_in,mu,1);
  Impl::multLinkField(Utmp,this->Umu,tmp,mu);
  tmp = ( Utmp*lattice_cmplx - gmu*Utmp*lattice_cmplx ); // Forward hop
  tmp = where((lcoor>=tmin),tmp,zz); // Mask the time
  q_out = where((lcoor<=tmax),tmp,zz); // Position of current complicated
  tmp = q_in *lattice_cmplx;
  tmp = Cshift(tmp,mu,-1);
  Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
  tmp = -( Utmp + gmu*Utmp );
  // Mask the time
  if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
    unsigned int t0 = 0;
    tmp = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
  } else {
    tmp = where((lcoor>=tmin+tshift),tmp,zz);
  }
  q_out+= where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
 }
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@ -498,6 +498,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDag);     return;}
 #endif
     acceleratorFenceComputeStream();
   } else if( interior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt);    return;}
@ -505,11 +506,13 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagInt);     return;}
 #endif
   } else if( exterior ) {
     acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt);    return;}
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagExt);     return;}
 #endif
     acceleratorFenceComputeStream();
   }
   assert(0 && " Kernel optimisation case not covered ");
  }
--- a/Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master
+++ b/Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master
@ -9,6 +9,7 @@
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    Author: Mattia Bruno <mattia.bruno@cern.ch>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@ -32,10 +33,12 @@
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
 #include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
-template class CompactWilsonCloverFermion<IMPLEMENTATION>; 
+template class CompactWilsonCloverFermion<IMPLEMENTATION, CompactCloverHelpers<IMPLEMENTATION>>; 
 template class CompactWilsonCloverFermion<IMPLEMENTATION, CompactExpCloverHelpers<IMPLEMENTATION>>; 
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonCloverFermionInstantiation.cc.master
+++ b/Grid/qcd/action/fermion/instantiation/WilsonCloverFermionInstantiation.cc.master
@ -8,7 +8,8 @@
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
+    Author: Mattia Bruno <mattia.bruno@cern.ch>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
@ -31,10 +32,12 @@
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/WilsonCloverFermion.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
-template class WilsonCloverFermion<IMPLEMENTATION>; 
+template class WilsonCloverFermion<IMPLEMENTATION, CloverHelpers<IMPLEMENTATION>>; 
 template class WilsonCloverFermion<IMPLEMENTATION, ExpCloverHelpers<IMPLEMENTATION>>; 
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 // Explicit instantiation of WilsonKernels for a single implementation type.
 // NOTE(review): IMPLEMENTATION is not defined in this file; presumably the
 // per-directory "impl.h" included below defines it - confirm against
 // generate_instantiations.sh.
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 // Explicit instantiation of WilsonKernels for a single implementation type.
 // NOTE(review): IMPLEMENTATION is not defined in this file; presumably the
 // per-directory "impl.h" included below defines it - confirm against
 // generate_instantiations.sh.
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh
+++ b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh
@ -18,6 +18,10 @@ WILSON_IMPL_LIST=" \
 	   GparityWilsonImplF \
 	   GparityWilsonImplD "
 COMPACT_WILSON_IMPL_LIST=" \
 	   WilsonImplF \
 	   WilsonImplD "
 DWF_IMPL_LIST=" \
 	   WilsonImplF \
 	   WilsonImplD \
@ -40,13 +44,23 @@ EOF
 done
-CC_LIST="WilsonCloverFermionInstantiation CompactWilsonCloverFermionInstantiation WilsonFermionInstantiation WilsonKernelsInstantiation WilsonTMFermionInstantiation"
+CC_LIST="WilsonCloverFermionInstantiation WilsonFermionInstantiation WilsonKernelsInstantiation WilsonTMFermionInstantiation"
 for impl in $WILSON_IMPL_LIST
 do
 for f in $CC_LIST
 do
-  ln -f -s ../$f.cc.master $impl/$f$impl.cc 
+  ln -f -s ../$f.cc.master $impl/$f$impl.cc
 done
 done
 CC_LIST="CompactWilsonCloverFermionInstantiation"
 for impl in $COMPACT_WILSON_IMPL_LIST
 do
 for f in $CC_LIST
 do
  ln -f -s ../$f.cc.master $impl/$f$impl.cc
 done
 done
@ -63,14 +77,14 @@ for impl in $DWF_IMPL_LIST $GDWF_IMPL_LIST
 do
 for f in $CC_LIST
 do
-  ln -f -s ../$f.cc.master $impl/$f$impl.cc 
+  ln -f -s ../$f.cc.master $impl/$f$impl.cc
 done
 done
 # overwrite the .cc file in Gparity directories
 for impl in $GDWF_IMPL_LIST
 do
-  ln -f -s ../WilsonKernelsInstantiationGparity.cc.master $impl/WilsonKernelsInstantiation$impl.cc 
+  ln -f -s ../WilsonKernelsInstantiationGparity.cc.master $impl/WilsonKernelsInstantiation$impl.cc
 done
@ -84,7 +98,7 @@ for impl in $STAG_IMPL_LIST
 do
 for f in $CC_LIST
 do
-  ln -f -s ../$f.cc.master $impl/$f$impl.cc 
+  ln -f -s ../$f.cc.master $impl/$f$impl.cc
 done
 done
--- a/Grid/qcd/action/filters/DDHMCFilter.h
+++ b/Grid/qcd/action/filters/DDHMCFilter.h
@ -1,102 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./Grid/qcd/action/filters/DDHMCFilter.h
 Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 //--------------------------------------------------------------------
 #pragma once
 NAMESPACE_BEGIN(Grid);
 ////////////////////////////////////////////////////
 // DDHMC filter with sub-block size B[mu]
 ////////////////////////////////////////////////////
 // Filter that zeroes a boundary layer of a Lorentz-indexed field around
 // every sub-block boundary, direction by direction.
 //
 //   Block[mu] : sub-block extent in direction mu. A direction is skipped when
 //               Block[mu] is zero or exceeds the global lattice extent.
 //   Width     : number of slices zeroed on each side of a block boundary.
 //               Any Width >= 1 is handled; Width < 1 leaves U untouched.
 //
 // For each active direction mu, with c = (mu-coordinate mod Block[mu]):
 //   * every component of U is zeroed on the slices
 //       c = Block[mu]-Width .. Block[mu]-1   and   c = 0 .. Width-1
 //   * the mu component alone is additionally zeroed on the slice
 //       c = Block[mu]-Width-1
 template<typename GaugeField>
 struct DDHMCFilter: public MomentumFilterBase<GaugeField>
 {
   Coordinate Block;
   int Width;
   DDHMCFilter(const Coordinate &_Block,int _Width=2): Block(_Block), Width(_Width) {}
   // Apply the filter in place to U.
   void applyFilter(GaugeField &U) const override
   {
     GridBase *grid = U.Grid();
     Coordinate Global=grid->GlobalDimensions();
     GaugeField zzz(grid); zzz = Zero();
     LatticeInteger coor(grid);
     auto zzz_mu = PeekIndex<LorentzIndex>(zzz,0);
     ////////////////////////////////////////////////////
     // Zero BDY layers
     ////////////////////////////////////////////////////
     std::cout<<GridLogMessage<<" DDHMC Force Filter Block "<<Block<<" width " <<Width<<std::endl;
     // A non-positive width has no boundary layer to zero.
     if ( Width < 1 ) return;
     const Integer W = Width;
     for(int mu=0;mu<Nd;mu++) {
       Integer B1 = Block[mu];
       if ( B1 && (B1 <= Global[mu]) ) {
         LatticeCoordinate(coor,mu);
         // Coordinate within the block; invariant for all the masks below.
         LatticeInteger in_block = mod(coor,B1);
         ////////////////////////////////
         // OmegaBar - zero all links contained in the Width slices either
         // side of the block boundary ...
         ////////////////////////////////
         for(Integer k=0;k<W;k++) {
           U = where(in_block==Integer(B1-1-k),zzz,U);
           U = where(in_block==Integer(k)     ,zzz,U);
         }
         ////////////////////////////////
         // ... and the mu links on the slice just below that layer
         // (the links connecting to Omega)
         ////////////////////////////////
         auto U_mu = PeekIndex<LorentzIndex>(U,mu);
         U_mu = where(in_block==Integer(B1-1-W),zzz_mu,U_mu);
         PokeIndex<LorentzIndex>(U, U_mu, mu);
       }
     }
   }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/filters/DirichletFilter.h
+++ b/Grid/qcd/action/filters/DirichletFilter.h
@ -1,71 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/hmc/integrators/DirichletFilter.h
 Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 //--------------------------------------------------------------------
 #pragma once
 NAMESPACE_BEGIN(Grid);
 // Filter that zeroes, for each direction mu, the mu component of the field on
 // the last slice of every block (mu-coordinate mod Block[mu] == Block[mu]-1).
 // A direction is left untouched when Block[mu] is zero or is not smaller
 // than the global lattice extent in that direction.
 template<typename MomentaField>
 struct DirichletFilter: public MomentumFilterBase<MomentaField>
 {
   typedef typename MomentaField::vector_type vector_type; //SIMD-vectorized complex type
   typedef typename MomentaField::scalar_type scalar_type; //scalar complex type
   typedef iScalar<iScalar<iScalar<vector_type> > >            ScalarType; //complex phase for each site
   Coordinate Block;
   DirichletFilter(const Coordinate &_Block): Block(_Block){}
   // Apply the filter in place to P.
   void applyFilter(MomentaField &P) const override
   {
     typedef decltype(PeekIndex<LorentzIndex>(P, 0)) LinkLattice;
     GridBase *lattice = P.Grid();
     ////////////////////////////////////////////////////
     // Zero strictly links crossing between domains
     ////////////////////////////////////////////////////
     LatticeInteger site(lattice);
     LinkLattice zeroLink(lattice);
     zeroLink = Zero();
     for(int dir=0; dir<Nd; dir++) {
       const bool blocked = (Block[dir]) && (Block[dir] < lattice->GlobalDimensions()[dir] );
       if ( !blocked ) continue;
       // If costly could provide Grid earlier and precompute masks
       std::cout << " Dirichlet in mu="<<dir<<std::endl;
       LatticeCoordinate(site,dir);
       auto link = PeekIndex<LorentzIndex>(P, dir);
       link = where(mod(site,Block[dir])==Integer(Block[dir]-1),zeroLink,link);
       PokeIndex<LorentzIndex>(P, link, dir);
     }
   }
 };
--- a/Grid/qcd/action/gauge/Photon.h
+++ b/Grid/qcd/action/gauge/Photon.h
@ -49,7 +49,7 @@ NAMESPACE_BEGIN(Grid);
    typedef Lattice<SiteLink>  LinkField;
    typedef Lattice<SiteField> Field;
-    typedef Field              ComplexField;
+    typedef LinkField          ComplexField;
  };
  typedef QedGImpl<vComplex> QedGImplR;
--- a/Grid/qcd/action/pseudofermion/Bounds.h
+++ b/Grid/qcd/action/pseudofermion/Bounds.h
@ -13,31 +13,6 @@ NAMESPACE_BEGIN(Grid);
      std::cout << GridLogMessage << "Pseudofermion action lamda_max "<<lambda_max<<"( bound "<<hi<<")"<<std::endl;
      assert( (lambda_max < hi) && " High Bounds Check on operator failed" );
    }
     // Bounds check: applies an order-1000 Chebyshev object constructed on
     // [lo,hi] to a copy of GaussNoise through HermOp, prints the squared
     // norms before and after together with their ratio, and asserts that
     // the ratio is below 1.
     template<class Field> void ChebyBoundsCheck(LinearOperatorBase<Field> &HermOp,
                                                 Field &GaussNoise,
                                                 RealD lo,RealD hi)
     {
       const int chebyOrder = 1000;
       Chebyshev<Field> filter(lo,hi,chebyOrder);
       GridBase *grid = GaussNoise.Grid();
       Field input(grid);
       Field output(grid);
       input = GaussNoise;
       const RealD normIn = norm2(input);
       filter(HermOp,input,output);
       const RealD normOut = norm2(output);
       const RealD ratio = normOut/normIn;
       std::cout << "************************* "<<std::endl;
       std::cout << " noise                    = "<<normIn<<std::endl;
       std::cout << " Cheb x noise             = "<<normOut<<std::endl;
       std::cout << " Ratio                    = "<<ratio<<std::endl;
       std::cout << "************************* "<<std::endl;
       assert( (ratio<1.0) && " ChebyBoundsCheck ");
     }
    template<class Field> void InverseSqrtBoundsCheck(int MaxIter,double tol,
 						       LinearOperatorBase<Field> &HermOp,
--- a/Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion.h
+++ b/Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion.h
@ -1,163 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/DomainDecomposedTwoFlavourBoundaryBoson.h
    Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////
 // Two flavour boundary boson: S = phi^dag Pdag P phi, with P == NumOp
 ///////////////////////////////////////
 template<class ImplD,class ImplF>
 class DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion : public Action<typename ImplD::GaugeField> {
 public:
   INHERIT_IMPL_TYPES(ImplD);
 private:
   SchurFactoredFermionOperator<ImplD,ImplF> & NumOp;// the basic operator
   RealD InnerStoppingCondition;
   RealD ActionStoppingCondition;
   RealD DerivativeStoppingCondition;
   FermionField Phi; // the pseudo fermion field for this trajectory
 public:
   // _NumOp         : operator P; held by reference
   // _DerivativeTol : written to NumOp.tol in deriv()
   // _ActionTol     : written to NumOp.tol in refresh() and S()
   // _InnerTol      : written to NumOp.tolinner in every entry point
   // Initialisers are listed in member declaration order, which is the order
   // in which they actually run.
   DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion(SchurFactoredFermionOperator<ImplD,ImplF>  &_NumOp,RealD _DerivativeTol, RealD _ActionTol, RealD _InnerTol=1.0e-6)
     : NumOp(_NumOp),
       InnerStoppingCondition(_InnerTol),
       ActionStoppingCondition(_ActionTol),
       DerivativeStoppingCondition(_DerivativeTol),
       Phi(_NumOp.FermionGrid()) {}
   virtual std::string action_name(){return "DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion";}
   // No parameters are logged: returns an empty string.
   virtual std::string LogParameters(){
     std::stringstream sstream;
     return sstream.str();
   }
   // Draw a new pseudofermion field Phi for gauge field U. sRNG is not used.
   virtual void refresh(const GaugeField &U, GridSerialRNG& sRNG, GridParallelRNG& pRNG)
   {
     // P(phi) = e^{- phi^dag P^dag P phi}
     //
     // NumOp == P
     //
     // Take phi = P^{-1} eta  ; eta = P Phi
     //
     // P(eta) = e^{- eta^dag eta}
     //
     // e^{x^2/2 sig^2} => sig^2 = 0.5.
     //
     // So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707....
     //
     RealD scale = std::sqrt(0.5);
     NumOp.tolinner=InnerStoppingCondition;
     NumOp.tol=ActionStoppingCondition;
     NumOp.ImportGauge(U);
     FermionField eta(NumOp.FermionGrid());
     gaussian(pRNG,eta);    eta=eta*scale;
     NumOp.ProjectBoundaryBar(eta);
     //DumpSliceNorm("eta",eta);
     NumOp.RInv(eta,Phi);
     //DumpSliceNorm("Phi",Phi);
   }
   //////////////////////////////////////////////////////
   // S = phi^dag Pdag P phi
   //////////////////////////////////////////////////////
   virtual RealD S(const GaugeField &U) {
     NumOp.tolinner=InnerStoppingCondition;
     NumOp.tol=ActionStoppingCondition;
     NumOp.ImportGauge(U);
     FermionField Y(NumOp.FermionGrid());
     NumOp.R(Phi,Y);
     RealD action = norm2(Y);
     return action;
   }
   // Accumulate the four MDeriv contributions of the action into dSdU and
   // flip the overall sign.
   virtual void deriv(const GaugeField &U,GaugeField & dSdU)
   {
     NumOp.tolinner=InnerStoppingCondition;
     NumOp.tol=DerivativeStoppingCondition;
     NumOp.ImportGauge(U);
     GridBase *fgrid = NumOp.FermionGrid();
     GridBase *ugrid = NumOp.GaugeGrid();
     FermionField  X(fgrid);
     FermionField  Y(fgrid);
     FermionField  tmp(fgrid);
     GaugeField   force(ugrid);
     FermionField DobiDdbPhi(fgrid);      // Vector A in my notes
     FermionField DoiDdDobiDdbPhi(fgrid); // Vector B in my notes
     FermionField DoidP_Phi(fgrid);    // Vector E in my notes
     FermionField DobidDddDoidP_Phi(fgrid);    // Vector F in my notes
     FermionField P_Phi(fgrid);
     // P term
     NumOp.dBoundaryBar(Phi,tmp);
     NumOp.dOmegaBarInv(tmp,DobiDdbPhi);        // Vector A
     NumOp.dBoundary(DobiDdbPhi,tmp);
     NumOp.dOmegaInv(tmp,DoiDdDobiDdbPhi);      // Vector B
     P_Phi  = Phi - DoiDdDobiDdbPhi;
     NumOp.ProjectBoundaryBar(P_Phi);
     // P^dag P term
     NumOp.dOmegaDagInv(P_Phi,DoidP_Phi); // Vector E
     NumOp.dBoundaryDag(DoidP_Phi,tmp);
     NumOp.dOmegaBarDagInv(tmp,DobidDddDoidP_Phi);   // Vector F
     // NOTE(review): tmp is not read again in this function after the next
     // call - confirm whether the call is still required.
     NumOp.dBoundaryBarDag(DobidDddDoidP_Phi,tmp);
     X = DobiDdbPhi;
     Y = DobidDddDoidP_Phi;
     NumOp.DirichletFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=force;
     NumOp.DirichletFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
     X = DoiDdDobiDdbPhi;
     Y = DoidP_Phi;
     NumOp.DirichletFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=dSdU+force;
     NumOp.DirichletFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
     dSdU *= -1.0;
   }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourPseudoFermion.h
+++ b/Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourPseudoFermion.h
@ -1,158 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/DomainDecomposedTwoFlavourBoundary.h
    Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////
 // Two flavour boundary: S = phi^dag Rdag^-1 R^-1 phi, with R == DenOp
 ///////////////////////////////////////
 template<class ImplD,class ImplF>
 class DomainDecomposedBoundaryTwoFlavourPseudoFermion : public Action<typename ImplD::GaugeField> {
 public:
   INHERIT_IMPL_TYPES(ImplD);
 private:
   SchurFactoredFermionOperator<ImplD,ImplF> & DenOp;// the basic operator
   RealD ActionStoppingCondition;
   RealD DerivativeStoppingCondition;
   RealD InnerStoppingCondition;
   FermionField Phi; // the pseudo fermion field for this trajectory
   RealD refresh_action; // norm2(eta) recorded by the last refresh()
 public:
   // _DenOp         : operator R; held by reference
   // _DerivativeTol : written to DenOp.tol in deriv()
   // _ActionTol     : written to DenOp.tol in refresh() and S()
   // _InnerTol      : written to DenOp.tolinner in every entry point
   // Initialisers are listed in member declaration order, which is the order
   // in which they actually run; refresh_action starts at zero so that it is
   // never left indeterminate before the first refresh().
   DomainDecomposedBoundaryTwoFlavourPseudoFermion(SchurFactoredFermionOperator<ImplD,ImplF>  &_DenOp,RealD _DerivativeTol, RealD _ActionTol, RealD _InnerTol = 1.0e-6 )
     : DenOp(_DenOp),
       ActionStoppingCondition(_ActionTol),
       DerivativeStoppingCondition(_DerivativeTol),
       InnerStoppingCondition(_InnerTol),
       Phi(_DenOp.FermionGrid()),
       refresh_action(0.0) {}
   virtual std::string action_name(){return "DomainDecomposedBoundaryTwoFlavourPseudoFermion";}
   // No parameters are logged: returns an empty string.
   virtual std::string LogParameters(){
     std::stringstream sstream;
     return sstream.str();
   }
   // Draw a new pseudofermion field Phi for gauge field U. sRNG is not used.
   virtual void refresh(const GaugeField &U, GridSerialRNG& sRNG, GridParallelRNG& pRNG)
   {
     // P(phi) = e^{- phi^dag Rdag^-1 R^-1 phi}
     //
     // DenOp == R
     //
     // Take phi = R eta  ; eta = R^-1 Phi
     //
     // P(eta) = e^{- eta^dag eta}
     //
     // e^{x^2/2 sig^2} => sig^2 = 0.5.
     //
     // So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707....
     //
     RealD scale = std::sqrt(0.5);
     DenOp.tolinner=InnerStoppingCondition;
     DenOp.tol     =ActionStoppingCondition;
     DenOp.ImportGauge(U);
     FermionField eta(DenOp.FermionGrid());
     gaussian(pRNG,eta);    eta=eta*scale;
     DenOp.ProjectBoundaryBar(eta);
     DenOp.R(eta,Phi);
     //DumpSliceNorm("Phi",Phi);
     refresh_action = norm2(eta);
   }
   //////////////////////////////////////////////////////
   // S = phi^dag Rdag^-1 R^-1 phi
   //////////////////////////////////////////////////////
   virtual RealD S(const GaugeField &U) {
     DenOp.tolinner=InnerStoppingCondition;
     DenOp.tol=ActionStoppingCondition;
     DenOp.ImportGauge(U);
     FermionField X(DenOp.FermionGrid());
     DenOp.RInv(Phi,X);
     RealD action = norm2(X);
     return action;
   }
   // Accumulate the two MDeriv contributions of the action into dSdU and
   // flip the overall sign.
   virtual void deriv(const GaugeField &U,GaugeField & dSdU)
   {
     DenOp.tolinner=InnerStoppingCondition;
     DenOp.tol=DerivativeStoppingCondition;
     DenOp.ImportGauge(U);
     GridBase *fgrid = DenOp.FermionGrid();
     GridBase *ugrid = DenOp.GaugeGrid();
     FermionField  X(fgrid);
     FermionField  Y(fgrid);
     FermionField  tmp(fgrid);
     GaugeField   force(ugrid);
     FermionField DiDdb_Phi(fgrid);      // Vector C in my notes
     FermionField DidRinv_Phi(fgrid);    // Vector D in my notes
     FermionField Rinv_Phi(fgrid);
 //   FermionField RinvDagRinv_Phi(fgrid);
 //   FermionField DdbdDidRinv_Phi(fgrid);
     // R^-1 term
     DenOp.dBoundaryBar(Phi,tmp);
     DenOp.Dinverse(tmp,DiDdb_Phi);            // Vector C
     Rinv_Phi = Phi - DiDdb_Phi;
     DenOp.ProjectBoundaryBar(Rinv_Phi);
     // R^-dagger R^-1 term
     DenOp.DinverseDag(Rinv_Phi,DidRinv_Phi); // Vector D
 /*
     DenOp.dBoundaryBarDag(DidRinv_Phi,DdbdDidRinv_Phi);
     RinvDagRinv_Phi = Rinv_Phi - DdbdDidRinv_Phi;
     DenOp.ProjectBoundaryBar(RinvDagRinv_Phi);
 */
     X = DiDdb_Phi;
     Y = DidRinv_Phi;
     DenOp.PeriodicFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=force;
     DenOp.PeriodicFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
     DumpSliceNorm("force",dSdU);
     dSdU *= -1.0;
   }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion.h
+++ b/Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion.h
@ -1,237 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion.h
    Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////
 // Two flavour ratio
 ///////////////////////////////////////
 // Two flavour boundary ratio action:
 //   S = phi^dag Pdag Rdag^-1 R^-1 P phi,  with P == NumOp and R == DenOp.
 template<class ImplD,class ImplF>
 class DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion : public Action<typename ImplD::GaugeField> {
 public:
   INHERIT_IMPL_TYPES(ImplD);
 private:
   SchurFactoredFermionOperator<ImplD,ImplF> & NumOp;// the basic operator
   SchurFactoredFermionOperator<ImplD,ImplF> & DenOp;// the basic operator
   RealD InnerStoppingCondition;
   RealD ActionStoppingCondition;
   RealD DerivativeStoppingCondition;
   FermionField Phi; // the pseudo fermion field for this trajectory
 public:
   // _NumOp, _DenOp : operators P and R; held by reference
   // _DerivativeTol : written to NumOp.tol / DenOp.tol in deriv()
   // _ActionTol     : written to NumOp.tol / DenOp.tol in refresh() and S()
   // _InnerTol      : written to NumOp.tolinner / DenOp.tolinner in every entry point
   // Initialisers are listed in member declaration order, which is the order
   // in which they actually run.
   DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion(SchurFactoredFermionOperator<ImplD,ImplF>  &_NumOp,
                                                        SchurFactoredFermionOperator<ImplD,ImplF>  &_DenOp,
                                                        RealD _DerivativeTol, RealD _ActionTol, RealD _InnerTol=1.0e-6)
     : NumOp(_NumOp), DenOp(_DenOp),
       InnerStoppingCondition(_InnerTol),
       ActionStoppingCondition(_ActionTol),
       DerivativeStoppingCondition(_DerivativeTol),
       Phi(_NumOp.PeriodicFermOpD.FermionGrid())
   {}
   virtual std::string action_name(){return "DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion";}
   // No parameters are logged: returns an empty string.
   virtual std::string LogParameters(){
     std::stringstream sstream;
     return sstream.str();
   }
   // Draw a new pseudofermion field Phi for gauge field U. sRNG is not used.
   virtual void refresh(const GaugeField &U, GridSerialRNG& sRNG, GridParallelRNG& pRNG)
   {
     NumOp.ImportGauge(U);
     DenOp.ImportGauge(U);
     FermionField eta(NumOp.PeriodicFermOpD.FermionGrid());
     FermionField tmp(NumOp.PeriodicFermOpD.FermionGrid());
     // P(phi) = e^{- phi^dag P^dag Rdag^-1 R^-1 P phi}
     //
     // NumOp == P
     // DenOp == R
     //
     // Take phi = P^{-1} R eta  ; eta = R^-1 P Phi
     //
     // P(eta) = e^{- eta^dag eta}
     //
     // e^{x^2/2 sig^2} => sig^2 = 0.5.
     //
     // So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707....
     //
     RealD scale = std::sqrt(0.5);
     gaussian(pRNG,eta);    eta=eta*scale;
     NumOp.ProjectBoundaryBar(eta);
     NumOp.tolinner=InnerStoppingCondition;
     DenOp.tolinner=InnerStoppingCondition;
     DenOp.tol = ActionStoppingCondition;
     NumOp.tol = ActionStoppingCondition;
     DenOp.R(eta,tmp);
     NumOp.RInv(tmp,Phi);
     DumpSliceNorm("Phi",Phi);
   }
   //////////////////////////////////////////////////////
   // S = phi^dag Pdag Rdag^-1 R^-1 P phi
   //////////////////////////////////////////////////////
   virtual RealD S(const GaugeField &U) {
     NumOp.ImportGauge(U);
     DenOp.ImportGauge(U);
     FermionField X(NumOp.PeriodicFermOpD.FermionGrid());
     FermionField Y(NumOp.PeriodicFermOpD.FermionGrid());
     NumOp.tolinner=InnerStoppingCondition;
     DenOp.tolinner=InnerStoppingCondition;
     DenOp.tol = ActionStoppingCondition;
     NumOp.tol = ActionStoppingCondition;
     NumOp.R(Phi,Y);
     DenOp.RInv(Y,X);
     RealD action = norm2(X);
     //    std::cout << " DD boundary action is " <<action<<std::endl;
     return action;
   }
   // Accumulate the six MDeriv contributions of the action (four through
   // NumOp.DirichletFermOpD, two through DenOp.PeriodicFermOpD) into dSdU and
   // flip the overall sign.
   virtual void deriv(const GaugeField &U,GaugeField & dSdU)
   {
     NumOp.ImportGauge(U);
     DenOp.ImportGauge(U);
     GridBase *fgrid = NumOp.PeriodicFermOpD.FermionGrid();
     GridBase *ugrid = NumOp.PeriodicFermOpD.GaugeGrid();
     FermionField  X(fgrid);
     FermionField  Y(fgrid);
     FermionField  tmp(fgrid);
     GaugeField   force(ugrid);
     FermionField DobiDdbPhi(fgrid);      // Vector A in my notes
     FermionField DoiDdDobiDdbPhi(fgrid); // Vector B in my notes
     FermionField DiDdbP_Phi(fgrid);      // Vector C in my notes
     FermionField DidRinvP_Phi(fgrid);    // Vector D in my notes
     FermionField DdbdDidRinvP_Phi(fgrid);
     FermionField DoidRinvDagRinvP_Phi(fgrid);    // Vector E in my notes
     FermionField DobidDddDoidRinvDagRinvP_Phi(fgrid);    // Vector F in my notes
     FermionField P_Phi(fgrid);
     FermionField RinvP_Phi(fgrid);
     FermionField RinvDagRinvP_Phi(fgrid);
     FermionField PdagRinvDagRinvP_Phi(fgrid);
     //    RealD action = S(U);
     NumOp.tolinner=InnerStoppingCondition;
     DenOp.tolinner=InnerStoppingCondition;
     DenOp.tol = DerivativeStoppingCondition;
     NumOp.tol = DerivativeStoppingCondition;
     // P term
     NumOp.dBoundaryBar(Phi,tmp);
     NumOp.dOmegaBarInv(tmp,DobiDdbPhi);        // Vector A
     NumOp.dBoundary(DobiDdbPhi,tmp);
     NumOp.dOmegaInv(tmp,DoiDdDobiDdbPhi);      // Vector B
     P_Phi  = Phi - DoiDdDobiDdbPhi;
     NumOp.ProjectBoundaryBar(P_Phi);
     // R^-1 P term
     DenOp.dBoundaryBar(P_Phi,tmp);
     DenOp.Dinverse(tmp,DiDdbP_Phi);            // Vector C
     RinvP_Phi = P_Phi - DiDdbP_Phi;
     DenOp.ProjectBoundaryBar(RinvP_Phi); // Correct to here
     // R^-dagger R^-1 P term
     DenOp.DinverseDag(RinvP_Phi,DidRinvP_Phi); // Vector D
     DenOp.dBoundaryBarDag(DidRinvP_Phi,DdbdDidRinvP_Phi);
     RinvDagRinvP_Phi = RinvP_Phi - DdbdDidRinvP_Phi;
     DenOp.ProjectBoundaryBar(RinvDagRinvP_Phi);
     // P^dag R^-dagger R^-1 P term
     NumOp.dOmegaDagInv(RinvDagRinvP_Phi,DoidRinvDagRinvP_Phi); // Vector E
     NumOp.dBoundaryDag(DoidRinvDagRinvP_Phi,tmp);
     NumOp.dOmegaBarDagInv(tmp,DobidDddDoidRinvDagRinvP_Phi);   // Vector F
     NumOp.dBoundaryBarDag(DobidDddDoidRinvDagRinvP_Phi,tmp);
     // NOTE(review): PdagRinvDagRinvP_Phi is only read by the commented-out
     // consistency checks below - confirm whether it is still needed.
     PdagRinvDagRinvP_Phi = RinvDagRinvP_Phi- tmp;
     NumOp.ProjectBoundaryBar(PdagRinvDagRinvP_Phi);
     /*
     std::cout << "S eval  "<< action << std::endl;
     std::cout << "S - IP1 "<< innerProduct(Phi,PdagRinvDagRinvP_Phi) << std::endl;
     std::cout << "S - IP2 "<< norm2(RinvP_Phi) << std::endl;
     NumOp.R(Phi,tmp);
     tmp = tmp - P_Phi;
     std::cout << "diff1 "<<norm2(tmp) <<std::endl;
     DenOp.RInv(P_Phi,tmp);
     tmp = tmp - RinvP_Phi;
     std::cout << "diff2 "<<norm2(tmp) <<std::endl;
     DenOp.RDagInv(RinvP_Phi,tmp);
     tmp  = tmp - RinvDagRinvP_Phi;
     std::cout << "diff3 "<<norm2(tmp) <<std::endl;
     DenOp.RDag(RinvDagRinvP_Phi,tmp);
     tmp  = tmp - PdagRinvDagRinvP_Phi;
     std::cout << "diff4 "<<norm2(tmp) <<std::endl;
     */
     dSdU=Zero();
     X = DobiDdbPhi;
     Y = DobidDddDoidRinvDagRinvP_Phi;
     NumOp.DirichletFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=dSdU+force;
     NumOp.DirichletFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
     X = DoiDdDobiDdbPhi;
     Y = DoidRinvDagRinvP_Phi;
     NumOp.DirichletFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=dSdU+force;
     NumOp.DirichletFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
     X = DiDdbP_Phi;
     Y = DidRinvP_Phi;
     DenOp.PeriodicFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=dSdU+force;
     DenOp.PeriodicFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
     dSdU *= -1.0;
   }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
@ -59,7 +59,6 @@ NAMESPACE_BEGIN(Grid);
      FermionOperator<Impl> & DenOp;// the basic operator
      FermionField PhiEven; // the pseudo fermion field for this trajectory
      FermionField PhiOdd; // the pseudo fermion field for this trajectory
      FermionField Noise; // spare noise field for bounds check
    public:
@ -71,7 +70,6 @@ NAMESPACE_BEGIN(Grid);
      DenOp(_DenOp), 
      PhiOdd (_NumOp.FermionRedBlackGrid()),
      PhiEven(_NumOp.FermionRedBlackGrid()),
      Noise(_NumOp.FermionRedBlackGrid()),
      param(p) 
      {
 	AlgRemez remez(param.lo,param.hi,param.precision);
@ -89,11 +87,7 @@ NAMESPACE_BEGIN(Grid);
 	PowerNegQuarter.Init(remez,param.tolerance,true);
      };
-      virtual std::string action_name(){
+      virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}
 	std::stringstream sstream;
 	sstream<< "OneFlavourEvenOddRatioRationalPseudoFermionAction det("<< DenOp.Mass() << ") / det("<<NumOp.Mass()<<")";
 	return sstream.str();
      }
      virtual std::string LogParameters(){
 	std::stringstream sstream;
@ -134,7 +128,6 @@ NAMESPACE_BEGIN(Grid);
 	pickCheckerboard(Even,etaEven,eta);
 	pickCheckerboard(Odd,etaOdd,eta);
 	Noise = etaOdd;
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
@ -182,10 +175,9 @@ NAMESPACE_BEGIN(Grid);
        grid->Broadcast(0,r);
        if ( (r%param.BoundsCheckFreq)==0 ) { 
 	  FermionField gauss(NumOp.FermionRedBlackGrid());
-	  gauss = Noise;
+	  gauss = PhiOdd;
 	  HighBoundCheck(MdagM,gauss,param.hi);
 	  InverseSqrtBoundsCheck(param.MaxIter,param.tolerance*100,MdagM,gauss,PowerNegHalf);
 	  ChebyBoundsCheck(MdagM,Noise,param.lo,param.hi);
 	}
 	//  Phidag VdagV^1/4 MdagM^-1/4  MdagM^-1/4 VdagV^1/4 Phi
--- a/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h
@ -49,12 +49,10 @@ NAMESPACE_BEGIN(Grid);
      Params param;
      MultiShiftFunction PowerHalf   ;
      MultiShiftFunction PowerQuarter;
      MultiShiftFunction PowerNegHalf;
      MultiShiftFunction PowerQuarter;
      MultiShiftFunction PowerNegQuarter;
      MultiShiftFunction MDPowerQuarter;
      MultiShiftFunction MDPowerNegHalf;
    private:
      FermionOperator<Impl> & NumOp;// the basic operator
@ -81,10 +79,6 @@ NAMESPACE_BEGIN(Grid);
 	remez.generateApprox(param.degree,1,4);
   	PowerQuarter.Init(remez,param.tolerance,false);
 	PowerNegQuarter.Init(remez,param.tolerance,true);
 	// Derive solves different tol
   	MDPowerQuarter.Init(remez,param.mdtolerance,false);
 	MDPowerNegHalf.Init(remez,param.mdtolerance,true);
      };
      virtual std::string action_name(){return "OneFlavourRatioRationalPseudoFermionAction";}
@ -210,8 +204,8 @@ NAMESPACE_BEGIN(Grid);
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
-	const int n_f  = MDPowerNegHalf.poles.size();
+	const int n_f  = PowerNegHalf.poles.size();
-	const int n_pv = MDPowerQuarter.poles.size();
+	const int n_pv = PowerQuarter.poles.size();
 	std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionGrid());
 	std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionGrid());
@ -230,8 +224,8 @@ NAMESPACE_BEGIN(Grid);
 	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagM(DenOp);
 	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> VdagV(NumOp);
-	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,MDPowerQuarter);
+	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
-	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,MDPowerNegHalf);
+	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegHalf);
 	msCG_V(VdagV,Phi,MpvPhi_k,MpvPhi);
 	msCG_M(MdagM,MpvPhi,MfMpvPhi_k,MfMpvPhi);
@ -250,7 +244,7 @@ NAMESPACE_BEGIN(Grid);
 	//(1)
 	for(int k=0;k<n_f;k++){
-	  ak = MDPowerNegHalf.residues[k];
+	  ak = PowerNegHalf.residues[k];
 	  DenOp.M(MfMpvPhi_k[k],Y);
 	  DenOp.MDeriv(tmp , MfMpvPhi_k[k], Y,DaggerYes );  dSdU=dSdU+ak*tmp;
 	  DenOp.MDeriv(tmp , Y, MfMpvPhi_k[k], DaggerNo );  dSdU=dSdU+ak*tmp;
@ -260,7 +254,7 @@ NAMESPACE_BEGIN(Grid);
 	//(3)
 	for(int k=0;k<n_pv;k++){
-          ak = MDPowerQuarter.residues[k];
+          ak = PowerQuarter.residues[k];
 	  NumOp.M(MpvPhi_k[k],Y);
 	  NumOp.MDeriv(tmp,MpvMfMpvPhi_k[k],Y,DaggerYes); dSdU=dSdU+ak*tmp;
--- a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
+++ b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
@ -75,15 +75,11 @@ NAMESPACE_BEGIN(Grid);
          conformable(_NumOp.GaugeRedBlackGrid(), _DenOp.GaugeRedBlackGrid());
        };
-      virtual std::string action_name(){
+      virtual std::string action_name(){return "TwoFlavourEvenOddRatioPseudoFermionAction";}
 	std::stringstream sstream;
 	sstream<<"TwoFlavourEvenOddRatioPseudoFermionAction det("<<DenOp.Mass()<<") / det("<<NumOp.Mass()<<")";
 	return sstream.str();
      }
      virtual std::string LogParameters(){
 	std::stringstream sstream;
-	sstream<< GridLogMessage << "["<<action_name()<<"] -- No further parameters "<<std::endl;
+	sstream << GridLogMessage << "["<<action_name()<<"] has no parameters" << std::endl;
 	return sstream.str();
      } 
--- a/Grid/qcd/action/pseudofermion/TwoFlavourRatioEO4DPseudoFermion.h
+++ b/Grid/qcd/action/pseudofermion/TwoFlavourRatioEO4DPseudoFermion.h
@ -1,203 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/TwoFlavourRatio.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////
 // Two flavour ratio
 ///////////////////////////////////////
 // Two-flavour ratio pseudofermion action whose pseudofermion is held as a
 // four-dimensional field (phi4).  Around every operator application the field
 // is imported onto the fermion grid of NumOp and the result exported back.
 // Notation used in the comments below:  NumOp == V,  DenOp == M.
 template<class Impl>
 class TwoFlavourRatioEO4DPseudoFermionAction : public Action<typename Impl::GaugeField> {
 public:
  INHERIT_IMPL_TYPES(Impl);
 private:
  using FermOp = FermionOperator<Impl>;
  FermOp & NumOp; // V : numerator operator
  FermOp & DenOp; // M : denominator operator
  OperatorFunction<FermionField> &DerivativeSolver;    // used for the M^-1 solve in deriv()
  OperatorFunction<FermionField> &DerivativeDagSolver; // used for the adjoint solve in deriv()
  OperatorFunction<FermionField> &ActionSolver;        // used in S()
  OperatorFunction<FermionField> &HeatbathSolver;      // used in refresh()
  FermionField phi4; // pseudofermion for the current trajectory (set by refresh())
 public:
  // Short form: DS is used for both derivative solves, AS for both the
  // action and the heatbath solves.
  TwoFlavourRatioEO4DPseudoFermionAction(FermionOperator<Impl>  &_NumOp,
                                         FermionOperator<Impl>  &_DenOp,
                                         OperatorFunction<FermionField> & DS,
                                         OperatorFunction<FermionField> & AS )
    : TwoFlavourRatioEO4DPseudoFermionAction(_NumOp,_DenOp, DS,DS,AS,AS)
  {};
  // Full form: one solver per task.  phi4 is allocated on _NumOp.GaugeGrid().
  TwoFlavourRatioEO4DPseudoFermionAction(FermionOperator<Impl>  &_NumOp,
                                         FermionOperator<Impl>  &_DenOp,
                                         OperatorFunction<FermionField> & DS,
                                         OperatorFunction<FermionField> & DDS,
                                         OperatorFunction<FermionField> & AS,
                                         OperatorFunction<FermionField> & HS )
    : NumOp(_NumOp),
      DenOp(_DenOp),
      DerivativeSolver(DS),
      DerivativeDagSolver(DDS),
      ActionSolver(AS),
      HeatbathSolver(HS),
      phi4(_NumOp.GaugeGrid())
  {};
  virtual std::string action_name(){
    return "TwoFlavourRatioEO4DPseudoFermionAction";
  }
  // This action has nothing to report beyond its name.
  virtual std::string LogParameters(){
    std::stringstream msg;
    msg << GridLogMessage << "["<<action_name()<<"] has no parameters" << std::endl;
    return msg.str();
  }
  //////////////////////////////////////////////////////
  // Heatbath.
  //   P(phi) = e^{- phi^dag (V^dag M^-dag)_11 (M^-1 V)_11 phi}
  //   Take phi = (V^{-1} M)_11 eta, i.e. eta = (M^{-1} V)_11 phi, so that
  //   P(eta) = e^{- eta^dag eta}.
  //   Matching e^{-x^2/(2 sig^2)} gives sig^2 = 0.5, so the drawn field is
  //   rescaled by sqrt(0.5) = 0.707...
  //////////////////////////////////////////////////////
  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
    const RealD scale = std::sqrt(0.5);

    FermionField noise4(NumOp.GaugeGrid());
    FermionField noise5(NumOp.FermionGrid());
    FermionField Mnoise(NumOp.FermionGrid());
    FermionField phi5  (NumOp.FermionGrid());

    gaussian(pRNG,noise4);
    NumOp.ImportFourDimPseudoFermion(noise4,noise5);

    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);

    SchurRedBlackDiagMooeeSolve<FermionField> heatbathSolve(HeatbathSolver);

    DenOp.M(noise5,Mnoise);               // M eta
    heatbathSolve(NumOp,Mnoise,phi5);     // phi = V^-1 M eta
    phi5=phi5*scale;
    std::cout << GridLogMessage << "4d pf refresh "<< norm2(phi5)<<"\n";

    // Keep only the four-dimensional projection.
    NumOp.ExportFourDimPseudoFermion(phi5,phi4);
  };
  //////////////////////////////////////////////////////
  // S = phi^dag (V^dag M^-dag)_11  (M^-1 V)_11 phi
  //////////////////////////////////////////////////////
  virtual RealD S(const GaugeField &U) {
    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);

    FermionField sol4(NumOp.GaugeGrid());
    FermionField Vphi(NumOp.FermionGrid());
    FermionField sol5(NumOp.FermionGrid());
    FermionField phi5(NumOp.FermionGrid());

    MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagMOp(DenOp);
    SchurRedBlackDiagMooeeSolve<FermionField> actionSolve(ActionSolver);

    NumOp.ImportFourDimPseudoFermion(phi4,phi5);
    NumOp.M(phi5,Vphi);               // V phi
    actionSolve(DenOp,Vphi,sol5);     // M^-1 V phi
    NumOp.ExportFourDimPseudoFermion(sol5,sol4);

    RealD action = norm2(sol4);
    return action;
  };
  //////////////////////////////////////////////////////
  // dS/du = 2 Re phi^dag (V^dag M^-dag)_11  (M^-1 d V)_11  phi
  //       - 2 Re phi^dag (dV^dag M^-dag)_11  (M^-1 dM M^-1 V)_11  phi
  //////////////////////////////////////////////////////
  virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);

    FermionField  spare    (NumOp.FermionGrid());
    FermionField  projected(NumOp.FermionGrid());
    FermionField  phi5     (NumOp.FermionGrid());
    FermionField  Vphi     (NumOp.FermionGrid());
    FermionField  MinvVphi (NumOp.FermionGrid());
    FermionField  proj4    (NumOp.GaugeGrid());
    FermionField  chi      (NumOp.FermionGrid());
    GaugeField    term     (NumOp.GaugeGrid());

    // Vphi     = V phi
    // MinvVphi = M^-1 V phi
    NumOp.ImportFourDimPseudoFermion(phi4,phi5);
    NumOp.M(phi5,Vphi);

    SchurRedBlackDiagMooeeSolve<FermionField> derivSolve(DerivativeSolver);
    derivSolve(DenOp,Vphi,MinvVphi);
    std::cout << GridLogMessage << "4d deriv solve "<< norm2(MinvVphi)<<"\n";

    // Export to the four-dimensional field and import it back again.
    NumOp.ExportFourDimPseudoFermion(MinvVphi,proj4);
    NumOp.ImportFourDimPseudoFermion(proj4,projected);

    // Adjoint solve applied to the projected vector.
    SchurRedBlackDiagMooeeDagSolve<FermionField> derivDagSolve(DerivativeDagSolver);
    derivDagSolve(DenOp,projected,chi);
    std::cout << GridLogMessage << "4d deriv solve dag "<< norm2(chi)<<"\n";

    // phi^dag (Vdag Mdag^-1) (M^-1 dV)  phi
    NumOp.MDeriv(term ,chi , phi5, DaggerNo );
    dSdU=term;

    // phi^dag (dVdag Mdag^-1) (M^-1 V)  phi
    NumOp.MDeriv(term , phi5, chi ,DaggerYes  );
    dSdU=dSdU+term;

    //    - 2 Re phi^dag (dV^dag M^-dag)_11  (M^-1 dM M^-1 V)_11  phi
    DenOp.MDeriv(term,chi,MinvVphi,DaggerNo);
    dSdU=dSdU-term;
    DenOp.MDeriv(term,MinvVphi,chi,DaggerYes);
    dSdU=dSdU-term;

    dSdU *= -1.0; 
    //dSdU = - Ta(dSdU);
  };
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/gparity/Gparity.h
+++ b/Grid/qcd/gparity/Gparity.h
@ -1,6 +0,0 @@
 #ifndef GRID_GPARITY_H_
 #define GRID_GPARITY_H_
 #include<Grid/qcd/gparity/GparityFlavour.h>
 #endif
--- a/Grid/qcd/gparity/GparityFlavour.cc
+++ b/Grid/qcd/gparity/GparityFlavour.cc
@ -1,34 +0,0 @@
 #include <Grid/Grid.h>
 NAMESPACE_BEGIN(Grid);
 // Definition of the static table sigma_mu: the three flavour matrices
 // SigmaX, SigmaY, SigmaZ, in that order.
 const std::array<const GparityFlavour, 3> GparityFlavour::sigma_mu = {{
    GparityFlavour(GparityFlavour::Algebra::SigmaX),
    GparityFlavour(GparityFlavour::Algebra::SigmaY),
    GparityFlavour(GparityFlavour::Algebra::SigmaZ)
    }};
 // Definition of the static table sigma_all: the six non-negated algebra
 // elements, in the order Identity, SigmaX, SigmaY, SigmaZ, ProjPlus, ProjMinus.
 const std::array<const GparityFlavour, 6> GparityFlavour::sigma_all = {{
  GparityFlavour(GparityFlavour::Algebra::Identity),
  GparityFlavour(GparityFlavour::Algebra::SigmaX),
  GparityFlavour(GparityFlavour::Algebra::SigmaY),
  GparityFlavour(GparityFlavour::Algebra::SigmaZ),
  GparityFlavour(GparityFlavour::Algebra::ProjPlus),
  GparityFlavour(GparityFlavour::Algebra::ProjMinus)
 }};
 // Definition of the static table of printable names, one per algebra element
 // (nSigma entries).  Entries appear in the order SigmaX, MinusSigmaX, ...,
 // MinusProjMinus; keep this order in step with the Algebra enum values.
 const std::array<const char *, GparityFlavour::nSigma> GparityFlavour::name = {{
    "SigmaX",
    "MinusSigmaX",
    "SigmaY",
    "MinusSigmaY",
    "SigmaZ",
    "MinusSigmaZ",
    "Identity",
    "MinusIdentity",
    "ProjPlus",
    "MinusProjPlus",
    "ProjMinus",
    "MinusProjMinus"}};
 NAMESPACE_END(Grid);
--- a/Grid/qcd/gparity/GparityFlavour.h
+++ b/Grid/qcd/gparity/GparityFlavour.h
@ -1,475 +0,0 @@
 #ifndef GRID_QCD_GPARITY_FLAVOUR_H
 #define GRID_QCD_GPARITY_FLAVOUR_H
 //Support for flavour-matrix operations acting on the G-parity flavour index
 #include <array>
 NAMESPACE_BEGIN(Grid);
 // Tag object selecting one 2x2 flavour matrix.  The free operator* overloads
 // in this header switch on the stored value g to pick the multiplication kernel.
 class GparityFlavour {
  public:
    // Algebra elements: each matrix is followed by its negative, so values
    // run from 0 (SigmaX) to 11 (MinusProjMinus).
    GRID_SERIALIZABLE_ENUM(Algebra, undef,
                           SigmaX, 0,
 			   MinusSigmaX, 1,
                           SigmaY, 2,
 			   MinusSigmaY, 3,
                           SigmaZ, 4,
 			   MinusSigmaZ, 5,
 			   Identity, 6,
 			   MinusIdentity, 7,
 			   ProjPlus, 8,
 			   MinusProjPlus, 9,
 			   ProjMinus, 10,
 			   MinusProjMinus, 11
 			   );
    // Number of algebra elements listed in the enum above.
    static constexpr unsigned int nSigma = 12;
    // Static tables; only declared here.
    static const std::array<const char *, nSigma>                name;
    static const std::array<const GparityFlavour, 3>             sigma_mu;
    static const std::array<const GparityFlavour, 6>            sigma_all;
    // The selected algebra element.
    Algebra                                                      g;
  public:
  // Construct the tag for algebra element initg.
  accelerator GparityFlavour(Algebra initg): g(initg) {}  
 };
 // sigma_x = ( 0 1 )
 //           ( 1 0 )
 // ret = sigma_x * rhs for a flavour vector: the two components are exchanged.
 template<class vtype>
 accelerator_inline void multFlavourSigmaX(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = rhs(1);
  ret(1) = rhs(0);
 };
 // ret = sigma_x * rhs for a flavour matrix: rows 0 and 1 of rhs are exchanged.
 template<class vtype>
 accelerator_inline void lmultFlavourSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = rhs(1,0);
  ret(0,1) = rhs(1,1);
  ret(1,0) = rhs(0,0);
  ret(1,1) = rhs(0,1);
 };
 // ret = rhs * sigma_x for a flavour matrix: columns 0 and 1 of rhs are exchanged.
 template<class vtype>
 accelerator_inline void rmultFlavourSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = rhs(0,1);
  ret(0,1) = rhs(0,0);
  ret(1,0) = rhs(1,1);
  ret(1,1) = rhs(1,0);
 };
 // ret = -sigma_x * rhs for a flavour vector: components exchanged and negated.
 template<class vtype>
 accelerator_inline void multFlavourMinusSigmaX(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = -rhs(1);
  ret(1) = -rhs(0);
 };
 // ret = -sigma_x * rhs for a flavour matrix: rows exchanged and negated.
 template<class vtype>
 accelerator_inline void lmultFlavourMinusSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -rhs(1,0);
  ret(0,1) = -rhs(1,1);
  ret(1,0) = -rhs(0,0);
  ret(1,1) = -rhs(0,1);
 };
 // ret = -(rhs * sigma_x) for a flavour matrix: columns exchanged and negated.
 template<class vtype>
 accelerator_inline void rmultFlavourMinusSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -rhs(0,1);
  ret(0,1) = -rhs(0,0);
  ret(1,0) = -rhs(1,1);
  ret(1,1) = -rhs(1,0);
 };
 // sigma_y = ( 0 -i )
 //           ( i  0 )
 // ret = sigma_y * rhs for a flavour vector
 // (assuming timesI / timesMinusI multiply by +i / -i).
 template<class vtype>
 accelerator_inline void multFlavourSigmaY(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = timesMinusI(rhs(1));
  ret(1) = timesI(rhs(0));
 };
 // ret = sigma_y * rhs for a flavour matrix: row 0 gets -i * (row 1 of rhs),
 // row 1 gets +i * (row 0 of rhs), assuming timesI / timesMinusI multiply by +i / -i.
 template<class vtype>
 accelerator_inline void lmultFlavourSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = timesMinusI(rhs(1,0));
  ret(0,1) = timesMinusI(rhs(1,1));
  ret(1,0) = timesI(rhs(0,0));
  ret(1,1) = timesI(rhs(0,1));
 };
 // ret = rhs * sigma_y for a flavour matrix: column 0 gets +i * (column 1 of rhs),
 // column 1 gets -i * (column 0 of rhs), assuming timesI / timesMinusI multiply by +i / -i.
 template<class vtype>
 accelerator_inline void rmultFlavourSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = timesI(rhs(0,1));
  ret(0,1) = timesMinusI(rhs(0,0));
  ret(1,0) = timesI(rhs(1,1));
  ret(1,1) = timesMinusI(rhs(1,0));
 };
 // ret = -sigma_y * rhs for a flavour vector: the factors of i have the
 // opposite sign to multFlavourSigmaY.
 template<class vtype>
 accelerator_inline void multFlavourMinusSigmaY(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = timesI(rhs(1));
  ret(1) = timesMinusI(rhs(0));
 };
 // ret = -sigma_y * rhs for a flavour matrix: row 0 gets +i * (row 1 of rhs),
 // row 1 gets -i * (row 0 of rhs), assuming timesI / timesMinusI multiply by +i / -i.
 template<class vtype>
 accelerator_inline void lmultFlavourMinusSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = timesI(rhs(1,0));
  ret(0,1) = timesI(rhs(1,1));
  ret(1,0) = timesMinusI(rhs(0,0));
  ret(1,1) = timesMinusI(rhs(0,1));
 };
 // ret = -(rhs * sigma_y) for a flavour matrix: column 0 gets -i * (column 1 of rhs),
 // column 1 gets +i * (column 0 of rhs), assuming timesI / timesMinusI multiply by +i / -i.
 template<class vtype>
 accelerator_inline void rmultFlavourMinusSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = timesMinusI(rhs(0,1));
  ret(0,1) = timesI(rhs(0,0));
  ret(1,0) = timesMinusI(rhs(1,1));
  ret(1,1) = timesI(rhs(1,0));
 };
 // sigma_z = ( 1  0 )
 //           ( 0 -1 )
 // ret = sigma_z * rhs for a flavour vector: component 0 kept, component 1 negated.
 template<class vtype>
 accelerator_inline void multFlavourSigmaZ(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = rhs(0);
  ret(1) = -rhs(1);
 };
 // ret = sigma_z * rhs for a flavour matrix: row 0 kept, row 1 negated.
 template<class vtype>
 accelerator_inline void lmultFlavourSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = rhs(0,0);
  ret(0,1) = rhs(0,1);
  ret(1,0) = -rhs(1,0);
  ret(1,1) = -rhs(1,1);
 };
 // ret = rhs * sigma_z for a flavour matrix: column 0 kept, column 1 negated.
 template<class vtype>
 accelerator_inline void rmultFlavourSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = rhs(0,0);
  ret(0,1) = -rhs(0,1);
  ret(1,0) = rhs(1,0);
  ret(1,1) = -rhs(1,1);
 };
 // ret = -sigma_z * rhs for a flavour vector: component 0 negated, component 1 kept.
 template<class vtype>
 accelerator_inline void multFlavourMinusSigmaZ(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = -rhs(0);
  ret(1) = rhs(1);
 };
 // ret = -sigma_z * rhs for a flavour matrix: row 0 negated, row 1 kept.
 template<class vtype>
 accelerator_inline void lmultFlavourMinusSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -rhs(0,0);
  ret(0,1) = -rhs(0,1);
  ret(1,0) = rhs(1,0);
  ret(1,1) = rhs(1,1);
 };
 // ret = -(rhs * sigma_z) for a flavour matrix: column 0 negated, column 1 kept.
 template<class vtype>
 accelerator_inline void rmultFlavourMinusSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -rhs(0,0);
  ret(0,1) = rhs(0,1);
  ret(1,0) = -rhs(1,0);
  ret(1,1) = rhs(1,1);
 };
 // ret = rhs for a flavour vector (identity matrix): component-wise copy.
 template<class vtype>
 accelerator_inline void multFlavourIdentity(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = rhs(0);
  ret(1) = rhs(1);
 };
 // ret = 1 * rhs for a flavour matrix: element-wise copy.
 template<class vtype>
 accelerator_inline void lmultFlavourIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = rhs(0,0);
  ret(0,1) = rhs(0,1);
  ret(1,0) = rhs(1,0);
  ret(1,1) = rhs(1,1);
 };
 // ret = rhs * 1 for a flavour matrix: element-wise copy (same body as the left version).
 template<class vtype>
 accelerator_inline void rmultFlavourIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = rhs(0,0);
  ret(0,1) = rhs(0,1);
  ret(1,0) = rhs(1,0);
  ret(1,1) = rhs(1,1);
 };
 // ret = -rhs for a flavour vector: component-wise negation.
 template<class vtype>
 accelerator_inline void multFlavourMinusIdentity(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = -rhs(0);
  ret(1) = -rhs(1);
 };
 // ret = -rhs for a flavour matrix: element-wise negation.
 template<class vtype>
 accelerator_inline void lmultFlavourMinusIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -rhs(0,0);
  ret(0,1) = -rhs(0,1);
  ret(1,0) = -rhs(1,0);
  ret(1,1) = -rhs(1,1);
 };
 // ret = -rhs for a flavour matrix: element-wise negation (same body as the left version).
 template<class vtype>
 accelerator_inline void rmultFlavourMinusIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -rhs(0,0);
  ret(0,1) = -rhs(0,1);
  ret(1,0) = -rhs(1,0);
  ret(1,1) = -rhs(1,1);
 };
 // G-parity flavour projection P+ = 1/2 (1 + sigma_2)
 //   P+ = 1/2 ( 1 -i )
 //            ( i  1 )
 // ret = P+ * rhs for a flavour vector
 // (assuming timesI / timesMinusI multiply by +i / -i).
 template<class vtype>
 accelerator_inline void multFlavourProjPlus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = 0.5*rhs(0) + 0.5*timesMinusI(rhs(1));
  ret(1) = 0.5*timesI(rhs(0)) + 0.5*rhs(1);
 };
 // ret = P+ * rhs for a flavour matrix, with P+ = 1/2 (1 + sigma_2);
 // each column of rhs is transformed as in the vector case.
 template<class vtype>
 accelerator_inline void lmultFlavourProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesMinusI(rhs(1,0));
  ret(0,1) = 0.5*rhs(0,1) + 0.5*timesMinusI(rhs(1,1));
  ret(1,0) = 0.5*timesI(rhs(0,0)) + 0.5*rhs(1,0);
  ret(1,1) = 0.5*timesI(rhs(0,1)) + 0.5*rhs(1,1);
 };
 // ret = rhs * P+ for a flavour matrix, with P+ = 1/2 (1 + sigma_2);
 // each row of rhs is combined with the columns of P+.
 template<class vtype>
 accelerator_inline void rmultFlavourProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesI(rhs(0,1));
  ret(0,1) = 0.5*timesMinusI(rhs(0,0)) + 0.5*rhs(0,1);
  ret(1,0) = 0.5*rhs(1,0) + 0.5*timesI(rhs(1,1));
  ret(1,1) = 0.5*timesMinusI(rhs(1,0)) + 0.5*rhs(1,1);
 };
 // ret = -P+ * rhs for a flavour vector, with P+ = 1/2 (1 + sigma_2):
 // every term has the opposite sign to multFlavourProjPlus.
 template<class vtype>
 accelerator_inline void multFlavourMinusProjPlus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = -0.5*rhs(0) + 0.5*timesI(rhs(1));
  ret(1) = 0.5*timesMinusI(rhs(0)) - 0.5*rhs(1);
 };
 // ret = -P+ * rhs for a flavour matrix, with P+ = 1/2 (1 + sigma_2):
 // every term has the opposite sign to lmultFlavourProjPlus.
 template<class vtype>
 accelerator_inline void lmultFlavourMinusProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesI(rhs(1,0));
  ret(0,1) = -0.5*rhs(0,1) + 0.5*timesI(rhs(1,1));
  ret(1,0) = 0.5*timesMinusI(rhs(0,0)) - 0.5*rhs(1,0);
  ret(1,1) = 0.5*timesMinusI(rhs(0,1)) - 0.5*rhs(1,1);
 };
 // ret = -(rhs * P+) for a flavour matrix, with P+ = 1/2 (1 + sigma_2):
 // every term has the opposite sign to rmultFlavourProjPlus.
 template<class vtype>
 accelerator_inline void rmultFlavourMinusProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesMinusI(rhs(0,1));
  ret(0,1) = 0.5*timesI(rhs(0,0)) - 0.5*rhs(0,1);
  ret(1,0) = -0.5*rhs(1,0) + 0.5*timesMinusI(rhs(1,1));
  ret(1,1) = 0.5*timesI(rhs(1,0)) - 0.5*rhs(1,1);
 };
 // G-parity flavour projection P- = 1/2 (1 - sigma_2)
 //   P- = 1/2 (  1  i )
 //            ( -i  1 )
 // ret = P- * rhs for a flavour vector
 // (assuming timesI / timesMinusI multiply by +i / -i).
 template<class vtype>
 accelerator_inline void multFlavourProjMinus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = 0.5*rhs(0) + 0.5*timesI(rhs(1));
  ret(1) = 0.5*timesMinusI(rhs(0)) + 0.5*rhs(1);
 };
 // ret = P- * rhs for a flavour matrix, with P- = 1/2 (1 - sigma_2);
 // each column of rhs is transformed as in the vector case.
 template<class vtype>
 accelerator_inline void lmultFlavourProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesI(rhs(1,0));
  ret(0,1) = 0.5*rhs(0,1) + 0.5*timesI(rhs(1,1));
  ret(1,0) = 0.5*timesMinusI(rhs(0,0)) + 0.5*rhs(1,0);
  ret(1,1) = 0.5*timesMinusI(rhs(0,1)) + 0.5*rhs(1,1);
 };
 // ret = rhs * P- for a flavour matrix, with P- = 1/2 (1 - sigma_2);
 // each row of rhs is combined with the columns of P-.
 template<class vtype>
 accelerator_inline void rmultFlavourProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesMinusI(rhs(0,1));
  ret(0,1) = 0.5*timesI(rhs(0,0)) + 0.5*rhs(0,1);
  ret(1,0) = 0.5*rhs(1,0) + 0.5*timesMinusI(rhs(1,1));
  ret(1,1) = 0.5*timesI(rhs(1,0)) + 0.5*rhs(1,1);
 };
 // ret = -P- * rhs for a flavour vector, with P- = 1/2 (1 - sigma_2):
 // every term has the opposite sign to multFlavourProjMinus.
 template<class vtype>
 accelerator_inline void multFlavourMinusProjMinus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = -0.5*rhs(0) + 0.5*timesMinusI(rhs(1));
  ret(1) = 0.5*timesI(rhs(0)) - 0.5*rhs(1);
 };
 // ret = -P- * rhs for a flavour matrix, with P- = 1/2 (1 - sigma_2):
 // every term has the opposite sign to lmultFlavourProjMinus.
 template<class vtype>
 accelerator_inline void lmultFlavourMinusProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesMinusI(rhs(1,0));
  ret(0,1) = -0.5*rhs(0,1) + 0.5*timesMinusI(rhs(1,1));
  ret(1,0) = 0.5*timesI(rhs(0,0)) - 0.5*rhs(1,0);
  ret(1,1) = 0.5*timesI(rhs(0,1)) - 0.5*rhs(1,1);
 };
 // ret = -(rhs * P-) for a flavour matrix, with P- = 1/2 (1 - sigma_2):
 // every term has the opposite sign to rmultFlavourProjMinus.
 template<class vtype>
 accelerator_inline void rmultFlavourMinusProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesI(rhs(0,1));
  ret(0,1) = 0.5*timesMinusI(rhs(0,0)) - 0.5*rhs(0,1);
  ret(1,0) = -0.5*rhs(1,0) + 0.5*timesI(rhs(1,1));
  ret(1,1) = 0.5*timesMinusI(rhs(1,0)) - 0.5*rhs(1,1);
 };
 // Flavour matrix times flavour vector: returns G * arg, where the matrix is
 // chosen by G.g and applied by the matching multFlavour* kernel.
 // A selector outside the twelve listed elements trips assert(0).
 template<class vtype> 
 accelerator_inline auto operator*(const GparityFlavour &G, const iVector<vtype, Ngp> &arg)
 ->typename std::enable_if<matchGridTensorIndex<iVector<vtype, Ngp>, GparityFlavourTensorIndex>::value, iVector<vtype, Ngp>>::type
 {
  using Alg = GparityFlavour::Algebra;
  iVector<vtype, Ngp> out;
  switch (G.g) 
  {
  case Alg::SigmaX:         multFlavourSigmaX(out, arg);         break;
  case Alg::MinusSigmaX:    multFlavourMinusSigmaX(out, arg);    break;
  case Alg::SigmaY:         multFlavourSigmaY(out, arg);         break;
  case Alg::MinusSigmaY:    multFlavourMinusSigmaY(out, arg);    break;
  case Alg::SigmaZ:         multFlavourSigmaZ(out, arg);         break;
  case Alg::MinusSigmaZ:    multFlavourMinusSigmaZ(out, arg);    break;
  case Alg::Identity:       multFlavourIdentity(out, arg);       break;
  case Alg::MinusIdentity:  multFlavourMinusIdentity(out, arg);  break;
  case Alg::ProjPlus:       multFlavourProjPlus(out, arg);       break;
  case Alg::MinusProjPlus:  multFlavourMinusProjPlus(out, arg);  break;
  case Alg::ProjMinus:      multFlavourProjMinus(out, arg);      break;
  case Alg::MinusProjMinus: multFlavourMinusProjMinus(out, arg); break;
  default:
    assert(0);
  }
  return out;
 }
 // Flavour matrix times flavour matrix from the left: returns G * arg, where
 // the matrix is chosen by G.g and applied by the matching lmultFlavour* kernel.
 // A selector outside the twelve listed elements trips assert(0).
 template<class vtype> 
 accelerator_inline auto operator*(const GparityFlavour &G, const iMatrix<vtype, Ngp> &arg)
 ->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Ngp>, GparityFlavourTensorIndex>::value, iMatrix<vtype, Ngp>>::type
 {
  using Alg = GparityFlavour::Algebra;
  iMatrix<vtype, Ngp> out;
  switch (G.g) 
  {
  case Alg::SigmaX:         lmultFlavourSigmaX(out, arg);         break;
  case Alg::MinusSigmaX:    lmultFlavourMinusSigmaX(out, arg);    break;
  case Alg::SigmaY:         lmultFlavourSigmaY(out, arg);         break;
  case Alg::MinusSigmaY:    lmultFlavourMinusSigmaY(out, arg);    break;
  case Alg::SigmaZ:         lmultFlavourSigmaZ(out, arg);         break;
  case Alg::MinusSigmaZ:    lmultFlavourMinusSigmaZ(out, arg);    break;
  case Alg::Identity:       lmultFlavourIdentity(out, arg);       break;
  case Alg::MinusIdentity:  lmultFlavourMinusIdentity(out, arg);  break;
  case Alg::ProjPlus:       lmultFlavourProjPlus(out, arg);       break;
  case Alg::MinusProjPlus:  lmultFlavourMinusProjPlus(out, arg);  break;
  case Alg::ProjMinus:      lmultFlavourProjMinus(out, arg);      break;
  case Alg::MinusProjMinus: lmultFlavourMinusProjMinus(out, arg); break;
  default:
    assert(0);
  }
  return out;
 }
 // Flavour matrix times flavour matrix from the right: returns arg * G, where
 // the matrix is chosen by G.g and applied by the matching rmultFlavour* kernel.
 // A selector outside the twelve listed elements trips assert(0).
 template<class vtype> 
 accelerator_inline auto operator*(const iMatrix<vtype, Ngp> &arg, const GparityFlavour &G)
 ->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Ngp>, GparityFlavourTensorIndex>::value, iMatrix<vtype, Ngp>>::type
 {
  using Alg = GparityFlavour::Algebra;
  iMatrix<vtype, Ngp> out;
  switch (G.g) 
  {
  case Alg::SigmaX:         rmultFlavourSigmaX(out, arg);         break;
  case Alg::MinusSigmaX:    rmultFlavourMinusSigmaX(out, arg);    break;
  case Alg::SigmaY:         rmultFlavourSigmaY(out, arg);         break;
  case Alg::MinusSigmaY:    rmultFlavourMinusSigmaY(out, arg);    break;
  case Alg::SigmaZ:         rmultFlavourSigmaZ(out, arg);         break;
  case Alg::MinusSigmaZ:    rmultFlavourMinusSigmaZ(out, arg);    break;
  case Alg::Identity:       rmultFlavourIdentity(out, arg);       break;
  case Alg::MinusIdentity:  rmultFlavourMinusIdentity(out, arg);  break;
  case Alg::ProjPlus:       rmultFlavourProjPlus(out, arg);       break;
  case Alg::MinusProjPlus:  rmultFlavourMinusProjPlus(out, arg);  break;
  case Alg::ProjMinus:      rmultFlavourProjMinus(out, arg);      break;
  case Alg::MinusProjMinus: rmultFlavourMinusProjMinus(out, arg); break;
  default:
    assert(0);
  }
  return out;
 }
 NAMESPACE_END(Grid);
 #endif // include guard
--- a/Grid/qcd/hmc/GenericHMCrunner.h
+++ b/Grid/qcd/hmc/GenericHMCrunner.h
@ -129,10 +129,18 @@ public:
    Runner(S);
  }
-  //Use the checkpointer to initialize the RNGs and the gauge field, writing the resulting gauge field into U.
+  //////////////////////////////////////////////////////////////////
-  //This is called automatically by Run but may be useful elsewhere, e.g. for integrator tuning experiments
+
-  void initializeGaugeFieldAndRNGs(Field &U){
+private:
-    if(!Resources.haveRNGs()) Resources.AddRNGs();
+  template <class SmearingPolicy>
  void Runner(SmearingPolicy &Smearing) {
    auto UGrid = Resources.GetCartesian();
    Resources.AddRNGs();
    Field U(UGrid);
    // Can move this outside?
    typedef IntegratorType<SmearingPolicy> TheIntegrator;
    TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing);
    if (Parameters.StartingType == "HotStart") {
      // Hot start
@ -159,25 +167,6 @@ public:
 	<< "Valid [HotStart, ColdStart, TepidStart, CheckpointStart]\n";
      exit(1);
    }
  }
  //////////////////////////////////////////////////////////////////
 private:
  template <class SmearingPolicy>
  void Runner(SmearingPolicy &Smearing) {
    auto UGrid = Resources.GetCartesian();
    Field U(UGrid);
    initializeGaugeFieldAndRNGs(U);
    typedef IntegratorType<SmearingPolicy> TheIntegrator;
    TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing);
    // Sets the momentum filter
    MDynamics.setMomentumFilter(*(Resources.GetMomentumFilter()));
    Smearing.set_Field(U);
--- a/Grid/qcd/hmc/HMC.h
+++ b/Grid/qcd/hmc/HMC.h
@ -34,7 +34,6 @@ directory
 			    * @brief Classes for Hybrid Monte Carlo update
 			    *
 			    * @author Guido Cossu
 			    * @author Peter Boyle
 			    */
 			   //--------------------------------------------------------------------
 #pragma once
@ -116,17 +115,22 @@ private:
    random(sRNG, rn_test);
-    std::cout << GridLogHMC << "--------------------------------------------------\n";
+    std::cout << GridLogMessage
-    std::cout << GridLogHMC << "exp(-dH) = " << prob << "  Random = " << rn_test << "\n";
+              << "--------------------------------------------------\n";
-    std::cout << GridLogHMC << "Acc. Probability = " << ((prob < 1.0) ? prob : 1.0) << "\n";
+    std::cout << GridLogMessage << "exp(-dH) = " << prob
              << "  Random = " << rn_test << "\n";
    std::cout << GridLogMessage
              << "Acc. Probability = " << ((prob < 1.0) ? prob : 1.0) << "\n";
    if ((prob > 1.0) || (rn_test <= prob)) {  // accepted
-      std::cout << GridLogHMC << "Metropolis_test -- ACCEPTED\n";
+      std::cout << GridLogMessage << "Metropolis_test -- ACCEPTED\n";
-      std::cout << GridLogHMC << "--------------------------------------------------\n";
+      std::cout << GridLogMessage
                << "--------------------------------------------------\n";
      return true;
    } else {  // rejected
-      std::cout << GridLogHMC << "Metropolis_test -- REJECTED\n";
+      std::cout << GridLogMessage << "Metropolis_test -- REJECTED\n";
-      std::cout << GridLogHMC << "--------------------------------------------------\n";
+      std::cout << GridLogMessage
                << "--------------------------------------------------\n";
      return false;
    }
  }
@ -135,68 +139,19 @@ private:
  // Evolution
  /////////////////////////////////////////////////////////
  RealD evolve_hmc_step(Field &U) {
    TheIntegrator.refresh(U, sRNG, pRNG);  // set U and initialize P and phi's
-    GridBase *Grid = U.Grid();
+    RealD H0 = TheIntegrator.S(U);  // initial state action
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    // Mainly for DDHMC perform a random translation of U modulo volume
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    std::cout << GridLogMessage << "Random shifting gauge field by [";
    for(int d=0;d<Grid->Nd();d++) {
      int L = Grid->GlobalDimensions()[d];
      RealD rn_uniform;  random(sRNG, rn_uniform);
      int shift = (int) (rn_uniform*L);
      std::cout << shift;
      if(d<Grid->Nd()-1) std::cout <<",";
      else               std::cout <<"]\n";
      U = Cshift(U,d,shift);
    }
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    TheIntegrator.reset_timer();
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    // set U and initialize P and phi's
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    std::cout << GridLogMessage << "Refresh momenta and pseudofermions";
    TheIntegrator.refresh(U, sRNG, pRNG);  
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    // initial state action
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    std::cout << GridLogMessage << "Compute initial action";
    RealD H0 = TheIntegrator.S(U);  
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    std::streamsize current_precision = std::cout.precision();
    std::cout.precision(15);
-    std::cout << GridLogHMC << "Total H before trajectory = " << H0 << "\n";
+    std::cout << GridLogMessage << "Total H before trajectory = " << H0 << "\n";
    std::cout.precision(current_precision);
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    std::cout << GridLogMessage << " Molecular Dynamics evolution ";
    TheIntegrator.integrate(U);
    std::cout << GridLogMessage << "--------------------------------------------------\n";
-    //////////////////////////////////////////////////////////////////////////////////////////////////////
+    RealD H1 = TheIntegrator.S(U);  // updated state action
    // updated state action
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    std::cout << GridLogMessage << "Compute final action";
    RealD H1 = TheIntegrator.S(U);  
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    ///////////////////////////////////////////////////////////
    if(0){
      std::cout << "------------------------- Reversibility test" << std::endl;
@ -208,16 +163,17 @@ private:
    }
    ///////////////////////////////////////////////////////////
    std::cout.precision(15);
-
+    std::cout << GridLogMessage << "Total H after trajectory  = " << H1
-    std::cout << GridLogHMC << "--------------------------------------------------\n";
+	      << "  dH = " << H1 - H0 << "\n";
    std::cout << GridLogHMC << "Total H after trajectory  = " << H1 << "  dH = " << H1 - H0 << "\n";
    std::cout << GridLogHMC << "--------------------------------------------------\n";
    std::cout.precision(current_precision);
    return (H1 - H0);
  }
 public:
  /////////////////////////////////////////
@ -239,13 +195,10 @@ public:
    // Actual updates (evolve a copy Ucopy then copy back eventually)
    unsigned int FinalTrajectory = Params.Trajectories + Params.NoMetropolisUntil + Params.StartTrajectory;
    for (int traj = Params.StartTrajectory; traj < FinalTrajectory; ++traj) {
-
+      std::cout << GridLogMessage << "-- # Trajectory = " << traj << "\n";
      std::cout << GridLogHMC << "-- # Trajectory = " << traj << "\n";
      if (traj < Params.StartTrajectory + Params.NoMetropolisUntil) {
-      	std::cout << GridLogHMC << "-- Thermalization" << std::endl;
+      	std::cout << GridLogMessage << "-- Thermalization" << std::endl;
      }
      double t0=usecond();
@ -254,19 +207,20 @@ public:
      DeltaH = evolve_hmc_step(Ucopy);
      // Metropolis-Hastings test
      bool accept = true;
-      if (Params.MetropolisTest && traj >= Params.StartTrajectory + Params.NoMetropolisUntil) {
+      if (traj >= Params.StartTrajectory + Params.NoMetropolisUntil) {
        accept = metropolis_test(DeltaH);
      } else {
-      	std::cout << GridLogHMC << "Skipping Metropolis test" << std::endl;
+      	std::cout << GridLogMessage << "Skipping Metropolis test" << std::endl;
      }
      if (accept)
        Ucur = Ucopy; 
      double t1=usecond();
-      std::cout << GridLogHMC << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl;
+      std::cout << GridLogMessage << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl;
      TheIntegrator.print_timer();
      for (int obs = 0; obs < Observables.size(); obs++) {
      	std::cout << GridLogDebug << "Observables # " << obs << std::endl;
@ -274,7 +228,7 @@ public:
      	std::cout << GridLogDebug << "Observables pointer " << Observables[obs] << std::endl;
        Observables[obs]->TrajectoryComplete(traj + 1, Ucur, sRNG, pRNG);
      }
-      std::cout << GridLogHMC << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl;
+      std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl;
    }
  }
--- a/Grid/qcd/hmc/HMCResourceManager.h
+++ b/Grid/qcd/hmc/HMCResourceManager.h
@ -72,8 +72,6 @@ class HMCResourceManager {
  typedef HMCModuleBase< BaseHmcCheckpointer<ImplementationPolicy> > CheckpointerBaseModule;
  typedef HMCModuleBase< HmcObservable<typename ImplementationPolicy::Field> > ObservableBaseModule;
  typedef ActionModuleBase< Action<typename ImplementationPolicy::Field>, GridModule > ActionBaseModule;
  typedef typename ImplementationPolicy::Field MomentaField;
  typedef typename ImplementationPolicy::Field Field;  
  // Named storage for grid pairs (std + red-black)
  std::unordered_map<std::string, GridModule> Grids;
@ -82,9 +80,6 @@ class HMCResourceManager {
  // SmearingModule<ImplementationPolicy> Smearing;
  std::unique_ptr<CheckpointerBaseModule> CP;
  // Momentum filter
  std::unique_ptr<MomentumFilterBase<typename ImplementationPolicy::Field> > Filter;
  // A vector of HmcObservable modules
  std::vector<std::unique_ptr<ObservableBaseModule> > ObservablesList;
@ -95,7 +90,6 @@ class HMCResourceManager {
  bool have_RNG;
  bool have_CheckPointer;
  bool have_Filter;
  // NOTE: operator << is not overloaded for std::vector<string> 
  // so this function is necessary
@ -107,7 +101,7 @@ class HMCResourceManager {
 public:
-  HMCResourceManager() : have_RNG(false), have_CheckPointer(false), have_Filter(false) {}
+  HMCResourceManager() : have_RNG(false), have_CheckPointer(false) {}
  template <class ReaderClass, class vector_type = vComplex >
  void initialize(ReaderClass &Read){
@ -135,7 +129,6 @@ public:
    RNGModuleParameters RNGpar(Read);
    SetRNGSeeds(RNGpar);
    // Observables
    auto &ObsFactory = HMC_ObservablesModuleFactory<observable_string, typename ImplementationPolicy::Field, ReaderClass>::getInstance(); 
    Read.push(observable_string);// here must check if existing...
@ -215,16 +208,6 @@ public:
    AddGrid(s, Mod);
  }
  // Installs MomFilter as this resource manager's momentum filter.
  // The raw pointer is wrapped in the Filter unique_ptr, so the manager holds
  // it through that smart pointer from this call onwards.
  // Asserts that no filter has been installed before (have_Filter is still false).
  void SetMomentumFilter( MomentumFilterBase<typename ImplementationPolicy::Field> * MomFilter) {
    assert(have_Filter==false);
    Filter = std::unique_ptr<MomentumFilterBase<typename ImplementationPolicy::Field> >(MomFilter);
    have_Filter = true;
  }
  // Returns the stored momentum filter as a raw pointer obtained from Filter.get();
  // the Filter unique_ptr keeps holding the object.
  // If no filter has been set yet, a MomentumFilterNone is allocated and installed
  // through SetMomentumFilter before the pointer is returned.
  MomentumFilterBase<typename ImplementationPolicy::Field> *GetMomentumFilter(void) {
    if ( !have_Filter)
      SetMomentumFilter(new MomentumFilterNone<typename ImplementationPolicy::Field>());
    return Filter.get();
  }
  GridCartesian* GetCartesian(std::string s = "") {
    if (s.empty()) s = Grids.begin()->first;
@ -243,9 +226,6 @@ public:
  //////////////////////////////////////////////////////
  // Random number generators
  //////////////////////////////////////////////////////
  //Return true if the RNG objects have been instantiated
  bool haveRNGs() const{ return have_RNG; }
  void AddRNGs(std::string s = "") {
    // Couple the RNGs to the GridModule tagged by s
--- a/Grid/qcd/hmc/integrators/Integrator.h
+++ b/Grid/qcd/hmc/integrators/Integrator.h
@ -33,6 +33,7 @@ directory
 #define INTEGRATOR_INCLUDED
 #include <memory>
 #include "MomentumFilter.h"
 NAMESPACE_BEGIN(Grid);
@ -66,7 +67,6 @@ public:
 template <class FieldImplementation, class SmearingPolicy, class RepresentationPolicy>
 class Integrator {
 protected:
  typedef typename FieldImplementation::Field MomentaField;  //for readability
  typedef typename FieldImplementation::Field Field;
@ -119,58 +119,36 @@ protected:
    }
  } update_P_hireps{};
  void update_P(MomentaField& Mom, Field& U, int level, double ep) {
    // input U actually not used in the fundamental case
    // Fundamental updates, include smearing
    for (int a = 0; a < as[level].actions.size(); ++a) {
      double start_full = usecond();
      Field force(U.Grid());
      conformable(U.Grid(), Mom.Grid());
      Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
      double start_force = usecond();
      as[level].actions.at(a)->deriv_timer_start();
      as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta
      as[level].actions.at(a)->deriv_timer_stop();
      std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
      auto name = as[level].actions.at(a)->action_name();
      if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
      force = FieldImplementation::projectForce(force); // Ta for gauge fields
      double end_force = usecond();
-
+      Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites());
-      MomFilter->applyFilter(force);
+      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << std::endl;
      std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<< std::endl;
      //      DumpSliceNorm("force ",force,Nd-1);
      Real force_abs   = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x]) 
      Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    
      Real force_max   = std::sqrt(maxLocalNorm2(force));
      Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR;    
      as[level].actions.at(a)->deriv_log(force_abs,force_max);
      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Force average: " << force_abs <<" "<<name<<std::endl;
      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Force max    : " << force_max <<" "<<name<<std::endl;
      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Fdt average  : " << impulse_abs <<" "<<name<<std::endl;
      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Fdt max      : " << impulse_max <<" "<<name<<std::endl;
      Mom -= force * ep* HMC_MOMENTUM_DENOMINATOR;; 
      double end_full = usecond();
      double time_full  = (end_full - start_full) / 1e3;
      double time_force = (end_force - start_force) / 1e3;
      std::cout << GridLogMessage << "["<<level<<"]["<<a<<"] P update elapsed time: " << time_full << " ms (force: " << time_force << " ms)"  << std::endl;
    }
    // Force from the other representations
    as[level].apply(update_P_hireps, Representations, Mom, U, ep);
    MomFilter->applyFilter(Mom);
  }
  void update_U(Field& U, double ep) 
@ -184,12 +162,8 @@ protected:
  void update_U(MomentaField& Mom, Field& U, double ep) 
  {
    MomentaField MomFiltered(Mom.Grid());
    MomFiltered = Mom;
    MomFilter->applyFilter(MomFiltered);
    // exponential of Mom*U in the gauge fields case
-    FieldImplementation::update_field(MomFiltered, U, ep);
+    FieldImplementation::update_field(Mom, U, ep);
    // Update the smeared fields, can be implemented as observer
    Smearer.set_Field(U);
@ -232,66 +206,6 @@ public:
  const MomentaField & getMomentum() const{ return P; }
  // Calls reset_timer() on every action of every level held in `as`.
  void reset_timer(void)
  {
    for (int level = 0; level < as.size(); ++level) {
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
        as[level].actions.at(actionID)->reset_timer();
      }
    }
  }
  // Prints four per-action tables to std::cout with the GridLogMessage prefix.
  // Every table has one row per [level][actionID], labelled with action_name():
  //   1. refresh_us * 1.0e-6, printed with an " s" suffix
  //   2. S_us       * 1.0e-6, printed with an " s" suffix
  //   3. deriv_us   * 1.0e-6, printed with an " s" suffix
  //   4. deriv_max_average(), deriv_norm_average() and deriv_num
  // NOTE(review): the *_us members are presumably microsecond counters,
  // which would make the printed values seconds - confirm against the Action class.
  void print_timer(void)
  {
    std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::" << std::endl;
    // Table 1: refresh timings.
    std::cout << GridLogMessage << " Refresh cumulative timings "<<std::endl;
    std::cout << GridLogMessage << "--------------------------- "<<std::endl;
    for (int level = 0; level < as.size(); ++level) {
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
 	std::cout << GridLogMessage 
 		  << as[level].actions.at(actionID)->action_name()
 		  <<"["<<level<<"]["<< actionID<<"] "
 		  << as[level].actions.at(actionID)->refresh_us*1.0e-6<<" s"<< std::endl;
      }
    }
    // Table 2: action evaluation (S) timings.
    std::cout << GridLogMessage << "--------------------------- "<<std::endl;
    std::cout << GridLogMessage << " Action cumulative timings "<<std::endl;
    std::cout << GridLogMessage << "--------------------------- "<<std::endl;
    for (int level = 0; level < as.size(); ++level) {
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
 	std::cout << GridLogMessage 
 		  << as[level].actions.at(actionID)->action_name()
 		  <<"["<<level<<"]["<< actionID<<"] "
 		  << as[level].actions.at(actionID)->S_us*1.0e-6<<" s"<< std::endl;
      }
    }
    // Table 3: force (deriv) timings.
    std::cout << GridLogMessage << "--------------------------- "<<std::endl;
    std::cout << GridLogMessage << " Force cumulative timings "<<std::endl;
    std::cout << GridLogMessage << "------------------------- "<<std::endl;
    for (int level = 0; level < as.size(); ++level) {
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
 	std::cout << GridLogMessage 
 		  << as[level].actions.at(actionID)->action_name()
 		  <<"["<<level<<"]["<< actionID<<"] "
 		  << as[level].actions.at(actionID)->deriv_us*1.0e-6<<" s"<< std::endl;
      }
    }
    // Table 4: force size statistics and number of deriv calls.
    std::cout << GridLogMessage << "--------------------------- "<<std::endl;
    std::cout << GridLogMessage << " Force average size "<<std::endl;
    std::cout << GridLogMessage << "------------------------- "<<std::endl;
    for (int level = 0; level < as.size(); ++level) {
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
 	std::cout << GridLogMessage 
 		  << as[level].actions.at(actionID)->action_name()
 		  <<"["<<level<<"]["<< actionID<<"] : "
 		  <<" force max " << as[level].actions.at(actionID)->deriv_max_average()
 		  <<" norm "      << as[level].actions.at(actionID)->deriv_norm_average()
 		  <<" calls "     << as[level].actions.at(actionID)->deriv_num
 		  << std::endl;
      }
    }
    std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl;
  }
  void print_parameters()
  {
    std::cout << GridLogMessage << "[Integrator] Name : "<< integrator_name() << std::endl;
@ -310,6 +224,7 @@ public:
      }
    }
    std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl;
  }
  void reverse_momenta()
@ -352,19 +267,15 @@ public:
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
        // get gauge field from the SmearingPolicy and
        // based on the boolean is_smeared in actionID
 	auto name = as[level].actions.at(actionID)->action_name();
        std::cout << GridLogMessage << "refresh [" << level << "][" << actionID << "] "<<name << std::endl;
        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
 	as[level].actions.at(actionID)->refresh_timer_start();
        as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG);
 	as[level].actions.at(actionID)->refresh_timer_stop();
      }
      // Refresh the higher representation actions
      as[level].apply(refresh_hireps, Representations, sRNG, pRNG);
    }
    MomFilter->applyFilter(P);
  }
  // to be used by the actionlevel class to iterate
@ -399,9 +310,7 @@ public:
        // based on the boolean is_smeared in actionID
        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
 	        as[level].actions.at(actionID)->S_timer_start();
        Hterm = as[level].actions.at(actionID)->S(Us);
   	        as[level].actions.at(actionID)->S_timer_stop();
        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
        H += Hterm;
      }
--- a/Grid/qcd/hmc/integrators/MomentumFilter.h
+++ b/Grid/qcd/hmc/integrators/MomentumFilter.h
@ -37,7 +37,7 @@ NAMESPACE_BEGIN(Grid);
 template<typename MomentaField>
 struct MomentumFilterBase{
-  virtual void applyFilter(MomentaField &P) const = 0;
+  virtual void applyFilter(MomentaField &P) const;
 };
 //Do nothing
--- a/Grid/qcd/utils/GaugeFix.h
+++ b/Grid/qcd/utils/GaugeFix.h
@ -55,12 +55,12 @@ public:
    }
  }  
-  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {
+  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1,bool err_on_no_converge=true) {
    GridBase *grid = Umu.Grid();
    GaugeMat xform(grid);
-    SteepestDescentGaugeFix(Umu,xform,alpha,maxiter,Omega_tol,Phi_tol,Fourier,orthog);
+    SteepestDescentGaugeFix(Umu,xform,alpha,maxiter,Omega_tol,Phi_tol,Fourier,orthog,err_on_no_converge);
  }
-  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {
+  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1,bool err_on_no_converge=true) {
    GridBase *grid = Umu.Grid();
@ -122,6 +122,8 @@ public:
      }
    }
    std::cout << GridLogError << "Gauge fixing did not converge in " << maxiter << " iterations." << std::endl;
    if (err_on_no_converge) assert(0);
  };
  static Real SteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform,Real & alpha, GaugeMat & dmuAmu,int orthog) {
    GridBase *grid = U[0].Grid();
--- a/Grid/qcd/utils/WilsonLoops.h
+++ b/Grid/qcd/utils/WilsonLoops.h
@ -125,7 +125,6 @@ public:
    return sumplaq / vol / faces / Nc; // Nd , Nc dependent... FIXME
  }
  //////////////////////////////////////////////////
  // average over all x,y,z the temporal loop
  //////////////////////////////////////////////////
@ -165,7 +164,7 @@ public:
    double vol = Umu.Grid()->gSites();
-    return p.real() / vol / 4.0 / 3.0;
+    return p.real() / vol / (4.0 * Nc ) ;
  };
  //////////////////////////////////////////////////
--- a/Grid/serialisation/JSON_IO.cc
+++ b/Grid/serialisation/JSON_IO.cc
@ -26,7 +26,7 @@
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
-#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
+#ifndef GRID_HIP
 NAMESPACE_BEGIN(Grid);
@ -82,7 +82,7 @@ void JSONWriter::writeDefault(const std::string &s,	const std::string &x)
  if (s.size())
    ss_ << "\""<< s << "\" : \"" << os.str() << "\" ," ;
  else
-    ss_ << os.str() << " ," ;
+    ss_ << "\""<< os.str() << "\" ," ;
 }
 // Reader implementation ///////////////////////////////////////////////////////
--- a/Grid/serialisation/JSON_IO.h
+++ b/Grid/serialisation/JSON_IO.h
@ -54,7 +54,7 @@ namespace Grid
    void pop(void);
    template <typename U>
    void writeDefault(const std::string &s, const U &x);
-#ifdef __NVCC__
+#if defined(GRID_CUDA) || defined(GRID_HIP)
    void writeDefault(const std::string &s, const Grid::ComplexD &x) 
    { 
      std::complex<double> z(real(x),imag(x));
@ -101,7 +101,7 @@ namespace Grid
    void readDefault(const std::string &s, std::vector<U> &output);
    template <typename U, typename P>
    void readDefault(const std::string &s, std::pair<U,P> &output);
-#ifdef __NVCC__
+#if defined(GRID_CUDA) || defined(GRID_HIP)
    void readDefault(const std::string &s, ComplexD &output)
    { 
      std::complex<double> z;
--- a/Grid/serialisation/Serialisation.h
+++ b/Grid/serialisation/Serialisation.h
@ -36,7 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include "BinaryIO.h"
 #include "TextIO.h"
 #include "XmlIO.h"
-#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
+#ifndef GRID_HIP
 #include "JSON_IO.h"
 #endif
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@ -80,11 +80,14 @@ void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lat
 ///////////////////////////////////////////////////////////////////
 template<class cobj,class vobj,class compressor>
 void Gather_plane_exchange_table(const Lattice<vobj> &rhs,
-				 commVector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type) __attribute__((noinline));
+				 commVector<cobj *> pointers,
 				 int dimension,int plane,
 				 int cbmask,compressor &compress,int type) __attribute__((noinline));
 template<class cobj,class vobj,class compressor>
-void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
+void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,
-				 Vector<cobj *> pointers,int dimension,int plane,int cbmask,
+				 const Lattice<vobj> &rhs,
 				 std::vector<cobj *> &pointers,int dimension,int plane,int cbmask,
 				 compressor &compress,int type)
 {
  assert( (table.size()&0x1)==0);
@ -92,14 +95,15 @@ void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const La
  int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
  auto rhs_v = rhs.View(AcceleratorRead);
  auto rhs_p = &rhs_v[0];
  auto p0=&pointers[0][0];
  auto p1=&pointers[1][0];
  auto tp=&table[0];
  accelerator_forNB(j, num, vobj::Nsimd(), {
-      compress.CompressExchange(p0,p1, &rhs_v[0], j,
+      compress.CompressExchange(p0,p1, rhs_p, j,
-			      so+tp[2*j  ].second,
+				so+tp[2*j  ].second,
-			      so+tp[2*j+1].second,
+				so+tp[2*j+1].second,
-			      type);
+				type);
  });
  rhs_v.ViewClose();
 }
@ -131,11 +135,8 @@ class CartesianStencilAccelerator {
  int           _checkerboard;
  int           _npoints; // Move to template param?
  int           _osites;
  int           _dirichlet;
  StencilVector _directions;
  StencilVector _distances;
  StencilVector _comms_send;
  StencilVector _comms_recv;
  StencilVector _comm_buf_size;
  StencilVector _permute_type;
  StencilVector same_node;
@ -229,14 +230,12 @@ public:
    void * recv_buf;
    Integer to_rank;
    Integer from_rank;
    Integer do_send;
    Integer do_recv;
    Integer bytes;
  };
  struct Merge {
    cobj * mpointer;
-    Vector<scalar_object *> rpointers;
+    //    std::vector<scalar_object *> rpointers;
-    Vector<cobj *> vpointers;
+    std::vector<cobj *> vpointers;
    Integer buffer_size;
    Integer type;
  };
@ -245,20 +244,7 @@ public:
    cobj * mpi_p;
    Integer buffer_size;
  };
-  struct CopyReceiveBuffer {
+
    void * from_p;
    void * to_p;
    Integer bytes;
  };
  // Plain record of the parameters of one transfer. CheckForDuplicate copies its
  // same-named arguments into these fields, and instances are kept in the
  // CachedTransfers vector.
  // NOTE(review): `cb` presumably denotes the checkerboard and `lane` a SIMD lane - confirm.
  struct CachedTransfer {
    Integer direction;
    Integer OrthogPlane;
    Integer DestProc;
    Integer bytes;
    Integer lane;
    Integer cb;
    void *recv_buf;
  };
 protected:
  GridBase *                        _grid;
@ -289,8 +275,7 @@ public:
  std::vector<Merge> MergersSHM;
  std::vector<Decompress> Decompressions;
  std::vector<Decompress> DecompressionsSHM;
-  std::vector<CopyReceiveBuffer> CopyReceiveBuffers ;
+
  std::vector<CachedTransfer> CachedTransfers;
  ///////////////////////////////////////////////////////////
  // Unified Comms buffers for all directions
  ///////////////////////////////////////////////////////////
@ -303,6 +288,29 @@ public:
  int u_comm_offset;
  int _unified_buffer_size;
  /////////////////////////////////////////
  // Timing info; ugly; possibly temporary
  /////////////////////////////////////////
  double commtime;
  double mpi3synctime;
  double mpi3synctime_g;
  double shmmergetime;
  double gathertime;
  double gathermtime;
  double halogtime;
  double mergetime;
  double decompresstime;
  double comms_bytes;
  double shm_bytes;
  double splicetime;
  double nosplicetime;
  double calls;
  std::vector<double> comm_bytes_thr;
  std::vector<double> shm_bytes_thr;
  std::vector<double> comm_time_thr;
  std::vector<double> comm_enter_thr;
  std::vector<double> comm_leave_thr;
  ////////////////////////////////////////
  // Stencil query
  ////////////////////////////////////////
@ -329,12 +337,11 @@ public:
  //////////////////////////////////////////
  // Comms packet queue for asynch thread
  // Use OpenMP Tasks for cleaner ???
  // must be called *inside* parallel region
  //////////////////////////////////////////
  /*
  void CommunicateThreaded()
  {
 #ifdef GRID_OMP
    // must be called in parallel region
    int mythread = omp_get_thread_num();
    int nthreads = CartesianCommunicator::nCommThreads;
 #else
@ -343,30 +350,67 @@ public:
 #endif
    if (nthreads == -1) nthreads = 1;
    if (mythread < nthreads) {
      comm_enter_thr[mythread] = usecond();
      for (int i = mythread; i < Packets.size(); i += nthreads) {
 	uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf,
 						      Packets[i].to_rank,
 						      Packets[i].recv_buf,
 						      Packets[i].from_rank,
 						      Packets[i].bytes,i);
 	comm_bytes_thr[mythread] += bytes;
 	shm_bytes_thr[mythread] += 2*Packets[i].bytes-bytes; // Send + Recv.
      }
      comm_leave_thr[mythread]= usecond();
      comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread];
    }
  }
-  */
+
  // Folds the per-thread communication statistics into the aggregate counters
  // and clears the per-thread slots.
  // For each of the CartesianCommunicator::nCommThreads slots it:
  //   - adds comm_bytes_thr[t] / shm_bytes_thr[t] to comms_bytes / shm_bytes,
  //   - zeroes the slot's enter, leave, time and byte entries,
  //   - tracks `first` (smallest strictly positive enter time seen, 0.0 if none)
  //     and `last` (largest leave time seen).
  // Finally last-first is added to commtime.
  void CollateThreads(void)
  {
    int nthreads = CartesianCommunicator::nCommThreads;
    double first=0.0;
    double last =0.0;
    for(int t=0;t<nthreads;t++) {
      // Read the timestamps before the slots are zeroed below.
      double t0 = comm_enter_thr[t];
      double t1 = comm_leave_thr[t];
      comms_bytes+=comm_bytes_thr[t];
      shm_bytes  +=shm_bytes_thr[t];
      comm_enter_thr[t] = 0.0;
      comm_leave_thr[t] = 0.0;
      comm_time_thr[t]   = 0.0;
      comm_bytes_thr[t]=0;
      shm_bytes_thr[t]=0;
      // An enter time of 0.0 never lowers `first`; while `first` is still 0.0 it is simply overwritten.
      if ( first == 0.0 ) first = t0;                   // first is t0
      if ( (t0 > 0.0) && ( t0 < first ) ) first = t0;   // min time seen
      if ( t1 > last ) last = t1;                       // max time seen
    }
    commtime+= last-first;
  }
  ////////////////////////////////////////////////////////////////////////
  // Non blocking send and receive. Necessarily parallel.
  ////////////////////////////////////////////////////////////////////////
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
    reqs.resize(Packets.size());
    commtime-=usecond();
    for(int i=0;i<Packets.size();i++){
-      _grid->StencilSendToRecvFromBegin(reqs[i],
+      uint64_t bytes=_grid->StencilSendToRecvFromBegin(reqs[i],
-					Packets[i].send_buf,
+						     Packets[i].send_buf,
-					Packets[i].to_rank,Packets[i].do_send,
+						     Packets[i].to_rank,
-					Packets[i].recv_buf,
+						     Packets[i].recv_buf,
-					Packets[i].from_rank,Packets[i].do_recv,
+						     Packets[i].from_rank,
-					Packets[i].bytes,i);
+						     Packets[i].bytes,i);
      comms_bytes+=bytes;
      shm_bytes  +=2*Packets[i].bytes-bytes;
    }
    _grid->StencilBarrier();// Synch shared memory on a single nodes
  }
  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
@ -374,34 +418,36 @@ public:
    for(int i=0;i<Packets.size();i++){
      _grid->StencilSendToRecvFromComplete(reqs[i],i);
    }
    commtime+=usecond();
  }
  ////////////////////////////////////////////////////////////////////////
  // Blocking send and receive. Either sequential or parallel.
  ////////////////////////////////////////////////////////////////////////
  void Communicate(void)
  {
-    if ( CartesianCommunicator::CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySequential ){
+    if ( 0 ){
-      /////////////////////////////////////////////////////////
+      thread_region {
-      // several way threaded on different communicators.
+	// must be called in parallel region
-      // Cannot combine with Dirichlet operators
+	int mythread  = thread_num();
-      // This scheme is needed on Intel Omnipath for best performance
+	int maxthreads= thread_max();
-      // Deprecate once there are very few omnipath clusters
+	int nthreads = CartesianCommunicator::nCommThreads;
-      /////////////////////////////////////////////////////////
+	assert(nthreads <= maxthreads);
-      int nthreads = CartesianCommunicator::nCommThreads;
+	if (nthreads == -1) nthreads = 1;
-      int old = GridThread::GetThreads();
+	if (mythread < nthreads) {
-      GridThread::SetThreads(nthreads);
+	  for (int i = mythread; i < Packets.size(); i += nthreads) {
-      thread_for(i,Packets.size(),{
+	    double start = usecond();
-	  _grid->StencilSendToRecvFrom(Packets[i].send_buf,
+	    uint64_t bytes= _grid->StencilSendToRecvFrom(Packets[i].send_buf,
-				       Packets[i].to_rank,Packets[i].do_send,
+							 Packets[i].to_rank,
-				       Packets[i].recv_buf,
+							 Packets[i].recv_buf,
-				       Packets[i].from_rank,Packets[i].do_recv,
+							 Packets[i].from_rank,
-				       Packets[i].bytes,i);
+							 Packets[i].bytes,i);
-      });
+	    comm_bytes_thr[mythread] += bytes;
-      GridThread::SetThreads(old);
+	    shm_bytes_thr[mythread]  += Packets[i].bytes - bytes;
-    } else { 
+	    comm_time_thr[mythread]  += usecond() - start;
-      /////////////////////////////////////////////////////////
+	  }
-      // Concurrent and non-threaded asynch calls to MPI
+	}
-      /////////////////////////////////////////////////////////
+      }
    } else { // Concurrent and non-threaded asynch calls to MPI
      std::vector<std::vector<CommsRequest_t> > reqs;
      this->CommunicateBegin(reqs);
      this->CommunicateComplete(reqs);
@ -443,23 +489,31 @@ public:
      sshift[1] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Odd);
      if ( sshift[0] == sshift[1] ) {
 	if (splice_dim) {
-	  auto tmp  = GatherSimd(source,dimension,shift,0x3,compress,face_idx,point);
+	  splicetime-=usecond();
 	  auto tmp  = GatherSimd(source,dimension,shift,0x3,compress,face_idx);
 	  is_same_node = is_same_node && tmp;
 	  splicetime+=usecond();
 	} else {
-	  auto tmp  = Gather(source,dimension,shift,0x3,compress,face_idx,point);
+	  nosplicetime-=usecond();
 	  auto tmp  = Gather(source,dimension,shift,0x3,compress,face_idx);
 	  is_same_node = is_same_node && tmp;
 	  nosplicetime+=usecond();
 	}
      } else {
 	if(splice_dim){
 	  splicetime-=usecond();
 	  // if checkerboard is unfavourable take two passes
 	  // both with block stride loop iteration
-	  auto tmp1 =  GatherSimd(source,dimension,shift,0x1,compress,face_idx,point);
+	  auto tmp1 =  GatherSimd(source,dimension,shift,0x1,compress,face_idx);
-	  auto tmp2 =  GatherSimd(source,dimension,shift,0x2,compress,face_idx,point);
+	  auto tmp2 =  GatherSimd(source,dimension,shift,0x2,compress,face_idx);
 	  is_same_node = is_same_node && tmp1 && tmp2;
 	  splicetime+=usecond();
 	} else {
-	  auto tmp1 = Gather(source,dimension,shift,0x1,compress,face_idx,point);
+	  nosplicetime-=usecond();
-	  auto tmp2 = Gather(source,dimension,shift,0x2,compress,face_idx,point);
+	  auto tmp1 = Gather(source,dimension,shift,0x1,compress,face_idx);
 	  auto tmp2 = Gather(source,dimension,shift,0x2,compress,face_idx);
 	  is_same_node = is_same_node && tmp1 && tmp2;
 	  nosplicetime+=usecond();
 	}
      }
    }
@ -469,10 +523,13 @@ public:
  template<class compressor>
  void HaloGather(const Lattice<vobj> &source,compressor &compress)
  {
    mpi3synctime_g-=usecond();
    _grid->StencilBarrier();// Synch shared memory on a single nodes
    mpi3synctime_g+=usecond();
    // conformable(source.Grid(),_grid);
    assert(source.Grid()==_grid);
    halogtime-=usecond();
    u_comm_offset=0;
@ -486,6 +543,7 @@ public:
    assert(u_comm_offset==_unified_buffer_size);
    accelerator_barrier();
    halogtime+=usecond();
  }
  /////////////////////////
@ -498,72 +556,14 @@ public:
    Mergers.resize(0);
    MergersSHM.resize(0);
    Packets.resize(0);
-    CopyReceiveBuffers.resize(0);
+    calls++;
    CachedTransfers.resize(0);
  }
-  void AddCopy(void *from,void * to, Integer bytes)
+  void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){
  {
    //    std::cout << "Adding CopyReceiveBuffer "<<std::hex<<from<<" "<<to<<std::dec<<" "<<bytes<<std::endl;
    CopyReceiveBuffer obj;
    obj.from_p = from;
    obj.to_p = to;
    obj.bytes= bytes;
    CopyReceiveBuffers.push_back(obj);
  }
  // Replays every copy recorded in CopyReceiveBuffers.
  // For each entry, copies bytes/sizeof(cobj) elements of type cobj from
  // from_p to to_p, one element per loop index j.
  void CommsCopy()
  {
    //    These are device resident MPI buffers.
    for(int i=0;i<CopyReceiveBuffers.size();i++){
      cobj *from=(cobj *)CopyReceiveBuffers[i].from_p;
      cobj *to  =(cobj *)CopyReceiveBuffers[i].to_p;
      // Integer division: any trailing bytes that do not fill a whole cobj are not copied.
      Integer words = CopyReceiveBuffers[i].bytes/sizeof(cobj);
      //    std::cout << "CopyReceiveBuffer "<<std::hex<<from<<" "<<to<<std::dec<<" "<<words*sizeof(cobj)<<std::endl;
      // NOTE(review): accelerator_forNB is presumably a non-blocking launch, so
      // completion would rely on a later barrier by the caller -- confirm.
      accelerator_forNB(j, words, cobj::Nsimd(), {
 	  coalescedWrite(to[j] ,coalescedRead(from [j]));
      });
    }
  }
  // Checks whether a transfer with the same description has already been
  // recorded in CachedTransfers.
  //
  // Two transfers match when direction, OrthogPlane, DestProc, bytes, lane
  // and cb are all equal; recv_buf is deliberately not part of the comparison.
  //
  // Returns 1 if a match is found; in that case a copy from the cached
  // transfer's recv_buf into this recv_buf is registered via AddCopy.
  // Returns 0 otherwise, after appending this transfer to CachedTransfers.
  Integer CheckForDuplicate(Integer direction, Integer OrthogPlane, Integer DestProc, void *recv_buf,Integer lane,Integer bytes,Integer cb)
  {
    // Record describing the transfer being queried; stored only if no match exists.
    CachedTransfer obj;
    obj.direction   = direction;
    obj.OrthogPlane = OrthogPlane;
    obj.DestProc    = DestProc;
    obj.recv_buf    = recv_buf;
    obj.lane        = lane;
    obj.bytes       = bytes;
    obj.cb          = cb;
    // Linear scan over everything cached so far.
    for(int i=0;i<CachedTransfers.size();i++){
      if (   (CachedTransfers[i].direction  ==direction)
 	   &&(CachedTransfers[i].OrthogPlane==OrthogPlane)
 	   &&(CachedTransfers[i].DestProc   ==DestProc)
 	   &&(CachedTransfers[i].bytes      ==bytes)
 	   &&(CachedTransfers[i].lane       ==lane)
 	   &&(CachedTransfers[i].cb         ==cb)
 	     ){
 	//	std::cout << "Found duplicate plane dir "<<direction<<" plane "<< OrthogPlane<< " simd "<<lane << " relproc "<<DestProc<< " bytes "<<bytes <<std::endl;
 	// Duplicate: schedule a copy from the earlier receive buffer instead of a new transfer.
 	AddCopy(CachedTransfers[i].recv_buf,recv_buf,bytes);
 	return 1;
      }
    }
    //    std::cout << "No duplicate plane dir "<<direction<<" plane "<< OrthogPlane<< " simd "<<lane << " relproc "<<DestProc<<"  bytes "<<bytes<<std::endl;
    // First occurrence: remember it so later identical requests can be matched.
    CachedTransfers.push_back(obj);
    return 0;
  }
  // Appends one Packet to the Packets list.
  //   xmit, rcv  : stored as the packet's send_buf and recv_buf
  //   to, from   : stored as to_rank and from_rank
  //   do_send,
  //   do_recv    : stored as per-packet flags (presumably enabling the send
  //                and receive halves independently -- confirm at the consumer)
  //   bytes      : stored as the packet size
  // Nothing is communicated here; the packet is only recorded.
  void AddPacket(void *xmit,void * rcv,
 		 Integer to, Integer do_send,
 		 Integer from, Integer do_recv,
 		 Integer bytes){
    Packet p;
    p.send_buf = xmit;
    p.recv_buf = rcv;
    p.to_rank  = to;
    p.from_rank= from;
    p.do_send  = do_send;
    p.do_recv  = do_recv;
    p.bytes    = bytes;
    Packets.push_back(p);
  }
@ -574,7 +574,7 @@ public:
    d.buffer_size = buffer_size;
    dv.push_back(d);
  }
-  void AddMerge(cobj *merge_p,Vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
+  void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
    Merge m;
    m.type     = type;
    m.mpointer = merge_p;
@ -583,17 +583,23 @@ public:
    mv.push_back(m);
  }
  // Convenience overload: first runs CommsCopy() to perform the recorded
  // buffer copies, then forwards to the three-argument CommsMerge using the
  // Mergers and Decompressions lists.
  template<class decompressor>  void CommsMerge(decompressor decompress)    {
    CommsCopy();
    CommsMerge(decompress,Mergers,Decompressions);
  }
  // Same merge as CommsMerge, but applied to the MergersSHM and
  // DecompressionsSHM lists, after an accelerator barrier and a stencil barrier.
  // The time spent in the barriers is accumulated into mpi3synctime and the
  // time spent merging into shmmergetime (both via usecond()).
  template<class decompressor>  void CommsMergeSHM(decompressor decompress) {
    mpi3synctime-=usecond();
    accelerator_barrier();
    _grid->StencilBarrier();// Synchronise shared memory on a single node
    mpi3synctime+=usecond();
    shmmergetime-=usecond();
    CommsMerge(decompress,MergersSHM,DecompressionsSHM);
    shmmergetime+=usecond();
  }
  template<class decompressor>
-  void CommsMerge(decompressor decompress,std::vector<Merge> &mm,std::vector<Decompress> &dd)
+  void CommsMerge(decompressor decompress,std::vector<Merge> &mm,std::vector<Decompress> &dd) {
-  {
+
    mergetime-=usecond();
    for(int i=0;i<mm.size();i++){
      auto mp = &mm[i].mpointer[0];
      auto vp0= &mm[i].vpointers[0][0];
@ -603,7 +609,9 @@ public:
 	  decompress.Exchange(mp,vp0,vp1,type,o);
      });
    }
    mergetime+=usecond();
    decompresstime-=usecond();
    for(int i=0;i<dd.size();i++){
      auto kp = dd[i].kernel_p;
      auto mp = dd[i].mpi_p;
@ -611,6 +619,7 @@ public:
 	decompress.Decompress(kp,mp,o);
      });
    }
    decompresstime+=usecond();
  }
  ////////////////////////////////////////
  // Set up routines
@ -647,58 +656,19 @@ public:
      }
    }
  }
-  /// Introduce a block structure and switch off comms on boundaries
+
  // Marks the stencil as Dirichlet (_dirichlet = 1) and recomputes, for every
  // stencil point, whether this rank sends and/or receives on that leg.
  //
  // dirichlet_block[d] is the block length in dimension d; a value of 0 leaves
  // the leg's send/recv flags at their default (on iff the dimension is split
  // over more than one processor).
  //
  // For a non-zero block, a flag is cleared when the corresponding edge of this
  // rank's local extent (ld*pc or ld*(pc+1)) is a multiple of the block length.
  // NOTE(review): ld*pc / ld*(pc+1) look like the global coordinates of the
  // rank's lower / upper edge -- confirm against the grid definition.
  void DirichletBlock(const Coordinate &dirichlet_block)
  {
    this->_dirichlet = 1;
    for(int ii=0;ii<this->_npoints;ii++){
      int dimension    = this->_directions[ii];
      int displacement = this->_distances[ii];
      // 'shift' and 'fd' are assigned but not read anywhere in this function.
      int shift = displacement;
      int gd = _grid->_gdimensions[dimension];
      int fd = _grid->_fdimensions[dimension];
      int pd = _grid->_processors [dimension];
      int ld = gd/pd;
      int pc = _grid->_processor_coor[dimension];
      ///////////////////////////////////////////
      // Figure out dirichlet send and receive
      // on this leg of stencil.
      ///////////////////////////////////////////
      int comm_dim        = _grid->_processors[dimension] >1 ;
      int block = dirichlet_block[dimension];
      // Default: communicate in both directions iff the dimension is distributed.
      this->_comms_send[ii] = comm_dim;
      this->_comms_recv[ii] = comm_dim;
      if ( block ) {
 	// Displacements reaching beyond one local extent are not handled here.
 	assert(abs(displacement) < ld );
 	if( displacement > 0 ) {
 	  // High side, low side
 	  // | <--B--->|
 	  // |    |    |
 	  //           noR
 	  // noS
 	  if ( (ld*(pc+1) ) % block == 0 ) this->_comms_recv[ii] = 0;
 	  if ( ( ld*pc ) % block == 0    ) this->_comms_send[ii] = 0;
 	} else {
 	  // Non-positive displacement: the send and receive roles are swapped
 	  // relative to the branch above.
 	  // High side, low side
 	  // | <--B--->|
 	  // |    |    |
 	  //           noS
 	  // noR
 	  if ( (ld*(pc+1) ) % block == 0 ) this->_comms_send[ii] = 0;
 	  if ( ( ld*pc ) % block    == 0 ) this->_comms_recv[ii] = 0;
 	}
      }
    }
  }
  CartesianStencil(GridBase *grid,
 		   int npoints,
 		   int checkerboard,
 		   const std::vector<int> &directions,
 		   const std::vector<int> &distances,
 		   Parameters p)
    : shm_bytes_thr(npoints),
      comm_bytes_thr(npoints),
      comm_enter_thr(npoints),
      comm_leave_thr(npoints),
      comm_time_thr(npoints)
  {
    this->_dirichlet = 0;
    face_table_computed=0;
    _grid    = grid;
    this->parameters=p;
@ -711,8 +681,6 @@ public:
    this->_simd_layout = _grid->_simd_layout; // copy simd_layout to give access to Accelerator Kernels
    this->_directions = StencilVector(directions);
    this->_distances  = StencilVector(distances);
    this->_comms_send.resize(npoints); 
    this->_comms_recv.resize(npoints); 
    this->same_node.resize(npoints);
    _unified_buffer_size=0;
@ -731,27 +699,24 @@ public:
      int displacement = distances[i];
      int shift = displacement;
      int gd = _grid->_gdimensions[dimension];
      int fd = _grid->_fdimensions[dimension];
      int pd = _grid->_processors [dimension];
      int ld = gd/pd;
      int rd = _grid->_rdimensions[dimension];
      int pc = _grid->_processor_coor[dimension];
      this->_permute_type[point]=_grid->PermuteType(dimension);
      this->_checkerboard = checkerboard;
      //////////////////////////
      // the permute type
      //////////////////////////
      int simd_layout     = _grid->_simd_layout[dimension];
      int comm_dim        = _grid->_processors[dimension] >1 ;
      int splice_dim      = _grid->_simd_layout[dimension]>1 && (comm_dim);
      int rotate_dim      = _grid->_simd_layout[dimension]>2;
      this->_comms_send[ii] = comm_dim;
      this->_comms_recv[ii] = comm_dim;
      assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported
      int sshift[2];
      //////////////////////////
      // Underlying approach. For each local site build
      // up a table containing the npoint "neighbours" and whether they
@ -852,7 +817,6 @@ public:
    GridBase *grid=_grid;
    const int Nsimd = grid->Nsimd();
    int comms_recv      = this->_comms_recv[point];
    int fd              = _grid->_fdimensions[dimension];
    int ld              = _grid->_ldimensions[dimension];
    int rd              = _grid->_rdimensions[dimension];
@ -909,9 +873,7 @@ public:
      if ( (shiftpm== 1) && (sx<x) && (grid->_processor_coor[dimension]==grid->_processors[dimension]-1) ) {
 	wraparound = 1;
      }
-
+      if (!offnode) {
      // Wrap locally dirichlet support case OR node local
      if ( (offnode==0) || (comms_recv==0)  ) {
 	int permute_slice=0;
 	CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
@ -1028,14 +990,11 @@ public:
  }
  template<class compressor>
-  int Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx, int point)
+  int Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx)
  {
    typedef typename cobj::vector_type vector_type;
    typedef typename cobj::scalar_type scalar_type;
    int comms_send   = this->_comms_send[point] ;
    int comms_recv   = this->_comms_recv[point] ;
    assert(rhs.Grid()==_grid);
    //	  conformable(_grid,rhs.Grid());
@ -1058,11 +1017,9 @@ public:
      int sx        = (x+sshift)%rd;
      int comm_proc = ((x+sshift)/rd)%pd;
-      
+
      if (comm_proc) {
 	int words = buffer_size;
 	if (cbmask != 0x3) words=words>>1;
@ -1094,53 +1051,44 @@ public:
 	  recv_buf=this->u_recv_buf_p;
 	}
 	cobj *send_buf;
 	send_buf = this->u_send_buf_p; // Gather locally, must send
-	
+
 	////////////////////////////////////////////////////////
 	// Gather locally
 	////////////////////////////////////////////////////////
 	gathertime-=usecond();
 	assert(send_buf!=NULL);
-	if ( comms_send ) 
+	Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++;
-	  Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so);
+	gathertime+=usecond();
 	face_idx++;
-	int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[u_comm_offset],0,bytes,cbmask);
+	///////////////////////////////////////////////////////////
-	if ( (!duplicate) ) { // Force comms for now
+	// Build a list of things to do after we synchronise GPUs
 	// Start comms now???
 	///////////////////////////////////////////////////////////
 	AddPacket((void *)&send_buf[u_comm_offset],
 		  (void *)&recv_buf[u_comm_offset],
 		  xmit_to_rank,
 		  recv_from_rank,
 		  bytes);
-	  ///////////////////////////////////////////////////////////
+	if ( compress.DecompressionStep() ) {
 	  // Build a list of things to do after we synchronise GPUs
 	  // Start comms now???
 	  ///////////////////////////////////////////////////////////
 	  AddPacket((void *)&send_buf[u_comm_offset],
 		    (void *)&recv_buf[u_comm_offset],
 		    xmit_to_rank, comms_send,
 		    recv_from_rank, comms_recv,
 		    bytes);
 	}
 	if ( compress.DecompressionStep()  ) {
 	  AddDecompress(&this->u_recv_buf_p[u_comm_offset],
 			&recv_buf[u_comm_offset],
 			words,Decompressions);
 	}
 	u_comm_offset+=words;
-	}
+      }
    }
    return 0;
  }
  template<class compressor>
-  int  GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx,int point)
+  int  GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx)
  {
    const int Nsimd = _grid->Nsimd();
    const int maxl =2;// max layout in a direction
    int comms_send   = this->_comms_send[point] ;
    int comms_recv   = this->_comms_recv[point] ;
    int fd = _grid->_fdimensions[dimension];
    int rd = _grid->_rdimensions[dimension];
    int ld = _grid->_ldimensions[dimension];
@ -1172,8 +1120,8 @@ public:
    int bytes = (reduced_buffer_size*datum_bytes)/simd_layout;
    assert(bytes*simd_layout == reduced_buffer_size*datum_bytes);
-    Vector<cobj *> rpointers(maxl);
+    std::vector<cobj *> rpointers(maxl);
-    Vector<cobj *> spointers(maxl);
+    std::vector<cobj *> spointers(maxl);
    ///////////////////////////////////////////
    // Work out what to send where
@ -1205,11 +1153,12 @@ public:
 				  &face_table[face_idx][0],
 				  face_table[face_idx].size()*sizeof(face_table_host[0]));
 	}
 	gathermtime-=usecond();
 	//	if ( comms_send )
 	Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type);
 	face_idx++;
 	gathermtime+=usecond();
 	//spointers[0] -- low
 	//spointers[1] -- high
@ -1238,13 +1187,8 @@ public:
 	    rpointers[i] = rp;
-	    int duplicate = CheckForDuplicate(dimension,sx,nbr_proc,(void *)rp,i,bytes,cbmask);
+	    AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes);
-	    if ( !duplicate  ) { 
+
 	      AddPacket((void *)sp,(void *)rp,
 			xmit_to_rank,comms_send,
 			recv_from_rank,comms_recv,
 			bytes);
 	    }
 	  } else {
--- a/Grid/tensors/Tensor_exp.h
+++ b/Grid/tensors/Tensor_exp.h
@ -55,7 +55,7 @@ template<class vtype, int N> accelerator_inline iVector<vtype, N> Exponentiate(c
 // Specialisation: Cayley-Hamilton exponential for SU(3)
-#ifndef GRID_ACCELERATED
+#ifndef GRID_CUDA
 template<class vtype, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0>::type * =nullptr> 
 accelerator_inline iMatrix<vtype,3> Exponentiate(const iMatrix<vtype,3> &arg, RealD alpha  , Integer Nexp = DEFAULT_MAT_EXP )
 {
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@ -195,12 +195,15 @@ void acceleratorInit(void)
 #ifdef GRID_SYCL
 cl::sycl::queue *theGridAccelerator;
 cl::sycl::queue *theCopyAccelerator;
 void acceleratorInit(void)
 {
  int nDevices = 1;
  cl::sycl::gpu_selector selector;
  cl::sycl::device selectedDevice { selector };
  theGridAccelerator = new sycl::queue (selectedDevice);
  //  theCopyAccelerator = new sycl::queue (selectedDevice);
  theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
  zeInit(0);
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Peter Boyle	97a098636d	FermToProp	2022-11-30 15:36:35 -05:00
Peter Boyle	e13930c8b2	Faster fermtoprop case	2022-11-30 15:11:29 -05:00
Peter Boyle	0655dab466	Open MP on host enabled	2022-11-08 13:38:54 -08:00
Peter Boyle	7f097bcc28	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2022-11-08 13:23:40 -08:00
Peter Boyle	5c75aa5008	Device mem	2022-11-08 13:22:57 -08:00
Peter Boyle	1873101362	PVC	2022-11-08 13:22:45 -08:00
Peter Boyle	63fd1dfa62	Config on PVC	2022-11-08 13:22:09 -08:00
Peter Boyle	bd68861b28	SYCL sum	2022-11-08 12:49:26 -08:00
Peter Boyle	82e959f66c	SYCL reduction	2022-11-08 12:45:25 -08:00
Peter Boyle	62e52de06d	Merge pull request #414 from fjosw/feat/eCloverGPU Compact Exponential Cloverterm on GPU	2022-11-01 09:15:44 -04:00
Fabian Joswig	184adeedb8	feat: renamed open_boundaries to fixedBoundaries	2022-10-26 12:53:46 +01:00
Fabian Joswig	5fa6a8b96d	docs: CompactClover debug info generalized.	2022-10-26 12:41:14 +01:00
Fabian Joswig	a2a879b668	docs: CompactClover Debug Info improved.	2022-10-25 17:20:42 +01:00
Fabian Joswig	9317d893b2	docs: details about inversion of CompactClover term added.	2022-10-25 17:10:06 +01:00
Fabian Joswig	86075fdd45	feat: MassTerm and ExponentiateClover merged into InstantiateClover	2022-10-25 17:05:34 +01:00
Fabian Joswig	b36442e263	feat: CloverHelpers::InvertClover implemented which handles the inversion of the Clover term depending on clover type and the boundary conditions.	2022-10-25 16:57:01 +01:00
Fabian Joswig	513d797ea6	fix: signature of CompactWilsonCloverHelpers::Exponentiate fixed.	2022-10-25 16:17:22 +01:00
Fabian Joswig	9e4835a3e3	feat: changed CompactWilsonExpClover exponentiation to Taylor expansion with Horner scheme.	2022-10-25 15:19:43 +01:00
Peter Boyle	477ebf24f4	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2022-10-04 11:19:43 -07:00
Peter Boyle	0d5639f707	Run script update	2022-10-04 11:13:41 -07:00
Peter Boyle	413312f9a9	Benchmark the halo construction. THe bye counts are out and should be doubled for SIMD directions	2022-10-04 11:12:59 -07:00
Peter Boyle	03508448f8	Remove verbose	2022-10-04 11:12:15 -07:00
Peter Boyle	e1e5c75023	Stencil gather improvements - SVM was running slow and used for a pointer array that wasn't needed to be in SVM	2022-10-04 11:11:10 -07:00
Peter Boyle	9296299b61	Better commenting	2022-10-04 11:10:34 -07:00
Peter Boyle	913fbca74a	Merge pull request #410 from gkanwar/photon_and_sha_patches Photon.h and SHA256 patches	2022-08-31 18:01:45 -04:00
Gurtej Kanwar	60dfb49afa	Remove FP16 tests when FP16 is disabled	2022-08-21 17:29:55 +02:00
Gurtej Kanwar	554c238359	Update OpenSSL digest to use high-level methods This avoids deprecation warnings when compiling against OpenSSL 3.0 but should still be backwards compatible. It is the recommended way to use the digest API going forward.	2022-08-21 17:28:57 +02:00
Gurtej Kanwar	f922adf05e	Fix Photon ComplexField type	2022-08-21 16:16:18 +02:00
Peter Boyle	188d2c7a4d	PVC default, ignore ATS	2022-08-02 08:38:53 -07:00
Peter Boyle	17d7177105	Files for SYCL	2022-08-02 08:33:39 -07:00
Peter Boyle	bb0a0da47a	inon blocking caution due to SYCL	2022-08-02 08:09:43 -07:00
Peter Boyle	84110166e4	Fix the fence	2022-08-02 08:00:43 -07:00
Peter Boyle	d32b923b6c	Fencing on a stream in SYCL is needed. Didn't know that ... gulp	2022-08-02 07:58:04 -07:00
Peter Boyle	2ab1af5754	Ensure no synchronize and not optoin dependent	2022-07-19 09:51:06 -07:00
Peter Boyle	5f8892bf03	Mistake pointed out by Camilo	2022-07-19 09:31:51 -07:00
Peter Boyle	f14e7e51e7	Grid accelerator	2022-07-12 10:56:22 -07:00
Peter Boyle	042ab1a052	Update GridStd.h	2022-06-27 13:21:39 -04:00
Peter Boyle	2df98a99bc	Merge pull request #406 from giordano/patch-1 Update default value of gen-simd-width in README	2022-06-14 17:46:25 -04:00
Mosè Giordano	315ea18be2	Update default value of gen-simd-width in README	2022-06-14 22:41:05 +01:00
Peter Boyle	a9c2e1df03	Merge pull request #404 from rrhodgson/feature/json_nvcc Feature/json nvcc	2022-05-25 13:30:11 -04:00
Raoul Hodgson	da4daea57a	Updated json to latest release 3.10.5	2022-05-24 16:16:06 +01:00
Peter Boyle	af3b065add	Merge pull request #403 from fjosw/fix/cuda_11_5_warnings Fixed nvcc 11.5+ warnings	2022-05-24 11:10:02 -04:00
Raoul Hodgson	e346154c5d	Updated json CUDA compile guards	2022-05-24 15:48:01 +01:00
Fabian Joswig	7937ac2bab	fix: conditional pragmas according to new NVCC_DIAG_PRAGMA_SUPPORT standard in pugixml/pugixml.cc	2022-05-24 15:31:03 +01:00
Fabian Joswig	e909aeedf0	fix: conditional pragmas according to new NVCC_DIAG_PRAGMA_SUPPORT standard in Grid_Eigen_Dense.h	2022-05-24 15:29:42 +01:00
Fabian Joswig	bab8aa8eb0	fix: conditional pragmas according to new NVCC_DIAG_PRAGMA_SUPPORT standard in DisableWarnings.h	2022-05-24 15:27:40 +01:00
Peter Boyle	38b22f05be	Merge pull request #402 from fjosw/fix/clover_warnings fixed clover warnings	2022-05-24 10:05:27 -04:00
Raoul Hodgson	3ca0de1c40	Fix json write for vector<string>	2022-05-24 14:37:33 +01:00
Raoul Hodgson	c7205d2a73	Removed nvcc guards for json	2022-05-24 14:30:26 +01:00
Fabian Joswig	617c5362c1	fix: fixed warning: missing return statement at end of non-void function in CloverHelpers	2022-05-24 11:37:33 +01:00
Peter Boyle	083b58e66d	Merge pull request #401 from JPRichings/LocalCoheranceDeflation Local coherance batch deflation	2022-05-20 11:44:22 -04:00
Peter Boyle	633427a2df	Merge pull request #400 from JPRichings/wilson_sweep bench wilson sweep fix	2022-05-20 11:43:40 -04:00
JPRichings	2031d6910a	Merge branch 'paboyle:develop' into wilson_sweep	2022-05-20 16:20:23 +01:00
JPRichings	79e34b3eb4	Local Coherence batch deflation	2022-05-19 14:53:17 +01:00
JPRichings	4f3d581ab4	Merge branch 'paboyle:develop' into LocalCoheranceDeflation	2022-05-19 14:46:17 +01:00
Peter Boyle	d16427b837	Merge pull request #399 from fjosw/fix/Nc_neq_3 fix: assert for dimensions of compact Wilson clover moved to constructor	2022-05-17 09:03:42 -04:00
James Richings	4b1997e2f3	wilson sweep test	2022-05-16 15:58:33 +01:00
James Richings	8939d5dc73	bugfix: eo operator called in correct location	2022-05-16 00:28:28 +01:00
James Richings	b051e00de0	Additional Local Coherance Deflation operator()	2022-05-16 00:25:13 +01:00
Fabian Joswig	8aa75b492f	Merge branch 'develop' into fix/Nc_neq_3	2022-05-10 14:22:03 +01:00
Peter Boyle	0274f40686	Merge pull request #389 from mbruno46/mbruno-eclover Feature/expClover	2022-05-10 09:18:19 -04:00
Peter Boyle	77aa147ce5	Merge branch 'develop' into mbruno-eclover	2022-05-10 09:16:53 -04:00
Fabian Joswig	32facbd02a	fix: assert for dimensions of compact Wilson clover moved to constructor.	2022-05-10 10:53:22 +01:00
Peter Boyle	4de50ab146	Merge pull request #396 from fjosw/fix/readd_config.h fix: readded Config.h and Version.h to HFILEs in Grid/Makefile.am	2022-05-09 08:26:48 -04:00
Fabian Joswig	8b12a61097	fix: readded Config.h and Version.h to HFILEs in Grid/Makefile.am	2022-05-09 11:53:22 +01:00
Peter Boyle	79ea027c0b	Merge pull request #377 from RJHudspith/develop NERSC and ILDG for non-SU(3) configuration checkpoints	2022-05-03 08:55:48 -04:00
Peter Boyle	62339d437f	Merge pull request #387 from lehner/feature/gpt Parity mass terms for domain wall fermions to enable 4d eofa	2022-05-03 08:52:18 -04:00
Peter Boyle	698e745276	Merge pull request #390 from fjosw/feature/conserved_current_wilson Conserved current for wilson fermions	2022-05-03 08:51:10 -04:00
Peter Boyle	9a6e2c315d	Merge pull request #394 from fjosw/fix/gauge_fix_ErrorOnNoConverge SteepestDescentGaugeFix now exits when the algorithm does not converge.	2022-05-03 08:49:26 -04:00
Fabian Joswig	e61fed87db	SteepestDescentGaugeFix now exits when the algorithm does not converge. This behaviour can be altered by setting err_on_no_converge to false.	2022-04-20 15:41:55 +01:00
Fabian Joswig	b8bc560b51	Test_wilson_conserved_current implemented, all 5d references removed.	2022-04-05 17:33:45 +01:00
Fabian Joswig	6bc2483d57	Merge branch 'feature/eclover' into feature/conserved_current_wilson	2022-04-05 15:26:49 +01:00
Fabian Joswig	82aecbf4cf	Test_wilson_conserved_current added	2022-04-05 15:26:39 +01:00
Mattia Bruno	ee23a76aa0	Merge pull request #2 from fjosw/feature/eclover Feature/eclover	2022-04-05 13:30:13 +02:00
Fabian Joswig	d7191e5a02	SeqConservedCurrent implemented for Wilson fermions	2022-04-05 11:48:56 +01:00
Fabian Joswig	c8a824425b	Error message added if another conserved current than vector is requested for Wilson type fermions.	2022-04-05 10:58:22 +01:00
Fabian Joswig	f23626a6b8	End scope by additional block in CloverHelpers.h	2022-04-02 16:08:15 +01:00
Fabian Joswig	6577a03d16	Explcitly closed views in Exponentiate_Clover	2022-04-01 18:39:12 +01:00
Fabian Joswig	427c8695fe	Change signs and prefactors for conserved current to mimic the 5d version.	2022-04-01 16:20:21 +01:00
Fabian Joswig	9e82c468ab	Multiplication of diagonal mass in exponentiate fixed for gpus	2022-04-01 15:54:43 +01:00
Fabian Joswig	603fd96747	Missing link multiplication added.	2022-04-01 10:58:56 +01:00
Fabian Joswig	fe993c0836	/=2 replaced by *=0.5	2022-03-31 17:08:17 +01:00
Fabian Joswig	cdf31d52c1	GaugeGrid and typo fixed	2022-03-31 17:04:35 +01:00
Fabian Joswig	0542eaf1da	First version of conserved current contraction for Wilson type quarks	2022-03-31 17:02:09 +01:00
Christoph Lehner	317bdcf158	nerscio parametrization	2022-03-24 13:10:47 +01:00
Mattia Bruno	9ca2c98882	Merge branch 'develop' of https://github.com/paboyle/Grid into mbruno-eclover	2022-03-22 15:31:37 +01:00
Mattia Bruno	53ae01a34a	Merge pull request #1 from fjosw/feature/eclover Feature/eclover	2022-03-15 15:23:35 +01:00
Christoph Lehner	76c294a7ba	open bc fix	2022-03-08 13:55:16 +01:00
Fabian Joswig	0c0c2b1e20	Unnecessary arguments of CloverHelpers::Exponentiate_Clover removed.	2022-03-08 09:44:51 +00:00
Christoph Lehner	e2fc3a0f04	Merge pull request #28 from paboyle/develop Sync with Upstream	2022-03-08 09:58:51 +01:00
Fabian Joswig	451e7972fd	Reintroduced explicit inversion of the Clover term in case of the CompactExpClover because of the open boundary O(a) improvement. Changed the timing output to GridLogDebug	2022-03-07 17:43:33 +00:00
Fabian Joswig	56c089d347	Removed leftover comments	2022-03-07 16:40:20 +00:00
Fabian Joswig	acf740e44d	Merge pull request #1 from FelixPGZiegler/feature/eclover Feature/eclover	2022-03-07 16:25:11 +00:00
Felix Ziegler	182f513404	Merge remote-tracking branch 'fjosw/feature/eclover' into feature/eclover	2022-03-07 15:22:04 +00:00
Felix Ziegler	d5b2323a57	included Cayley-Hamilton exponentiation for the compact Wilson exp clover, bug fix for inverse of exp clover	2022-03-07 14:44:24 +00:00
FelixPGZiegler	bad18d4417	Merge branch 'paboyle:develop' into feature/eclover	2022-03-07 13:54:10 +00:00
Fabian Joswig	438caab25f	generate_instantiations.sh now correctly produces instantiations for CompactClover variant, redundant instantiations removed.	2022-02-27 18:27:18 +00:00
Fabian Joswig	239e2c1ee6	tests: wilson clover cg tests now include compact variant as well as exponential wilson clover operators	2022-02-27 18:26:34 +00:00
Fabian Joswig	013dc2ef33	tests: core tests for wilson clover and wilson exp clover including compact version extended/added	2022-02-27 18:13:47 +00:00
Christoph Lehner	9616811c3d	Merge branch 'feature/gpt' of https://github.com/lehner/Grid into feature/gpt	2022-02-24 22:03:05 +01:00
Christoph Lehner	8a3002c03b	separate left and right masses for CayleyFermion5D	2022-02-24 22:02:56 +01:00
Mattia Bruno	71034f828e	attempt to fix broken WilsonExpClover; Compact version still broken will be replaced by F.Joswig	2022-02-23 01:02:27 +01:00
Mattia Bruno	11437930c5	cleaned up definitions of wilsonclover fermions	2022-02-22 10:45:16 +01:00
Mattia Bruno	3d44aa9cb9	cleaned up cloverhelpers; fixed test compact_clover which runs	2022-02-22 01:10:19 +01:00
Mattia Bruno	2851870d70	expClover support via helpers template class	2022-02-22 00:05:43 +01:00
RJHudspith	0bd83cdbda	Fixes for Nc!=3 Nersc IO, Gauge and Gauge_NCxNC compatible with GLU. Trace normalisation changed in places removing explicit threes. Guards against non-su3 tests and tests failing when LIME is not compiled.	2021-11-28 21:51:03 +01:00