Test_evec_compression changes:

Added the ability to choose one of a variety of preselected basis sizes from the command line. The fine Lanczos now checks that enough evecs are generated and resizes the output to Nstop rather than to the actual number that converged (which can be larger).
Test_evec_compression enhancements:
2025-06-22 17:52:02 +01:00 · 2022-04-06 06:33:26 -07:00 · 2022-03-29 06:16:15 -07:00 · 2022-03-14 06:45:28 -07:00 · 2022-02-22 14:25:27 -05:00 · 2022-02-14 08:09:01 -08:00
125 changed files with 11006 additions and 2907 deletions
--- a/Grid/DisableWarnings.h
+++ b/Grid/DisableWarnings.h
@ -34,6 +34,9 @@ directory
 #if defined __GNUC__ && __GNUC__>=6
 #pragma GCC diagnostic ignored "-Wignored-attributes"
 #endif
 #if defined __GNUC__ 
 #pragma GCC diagnostic ignored "-Wpsabi"
 #endif
 //disables an Intel-compiler-specific warning (in json.hpp)
--- a/Grid/GridQCDcore.h
+++ b/Grid/GridQCDcore.h
@ -36,6 +36,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridCore.h>
 #include <Grid/qcd/QCD.h>
 #include <Grid/qcd/spin/Spin.h>
 #include <Grid/qcd/gparity/Gparity.h>
 #include <Grid/qcd/utils/Utils.h>
 #include <Grid/qcd/representations/Representations.h>
 NAMESPACE_CHECK(GridQCDCore);
--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@ -54,6 +54,7 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
 #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
--- a/Grid/algorithms/approx/Chebyshev.h
+++ b/Grid/algorithms/approx/Chebyshev.h
@ -292,6 +292,7 @@ public:
 template<class Field>
 class ChebyshevLanczos : public Chebyshev<Field> {
 private:
  std::vector<RealD> Coeffs;
  int order;
  RealD alpha;
--- a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
@ -49,6 +49,7 @@ NAMESPACE_BEGIN(Grid);
    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
    RealD TrueResidual;
    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
    LinearFunction<FieldF> *guesser;
@ -68,6 +69,7 @@ NAMESPACE_BEGIN(Grid);
    }
  void operator() (const FieldD &src_d_in, FieldD &sol_d){
    std::cout << GridLogMessage << "MixedPrecisionConjugateGradient: Starting mixed precision CG with outer tolerance " << Tolerance << " and inner tolerance " << InnerTolerance << std::endl;
    TotalInnerIterations = 0;
    GridStopWatch TotalTimer;
@ -80,6 +82,11 @@ NAMESPACE_BEGIN(Grid);
    RealD stop = src_norm * Tolerance*Tolerance;
    GridBase* DoublePrecGrid = src_d_in.Grid();
    //Generate precision change workspaces
    precisionChangeWorkspace wk_dp_from_sp(DoublePrecGrid, SinglePrecGrid);
    precisionChangeWorkspace wk_sp_from_dp(SinglePrecGrid, DoublePrecGrid);
    FieldD tmp_d(DoublePrecGrid);
    tmp_d.Checkerboard() = cb;
@ -97,6 +104,7 @@ NAMESPACE_BEGIN(Grid);
    FieldF sol_f(SinglePrecGrid);
    sol_f.Checkerboard() = cb;
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting initial inner CG with tolerance " << inner_tol << std::endl;
    ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
    CG_f.ErrorOnNoConverge = false;
@ -120,7 +128,7 @@ NAMESPACE_BEGIN(Grid);
      while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
      PrecChangeTimer.Start();
-      precisionChange(src_f, src_d);
+      precisionChange(src_f, src_d, wk_sp_from_dp);
      PrecChangeTimer.Stop();
      sol_f = Zero();
@ -130,6 +138,7 @@ NAMESPACE_BEGIN(Grid);
 	(*guesser)(src_f, sol_f);
      //Inner CG
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " << outer_iter << " starting inner CG with tolerance " << inner_tol << std::endl;
      CG_f.Tolerance = inner_tol;
      InnerCGtimer.Start();
      CG_f(Linop_f, src_f, sol_f);
@ -138,7 +147,7 @@ NAMESPACE_BEGIN(Grid);
      //Convert sol back to double and add to double prec solution
      PrecChangeTimer.Start();
-      precisionChange(tmp_d, sol_f);
+      precisionChange(tmp_d, sol_f, wk_dp_from_sp);
      PrecChangeTimer.Stop();
      axpy(sol_d, 1.0, tmp_d, sol_d);
@ -150,6 +159,7 @@ NAMESPACE_BEGIN(Grid);
    ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
    CG_d(Linop_d, src_d_in, sol_d);
    TotalFinalStepIterations = CG_d.IterationsToComplete;
    TrueResidual = CG_d.TrueResidual;
    TotalTimer.Stop();
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
@ -52,7 +52,7 @@ public:
  MultiShiftFunction shifts;
  std::vector<RealD> TrueResidualShift;
-  ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : 
+  ConjugateGradientMultiShift(Integer maxit, const MultiShiftFunction &_shifts) : 
    MaxIterations(maxit),
    shifts(_shifts)
  { 
@ -182,6 +182,9 @@ public:
    for(int s=0;s<nshift;s++) {
      axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
    }
    std::cout << GridLogIterative << "ConjugateGradientMultiShift: initial rn (|src|^2) =" << rn << " qq (|MdagM src|^2) =" << qq << " d ( dot(src, [MdagM + m_0]src) ) =" << d << " c=" << c << std::endl;
  ///////////////////////////////////////
  // Timers
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
@ -0,0 +1,411 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Christopher Kelly <ckelly@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
 #define GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
 NAMESPACE_BEGIN(Grid);
 //CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision. 
 //The residual is stored in single precision, but the search directions and solution are stored in double precision. 
 //Every update_freq iterations the residual is corrected in double precision. 
 //For safety a final regular CG is applied to clean up if necessary
 //Linop to add shift to input linop, used in cleanup CG
 namespace ConjugateGradientMultiShiftMixedPrecSupport{
 //Adaptor that presents (linop_base.HermOp + shift) as a linear operator in its own right.
 //Only the Hermitian entry points (HermOp / HermOpAndNorm) are implemented; every other
 //entry point aborts via assert(0).
 template<typename Field>
 class ShiftedLinop: public LinearOperatorBase<Field>{
 public:
   LinearOperatorBase<Field> &linop_base; //wrapped operator, held by reference (not owned)
   RealD shift;                           //coefficient of the input field added to the wrapped HermOp result

   ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift)
     : linop_base(_linop_base),
       shift(_shift)
   {}

   //Unsupported entry points
   void OpDiag (const Field &in, Field &out){
     assert(0);
   }
   void OpDir  (const Field &in, Field &out,int dir,int disp){
     assert(0);
   }
   void OpDirAll  (const Field &in, std::vector<Field> &out){
     assert(0);
   }
   void Op     (const Field &in, Field &out){
     assert(0);
   }
   void AdjOp  (const Field &in, Field &out){
     assert(0);
   }

   //out = linop_base.HermOp(in) + shift*in
   void HermOp(const Field &in, Field &out){
     linop_base.HermOp(in, out);
     axpy(out, shift, in, out);
   }

   //As HermOp, additionally returning n1 = Re<in|out> and n2 = |out|^2
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
     HermOp(in,out);
     const ComplexD in_dot_out = innerProduct(in,out);
     n1 = real(in_dot_out);
     n2 = norm2(out);
   }
 };
 };
 //Mixed-precision multi-shift conjugate gradient.
 //
 //For every shift s the solver produces psi[s] such that
 //   (HermOp + shifts.poles[s]) psi[s] = src
 //to the relative tolerance shifts.tolerances[s]. The matrix multiplication inside the
 //iteration uses the single-precision operator Linop_f; the residual is kept in single
 //precision while the search directions and solutions are kept in double precision.
 //Every ReliableUpdateFreq iterations the residual of the primary (index 0) shift is
 //recomputed in double precision. Once all shifts report convergence each solution is
 //checked with the double-precision operator, and any shift that misses its target is
 //cleaned up with a MixedPrecisionConjugateGradient solve.
 //
 //FieldD must have getPrecision == 2 and FieldF must have getPrecision == 1.
 template<class FieldD, class FieldF,
          typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
          typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
 class ConjugateGradientMultiShiftMixedPrec : public OperatorMultiFunction<FieldD>,
                                              public OperatorFunction<FieldD>
 {
 public:
   using OperatorFunction<FieldD>::operator();

   RealD   Tolerance; //Not referenced by this class; per-shift tolerances are taken from shifts.tolerances
   Integer MaxIterations;
   Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
   std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
   int verbose;
   MultiShiftFunction shifts;
   std::vector<RealD> TrueResidualShift;

   int ReliableUpdateFreq; //number of iterations between reliable updates; 0 disables reliable updates
   GridBase* SinglePrecGrid; //Grid for single-precision fields
   LinearOperatorBase<FieldF> &Linop_f; //single precision

   //maxit:              maximum number of multi-shift iterations (also passed as the iteration limits of the cleanup CG)
   //_shifts:            poles, residues, norm and per-shift tolerances (copied)
   //_SinglePrecGrid:    grid on which the single-precision work fields are allocated
   //_Linop_f:           single-precision operator (held by reference)
   //_ReliableUpdateFreq: number of iterations between double-precision residual replacements
   //
   //The initialiser list follows the member declaration order, and the scalar members that the
   //original left indeterminate (Tolerance, IterationsToComplete) are given defined values.
   ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
                                        GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
                                        int _ReliableUpdateFreq
                                        ) :
     Tolerance(0.),
     MaxIterations(maxit),
     IterationsToComplete(0),
     verbose(1),
     shifts(_shifts),
     ReliableUpdateFreq(_ReliableUpdateFreq),
     SinglePrecGrid(_SinglePrecGrid),
     Linop_f(_Linop_f)
   {
     IterationsToCompleteShift.resize(_shifts.order);
     TrueResidualShift.resize(_shifts.order);
   }

   //Single-output form: solves all shifts into temporaries and returns only the combined result in psi
   void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
   {
     GridBase *grid = src.Grid();
     int nshift = shifts.order;
     std::vector<FieldD> results(nshift,grid);
     (*this)(Linop,src,results,psi);
   }

   //Solves all shifts into 'results' and forms psi = shifts.norm*src + sum_i shifts.residues[i]*results[i]
   void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
   {
     int nshift = shifts.order;

     (*this)(Linop,src,results);

     psi = shifts.norm*src;
     for(int i=0;i<nshift;i++){
       psi = psi + shifts.residues[i]*results[i];
     }

     return;
   }

   //Core solver.
   //Linop_d: double-precision operator, used for reliable updates, the final check and the cleanup
   //src_d:   double-precision source
   //psi_d:   output solutions, one per shift; must already have shifts.order entries
   //On return IterationsToComplete, IterationsToCompleteShift and (when converged) TrueResidualShift are filled in.
   //If the iteration limit is reached a message is logged and psi_d holds the unconverged iterates.
   void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
   {
     GridBase *DoublePrecGrid = src_d.Grid();

     precisionChangeWorkspace wk_f_from_d(SinglePrecGrid, DoublePrecGrid);
     precisionChangeWorkspace wk_d_from_f(DoublePrecGrid, SinglePrecGrid);

     ////////////////////////////////////////////////////////////////////////
     // Convenience references to the info stored in "MultiShiftFunction"
     ////////////////////////////////////////////////////////////////////////
     int nshift = shifts.order;

     std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
     std::vector<RealD> &mresidual(shifts.tolerances);
     std::vector<RealD> alpha(nshift,1.0);

     //Double precision search directions
     FieldD p_d(DoublePrecGrid);
     std::vector<FieldD> ps_d(nshift, DoublePrecGrid);// Search directions (double precision)

     FieldD tmp_d(DoublePrecGrid);
     FieldD r_d(DoublePrecGrid);
     FieldD mmp_d(DoublePrecGrid);

     assert(psi_d.size()==nshift);
     assert(mass.size()==nshift);
     assert(mresidual.size()==nshift);

     // dynamic sized arrays on stack; 2d is a pain with vector
     RealD  bs[nshift];
     RealD  rsq[nshift];
     RealD  z[nshift][2];
     int     converged[nshift];

     const int       primary =0;

     //Primary shift fields CG iteration
     RealD a,b,c,d;
     RealD cp,bp,qq; //prev

     // Matrix mult fields
     FieldF r_f(SinglePrecGrid);
     FieldF p_f(SinglePrecGrid);
     FieldF tmp_f(SinglePrecGrid);
     FieldF mmp_f(SinglePrecGrid);
     FieldF src_f(SinglePrecGrid);
     precisionChange(src_f, src_d, wk_f_from_d);

     // Check lightest mass
     for(int s=0;s<nshift;s++){
       assert( mass[s]>= mass[primary] );
       converged[s]=0;
     }

     // Wire guess to zero
     // Residuals "r" are src
     // First search direction "p" is also src
     cp = norm2(src_d);

     // Handle trivial case of zero src.
     if( cp == 0. ){
       for(int s=0;s<nshift;s++){
         psi_d[s] = Zero();
         IterationsToCompleteShift[s] = 1;
         TrueResidualShift[s] = 0.;
       }
       IterationsToComplete = 1; //keep the aggregate count consistent with the per-shift counts
       return;
     }

     for(int s=0;s<nshift;s++){
       rsq[s] = cp * mresidual[s] * mresidual[s];
       std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
       ps_d[s] = src_d;
     }
     // r and p for primary
     r_f=src_f; //residual maintained in single
     p_f=src_f;
     p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys

     //MdagM+m[0]
     Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
     axpy(mmp_f,mass[0],p_f,mmp_f);
     RealD rn = norm2(p_f);
     d += rn*mass[0];

     b = -cp /d;

     // Set up the various shift variables
     int       iz=0;
     z[0][1-iz] = 1.0;
     z[0][iz]   = 1.0;
     bs[0]      = b;
     for(int s=1;s<nshift;s++){
       z[s][1-iz] = 1.0;
       z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0]));
       bs[s]      = b*z[s][iz];
     }

     // r += b[0] A.p[0]
     // c= norm(r)
     c=axpy_norm(r_f,b,mmp_f,r_f);

     for(int s=0;s<nshift;s++) {
       axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
     }

     ///////////////////////////////////////
     // Timers
     ///////////////////////////////////////
     GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;

     SolverTimer.Start();

     // Iteration loop
     int k;

     for (k=1;k<=MaxIterations;k++){

       a = c /cp;

       //Update double precision search direction by residual
       PrecChangeTimer.Start();
       precisionChange(r_d, r_f, wk_d_from_f);
       PrecChangeTimer.Stop();

       AXPYTimer.Start();
       axpy(p_d,a,p_d,r_d);

       for(int s=0;s<nshift;s++){
         if ( ! converged[s] ) {
           if (s==0){
             axpy(ps_d[s],a,ps_d[s],r_d);
           } else{
             RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
             axpby(ps_d[s],z[s][iz],as,r_d,ps_d[s]);
           }
         }
       }
       AXPYTimer.Stop();

       PrecChangeTimer.Start();
       precisionChange(p_f, p_d, wk_f_from_d); //get back single prec search direction for linop
       PrecChangeTimer.Stop();

       cp=c;
       MatrixTimer.Start();
       Linop_f.HermOp(p_f,mmp_f);
       d=real(innerProduct(p_f,mmp_f));
       MatrixTimer.Stop();

       AXPYTimer.Start();
       axpy(mmp_f,mass[0],p_f,mmp_f);
       AXPYTimer.Stop();
       RealD rn = norm2(p_f);
       d += rn*mass[0];

       bp=b;
       b=-cp/d;

       // Toggle the recurrence history
       bs[0] = b;
       iz = 1-iz;
       ShiftTimer.Start();
       for(int s=1;s<nshift;s++){
         if((!converged[s])){
           RealD z0 = z[s][1-iz];
           RealD z1 = z[s][iz];
           z[s][iz] = z0*z1*bp
             / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
           bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
         }
       }
       ShiftTimer.Stop();

       //Update double precision solutions
       AXPYTimer.Start();
       for(int s=0;s<nshift;s++){
         if( (!converged[s]) ) {
           axpy(psi_d[s],-bs[s]*alpha[s],ps_d[s],psi_d[s]);
         }
       }

       //Perform reliable update if necessary; otherwise update residual from single-prec mmp
       RealD c_f = axpy_norm(r_f,b,mmp_f,r_f);
       AXPYTimer.Stop();

       c = c_f;

       //A frequency of zero would make the modulo undefined; treat it as "never perform a reliable update"
       if(ReliableUpdateFreq != 0 && k % ReliableUpdateFreq == 0){
         //Replace r with true residual
         MatrixTimer.Start();
         Linop_d.HermOp(psi_d[0],mmp_d);
         MatrixTimer.Stop();

         AXPYTimer.Start();
         axpy(mmp_d,mass[0],psi_d[0],mmp_d);

         RealD c_d = axpy_norm(r_d, -1.0, mmp_d, src_d);
         AXPYTimer.Stop();

         std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_f <<" with |r|^2 = "<<c_d<<std::endl;

         PrecChangeTimer.Start();
         precisionChange(r_f, r_d, wk_f_from_d);
         PrecChangeTimer.Stop();
         c = c_d;
       }

       // Convergence checks
       int all_converged = 1;
       for(int s=0;s<nshift;s++){

         if ( (!converged[s]) ){
           IterationsToCompleteShift[s] = k;

           RealD css  = c * z[s][iz]* z[s][iz];

           if(css<rsq[s]){
             std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
             converged[s]=1;
           } else {
             all_converged=0;
           }

         }
       }

       if ( all_converged ){

         SolverTimer.Stop();
         std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
         std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;

         // Check answers
         const RealD cn = norm2(src_d); //same for every shift, so computed once
         for(int s=0; s < nshift; s++) {
           Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
           axpy(tmp_d,mass[s],psi_d[s],mmp_d);
           axpy(r_d,-alpha[s],src_d,tmp_d);
           RealD rn = norm2(r_d);
           TrueResidualShift[s] = std::sqrt(rn/cn);
           std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;

           //If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
           if(rn >= rsq[s]){
             CleanupTimer.Start();
             std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: performing cleanup step for shift " << s << std::endl;

             //Setup linear operators for final cleanup
             ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
             ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);

             MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d);
             cg(src_d, psi_d[s]);

             TrueResidualShift[s] = cg.TrueResidual;
             CleanupTimer.Stop();
           }
         }

         std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrec: Time Breakdown for body"<<std::endl;
         std::cout << GridLogMessage << "\tSolver    " << SolverTimer.Elapsed()     <<std::endl;
         std::cout << GridLogMessage << "\t\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
         std::cout << GridLogMessage << "\t\tMatrix    " << MatrixTimer.Elapsed()     <<std::endl;
         std::cout << GridLogMessage << "\t\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
         std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed()     <<std::endl;
         std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed()     <<std::endl;
         std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;

         IterationsToComplete = k;

         return;
       }

     }
     // ugly hack
     std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
     //Record that the full iteration budget was spent rather than leaving a stale or indeterminate count
     IterationsToComplete = MaxIterations;
     //  assert(0);
   }

 };
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/algorithms/iterative/Deflation.h
+++ b/Grid/algorithms/iterative/Deflation.h
@ -113,43 +113,7 @@ public:
    blockPromote(guess_coarse,guess,subspace);
    guess.Checkerboard() = src.Checkerboard();
  };
-
+};
  //Batched deflation guess: for every source src[j] build guess[j] from the coarse eigenvectors.
  //  1. block-project each source onto the coarse grid through 'subspace'
  //  2. accumulate  guess_coarse[j] += <evec_coarse[i] , src_coarse[j]> / eval_coarse[i] * evec_coarse[i]
  //     looping over eigenvectors on the outside so each evec is used for all sources in turn
  //  3. block-promote each coarse guess back to the fine grid and copy the source checkerboard
  //src:   fine-grid sources
  //guess: fine-grid outputs; indexed for every j < src.size(), so it must hold at least that many fields
  //evec_coarse must be non-empty as its first entry supplies the coarse grid.
  void operator()(const std::vector<FineField> &src,std::vector<FineField> &guess) {
    int Nevec = (int)evec_coarse.size();
    int Nsrc = (int)src.size();
    // make temp variables
    std::vector<CoarseField> src_coarse(Nsrc,evec_coarse[0].Grid());
    std::vector<CoarseField> guess_coarse(Nsrc,evec_coarse[0].Grid());
    //Preprocessing
    std::cout << GridLogMessage << "Start BlockProject for loop" << std::endl;
    for (int j=0;j<Nsrc;j++)
    {
      guess_coarse[j] = Zero();
      std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
      blockProject(src_coarse[j],src[j],subspace);
    }
    //deflation set up for eigen vector batchsize 1 and source batch size equal number of sources
    std::cout << GridLogMessage << "Start ProjectAccum for loop" << std::endl;
    for (int i=0;i<Nevec;i++)
    {
      std::cout << GridLogMessage << "ProjectAccum Nvec: " << i << std::endl;
      const CoarseField & tmp = evec_coarse[i];
      for (int j=0;j<Nsrc;j++)
      {
        axpy(guess_coarse[j],TensorRemove(innerProduct(tmp,src_coarse[j])) / eval_coarse[i],tmp,guess_coarse[j]);
      }
    }
    //postprocessing
    std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl;
    for (int j=0;j<Nsrc;j++)
    {
      //this loop promotes, so label the progress message accordingly (it previously said "BlockProject")
      std::cout << GridLogMessage << "BlockPromote iter: " << j << std::endl;
      blockPromote(guess_coarse[j],guess[j],subspace);
      guess[j].Checkerboard() = src[j].Checkerboard();
    }
  };
  };
--- a/Grid/algorithms/iterative/LocalCoherenceLanczos.h
+++ b/Grid/algorithms/iterative/LocalCoherenceLanczos.h
@ -44,6 +44,7 @@ public:
 				  int, MinRes);    // Must restart
 };
 //This class is the input parameter class for some testing programs
 struct LocalCoherenceLanczosParams : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
@ -155,6 +156,7 @@ public:
      _coarse_relax_tol(coarse_relax_tol)  
  {    };
  //evalMaxApprox: approximation of largest eval of the fine Chebyshev operator (suitably wrapped by block projection)
  int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  {
    CoarseField v(B);
@ -181,8 +183,16 @@ public:
    if( (vv<eresid*eresid) ) conv = 1;
    return conv;
  }
-  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
+
  //This function is called at the end of the coarse grid Lanczos. It promotes the coarse eigenvector 'B' to the fine grid,
  //applies a smoother to the result then computes the *fine grid* eigenvalue (output as 'eval').
  //evalMaxApprox should be the approximation of the largest eval of the fine Hermop. However when this function is called by IRL it actually passes the largest eval of the *Chebyshev* operator (as this is the max approx used for the TestConvergence above)
  //As the largest eval of the Chebyshev is typically several orders of magnitude larger this makes the convergence test pass even when it should not.
  //We therefore ignore evalMaxApprox here and use a value of 1.0 (note this value is already used by TestCoarse)
  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)  
  {
    evalMaxApprox = 1.0; //cf above
    GridBase *FineGrid = _subspace[0].Grid();    
    int checkerboard   = _subspace[0].Checkerboard();
    FineField fB(FineGrid);fB.Checkerboard() =checkerboard;
@ -201,13 +211,13 @@ public:
    eval   = vnum/vden;
    fv -= eval*fB;
    RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);
-
+    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
    std::cout.precision(13);
    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
 	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
-	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
+	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv << " target " << eresid*eresid
 	     <<std::endl;
    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
    if( (vv<eresid*eresid) ) return 1;
    return 0;
  }
@ -285,6 +295,10 @@ public:
    evals_coarse.resize(0);
  };
  //The block inner product is the inner product on the fine grid locally summed over the blocks
  //to give a Lattice<Scalar> on the coarse grid. This function orthonormalizes the fine-grid subspace
  //vectors under the block inner product. This step must be performed after computing the fine grid
  //eigenvectors and before computing the coarse grid eigenvectors.    
  void Orthogonalise(void ) {
    CoarseScalar InnerProd(_CoarseGrid);
    std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
@ -328,6 +342,8 @@ public:
    }
  }
  //While this method serves to check the coarse eigenvectors, it also recomputes the eigenvalues from the smoothed reconstructed eigenvectors
  //hence the smoother can be tuned after running the coarse Lanczos by using a different smoother here
  void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) 
  {
    assert(evals_fine.size() == nbasis);
@ -376,25 +392,31 @@ public:
    evals_fine.resize(nbasis);
    subspace.resize(nbasis,_FineGrid);
  }
  //cheby_op: Parameters of the fine grid Chebyshev polynomial used for the Lanczos acceleration
  //cheby_smooth: Parameters of a separate Chebyshev polynomial used after the Lanczos has completed to smooth out high frequency noise in the reconstructed fine grid eigenvectors prior to computing the eigenvalue
  //relax: Reconstructed eigenvectors (post smoothing) are naturally not as precise as true eigenvectors. This factor acts as a multiplier on the stopping condition when determining whether the results satisfy the user provided stopping condition
  void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
 		  int Nstop, int Nk, int Nm,RealD resid, 
 		  RealD MaxIt, RealD betastp, int MinRes)
  {
-    Chebyshev<FineField>                          Cheby(cheby_op);
+    Chebyshev<FineField>                          Cheby(cheby_op); //Chebyshev of fine operator on fine grid
-    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace);
+    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace); //Fine operator on coarse grid with intermediate fine grid conversion
-    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace); //Chebyshev of fine operator on coarse grid with intermediate fine grid conversion
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
    //////////////////////////////////////////////////////////////////////////////////////////////////
-    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth);
+    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth); //lower order Chebyshev of fine operator on fine grid used to smooth regenerated eigenvectors
-    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax); 
    evals_coarse.resize(Nm);
    evec_coarse.resize(Nm,_CoarseGrid);
    CoarseField src(_CoarseGrid);     src=1.0; 
    //Note the "tester" here is also responsible for generating the fine grid eigenvalues which are output into the "evals_coarse" array
    ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
    int Nconv=0;
    IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
@ -405,6 +427,14 @@ public:
      std::cout << i << " Coarse eval = " << evals_coarse[i]  << std::endl;
    }
  }
  //Get the fine eigenvector 'i' by reconstruction
  //evec: output, receives evec_coarse[i] block-promoted through 'subspace'
  //eval: output, receives evals_coarse[i]
  //i:    index of the requested eigenpair; must address an existing entry of both arrays
  void getFineEvecEval(FineField &evec, RealD &eval, const int i) const{
    //fail loudly on a bad index rather than reading past the end of either array
    assert(i >= 0 && i < (int)evec_coarse.size());
    assert(i < (int)evals_coarse.size());
    blockPromote(evec_coarse[i],evec,subspace);  
    eval = evals_coarse[i];
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/algorithms/iterative/PowerMethod.h
+++ b/Grid/algorithms/iterative/PowerMethod.h
@ -29,6 +29,8 @@ template<class Field> class PowerMethod
      RealD vnum = real(innerProduct(src_n,tmp)); // HermOp. 
      RealD vden = norm2(src_n); 
      RealD na = vnum/vden; 
      std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
      if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) { 
 	evalMaxApprox = na; 
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@ -46,3 +46,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_unary.h>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
 #include <Grid/lattice/Lattice_crc.h>
--- a/Grid/lattice/Lattice_crc.h
+++ b/Grid/lattice/Lattice_crc.h
@ -0,0 +1,42 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_crc.h
    Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 template<class vobj> uint32_t crc(Lattice<vobj> & buf)
 {
  autoView( buf_v , buf, CpuRead);
  return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
 }
 //Debugging aid: prints "FingerPrint <file> <line> <expression text> <crc of the expression>" to std::cout.
 //The expansion already ends in ';', so CRC(x) forms a complete statement on its own; take care
 //when using it as the sole body of an if/else, where a following ';' becomes an extra empty statement.
 #define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
 NAMESPACE_END(Grid);
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@ -142,15 +142,6 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
  return sumD_cpu(arg,osites);
 #endif  
 }
 //Sum 'osites' objects starting at 'arg', returning a scalar_objectD.
 //CUDA/HIP builds dispatch to sumD_gpu_large; all other builds use sumD_cpu.
 template<class vobj>
 inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
 {
 #if !defined(GRID_CUDA) && !defined(GRID_HIP)
   return sumD_cpu(arg,osites);
 #else
   return sumD_gpu_large(arg,osites);
 #endif
 }
 template<class vobj>
 inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
@ -168,22 +159,6 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
  return ssum;
 }
 template<class vobj>
 inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
 {
 #if defined(GRID_CUDA)||defined(GRID_HIP)
  autoView( arg_v, arg, AcceleratorRead);
  Integer osites = arg.Grid()->oSites();
  auto ssum= sum_gpu_large(&arg_v[0],osites);
 #else
  autoView(arg_v, arg, CpuRead);
  Integer osites = arg.Grid()->oSites();
  auto ssum= sum_cpu(&arg_v[0],osites);
 #endif
  arg.Grid()->GlobalSum(ssum);
  return ssum;
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Deterministic Reduction operations
 ////////////////////////////////////////////////////////////////////////////////////////////////////
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@ -23,7 +23,7 @@ unsigned int nextPow2(Iterator x) {
 }
 template <class Iterator>
-int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
+void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
  int device;
 #ifdef GRID_CUDA
@ -37,13 +37,13 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &
  Iterator sharedMemPerBlock   = gpu_props[device].sharedMemPerBlock;
  Iterator maxThreadsPerBlock  = gpu_props[device].maxThreadsPerBlock;
  Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;
-  /*  
+  
  std::cout << GridLogDebug << "GPU has:" << std::endl;
  std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl;
  std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl;
  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl;
  std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
-  */  
+  
  if (warpSize != WARP_SIZE) {
    std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
    exit(EXIT_FAILURE);
@ -53,12 +53,12 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &
  threads = warpSize;
  if ( threads*sizeofsobj > sharedMemPerBlock ) {
    std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
-    return 0;
+    exit(EXIT_FAILURE);
  }
  while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
  // keep all the streaming multiprocessors busy
  blocks = nextPow2(multiProcessorCount);
-  return 1;
+  
 }
 template <class sobj, class Iterator>
@ -198,7 +198,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
 // Possibly promote to double and sum
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites) 
+inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites) 
 {
  typedef typename vobj::scalar_objectD sobj;
  typedef decltype(lat) Iterator;
@ -207,9 +207,7 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
  Integer size = osites*nsimd;
  Integer numThreads, numBlocks;
-  int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
+  getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
  assert(ok);
  Integer smemSize = numThreads * sizeof(sobj);
  Vector<sobj> buffer(numBlocks);
@ -220,54 +218,6 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
  auto result = buffer_v[0];
  return result;
 }
 // Reduction for objects handled one vector_type "word" at a time.
 // The vobj array is reinterpreted as osites*words elements of vobj::vector_type,
 // where words = sizeof(vobj)/sizeof(vector_type). For every word index w the w-th
 // word of each site is gathered into a contiguous buffer, that buffer is reduced
 // with sumD_gpu_small (viewed as iScalar<vector>), and the result is stored in the
 // w-th scalar_typeD slot of the returned scalar_objectD.
 //   lat    : pointer to the first of osites objects
 //   osites : number of objects to reduce
 // sumD_gpu calls this when getNumBlocksAndThreads reports that the direct
 // (sumD_gpu_small) path cannot be used for sizeof(scalar_objectD).
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
 {
  typedef typename vobj::vector_type  vector;
  typedef typename vobj::scalar_typeD scalarD;
  typedef typename vobj::scalar_objectD sobj;
  sobj ret;
  // The result is filled in one scalarD at a time through this alias
  scalarD *ret_p = (scalarD *)&ret;
  const int words = sizeof(vobj)/sizeof(vector);
  // Scratch space holding a single word from every site
  Vector<vector> buffer(osites);
  vector *dat = (vector *)lat;
  vector *buf = &buffer[0];
  // Same scratch storage, typed so that sumD_gpu_small can reduce it
  iScalar<vector> *tbuf =(iScalar<vector> *)  &buffer[0];
  for(int w=0;w<words;w++) {
    // Gather word w of each site (stride `words` in the source array)
    accelerator_for(ss,osites,1,{
 	buf[ss] = dat[ss*words+w];
      });
    ret_p[w] = sumD_gpu_small(tbuf,osites);
  }
  return ret;
 }
 // Dispatching GPU reduction: returns the sum of osites objects as vobj::scalar_objectD.
 // getNumBlocksAndThreads is queried with sizeof(scalar_objectD); if it reports success
 // the whole object is reduced in one pass by sumD_gpu_small, otherwise the reduction
 // falls back to the word-by-word sumD_gpu_large.
 // NOTE(review): this relies on getNumBlocksAndThreads returning an int status; it does
 // not compile against a void-returning variant.
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
 {
  typedef typename vobj::vector_type  vector;
  typedef typename vobj::scalar_typeD scalarD;
  typedef typename vobj::scalar_objectD sobj;
  sobj ret;
  Integer nsimd= vobj::Nsimd();
  Integer size = osites*nsimd;
  // numThreads/numBlocks are only needed to make the query; the chosen path recomputes what it needs
  Integer numThreads, numBlocks;
  int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
  if ( ok ) {
    ret = sumD_gpu_small(lat,osites);
  } else {
    ret = sumD_gpu_large(lat,osites);
  }
  return ret;
 }
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Return as same precision as input performing reduction in double precision though
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -280,13 +230,6 @@ inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
  return result;
 }
-template <class vobj>
+
 inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
 {
  typedef typename vobj::scalar_object sobj;
  sobj result;
  result = sumD_gpu_large(lat,osites);
  return result;
 }
 NAMESPACE_END(Grid);
--- a/Grid/lattice/Lattice_rng.h
+++ b/Grid/lattice/Lattice_rng.h
@ -32,8 +32,9 @@
 #include <random>
 #ifdef RNG_SITMO
-#include <Grid/sitmo_rng/sitmo_prng_engine.hpp>
+#include <Grid/random/sitmo_prng_engine.hpp>
 #endif 
 #include <Grid/random/gaussian.h>
 #if defined(RNG_SITMO)
 #define RNG_FAST_DISCARD
@ -142,8 +143,8 @@ public:
  std::vector<RngEngine>                             _generators;
  std::vector<std::uniform_real_distribution<RealD> > _uniform;
-  std::vector<std::normal_distribution<RealD> >       _gaussian;
+  std::vector<Grid::gaussian_distribution<RealD> >       _gaussian;
-  std::vector<std::discrete_distribution<int32_t> >   _bernoulli;
+  //  std::vector<std::discrete_distribution<int32_t> >   _bernoulli;
  std::vector<std::uniform_int_distribution<uint32_t> > _uid;
  ///////////////////////
@ -243,8 +244,8 @@ public:
  GridSerialRNG() : GridRNGbase() {
    _generators.resize(1);
    _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
-    _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
+    _gaussian.resize(1,gaussian_distribution<RealD>(0.0,1.0) );
-    _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
+    //    _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
    _uid.resize(1,std::uniform_int_distribution<uint32_t>() );
  }
@ -357,8 +358,8 @@ public:
    _generators.resize(_vol);
    _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
-    _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
+    _gaussian.resize(_vol,gaussian_distribution<RealD>(0.0,1.0) );
-    _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
+    //    _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
    _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
  }
@ -515,11 +516,11 @@ public:
 template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l)   { rng.fill(l,rng._uniform);  }
 template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); }
-template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}
+//template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}
 template <class sobj> inline void random(GridSerialRNG &rng,sobj &l)   { rng.fill(l,rng._uniform  ); }
 template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); }
-template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }
+//template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@ -855,7 +855,7 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
 template<class vobj>
-void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
+void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
 {
  typedef typename vobj::scalar_object sobj;
@ -1080,54 +1080,96 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
  });
 }
-//Convert a Lattice from one precision to another
+//The workspace for a precision change operation allowing for the reuse of the mapping to save time on subsequent calls
-template<class VobjOut, class VobjIn>
+class precisionChangeWorkspace{
-void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
+  std::pair<Integer,Integer>* fmap_device; //device pointer
-{
+public:
-  assert(out.Grid()->Nd() == in.Grid()->Nd());
+  precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid){
-  for(int d=0;d<out.Grid()->Nd();d++){
+    //Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
-    assert(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
+    assert(out_grid->Nd() == in_grid->Nd());
-  }
+    for(int d=0;d<out_grid->Nd();d++){
-  out.Checkerboard() = in.Checkerboard();
+      assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
  GridBase *in_grid=in.Grid();
  GridBase *out_grid = out.Grid();
  typedef typename VobjOut::scalar_object SobjOut;
  typedef typename VobjIn::scalar_object SobjIn;
  int ndim = out.Grid()->Nd();
  int out_nsimd = out_grid->Nsimd();
  std::vector<Coordinate > out_icoor(out_nsimd);
  for(int lane=0; lane < out_nsimd; lane++){
    out_icoor[lane].resize(ndim);
    out_grid->iCoorFromIindex(out_icoor[lane], lane);
  }
  std::vector<SobjOut> in_slex_conv(in_grid->lSites());
  unvectorizeToLexOrdArray(in_slex_conv, in);
  autoView( out_v , out, CpuWrite);
  thread_for(out_oidx,out_grid->oSites(),{
    Coordinate out_ocoor(ndim);
    out_grid->oCoorFromOindex(out_ocoor, out_oidx);
    ExtractPointerArray<SobjOut> ptrs(out_nsimd);      
    Coordinate lcoor(out_grid->Nd());
    for(int lane=0; lane < out_nsimd; lane++){
      for(int mu=0;mu<ndim;mu++)
 	lcoor[mu] = out_ocoor[mu] + out_grid->_rdimensions[mu]*out_icoor[lane][mu];
      int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions);
      ptrs[lane] = &in_slex_conv[llex];
    }
-    merge(out_v[out_oidx], ptrs, 0);
+    int Nsimd_out = out_grid->Nsimd();
-  });
+
    std::vector<Coordinate> out_icorrs(out_grid->Nsimd()); //reuse these
    for(int lane=0; lane < out_grid->Nsimd(); lane++)
      out_grid->iCoorFromIindex(out_icorrs[lane], lane);
    std::vector<std::pair<Integer,Integer> > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd
    thread_for(out_oidx,out_grid->oSites(),{
 	Coordinate out_ocorr; 
 	out_grid->oCoorFromOindex(out_ocorr, out_oidx);
 	Coordinate lcorr; //the local coordinate (common to both in and out as full coordinate)
 	for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
 	  out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr);
 	  //int in_oidx = in_grid->oIndex(lcorr), in_lane = in_grid->iIndex(lcorr);
 	  //Note oIndex and OcorrFromOindex (and same for iIndex) are not inverse for checkerboarded lattice, the former coordinates being defined on the full lattice and the latter on the reduced lattice
 	  //Until this is fixed we need to circumvent the problem locally. Here I will use the coordinates defined on the reduced lattice for simplicity
 	  int in_oidx = 0, in_lane = 0;
 	  for(int d=0;d<in_grid->_ndimension;d++){
 	    in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] );
 	    in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] );
 	  }
 	  fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair<Integer,Integer>( in_oidx, in_lane );
 	}
      });
    //Copy the map to the device (if we had a way to tell if an accelerator is in use we could avoid this copy for CPU-only machines)
    size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair<Integer,Integer>);
    fmap_device = (std::pair<Integer,Integer>*)acceleratorAllocDevice(fmap_bytes);
    acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes); 
  }
  //Prevent moving or copying
  precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete;
  precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete;
  precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete;
  precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete;
  std::pair<Integer,Integer> const* getMap() const{ return fmap_device; }
  ~precisionChangeWorkspace(){
    acceleratorFreeDevice(fmap_device);
  }
 };
 //Convert a lattice of one precision to another. The input workspace contains the mapping data.
 //  out       : destination lattice; its checkerboard is overwritten with that of 'in'
 //  in        : source lattice
 //  workspace : precomputed map; entry [out_lane + Nsimd_out*out_oidx] holds the (outer site, lane)
 //              of 'in' that feeds lane out_lane of outer site out_oidx of 'out'
 //The workspace is not checked against the grids of 'out' and 'in' here; it is the caller's
 //responsibility to pass one that was built for this pair of grids.
 template<class VobjOut, class VobjIn>
 void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
  static_assert( std::is_same<typename VobjOut::DoublePrecision, typename VobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
  out.Checkerboard() = in.Checkerboard();
  constexpr int Nsimd_out = VobjOut::Nsimd();
  //Raw pointer to the map so that it can be captured by value inside the accelerator loop
  std::pair<Integer,Integer> const* fmap_device = workspace.getMap();
  //Do the copy/precision change
  autoView( out_v , out, AcceleratorWrite);
  autoView( in_v , in, AcceleratorRead);
  accelerator_for(out_oidx, out.Grid()->oSites(), 1,{
      //Map entries for the Nsimd_out lanes of this output outer site
      std::pair<Integer,Integer> const* fmap_osite = fmap_device + out_oidx*Nsimd_out;
      for(int out_lane=0; out_lane < Nsimd_out; out_lane++){      
 	int in_oidx = fmap_osite[out_lane].first;
 	int in_lane = fmap_osite[out_lane].second;
 	copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane);
      }
    });
 }
 //Convert a Lattice from one precision to another
 //Generate the workspace in place; if multiple calls with the same mapping are performed, consider pregenerating the workspace and reusing
 //Every call constructs a temporary precisionChangeWorkspace from out.Grid() and in.Grid() and
 //destroys it on return, so the cost of building the map is paid on each call.
 template<class VobjOut, class VobjIn>
 void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
  precisionChangeWorkspace workspace(out.Grid(), in.Grid());
  precisionChange(out, in, workspace);
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Communicate between grids
 ////////////////////////////////////////////////////////////////////////////////
--- a/Grid/log/Log.cc
+++ b/Grid/log/Log.cc
@ -69,6 +69,7 @@ GridLogger GridLogDebug  (1, "Debug", GridLogColours, "PURPLE");
 GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
 GridLogger GridLogIterative  (1, "Iterative", GridLogColours, "BLUE");
 GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
 GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE");
 void GridLogConfigure(std::vector<std::string> &logstreams) {
  GridLogError.Active(0);
@ -79,6 +80,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
  GridLogPerformance.Active(0);
  GridLogIntegrator.Active(1);
  GridLogColours.Active(0);
  GridLogHMC.Active(1);
  for (int i = 0; i < logstreams.size(); i++) {
    if (logstreams[i] == std::string("Error"))       GridLogError.Active(1);
@ -87,7 +89,8 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
    if (logstreams[i] == std::string("Iterative"))   GridLogIterative.Active(1);
    if (logstreams[i] == std::string("Debug"))       GridLogDebug.Active(1);
    if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
-    if (logstreams[i] == std::string("Integrator"))  GridLogIntegrator.Active(1);
+    if (logstreams[i] == std::string("NoIntegrator"))  GridLogIntegrator.Active(0);
    if (logstreams[i] == std::string("NoHMC"))         GridLogHMC.Active(0);
    if (logstreams[i] == std::string("Colours"))     GridLogColours.Active(1);
  }
 }
--- a/Grid/log/Log.h
+++ b/Grid/log/Log.h
@ -182,6 +182,7 @@ extern GridLogger GridLogDebug  ;
 extern GridLogger GridLogPerformance;
 extern GridLogger GridLogIterative  ;
 extern GridLogger GridLogIntegrator  ;
 extern GridLogger GridLogHMC;
 extern Colours    GridLogColours;
 std::string demangle(const char* name) ;
--- a/Grid/parallelIO/IldgIO.h
+++ b/Grid/parallelIO/IldgIO.h
@ -31,7 +31,6 @@ directory
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <string>
 #include <map>
 #include <pwd.h>
@ -655,8 +654,7 @@ class IldgWriter : public ScidacWriter {
    // Fill ILDG header data struct
    //////////////////////////////////////////////////////
    ildgFormat ildgfmt ;
-    const std::string stNC = std::to_string( Nc ) ;
+    ildgfmt.field     = std::string("su3gauge");
    ildgfmt.field          = std::string("su"+stNC+"gauge");
    if ( format == std::string("IEEE32BIG") ) { 
      ildgfmt.precision = 32;
@ -873,8 +871,7 @@ class IldgReader : public GridLimeReader {
    } else { 
      assert(found_ildgFormat);
-      const std::string stNC = std::to_string( Nc ) ;
+      assert ( ildgFormat_.field == std::string("su3gauge") );
      assert ( ildgFormat_.field == std::string("su"+stNC+"gauge") );
      ///////////////////////////////////////////////////////////////////////////////////////
      // Populate our Grid metadata as best we can
@ -882,7 +879,7 @@ class IldgReader : public GridLimeReader {
      std::ostringstream vers; vers << ildgFormat_.version;
      FieldMetaData_.hdr_version = vers.str();
-      FieldMetaData_.data_type = std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC);
+      FieldMetaData_.data_type = std::string("4D_SU3_GAUGE_3X3");
      FieldMetaData_.nd=4;
      FieldMetaData_.dimension.resize(4);
--- a/Grid/parallelIO/MetaData.h
+++ b/Grid/parallelIO/MetaData.h
@ -6,8 +6,8 @@
    Copyright (C) 2015
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@ -182,8 +182,8 @@ class GaugeStatistics
 public:
  void operator()(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
  {
-    header.link_trace = WilsonLoops<Impl>::linkTrace(data);
+    header.link_trace=WilsonLoops<Impl>::linkTrace(data);
-    header.plaquette  = WilsonLoops<Impl>::avgPlaquette(data);
+    header.plaquette =WilsonLoops<Impl>::avgPlaquette(data);
  }
 };
 typedef GaugeStatistics<PeriodicGimplD> PeriodicGaugeStatistics;
@ -203,24 +203,20 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
 //////////////////////////////////////////////////////////////////////
 inline void reconstruct3(LorentzColourMatrix & cm)
 {
-  assert( Nc < 4 && Nc > 1 ) ;
+  const int x=0;
  const int y=1;
  const int z=2;
  for(int mu=0;mu<Nd;mu++){
-    #if Nc == 2
+    cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
-      cm(mu)()(1,0) = -adj(cm(mu)()(0,y)) ;
+    cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
-      cm(mu)()(1,1) =  adj(cm(mu)()(0,x)) ;
+    cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
    #else
      const int x=0 , y=1 , z=2 ; // a little disinenuous labelling
      cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
      cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
      cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
    #endif
  }
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Some data types for intermediate storage
 ////////////////////////////////////////////////////////////////////////////////
-template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, Nc-1>, Nd >;
+template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;
 typedef iLorentzColour2x3<Complex>  LorentzColour2x3;
 typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
@ -282,6 +278,7 @@ struct GaugeSimpleMunger{
 template <class fobj, class sobj>
 struct GaugeSimpleUnmunger {
  void operator()(sobj &in, fobj &out) {
    for (int mu = 0; mu < Nd; mu++) {
      for (int i = 0; i < Nc; i++) {
@ -320,8 +317,8 @@ template<class fobj,class sobj>
 struct Gauge3x2munger{
  void operator() (fobj &in,sobj &out){
    for(int mu=0;mu<Nd;mu++){
-      for(int i=0;i<Nc-1;i++){
+      for(int i=0;i<2;i++){
-	for(int j=0;j<Nc;j++){
+	for(int j=0;j<3;j++){
 	  out(mu)()(i,j) = in(mu)(i)(j);
 	}}
    }
@ -333,8 +330,8 @@ template<class fobj,class sobj>
 struct Gauge3x2unmunger{
  void operator() (sobj &in,fobj &out){
    for(int mu=0;mu<Nd;mu++){
-      for(int i=0;i<Nc-1;i++){
+      for(int i=0;i<2;i++){
-	for(int j=0;j<Nc;j++){
+	for(int j=0;j<3;j++){
 	  out(mu)(i)(j) = in(mu)()(i,j);
 	}}
    }
--- a/Grid/parallelIO/NerscIO.h
+++ b/Grid/parallelIO/NerscIO.h
@ -9,7 +9,6 @@
    Author: Matt Spraggs <matthew.spraggs@gmail.com>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@ -31,8 +30,6 @@
 #ifndef GRID_NERSC_IO_H
 #define GRID_NERSC_IO_H
 #include <string>
 NAMESPACE_BEGIN(Grid);
 using namespace Grid;
@ -42,9 +39,11 @@ using namespace Grid;
 ////////////////////////////////////////////////////////////////////////////////
 class NerscIO : public BinaryIO { 
 public:
  typedef Lattice<vLorentzColourMatrixD> GaugeField;
  // Enable/disable exiting if the plaquette in the header does not match the value computed (default true)
  static bool & exitOnReadPlaquetteMismatch(){ static bool v=true; return v; }
  static inline void truncate(std::string file){
    std::ofstream fout(file,std::ios::out);
  }
@ -148,17 +147,15 @@ public:
    std::string format(header.floating_point);
-    const int ieee32big = (format == std::string("IEEE32BIG"));
+    int ieee32big = (format == std::string("IEEE32BIG"));
-    const int ieee32    = (format == std::string("IEEE32"));
+    int ieee32    = (format == std::string("IEEE32"));
-    const int ieee64big = (format == std::string("IEEE64BIG"));
+    int ieee64big = (format == std::string("IEEE64BIG"));
-    const int ieee64    = (format == std::string("IEEE64") || \
+    int ieee64    = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
 			   format == std::string("IEEE64LITTLE"));
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    // depending on datatype, set up munger;
    // munger is a function of <floating point, Real, data_type>
-    const std::string stNC = std::to_string( Nc ) ;
+    if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
    if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE") ) {
      if ( ieee32 || ieee32big ) {
 	BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F> 
 	  (Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
@ -169,7 +166,7 @@ public:
 	  (Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
 	   nersc_csum,scidac_csuma,scidac_csumb);
      }
-    } else if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC) ) {
+    } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
      if ( ieee32 || ieee32big ) {
 	BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF>
 	  (Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
@ -203,7 +200,7 @@ public:
      std::cerr << " nersc_csum  " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
      exit(0);
    }
-    assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
+    if(exitOnReadPlaquetteMismatch()) assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
    assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
    assert(nersc_csum == header.checksum );
@ -214,29 +211,27 @@ public:
  template<class GaugeStats=PeriodicGaugeStatistics>
  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
-					std::string ens_label = std::string("DWF"),
+					std::string ens_label = std::string("DWF"))
 					std::string ens_id = std::string("UKQCD"),
 					unsigned int sequence_number = 1)
  {
-    writeConfiguration(Umu,file,0,1,ens_label,ens_id,sequence_number);
+    writeConfiguration(Umu,file,0,1,ens_label);
  }
  template<class GaugeStats=PeriodicGaugeStatistics>
  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
 					int two_row,
 					int bits32,
-					std::string ens_label = std::string("DWF"),
+					std::string ens_label = std::string("DWF"))
 					std::string ens_id = std::string("UKQCD"),
 					unsigned int sequence_number = 1)
  {
    typedef vLorentzColourMatrixD vobj;
    typedef typename vobj::scalar_object sobj;
    FieldMetaData header;
-    header.sequence_number = sequence_number;
+    ///////////////////////////////////////////
-    header.ensemble_id     = ens_id;
+    // Following should become arguments
    ///////////////////////////////////////////
    header.sequence_number = 1;
    header.ensemble_id     = std::string("UKQCD");
    header.ensemble_label  = ens_label;
    header.hdr_version     = "1.0" ;
    typedef LorentzColourMatrixD fobj3D;
    typedef LorentzColour2x3D    fobj2D;
@ -250,14 +245,10 @@ public:
    uint64_t offset;
-    // Sod it -- always write NcxNc double
+    // Sod it -- always write 3x3 double
-    header.floating_point  = std::string("IEEE64BIG");
+    header.floating_point = std::string("IEEE64BIG");
-    const std::string stNC = std::to_string( Nc ) ;
+    header.data_type      = std::string("4D_SU3_GAUGE_3x3");
-    if( two_row ) {
+    GaugeSimpleUnmunger<fobj3D,sobj> munge;
      header.data_type = std::string("4D_SU" + stNC + "_GAUGE" );
    } else {
      header.data_type = std::string("4D_SU" + stNC + "_GAUGE_" + stNC + "x" + stNC );
    }
    if ( grid->IsBoss() ) { 
      truncate(file);
      offset = writeHeader(header,file);
@ -265,15 +256,8 @@ public:
    grid->Broadcast(0,(void *)&offset,sizeof(offset));
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
-    if( two_row ) {
+    BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
-      Gauge3x2unmunger<fobj2D,sobj> munge;
+					      nersc_csum,scidac_csuma,scidac_csumb);
      BinaryIO::writeLatticeObject<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point,
 						nersc_csum,scidac_csuma,scidac_csumb);
    } else {
      GaugeSimpleUnmunger<fobj3D,sobj> munge;
      BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
 						nersc_csum,scidac_csuma,scidac_csumb);
    }
    header.checksum = nersc_csum;
    if ( grid->IsBoss() ) { 
      writeHeader(header,file);
@ -305,7 +289,8 @@ public:
    header.plaquette=0.0;
    MachineCharacteristics(header);
-    uint64_t offset;
+	uint64_t offset;
 #ifdef RNG_RANLUX
    header.floating_point = std::string("UINT64");
    header.data_type      = std::string("RANLUX48");
@ -345,7 +330,7 @@ public:
    GridBase *grid = parallel.Grid();
-    uint64_t offset = readHeader(file,grid,header);
+	uint64_t offset = readHeader(file,grid,header);
    FieldMetaData clone(header);
--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@ -63,6 +63,7 @@ static constexpr int Ngp=2; // gparity index range
 #define ColourIndex  (2)
 #define SpinIndex    (1)
 #define LorentzIndex (0)
 #define GparityFlavourIndex (0)
 // Also should make these a named enum type
 static constexpr int DaggerNo=0;
@ -87,6 +88,8 @@ template<typename T> struct isCoarsened {
 template <typename T> using IfCoarsened    = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
 template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;
 const int GparityFlavourTensorIndex = 3; //TensorLevel counts from the bottom!
 // ChrisK very keen to add extra space for Gparity doubling.
 //
 // Also add domain wall index, in a way where Wilson operator 
@ -110,8 +113,10 @@ template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVec
    template<typename vtype> using iSpinColourSpinColourMatrix  = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;
 template<typename vtype> using iGparityFlavourVector                = iVector<iScalar<iScalar<vtype> >, Ngp>;
 template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
 template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
 template<typename vtype> using iGparityFlavourMatrix = iMatrix<iScalar<iScalar<vtype> >, Ngp>;
 // Spin matrix
 typedef iSpinMatrix<Complex  >          SpinMatrix;
@ -176,6 +181,16 @@ typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
 typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
 typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;
 //G-parity flavour matrix
 typedef iGparityFlavourMatrix<Complex> GparityFlavourMatrix;
 typedef iGparityFlavourMatrix<ComplexF> GparityFlavourMatrixF;
 typedef iGparityFlavourMatrix<ComplexD> GparityFlavourMatrixD;
 typedef iGparityFlavourMatrix<vComplex> vGparityFlavourMatrix;
 typedef iGparityFlavourMatrix<vComplexF> vGparityFlavourMatrixF;
 typedef iGparityFlavourMatrix<vComplexD> vGparityFlavourMatrixD;
 // Spin vector
 typedef iSpinVector<Complex >           SpinVector;
 typedef iSpinVector<ComplexF>           SpinVectorF;
@ -220,6 +235,16 @@ typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
 typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
 typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
 typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
 //G-parity flavour vector
 typedef iGparityFlavourVector<Complex >         GparityFlavourVector;
 typedef iGparityFlavourVector<ComplexF>         GparityFlavourVectorF;
 typedef iGparityFlavourVector<ComplexD>         GparityFlavourVectorD;
 typedef iGparityFlavourVector<vComplex >         vGparityFlavourVector;
 typedef iGparityFlavourVector<vComplexF>         vGparityFlavourVectorF;
 typedef iGparityFlavourVector<vComplexD>         vGparityFlavourVectorD;
 // singlets
 typedef iSinglet<Complex >         TComplex;     // FIXME This is painful. Tensor singlet complex type.
--- a/Grid/qcd/action/ActionParams.h
+++ b/Grid/qcd/action/ActionParams.h
@ -36,7 +36,8 @@ NAMESPACE_BEGIN(Grid);
 // These can move into a params header and be given MacroMagic serialisation
 struct GparityWilsonImplParams {
-  Coordinate twists;
+  Coordinate twists; //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
                     //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
  GparityWilsonImplParams() : twists(Nd, 0) {};
 };
@ -65,7 +66,8 @@ struct StaggeredImplParams {
 				    RealD, tolerance, 
 				    int,   degree, 
 				    int,   precision,
-				    int,   BoundsCheckFreq);
+				    int,   BoundsCheckFreq,
 				    RealD, BoundsCheckTol);
  // MaxIter and tolerance, vectors??
@ -76,15 +78,61 @@ struct StaggeredImplParams {
 				RealD tol      = 1.0e-8, 
                           	int _degree    = 10,
 				int _precision = 64,
-				int _BoundsCheckFreq=20)
+				int _BoundsCheckFreq=20,
 				double _BoundsCheckTol=1e-6)
      : lo(_lo),
 	hi(_hi),
 	MaxIter(_maxit),
 	tolerance(tol),
 	degree(_degree),
        precision(_precision),
-        BoundsCheckFreq(_BoundsCheckFreq){};
+        BoundsCheckFreq(_BoundsCheckFreq),
        BoundsCheckTol(_BoundsCheckTol){};
  };
  /* Parameter bundle for the generalized rational action.
     The rational approximation targets (M^dag M)^{1/inv_pow}, with inv_pow the
     denominator of the fractional power. The default inv_pow=2 is the square
     root, which makes this equivalent to the OneFlavourRational action.
  */
    struct RationalActionParams : Serializable {
    GRID_SERIALIZABLE_CLASS_MEMBERS(RationalActionParams,
 				    int,   inv_pow,
 				    RealD, lo,               // lower eigenvalue bound of the rational approx
 				    RealD, hi,               // upper eigenvalue bound of the rational approx
 				    int,   MaxIter,          // iteration cap for msCG
 				    RealD, action_tolerance, // msCG tolerance used when evaluating the action
 				    int,   action_degree,    // rational approx degree setting used when evaluating the action
 				    RealD, md_tolerance,     // msCG tolerance used during MD integration
 				    int,   md_degree,        // rational approx degree setting used during MD integration
 				    int,   precision,        // precision of floating point arithmetic
 				    int,   BoundsCheckFreq); // how often the approximation is tested (with Metropolis degree/tolerance); 0 disables the check

  // Every argument is defaulted, so the struct is default-constructible.
  RationalActionParams(int   inv_pow_in          = 2,
 		       RealD lo_in               = 0.0,
 		       RealD hi_in               = 1.0,
 		       int   maxit_in            = 1000,
 		       RealD action_tolerance_in = 1.0e-8,
 		       int   action_degree_in    = 10,
 		       RealD md_tolerance_in     = 1.0e-8,
 		       int   md_degree_in        = 10,
 		       int   precision_in        = 64,
 		       int   BoundsCheckFreq_in  = 20)
    : inv_pow(inv_pow_in)
    , lo(lo_in)
    , hi(hi_in)
    , MaxIter(maxit_in)
    , action_tolerance(action_tolerance_in)
    , action_degree(action_degree_in)
    , md_tolerance(md_tolerance_in)
    , md_degree(md_degree_in)
    , precision(precision_in)
    , BoundsCheckFreq(BoundsCheckFreq_in)
  {}
  };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@ -68,16 +68,9 @@ public:
  ///////////////////////////////////////////////////////////////
  // Support for MADWF tricks
  ///////////////////////////////////////////////////////////////
-  RealD Mass(void) { return (mass_plus + mass_minus) / 2.0; };
+  RealD Mass(void) { return mass; };
  RealD MassPlus(void) { return mass_plus; };
  RealD MassMinus(void) { return mass_minus; };
  void  SetMass(RealD _mass) { 
-    mass_plus=mass_minus=_mass; 
+    mass=_mass; 
    SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs
  } ;
  void  SetMass(RealD _mass_plus, RealD _mass_minus) { 
    mass_plus=_mass_plus;
    mass_minus=_mass_minus;
    SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs
  } ;
  void  P(const FermionField &psi, FermionField &chi);
@ -115,7 +108,7 @@ public:
  void   MeooeDag5D    (const FermionField &in, FermionField &out);
  //    protected:
-  RealD mass_plus, mass_minus;
+  RealD mass;
  // Save arguments to SetCoefficientsInternal
  Vector<Coeff_t> _gamma;
--- a/Grid/qcd/action/fermion/CloverHelpers.h
+++ b/Grid/qcd/action/fermion/CloverHelpers.h
@ -1,433 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/WilsonCloverFermionImplementation.h
    Copyright (C) 2017 - 2022
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    Author: Mattia Bruno <mattia.bruno@cern.ch>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 #include <Grid/Grid.h>
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
 ////////////////////////////////////////////
 // Standard Clover
 //   (4+m0) + csw * clover_term
 // Exp Clover
 //   (4+m0) * exp(csw/(4+m0) clover_term)
 //   = (4+m0) + csw * clover_term + ...
 ////////////////////////////////////////////
 NAMESPACE_BEGIN(Grid);
 //////////////////////////////////
 // Generic Standard Clover
 //////////////////////////////////
 template<class Impl>
 class CloverHelpers: public WilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  typedef WilsonCloverHelpers<Impl> Helpers;

  // Standard clover set-up: shifts CloverTerm by diag_mass in place, then writes the
  // per-site inverse of the shifted term into CloverTermInv. The inverse is computed
  // with Eigen on one dense (Ns*DimRep) x (Ns*DimRep) matrix per local site.
  // csw_t is accepted but not read by this variant.
  static void Instantiate(CloverField& CloverTerm, CloverField& CloverTermInv, RealD csw_t, RealD diag_mass) {
    GridBase *grid = CloverTerm.Grid();
    CloverTerm += diag_mass;

    int nLocalSites = grid->lSites();
    int repDim      = Impl::Dimension;
    {
      autoView(clover_v,    CloverTerm,    CpuRead);
      autoView(cloverInv_v, CloverTermInv, CpuWrite);
      thread_for(site, nLocalSites, {
        Coordinate siteCoor;
        grid->LocalIndexToLocalCoor(site, siteCoor);

        typename SiteClover::scalar_object siteClover    = Zero();
        typename SiteClover::scalar_object siteCloverInv = Zero();
        peekLocalSite(siteClover, clover_v, siteCoor);

        // Flatten the (spin, colour) x (spin, colour) site tensor into a dense matrix
        Eigen::MatrixXcd dense = Eigen::MatrixXcd::Zero(Ns * repDim, Ns * repDim);
        for (int sRow = 0; sRow < Ns; sRow++) {
          for (int sCol = 0; sCol < Ns; sCol++) {
            for (int cRow = 0; cRow < repDim; cRow++) {
              for (int cCol = 0; cCol < repDim; cCol++) {
                auto entry = siteClover()(sRow, sCol)(cRow, cCol);
                dense(cRow + sRow * repDim, cCol + sCol * repDim) = std::complex<double>(entry);
              }
            }
          }
        }

        Eigen::MatrixXcd denseInv = dense.inverse();

        // Unflatten the inverse back into the site tensor
        for (int sRow = 0; sRow < Ns; sRow++) {
          for (int sCol = 0; sCol < Ns; sCol++) {
            for (int cRow = 0; cRow < repDim; cRow++) {
              for (int cCol = 0; cCol < repDim; cCol++) {
                siteCloverInv()(sRow, sCol)(cRow, cCol) = denseInv(cRow + sRow * repDim, cCol + sCol * repDim);
              }
            }
          }
        }

        // One write per site, after the whole inverse has been assembled
        pokeLocalSite(siteCloverInv, cloverInv_v, siteCoor);
      });
    }
  }

  // Forwards unchanged to the WilsonCloverHelpers implementation.
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    return Helpers::Cmunu(U, lambda, mu, nu);
  }
 };
 //////////////////////////////////
 // Generic Exp Clover
 //////////////////////////////////
 template<class Impl>
 class ExpCloverHelpers: public WilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
  typedef WilsonCloverHelpers<Impl> Helpers;

  // Can this be avoided?
  // Sets every (spin,spin)(colour,colour) diagonal entry of each outer site of 'in' to c.
  // Off-diagonal entries are not touched, so callers zero the field first.
  // NOTE(review): 'in' is a const reference but is written through an AcceleratorWrite
  // view; the signature is kept as-is for existing callers.
  static void IdentityTimesC(const CloverField& in, RealD c) {
    int DimRep = Impl::Dimension;
    autoView(in_v, in, AcceleratorWrite);
    accelerator_for(ss, in.Grid()->oSites(), 1, {
      for (int sa=0; sa<Ns; sa++)
        for (int ca=0; ca<DimRep; ca++)
          in_v[ss]()(sa,sa)(ca,ca) = c;
    });
  }

  // Truncation order for the Taylor series of the exponential.
  // Starting from NMAX=1 and cond=R^2/2, NMAX is incremented and cond multiplied by
  // R/(NMAX+1) until cond*exp(R) <= prec; the final NMAX is returned.
  static int getNMAX(RealD prec, RealD R) {
    /* compute stop condition for exponential */
    int NMAX=1;
    RealD cond=R*R/2.;
    while (cond*std::exp(R)>prec) {
      NMAX++;
      cond*=R/static_cast<double>(NMAX+1);
    }
    return NMAX;
  }
  // Precision is selected by the field's vector type: 1e-12 for vComplexD, 1e-6 for vComplexF.
  // The field argument is only used for overload resolution.
  static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
  static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}

  // On entry Clover holds the clover term. On exit
  //   Clover    = diag_mass * exp( Clover/diag_mass)
  //   CloverInv = (1/diag_mass) * exp(-Clover/diag_mass)
  // with both exponentials truncated at order NMAX.
  static void Instantiate(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
    GridBase* grid = Clover.Grid();
    CloverField ExpClover(grid);

    int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);

    Clover *= (1.0/diag_mass);

    // Taylor expansion, slow but generic
    // Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
    // qN = cN
    // qn = cn + qn+1 X
    std::vector<RealD> cn(NMAX+1);   // cn[i] = 1/i!
    cn[0] = 1.0;
    for (int i=1; i<=NMAX; i++)
      cn[i] = cn[i-1] / RealD(i);

    ExpClover = Zero();
    IdentityTimesC(ExpClover, cn[NMAX]);
    for (int i=NMAX-1; i>=0; i--)
      ExpClover = ExpClover * Clover + cn[i];

    // prepare inverse: same series evaluated at -Clover
    CloverInv = (-1.0)*Clover;

    Clover = ExpClover * diag_mass;

    ExpClover = Zero();
    IdentityTimesC(ExpClover, cn[NMAX]);
    for (int i=NMAX-1; i>=0; i--)
      ExpClover = ExpClover * CloverInv + cn[i];

    CloverInv = ExpClover * (1.0/diag_mass);
  }

  // Not implemented for the exponential clover: aborts via assert in debug builds.
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    assert(0);
    // Reached only when assert is compiled out (NDEBUG). A value must still be
    // returned, otherwise flowing off the end of this function is undefined behaviour.
    return lambda;
  }
 };
 //////////////////////////////////
 // Compact Standard Clover
 //////////////////////////////////
 // Helper policy for the standard (non-exponentiated) clover term.
 // Provides the three static hooks MassTerm / Exponentiate_Clover / Cmunu.
 template<class Impl>
 class CompactCloverHelpers: public CompactWilsonCloverHelpers<Impl>,
                            public WilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);
  typedef WilsonCloverHelpers<Impl> Helpers;
  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
  // Adds diag_mass to Clover in place.
  static void MassTerm(CloverField& Clover, RealD diag_mass) {
    Clover += diag_mass;
  }
  // Intentionally empty for the standard clover: Diagonal and Triangle are left
  // untouched and csw_t / diag_mass are not read.
  static void Exponentiate_Clover(CloverDiagonalField& Diagonal,
                          CloverTriangleField& Triangle,
                          RealD csw_t, RealD diag_mass) {
    // Do nothing
  }
  // TODO: implement Cmunu for better performances with compact layout, but don't do it
  // here, but rather in WilsonCloverHelpers.h -> CompactWilsonCloverHelpers
  // Currently forwards unchanged to WilsonCloverHelpers<Impl>::Cmunu.
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    return Helpers::Cmunu(U, lambda, mu, nu);
  }
 };
 //////////////////////////////////
 // Compact Exp Clover
 //////////////////////////////////
 template<class Impl>
 class CompactExpCloverHelpers: public CompactWilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);
  template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;

  // Intentionally empty: for the exponential clover the mass factor is applied
  // at the end of Exponentiate_Clover instead.
  static void MassTerm(CloverField& Clover, RealD diag_mass) {
    // do nothing!
    // mass term is multiplied to exp(Clover) below
  }

  // Truncation order for the Taylor series of the exponential.
  // Starting from NMAX=1 and cond=R^2/2, NMAX is incremented and cond multiplied by
  // R/(NMAX+1) until cond*exp(R) <= prec; the final NMAX is returned.
  static int getNMAX(RealD prec, RealD R) {
    /* compute stop condition for exponential */
    int NMAX=1;
    RealD cond=R*R/2.;
    while (cond*std::exp(R)>prec) {
      NMAX++;
      cond*=R/static_cast<double>(NMAX+1);
    }
    return NMAX;
  }
  // Precision is selected by the field's vector type: 1e-12 for vComplexD, 1e-6 for vComplexF.
  // The field argument is only used for overload resolution.
  static int getNMAX(Lattice<iImplCloverDiagonal<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
  static int getNMAX(Lattice<iImplCloverDiagonal<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}

  // Evaluates dest = sum_{n=0..Niter} cN[n] * (alpha*arg)^n for a 6x6 matrix.
  // Powers above the fifth are folded back onto 1, A, ..., A^5 (A = alpha*arg) using
  // the coefficients p[0..4] built from traces of A^2..A^6, so only six scalar
  // coefficients qn[0..5] are carried through the Horner-style recursion.
  // cN must hold at least Niter+1 entries.
  static void ExponentiateHermitean6by6(const iMatrix<ComplexD,6> &arg, const RealD& alpha, const std::vector<RealD>& cN, const int Niter, iMatrix<ComplexD,6>& dest){
    typedef iMatrix<ComplexD,6> mat;

    RealD p[5];
    RealD trA2, trA3, trA4;

    mat A2, A3, A4, A5;
    A2 = alpha * alpha * arg * arg;
    A3 = alpha * arg * A2;
    A4 = A2 * A2;
    A5 = A2 * A3;

    trA2 = toReal( trace(A2) );
    trA3 = toReal( trace(A3) );
    trA4 = toReal( trace(A4) );

    p[0] = toReal( trace(A3 * A3)) / 6.0 - 0.125 * trA4 * trA2 - trA3 * trA3 / 18.0 + trA2 * trA2 * trA2/ 48.0;
    p[1] = toReal( trace(A5)) / 5.0 - trA3 * trA2 / 6.0;
    p[2] = trA4 / 4.0 - 0.125 * trA2 * trA2;   // reuse trA4 rather than recomputing trace(A4)
    p[3] = trA3 / 3.0;
    p[4] = 0.5 * trA2;

    RealD qnold[6] = { cN[Niter], 0.0, 0.0, 0.0, 0.0, 0.0 };
    RealD qn[6];
    // Seed qn so the result is well defined even if the recursion below never
    // runs (Niter == 0); previously qn was read uninitialised in that case.
    for(int k = 0; k < 6; k++) qn[k] = qnold[k];

    for(int i = Niter-1; i >= 0; i--)
    {
      qn[0] = p[0] * qnold[5] + cN[i];
      qn[1] = p[1] * qnold[5] + qnold[0];
      qn[2] = p[2] * qnold[5] + qnold[1];
      qn[3] = p[3] * qnold[5] + qnold[2];
      qn[4] = p[4] * qnold[5] + qnold[3];
      qn[5] = qnold[4];

      for(int k = 0; k < 6; k++) qnold[k] = qn[k];
    }

    mat unit(1.0);

    dest = (qn[0] * unit + qn[1] * alpha * arg + qn[2] * A2 + qn[3] * A3 + qn[4] * A4 + qn[5] * A5);
  }

  // Replaces the compact clover term stored in (Diagonal, Triangle) by
  // diag_mass * exp(clover/diag_mass), site by site and block by block.
  // Each site holds two 6x6 blocks (index 0 and 1); the diagonal of a block lives in
  // Diagonal and its off-diagonal part in Triangle (only i<j is written back).
  static void Exponentiate_Clover(CloverDiagonalField& Diagonal, CloverTriangleField& Triangle, RealD csw_t, RealD diag_mass) {

    GridBase* grid = Diagonal.Grid();
    int NMAX = getNMAX(Diagonal, 3.*csw_t/diag_mass);

    //
    // Implementation completely in Daniel's layout
    //

    // Taylor expansion with Cayley-Hamilton recursion
    // underlying Horner scheme as above
    std::vector<RealD> cn(NMAX+1);   // cn[i] = 1/i!
    cn[0] = 1.0;
    for (int i=1; i<=NMAX; i++){
      cn[i] = cn[i-1] / RealD(i);
    }

    // Taken over from Daniel's implementation
    conformable(Diagonal, Triangle);

    long lsites = grid->lSites();
    {
      typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
      typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
      typedef iMatrix<ComplexD,6> mat;

      autoView(diagonal_v,  Diagonal,  CpuRead);
      autoView(triangle_v,  Triangle,  CpuRead);
      autoView(diagonalExp_v, Diagonal, CpuWrite);
      autoView(triangleExp_v, Triangle, CpuWrite);

      thread_for(site, lsites, { // NOTE: Not on GPU because of (peek/poke)LocalSite

        mat srcCloverOp(0.0);  // dense copy of the block currently being processed
        mat ExpCloverOp;

        scalar_object_diagonal diagonal_tmp     = Zero();
        scalar_object_diagonal diagonal_exp_tmp = Zero();
        scalar_object_triangle triangle_tmp     = Zero();
        scalar_object_triangle triangle_exp_tmp = Zero();

        Coordinate lcoor;
        grid->LocalIndexToLocalCoor(site, lcoor);

        peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
        peekLocalSite(triangle_tmp, triangle_v, lcoor);

        // block 0: upper left, block 1: lower right
        for(int block = 0; block < 2; block++){

          // Expand the compact storage into a full 6x6 matrix; all 36 entries are
          // overwritten, so nothing carries over from the previous block.
          for(int i = 0; i < 6; i++){
            for(int j = 0; j < 6; j++){
              if (i == j){
                srcCloverOp(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
              }
              else{
                srcCloverOp(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
              }
            }
          }

          // exp(Clover)
          ExponentiateHermitean6by6(srcCloverOp,1.0/diag_mass,cn,NMAX,ExpCloverOp);

          // Pack the result back: diagonal entries and the i<j triangle only
          for(int i = 0; i < 6; i++){
            for(int j = 0; j < 6; j++){
              if (i == j){
                diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
              }
              else if(i < j){
                triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
              }
            }
          }
        }

        pokeLocalSite(diagonal_exp_tmp, diagonalExp_v, lcoor);
        pokeLocalSite(triangle_exp_tmp, triangleExp_v, lcoor);
      });
    }

    // Apply the overall mass factor that MassTerm deliberately skipped
    Diagonal *= diag_mass;
    Triangle *= diag_mass;
  }

  // Not implemented for the exponential clover: aborts via assert in debug builds.
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    assert(0);
    // Reached only when assert is compiled out (NDEBUG). A value must still be
    // returned, otherwise flowing off the end of this function is undefined behaviour.
    return lambda;
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h
@ -31,7 +31,6 @@
 #include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
@ -86,7 +85,7 @@ NAMESPACE_BEGIN(Grid);
 //           + (2 * 1 + 4 * 1/2) triangle parts = 4 triangle parts =  60 complex words per site
 //                                                                 =  84 complex words per site
-template<class Impl, class CloverHelpers>
+template<class Impl>
 class CompactWilsonCloverFermion : public WilsonFermion<Impl>,
                                   public WilsonCloverHelpers<Impl>,
                                   public CompactWilsonCloverHelpers<Impl> {
--- a/Grid/qcd/action/fermion/Fermion.h
+++ b/Grid/qcd/action/fermion/Fermion.h
@ -138,52 +138,38 @@ typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
 typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
 // Clover fermions
-template <typename WImpl> using WilsonClover = WilsonCloverFermion<WImpl, CloverHelpers<WImpl>>;
+typedef WilsonCloverFermion<WilsonImplR> WilsonCloverFermionR;
-template <typename WImpl> using WilsonExpClover = WilsonCloverFermion<WImpl, ExpCloverHelpers<WImpl>>;
+typedef WilsonCloverFermion<WilsonImplF> WilsonCloverFermionF;
 typedef WilsonCloverFermion<WilsonImplD> WilsonCloverFermionD;
-typedef WilsonClover<WilsonImplR> WilsonCloverFermionR;
+typedef WilsonCloverFermion<WilsonAdjImplR> WilsonCloverAdjFermionR;
-typedef WilsonClover<WilsonImplF> WilsonCloverFermionF;
+typedef WilsonCloverFermion<WilsonAdjImplF> WilsonCloverAdjFermionF;
-typedef WilsonClover<WilsonImplD> WilsonCloverFermionD;
+typedef WilsonCloverFermion<WilsonAdjImplD> WilsonCloverAdjFermionD;
-typedef WilsonExpClover<WilsonImplR> WilsonExpCloverFermionR;
+typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
-typedef WilsonExpClover<WilsonImplF> WilsonExpCloverFermionF;
+typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
-typedef WilsonExpClover<WilsonImplD> WilsonExpCloverFermionD;
+typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
-typedef WilsonClover<WilsonAdjImplR> WilsonCloverAdjFermionR;
+typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
-typedef WilsonClover<WilsonAdjImplF> WilsonCloverAdjFermionF;
+typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
-typedef WilsonClover<WilsonAdjImplD> WilsonCloverAdjFermionD;
+typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
 typedef WilsonClover<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
 typedef WilsonClover<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
 typedef WilsonClover<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
 typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
 typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
 typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
 // Compact Clover fermions
-template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
+typedef CompactWilsonCloverFermion<WilsonImplR> CompactWilsonCloverFermionR;
-template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
+typedef CompactWilsonCloverFermion<WilsonImplF> CompactWilsonCloverFermionF;
 typedef CompactWilsonCloverFermion<WilsonImplD> CompactWilsonCloverFermionD;
-typedef CompactWilsonClover<WilsonImplR> CompactWilsonCloverFermionR;
+typedef CompactWilsonCloverFermion<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
-typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
+typedef CompactWilsonCloverFermion<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
-typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
+typedef CompactWilsonCloverFermion<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
-typedef CompactWilsonExpClover<WilsonImplR> CompactWilsonExpCloverFermionR;
+typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
-typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
+typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
-typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
+typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
-typedef CompactWilsonClover<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
+typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
-typedef CompactWilsonClover<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
+typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
-typedef CompactWilsonClover<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
+typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
 typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
 typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
 typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
 typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
 typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
 typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
 // Domain Wall fermions
 typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
--- a/Grid/qcd/action/fermion/GparityWilsonImpl.h
+++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h
@ -30,6 +30,18 @@ directory
 NAMESPACE_BEGIN(Grid);
 /*
  Policy implementation for G-parity boundary conditions
  Rather than treating the gauge field as a flavored field, the Grid implementation of G-parity treats the gauge field as a regular
  field with complex conjugate boundary conditions. In order to ensure the second flavor interacts with the conjugate links and the first
  with the regular links we overload the functionality of doubleStore, whose purpose is to store the gauge field and the barrel-shifted gauge field
  to avoid communicating links when applying the Dirac operator, such that the double-stored field contains also a flavor index which maps to
  either the link or the conjugate link. This flavored field is then used by multLink to apply the correct link to a spinor.
  Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
  mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
 */
 template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
 class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
 public:
@ -113,7 +125,7 @@ public:
    || ((distance== 1)&&(icoor[direction]==1))
    || ((distance==-1)&&(icoor[direction]==0));
-    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world
+    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu] && mmu < Nd-1; //only if we are going around the world in a spatial direction
    //Apply the links
    int f_upper = permute_lane ? 1 : 0;
@ -139,10 +151,10 @@ public:
    assert((distance == 1) || (distance == -1));  // nearest neighbour stencil hard code
    assert((sl == 1) || (sl == 2));
-    if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
+    //If this site is an global boundary site, perform the G-parity flavor twist
-
+    if ( mmu < Nd-1 && SE->_around_the_world && St.parameters.twists[mmu] ) {
      if ( sl == 2 ) {
-       
+	//Only do the twist for lanes on the edge of the physical node
 	ExtractBuffer<sobj> vals(Nsimd);
 	extract(chi,vals);
@ -197,6 +209,19 @@ public:
    reg = memory;
  }
  //Write 'poke_f0' into the flavor-0 slot and 'poke_f1' into the flavor-1 slot of entry mu of the doubled gauge field Uds.
  //Runs on the host: all three fields are opened with Cpu views and the copy is threaded over the sites of poke_f0.
  inline void pokeGparityDoubledGaugeField(DoubledGaugeField &Uds, const GaugeLinkField &poke_f0, const GaugeLinkField &poke_f1, const int mu){
    autoView(flav0_v, poke_f0, CpuRead);
    autoView(flav1_v, poke_f1, CpuRead);
    autoView(doubled_v, Uds, CpuWrite);
    thread_foreach(site,flav0_v,{
 	doubled_v[site](0)(mu) = flav0_v[site]();
 	doubled_v[site](1)(mu) = flav1_v[site]();
      });
  }
  inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
  {
    conformable(Uds.Grid(),GaugeGrid);
@ -207,14 +232,19 @@ public:
    GaugeLinkField Uconj(GaugeGrid);
    Lattice<iScalar<vInteger> > coor(GaugeGrid);
-        
+
-    for(int mu=0;mu<Nd;mu++){
+    //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
-          
+    //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs        
-      LatticeCoordinate(coor,mu);
+    for(int mu=0;mu<Nd-1;mu++){
      if( Params.twists[mu] ){
 	LatticeCoordinate(coor,mu);
      }
      U     = PeekIndex<LorentzIndex>(Umu,mu);
      Uconj = conjugate(U);
      // Implement the isospin rotation sign on the boundary between f=1 and f=0
      // This phase could come from a simple bc 1,1,-1,1 ..
      int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
      if ( Params.twists[mu] ) { 
@ -229,7 +259,7 @@ public:
 	thread_foreach(ss,U_v,{
 	    Uds_v[ss](0)(mu) = U_v[ss]();
 	    Uds_v[ss](1)(mu) = Uconj_v[ss]();
-	  });
+	});
      }
      U     = adj(Cshift(U    ,mu,-1));      // correct except for spanning the boundary
@ -260,6 +290,38 @@ public:
        });
      }
    }
    { //periodic / antiperiodic temporal BCs
      int mu = Nd-1;
      int L   = GaugeGrid->GlobalDimensions()[mu];
      int Lmu = L - 1;
      LatticeCoordinate(coor, mu);
      U = PeekIndex<LorentzIndex>(Umu, mu); //Get t-directed links
      GaugeLinkField *Upoke = &U;
      if(Params.twists[mu]){ //antiperiodic
 	Utmp =  where(coor == Lmu, -U, U);
 	Upoke = &Utmp;
      }
      Uconj = conjugate(*Upoke); //second flavor interacts with conjugate links      
      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu);
      //Get the barrel-shifted field
      Utmp = adj(Cshift(U, mu, -1)); //is a forward shift!
      Upoke = &Utmp;
      if(Params.twists[mu]){
 	U = where(coor == 0, -Utmp, Utmp);  //boundary phase
 	Upoke = &U;
      }
      Uconj = conjugate(*Upoke);
      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4);
    }
  }
  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
@ -298,28 +360,48 @@ public:
  inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
    assert(0);
  }
-  
+ 
  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
-
+    int Ls=Btilde.Grid()->_fdimensions[0];
-    int Ls = Btilde.Grid()->_fdimensions[0];
+    
    GaugeLinkField tmp(mat.Grid());
    tmp = Zero();
    {
-      autoView( tmp_v , tmp, CpuWrite);
+      GridBase *GaugeGrid = mat.Grid();
-      autoView( Atilde_v , Atilde, CpuRead);
+      Lattice<iScalar<vInteger> > coor(GaugeGrid);
-      autoView( Btilde_v , Btilde, CpuRead);
+
-      thread_for(ss,tmp.Grid()->oSites(),{
+      if( Params.twists[mu] ){
-	  for (int s = 0; s < Ls; s++) {
+	LatticeCoordinate(coor,mu);
-	    int sF = s + Ls * ss;
+      }
-	    auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
+
-	    tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
+      autoView( mat_v , mat, AcceleratorWrite);
-	  }
+      autoView( Btilde_v , Btilde, AcceleratorRead);
-	});
+      autoView( Atilde_v , Atilde, AcceleratorRead);
      accelerator_for(sss,mat.Grid()->oSites(), FermionField::vector_type::Nsimd(),{	  
  	  int sU=sss;
  	  typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
  	  ColorMatrixType sum;
  	  zeroit(sum);
  	  for(int s=0;s<Ls;s++){
  	    int sF = s+Ls*sU;
  	    for(int spn=0;spn<Ns;spn++){ //sum over spin
 	      //Flavor 0
  	      auto bb = coalescedRead(Btilde_v[sF](0)(spn) ); //color vector
  	      auto aa = coalescedRead(Atilde_v[sF](0)(spn) );
  	      sum = sum + outerProduct(bb,aa);
  	      //Flavor 1
  	      bb = coalescedRead(Btilde_v[sF](1)(spn) );
  	      aa = coalescedRead(Atilde_v[sF](1)(spn) );
  	      sum = sum + conjugate(outerProduct(bb,aa));
  	    }
  	  }	    
  	  coalescedWrite(mat_v[sU](mu)(), sum);
  	});
    }
    PokeIndex<LorentzIndex>(mat, tmp, mu);
    return;
  }
 };
--- a/Grid/qcd/action/fermion/WilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h
@ -32,7 +32,6 @@
 #include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
@ -52,7 +51,7 @@ NAMESPACE_BEGIN(Grid);
 // csw_r = csw_t to recover the isotropic version
 //////////////////////////////////////////////////////////////////
-template<class Impl, class CloverHelpers>
+template <class Impl>
 class WilsonCloverFermion : public WilsonFermion<Impl>,
                            public WilsonCloverHelpers<Impl>
 {
--- a/Grid/qcd/action/fermion/WilsonCloverHelpers.h
+++ b/Grid/qcd/action/fermion/WilsonCloverHelpers.h
@ -209,8 +209,6 @@ public:
 };
 ////////////////////////////////////////////////////////
 template<class Impl> class CompactWilsonCloverHelpers {
 public:
--- a/Grid/qcd/action/fermion/WilsonCloverTypes.h
+++ b/Grid/qcd/action/fermion/WilsonCloverTypes.h
@ -47,6 +47,8 @@ class CompactWilsonCloverTypes {
 public:
  INHERIT_IMPL_TYPES(Impl);
  static_assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3, "Wrong dimensions");
  static constexpr int Nred      = Nc * Nhs;        // 6
  static constexpr int Nblock    = Nhs;             // 2
  static constexpr int Ndiagonal = Nred;            // 6
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@ -47,7 +47,7 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
 			FiveDimRedBlackGrid,
 			FourDimGrid,
 			FourDimRedBlackGrid,_M5,p),
-  mass_plus(_mass), mass_minus(_mass)
+  mass(_mass)
 { 
 }
@ -209,8 +209,8 @@ void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
  Vector<Coeff_t> diag (Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
+  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass;
-  Vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass_plus;
+  Vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass;
  M5D(psi,chi,chi,lower,diag,upper);
 }
 template<class Impl>
@ -220,8 +220,8 @@ void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &D
  Vector<Coeff_t> diag = bs;
  Vector<Coeff_t> upper= cs;
  Vector<Coeff_t> lower= cs; 
-  upper[Ls-1]=-mass_minus*upper[Ls-1];
+  upper[Ls-1]=-mass*upper[Ls-1];
-  lower[0]   =-mass_plus*lower[0];
+  lower[0]   =-mass*lower[0];
  M5D(psi,psi,Din,lower,diag,upper);
 }
 // FIXME Redunant with the above routine; check this and eliminate
@ -235,8 +235,8 @@ template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &
    upper[i]=-ceo[i];
    lower[i]=-ceo[i];
  }
-  upper[Ls-1]=-mass_minus*upper[Ls-1];
+  upper[Ls-1]=-mass*upper[Ls-1];
-  lower[0]   =-mass_plus*lower[0];
+  lower[0]   =-mass*lower[0];
  M5D(psi,psi,chi,lower,diag,upper);
 }
 template<class Impl>
@ -250,8 +250,8 @@ void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &
    upper[i]=-cee[i];
    lower[i]=-cee[i];
  }
-  upper[Ls-1]=-mass_minus*upper[Ls-1];
+  upper[Ls-1]=-mass*upper[Ls-1];
-  lower[0]   =-mass_plus*lower[0];
+  lower[0]   =-mass*lower[0];
  M5D(psi,psi,chi,lower,diag,upper);
 }
 template<class Impl>
@ -266,9 +266,9 @@ void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &
    // Assemble the 5d matrix
    if ( s==0 ) {
      upper[s] = -cee[s+1] ;
-      lower[s] = mass_minus*cee[Ls-1];
+      lower[s] = mass*cee[Ls-1];
    } else if ( s==(Ls-1)) { 
-      upper[s] = mass_plus*cee[0];
+      upper[s] = mass*cee[0];
      lower[s] = -cee[s-1];
    } else {
      upper[s]=-cee[s+1];
@ -291,8 +291,8 @@ void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
  Vector<Coeff_t> diag(Ls,1.0);
  Vector<Coeff_t> upper(Ls,-1.0);
  Vector<Coeff_t> lower(Ls,-1.0);
-  upper[Ls-1]=-mass_plus*upper[Ls-1];
+  upper[Ls-1]=-mass*upper[Ls-1];
-  lower[0]   =-mass_minus*lower[0];
+  lower[0]   =-mass*lower[0];
  M5Ddag(psi,chi,chi,lower,diag,upper);
 }
@ -307,9 +307,9 @@ void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField
  for (int s=0;s<Ls;s++){
    if ( s== 0 ) {
      upper[s] = cs[s+1];
-      lower[s] =-mass_minus*cs[Ls-1];
+      lower[s] =-mass*cs[Ls-1];
    } else if ( s==(Ls-1) ) { 
-      upper[s] =-mass_plus*cs[0];
+      upper[s] =-mass*cs[0];
      lower[s] = cs[s-1];
    } else { 
      upper[s] = cs[s+1];
@ -552,7 +552,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
      lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
-      leem[i]=mass_minus*cee[Ls-1]/bee[0];
+      leem[i]=mass*cee[Ls-1]/bee[0];
      for(int j=0;j<i;j++) {
 	assert(bee[j+1]!=Coeff_t(0.0));
 	leem[i]*= aee[j]/bee[j+1];
@ -560,7 +560,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
      uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
-      ueem[i]=mass_plus;
+      ueem[i]=mass;
      for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
      ueem[i]*= aee[0]/bee[0];
@ -573,7 +573,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
  }
  { 
-    Coeff_t delta_d=mass_minus*cee[Ls-1];
+    Coeff_t delta_d=mass*cee[Ls-1];
    for(int j=0;j<Ls-1;j++) {
      assert(bee[j] != Coeff_t(0.0));
      delta_d *= cee[j]/bee[j];
@ -642,10 +642,6 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
 						      Current curr_type,
 						      unsigned int mu)
 {
  assert(mass_plus == mass_minus);
  RealD mass = mass_plus;
 #if (!defined(GRID_HIP))
  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
@ -781,8 +777,6 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  assert(mu>=0);
  assert(mu<Nd);
  assert(mass_plus == mass_minus);
  RealD mass = mass_plus;
 #if 0
  int tshift = (mu == Nd-1) ? 1 : 0;
--- a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
@ -32,18 +32,17 @@
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
 NAMESPACE_BEGIN(Grid);
-template<class Impl, class CloverHelpers>
+template<class Impl>
-CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(GaugeField& _Umu,
+CompactWilsonCloverFermion<Impl>::CompactWilsonCloverFermion(GaugeField& _Umu,
-                                                                            GridCartesian& Fgrid,
+                                                             GridCartesian& Fgrid,
-                                                                            GridRedBlackCartesian& Hgrid,
+                                                             GridRedBlackCartesian& Hgrid,
-                                                                            const RealD _mass,
+                                                             const RealD _mass,
-                                                                            const RealD _csw_r,
+                                                             const RealD _csw_r,
-                                                                            const RealD _csw_t,
+                                                             const RealD _csw_t,
-                                                                            const RealD _cF,
+                                                             const RealD _cF,
-                                                                            const WilsonAnisotropyCoefficients& clover_anisotropy,
+                                                             const WilsonAnisotropyCoefficients& clover_anisotropy,
-                                                                            const ImplParams& impl_p)
+                                                             const ImplParams& impl_p)
  : WilsonBase(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
  , csw_r(_csw_r)
  , csw_t(_csw_t)
@ -59,55 +58,50 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
  , BoundaryMask(&Fgrid)
  , BoundaryMaskEven(&Hgrid), BoundaryMaskOdd(&Hgrid)
 {
  assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);
  csw_r *= 0.5;
  csw_t *= 0.5;
  if (clover_anisotropy.isAnisotropic)
    csw_r /= clover_anisotropy.xi_0;
  ImportGauge(_Umu);
-  if (open_boundaries) {
+  if (open_boundaries)
    this->BoundaryMaskEven.Checkerboard() = Even;
    this->BoundaryMaskOdd.Checkerboard() = Odd;
    CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
  }
 }
-template<class Impl, class CloverHelpers>
+template<class Impl>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
+void CompactWilsonCloverFermion<Impl>::Dhop(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::Dhop(in, out, dag);
  if(open_boundaries) ApplyBoundaryMask(out);
 }
-template<class Impl, class CloverHelpers>
+template<class Impl>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
+void CompactWilsonCloverFermion<Impl>::DhopOE(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::DhopOE(in, out, dag);
  if(open_boundaries) ApplyBoundaryMask(out);
 }
-template<class Impl, class CloverHelpers>
+template<class Impl>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
+void CompactWilsonCloverFermion<Impl>::DhopEO(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::DhopEO(in, out, dag);
  if(open_boundaries) ApplyBoundaryMask(out);
 }
-template<class Impl, class CloverHelpers>
+template<class Impl>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
+void CompactWilsonCloverFermion<Impl>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
  WilsonBase::DhopDir(in, out, dir, disp);
  if(this->open_boundaries) ApplyBoundaryMask(out);
 }
-template<class Impl, class CloverHelpers>
+template<class Impl>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
+void CompactWilsonCloverFermion<Impl>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
  WilsonBase::DhopDirAll(in, out);
  if(this->open_boundaries) {
    for(auto& o : out) ApplyBoundaryMask(o);
  }
 }
-template<class Impl, class CloverHelpers>
+template<class Impl>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField& in, FermionField& out) {
+void CompactWilsonCloverFermion<Impl>::M(const FermionField& in, FermionField& out) {
  out.Checkerboard() = in.Checkerboard();
  WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
  Mooee(in, Tmp);
@ -115,8 +109,8 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField& in,
  if(open_boundaries) ApplyBoundaryMask(out);
 }
-template<class Impl, class CloverHelpers>
+template<class Impl>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField& in, FermionField& out) {
+void CompactWilsonCloverFermion<Impl>::Mdag(const FermionField& in, FermionField& out) {
  out.Checkerboard() = in.Checkerboard();
  WilsonBase::Dhop(in, out, DaggerYes);  // call base to save applying bc
  MooeeDag(in, Tmp);
@ -124,20 +118,20 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField& i
  if(open_boundaries) ApplyBoundaryMask(out);
 }
-template<class Impl, class CloverHelpers>
+template<class Impl>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
+void CompactWilsonCloverFermion<Impl>::Meooe(const FermionField& in, FermionField& out) {
  WilsonBase::Meooe(in, out);
  if(open_boundaries) ApplyBoundaryMask(out);
 }
-template<class Impl, class CloverHelpers>
+template<class Impl>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
+void CompactWilsonCloverFermion<Impl>::MeooeDag(const FermionField& in, FermionField& out) {
  WilsonBase::MeooeDag(in, out);
  if(open_boundaries) ApplyBoundaryMask(out);
 }
-template<class Impl, class CloverHelpers>
+template<class Impl>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField& in, FermionField& out) {
+void CompactWilsonCloverFermion<Impl>::Mooee(const FermionField& in, FermionField& out) {
  if(in.Grid()->_isCheckerBoarded) {
    if(in.Checkerboard() == Odd) {
      MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
@ -150,13 +144,13 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField&
  if(open_boundaries) ApplyBoundaryMask(out);
 }
-template<class Impl, class CloverHelpers>
+template<class Impl>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeDag(const FermionField& in, FermionField& out) {
+void CompactWilsonCloverFermion<Impl>::MooeeDag(const FermionField& in, FermionField& out) {
  Mooee(in, out); // blocks are hermitian
 }
-template<class Impl, class CloverHelpers>
+template<class Impl>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionField& in, FermionField& out) {
+void CompactWilsonCloverFermion<Impl>::MooeeInv(const FermionField& in, FermionField& out) {
  if(in.Grid()->_isCheckerBoarded) {
    if(in.Checkerboard() == Odd) {
      MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
@ -169,23 +163,23 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionFiel
  if(open_boundaries) ApplyBoundaryMask(out);
 }
-template<class Impl, class CloverHelpers>
+template<class Impl>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInvDag(const FermionField& in, FermionField& out) {
+void CompactWilsonCloverFermion<Impl>::MooeeInvDag(const FermionField& in, FermionField& out) {
  MooeeInv(in, out); // blocks are hermitian
 }
-template<class Impl, class CloverHelpers>
+template<class Impl>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
+void CompactWilsonCloverFermion<Impl>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
  DhopDir(in, out, dir, disp);
 }
-template<class Impl, class CloverHelpers>
+template<class Impl>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
+void CompactWilsonCloverFermion<Impl>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
  DhopDirAll(in, out);
 }
-template<class Impl, class CloverHelpers>
+template<class Impl>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
+void CompactWilsonCloverFermion<Impl>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
  assert(!open_boundaries); // TODO check for changes required for open bc
  // NOTE: code copied from original clover term
@ -257,7 +251,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force,
      }
      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
-      force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu);   // checked
+      force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu);   // checked
      count++;
    }
@ -267,18 +261,18 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force,
  force += clover_force;
 }
-template<class Impl, class CloverHelpers>
+template<class Impl>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
+void CompactWilsonCloverFermion<Impl>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
  assert(0);
 }
-template<class Impl, class CloverHelpers>
+template<class Impl>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
+void CompactWilsonCloverFermion<Impl>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
  assert(0);
 }
-template<class Impl, class CloverHelpers>
+template<class Impl>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField&        in,
+void CompactWilsonCloverFermion<Impl>::MooeeInternal(const FermionField&        in,
                    FermionField&              out,
                    const CloverDiagonalField& diagonal,
                    const CloverTriangleField& triangle) {
@ -291,8 +285,8 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const Fermio
  CompactHelpers::MooeeKernel(diagonal.oSites(), 1, in, out, diagonal, triangle);
 }
-template<class Impl, class CloverHelpers>
+template<class Impl>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField& _Umu) {
+void CompactWilsonCloverFermion<Impl>::ImportGauge(const GaugeField& _Umu) {
  // NOTE: parts copied from original implementation
  // Import gauge into base class
@ -324,27 +318,22 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
  TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
  TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
  TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
-  // Handle mass term based on clover policy
+  TmpOriginal += this->diag_mass;
-  CloverHelpers::MassTerm(TmpOriginal, this->diag_mass);
+
  // Convert the data layout of the clover term
  double t4 = usecond();
  CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
  // Exponentiate the clover (nothing happens in case of the standard clover)
  double t5 = usecond();
  CloverHelpers::Exponentiate_Clover(Diagonal, Triangle, csw_t, this->diag_mass);
  // Possible modify the boundary values
-  double t6 = usecond();
+  double t5 = usecond();
  if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
-  // Invert the Clover term (explicit inversion needed for the improvement in case of open boundary conditions)
+  // Invert the clover term in the improved layout
-  double t7 = usecond();
+  double t6 = usecond();
  CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv);
  // Fill the remaining clover fields
-  double t8 = usecond();
+  double t7 = usecond();
  pickCheckerboard(Even, DiagonalEven,    Diagonal);
  pickCheckerboard(Even, TriangleEven,    Triangle);
  pickCheckerboard(Odd,  DiagonalOdd,     Diagonal);
@ -355,19 +344,20 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
  pickCheckerboard(Odd,  TriangleInvOdd,  TriangleInv);
  // Report timings
-  double t9 = usecond();
+  double t8 = usecond();
-
+#if 0
-  std::cout << GridLogDebug << "CompactWilsonCloverFermion::ImportGauge timings:" << std::endl;
+  std::cout << GridLogMessage << "CompactWilsonCloverFermion::ImportGauge timings:"
-  std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
+            << " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6
-  std::cout << GridLogDebug << "allocations =                " << (t2 - t1) / 1e6 << std::endl;
+            << ", allocations = "               << (t2 - t1) / 1e6
-  std::cout << GridLogDebug << "field strength =             " << (t3 - t2) / 1e6 << std::endl;
+            << ", field strength = "            << (t3 - t2) / 1e6
-  std::cout << GridLogDebug << "fill clover =                " << (t4 - t3) / 1e6 << std::endl;
+            << ", fill clover = "               << (t4 - t3) / 1e6
-  std::cout << GridLogDebug << "convert =                    " << (t5 - t4) / 1e6 << std::endl;
+            << ", convert = "                   << (t5 - t4) / 1e6
-  std::cout << GridLogDebug << "exponentiation =             " << (t6 - t5) / 1e6 << std::endl;
+            << ", boundaries = "                << (t6 - t5) / 1e6
-  std::cout << GridLogDebug << "boundaries =                 " << (t7 - t6) / 1e6 << std::endl;
+            << ", inversions = "                << (t7 - t6) / 1e6
-  std::cout << GridLogDebug << "inversions =                 " << (t8 - t7) / 1e6 << std::endl;
+            << ", pick cbs = "                  << (t8 - t7) / 1e6
-  std::cout << GridLogDebug << "pick cbs =                   " << (t9 - t8) / 1e6 << std::endl;
+            << ", total = "                     << (t8 - t0) / 1e6
-  std::cout << GridLogDebug << "total =                      " << (t9 - t0) / 1e6 << std::endl;
+            << std::endl;
 #endif
 }
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
@ -34,8 +34,8 @@
 NAMESPACE_BEGIN(Grid);
-template<class Impl, class CloverHelpers>
+template<class Impl>
-WilsonCloverFermion<Impl, CloverHelpers>::WilsonCloverFermion(GaugeField&                         _Umu,
+WilsonCloverFermion<Impl>::WilsonCloverFermion(GaugeField&                         _Umu,
                                               GridCartesian&                      Fgrid,
                                               GridRedBlackCartesian&              Hgrid,
                                               const RealD                         _mass,
@ -74,8 +74,8 @@ WilsonCloverFermion<Impl, CloverHelpers>::WilsonCloverFermion(GaugeField&
 }
 // *NOT* EO
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
 {
  FermionField temp(out.Grid());
@ -89,8 +89,8 @@ void WilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField &in, Fermion
  out += temp;
 }
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
 {
  FermionField temp(out.Grid());
@ -104,8 +104,8 @@ void WilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField &in, Ferm
  out += temp;
 }
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField &_Umu)
+void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
 {
  double t0 = usecond();
  WilsonFermion<Impl>::ImportGauge(_Umu);
@ -131,11 +131,47 @@ void WilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField &_Um
  CloverTerm += Helpers::fillCloverXT(Ex) * csw_t;
  CloverTerm += Helpers::fillCloverYT(Ey) * csw_t;
  CloverTerm += Helpers::fillCloverZT(Ez) * csw_t;
-   
+  CloverTerm += diag_mass;
  double t4 = usecond();
-  CloverHelpers::Instantiate(CloverTerm, CloverTermInv, csw_t, this->diag_mass);
+  int lvol = _Umu.Grid()->lSites();
  int DimRep = Impl::Dimension;
  double t5 = usecond();
  {
    autoView(CTv,CloverTerm,CpuRead);
    autoView(CTIv,CloverTermInv,CpuWrite);
    thread_for(site, lvol, {
      Coordinate lcoor;
      grid->LocalIndexToLocalCoor(site, lcoor);
      Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
      Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
      typename SiteClover::scalar_object Qx = Zero(), Qxinv = Zero();
      peekLocalSite(Qx, CTv, lcoor);
      //if (csw!=0){
      for (int j = 0; j < Ns; j++)
 	for (int k = 0; k < Ns; k++)
 	  for (int a = 0; a < DimRep; a++)
 	    for (int b = 0; b < DimRep; b++){
 	      auto zz =  Qx()(j, k)(a, b);
 	      EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
 	    }
      //   if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
      EigenInvCloverOp = EigenCloverOp.inverse();
      //std::cout << EigenInvCloverOp << std::endl;
      for (int j = 0; j < Ns; j++)
 	for (int k = 0; k < Ns; k++)
 	  for (int a = 0; a < DimRep; a++)
 	    for (int b = 0; b < DimRep; b++)
 	      Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
      //    if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
      //  }
      pokeLocalSite(Qxinv, CTIv, lcoor);
    });
  }
  double t6 = usecond();
  // Separate the even and odd parts
  pickCheckerboard(Even, CloverTermEven, CloverTerm);
  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
@ -148,44 +184,48 @@ void WilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField &_Um
  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
-  double t6 = usecond();
+  double t7 = usecond();
-  std::cout << GridLogDebug << "WilsonCloverFermion::ImportGauge timings:" << std::endl;
+#if 0
-  std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
+  std::cout << GridLogMessage << "WilsonCloverFermion::ImportGauge timings:"
-  std::cout << GridLogDebug << "allocations =                " << (t2 - t1) / 1e6 << std::endl;
+            << " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6
-  std::cout << GridLogDebug << "field strength =             " << (t3 - t2) / 1e6 << std::endl;
+            << ", allocations = "               << (t2 - t1) / 1e6
-  std::cout << GridLogDebug << "fill clover =                " << (t4 - t3) / 1e6 << std::endl;
+            << ", field strength = "            << (t3 - t2) / 1e6
-  std::cout << GridLogDebug << "instantiation =              " << (t5 - t4) / 1e6 << std::endl;
+            << ", fill clover = "               << (t4 - t3) / 1e6
-  std::cout << GridLogDebug << "pick cbs =                   " << (t6 - t5) / 1e6 << std::endl;
+            << ", misc = "                      << (t5 - t4) / 1e6
-  std::cout << GridLogDebug << "total =                      " << (t6 - t0) / 1e6 << std::endl;
+            << ", inversions = "                << (t6 - t5) / 1e6
            << ", pick cbs = "                  << (t7 - t6) / 1e6
            << ", total = "                     << (t7 - t0) / 1e6
            << std::endl;
 #endif
 }
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerNo, InverseNo);
 }
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::MooeeDag(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerYes, InverseNo);
 }
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerNo, InverseYes);
 }
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInvDag(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerYes, InverseYes);
 }
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
+void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
 {
  out.Checkerboard() = in.Checkerboard();
  CloverField *Clover;
@ -238,8 +278,8 @@ void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField
 } // MooeeInternal
 // Derivative parts unpreconditioned pseudofermions
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
+void WilsonCloverFermion<Impl>::MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
 {
  conformable(X.Grid(), Y.Grid());
  conformable(X.Grid(), force.Grid());
@ -309,7 +349,7 @@ void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const F
      }
      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
-      force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu);                   // checked
+      force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu);                   // checked
      count++;
    }
@ -320,15 +360,15 @@ void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const F
 }
 // Derivative parts
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
+void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
 {
  assert(0);
 }
 // Derivative parts
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
+void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
 {
  assert(0); // not implemented yet
 }
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@ -4,13 +4,12 @@ Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonFermion.cc
-Copyright (C) 2022
+Copyright (C) 2015
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Fabian Joswig <fabian.joswig@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@ -600,47 +599,11 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
                                                   Current curr_type,
                                                   unsigned int mu)
 {
  if(curr_type != Current::Vector)
  {
    std::cout << GridLogError << "Only the conserved vector current is implemented so far." << std::endl;
    exit(1);
  }
  Gamma g5(Gamma::Algebra::Gamma5);
  conformable(_grid, q_in_1.Grid());
  conformable(_grid, q_in_2.Grid());
  conformable(_grid, q_out.Grid());
-  auto UGrid= this->GaugeGrid();
+  assert(0);
  PropagatorField tmp_shifted(UGrid);
  PropagatorField g5Lg5(UGrid);
  PropagatorField R(UGrid);
  PropagatorField gmuR(UGrid);
    Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT,
  };
  Gamma gmu=Gamma(Gmu[mu]);
  g5Lg5=g5*q_in_1*g5;
  tmp_shifted=Cshift(q_in_2,mu,1);
  Impl::multLinkField(R,this->Umu,tmp_shifted,mu);
  gmuR=gmu*R;
  q_out=adj(g5Lg5)*R;
  q_out-=adj(g5Lg5)*gmuR;
  tmp_shifted=Cshift(q_in_1,mu,1);
  Impl::multLinkField(g5Lg5,this->Umu,tmp_shifted,mu);
  g5Lg5=g5*g5Lg5*g5;
  R=q_in_2;
  gmuR=gmu*R;
  q_out-=adj(g5Lg5)*R;
  q_out-=adj(g5Lg5)*gmuR;
 }
@ -654,51 +617,9 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
                                              unsigned int tmax,
 					      ComplexField &lattice_cmplx)
 {
  if(curr_type != Current::Vector)
  {
    std::cout << GridLogError << "Only the conserved vector current is implemented so far." << std::endl;
    exit(1);
  }
  int tshift = (mu == Nd-1) ? 1 : 0;
  unsigned int LLt    = GridDefaultLatt()[Tp];
  conformable(_grid, q_in.Grid());
  conformable(_grid, q_out.Grid());
-  auto UGrid= this->GaugeGrid();
+  assert(0);
  PropagatorField tmp(UGrid);
  PropagatorField Utmp(UGrid);
  PropagatorField L(UGrid);
  PropagatorField zz (UGrid);
  zz=Zero();
  LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
    Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT,
  };
  Gamma gmu=Gamma(Gmu[mu]);
  tmp = Cshift(q_in,mu,1);
  Impl::multLinkField(Utmp,this->Umu,tmp,mu);
  tmp = ( Utmp*lattice_cmplx - gmu*Utmp*lattice_cmplx ); // Forward hop
  tmp = where((lcoor>=tmin),tmp,zz); // Mask the time
  q_out = where((lcoor<=tmax),tmp,zz); // Position of current complicated
  tmp = q_in *lattice_cmplx;
  tmp = Cshift(tmp,mu,-1);
  Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
  tmp = -( Utmp + gmu*Utmp );
  // Mask the time
  if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
    unsigned int t0 = 0;
    tmp = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
  } else {
    tmp = where((lcoor>=tmin+tshift),tmp,zz);
  }
  q_out+= where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
 }
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master
+++ b/Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master
@ -9,7 +9,6 @@
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    Author: Mattia Bruno <mattia.bruno@cern.ch>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@ -33,12 +32,10 @@
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
 #include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
-template class CompactWilsonCloverFermion<IMPLEMENTATION, CompactCloverHelpers<IMPLEMENTATION>>; 
+template class CompactWilsonCloverFermion<IMPLEMENTATION>; 
 template class CompactWilsonCloverFermion<IMPLEMENTATION, CompactExpCloverHelpers<IMPLEMENTATION>>; 
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonCloverFermionInstantiation.cc.master
+++ b/Grid/qcd/action/fermion/instantiation/WilsonCloverFermionInstantiation.cc.master
@ -8,8 +8,7 @@
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
-    Author: Mattia Bruno <mattia.bruno@cern.ch>
+
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
@ -32,12 +31,10 @@
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/WilsonCloverFermion.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
-template class WilsonCloverFermion<IMPLEMENTATION, CloverHelpers<IMPLEMENTATION>>; 
+template class WilsonCloverFermion<IMPLEMENTATION>; 
 template class WilsonCloverFermion<IMPLEMENTATION, ExpCloverHelpers<IMPLEMENTATION>>; 
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh
+++ b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh
@ -18,10 +18,6 @@ WILSON_IMPL_LIST=" \
 	   GparityWilsonImplF \
 	   GparityWilsonImplD "
 COMPACT_WILSON_IMPL_LIST=" \
 	   WilsonImplF \
 	   WilsonImplD "
 DWF_IMPL_LIST=" \
 	   WilsonImplF \
 	   WilsonImplD \
@ -44,23 +40,13 @@ EOF
 done
-CC_LIST="WilsonCloverFermionInstantiation WilsonFermionInstantiation WilsonKernelsInstantiation WilsonTMFermionInstantiation"
+CC_LIST="WilsonCloverFermionInstantiation CompactWilsonCloverFermionInstantiation WilsonFermionInstantiation WilsonKernelsInstantiation WilsonTMFermionInstantiation"
 for impl in $WILSON_IMPL_LIST
 do
 for f in $CC_LIST
 do
-  ln -f -s ../$f.cc.master $impl/$f$impl.cc
+  ln -f -s ../$f.cc.master $impl/$f$impl.cc 
 done
 done
 CC_LIST="CompactWilsonCloverFermionInstantiation"
 for impl in $COMPACT_WILSON_IMPL_LIST
 do
 for f in $CC_LIST
 do
  ln -f -s ../$f.cc.master $impl/$f$impl.cc
 done
 done
@ -77,14 +63,14 @@ for impl in $DWF_IMPL_LIST $GDWF_IMPL_LIST
 do
 for f in $CC_LIST
 do
-  ln -f -s ../$f.cc.master $impl/$f$impl.cc
+  ln -f -s ../$f.cc.master $impl/$f$impl.cc 
 done
 done
 # overwrite the .cc file in Gparity directories
 for impl in $GDWF_IMPL_LIST
 do
-  ln -f -s ../WilsonKernelsInstantiationGparity.cc.master $impl/WilsonKernelsInstantiation$impl.cc
+  ln -f -s ../WilsonKernelsInstantiationGparity.cc.master $impl/WilsonKernelsInstantiation$impl.cc 
 done
@ -98,7 +84,7 @@ for impl in $STAG_IMPL_LIST
 do
 for f in $CC_LIST
 do
-  ln -f -s ../$f.cc.master $impl/$f$impl.cc
+  ln -f -s ../$f.cc.master $impl/$f$impl.cc 
 done
 done
--- a/Grid/qcd/action/gauge/GaugeImplementations.h
+++ b/Grid/qcd/action/gauge/GaugeImplementations.h
@ -69,6 +69,11 @@ public:
    return PeriodicBC::ShiftStaple(Link,mu);
  }
  //Same as Cshift for periodic BCs
  static inline GaugeLinkField CshiftLink(const GaugeLinkField &Link, int mu, int shift)
  {
    // Delegates to the PeriodicBC helper with the arguments passed through unchanged
    return PeriodicBC::CshiftLink(Link, mu, shift);
  }
  static inline bool isPeriodicGaugeField(void) { return true; }
 };
@ -110,6 +115,11 @@ public:
      return PeriodicBC::CovShiftBackward(Link, mu, field);
  }
  //If mu is a conjugate BC direction
  //Out(x) = U^dag_\mu(x-mu)  | x_\mu != 0
  //       = U^T_\mu(L-1)  | x_\mu == 0
  //else
  //Out(x) = U^dag_\mu(x-mu mod L)
  static inline GaugeLinkField
  CovShiftIdentityBackward(const GaugeLinkField &Link, int mu)
  {
@ -129,6 +139,13 @@ public:
      return PeriodicBC::CovShiftIdentityForward(Link,mu);
  }
  //If mu is a conjugate BC direction
  //Out(x) = S_\mu(x+mu)  | x_\mu != L-1
  //       = S*_\mu(x+mu)  | x_\mu == L-1
  //else
  //Out(x) = S_\mu(x+mu mod L)
  //Note: While this is used for Staples it is also applicable for shifting gauge links or gauge transformation matrices
  static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu)
  {
    assert(_conjDirs.size() == Nd);
@ -138,6 +155,27 @@ public:
      return PeriodicBC::ShiftStaple(Link,mu);
  }
  //Boundary-aware C-shift of gauge links / gauge transformation matrices
  //For conjugate BC direction
  //shift = 1
  //Out(x) = U_\mu(x+\hat\mu)  | x_\mu != L-1
  //       = U*_\mu(0)  | x_\mu == L-1
  //shift = -1
  //Out(x) = U_\mu(x-\hat\mu)  | x_\mu != 0
  //       = U*_\mu(L-1)  | x_\mu == 0
  //else
  //shift = 1
  //Out(x) = U_\mu(x+\hat\mu mod L)
  //shift = -1
  //Out(x) = U_\mu(x-\hat\mu mod L)
  static inline GaugeLinkField CshiftLink(const GaugeLinkField &Link, int mu, int shift)
  {
    // Exactly one flag per lattice direction is expected in _conjDirs
    assert(_conjDirs.size() == Nd);

    // Directions whose flag is zero take the PeriodicBC path
    if(!_conjDirs[mu])
      return PeriodicBC::CshiftLink(Link,mu,shift);

    // Flagged directions take the ConjugateBC path
    return ConjugateBC::CshiftLink(Link,mu,shift);
  }
  static inline void       setDirections(std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
  static inline std::vector<int> getDirections(void) { return _conjDirs; }
  static inline bool isPeriodicGaugeField(void) { return false; }
--- a/Grid/qcd/action/pseudofermion/Bounds.h
+++ b/Grid/qcd/action/pseudofermion/Bounds.h
@ -40,13 +40,66 @@ NAMESPACE_BEGIN(Grid);
      X=X-Y;
      RealD Nd = norm2(X);
      std::cout << "************************* "<<std::endl;
-      std::cout << " noise                         = "<<Nx<<std::endl;
+      std::cout << " | noise |^2                         = "<<Nx<<std::endl;
-      std::cout << " (MdagM^-1/2)^2  noise         = "<<Nz<<std::endl;
+      std::cout << " | (MdagM^-1/2)^2  noise |^2         = "<<Nz<<std::endl;
-      std::cout << " MdagM (MdagM^-1/2)^2  noise   = "<<Ny<<std::endl;
+      std::cout << " | MdagM (MdagM^-1/2)^2  noise |^2   = "<<Ny<<std::endl;
-      std::cout << " noise - MdagM (MdagM^-1/2)^2  noise   = "<<Nd<<std::endl;
+      std::cout << " | noise - MdagM (MdagM^-1/2)^2  noise |^2  = "<<Nd<<std::endl;
      std::cout << " | noise - MdagM (MdagM^-1/2)^2  noise|/|noise| = " << std::sqrt(Nd/Nx) << std::endl;
      std::cout << "************************* "<<std::endl;
      assert( (std::sqrt(Nd/Nx)<tol) && " InverseSqrtBoundsCheck ");
    }
    /* Sanity check of a rational approximation to HermOp^{-1/inv_pow}, where HermOp = M^dag M.

       The approximation is applied inv_pow times to the noise vector X (a copy of GaussNoise)
       through ConjugateGradientMultiShift, which should yield HermOp^{-1} X.  HermOp is then
       applied to that result and the relative residual

           | X - HermOp * [ HermOp^{-1/inv_pow} ]^{inv_pow} X | / | X |

       is asserted to be below tol.

       inv_pow      : number of times the approximation is applied; must be >= 1
       MaxIter      : passed as the first constructor argument of ConjugateGradientMultiShift
       tol          : strict upper bound on the relative residual
       HermOp       : operator used both in the solves and in the final multiplication
       GaussNoise   : test vector; it is copied into a local field and not written to here
       ApproxNegPow : should be the rational approximation for   x^{-1/inv_pow}

       The squared norms of the intermediate vectors and the relative residual are printed to std::cout.
    */
    template<class Field> void InversePowerBoundsCheck(int inv_pow,
                                                       int MaxIter,double tol,
                                                       LinearOperatorBase<Field> &HermOp,
                                                       Field &GaussNoise,
                                                       MultiShiftFunction &ApproxNegPow) 
    {
      // With inv_pow < 1 the solver loop below never executes and *out (tmp2) would be read
      // without ever having been assigned, so reject that case up front.
      assert( (inv_pow >= 1) && " InversePowerBoundsCheck requires inv_pow >= 1 ");

      GridBase *FermionGrid = GaussNoise.Grid();
      Field X(FermionGrid);
      Field Y(FermionGrid);
      Field Z(FermionGrid);
      Field tmp1(FermionGrid), tmp2(FermionGrid);
      X=GaussNoise;
      RealD Nx = norm2(X);
      ConjugateGradientMultiShift<Field> msCG(MaxIter,ApproxNegPow);
      tmp1 = X;
      // Ping-pong between the two temporaries: the output of one solve is the input of the next
      Field* in = &tmp1;
      Field* out = &tmp2;
      for(int i=0;i<inv_pow;i++){ //apply  [ HermOp^{-1/inv_pow}  ]^{inv_pow} X =   HermOp^{-1} X
        msCG(HermOp, *in, *out); //backwards conventions!
        // No swap after the last solve, so *out still refers to the final result
        if(i!=inv_pow-1) std::swap(in, out);
      }
      Z = *out;
      RealD Nz = norm2(Z);
      HermOp.HermOp(Z,Y);
      RealD Ny = norm2(Y);
      // X now holds the residual   noise - HermOp * Z
      X=X-Y;
      RealD Nd = norm2(X);
      std::cout << "************************* "<<std::endl;
      std::cout << " | noise |^2                         = "<<Nx<<std::endl;
      std::cout << " | (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2        = "<<Nz<<std::endl;
      std::cout << " | MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2   = "<<Ny<<std::endl;
      std::cout << " | noise - MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2  = "<<Nd<<std::endl;
      std::cout << " | noise - MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |/| noise |  = "<<std::sqrt(Nd/Nx)<<std::endl;
      std::cout << "************************* "<<std::endl;
      assert( (std::sqrt(Nd/Nx)<tol) && " InversePowerBoundsCheck ");
    }
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h
+++ b/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h
@ -44,6 +44,10 @@ NAMESPACE_BEGIN(Grid);
  // Exact one flavour implementation of DWF determinant ratio //
  ///////////////////////////////////////////////////////////////
  //Note: using mixed prec CG for the heatbath solver in this action class will not work
  //      because the L, R operators must have their shift coefficients updated throughout the heatbath step
  //      You will find that the heatbath solver simply won't converge.
  //      To use mixed precision here use the ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction variant below
  template<class Impl>
  class ExactOneFlavourRatioPseudoFermionAction : public Action<typename Impl::GaugeField>
  {
@ -57,37 +61,60 @@ NAMESPACE_BEGIN(Grid);
      bool use_heatbath_forecasting;
      AbstractEOFAFermion<Impl>& Lop; // the basic LH operator
      AbstractEOFAFermion<Impl>& Rop; // the basic RH operator
-      SchurRedBlackDiagMooeeSolve<FermionField> SolverHB;
+      SchurRedBlackDiagMooeeSolve<FermionField> SolverHBL;
      SchurRedBlackDiagMooeeSolve<FermionField> SolverHBR;
      SchurRedBlackDiagMooeeSolve<FermionField> SolverL;
      SchurRedBlackDiagMooeeSolve<FermionField> SolverR;
      SchurRedBlackDiagMooeeSolve<FermionField> DerivativeSolverL;
      SchurRedBlackDiagMooeeSolve<FermionField> DerivativeSolverR;
      FermionField Phi; // the pseudofermion field for this trajectory
      RealD norm2_eta; //|eta|^2 where eta is the random gaussian field used to generate the pseudofermion field
      bool initial_action; //true for the first call to S after refresh, for which the identity S = |eta|^2 holds provided the rational approx is good
    public:
      //Heatbath hook: set the shift coefficients of one of the two operators to 'to'.
      //LorR == 0 selects the L operator, any other value selects the R operator.
      //Virtual so that derived actions can extend which operators get refreshed
      virtual void heatbathRefreshShiftCoefficients(int LorR, RealD to){
        if(LorR == 0){
          Lop.RefreshShiftCoefficients(to);
        }else{
          Rop.RefreshShiftCoefficients(to);
        }
      }
      //Use the same solver for L,R in all cases
      ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, 
 					      AbstractEOFAFermion<Impl>& _Rop,
 					      OperatorFunction<FermionField>& CG, 
 					      Params& p, 
 					      bool use_fc=false) 
-	: ExactOneFlavourRatioPseudoFermionAction(_Lop,_Rop,CG,CG,CG,CG,CG,p,use_fc) {};
+	: ExactOneFlavourRatioPseudoFermionAction(_Lop,_Rop,CG,CG,CG,CG,CG,CG,p,use_fc) {};
-	
+
      //Use the same solver for L,R in the heatbath but different solvers elsewhere
      ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, 
 					      AbstractEOFAFermion<Impl>& _Rop,
-					      OperatorFunction<FermionField>& HeatbathCG, 
+					      OperatorFunction<FermionField>& HeatbathCG,
 					      OperatorFunction<FermionField>& ActionCGL, OperatorFunction<FermionField>& ActionCGR, 
 					      OperatorFunction<FermionField>& DerivCGL , OperatorFunction<FermionField>& DerivCGR, 
 					      Params& p, 
 					      bool use_fc=false)
 	: ExactOneFlavourRatioPseudoFermionAction(_Lop,_Rop,HeatbathCG,HeatbathCG, ActionCGL, ActionCGR, DerivCGL,DerivCGR,p,use_fc) {};
      //Use different solvers for L,R in all cases
      ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, 
 					      AbstractEOFAFermion<Impl>& _Rop,
 					      OperatorFunction<FermionField>& HeatbathCGL, OperatorFunction<FermionField>& HeatbathCGR,
 					      OperatorFunction<FermionField>& ActionCGL, OperatorFunction<FermionField>& ActionCGR, 
 					      OperatorFunction<FermionField>& DerivCGL , OperatorFunction<FermionField>& DerivCGR, 
 					      Params& p, 
 					      bool use_fc=false) : 
        Lop(_Lop), 
 	Rop(_Rop), 
-	SolverHB(HeatbathCG,false,true),
+	SolverHBL(HeatbathCGL,false,true), SolverHBR(HeatbathCGR,false,true),
 	SolverL(ActionCGL, false, true), SolverR(ActionCGR, false, true), 
 	DerivativeSolverL(DerivCGL, false, true), DerivativeSolverR(DerivCGR, false, true), 
 	Phi(_Lop.FermionGrid()), 
 	param(p), 
-        use_heatbath_forecasting(use_fc)
+	use_heatbath_forecasting(use_fc),
 	initial_action(false)
      {
        AlgRemez remez(param.lo, param.hi, param.precision);
@ -97,6 +124,8 @@ NAMESPACE_BEGIN(Grid);
        PowerNegHalf.Init(remez, param.tolerance, true);
      };
      //Read-only access to the pseudofermion field Phi held by this action
      const FermionField &getPhi() const{ return Phi; }
      //Name of this action; used as the prefix of the log messages this class emits
      virtual std::string action_name() { return "ExactOneFlavourRatioPseudoFermionAction"; }
      virtual std::string LogParameters() {
@ -117,6 +146,19 @@ NAMESPACE_BEGIN(Grid);
        else{ for(int s=0; s<Ls; ++s){ axpby_ssp_pminus(out, 0.0, in, 1.0, in, s, s); } }
      }
      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
        // P(eta_o) = e^{- eta_o^dag eta_o}
        //
        // e^{x^2/2 sig^2} => sig^2 = 0.5.
        // 
        RealD scale = std::sqrt(0.5);
        FermionField eta    (Lop.FermionGrid());
        gaussian(pRNG,eta); eta = eta * scale;
 	refresh(U,eta);
      }
      // EOFA heatbath: see Eqn. (29) of arXiv:1706.05843
      // We generate a Gaussian noise vector \eta, and then compute
      //  \Phi = M_{\rm EOFA}^{-1/2} * \eta
@ -124,12 +166,10 @@ NAMESPACE_BEGIN(Grid);
      //
      // As a check of rational require \Phi^dag M_{EOFA} \Phi == eta^dag M^-1/2^dag M M^-1/2 eta = eta^dag eta
      //
-      virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
+     void refresh(const GaugeField &U, const FermionField &eta) {
      {
        Lop.ImportGauge(U);
        Rop.ImportGauge(U);
        FermionField eta         (Lop.FermionGrid());
        FermionField CG_src      (Lop.FermionGrid());
        FermionField CG_soln     (Lop.FermionGrid());
        FermionField Forecast_src(Lop.FermionGrid());
@ -140,11 +180,6 @@ NAMESPACE_BEGIN(Grid);
        if(use_heatbath_forecasting){ prev_solns.reserve(param.degree); }
        ChronoForecast<AbstractEOFAFermion<Impl>, FermionField> Forecast;
        // Seed with Gaussian noise vector (var = 0.5)
        RealD scale = std::sqrt(0.5);
        gaussian(pRNG,eta);
        eta = eta * scale;
        // \Phi = ( \alpha_{0} + \sum_{k=1}^{N_{p}} \alpha_{l} * \gamma_{l} ) * \eta
        RealD N(PowerNegHalf.norm);
        for(int k=0; k<param.degree; ++k){ N += PowerNegHalf.residues[k] / ( 1.0 + PowerNegHalf.poles[k] ); }
@ -160,15 +195,16 @@ NAMESPACE_BEGIN(Grid);
        tmp[1] = Zero();
        for(int k=0; k<param.degree; ++k){
          gamma_l = 1.0 / ( 1.0 + PowerNegHalf.poles[k] );
-          Lop.RefreshShiftCoefficients(-gamma_l);
+          heatbathRefreshShiftCoefficients(0, -gamma_l);
 	  //Lop.RefreshShiftCoefficients(-gamma_l);
          if(use_heatbath_forecasting){ // Forecast CG guess using solutions from previous poles
            Lop.Mdag(CG_src, Forecast_src);
            CG_soln = Forecast(Lop, Forecast_src, prev_solns);
-            SolverHB(Lop, CG_src, CG_soln);
+            SolverHBL(Lop, CG_src, CG_soln);
            prev_solns.push_back(CG_soln);
          } else {
            CG_soln = Zero(); // Just use zero as the initial guess
-            SolverHB(Lop, CG_src, CG_soln);
+	    SolverHBL(Lop, CG_src, CG_soln);
          }
          Lop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
          tmp[1] = tmp[1] + ( PowerNegHalf.residues[k]*gamma_l*gamma_l*Lop.k ) * tmp[0];
@ -187,15 +223,16 @@ NAMESPACE_BEGIN(Grid);
        if(use_heatbath_forecasting){ prev_solns.clear(); } // empirically, LH solns don't help for RH solves
        for(int k=0; k<param.degree; ++k){
          gamma_l = 1.0 / ( 1.0 + PowerNegHalf.poles[k] );
-          Rop.RefreshShiftCoefficients(-gamma_l*PowerNegHalf.poles[k]);
+	  heatbathRefreshShiftCoefficients(1, -gamma_l*PowerNegHalf.poles[k]);
          //Rop.RefreshShiftCoefficients(-gamma_l*PowerNegHalf.poles[k]);
          if(use_heatbath_forecasting){
            Rop.Mdag(CG_src, Forecast_src);
            CG_soln = Forecast(Rop, Forecast_src, prev_solns);
-            SolverHB(Rop, CG_src, CG_soln);
+            SolverHBR(Rop, CG_src, CG_soln);
            prev_solns.push_back(CG_soln);
          } else {
            CG_soln = Zero();
-            SolverHB(Rop, CG_src, CG_soln);
+            SolverHBR(Rop, CG_src, CG_soln);
          }
          Rop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
          tmp[1] = tmp[1] - ( PowerNegHalf.residues[k]*gamma_l*gamma_l*Rop.k ) * tmp[0];
@ -205,49 +242,119 @@ NAMESPACE_BEGIN(Grid);
        Phi = Phi + tmp[1];
        // Reset shift coefficients for energy and force evals
-        Lop.RefreshShiftCoefficients(0.0);
+        //Lop.RefreshShiftCoefficients(0.0);
-        Rop.RefreshShiftCoefficients(-1.0);
+        //Rop.RefreshShiftCoefficients(-1.0);
 	heatbathRefreshShiftCoefficients(0, 0.0);
 	heatbathRefreshShiftCoefficients(1, -1.0);
 	//Mark that the next call to S is the first after refresh
 	initial_action = true;
 	// Bounds check
 	RealD EtaDagEta = norm2(eta);
 	norm2_eta = EtaDagEta;
 	//	RealD PhiDagMPhi= norm2(eta);
      };
-      void Meofa(const GaugeField& U,const FermionField &phi, FermionField & Mphi) 
+      void Meofa(const GaugeField& U,const FermionField &in, FermionField & out) 
      {
 #if 0
        Lop.ImportGauge(U);
        Rop.ImportGauge(U);
-        FermionField spProj_Phi(Lop.FermionGrid());
+        FermionField spProj_in(Lop.FermionGrid());
 	FermionField mPhi(Lop.FermionGrid());
        std::vector<FermionField> tmp(2, Lop.FermionGrid());
-	mPhi = phi;
+	out = in;
        // LH term: S = S - k <\Phi| P_{-} \Omega_{-}^{\dagger} H(mf)^{-1} \Omega_{-} P_{-} |\Phi>
-        spProj(Phi, spProj_Phi, -1, Lop.Ls);
+        spProj(in, spProj_in, -1, Lop.Ls);
-        Lop.Omega(spProj_Phi, tmp[0], -1, 0);
+        Lop.Omega(spProj_in, tmp[0], -1, 0);
        G5R5(tmp[1], tmp[0]);
        tmp[0] = Zero();
        SolverL(Lop, tmp[1], tmp[0]);
        Lop.Dtilde(tmp[0], tmp[1]); // We actually solved Cayley preconditioned system: transform back
        Lop.Omega(tmp[1], tmp[0], -1, 1);
-	mPhi = mPhi -  Lop.k * innerProduct(spProj_Phi, tmp[0]).real();
+	spProj(tmp[0], tmp[1], -1, Lop.Ls);
 	out = out -  Lop.k * tmp[1];
        // RH term: S = S + k <\Phi| P_{+} \Omega_{+}^{\dagger} ( H(mb)
-        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{-} P_{-} |\Phi>
+        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} |\Phi>
-        spProj(Phi, spProj_Phi, 1, Rop.Ls);
+        spProj(in, spProj_in, 1, Rop.Ls);
-        Rop.Omega(spProj_Phi, tmp[0], 1, 0);
+        Rop.Omega(spProj_in, tmp[0], 1, 0);
        G5R5(tmp[1], tmp[0]);
        tmp[0] = Zero();
        SolverR(Rop, tmp[1], tmp[0]);
        Rop.Dtilde(tmp[0], tmp[1]);
        Rop.Omega(tmp[1], tmp[0], 1, 1);
-        action += Rop.k * innerProduct(spProj_Phi, tmp[0]).real();
+	spProj(tmp[0], tmp[1], 1, Rop.Ls);
-#endif
+
        out = out + Rop.k * tmp[1];
      }
      //Due to the structure of EOFA, it is no more expensive to compute the inverse of Meofa
      //To ensure correctness we can simply reuse the heatbath code but use the rational approx
      //f(x) = 1/x   which corresponds to alpha_0=0,  alpha_1=1,  beta_1=0 => gamma_1=1
      void MeofaInv(const GaugeField &U, const FermionField &in, FermionField &out) {
        Lop.ImportGauge(U);
        Rop.ImportGauge(U);
        FermionField CG_src      (Lop.FermionGrid());
        FermionField CG_soln     (Lop.FermionGrid());
        std::vector<FermionField> tmp(2, Lop.FermionGrid());
        // \Phi = ( \alpha_{0} + \sum_{k=1}^{N_{p}} \alpha_{l} * \gamma_{l} ) * \eta
 	// = 1 * \eta
        out = in;
        // LH terms:
        // \Phi = \Phi + k \sum_{k=1}^{N_{p}} P_{-} \Omega_{-}^{\dagger} ( H(mf)
        //          - \gamma_{l} \Delta_{-}(mf,mb) P_{-} )^{-1} \Omega_{-} P_{-} \eta
        spProj(in, tmp[0], -1, Lop.Ls);
        Lop.Omega(tmp[0], tmp[1], -1, 0);
        G5R5(CG_src, tmp[1]);
        {
          heatbathRefreshShiftCoefficients(0, -1.); //-gamma_1 = -1.
 	  CG_soln = Zero(); // Just use zero as the initial guess
 	  SolverHBL(Lop, CG_src, CG_soln);
          Lop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
          tmp[1] = Lop.k * tmp[0];
        }
        Lop.Omega(tmp[1], tmp[0], -1, 1);
        spProj(tmp[0], tmp[1], -1, Lop.Ls);
        out = out + tmp[1];
        // RH terms:
        // \Phi = \Phi - k \sum_{k=1}^{N_{p}} P_{+} \Omega_{+}^{\dagger} ( H(mb)
        //          - \beta_l\gamma_{l} \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} \eta
        spProj(in, tmp[0], 1, Rop.Ls);
        Rop.Omega(tmp[0], tmp[1], 1, 0);
        G5R5(CG_src, tmp[1]);
        {
 	  heatbathRefreshShiftCoefficients(1, 0.); //-gamma_1 * beta_1 = 0
 	  CG_soln = Zero();
 	  SolverHBR(Rop, CG_src, CG_soln);
          Rop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
          tmp[1] = - Rop.k * tmp[0];
        }
        Rop.Omega(tmp[1], tmp[0], 1, 1);
        spProj(tmp[0], tmp[1], 1, Rop.Ls);
        out = out + tmp[1];
        // Reset shift coefficients for energy and force evals
 	heatbathRefreshShiftCoefficients(0, 0.0);
 	heatbathRefreshShiftCoefficients(1, -1.0);
      };
      // EOFA action: see Eqn. (10) of arXiv:1706.05843
      virtual RealD S(const GaugeField& U)
      {
@ -271,7 +378,7 @@ NAMESPACE_BEGIN(Grid);
        action -= Lop.k * innerProduct(spProj_Phi, tmp[0]).real();
        // RH term: S = S + k <\Phi| P_{+} \Omega_{+}^{\dagger} ( H(mb)
-        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{-} P_{-} |\Phi>
+        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} |\Phi>
        spProj(Phi, spProj_Phi, 1, Rop.Ls);
        Rop.Omega(spProj_Phi, tmp[0], 1, 0);
        G5R5(tmp[1], tmp[0]);
@ -281,6 +388,26 @@ NAMESPACE_BEGIN(Grid);
        Rop.Omega(tmp[1], tmp[0], 1, 1);
        action += Rop.k * innerProduct(spProj_Phi, tmp[0]).real();
 	if(initial_action){
 	  //For the first call to S after refresh,  S = |eta|^2. We can use this to ensure the rational approx is good
 	  RealD diff = action - norm2_eta;
 	  //S_init = eta^dag M^{-1/2} M M^{-1/2} eta
 	  //S_init - eta^dag eta =  eta^dag ( M^{-1/2} M M^{-1/2} - 1 ) eta
 	  //If approximate solution
 	  //S_init - eta^dag eta =  eta^dag ( [M^{-1/2}+\delta M^{-1/2}] M [M^{-1/2}+\delta M^{-1/2}] - 1 ) eta
 	  //               \approx  eta^dag ( \delta M^{-1/2} M^{1/2} + M^{1/2}\delta M^{-1/2} ) eta
 	  // We divide out |eta|^2 to remove source scaling but the tolerance on this check should still be somewhat higher than the actual approx tolerance
 	  RealD test = fabs(diff)/norm2_eta; //test the quality of the rational approx
 	  std::cout << GridLogMessage << action_name() << " initial action " << action << " expect " << norm2_eta << "; diff " << diff << std::endl;
 	  std::cout << GridLogMessage << action_name() << "[ eta^dag ( M^{-1/2} M M^{-1/2} - 1 ) eta ]/|eta^2| = " << test << "  expect 0 (tol " << param.BoundsCheckTol << ")" << std::endl;
 	  assert( ( test < param.BoundsCheckTol ) && " Initial action check failed" );
 	  initial_action = false;
 	}
        return action;
      };
@ -329,6 +456,40 @@ NAMESPACE_BEGIN(Grid);
      };
  };
  //Variant of ExactOneFlavourRatioPseudoFermionAction that also holds a second pair of L/R operators
  //of implementation type ImplF (presumably the single-precision copies used by a mixed-precision
  //heatbath solver - confirm against the caller).
  //Whenever the heatbath changes the shift coefficients, both the ImplF operators and the ImplD
  //operators of the base class are updated so that the two pairs stay in step.
  template<class ImplD, class ImplF>
  class ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction : public ExactOneFlavourRatioPseudoFermionAction<ImplD>{
  public:
    INHERIT_IMPL_TYPES(ImplD);
    typedef OneFlavourRationalParams Params;

  private:
    AbstractEOFAFermion<ImplF>& LopF; // the basic LH operator (ImplF copy)
    AbstractEOFAFermion<ImplF>& RopF; // the basic RH operator (ImplF copy)

  public:

    std::string action_name() override { return "ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction"; }

    //Used in the heatbath, refresh the shift coefficients of the L (LorR=0) or R (LorR=1) operator.
    //The ImplF operator is refreshed first, then the base class refreshes its ImplD operator.
    //Marked override: if the signature ever drifts from the base class hook, the ImplF operators
    //would silently stop being refreshed, so make that a compile error instead
    void heatbathRefreshShiftCoefficients(int LorR, RealD to) override {
      AbstractEOFAFermion<ImplF> &op = LorR == 0 ? LopF : RopF;
      op.RefreshShiftCoefficients(to);
      this->ExactOneFlavourRatioPseudoFermionAction<ImplD>::heatbathRefreshShiftCoefficients(LorR,to);
    }

    //_LopF/_RopF : ImplF L and R operators, stored by reference
    //all remaining arguments are forwarded unchanged to the base-class constructor
    ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction(AbstractEOFAFermion<ImplF>& _LopF, 
                                                             AbstractEOFAFermion<ImplF>& _RopF,
                                                             AbstractEOFAFermion<ImplD>& _LopD, 
                                                             AbstractEOFAFermion<ImplD>& _RopD,
                                                             OperatorFunction<FermionField>& HeatbathCGL, OperatorFunction<FermionField>& HeatbathCGR,
                                                             OperatorFunction<FermionField>& ActionCGL, OperatorFunction<FermionField>& ActionCGR, 
                                                             OperatorFunction<FermionField>& DerivCGL , OperatorFunction<FermionField>& DerivCGR, 
                                                             Params& p, 
                                                             bool use_fc=false) : 
      //The base class is always constructed before the members; list it first so the text matches the actual order
      ExactOneFlavourRatioPseudoFermionAction<ImplD>(_LopD, _RopD, HeatbathCGL, HeatbathCGR, ActionCGL, ActionCGR, DerivCGL, DerivCGR, p, use_fc),
      LopF(_LopF), 
      RopF(_RopF)
    {}
  };
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
@ -0,0 +1,372 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
    Copyright (C) 2015
    Author: Christopher Kelly <ckelly@bnl.gov>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_H
 #define QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_H
 NAMESPACE_BEGIN(Grid);
    /////////////////////////////////////////////////////////
    // Generic rational approximation for ratios of operators
    /////////////////////////////////////////////////////////
    /* S_f = -log( det(  [M^dag M]/[V^dag V] )^{1/inv_pow}  )
           = chi^dag ( [M^dag M]/[V^dag V] )^{-1/inv_pow} chi
           = chi^dag ( [V^dag V]^{-1/2} [M^dag M] [V^dag V]^{-1/2} )^{-1/inv_pow} chi
           = chi^dag [V^dag V]^{1/(2*inv_pow)} [M^dag M]^{-1/inv_pow} [V^dag V]^{1/(2*inv_pow)} chi
 	   S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
       BIG WARNING:	   
       Here V^dag V is referred to in this code as the "numerator" operator and M^dag M is the *denominator* operator.
       this refers to their position in the pseudofermion action, which is the *inverse* of what appears in the determinant
       Thus for DWF the numerator operator is the Pauli-Villars operator
       Here P/Q \sim R_{1/(2*inv_pow)}  ~ (V^dagV)^{1/(2*inv_pow)}  
       Here N/D \sim R_{-1/inv_pow} ~ (M^dagM)^{-1/inv_pow}  
    */
    template<class Impl>
    class GeneralEvenOddRatioRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
    public:

      INHERIT_IMPL_TYPES(Impl);

      typedef RationalActionParams Params;
      Params param;

      //For action evaluation
      MultiShiftFunction ApproxPowerAction   ;  //rational approx for X^{1/inv_pow}
      MultiShiftFunction ApproxNegPowerAction;  //rational approx for X^{-1/inv_pow}
      MultiShiftFunction ApproxHalfPowerAction;   //rational approx for X^{1/(2*inv_pow)}
      MultiShiftFunction ApproxNegHalfPowerAction; //rational approx for X^{-1/(2*inv_pow)}

      //For the MD integration
      MultiShiftFunction ApproxPowerMD   ;  //rational approx for X^{1/inv_pow}
      MultiShiftFunction ApproxNegPowerMD;  //rational approx for X^{-1/inv_pow}
      MultiShiftFunction ApproxHalfPowerMD;   //rational approx for X^{1/(2*inv_pow)}
      MultiShiftFunction ApproxNegHalfPowerMD; //rational approx for X^{-1/(2*inv_pow)}

    private:

      FermionOperator<Impl> & NumOp;// the "numerator" operator (V in the comments of this class)
      FermionOperator<Impl> & DenOp;// the "denominator" operator (M in the comments of this class)

      FermionField PhiEven; // the pseudo fermion field for this trajectory (set to zero by refresh)
      FermionField PhiOdd; // the pseudo fermion field for this trajectory

      //Generate the approximation to x^{1/inv_pow} (->approx)   and x^{-1/inv_pow} (-> approx_inv)  by an approx_degree degree rational approximation
      //CG_tolerance is used to issue a warning if the approximation error is larger than the tolerance of the CG and is otherwise just stored in the MultiShiftFunction for use by the multi-shift
      static void generateApprox(MultiShiftFunction &approx, MultiShiftFunction &approx_inv, int inv_pow, int approx_degree, double CG_tolerance, AlgRemez &remez){
        std::cout<<GridLogMessage << "Generating degree "<< approx_degree<<" approximation for x^(1/" << inv_pow << ")"<<std::endl;
        double error = remez.generateApprox(approx_degree,1,inv_pow);	
        if(error > CG_tolerance)
          std::cout<<GridLogMessage << "WARNING: Remez approximation has a larger error " << error << " than the CG tolerance " << CG_tolerance << "! Try increasing the number of poles" << std::endl;
        
        approx.Init(remez, CG_tolerance,false);
        approx_inv.Init(remez, CG_tolerance,true);
      }

      //Overwrite every per-pole tolerance stored in 'approx' with 'tol'. Each function is walked over
      //its own tolerance list, so no assumption is made that two functions have the same number of poles
      static void setTolerances(MultiShiftFunction &approx, double tol){
        for(auto &t : approx.tolerances) t = tol;
      }

    protected:
      //Selector values for the first argument of multiShiftInverse
      static constexpr bool Numerator = true;
      static constexpr bool Denominator = false;

      //Allow derived classes to override the multishift CG
      //Applies the rational function 'approx' of the Schur operator built from NumOp (numerator==true) or DenOp (numerator==false) to 'in'
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionField &in, FermionField &out){
        SchurDifferentiableOperator<Impl> schurOp(numerator ? NumOp : DenOp);
        ConjugateGradientMultiShift<FermionField> msCG(MaxIter, approx);
        msCG(schurOp,in, out);
      }
      //As above, but the multishift CG is additionally handed out_elems to fill (used by deriv as the per-pole solutions)
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionField &in, std::vector<FermionField> &out_elems, FermionField &out){
        SchurDifferentiableOperator<Impl> schurOp(numerator ? NumOp : DenOp);
        ConjugateGradientMultiShift<FermionField> msCG(MaxIter, approx);
        msCG(schurOp,in, out_elems, out);
      }
      //Allow derived classes to override the gauge import
      virtual void ImportGauge(const GaugeField &U){
        NumOp.ImportGauge(U);
        DenOp.ImportGauge(U);
      }
      
    public:

      //_NumOp, _DenOp : numerator and denominator operators, stored by reference
      //p              : parameters (copied); the rational approximations are generated here from p
      GeneralEvenOddRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
                                                     FermionOperator<Impl>  &_DenOp, 
                                                     const Params & p
                                                     ) : 
        //listed in declaration order, which is the order in which the members are actually initialized
        param(p),
        NumOp(_NumOp), 
        DenOp(_DenOp), 
        PhiEven(_NumOp.FermionRedBlackGrid()),
        PhiOdd (_NumOp.FermionRedBlackGrid())
      {
        std::cout<<GridLogMessage << action_name() << " initialize: starting" << std::endl;
        AlgRemez remez(param.lo,param.hi,param.precision);

        //Generate approximations for action eval
        generateApprox(ApproxPowerAction, ApproxNegPowerAction, param.inv_pow, param.action_degree, param.action_tolerance, remez);
        generateApprox(ApproxHalfPowerAction, ApproxNegHalfPowerAction, 2*param.inv_pow, param.action_degree, param.action_tolerance, remez);

        //Generate approximations for MD
        if(param.md_degree != param.action_degree){ //note the CG tolerance is unrelated to the stopping condition of the Remez algorithm
          generateApprox(ApproxPowerMD, ApproxNegPowerMD, param.inv_pow, param.md_degree, param.md_tolerance, remez);
          generateApprox(ApproxHalfPowerMD, ApproxNegHalfPowerMD, 2*param.inv_pow, param.md_degree, param.md_tolerance, remez);
        }else{
          std::cout<<GridLogMessage << "Using same rational approximations for MD as for action evaluation" << std::endl;
          //Same poles and residues as for the action evaluation, but with the MD tolerance for the multishift
          ApproxPowerMD = ApproxPowerAction; 
          ApproxNegPowerMD = ApproxNegPowerAction;
          setTolerances(ApproxPowerMD,    param.md_tolerance);
          setTolerances(ApproxNegPowerMD, param.md_tolerance);

          ApproxHalfPowerMD = ApproxHalfPowerAction;
          ApproxNegHalfPowerMD = ApproxNegHalfPowerAction;
          setTolerances(ApproxHalfPowerMD,    param.md_tolerance);
          setTolerances(ApproxNegHalfPowerMD, param.md_tolerance);
        }

        std::cout<<GridLogMessage << action_name() << " initialize: complete" << std::endl;
      };

      virtual std::string action_name(){return "GeneralEvenOddRatioRationalPseudoFermionAction";}

      //Human-readable dump of the parameters, one per line
      virtual std::string LogParameters(){
        std::stringstream sstream;
        sstream << GridLogMessage << "["<<action_name()<<"] Power              : 1/" << param.inv_pow <<  std::endl;
        sstream << GridLogMessage << "["<<action_name()<<"] Low                :" << param.lo <<  std::endl;
        sstream << GridLogMessage << "["<<action_name()<<"] High               :" << param.hi <<  std::endl;
        sstream << GridLogMessage << "["<<action_name()<<"] Max iterations     :" << param.MaxIter <<  std::endl;
        sstream << GridLogMessage << "["<<action_name()<<"] Tolerance (Action) :" << param.action_tolerance <<  std::endl;
        sstream << GridLogMessage << "["<<action_name()<<"] Degree (Action)    :" << param.action_degree <<  std::endl;
        sstream << GridLogMessage << "["<<action_name()<<"] Tolerance (MD)     :" << param.md_tolerance <<  std::endl;
        sstream << GridLogMessage << "["<<action_name()<<"] Degree (MD)        :" << param.md_degree <<  std::endl;
        sstream << GridLogMessage << "["<<action_name()<<"] Precision          :" << param.precision <<  std::endl;
        return sstream.str();
      }

      //Access the fermion field
      const FermionField &getPhiOdd() const{ return PhiOdd; }
      
      //Heatbath entry point: draw the noise vector from pRNG and pass it to refresh(U,eta). sRNG is not used here
      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
        std::cout<<GridLogMessage << action_name() << " refresh: starting" << std::endl;
        FermionField eta(NumOp.FermionGrid());	

        // P(eta) \propto e^{- eta^dag eta}
        //	
        // The gaussian function draws from  P(x) \propto e^{- x^2 / 2 }    [i.e. sigma=1]
        // Thus eta = x/sqrt{2} = x * sqrt(1/2)
        RealD scale = std::sqrt(0.5);
        gaussian(pRNG,eta);	eta=eta*scale;

        refresh(U,eta);
      }

      //Allow for manual specification of random field for testing
      //Only the odd checkerboard of eta enters PhiOdd; PhiEven is set to zero
      void refresh(const GaugeField &U, const FermionField &eta) {

        // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
        //
        // P(phi) = e^{- phi^dag (VdagV)^1/(2*inv_pow) (MdagM)^-1/inv_pow (VdagV)^1/(2*inv_pow) phi}
        //        = e^{- phi^dag  (VdagV)^1/(2*inv_pow) (MdagM)^-1/(2*inv_pow) (MdagM)^-1/(2*inv_pow)  (VdagV)^1/(2*inv_pow) phi}
        //
        // Phi =  (VdagV)^-1/(2*inv_pow) (MdagM)^{1/(2*inv_pow)} eta 
        
        std::cout<<GridLogMessage << action_name() << " refresh: starting" << std::endl;

        FermionField etaOdd (NumOp.FermionRedBlackGrid());
        FermionField etaEven(NumOp.FermionRedBlackGrid());
        FermionField     tmp(NumOp.FermionRedBlackGrid());

        pickCheckerboard(Even,etaEven,eta);
        pickCheckerboard(Odd,etaOdd,eta);

        ImportGauge(U);

        // MdagM^1/(2*inv_pow) eta
        std::cout<<GridLogMessage << action_name() << " refresh: doing (M^dag M)^{1/" << 2*param.inv_pow << "} eta" << std::endl;
        multiShiftInverse(Denominator, ApproxHalfPowerAction, param.MaxIter, etaOdd, tmp);

        // VdagV^-1/(2*inv_pow) MdagM^1/(2*inv_pow) eta
        std::cout<<GridLogMessage << action_name() << " refresh: doing (V^dag V)^{-1/" << 2*param.inv_pow << "} ( (M^dag M)^{1/" << 2*param.inv_pow << "} eta)" << std::endl;
        multiShiftInverse(Numerator, ApproxNegHalfPowerAction, param.MaxIter, tmp, PhiOdd);
                
        assert(NumOp.ConstEE() == 1);
        assert(DenOp.ConstEE() == 1);
        PhiEven = Zero();
        std::cout<<GridLogMessage << action_name() << " refresh: complete" << std::endl;
      };

      //////////////////////////////////////////////////////
      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
      //////////////////////////////////////////////////////
      //Returns |Y|^2 with Y = (MdagM)^{-1/(2*inv_pow)} (VdagV)^{1/(2*inv_pow)} PhiOdd
      virtual RealD S(const GaugeField &U) {
        std::cout<<GridLogMessage << action_name() << " compute action: starting" << std::endl;
        ImportGauge(U);

        FermionField X(NumOp.FermionRedBlackGrid());
        FermionField Y(NumOp.FermionRedBlackGrid());

        // VdagV^1/(2*inv_pow) Phi
        std::cout<<GridLogMessage << action_name() << " compute action: doing (V^dag V)^{1/" << 2*param.inv_pow << "} Phi" << std::endl;
        multiShiftInverse(Numerator, ApproxHalfPowerAction, param.MaxIter, PhiOdd,X);

        // MdagM^-1/(2*inv_pow) VdagV^1/(2*inv_pow) Phi
        std::cout<<GridLogMessage << action_name() << " compute action: doing (M^dag M)^{-1/" << 2*param.inv_pow << "} ( (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
        multiShiftInverse(Denominator, ApproxNegHalfPowerAction, param.MaxIter, X,Y);

        // Randomly apply rational bounds checks.
        // NOTE(review): rcheck is never read. The call is retained only because it advances the
        // rand() sequence; removing it would change which evaluations trigger the bounds check
        int rcheck = rand();
        auto grid = NumOp.FermionGrid();
        auto r=rand();
        grid->Broadcast(0,r); //all ranks take the draw from rank 0 so they agree on whether to run the check

        if ( param.BoundsCheckFreq != 0 && (r % param.BoundsCheckFreq)==0 ) { 
          std::cout<<GridLogMessage << action_name() << " compute action: doing bounds check" << std::endl;
          FermionField gauss(NumOp.FermionRedBlackGrid());
          gauss = PhiOdd;
          SchurDifferentiableOperator<Impl> MdagM(DenOp);
          std::cout<<GridLogMessage << action_name() << " compute action: checking high bounds" << std::endl;
          HighBoundCheck(MdagM,gauss,param.hi);
          std::cout<<GridLogMessage << action_name() << " compute action: full approximation" << std::endl;
          InversePowerBoundsCheck(param.inv_pow,param.MaxIter,param.action_tolerance*100,MdagM,gauss,ApproxNegPowerAction);
          std::cout<<GridLogMessage << action_name() << " compute action: bounds check complete" << std::endl;
        }

        //  Phidag VdagV^1/(2*inv_pow) MdagM^-1/(2*inv_pow)  MdagM^-1/(2*inv_pow) VdagV^1/(2*inv_pow) Phi
        RealD action = norm2(Y);
        std::cout<<GridLogMessage << action_name() << " compute action: complete" << std::endl;

        return action;
      };

      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
      //
      // Here, M is some 5D operator and V is the Pauli-Villars field
      // N and D make up the rat. poly of the M term and P and Q make up the rat. poly of the V term
      //
      // Need  
      // dS_f/dU =  chi^dag d[P/Q]  N/D   P/Q  chi 
      //         +  chi^dag   P/Q d[N/D]  P/Q  chi 
      //         +  chi^dag   P/Q   N/D d[P/Q] chi 
      //
      // P/Q is expressed as partial fraction expansion: 
      // 
      //           a0 + \sum_k ak/(V^dagV + bk) 
      //  
      // d[P/Q] is then  
      //
      //          \sum_k -ak [V^dagV+bk]^{-1}  [ dV^dag V + V^dag dV ] [V^dag V + bk]^{-1} 
      //  
      // and similar for N/D. 
      // 
      // Need   
      //       MpvPhi_k   = [Vdag V + bk]^{-1} chi  
      //       MpvPhi     = {a0 +  \sum_k ak [Vdag V + bk]^{-1} }chi   
      //   
      //       MfMpvPhi_k = [MdagM+bk]^{-1} MpvPhi  
      //       MfMpvPhi   = {a0 +  \sum_k ak [Mdag M + bk]^{-1} } MpvPhi
      // 
      //       MpvMfMpvPhi_k = [Vdag V + bk]^{-1} MfMpvchi   
      //  
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
        std::cout<<GridLogMessage << action_name() << " deriv: starting" << std::endl;
        const int n_f  = ApproxNegPowerMD.poles.size();
        const int n_pv = ApproxHalfPowerMD.poles.size();

        std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionRedBlackGrid());
        std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionRedBlackGrid());
        std::vector<FermionField> MfMpvPhi_k   (n_f ,NumOp.FermionRedBlackGrid());

        FermionField      MpvPhi(NumOp.FermionRedBlackGrid());
        FermionField    MfMpvPhi(NumOp.FermionRedBlackGrid());
        FermionField MpvMfMpvPhi(NumOp.FermionRedBlackGrid());
        FermionField           Y(NumOp.FermionRedBlackGrid());

        GaugeField   tmp(NumOp.GaugeGrid());

        ImportGauge(U);

        std::cout<<GridLogMessage << action_name() << " deriv: doing (V^dag V)^{1/" << 2*param.inv_pow << "} Phi" << std::endl;
        multiShiftInverse(Numerator, ApproxHalfPowerMD, param.MaxIter, PhiOdd,MpvPhi_k,MpvPhi);

        std::cout<<GridLogMessage << action_name() << " deriv: doing (M^dag M)^{-1/" << param.inv_pow << "} ( (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
        multiShiftInverse(Denominator, ApproxNegPowerMD, param.MaxIter, MpvPhi,MfMpvPhi_k,MfMpvPhi);

        std::cout<<GridLogMessage << action_name() << " deriv: doing (V^dag V)^{1/" << 2*param.inv_pow << "} ( (M^dag M)^{-1/" << param.inv_pow << "} (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
        multiShiftInverse(Numerator, ApproxHalfPowerMD, param.MaxIter, MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi);
                

        SchurDifferentiableOperator<Impl> MdagM(DenOp);
        SchurDifferentiableOperator<Impl> VdagV(NumOp);


        RealD ak;

        dSdU = Zero();

        // With these building blocks  
        //  
        //       dS/dU = 
        //                 \sum_k -ak MfMpvPhi_k^dag      [ dM^dag M + M^dag dM ] MfMpvPhi_k         (1)
        //             +   \sum_k -ak MpvMfMpvPhi_k^\dag  [ dV^dag V + V^dag dV ] MpvPhi_k           (2)
        //                        -ak MpvPhi_k^dag        [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k      (3)

        //(1)	
        std::cout<<GridLogMessage << action_name() << " deriv: doing dS/dU part (1)" << std::endl;
        for(int k=0;k<n_f;k++){
          ak = ApproxNegPowerMD.residues[k];
          MdagM.Mpc(MfMpvPhi_k[k],Y);
          MdagM.MpcDagDeriv(tmp , MfMpvPhi_k[k], Y );  dSdU=dSdU+ak*tmp;
          MdagM.MpcDeriv(tmp , Y, MfMpvPhi_k[k] );  dSdU=dSdU+ak*tmp;
        }
        
        //(2)
        //(3)
        std::cout<<GridLogMessage << action_name() << " deriv: doing dS/dU part (2)+(3)" << std::endl;
        for(int k=0;k<n_pv;k++){

          ak = ApproxHalfPowerMD.residues[k];
          
          VdagV.Mpc(MpvPhi_k[k],Y);
          VdagV.MpcDagDeriv(tmp,MpvMfMpvPhi_k[k],Y); dSdU=dSdU+ak*tmp;
          VdagV.MpcDeriv   (tmp,Y,MpvMfMpvPhi_k[k]);  dSdU=dSdU+ak*tmp;     
          
          VdagV.Mpc(MpvMfMpvPhi_k[k],Y);                // V as we take Ydag 
          VdagV.MpcDeriv   (tmp,Y, MpvPhi_k[k]); dSdU=dSdU+ak*tmp;
          VdagV.MpcDagDeriv(tmp,MpvPhi_k[k], Y); dSdU=dSdU+ak*tmp;

        }

        //dSdU = Ta(dSdU);
        std::cout<<GridLogMessage << action_name() << " deriv: complete" << std::endl;
      };
    };
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
+++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
@ -0,0 +1,93 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
    Copyright (C) 2015
    Author: Christopher Kelly <ckelly@bnl.gov>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
 #define QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
 NAMESPACE_BEGIN(Grid);
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Generic rational approximation for ratios of operators utilizing the mixed precision multishift algorithm
    // cf. GeneralEvenOddRational.h for details
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Mixed-precision variant of GeneralEvenOddRatioRationalPseudoFermionAction.
    // The base class is instantiated on the double-precision implementation (ImplD);
    // this class overrides the multishift solves so that they run through
    // ConjugateGradientMultiShiftMixedPrec, using single-precision (ImplF) copies of
    // the numerator and denominator operators alongside the double-precision ones.
    template<class ImplD, class ImplF>
    class GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<ImplD> {
    private:
      typedef typename ImplD::FermionField FermionFieldD;
      typedef typename ImplF::FermionField FermionFieldF;

      // The operators are held by reference: the caller keeps ownership and must
      // keep them alive for as long as this action is in use.
      FermionOperator<ImplD> & NumOpD;
      FermionOperator<ImplD> & DenOpD;
      FermionOperator<ImplF> & NumOpF;
      FermionOperator<ImplF> & DenOpF;

      // Passed straight through to the mixed-precision multishift solver constructor.
      Integer ReliableUpdateFreq;

    protected:
      //Allow derived classes to override the multishift CG

      // Apply the rational function 'approx' of the Schur operator to 'in', writing the sum into 'out'.
      // 'numerator' selects which operator pair is used: true -> NumOp{D,F}, false -> DenOp{D,F}.
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, FermionFieldD &out){
        SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
        SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
        // The single-precision red-black grid is always taken from NumOpF, for either operator choice.
        ConjugateGradientMultiShiftMixedPrec<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
        msCG(schurOpD, in, out);
      }

      // As above, but additionally returns the individual per-shift solutions in 'out_elems'.
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector<FermionFieldD> &out_elems, FermionFieldD &out){
        SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
        SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
        ConjugateGradientMultiShiftMixedPrec<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
        msCG(schurOpD, in, out_elems, out);
      }

      //Allow derived classes to override the gauge import

      // Import the double-precision gauge field into the D operators, and a
      // precision-converted copy of it into the F operators.
      virtual void ImportGauge(const typename ImplD::GaugeField &Ud){
        typename ImplF::GaugeField Uf(NumOpF.GaugeGrid());
        precisionChange(Uf, Ud);

        NumOpD.ImportGauge(Ud);
        DenOpD.ImportGauge(Ud);

        NumOpF.ImportGauge(Uf);
        DenOpF.ImportGauge(Uf);
      }

    public:
      // _NumOpD/_DenOpD : double-precision numerator/denominator operators (also given to the base class)
      // _NumOpF/_DenOpF : single-precision counterparts used inside the mixed-precision solver
      // p               : rational approximation parameters, forwarded to the base class
      // _ReliableUpdateFreq : forwarded to ConjugateGradientMultiShiftMixedPrec
      //
      // The mem-initializers are listed in member declaration order, which is the
      // order in which C++ actually performs them.
      GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction(FermionOperator<ImplD>  &_NumOpD, FermionOperator<ImplD>  &_DenOpD,
                                                              FermionOperator<ImplF>  &_NumOpF, FermionOperator<ImplF>  &_DenOpF,
                                                              const RationalActionParams & p, Integer _ReliableUpdateFreq
                                                              ) : GeneralEvenOddRatioRationalPseudoFermionAction<ImplD>(_NumOpD, _DenOpD, p),
                                                                  NumOpD(_NumOpD), DenOpD(_DenOpD), NumOpF(_NumOpF), DenOpF(_DenOpF),
                                                                  ReliableUpdateFreq(_ReliableUpdateFreq){}

      virtual std::string action_name(){return "GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction";}
    };
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
@ -40,249 +40,31 @@ NAMESPACE_BEGIN(Grid);
    // Here N/D \sim R_{-1/2} ~ (M^dagM)^{-1/2}  
    template<class Impl>
-    class OneFlavourEvenOddRatioRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
+    class OneFlavourEvenOddRatioRationalPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<Impl> {
    public:
      INHERIT_IMPL_TYPES(Impl);
      typedef OneFlavourRationalParams Params;
      Params param;
      MultiShiftFunction PowerHalf   ;
      MultiShiftFunction PowerNegHalf;
      MultiShiftFunction PowerQuarter;
      MultiShiftFunction PowerNegQuarter;
    private:
-     
+      static RationalActionParams transcribe(const Params &in){
-      FermionOperator<Impl> & NumOp;// the basic operator
+	RationalActionParams out;
-      FermionOperator<Impl> & DenOp;// the basic operator
+	out.inv_pow = 2;
-      FermionField PhiEven; // the pseudo fermion field for this trajectory
+	out.lo = in.lo;
-      FermionField PhiOdd; // the pseudo fermion field for this trajectory
+	out.hi = in.hi;
 	out.MaxIter = in.MaxIter;
 	out.action_tolerance = out.md_tolerance = in.tolerance;
 	out.action_degree = out.md_degree = in.degree;
 	out.precision = in.precision;
 	out.BoundsCheckFreq = in.BoundsCheckFreq;
 	return out;
      }
    public:
      OneFlavourEvenOddRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
-					    FermionOperator<Impl>  &_DenOp, 
+							FermionOperator<Impl>  &_DenOp, 
-					    Params & p
+							const Params & p
-					    ) : 
+							) : 
-      NumOp(_NumOp), 
+	GeneralEvenOddRatioRationalPseudoFermionAction<Impl>(_NumOp, _DenOp, transcribe(p)){}
      DenOp(_DenOp), 
      PhiOdd (_NumOp.FermionRedBlackGrid()),
      PhiEven(_NumOp.FermionRedBlackGrid()),
      param(p) 
      {
 	AlgRemez remez(param.lo,param.hi,param.precision);
-	// MdagM^(+- 1/2)
+      virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}      
 	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
 	remez.generateApprox(param.degree,1,2);
 	PowerHalf.Init(remez,param.tolerance,false);
 	PowerNegHalf.Init(remez,param.tolerance,true);
 	// MdagM^(+- 1/4)
 	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
 	remez.generateApprox(param.degree,1,4);
   	PowerQuarter.Init(remez,param.tolerance,false);
 	PowerNegQuarter.Init(remez,param.tolerance,true);
      };
      virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}
      // Return a multi-line, log-prefixed summary of the rational approximation
      // parameters held in 'param' (one line per setting).
      virtual std::string LogParameters(){
        std::stringstream report;

        // Spectral range of the approximation
        report << GridLogMessage << "[" << action_name() << "] Low            :" << param.lo        << std::endl;
        report << GridLogMessage << "[" << action_name() << "] High           :" << param.hi        << std::endl;

        // Solver and approximation settings
        report << GridLogMessage << "[" << action_name() << "] Max iterations :" << param.MaxIter   << std::endl;
        report << GridLogMessage << "[" << action_name() << "] Tolerance      :" << param.tolerance << std::endl;
        report << GridLogMessage << "[" << action_name() << "] Degree         :" << param.degree    << std::endl;
        report << GridLogMessage << "[" << action_name() << "] Precision      :" << param.precision << std::endl;

        return report.str();
      }
      // Draw a fresh pseudofermion field for the coming trajectory.
      //   U    : gauge field, imported into both NumOp and DenOp
      //   sRNG : serial RNG (not used in this body)
      //   pRNG : parallel RNG used to draw the Gaussian noise eta
      // On exit PhiOdd = (VdagV)^{-1/4} (MdagM)^{1/4} eta_odd and PhiEven = 0.
      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
 	// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi
 	//
 	// P(phi) = e^{- phi^dag (VdagV)^1/4 (MdagM)^-1/2 (VdagV)^1/4 phi}
 	//        = e^{- phi^dag  (VdagV)^1/4 (MdagM)^-1/4 (MdagM)^-1/4  (VdagV)^1/4 phi}
 	//
 	// Phi =  (VdagV)^-1/4 (MdagM)^{1/4} eta
 	//
 	// P(eta) = e^{- eta^dag eta}
 	//
 	// Comparing with a Gaussian e^{-x^2/(2 sig^2)} => sig^2 = 0.5.
 	//
 	// So eta should be of width sig = 1/sqrt(2).
 	RealD scale = std::sqrt(0.5);
 	FermionField eta(NumOp.FermionGrid());
 	FermionField etaOdd (NumOp.FermionRedBlackGrid());
 	FermionField etaEven(NumOp.FermionRedBlackGrid());
 	FermionField     tmp(NumOp.FermionRedBlackGrid());
 	// Full-grid noise, rescaled to the target width, then split by checkerboard.
 	gaussian(pRNG,eta);	eta=eta*scale;
 	pickCheckerboard(Even,etaEven,eta);
 	pickCheckerboard(Odd,etaOdd,eta);
 	// The operators must see the current gauge field before any solve.
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
 	// MdagM^1/4 eta
 	SchurDifferentiableOperator<Impl> MdagM(DenOp);
 	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerQuarter);
 	msCG_M(MdagM,etaOdd,tmp);
 	// VdagV^-1/4 MdagM^1/4 eta
 	SchurDifferentiableOperator<Impl> VdagV(NumOp);
 	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerNegQuarter);
 	msCG_V(VdagV,tmp,PhiOdd);
 	// Both operators are required to report ConstEE()==1; the even checkerboard
 	// of Phi is simply zeroed (etaEven is not used further).
 	assert(NumOp.ConstEE() == 1);
 	assert(DenOp.ConstEE() == 1);
 	PhiEven = Zero();
      };
      //////////////////////////////////////////////////////
      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
      //////////////////////////////////////////////////////
      // Evaluate the pseudofermion action for gauge field U.
      // Computes Y = (MdagM)^{-1/4} (VdagV)^{1/4} PhiOdd via two multishift solves
      // and returns norm2(Y).
      // With probability ~1/param.BoundsCheckFreq (decided by rand(), broadcast from
      // rank 0 so that all ranks agree) the spectral bounds of the rational
      // approximation are also checked. A BoundsCheckFreq of zero disables the check
      // instead of evaluating 'r % 0', which is undefined behaviour.
      virtual RealD S(const GaugeField &U) {
        NumOp.ImportGauge(U);
        DenOp.ImportGauge(U);
        FermionField X(NumOp.FermionRedBlackGrid());
        FermionField Y(NumOp.FermionRedBlackGrid());
        // VdagV^1/4 Phi
        SchurDifferentiableOperator<Impl> VdagV(NumOp);
        ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
        msCG_V(VdagV,PhiOdd,X);
        // MdagM^-1/4 VdagV^1/4 Phi
        SchurDifferentiableOperator<Impl> MdagM(DenOp);
        ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegQuarter);
        msCG_M(MdagM,X,Y);
        // Randomly apply rational bounds checks.
        // rand() is always drawn and broadcast, so the random stream and the
        // communication pattern are the same whether or not the check is enabled.
        auto grid = NumOp.FermionGrid();
        auto r=rand();
        grid->Broadcast(0,r);
        if ( param.BoundsCheckFreq != 0 && (r%param.BoundsCheckFreq)==0 ) {
          FermionField gauss(NumOp.FermionRedBlackGrid());
          gauss = PhiOdd;
          HighBoundCheck(MdagM,gauss,param.hi);
          InverseSqrtBoundsCheck(param.MaxIter,param.tolerance*100,MdagM,gauss,PowerNegHalf);
        }
        //  Phidag VdagV^1/4 MdagM^-1/4  MdagM^-1/4 VdagV^1/4 Phi
        RealD action = norm2(Y);
        return action;
      };
      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
      //
      // Here, M is some 5D operator and V is the Pauli-Villars field
      // N and D make up the rational polynomial of the M term, and P and Q make up the rational polynomial of the V (Pauli-Villars) term
      //
      // Need  
      // dS_f/dU =  chi^dag d[P/Q]  N/D   P/Q  chi 
      //         +  chi^dag   P/Q d[N/D]  P/Q  chi 
      //         +  chi^dag   P/Q   N/D d[P/Q] chi 
      //
      // P/Q is expressed as partial fraction expansion: 
      // 
      //           a0 + \sum_k ak/(V^dagV + bk) 
      //  
      // d[P/Q] is then  
      //
      //          \sum_k -ak [V^dagV+bk]^{-1}  [ dV^dag V + V^dag dV ] [V^dag V + bk]^{-1} 
      //  
      // and similar for N/D. 
      // 
      // Need   
      //       MpvPhi_k   = [Vdag V + bk]^{-1} chi  
      //       MpvPhi     = {a0 +  \sum_k ak [Vdag V + bk]^{-1} }chi   
      //   
      //       MfMpvPhi_k = [MdagM+bk]^{-1} MpvPhi  
      //       MfMpvPhi   = {a0 +  \sum_k ak [Mdag M + bk]^{-1} } MpvPhi
      // 
      //       MpvMfMpvPhi_k = [Vdag V + bk]^{-1} MfMpvPhi
      //  
      // Compute the molecular-dynamics force dS/dU for gauge field U into dSdU.
      // Three multishift solves build the per-pole vectors (V-term, M-term, V-term
      // again); the force is then accumulated pole by pole from the operator
      // derivatives, weighted by the partial-fraction residues.
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
 	// Number of poles in the M-term (x^-1/2) and V-term (x^1/4) approximations.
 	const int n_f  = PowerNegHalf.poles.size();
 	const int n_pv = PowerQuarter.poles.size();
 	// Per-pole solution vectors returned by the multishift solves.
 	std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionRedBlackGrid());
 	std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionRedBlackGrid());
 	std::vector<FermionField> MfMpvPhi_k   (n_f ,NumOp.FermionRedBlackGrid());
 	// Summed (full rational function) results of each solve.
 	FermionField      MpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField    MfMpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField MpvMfMpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField           Y(NumOp.FermionRedBlackGrid());
 	GaugeField   tmp(NumOp.GaugeGrid());
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
 	SchurDifferentiableOperator<Impl> VdagV(NumOp);
 	SchurDifferentiableOperator<Impl> MdagM(DenOp);
 	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
 	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegHalf);
 	// Chain: PhiOdd -> (VdagV)^1/4 -> (MdagM)^-1/2 -> (VdagV)^1/4, keeping per-pole vectors.
 	msCG_V(VdagV,PhiOdd,MpvPhi_k,MpvPhi);
 	msCG_M(MdagM,MpvPhi,MfMpvPhi_k,MfMpvPhi);
 	msCG_V(VdagV,MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi);
 	RealD ak;
 	dSdU = Zero();
 	// With these building blocks  
 	//  
 	//       dS/dU = 
 	//                 \sum_k -ak MfMpvPhi_k^dag      [ dM^dag M + M^dag dM ] MfMpvPhi_k         (1)
 	//             +   \sum_k -ak MpvMfMpvPhi_k^\dag  [ dV^dag V + V^dag dV ] MpvPhi_k           (2)
 	//                        -ak MpvPhi_k^dag        [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k      (3)
 	//(1)
 	for(int k=0;k<n_f;k++){
 	  ak = PowerNegHalf.residues[k];
 	  // Y = Mpc * MfMpvPhi_k, then add both derivative orderings for this pole.
 	  MdagM.Mpc(MfMpvPhi_k[k],Y);
 	  MdagM.MpcDagDeriv(tmp , MfMpvPhi_k[k], Y );  dSdU=dSdU+ak*tmp;
 	  MdagM.MpcDeriv(tmp , Y, MfMpvPhi_k[k] );  dSdU=dSdU+ak*tmp;
 	}
 	//(2)
 	//(3)
 	for(int k=0;k<n_pv;k++){
          ak = PowerQuarter.residues[k];
 	  // Term (2): Y = Vpc * MpvPhi_k paired with MpvMfMpvPhi_k.
 	  VdagV.Mpc(MpvPhi_k[k],Y);
 	  VdagV.MpcDagDeriv(tmp,MpvMfMpvPhi_k[k],Y); dSdU=dSdU+ak*tmp;
 	  VdagV.MpcDeriv   (tmp,Y,MpvMfMpvPhi_k[k]);  dSdU=dSdU+ak*tmp;     
 	  // Term (3): roles swapped, Y = Vpc * MpvMfMpvPhi_k paired with MpvPhi_k.
 	  VdagV.Mpc(MpvMfMpvPhi_k[k],Y);                // V as we take Ydag 
 	  VdagV.MpcDeriv   (tmp,Y, MpvPhi_k[k]); dSdU=dSdU+ak*tmp;
 	  VdagV.MpcDagDeriv(tmp,MpvPhi_k[k], Y); dSdU=dSdU+ak*tmp;
 	}
 	// Traceless-antihermitian projection is left disabled here.
 	//dSdU = Ta(dSdU);
      };
    };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/pseudofermion/PseudoFermion.h
+++ b/Grid/qcd/action/pseudofermion/PseudoFermion.h
@ -40,6 +40,8 @@ directory
 #include <Grid/qcd/action/pseudofermion/OneFlavourRational.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
 #include <Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h>
--- a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
+++ b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
@ -83,16 +83,10 @@ NAMESPACE_BEGIN(Grid);
 	return sstream.str();
      } 
-      
+      //Access the fermion field
-      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
+      const FermionField &getPhiOdd() const{ return PhiOdd; }
-        // P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
+      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
        //
        // NumOp == V
        // DenOp == M
        //
        // Take phi_o = Vpcdag^{-1} Mpcdag eta_o  ; eta_o = Mpcdag^{-1} Vpcdag Phi
        //
        // P(eta_o) = e^{- eta_o^dag eta_o}
        //
        // e^{x^2/2 sig^2} => sig^2 = 0.5.
@ -100,12 +94,22 @@ NAMESPACE_BEGIN(Grid);
        RealD scale = std::sqrt(0.5);
        FermionField eta    (NumOp.FermionGrid());
        gaussian(pRNG,eta); eta = eta * scale;
 	refresh(U,eta);
      }
      void refresh(const GaugeField &U, const FermionField &eta) {
        // P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
        //
        // NumOp == V
        // DenOp == M
        //
        // Take phi_o = Vpcdag^{-1} Mpcdag eta_o  ; eta_o = Mpcdag^{-1} Vpcdag Phi
        FermionField etaOdd (NumOp.FermionRedBlackGrid());
        FermionField etaEven(NumOp.FermionRedBlackGrid());
        FermionField tmp    (NumOp.FermionRedBlackGrid());
        gaussian(pRNG,eta);
        pickCheckerboard(Even,etaEven,eta);
        pickCheckerboard(Odd,etaOdd,eta);
@ -125,8 +129,8 @@ NAMESPACE_BEGIN(Grid);
        DenOp.MooeeDag(etaEven,tmp);
        NumOp.MooeeInvDag(tmp,PhiEven);
-        PhiOdd =PhiOdd*scale;
+        //PhiOdd =PhiOdd*scale;
-        PhiEven=PhiEven*scale;
+        //PhiEven=PhiEven*scale;
      };
--- a/Grid/qcd/gparity/Gparity.h
+++ b/Grid/qcd/gparity/Gparity.h
@ -0,0 +1,6 @@
 #ifndef GRID_GPARITY_H_
 #define GRID_GPARITY_H_
 #include<Grid/qcd/gparity/GparityFlavour.h>
 #endif
--- a/Grid/qcd/gparity/GparityFlavour.cc
+++ b/Grid/qcd/gparity/GparityFlavour.cc
@ -0,0 +1,34 @@
 #include <Grid/Grid.h>
 NAMESPACE_BEGIN(Grid);
 // The three Pauli matrices (sigma_x, sigma_y, sigma_z), in that order.
 const std::array<const GparityFlavour, 3> GparityFlavour::sigma_mu = {{
    GparityFlavour(GparityFlavour::Algebra::SigmaX),
    GparityFlavour(GparityFlavour::Algebra::SigmaY),
    GparityFlavour(GparityFlavour::Algebra::SigmaZ)
    }};
 // The six positive-sign flavour matrices: identity, the three Pauli matrices,
 // and the two projectors. The "Minus" variants are not included.
 const std::array<const GparityFlavour, 6> GparityFlavour::sigma_all = {{
  GparityFlavour(GparityFlavour::Algebra::Identity),
  GparityFlavour(GparityFlavour::Algebra::SigmaX),
  GparityFlavour(GparityFlavour::Algebra::SigmaY),
  GparityFlavour(GparityFlavour::Algebra::SigmaZ),
  GparityFlavour(GparityFlavour::Algebra::ProjPlus),
  GparityFlavour(GparityFlavour::Algebra::ProjMinus)
 }};
 // Printable names for the flavour matrices, listed in increasing order of the
 // integer tag each one is expected to have in GparityFlavour::Algebra
 // (SigmaX first, MinusProjMinus last). Keep the two lists in step.
 const std::array<const char *, GparityFlavour::nSigma> GparityFlavour::name = {{
    "SigmaX",
    "MinusSigmaX",
    "SigmaY",
    "MinusSigmaY",
    "SigmaZ",
    "MinusSigmaZ",
    "Identity",
    "MinusIdentity",
    "ProjPlus",
    "MinusProjPlus",
    "ProjMinus",
    "MinusProjMinus"}};
 NAMESPACE_END(Grid);
--- a/Grid/qcd/gparity/GparityFlavour.h
+++ b/Grid/qcd/gparity/GparityFlavour.h
@ -0,0 +1,475 @@
 #ifndef GRID_QCD_GPARITY_FLAVOUR_H
 #define GRID_QCD_GPARITY_FLAVOUR_H
 //Support for flavour-matrix operations acting on the G-parity flavour index
 #include <array>
 NAMESPACE_BEGIN(Grid);
 // Tag object naming one of the 2x2 matrices that can act on the G-parity
 // flavour index. It stores only the Algebra tag 'g'; the arithmetic itself is
 // done by free functions that switch on that tag.
 class GparityFlavour {
 public:
   // Tag values: each matrix is immediately followed by its negative.
   GRID_SERIALIZABLE_ENUM(Algebra, undef,
                          SigmaX,         0,
                          MinusSigmaX,    1,
                          SigmaY,         2,
                          MinusSigmaY,    3,
                          SigmaZ,         4,
                          MinusSigmaZ,    5,
                          Identity,       6,
                          MinusIdentity,  7,
                          ProjPlus,       8,
                          MinusProjPlus,  9,
                          ProjMinus,      10,
                          MinusProjMinus, 11
                          );

   // Number of tags listed above.
   static constexpr unsigned int nSigma = 12;

   // Lookup tables; defined out of line.
   static const std::array<const char *, nSigma>    name;
   static const std::array<const GparityFlavour, 3> sigma_mu;
   static const std::array<const GparityFlavour, 6> sigma_all;

   // The matrix this object stands for.
   Algebra g;

   // Deliberately not 'explicit', so an Algebra tag converts to a GparityFlavour.
   accelerator GparityFlavour(Algebra initg): g(initg) {}
 };
 // 0 1  x   vector
 // 1 0
 // ret = sigma_x * rhs for a flavour vector: the two components are exchanged.
 template<class vtype>
 accelerator_inline void multFlavourSigmaX(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
   // two flavour components; assignment order 0 then 1
   for(int f=0; f<2; f++){
     ret(f) = rhs(1-f);
   }
 }
 // ret = sigma_x * rhs for a flavour matrix: the two rows are exchanged.
 template<class vtype>
 accelerator_inline void lmultFlavourSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
   for(int row=0; row<2; row++){
     for(int col=0; col<2; col++){
       ret(row,col) = rhs(1-row,col);
     }
   }
 }
 // ret = rhs * sigma_x for a flavour matrix: the two columns are exchanged.
 template<class vtype>
 accelerator_inline void rmultFlavourSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
   for(int row=0; row<2; row++){
     for(int col=0; col<2; col++){
       ret(row,col) = rhs(row,1-col);
     }
   }
 }
 // ret = -sigma_x * rhs for a flavour vector: components exchanged and negated.
 template<class vtype>
 accelerator_inline void multFlavourMinusSigmaX(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
   for(int f=0; f<2; f++){
     ret(f) = -rhs(1-f);
   }
 }
 // ret = -sigma_x * rhs for a flavour matrix: rows exchanged and negated.
 template<class vtype>
 accelerator_inline void lmultFlavourMinusSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
   for(int row=0; row<2; row++){
     for(int col=0; col<2; col++){
       ret(row,col) = -rhs(1-row,col);
     }
   }
 }
 // ret = rhs * (-sigma_x) for a flavour matrix: columns exchanged and negated.
 template<class vtype>
 accelerator_inline void rmultFlavourMinusSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
   for(int row=0; row<2; row++){
     for(int col=0; col<2; col++){
       ret(row,col) = -rhs(row,1-col);
     }
   }
 }
 // 0 -i  x   vector
 // i 0
 // ret = sigma_y * rhs for a flavour vector.
 //   upper component <- -i * lower,  lower component <- +i * upper
 template<class vtype>
 accelerator_inline void multFlavourSigmaY(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
     ret(0) = timesMinusI( rhs(1) );
     ret(1) = timesI     ( rhs(0) );
 }
 // ret = sigma_y * rhs for a flavour matrix.
 //   row 0 <- -i * (row 1 of rhs),  row 1 <- +i * (row 0 of rhs)
 template<class vtype>
 accelerator_inline void lmultFlavourSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
   for(int col=0; col<2; col++) ret(0,col) = timesMinusI(rhs(1,col));
   for(int col=0; col<2; col++) ret(1,col) = timesI(rhs(0,col));
 }
 // ret = rhs * sigma_y for a flavour matrix.
 //   column 0 <- +i * (column 1 of rhs),  column 1 <- -i * (column 0 of rhs)
 template<class vtype>
 accelerator_inline void rmultFlavourSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
   for(int row=0; row<2; row++){
     ret(row,0) = timesI(rhs(row,1));
     ret(row,1) = timesMinusI(rhs(row,0));
   }
 }
 // ret = -sigma_y * rhs for a flavour vector.
 //   upper component <- +i * lower,  lower component <- -i * upper
 template<class vtype>
 accelerator_inline void multFlavourMinusSigmaY(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
     ret(0) = timesI     ( rhs(1) );
     ret(1) = timesMinusI( rhs(0) );
 }
 // ret = -sigma_y * rhs for a flavour matrix.
 //   row 0 <- +i * (row 1 of rhs),  row 1 <- -i * (row 0 of rhs)
 template<class vtype>
 accelerator_inline void lmultFlavourMinusSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
   for(int col=0; col<2; col++) ret(0,col) = timesI(rhs(1,col));
   for(int col=0; col<2; col++) ret(1,col) = timesMinusI(rhs(0,col));
 }
 // ret = rhs * (-sigma_y) for a flavour matrix.
 //   column 0 <- -i * (column 1 of rhs),  column 1 <- +i * (column 0 of rhs)
 template<class vtype>
 accelerator_inline void rmultFlavourMinusSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
   for(int row=0; row<2; row++){
     ret(row,0) = timesMinusI(rhs(row,1));
     ret(row,1) = timesI(rhs(row,0));
   }
 }
 // 1 0  x   vector
 // 0 -1
 // ret = sigma_z * rhs for a flavour vector: upper kept, lower negated.
 template<class vtype>
 accelerator_inline void multFlavourSigmaZ(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
     ret(0) =  rhs(0);
     ret(1) = -rhs(1);
 }
 // ret = sigma_z * rhs for a flavour matrix: row 0 kept, row 1 negated.
 template<class vtype>
 accelerator_inline void lmultFlavourSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
   for(int col=0; col<2; col++) ret(0,col) = rhs(0,col);
   for(int col=0; col<2; col++) ret(1,col) = -rhs(1,col);
 }
 // ret = rhs * sigma_z for a flavour matrix: column 0 kept, column 1 negated.
 template<class vtype>
 accelerator_inline void rmultFlavourSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
   for(int row=0; row<2; row++){
     ret(row,0) = rhs(row,0);
     ret(row,1) = -rhs(row,1);
   }
 }
 // ret = -sigma_z * rhs for a flavour vector: upper negated, lower kept.
 template<class vtype>
 accelerator_inline void multFlavourMinusSigmaZ(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
     ret(0) = -rhs(0);
     ret(1) =  rhs(1);
 }
 // ret = -sigma_z * rhs for a flavour matrix: row 0 negated, row 1 kept.
 template<class vtype>
 accelerator_inline void lmultFlavourMinusSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
   for(int col=0; col<2; col++) ret(0,col) = -rhs(0,col);
   for(int col=0; col<2; col++) ret(1,col) = rhs(1,col);
 }
 // ret = rhs * (-sigma_z) for a flavour matrix: column 0 negated, column 1 kept.
 template<class vtype>
 accelerator_inline void rmultFlavourMinusSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
   for(int row=0; row<2; row++){
     ret(row,0) = -rhs(row,0);
     ret(row,1) = rhs(row,1);
   }
 }
 // ret = rhs for a flavour vector (identity matrix): component-wise copy.
 template<class vtype>
 accelerator_inline void multFlavourIdentity(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
   for(int f=0; f<2; f++){
     ret(f) = rhs(f);
   }
 }
 // ret = 1 * rhs for a flavour matrix: element-wise copy.
 template<class vtype>
 accelerator_inline void lmultFlavourIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
   for(int row=0; row<2; row++){
     for(int col=0; col<2; col++){
       ret(row,col) = rhs(row,col);
     }
   }
 }
 // ret = rhs * 1 for a flavour matrix: element-wise copy.
 template<class vtype>
 accelerator_inline void rmultFlavourIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
   for(int row=0; row<2; row++){
     for(int col=0; col<2; col++){
       ret(row,col) = rhs(row,col);
     }
   }
 }
 // ret = -rhs for a flavour vector: component-wise negation.
 template<class vtype>
 accelerator_inline void multFlavourMinusIdentity(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
   for(int f=0; f<2; f++){
     ret(f) = -rhs(f);
   }
 }
 // ret = (-1) * rhs for a flavour matrix: element-wise negation.
 template<class vtype>
 accelerator_inline void lmultFlavourMinusIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
   for(int row=0; row<2; row++){
     for(int col=0; col<2; col++){
       ret(row,col) = -rhs(row,col);
     }
   }
 }
 // ret = rhs * (-1) for a flavour matrix: element-wise negation.
 template<class vtype>
 accelerator_inline void rmultFlavourMinusIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
   for(int row=0; row<2; row++){
     for(int col=0; col<2; col++){
       ret(row,col) = -rhs(row,col);
     }
   }
 }
 //G-parity flavour projection 1/2(1+\sigma_2)
 //1 -i
 //i  1
 // ret = P+ * rhs for a flavour vector, with P+ = 1/2 (1 + sigma_y).
 template<class vtype>
 accelerator_inline void multFlavourProjPlus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
     // upper:  1/2 * upper - i/2 * lower
     ret(0) = 0.5*rhs(0) + 0.5*timesMinusI(rhs(1));
     // lower:  i/2 * upper + 1/2 * lower
     ret(1) = 0.5*timesI(rhs(0)) + 0.5*rhs(1);
 }
 // ret = P+ * rhs for a flavour matrix, with P+ = 1/2 (1 + sigma_y).
 // Each column of rhs is transformed like a flavour vector.
 template<class vtype>
 accelerator_inline void lmultFlavourProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
   for(int col=0; col<2; col++) ret(0,col) = 0.5*rhs(0,col) + 0.5*timesMinusI(rhs(1,col));
   for(int col=0; col<2; col++) ret(1,col) = 0.5*timesI(rhs(0,col)) + 0.5*rhs(1,col);
 }
 // ret = rhs * P+ for a flavour matrix, with P+ = 1/2 (1 + sigma_y).
 // Each row of rhs is combined with the columns of P+.
 template<class vtype>
 accelerator_inline void rmultFlavourProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
   for(int row=0; row<2; row++){
     ret(row,0) = 0.5*rhs(row,0) + 0.5*timesI(rhs(row,1));
     ret(row,1) = 0.5*timesMinusI(rhs(row,0)) + 0.5*rhs(row,1);
   }
 }
 // ret = -P+ * rhs for a flavour vector, with P+ = 1/2 (1 + sigma_y).
 template<class vtype>
 accelerator_inline void multFlavourMinusProjPlus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
     // upper: -1/2 * upper + i/2 * lower
     ret(0) = -0.5*rhs(0) + 0.5*timesI(rhs(1));
     // lower: -i/2 * upper - 1/2 * lower
     ret(1) = 0.5*timesMinusI(rhs(0)) - 0.5*rhs(1);
 }
 // ret = -P+ * rhs for a flavour matrix, with P+ = 1/2 (1 + sigma_y).
 // Each column of rhs is transformed like a flavour vector.
 template<class vtype>
 accelerator_inline void lmultFlavourMinusProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
   for(int col=0; col<2; col++) ret(0,col) = -0.5*rhs(0,col) + 0.5*timesI(rhs(1,col));
   for(int col=0; col<2; col++) ret(1,col) = 0.5*timesMinusI(rhs(0,col)) - 0.5*rhs(1,col);
 }
 // ret = rhs * (-P+) for a flavour matrix, with P+ = 1/2 (1 + sigma_y).
 // Each row of rhs is combined with the columns of -P+.
 template<class vtype>
 accelerator_inline void rmultFlavourMinusProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
   for(int row=0; row<2; row++){
     ret(row,0) = -0.5*rhs(row,0) + 0.5*timesMinusI(rhs(row,1));
     ret(row,1) = 0.5*timesI(rhs(row,0)) - 0.5*rhs(row,1);
   }
 }
 //G-parity flavour projection 1/2(1-\sigma_2)
 //1 i
 //-i  1
 // ret = P- * rhs for a flavour vector, with P- = 1/2 (1 - sigma_y).
 template<class vtype>
 accelerator_inline void multFlavourProjMinus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
     // upper:  1/2 * upper + i/2 * lower
     ret(0) = 0.5*rhs(0) + 0.5*timesI(rhs(1));
     // lower: -i/2 * upper + 1/2 * lower
     ret(1) = 0.5*timesMinusI(rhs(0)) + 0.5*rhs(1);
 }
 // ret = P- * rhs for a flavour matrix, with P- = 1/2 (1 - sigma_y).
 // Each column of rhs is transformed like a flavour vector.
 template<class vtype>
 accelerator_inline void lmultFlavourProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
   for(int col=0; col<2; col++) ret(0,col) = 0.5*rhs(0,col) + 0.5*timesI(rhs(1,col));
   for(int col=0; col<2; col++) ret(1,col) = 0.5*timesMinusI(rhs(0,col)) + 0.5*rhs(1,col);
 }
 // ret = rhs * P- for a flavour matrix, with P- = 1/2 (1 - sigma_y).
 // Each row of rhs is combined with the columns of P-.
 template<class vtype>
 accelerator_inline void rmultFlavourProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
   for(int row=0; row<2; row++){
     ret(row,0) = 0.5*rhs(row,0) + 0.5*timesMinusI(rhs(row,1));
     ret(row,1) = 0.5*timesI(rhs(row,0)) + 0.5*rhs(row,1);
   }
 }
 // ret = -P- * rhs for a flavour vector, with P- = 1/2 (1 - sigma_y).
 template<class vtype>
 accelerator_inline void multFlavourMinusProjMinus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
     // upper: -1/2 * upper - i/2 * lower
     ret(0) = -0.5*rhs(0) + 0.5*timesMinusI(rhs(1));
     // lower:  i/2 * upper - 1/2 * lower
     ret(1) = 0.5*timesI(rhs(0)) - 0.5*rhs(1);
 }
 // ret = -P- * rhs for a flavour matrix, with P- = 1/2 (1 - sigma_y).
 // Each column of rhs is transformed like a flavour vector.
 template<class vtype>
 accelerator_inline void lmultFlavourMinusProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
   for(int col=0; col<2; col++) ret(0,col) = -0.5*rhs(0,col) + 0.5*timesMinusI(rhs(1,col));
   for(int col=0; col<2; col++) ret(1,col) = 0.5*timesI(rhs(0,col)) - 0.5*rhs(1,col);
 }
 // ret = rhs * (-P-) for a flavour matrix, with P- = 1/2 (1 - sigma_y).
 // Each row of rhs is combined with the columns of -P-.
 template<class vtype>
 accelerator_inline void rmultFlavourMinusProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
   for(int row=0; row<2; row++){
     ret(row,0) = -0.5*rhs(row,0) + 0.5*timesI(rhs(row,1));
     ret(row,1) = 0.5*timesMinusI(rhs(row,0)) - 0.5*rhs(row,1);
   }
 }
 // Flavour matrix times flavour vector: returns G * arg.
 // Dispatches on the tag stored in G to the matching multFlavour* kernel.
 // A tag with no matching case trips assert(0).
 template<class vtype>
 accelerator_inline auto operator*(const GparityFlavour &G, const iVector<vtype, Ngp> &arg)
 ->typename std::enable_if<matchGridTensorIndex<iVector<vtype, Ngp>, GparityFlavourTensorIndex>::value, iVector<vtype, Ngp>>::type
 {
   using Alg = GparityFlavour::Algebra;

   iVector<vtype, Ngp> ret;
   switch (G.g)
   {
   case Alg::SigmaX:          multFlavourSigmaX(ret, arg);          break;
   case Alg::MinusSigmaX:     multFlavourMinusSigmaX(ret, arg);     break;
   case Alg::SigmaY:          multFlavourSigmaY(ret, arg);          break;
   case Alg::MinusSigmaY:     multFlavourMinusSigmaY(ret, arg);     break;
   case Alg::SigmaZ:          multFlavourSigmaZ(ret, arg);          break;
   case Alg::MinusSigmaZ:     multFlavourMinusSigmaZ(ret, arg);     break;
   case Alg::Identity:        multFlavourIdentity(ret, arg);        break;
   case Alg::MinusIdentity:   multFlavourMinusIdentity(ret, arg);   break;
   case Alg::ProjPlus:        multFlavourProjPlus(ret, arg);        break;
   case Alg::MinusProjPlus:   multFlavourMinusProjPlus(ret, arg);   break;
   case Alg::ProjMinus:       multFlavourProjMinus(ret, arg);       break;
   case Alg::MinusProjMinus:  multFlavourMinusProjMinus(ret, arg);  break;
   default:                   assert(0);
   }
   return ret;
 }
 // Flavour matrix times flavour matrix from the left: returns G * arg.
 // Dispatches on the tag stored in G to the matching lmultFlavour* kernel.
 // A tag with no matching case trips assert(0).
 template<class vtype>
 accelerator_inline auto operator*(const GparityFlavour &G, const iMatrix<vtype, Ngp> &arg)
 ->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Ngp>, GparityFlavourTensorIndex>::value, iMatrix<vtype, Ngp>>::type
 {
   using Alg = GparityFlavour::Algebra;

   iMatrix<vtype, Ngp> ret;
   switch (G.g)
   {
   case Alg::SigmaX:          lmultFlavourSigmaX(ret, arg);          break;
   case Alg::MinusSigmaX:     lmultFlavourMinusSigmaX(ret, arg);     break;
   case Alg::SigmaY:          lmultFlavourSigmaY(ret, arg);          break;
   case Alg::MinusSigmaY:     lmultFlavourMinusSigmaY(ret, arg);     break;
   case Alg::SigmaZ:          lmultFlavourSigmaZ(ret, arg);          break;
   case Alg::MinusSigmaZ:     lmultFlavourMinusSigmaZ(ret, arg);     break;
   case Alg::Identity:        lmultFlavourIdentity(ret, arg);        break;
   case Alg::MinusIdentity:   lmultFlavourMinusIdentity(ret, arg);   break;
   case Alg::ProjPlus:        lmultFlavourProjPlus(ret, arg);        break;
   case Alg::MinusProjPlus:   lmultFlavourMinusProjPlus(ret, arg);   break;
   case Alg::ProjMinus:       lmultFlavourProjMinus(ret, arg);       break;
   case Alg::MinusProjMinus:  lmultFlavourMinusProjMinus(ret, arg);  break;
   default:                   assert(0);
   }
   return ret;
 }
 // Flavour matrix times flavour matrix from the right: returns arg * G.
 // Dispatches on the tag stored in G to the matching rmultFlavour* kernel.
 // A tag with no matching case trips assert(0).
 template<class vtype>
 accelerator_inline auto operator*(const iMatrix<vtype, Ngp> &arg, const GparityFlavour &G)
 ->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Ngp>, GparityFlavourTensorIndex>::value, iMatrix<vtype, Ngp>>::type
 {
   using Alg = GparityFlavour::Algebra;

   iMatrix<vtype, Ngp> ret;
   switch (G.g)
   {
   case Alg::SigmaX:          rmultFlavourSigmaX(ret, arg);          break;
   case Alg::MinusSigmaX:     rmultFlavourMinusSigmaX(ret, arg);     break;
   case Alg::SigmaY:          rmultFlavourSigmaY(ret, arg);          break;
   case Alg::MinusSigmaY:     rmultFlavourMinusSigmaY(ret, arg);     break;
   case Alg::SigmaZ:          rmultFlavourSigmaZ(ret, arg);          break;
   case Alg::MinusSigmaZ:     rmultFlavourMinusSigmaZ(ret, arg);     break;
   case Alg::Identity:        rmultFlavourIdentity(ret, arg);        break;
   case Alg::MinusIdentity:   rmultFlavourMinusIdentity(ret, arg);   break;
   case Alg::ProjPlus:        rmultFlavourProjPlus(ret, arg);        break;
   case Alg::MinusProjPlus:   rmultFlavourMinusProjPlus(ret, arg);   break;
   case Alg::ProjMinus:       rmultFlavourProjMinus(ret, arg);       break;
   case Alg::MinusProjMinus:  rmultFlavourMinusProjMinus(ret, arg);  break;
   default:                   assert(0);
   }
   return ret;
 }
 NAMESPACE_END(Grid);
 #endif // include guard
--- a/Grid/qcd/hmc/GenericHMCrunner.h
+++ b/Grid/qcd/hmc/GenericHMCrunner.h
@ -129,18 +129,10 @@ public:
    Runner(S);
  }
-  //////////////////////////////////////////////////////////////////
+  //Use the checkpointer to initialize the RNGs and the gauge field, writing the resulting gauge field into U.
-
+  //This is called automatically by Run but may be useful elsewhere, e.g. for integrator tuning experiments
-private:
+  void initializeGaugeFieldAndRNGs(Field &U){
-  template <class SmearingPolicy>
+    if(!Resources.haveRNGs()) Resources.AddRNGs();
  void Runner(SmearingPolicy &Smearing) {
    auto UGrid = Resources.GetCartesian();
    Resources.AddRNGs();
    Field U(UGrid);
    // Can move this outside?
    typedef IntegratorType<SmearingPolicy> TheIntegrator;
    TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing);
    if (Parameters.StartingType == "HotStart") {
      // Hot start
@ -159,14 +151,40 @@ private:
      Resources.GetCheckPointer()->CheckpointRestore(Parameters.StartTrajectory, U,
 						     Resources.GetSerialRNG(),
 						     Resources.GetParallelRNG());
    } else if (Parameters.StartingType == "CheckpointStartReseed") {
      // Same as CheckpointRestart but reseed the RNGs using the fixed integer seeding used for ColdStart and HotStart
      // Useful for creating new evolution streams from an existing stream
      // WARNING: Unfortunately because the checkpointer doesn't presently allow us to separately restore the RNG and gauge fields we have to load
      // an existing RNG checkpoint first; make sure one is available and named correctly
      Resources.GetCheckPointer()->CheckpointRestore(Parameters.StartTrajectory, U,
 						     Resources.GetSerialRNG(),
 						     Resources.GetParallelRNG());
      Resources.SeedFixedIntegers();      
    } else {
      // others
      std::cout << GridLogError << "Unrecognized StartingType\n";
      std::cout
 	<< GridLogError
-	<< "Valid [HotStart, ColdStart, TepidStart, CheckpointStart]\n";
+	<< "Valid [HotStart, ColdStart, TepidStart, CheckpointStart, CheckpointStartReseed]\n";
      exit(1);
    }
  }
  //////////////////////////////////////////////////////////////////
 private:
  template <class SmearingPolicy>
  void Runner(SmearingPolicy &Smearing) {
    auto UGrid = Resources.GetCartesian();
    Field U(UGrid);
    initializeGaugeFieldAndRNGs(U);
    typedef IntegratorType<SmearingPolicy> TheIntegrator;
    TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing);
    Smearing.set_Field(U);
--- a/Grid/qcd/hmc/HMC.h
+++ b/Grid/qcd/hmc/HMC.h
@ -115,21 +115,21 @@ private:
    random(sRNG, rn_test);
-    std::cout << GridLogMessage
+    std::cout << GridLogHMC
              << "--------------------------------------------------\n";
-    std::cout << GridLogMessage << "exp(-dH) = " << prob
+    std::cout << GridLogHMC << "exp(-dH) = " << prob
              << "  Random = " << rn_test << "\n";
-    std::cout << GridLogMessage
+    std::cout << GridLogHMC
              << "Acc. Probability = " << ((prob < 1.0) ? prob : 1.0) << "\n";
    if ((prob > 1.0) || (rn_test <= prob)) {  // accepted
-      std::cout << GridLogMessage << "Metropolis_test -- ACCEPTED\n";
+      std::cout << GridLogHMC << "Metropolis_test -- ACCEPTED\n";
-      std::cout << GridLogMessage
+      std::cout << GridLogHMC
                << "--------------------------------------------------\n";
      return true;
    } else {  // rejected
-      std::cout << GridLogMessage << "Metropolis_test -- REJECTED\n";
+      std::cout << GridLogHMC << "Metropolis_test -- REJECTED\n";
-      std::cout << GridLogMessage
+      std::cout << GridLogHMC
                << "--------------------------------------------------\n";
      return false;
    }
@ -145,7 +145,7 @@ private:
    std::streamsize current_precision = std::cout.precision();
    std::cout.precision(15);
-    std::cout << GridLogMessage << "Total H before trajectory = " << H0 << "\n";
+    std::cout << GridLogHMC << "Total H before trajectory = " << H0 << "\n";
    std::cout.precision(current_precision);
    TheIntegrator.integrate(U);
@ -165,7 +165,7 @@ private:
    std::cout.precision(15);
-    std::cout << GridLogMessage << "Total H after trajectory  = " << H1
+    std::cout << GridLogHMC << "Total H after trajectory  = " << H1
 	      << "  dH = " << H1 - H0 << "\n";
    std::cout.precision(current_precision);
@ -196,9 +196,9 @@ public:
    // Actual updates (evolve a copy Ucopy then copy back eventually)
    unsigned int FinalTrajectory = Params.Trajectories + Params.NoMetropolisUntil + Params.StartTrajectory;
    for (int traj = Params.StartTrajectory; traj < FinalTrajectory; ++traj) {
-      std::cout << GridLogMessage << "-- # Trajectory = " << traj << "\n";
+      std::cout << GridLogHMC << "-- # Trajectory = " << traj << "\n";
      if (traj < Params.StartTrajectory + Params.NoMetropolisUntil) {
-      	std::cout << GridLogMessage << "-- Thermalization" << std::endl;
+      	std::cout << GridLogHMC << "-- Thermalization" << std::endl;
      }
      double t0=usecond();
@ -207,10 +207,10 @@ public:
      DeltaH = evolve_hmc_step(Ucopy);
      // Metropolis-Hastings test
      bool accept = true;
-      if (traj >= Params.StartTrajectory + Params.NoMetropolisUntil) {
+      if (Params.MetropolisTest && traj >= Params.StartTrajectory + Params.NoMetropolisUntil) {
        accept = metropolis_test(DeltaH);
      } else {
-      	std::cout << GridLogMessage << "Skipping Metropolis test" << std::endl;
+      	std::cout << GridLogHMC << "Skipping Metropolis test" << std::endl;
      }
      if (accept)
@ -219,7 +219,7 @@ public:
      double t1=usecond();
-      std::cout << GridLogMessage << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl;
+      std::cout << GridLogHMC << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl;
      for (int obs = 0; obs < Observables.size(); obs++) {
@ -228,7 +228,7 @@ public:
      	std::cout << GridLogDebug << "Observables pointer " << Observables[obs] << std::endl;
        Observables[obs]->TrajectoryComplete(traj + 1, Ucur, sRNG, pRNG);
      }
-      std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl;
+      std::cout << GridLogHMC << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl;
    }
  }
--- a/Grid/qcd/hmc/HMCModules.h
+++ b/Grid/qcd/hmc/HMCModules.h
@ -80,7 +80,9 @@ public:
      std::cout << GridLogError << "Seeds not initialized" << std::endl;
      exit(1);
    }
    std::cout << GridLogMessage << "Reseeding serial RNG with seed vector " << SerialSeeds << std::endl;
    sRNG_.SeedFixedIntegers(SerialSeeds);
    std::cout << GridLogMessage << "Reseeding parallel RNG with seed vector " << ParallelSeeds << std::endl;
    pRNG_->SeedFixedIntegers(ParallelSeeds);
  }
 };
--- a/Grid/qcd/hmc/HMCResourceManager.h
+++ b/Grid/qcd/hmc/HMCResourceManager.h
@ -226,6 +226,9 @@ public:
  //////////////////////////////////////////////////////
  // Random number generators
  //////////////////////////////////////////////////////
  //Query whether the RNG objects have been instantiated (reports the have_RNG flag)
  bool haveRNGs() const {
    return have_RNG;
  }
  void AddRNGs(std::string s = "") {
    // Couple the RNGs to the GridModule tagged by s
--- a/Grid/qcd/hmc/integrators/Integrator.h
+++ b/Grid/qcd/hmc/integrators/Integrator.h
@ -136,8 +136,14 @@ protected:
      if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
      force = FieldImplementation::projectForce(force); // Ta for gauge fields
      double end_force = usecond();
-      Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites());
+
-      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << std::endl;
+      Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x]) 
      Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    
      Real max_force_abs = std::sqrt(maxLocalNorm2(force));
      Real max_impulse_abs = max_force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    
      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << " Max force: " << max_force_abs << " Time step: " << ep << " Impulse average: " << impulse_abs << " Max impulse: " << max_impulse_abs << std::endl;
      Mom -= force * ep* HMC_MOMENTUM_DENOMINATOR;; 
      double end_full = usecond();
      double time_full  = (end_full - start_full) / 1e3;
@ -249,15 +255,19 @@ public:
  void refresh(Field& U,  GridSerialRNG & sRNG, GridParallelRNG& pRNG) 
  {
    assert(P.Grid() == U.Grid());
-    std::cout << GridLogIntegrator << "Integrator refresh\n";
+    std::cout << GridLogIntegrator << "Integrator refresh" << std::endl;
    std::cout << GridLogIntegrator << "Generating momentum" << std::endl;
    FieldImplementation::generate_momenta(P, sRNG, pRNG);
    // Update the smeared fields, can be implemented as observer
    // necessary to keep the fields updated even after a reject
    // of the Metropolis
    std::cout << GridLogIntegrator << "Updating smeared fields" << std::endl;
    Smearer.set_Field(U);
    // Set the (eventual) representations gauge fields
    std::cout << GridLogIntegrator << "Updating representations" << std::endl;
    Representations.update(U);
    // The Smearer is attached to a pointer of the gauge field
@ -267,6 +277,7 @@ public:
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
        // get gauge field from the SmearingPolicy and
        // based on the boolean is_smeared in actionID
 	std::cout << GridLogIntegrator << "Refreshing integrator level " << level << " index " << actionID << std::endl;
        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
        as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG);
      }
--- a/Grid/qcd/observables/topological_charge.h
+++ b/Grid/qcd/observables/topological_charge.h
@ -99,7 +99,7 @@ public:
 	// using wilson flow by default here
 	WilsonFlow<PeriodicGimplR> WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval);
 	WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau);
-	Real T0   = WF.energyDensityPlaquette(Usmear);
+	Real T0   = WF.energyDensityPlaquette(Pars.Smearing.maxTau, Usmear);
 	std::cout << GridLogMessage << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
 		  << "T0                : [ " << traj << " ] "<< T0 << std::endl;
      }
--- a/Grid/qcd/smearing/WilsonFlow.h
+++ b/Grid/qcd/smearing/WilsonFlow.h
@ -7,6 +7,7 @@ Source file: ./lib/qcd/modules/plaquette.h
 Copyright (C) 2017
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
 Author: Christopher Kelly <ckelly@bnl.gov>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@ -33,28 +34,44 @@ NAMESPACE_BEGIN(Grid);
 template <class Gimpl>
 class WilsonFlow: public Smear<Gimpl>{
 public:
  //Store generic measurements to take during smearing process using std::function
  typedef std::function<void(int, RealD, const typename Gimpl::GaugeField &)> FunctionType;  //int: step,  RealD: flow time,  GaugeField : the gauge field
 private:
  unsigned int Nstep;
-  unsigned int measure_interval;
+  RealD epsilon; //for regular smearing this is the time step, for adaptive it is the initial time step
-  mutable RealD epsilon, taus;
+ 
-
+  std::vector< std::pair<int, FunctionType> > functions; //The int maps to the measurement frequency
  mutable WilsonGaugeAction<Gimpl> SG;
-  void evolve_step(typename Gimpl::GaugeField&) const;
+  //Evolve the gauge field by 1 step and update tau
-  void evolve_step_adaptive(typename Gimpl::GaugeField&, RealD);
+  void evolve_step(typename Gimpl::GaugeField &U, RealD &tau) const;
-  RealD tau(unsigned int t)const {return epsilon*(t+1.0); }
+  //Evolve the gauge field by 1 step and update tau and the current time step eps
  void evolve_step_adaptive(typename Gimpl::GaugeField&U, RealD &tau, RealD &eps, RealD maxTau) const;
 public:
  INHERIT_GIMPL_TYPES(Gimpl)
  void resetActions(){ functions.clear(); }
  void addMeasurement(int meas_interval, FunctionType meas){ functions.push_back({meas_interval, meas}); }
  //Set the class to perform the default measurements: 
  //the plaquette energy density every step
  //the plaquette topological charge every 'topq_meas_interval' steps
  //and output to stdout
  void setDefaultMeasurements(int topq_meas_interval = 1);
  explicit WilsonFlow(unsigned int Nstep, RealD epsilon, unsigned int interval = 1):
  Nstep(Nstep),
    epsilon(epsilon),
    measure_interval(interval),
    SG(WilsonGaugeAction<Gimpl>(3.0)) {
    // WilsonGaugeAction with beta 3.0
    assert(epsilon > 0.0);
    LogMessage();
    setDefaultMeasurements(interval);
  }
  void LogMessage() {
@ -73,9 +90,29 @@ public:
    // undefined for WilsonFlow
  }
-  void smear_adaptive(GaugeField&, const GaugeField&, RealD maxTau);
+  void smear_adaptive(GaugeField&, const GaugeField&, RealD maxTau) const;
-  RealD energyDensityPlaquette(unsigned int step, const GaugeField& U) const;
+
-  RealD energyDensityPlaquette(const GaugeField& U) const;
+  //Compute t^2 <E(t)> for time t from the plaquette
  static RealD energyDensityPlaquette(const RealD t, const GaugeField& U);
  //Compute t^2 <E(t)> for time t from the 1x1 cloverleaf form
  //t is the Wilson flow time
  static RealD energyDensityCloverleaf(const RealD t, const GaugeField& U);
  //Evolve the gauge field by Nstep steps of epsilon and return the energy density computed every interval steps
  //The smeared field is output as V
  std::vector<RealD> flowMeasureEnergyDensityPlaquette(GaugeField &V, const GaugeField& U, int measure_interval = 1);
  //Version that does not return the smeared field
  std::vector<RealD> flowMeasureEnergyDensityPlaquette(const GaugeField& U, int measure_interval = 1);
  //Evolve the gauge field by Nstep steps of epsilon and return the Cloverleaf energy density computed every interval steps
  //The smeared field is output as V
  std::vector<RealD> flowMeasureEnergyDensityCloverleaf(GaugeField &V, const GaugeField& U, int measure_interval = 1);
  //Version that does not return the smeared field
  std::vector<RealD> flowMeasureEnergyDensityCloverleaf(const GaugeField& U, int measure_interval = 1);
 };
@ -83,7 +120,7 @@ public:
 // Implementations
 ////////////////////////////////////////////////////////////////////////////////
 template <class Gimpl>
-void WilsonFlow<Gimpl>::evolve_step(typename Gimpl::GaugeField &U) const{
+void WilsonFlow<Gimpl>::evolve_step(typename Gimpl::GaugeField &U, RealD &tau) const{
  GaugeField Z(U.Grid());
  GaugeField tmp(U.Grid());
  SG.deriv(U, Z);
@ -99,12 +136,13 @@ void WilsonFlow<Gimpl>::evolve_step(typename Gimpl::GaugeField &U) const{
  SG.deriv(U, tmp); Z += tmp;                 // 4/3*(17/36*Z0 -8/9*Z1) +Z2
  Z *= 3.0/4.0;                               // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2
  Gimpl::update_field(Z, U, -2.0*epsilon);    // V(t+e) = exp(ep*Z)*W2
  tau += epsilon;
 }
 template <class Gimpl>
-void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, RealD maxTau) {
+void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, RealD &tau, RealD &eps, RealD maxTau) const{
-  if (maxTau - taus < epsilon){
+  if (maxTau - tau < eps){
-    epsilon = maxTau-taus;
+    eps = maxTau-tau;
  }
  //std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
  GaugeField Z(U.Grid());
@ -114,95 +152,151 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
  SG.deriv(U, Z);
  Zprime = -Z;
  Z *= 0.25;                                  // Z0 = 1/4 * F(U)
-  Gimpl::update_field(Z, U, -2.0*epsilon);    // U = W1 = exp(ep*Z0)*W0
+  Gimpl::update_field(Z, U, -2.0*eps);    // U = W1 = exp(ep*Z0)*W0
  Z *= -17.0/8.0;
  SG.deriv(U, tmp); Z += tmp;                 // -17/32*Z0 +Z1
  Zprime += 2.0*tmp;
  Z *= 8.0/9.0;                               // Z = -17/36*Z0 +8/9*Z1
-  Gimpl::update_field(Z, U, -2.0*epsilon);    // U_= W2 = exp(ep*Z)*W1
+  Gimpl::update_field(Z, U, -2.0*eps);    // U_= W2 = exp(ep*Z)*W1
  Z *= -4.0/3.0;
  SG.deriv(U, tmp); Z += tmp;                 // 4/3*(17/36*Z0 -8/9*Z1) +Z2
  Z *= 3.0/4.0;                               // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2
-  Gimpl::update_field(Z, U, -2.0*epsilon);    // V(t+e) = exp(ep*Z)*W2
+  Gimpl::update_field(Z, U, -2.0*eps);    // V(t+e) = exp(ep*Z)*W2
  // Ramos 
-  Gimpl::update_field(Zprime, Uprime, -2.0*epsilon); // V'(t+e) = exp(ep*Z')*W0
+  Gimpl::update_field(Zprime, Uprime, -2.0*eps); // V'(t+e) = exp(ep*Z')*W0
  // Compute distance as norm^2 of the difference
  GaugeField diffU = U - Uprime;
  RealD diff = norm2(diffU);
  // adjust integration step
-  taus += epsilon;
+  tau += eps;
  //std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
-  epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.);
+  eps = eps*0.95*std::pow(1e-4/diff,1./3.);
  //std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
 }
 template <class Gimpl>
-RealD WilsonFlow<Gimpl>::energyDensityPlaquette(unsigned int step, const GaugeField& U) const {
+RealD WilsonFlow<Gimpl>::energyDensityPlaquette(const RealD t, const GaugeField& U){
-  RealD td = tau(step);
+  static WilsonGaugeAction<Gimpl> SG(3.0);
-  return 2.0 * td * td * SG.S(U)/U.Grid()->gSites();
+  return 2.0 * t * t * SG.S(U)/U.Grid()->gSites();
 }
 //Cloverleaf energy density scaled by the flow time squared: returns t^2 <E(t)>,
 //where t is the Wilson flow time and E = 1/2 tr( F_munu F_munu ) is built from the
 //field strength supplied by WilsonLoops<Gimpl>::FieldStrength.
 //Only meaningful in four dimensions (asserted).
 template <class Gimpl>
 RealD WilsonFlow<Gimpl>::energyDensityCloverleaf(const RealD t, const GaugeField& U){
  assert(Nd == 4);

  typename Gimpl::GaugeLinkField Fmunu(U.Grid());
  LatticeComplexD traceSum(U.Grid());
  traceSum = Zero();

  //F_numu = -F_munu, so summing tr(F^2) over the six planes with mu < nu is sufficient:
  //(0,1) (0,2) (0,3)   (1,2) (1,3)   (2,3)
  for(int mu=0; mu<3; ++mu){
    for(int nu=mu+1; nu<4; ++nu){
      WilsonLoops<Gimpl>::FieldStrength(Fmunu, U, mu, nu);
      traceSum = traceSum + trace(Fmunu*Fmunu);
    }
  }

  //Global sum, then normalise by the lattice volume and scale by t^2
  const ComplexD total = sum(traceSum);
  const ComplexD scaled = t*t*total / RealD(U.Grid()->gSites());
  return -real(scaled); //the sign flip makes the energy positive
 }
 //Flow U with the fixed-step smear(), leaving the smeared field in V, and collect the
 //plaquette energy density t^2 <E(t)> on every step that is a multiple of measure_interval.
 //Any measurements registered beforehand are cleared and replaced by the collector.
 template <class Gimpl>
 std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityPlaquette(GaugeField &V, const GaugeField& U, int measure_interval){
  std::vector<RealD> densities;

  //Measurement callback: log the step and append the energy density at flow time t
  auto collect = [&densities](int step, RealD t, const typename Gimpl::GaugeField &Uflow){
      std::cout << GridLogMessage << "[WilsonFlow] Computing plaquette energy density for step " << step << std::endl;
      densities.push_back( energyDensityPlaquette(t,Uflow) );
    };

  resetActions();
  addMeasurement(measure_interval, collect);
  smear(V,U);
  return densities;
 }
 template <class Gimpl>
-RealD WilsonFlow<Gimpl>::energyDensityPlaquette(const GaugeField& U) const {
+std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityPlaquette(const GaugeField& U, int measure_interval){
-  return 2.0 * taus * taus * SG.S(U)/U.Grid()->gSites();
+  GaugeField V(U);
  return flowMeasureEnergyDensityPlaquette(V,U, measure_interval);
 }
 //Flow U with the fixed-step smear(), leaving the smeared field in V, and collect the
 //cloverleaf energy density t^2 <E(t)> on every step that is a multiple of measure_interval.
 //Any measurements registered beforehand are cleared and replaced by the collector.
 template <class Gimpl>
 std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityCloverleaf(GaugeField &V, const GaugeField& U, int measure_interval){
  std::vector<RealD> densities;

  //Measurement callback: log the step and append the energy density at flow time t
  auto collect = [&densities](int step, RealD t, const typename Gimpl::GaugeField &Uflow){
      std::cout << GridLogMessage << "[WilsonFlow] Computing Cloverleaf energy density for step " << step << std::endl;
      densities.push_back( energyDensityCloverleaf(t,Uflow) );
    };

  resetActions();
  addMeasurement(measure_interval, collect);
  smear(V,U);
  return densities;
 }
 //Convenience overload for callers that do not need the smeared field: the flow is
 //run into a local scratch field that is discarded on return.
 template <class Gimpl>
 std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityCloverleaf(const GaugeField& U, int measure_interval){
  GaugeField scratch(U);
  return flowMeasureEnergyDensityCloverleaf(scratch, U, measure_interval);
 }
 //#define WF_TIMING 
 template <class Gimpl>
-void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const {
+void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const{
  out = in;
-  for (unsigned int step = 1; step <= Nstep; step++) {
+  RealD taus = 0.;
  for (unsigned int step = 1; step <= Nstep; step++) { //step indicates the number of smearing steps applied at the time of measurement
    auto start = std::chrono::high_resolution_clock::now();
-    evolve_step(out);
+    evolve_step(out, taus);
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
 #ifdef WF_TIMING
    std::cout << "Time to evolve " << diff.count() << " s\n";
 #endif
-    std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
+    //Perform measurements
-		  << step << "  " << tau(step) << "  " 
+    for(auto const &meas : functions)
-	      << energyDensityPlaquette(step,out) << std::endl;
+      if( step % meas.first == 0 ) meas.second(step,taus,out);
    if( step % measure_interval == 0){
      std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "
 		<< step << "  " 
 		<< WilsonLoops<PeriodicGimplR>::TopologicalCharge(out) << std::endl;
    }
  }
 }
 template <class Gimpl>
-void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, RealD maxTau){
+void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, RealD maxTau) const{
  out = in;
-  taus = epsilon;
+  RealD taus = 0.;
  RealD eps = epsilon;
  unsigned int step = 0;
  do{
    step++;
    //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
-    evolve_step_adaptive(out, maxTau);
+    evolve_step_adaptive(out, taus, eps, maxTau);
-    std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
+    //Perform measurements
-		  << step << "  " << taus << "  "
+    for(auto const &meas : functions)
-	      << energyDensityPlaquette(out) << std::endl;
+      if( step % meas.first == 0 ) meas.second(step,taus,out);
    if( step % measure_interval == 0){
      std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "
 		<< step << "  " 
 		<< WilsonLoops<PeriodicGimplR>::TopologicalCharge(out) << std::endl;
    }
  } while (taus < maxTau);
 }
 //Register the two default stdout measurements:
 // - the plaquette energy density, printed on every step
 // - the topological charge, printed every topq_meas_interval steps
 //Existing measurements are kept; these are appended to them.
 template <class Gimpl>
 void WilsonFlow<Gimpl>::setDefaultMeasurements(int topq_meas_interval){
  auto printEnergyDensity = [](int step, RealD t, const typename Gimpl::GaugeField &U){
      std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "  << step << "  " << t << "  " << energyDensityPlaquette(t,U) << std::endl;
    };
  auto printTopCharge = [](int step, RealD t, const typename Gimpl::GaugeField &U){
      std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "  << step << "  " << WilsonLoops<Gimpl>::TopologicalCharge(U) << std::endl;
    };

  addMeasurement(1, printEnergyDensity);
  addMeasurement(topq_meas_interval, printTopCharge);
 }
 NAMESPACE_END(Grid);
--- a/Grid/qcd/utils/CovariantCshift.h
+++ b/Grid/qcd/utils/CovariantCshift.h
@ -88,6 +88,12 @@ namespace PeriodicBC {
    return CovShiftBackward(Link,mu,arg);
  }
  //C-shift of gauge links / gauge transformation matrices for this boundary-condition namespace.
  //Here it simply forwards to Cshift with the same direction and shift.
  template<class gauge>
  Lattice<gauge> CshiftLink(const Lattice<gauge> &Link, int mu, int shift) {
    return Cshift(Link, mu, shift);
  }
 }
@ -158,6 +164,9 @@ namespace ConjugateBC {
    //    std::cout<<"Gparity::CovCshiftBackward mu="<<mu<<std::endl;
    return Cshift(tmp,mu,-1);// moves towards positive mu
  }
  //Out(x) = U^dag_\mu(x-mu)  | x_\mu != 0
  //       = U^T_\mu(L-1)  | x_\mu == 0
  template<class gauge> Lattice<gauge>
  CovShiftIdentityBackward(const Lattice<gauge> &Link, int mu) {
    GridBase *grid = Link.Grid();
@ -176,6 +185,9 @@ namespace ConjugateBC {
    return Link;
  }
  //Out(x) = S_\mu(x+\hat\mu)  | x_\mu != L-1
  //       = S*_\mu(0)  | x_\mu == L-1
  //Note: While this is used for Staples it is also applicable for shifting gauge links or gauge transformation matrices
  template<class gauge> Lattice<gauge>
  ShiftStaple(const Lattice<gauge> &Link, int mu)
  {
@ -208,6 +220,35 @@ namespace ConjugateBC {
    return CovShiftBackward(Link,mu,arg);
  }
  //Boundary-aware C-shift of gauge links / gauge transformation matrices.
  //Sites whose data crosses the boundary in direction mu pick up a complex conjugation.
  //  shift = +1 :  Out(x) = U_\mu(x+\hat\mu)   for x_\mu != L-1
  //                       = U*_\mu(0)          for x_\mu == L-1
  //  shift = -1 :  Out(x) = U_\mu(x-\hat\mu)   for x_\mu != 0
  //                       = U*_\mu(L-1)        for x_\mu == 0
  //Any other shift value is rejected with an assert.
  template<class gauge> Lattice<gauge>
  CshiftLink(const Lattice<gauge> &Link, int mu, int shift)
  {
    GridBase *grid = Link.Grid();
    int lastSite = grid->GlobalDimensions()[mu] - 1;   //coordinate L-1 in direction mu

    Lattice<iScalar<vInteger>> coordMu(grid);
    LatticeCoordinate(coordMu, mu);

    Lattice<gauge> shifted(grid);
    switch(shift){
    case 1:
      //Shift first, then conjugate the sites at x_mu == L-1 (their data came from x_mu == 0)
      shifted = Cshift(Link, mu, 1);
      shifted = where(coordMu == lastSite, conjugate(shifted), shifted);
      return shifted;
    case -1:
      //Conjugate the x_mu == L-1 slice first, then shift it across the boundary to x_mu == 0
      shifted = Link;
      shifted = where(coordMu == lastSite, conjugate(shifted), shifted);
      return Cshift(shifted, mu, -1);
    default:
      assert(0 && "Invalid shift value");
    }
    return shifted; //shuts up the compiler fussing about the return type
  }
 }
--- a/Grid/qcd/utils/GaugeFix.h
+++ b/Grid/qcd/utils/GaugeFix.h
@ -40,27 +40,46 @@ public:
  typedef typename Gimpl::GaugeLinkField GaugeMat;
  typedef typename Gimpl::GaugeField GaugeLorentz;
-  static void GaugeLinkToLieAlgebraField(const std::vector<GaugeMat> &U,std::vector<GaugeMat> &A) {
+  //A_\mu(x) = -i Ta(U_\mu(x) )   where Ta(U) = 1/2( U - U^dag ) - 1/2N tr(U - U^dag)  is the traceless antihermitian part. This is an O(A^3) approximation to the logarithm of U
-    for(int mu=0;mu<Nd;mu++){
+  static void GaugeLinkToLieAlgebraField(const GaugeMat &U, GaugeMat &A) {
-      Complex cmi(0.0,-1.0);
+    Complex cmi(0.0,-1.0);
-      A[mu] = Ta(U[mu]) * cmi;
+    A = Ta(U) * cmi;
    }
  }
-  static void DmuAmu(const std::vector<GaugeMat> &A,GaugeMat &dmuAmu,int orthog) {
+  
  //The derivative of the Lie algebra field
  static void DmuAmu(const std::vector<GaugeMat> &U, GaugeMat &dmuAmu,int orthog) {
    GridBase* grid = U[0].Grid();
    GaugeMat Ax(grid);
    GaugeMat Axm1(grid);
    GaugeMat Utmp(grid);
    dmuAmu=Zero();
    for(int mu=0;mu<Nd;mu++){
      if ( mu != orthog ) {
-	dmuAmu = dmuAmu + A[mu] - Cshift(A[mu],mu,-1);
+	//Rather than define functionality to work out how the BCs apply to A_\mu we simply use the BC-aware Cshift to the gauge links and compute A_\mu(x) and A_\mu(x-1) separately
 	//Ax = A_\mu(x)
 	GaugeLinkToLieAlgebraField(U[mu], Ax);
 	//Axm1 = A_\mu(x_\mu-1)
 	Utmp = Gimpl::CshiftLink(U[mu], mu, -1);
 	GaugeLinkToLieAlgebraField(Utmp, Axm1);
 	//Derivative
 	dmuAmu = dmuAmu + Ax - Axm1;
      }
    }
  }  
-  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1,bool err_on_no_converge=true) {
+  //Fix the gauge field Umu
  //0 < alpha < 1 is related to the step size, cf https://arxiv.org/pdf/1405.5812.pdf
  static void SteepestDescentGaugeFix(GaugeLorentz &Umu, Real alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {
    GridBase *grid = Umu.Grid();
    GaugeMat xform(grid);
-    SteepestDescentGaugeFix(Umu,xform,alpha,maxiter,Omega_tol,Phi_tol,Fourier,orthog,err_on_no_converge);
+    SteepestDescentGaugeFix(Umu,xform,alpha,maxiter,Omega_tol,Phi_tol,Fourier,orthog);
  }
-  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1,bool err_on_no_converge=true) {
+
  //Fix the gauge field Umu and also return the gauge transformation from the original gauge field, xform
  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform, Real alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {
    GridBase *grid = Umu.Grid();
@ -122,29 +141,24 @@ public:
      }
    }
-    std::cout << GridLogError << "Gauge fixing did not converge in " << maxiter << " iterations." << std::endl;
+    assert(0 && "Gauge fixing did not converge within the specified number of iterations");
    if (err_on_no_converge) assert(0);
  };
-  static Real SteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform,Real & alpha, GaugeMat & dmuAmu,int orthog) {
+  static Real SteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform, Real alpha, GaugeMat & dmuAmu,int orthog) {
    GridBase *grid = U[0].Grid();
    std::vector<GaugeMat> A(Nd,grid);
    GaugeMat g(grid);
-
+    ExpiAlphaDmuAmu(U,g,alpha,dmuAmu,orthog);
    GaugeLinkToLieAlgebraField(U,A);
    ExpiAlphaDmuAmu(A,g,alpha,dmuAmu,orthog);
    Real vol = grid->gSites();
    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
    xform = g*xform ;
-    SU<Nc>::GaugeTransform(U,g);
+    SU<Nc>::GaugeTransform<Gimpl>(U,g);
    return trG;
  }
-  static Real FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform,Real & alpha, GaugeMat & dmuAmu,int orthog) {
+  static Real FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform, Real alpha, GaugeMat & dmuAmu,int orthog) {
    GridBase *grid = U[0].Grid();
@ -159,11 +173,7 @@ public:
    GaugeMat g(grid);
    GaugeMat dmuAmu_p(grid);
-    std::vector<GaugeMat> A(Nd,grid);
+    DmuAmu(U,dmuAmu,orthog);
    GaugeLinkToLieAlgebraField(U,A);
    DmuAmu(A,dmuAmu,orthog);
    std::vector<int> mask(Nd,1);
    for(int mu=0;mu<Nd;mu++) if (mu==orthog) mask[mu]=0;
@ -207,16 +217,16 @@ public:
    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
    xform = g*xform ;
-    SU<Nc>::GaugeTransform(U,g);
+    SU<Nc>::GaugeTransform<Gimpl>(U,g);
    return trG;
  }
-  static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &A,GaugeMat &g,Real & alpha, GaugeMat &dmuAmu,int orthog) {
+  static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &U,GaugeMat &g, Real alpha, GaugeMat &dmuAmu,int orthog) {
    GridBase *grid = g.Grid();
    Complex cialpha(0.0,-alpha);
    GaugeMat ciadmam(grid);
-    DmuAmu(A,dmuAmu,orthog);
+    DmuAmu(U,dmuAmu,orthog);
    ciadmam = dmuAmu*cialpha;
    SU<Nc>::taExp(ciadmam,g);
  }  
--- a/Grid/qcd/utils/SUn.h
+++ b/Grid/qcd/utils/SUn.h
@ -694,32 +694,32 @@ public:
 * Adjoint rep gauge xform
 */
-  template<typename GaugeField,typename GaugeMat>
+  template<typename Gimpl>
-  static void GaugeTransform( GaugeField &Umu, GaugeMat &g){
+  static void GaugeTransform(typename Gimpl::GaugeField &Umu, typename Gimpl::GaugeLinkField &g){
    GridBase *grid = Umu.Grid();
    conformable(grid,g.Grid());
-    GaugeMat U(grid);
+    typename Gimpl::GaugeLinkField U(grid);
-    GaugeMat ag(grid); ag = adj(g);
+    typename Gimpl::GaugeLinkField ag(grid); ag = adj(g);
    for(int mu=0;mu<Nd;mu++){
      U= PeekIndex<LorentzIndex>(Umu,mu);
-      U = g*U*Cshift(ag, mu, 1);
+      U = g*U*Gimpl::CshiftLink(ag, mu, 1); //BC-aware
      PokeIndex<LorentzIndex>(Umu,U,mu);
    }
  }
-  template<typename GaugeMat>
+  template<typename Gimpl>
-  static void GaugeTransform( std::vector<GaugeMat> &U, GaugeMat &g){
+  static void GaugeTransform( std::vector<typename Gimpl::GaugeLinkField> &U, typename Gimpl::GaugeLinkField &g){
    GridBase *grid = g.Grid();
-    GaugeMat ag(grid); ag = adj(g);
+    typename Gimpl::GaugeLinkField ag(grid); ag = adj(g);
    for(int mu=0;mu<Nd;mu++){
-      U[mu] = g*U[mu]*Cshift(ag, mu, 1);
+      U[mu] = g*U[mu]*Gimpl::CshiftLink(ag, mu, 1); //BC-aware
    }
  }
-  template<typename GaugeField,typename GaugeMat>
+  template<typename Gimpl>
-  static void RandomGaugeTransform(GridParallelRNG &pRNG, GaugeField &Umu, GaugeMat &g){
+  static void RandomGaugeTransform(GridParallelRNG &pRNG, typename Gimpl::GaugeField &Umu, typename Gimpl::GaugeLinkField &g){
    LieRandomize(pRNG,g,1.0);
-    GaugeTransform(Umu,g);
+    GaugeTransform<Gimpl>(Umu,g);
  }
  // Projects the algebra components a lattice matrix (of dimension ncol*ncol -1 )
--- a/Grid/qcd/utils/WilsonLoops.h
+++ b/Grid/qcd/utils/WilsonLoops.h
@ -125,6 +125,57 @@ public:
    return sumplaq / vol / faces / Nc; // Nd , Nc dependent... FIXME
  }
  //////////////////////////////////////////////////
  // sum over all spatial planes of plaquette
  //////////////////////////////////////////////////
  // Fills Plaq with the per-site sum of traceDirPlaquette over every plane (mu,nu)
  // with 0 <= nu < mu < Nd-1, i.e. every plane that does not involve direction Nd-1.
  // U holds one link field per direction; the output lives on the grid of U[0].
  static void siteSpatialPlaquette(ComplexField &Plaq,
                                   const std::vector<GaugeMat> &U) {
    const int lastDir = Nd - 1;            // direction excluded from all planes
    Plaq = Zero();
    ComplexField planeTrace(U[0].Grid());  // scratch: traced plaquette of a single plane
    for (int mu = 1; mu < lastDir; ++mu) {
      for (int nu = 0; nu < mu; ++nu) {
        traceDirPlaquette(planeTrace, U, mu, nu);
        Plaq = Plaq + planeTrace;
      }
    }
  }
  ////////////////////////////////////
  // sum over all x,y,z and over all spatial planes of plaquette
  //////////////////////////////////////////////////
  static std::vector<RealD> timesliceSumSpatialPlaquette(const GaugeLorentz &Umu) {
    std::vector<GaugeMat> U(Nd, Umu.Grid());
    // inefficient here
    for (int mu = 0; mu < Nd; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }
    ComplexField Plaq(Umu.Grid());
    siteSpatialPlaquette(Plaq, U);
    typedef typename ComplexField::scalar_object sobj;
    std::vector<sobj> Tq;
    sliceSum(Plaq, Tq, Nd-1);
    std::vector<Real> out(Tq.size());
    for(int t=0;t<Tq.size();t++) out[t] = TensorRemove(Tq[t]).real();
    return out;
  }
  //////////////////////////////////////////////////
  // average over all x,y,z and over all spatial planes of plaquette
  //////////////////////////////////////////////////
  // Normalises the per-slice sums from timesliceSumSpatialPlaquette by the number of
  // sites in a slice, the number of planes among the first Nd-1 directions, and Nc.
  static std::vector<RealD> timesliceAvgSpatialPlaquette(const GaugeLorentz &Umu) {
    const int Lt = Umu.Grid()->FullDimensions()[Nd-1];
    std::vector<RealD> sumplaq = timesliceSumSpatialPlaquette(Umu);
    assert(sumplaq.size() == Lt);

    const double sliceVol = Umu.Grid()->gSites() / Lt;
    const double nPlanes = (1.0 * (Nd - 1)* (Nd - 2)) / 2.0;

    int t = 0;
    while (t < Lt) {
      sumplaq[t] = sumplaq[t] / sliceVol / nPlanes / Nc; // Nd , Nc dependent... FIXME
      ++t;
    }
    return sumplaq;
  }
  //////////////////////////////////////////////////
  // average over all x,y,z the temporal loop
  //////////////////////////////////////////////////
@ -164,7 +215,7 @@ public:
    double vol = Umu.Grid()->gSites();
-    return p.real() / vol / (4.0 * Nc ) ;
+    return p.real() / vol / 4.0 / 3.0;
  };
  //////////////////////////////////////////////////
@ -362,11 +413,11 @@ public:
    GaugeMat u = PeekIndex<LorentzIndex>(Umu, mu);  // some redundant copies
    GaugeMat vu = v*u;
      //FS = 0.25*Ta(u*v + Cshift(vu, mu, -1));
-      FS = (u*v + Cshift(vu, mu, -1));
+      FS = (u*v + Gimpl::CshiftLink(vu, mu, -1));
      FS = 0.125*(FS - adj(FS));
  }
-  static Real TopologicalCharge(GaugeLorentz &U){
+  static Real TopologicalCharge(const GaugeLorentz &U){
    // 4d topological charge
    assert(Nd==4);
    // Bx = -iF(y,z), By = -iF(z,y), Bz = -iF(x,y)
@ -389,6 +440,203 @@ public:
  }
  //Clover-leaf Wilson loop combination for arbitrary mu-extent M and nu extent N,  mu >= nu
  //cf  https://arxiv.org/pdf/hep-lat/9701012.pdf Eq 7  for 1x2 Wilson loop
  //Clockwise ordering
  //
  //On return FS holds the sum of the four loops built below ("upper right", "upper left",
  //"lower right", "lower left"); no antihermitian projection or normalisation is applied here.
  //Umu and Unu are the link fields passed to the covariant shifts in directions mu and nu.
  //NOTE(review): the "mu >= nu" remark above looks at odds with TimesliceTopologicalChargeMxN,
  //which calls FieldStrengthMxN with nu > mu -- possibly M >= N was meant; confirm.
  static void CloverleafMxN(GaugeMat &FS, const GaugeMat &Umu, const GaugeMat &Unu, int mu, int nu, int M, int N){  
    //Local shorthands for the Gimpl covariant shifts; all of them are #undef'd before the function ends.
    //The *I variants call the "Identity" form, which takes no field argument and is used to start each loop.
 #define Fmu(A) Gimpl::CovShiftForward(Umu, mu, A)
 #define Bmu(A) Gimpl::CovShiftBackward(Umu, mu, A)
 #define Fnu(A) Gimpl::CovShiftForward(Unu, nu, A)
 #define Bnu(A) Gimpl::CovShiftBackward(Unu, nu, A)
 #define FmuI Gimpl::CovShiftIdentityForward(Umu, mu)
 #define BmuI Gimpl::CovShiftIdentityBackward(Umu, mu)
 #define FnuI Gimpl::CovShiftIdentityForward(Unu, nu)
 #define BnuI Gimpl::CovShiftIdentityBackward(Unu, nu)
    //Upper right loop
    //The first shift comes from the identity variant, so the first loop below starts at 1:
    //M backward-mu shifts in total, then N backward-nu, M forward-mu, N forward-nu
    GaugeMat tmp = BmuI;
    for(int i=1;i<M;i++)
      tmp = Bmu(tmp);
    for(int j=0;j<N;j++)
      tmp = Bnu(tmp);
    for(int i=0;i<M;i++)
      tmp = Fmu(tmp);
    for(int j=0;j<N;j++)
      tmp = Fnu(tmp);
    FS = tmp;
    //Upper left loop
    //N backward-nu (first one via the identity variant), M forward-mu, N forward-nu, M backward-mu
    tmp = BnuI;
    for(int j=1;j<N;j++)
      tmp = Bnu(tmp);
    for(int i=0;i<M;i++)
      tmp = Fmu(tmp);
    for(int j=0;j<N;j++)
      tmp = Fnu(tmp);
    for(int i=0;i<M;i++)
      tmp = Bmu(tmp);
    FS = FS + tmp;
    //Lower right loop
    //N forward-nu (first one via the identity variant), M backward-mu, N backward-nu, M forward-mu
    tmp = FnuI;
    for(int j=1;j<N;j++)
      tmp = Fnu(tmp);
    for(int i=0;i<M;i++)
      tmp = Bmu(tmp);
    for(int j=0;j<N;j++)
      tmp = Bnu(tmp);
    for(int i=0;i<M;i++)
      tmp = Fmu(tmp);
    FS = FS + tmp;
    //Lower left loop
    //M forward-mu (first one via the identity variant), N forward-nu, M backward-mu, N backward-nu
    tmp = FmuI;
    for(int i=1;i<M;i++)
      tmp = Fmu(tmp);
    for(int j=0;j<N;j++)
      tmp = Fnu(tmp);
    for(int i=0;i<M;i++)
      tmp = Bmu(tmp);
    for(int j=0;j<N;j++)
      tmp = Bnu(tmp);
    FS = FS + tmp;
    //Remove the shorthands so they do not leak into the rest of the header
 #undef Fmu
 #undef Bmu
 #undef Fnu
 #undef Bnu
 #undef FmuI
 #undef BmuI
 #undef FnuI
 #undef BnuI
  }
  //Field strength from MxN Wilson loop
  //Note F_numu = - F_munu
  // Field strength estimate from the MxN clover-leaf combination in the (mu,nu) plane.
  // Square loops (M == N) use a single clover with weight 0.125; rectangular loops
  // average the MxN and NxM orientations with weight 0.0625 each.
  // In both cases only the anti-Hermitian part (X - adj(X)) is kept.
  static void FieldStrengthMxN(GaugeMat &FS, const GaugeLorentz &U, int mu, int nu, int M, int N){  
    GaugeMat Umu = PeekIndex<LorentzIndex>(U, mu);
    GaugeMat Unu = PeekIndex<LorentzIndex>(U, nu);

    if(M != N){
      //Average over both orientations
      GaugeMat horizontal(Umu.Grid()), vertical(Umu.Grid());
      CloverleafMxN(horizontal, Umu, Unu, mu, nu, M, N);
      CloverleafMxN(vertical, Umu, Unu, mu, nu, N, M);
      FS = 0.0625 * ( horizontal - adj(horizontal) + vertical - adj(vertical) );
      return;
    }

    GaugeMat F(Umu.Grid());
    CloverleafMxN(F, Umu, Unu, mu, nu, M, N);
    FS = 0.125 * ( F - adj(F) );
  }
  //Topological charge contribution from MxN Wilson loops
  //cf  https://arxiv.org/pdf/hep-lat/9701012.pdf  Eq 6
  //output is the charge by timeslice: sum over timeslices to obtain the total
  static std::vector<Real> TimesliceTopologicalChargeMxN(const GaugeLorentz &U, int M, int N){
    assert(Nd == 4);
    std::vector<std::vector<GaugeMat*> > F(Nd,std::vector<GaugeMat*>(Nd,nullptr));
    //Note F_numu = - F_munu
    //hence we only need to loop over mu,nu,rho,sigma that aren't related by permuting mu,nu  or rho,sigma
    //Use nu > mu
    for(int mu=0;mu<Nd-1;mu++){
      for(int nu=mu+1; nu<Nd; nu++){
 	F[mu][nu] = new GaugeMat(U.Grid());
 	FieldStrengthMxN(*F[mu][nu], U, mu, nu, M, N);
      }
    }
    Real coeff = -1./(32 * M_PI*M_PI * M*M * N*N); //overall sign to match CPS and Grid conventions, possibly related to time direction = 3 vs 0
    static const int combs[3][4] = { {0,1,2,3}, {0,2,1,3}, {0,3,1,2} };
    static const int signs[3] = { 1, -1, 1 }; //epsilon_{mu nu rho sigma}
    ComplexField fsum(U.Grid());
    fsum = Zero();
    for(int c=0;c<3;c++){
      int mu = combs[c][0], nu = combs[c][1], rho = combs[c][2], sigma = combs[c][3];
      int eps = signs[c];
      fsum = fsum + (8. * coeff * eps) * trace( (*F[mu][nu]) * (*F[rho][sigma]) ); 
    }
    for(int mu=0;mu<Nd-1;mu++)
      for(int nu=mu+1; nu<Nd; nu++)
 	delete F[mu][nu];
    typedef typename ComplexField::scalar_object sobj;
    std::vector<sobj> Tq;
    sliceSum(fsum, Tq, Nd-1);
    std::vector<Real> out(Tq.size());
    for(int t=0;t<Tq.size();t++) out[t] = TensorRemove(Tq[t]).real();
    return out;
  }
  // Total MxN topological charge: the per-slice values of
  // TimesliceTopologicalChargeMxN added up in slice order.
  static Real TopologicalChargeMxN(const GaugeLorentz &U, int M, int N){
    const std::vector<Real> perSlice = TimesliceTopologicalChargeMxN(U,M,N);
    Real total(0);
    for(const Real q : perSlice) total += q;
    return total;
  }
  //Generate the contributions to the 5Li topological charge from Wilson loops of the following sizes
  //Use coefficients from hep-lat/9701012
  //1x1 : c1=(19.-55.*c5)/9.
  //2x2 : c2=(1-64.*c5)/9.
  //1x2 : c3=(-64.+640.*c5)/45.
  //1x3 : c4=1./5.-2.*c5
  //3x3 : c5=1./20.
  //Output array outer index contains the loops in the above order
  //Inner index is the time coordinate
  static std::vector<std::vector<Real> > TimesliceTopologicalCharge5LiContributions(const GaugeLorentz &U){
    static const int exts[5][2] = { {1,1}, {2,2}, {1,2}, {1,3}, {3,3} };       
    std::vector<std::vector<Real> > out(5);
    for(int i=0;i<5;i++){	
      out[i] = TimesliceTopologicalChargeMxN(U,exts[i][0],exts[i][1]);
    }
    return out;
  }   
  // Total (slice-summed) charge contribution of each of the five loop shapes
  // 1x1, 2x2, 1x2, 1x3, 3x3, in that order. Each value is also written to the log.
  static std::vector<Real> TopologicalCharge5LiContributions(const GaugeLorentz &U){   
    static const int exts[5][2] = { {1,1}, {2,2}, {1,2}, {1,3}, {3,3} };
    std::cout << GridLogMessage << "Computing topological charge" << std::endl;
    std::vector<Real> out(5);
    int i = 0;
    for(const auto &ext : exts){
      out[i] = TopologicalChargeMxN(U,ext[0],ext[1]);
      std::cout << GridLogMessage << ext[0] << "x" << ext[1] << " Wilson loop contribution " << out[i] << std::endl;
      ++i;
    }
    return out;
  }
  //Compute the 5Li topological charge
  static std::vector<Real> TimesliceTopologicalCharge5Li(const GaugeLorentz &U){
    std::vector<std::vector<Real> > loops = TimesliceTopologicalCharge5LiContributions(U);
    double c5=1./20.;
    double c4=1./5.-2.*c5;
    double c3=(-64.+640.*c5)/45.;
    double c2=(1-64.*c5)/9.;
    double c1=(19.-55.*c5)/9.;
    int Lt = loops[0].size();
    std::vector<Real> out(Lt,0.);
    for(int t=0;t<Lt;t++)
      out[t] += c1*loops[0][t] + c2*loops[1][t] + c3*loops[2][t] + c4*loops[3][t] + c5*loops[4][t];
    return out;
  }
  // Total 5Li topological charge: sum of the per-slice values, also written to the log.
  static Real TopologicalCharge5Li(const GaugeLorentz &U){
    const std::vector<Real> perSlice = TimesliceTopologicalCharge5Li(U);
    Real Q = 0.;
    for(const Real q : perSlice) Q += q;
    std::cout << GridLogMessage << "5Li Topological charge: " << Q << std::endl;
    return Q;
  }
  //////////////////////////////////////////////////////
  // Similar to above for rectangle is required
  //////////////////////////////////////////////////////
--- a/Grid/sitmo_rng/README
+++ b/Grid/sitmo_rng/README
--- a/Grid/random/gaussian.h
+++ b/Grid/random/gaussian.h
@ -0,0 +1,200 @@
 // -*- C++ -*-
 //===--------------------------- random -----------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 // Peter Boyle: Taken from libc++ in Clang/LLVM.
 // The reason is that libstdc++ and clang differ in the order in which values are returned in the normal_distribution / Box-Muller type step.
 // Standardise on one implementation and call it "gaussian_distribution".
 #pragma once
 #include <cstddef>
 #include <cstdint>
 #include <cmath>
 #include <type_traits>
 #include <initializer_list>
 #include <limits>
 #include <algorithm>
 #include <numeric>
 #include <vector>
 #include <string>
 #include <istream>
 #include <ostream>
 #include <random>
 // normal_distribution -> gaussian distribution
 namespace Grid {
 // Normal (Gaussian) distribution with the libc++ generation order.
 //
 // Each call to operator() that has no cached value generates two deviates: one is
 // returned and the other is cached in _V_ (flagged by _V_hot_) for the next call.
 // reset() discards the cached value.
 // The cached value is part of the state compared by operator== and streamed by
 // operator<< / operator>>.
 template<class _RealType = double>
 class  gaussian_distribution
 {
 public:
    // types
    typedef _RealType result_type;

    // Mean / standard deviation pair
    class param_type
    {
        result_type __mean_;
        result_type __stddev_;
    public:
        typedef gaussian_distribution distribution_type;

        strong_inline
        explicit param_type(result_type __mean = 0, result_type __stddev = 1)
            : __mean_(__mean), __stddev_(__stddev) {}

        strong_inline
        result_type mean() const {return __mean_;}
        strong_inline
        result_type stddev() const {return __stddev_;}

        friend strong_inline
            bool operator==(const param_type& __x, const param_type& __y)
            {return __x.__mean_ == __y.__mean_ && __x.__stddev_ == __y.__stddev_;}
        friend strong_inline
            bool operator!=(const param_type& __x, const param_type& __y)
            {return !(__x == __y);}
    };

 private:
    param_type __p_;
    result_type _V_;   // cached second deviate; only meaningful while _V_hot_ is true
    bool _V_hot_;      // true when _V_ holds a value not yet handed out

 public:
    // constructors and reset functions
    // _V_ is zero-initialised so that copying a freshly constructed object never reads
    // an indeterminate value; it is never used for generation unless _V_hot_ is set.
    strong_inline
    explicit gaussian_distribution(result_type __mean = 0, result_type __stddev = 1)
        : __p_(param_type(__mean, __stddev)), _V_(0), _V_hot_(false) {}
    strong_inline
    explicit gaussian_distribution(const param_type& __p)
        : __p_(__p), _V_(0), _V_hot_(false) {}
    strong_inline
    void reset() {_V_hot_ = false;}

    // generating functions
    template<class _URNG>
        strong_inline
        result_type operator()(_URNG& __g)
        {return (*this)(__g, __p_);}
    template<class _URNG> result_type operator()(_URNG& __g, const param_type& __p);

    // property functions
    strong_inline
    result_type mean() const {return __p_.mean();}
    strong_inline
    result_type stddev() const {return __p_.stddev();}

    strong_inline
    param_type param() const {return __p_;}
    strong_inline
    void param(const param_type& __p) {__p_ = __p;}

    strong_inline
    result_type min() const {return -std::numeric_limits<result_type>::infinity();}
    strong_inline
    result_type max() const {return std::numeric_limits<result_type>::infinity();}

    // Equal when the parameters and the cache state agree; the cached value itself is
    // compared only if it is live.
    friend strong_inline
        bool operator==(const gaussian_distribution& __x,
                        const gaussian_distribution& __y)
        {return __x.__p_ == __y.__p_ && __x._V_hot_ == __y._V_hot_ &&
                (!__x._V_hot_ || __x._V_ == __y._V_);}
    friend strong_inline
        bool operator!=(const gaussian_distribution& __x,
                        const gaussian_distribution& __y)
        {return !(__x == __y);}

    template <class _CharT, class _Traits, class _RT>
    friend
    std::basic_ostream<_CharT, _Traits>&
    operator<<(std::basic_ostream<_CharT, _Traits>& __os,
               const gaussian_distribution<_RT>& __x);

    template <class _CharT, class _Traits, class _RT>
    friend
    std::basic_istream<_CharT, _Traits>&
    operator>>(std::basic_istream<_CharT, _Traits>& __is,
               gaussian_distribution<_RT>& __x);
 };
 // Draw one deviate using the supplied parameters __p (the stored parameters are not consulted).
 // If a cached value is available it is consumed and returned without touching __g.
 // Otherwise pairs (u,v) are drawn from a uniform_real_distribution(-1,1) until
 // 0 < u*u+v*v <= 1 (polar rejection step); the u-based deviate is returned and the
 // v-based one is cached for the next call.
 // The result is scaled by __p.stddev() and shifted by __p.mean().
 template <class _RealType>
 template<class _URNG>
 _RealType
 gaussian_distribution<_RealType>::operator()(_URNG& __g, const param_type& __p)
 {
    result_type _Up;
    if (_V_hot_)
    {
        // hand out the value saved by the previous call
        _V_hot_ = false;
        _Up = _V_;
    }
    else
    {
        std::uniform_real_distribution<result_type> _Uni(-1, 1);
        result_type __u;
        result_type __v;
        result_type __s;
        do
        {
            // u is drawn before v; this order fixes which deviate is returned first
            __u = _Uni(__g);
            __v = _Uni(__g);
            __s = __u * __u + __v * __v;
        } while (__s > 1 || __s == 0); // reject points outside the unit disc and the origin (log(0))
        result_type _Fp = std::sqrt(-2 * std::log(__s) / __s);
        _V_ = __v * _Fp;   // second deviate, kept for the next call
        _V_hot_ = true;
        _Up = __u * _Fp;   // first deviate, returned now
    }
    return _Up * __p.stddev() + __p.mean();
 }
 // Writes the distribution state as space-separated fields: mean, stddev, the cache
 // flag and, only when the flag is set, the cached deviate.
 // The stream's format flags are restored before returning; the fill character is
 // left set to the widened space.
 template <class _CharT, class _Traits, class _RT>
 std::basic_ostream<_CharT, _Traits>&
 operator<<(std::basic_ostream<_CharT, _Traits>& __os,
           const gaussian_distribution<_RT>& __x)
 {
    typedef std::ios_base ios;
    const auto saved_flags = __os.flags();
    __os.flags(ios::dec | ios::left | ios::fixed | ios::scientific);

    const _CharT space = __os.widen(' ');
    __os.fill(space);

    __os << __x.mean() << space << __x.stddev() << space << __x._V_hot_;
    if (__x._V_hot_)
    {
        __os << space << __x._V_;
    }

    __os.flags(saved_flags);
    return __os;
 }
 template <class _CharT, class _Traits, class _RT>
 std::basic_istream<_CharT, _Traits>&
 operator>>(std::basic_istream<_CharT, _Traits>& __is,
           gaussian_distribution<_RT>& __x)
 {
    typedef gaussian_distribution<_RT> _Eng;
    typedef typename _Eng::result_type result_type;
    typedef typename _Eng::param_type param_type;
    auto __save_flags = __is.flags();
    __is.flags(std::ios_base::dec | std::ios_base::skipws);
    result_type __mean;
    result_type __stddev;
    result_type _Vp = 0;
    bool _V_hot = false;
    __is >> __mean >> __stddev >> _V_hot;
    if (_V_hot)
        __is >> _Vp;
    if (!__is.fail())
    {
        __x.param(param_type(__mean, __stddev));
        __x._V_hot_ = _V_hot;
        __x._V_ = _Vp;
    }
    __is.flags(__save_flags);
    return __is;
 }
 }
--- a/Grid/sitmo_rng/sitmo_prng_engine.hpp
+++ b/Grid/sitmo_rng/sitmo_prng_engine.hpp
--- a/Grid/tensors/Tensor_extract_merge.h
+++ b/Grid/tensors/Tensor_extract_merge.h
@ -208,5 +208,46 @@ void merge(vobj &vec,const ExtractPointerArray<sobj> &extracted, int offset)
 }
 //////////////////////////////////////////////////////////////////////////////////
 //Copy a single lane of a SIMD tensor type from one object to another
 //Output object must be of the same tensor type but may be of a different precision (i.e. it can have a different root data type)
 ///////////////////////////////////////////////////////////////////////////////////
 // Copies SIMD lane lane_in of vecIn into SIMD lane lane_out of vecOut, one vector word
 // at a time. Each scalar is read with memcpy, converted by assignment (which is where a
 // precision change may happen) and written back with memcpy.
 // Compile-time checks: both objects share the same DoublePrecision type and contain the
 // same number of vector words.
 template<class vobjOut, class vobjIn>
 accelerator_inline 
 void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __restrict__ vecIn, int lane_in)
 {
  static_assert( std::is_same<typename vobjOut::DoublePrecision, typename vobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same

  using ovector_type = typename vobjOut::vector_type;
  using ivector_type = typename vobjIn::vector_type;

  constexpr int owords=sizeof(vobjOut)/sizeof(ovector_type);
  constexpr int iwords=sizeof(vobjIn)/sizeof(ivector_type);
  static_assert( owords == iwords, "copyLane: Expected number of vector words in input and output objects to be equal" );

  using oscalar_type = typename vobjOut::scalar_type;
  using iscalar_type = typename vobjIn::scalar_type;
  using oextract_type = typename ExtractTypeMap<oscalar_type>::extract_type;
  using iextract_type = typename ExtractTypeMap<iscalar_type>::extract_type;
  using opointer = oextract_type *;
  using ipointer = iextract_type *;

  constexpr int oNsimd=ovector_type::Nsimd();
  constexpr int iNsimd=ivector_type::Nsimd();

  iscalar_type in_val;
  oscalar_type out_val;

  opointer __restrict__  op = (opointer)&vecOut;
  ipointer __restrict__  ip = (ipointer)&vecIn;
  for(int word=0;word<owords;word++){
    ipointer src = ip + lane_in  + iNsimd*word;  // chosen lane within input word
    opointer dst = op + lane_out + oNsimd*word;  // chosen lane within output word
    memcpy( (char*)&in_val, (char*)src, sizeof(iscalar_type) );
    out_val = in_val; //potential precision change
    memcpy( (char*)dst, (char*)&out_val, sizeof(oscalar_type) );
  }
 }
 NAMESPACE_END(Grid);
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@ -206,7 +206,8 @@ inline void *acceleratorAllocShared(size_t bytes)
  auto err = cudaMallocManaged((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
-    printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
+    printf(" cudaMallocManaged failed for %lu %s \n",bytes,cudaGetErrorString(err)); fflush(stdout);
    if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);
  }
  return ptr;
 };
@ -216,15 +217,47 @@ inline void *acceleratorAllocDevice(size_t bytes)
  auto err = cudaMalloc((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
-    printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
+    printf(" cudaMalloc failed for %lu %s \n",bytes,cudaGetErrorString(err)); fflush(stdout);
    if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);
  }
  return ptr;
 };
-inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
+inline void acceleratorFreeShared(void *ptr){
-inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
+  auto err = cudaFree(ptr);
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
+  if( err != cudaSuccess ) {
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
+    printf(" cudaFree(Shared) failed %s \n",cudaGetErrorString(err)); fflush(stdout);
-inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
+    if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);
  }
 };
 // Frees ptr with cudaFree. A failure is reported on stdout (flushed immediately) and,
 // when acceleratorAbortOnGpuError is set, trips an assert.
 inline void acceleratorFreeDevice(void *ptr){
  auto err = cudaFree(ptr);
  if( err == cudaSuccess ) return;
  printf(" cudaFree(Device) failed %s \n",cudaGetErrorString(err)); fflush(stdout);
  if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);
 };
 // Copies bytes bytes from host memory (from) to device memory (to) with cudaMemcpy.
 // A failure is reported on stdout (flushed immediately) and, when
 // acceleratorAbortOnGpuError is set, trips an assert.
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  {
  auto err = cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);
  if( err != cudaSuccess ) {
    // %zu is the printf conversion for size_t; %lu is only correct where size_t is unsigned long
    printf(" cudaMemcpy(host->device) failed for %zu %s \n",bytes,cudaGetErrorString(err)); fflush(stdout);
    if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);
  }
 }
 // Copies bytes bytes from device memory (from) to host memory (to) with cudaMemcpy.
 // A failure is reported on stdout (flushed immediately) and, when
 // acceleratorAbortOnGpuError is set, trips an assert.
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){
  auto err = cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);
  if( err != cudaSuccess ) {
    // %zu is the printf conversion for size_t; %lu is only correct where size_t is unsigned long
    printf(" cudaMemcpy(device->host) failed for %zu %s \n",bytes,cudaGetErrorString(err)); fflush(stdout);
    if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);
  }
 }
 // Fills bytes bytes starting at base with value using cudaMemset.
 // A failure is reported on stdout (flushed immediately) and, when
 // acceleratorAbortOnGpuError is set, trips an assert.
 inline void acceleratorMemSet(void *base,int value,size_t bytes) {
  auto err = cudaMemset(base,value,bytes);
  if( err != cudaSuccess ) {
    // %zu is the printf conversion for size_t; %lu is only correct where size_t is unsigned long
    printf(" cudaMemSet failed for %zu %s \n",bytes,cudaGetErrorString(err)); fflush(stdout);
    if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);
  }
 }
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
  cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
@ -481,10 +514,9 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream);
 #define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) thread_for2d(iter1,num1,iter2,num2,{ __VA_ARGS__ });
 accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
-
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes);}
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); }
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
+inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes);}
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes);}
 inline void acceleratorCopySynchronise(void) {};
 inline int  acceleratorIsCommunicable(void *ptr){ return 1; }
--- a/Grid/threads/Threads.h
+++ b/Grid/threads/Threads.h
@ -72,20 +72,3 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define thread_region                                       DO_PRAGMA(omp parallel)
 #define thread_critical                                     DO_PRAGMA(omp critical)
 #ifdef GRID_OMP
 // Copies bytes bytes from 'from' to 'to' as 64-bit words inside a thread_for loop.
 // bytes must be a multiple of 8 (asserted).
 inline void thread_bcopy(void *from, void *to,size_t bytes)
 {
  assert(bytes%8==0);
  const uint64_t words = bytes/8;
  uint64_t *src = (uint64_t *)from;
  uint64_t *dst = (uint64_t *)to;
  thread_for(w,words,{
      dst[w] = src[w];
  });
 }
 #else
 // Build without GRID_OMP: forward directly to bcopy.
 inline void thread_bcopy(void *from, void *to,size_t bytes)
 {
  bcopy(from,to,bytes);
 }
 #endif
--- a/HMC/DWF2p1fIwasakiGparity.cc
+++ b/HMC/DWF2p1fIwasakiGparity.cc
@ -0,0 +1,473 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./HMC/DWF2p1fIwasakiGparity.cc
 Copyright (C) 2015-2016
 Author: Christopher Kelly <ckelly@bnl.gov>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 //2+1f DWF+I ensemble with G-parity BCs
 //designed to reproduce ensembles in https://arxiv.org/pdf/1908.08640.pdf
 //Serializable settings for one rational-quotient term: the interval [bnd_lo, bnd_hi],
 //degree and tolerance for the action and MD approximations, and two frequency settings.
 struct RatQuoParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters,
                                  double, bnd_lo,
                                  double, bnd_hi,
                                  Integer, action_degree,
                                  double, action_tolerance,
                                  Integer, md_degree,
                                  double, md_tolerance,
                                  Integer, reliable_update_freq,
                                  Integer, bnd_check_freq);

  //Defaults, listed in member declaration order
  RatQuoParameters() {
    bnd_lo = 1e-2;
    bnd_hi = 30;
    action_degree = 10;
    action_tolerance = 1e-10;
    md_degree = 10;
    md_tolerance = 1e-8;
    reliable_update_freq = 50;
    bnd_check_freq = 20;
  }

  //Copy the settings into a RationalActionParams.
  //NOTE(review): reliable_update_freq is not forwarded here -- presumably consumed elsewhere; confirm.
  void Export(RationalActionParams &into) const{
    into.lo = bnd_lo;
    into.hi = bnd_hi;

    into.action_degree = action_degree;
    into.action_tolerance = action_tolerance;

    into.md_degree = md_degree;
    into.md_tolerance = md_tolerance;

    into.BoundsCheckFreq = bnd_check_freq;
  }
 };
 //Serializable top-level run settings: trajectory bookkeeping, integrator steps,
 //starting configuration type, G-parity directions and the two RatQuoParameters sets
 //(rat_quo_l, rat_quo_s).
 struct EvolParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters,
                                  Integer, StartTrajectory,
                                  Integer, Trajectories,
                                  Integer, SaveInterval,
                                  Integer, Steps,
                                  bool, MetropolisTest,
                                  std::string, StartingType,
                                  std::vector<Integer>, GparityDirs,
                                  RatQuoParameters, rat_quo_l,
                                  RatQuoParameters, rat_quo_s);

  EvolParameters() {
    StartTrajectory   = 0;
    Trajectories      = 50;
    SaveInterval      = 5;
    Steps             = 5;
    //For initial thermalization; afterwards user should switch Metropolis on and use StartingType=CheckpointStart
    MetropolisTest    = false;
    StartingType      = "ColdStart";
    GparityDirs.resize(3, 1); //1 for G-parity, 0 for periodic
  }
 };
 //True when fn can be opened for reading (the input stream is in a good state right after opening)
 bool fileExists(const std::string &fn){
  return std::ifstream(fn).good();
 }
 //Serializable settings read by computeEigenvalues: alpha, beta, mu and ord configure the
 //Chebyshev polynomial; n_stop, n_want, n_use and tolerance are passed to the Lanczos solver.
 struct LanczosParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
                                  double, alpha,
                                  double, beta,
                                  double, mu,
                                  int, ord,
                                  int, n_stop,
                                  int, n_want,
                                  int, n_use,
                                  double, tolerance);

  LanczosParameters() {
    //Chebyshev settings
    alpha = 35;
    beta = 5;
    mu = 0;
    ord = 100;
    //Lanczos settings
    n_stop = 10;
    n_want = 10;
    n_use = 15;
    tolerance = 1e-6;
  }
 };
 //Run an implicitly restarted Lanczos on the Schur-preconditioned operator built from 'action'
 //and print the resulting eigenvalues.
 //  param_file : xml file holding a LanczosParameters block; if it does not exist the defaults
 //               are used and rank 0 writes a template to param_file + ".templ"
 //  Grid/rbGrid: full and red-black grids for the source and eigenvector fields
 //  latt       : gauge field imported into 'action'
 //  rng        : used to generate the Gaussian starting vector
 //Requires params.mu == 0 (asserted).
 template<typename FermionActionD, typename FermionFieldD>
 void computeEigenvalues(std::string param_file,
                         GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
                         FermionActionD &action, GridParallelRNG &rng){

  LanczosParameters params;
  if(fileExists(param_file)){
    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
    Grid::XmlReader rd(param_file);
    read(rd, "LanczosParameters", params);
  }else if(!GlobalSharedMemory::WorldRank){
    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
    Grid::XmlWriter wr(param_file + ".templ");
    write(wr, "LanczosParameters", params);
  }

  //Gaussian starting vector on the odd checkerboard
  FermionFieldD gauss_o(rbGrid);
  FermionFieldD gauss(Grid);
  gaussian(rng, gauss);
  pickCheckerboard(Odd, gauss_o, gauss);

  action.ImportGauge(latt);

  SchurDiagMooeeOperator<FermionActionD, FermionFieldD> hermop(action);
  PlainHermOp<FermionFieldD> hermop_wrap(hermop);
  //ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
  assert(params.mu == 0.0);
  Chebyshev<FermionFieldD> Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1);
  FunctionHermOp<FermionFieldD> Cheb_wrap(Cheb, hermop);

  std::cout << "IRL: alpha=" << params.alpha << " beta=" << params.beta << " mu=" << params.mu << " ord=" << params.ord << std::endl;
  ImplicitlyRestartedLanczos<FermionFieldD> IRL(Cheb_wrap, hermop_wrap, params.n_stop, params.n_want, params.n_use, params.tolerance, 10000);

  std::vector<RealD> eval(params.n_use);
  std::vector<FermionFieldD> evec(params.n_use, rbGrid);
  int Nconv;
  IRL.calc(eval, evec, gauss_o, Nconv);

  //Never index past the end of eval: n_want is user input and may exceed the number of stored
  //eigenvalues (eval starts with n_use entries; NOTE(review): calc presumably may resize it -- confirm)
  const int n_avail = static_cast<int>(eval.size());
  const int n_print = params.n_want < n_avail ? params.n_want : n_avail;

  std::cout << "Eigenvalues:" << std::endl;
  for(int i=0;i<n_print;i++){
    std::cout << i << " " << eval[i] << std::endl;
  }
 }
 //Check the quality of the RHMC approx
 //Runs InversePowerBoundsCheck for every combination of
 //  approximation set : action, then MD
 //  operator          : numerator (numOp), then denominator (denOp)
 //  power             : -1/inv_pow, then -1/(2*inv_pow)
 //on a Gaussian source restricted to the odd checkerboard, logging a Starting/Finished
 //line around each check. quark_descr only appears in the log text.
 template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
 void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
                FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
                int inv_pow, const std::string &quark_descr){

  FermionFieldD gauss_o(rbGrid);
  FermionFieldD gauss(Grid);
  gaussian(rng, gauss);
  pickCheckerboard(Odd, gauss_o, gauss);

  numOp.ImportGauge(latt);
  denOp.ImportGauge(latt);

  using FermionImplPolicyD = typename FermionActionD::Impl_t;
  SchurDifferentiableOperator<FermionImplPolicyD> MdagM(numOp);
  SchurDifferentiableOperator<FermionImplPolicyD> VdagV(denOp);

  //One bounds check bracketed by its log lines
  auto runCheck = [&](const char *approx_kind, const char *op_descr, int pw, auto &linop, auto &approx){
    std::cout << "Starting: Checking quality of RHMC " << approx_kind << " approx for " << quark_descr << " quark " << op_descr << " and power -1/" << pw << std::endl;
    InversePowerBoundsCheck(pw, 10000, 1e16, linop, gauss_o, approx); //use large tolerance to prevent exit on fail; we are trying to tune here!
    std::cout << "Finished: Checking quality of RHMC " << approx_kind << " approx for " << quark_descr << " quark " << op_descr << " and power -1/" << pw << std::endl;
  };

  //Approximations used for the action
  runCheck("action", "numerator",   inv_pow,   MdagM, rhmc.ApproxNegPowerAction);
  runCheck("action", "numerator",   2*inv_pow, MdagM, rhmc.ApproxNegHalfPowerAction);
  runCheck("action", "denominator", inv_pow,   VdagV, rhmc.ApproxNegPowerAction);
  runCheck("action", "denominator", 2*inv_pow, VdagV, rhmc.ApproxNegHalfPowerAction);

  std::cout << "-------------------------------------------------------------------------------" << std::endl;

  //Approximations used for the MD
  runCheck("MD", "numerator",   inv_pow,   MdagM, rhmc.ApproxNegPowerMD);
  runCheck("MD", "numerator",   2*inv_pow, MdagM, rhmc.ApproxNegHalfPowerMD);
  runCheck("MD", "denominator", inv_pow,   VdagV, rhmc.ApproxNegPowerMD);
  runCheck("MD", "denominator", 2*inv_pow, VdagV, rhmc.ApproxNegHalfPowerMD);
 }
 // Driver for the 2+1 flavour G-parity domain-wall + Iwasaki HMC, using the mixed-precision
 // rational pseudofermion action (MixedPrecRHMC) for both the light and strange quark quotients.
 //
 // Command line options handled here (Grid_init also receives argc/argv):
 //   --param_file <xml>     parameter file to read (default "params.xml")
 //   --read_check           initialise the gauge field and RNGs, then exit
 //   --tune_rhmc_l / _s     run checkRHMC on the light / strange quotient, then exit
 //   --eigenrange_l <xml>   run computeEigenvalues on the light numerator, then exit
 //   --eigenrange_s <xml>   run computeEigenvalues on the strange numerator, then exit
 // With none of the exit-early options the HMC itself is run.
 int main(int argc, char **argv) {
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
  std::string param_file = "params.xml";
  bool file_load_check = false;
  for(int i=1;i<argc;i++){
    std::string sarg(argv[i]);
    if(sarg == "--param_file"){
      assert(i!=argc-1); //the option needs a following value
      param_file = argv[i+1];
    }else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
      file_load_check = true;
    }
  }
  //Read the user parameters
  EvolParameters user_params;
  if(fileExists(param_file)){
    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
    Grid::XmlReader rd(param_file);
    read(rd, "Params", user_params);
  }else if(!GlobalSharedMemory::WorldRank){
    //NOTE(review): only the rank with WorldRank==0 enters this branch and exits; every other rank
    //falls through and carries on with the default parameters - confirm this is intended
    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
    Grid::XmlWriter wr(param_file + ".templ");
    write(wr, "Params", user_params);
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  //Check the parameters
  if(user_params.GparityDirs.size() != Nd-1){
    std::cerr << "Error in input parameters: expect GparityDirs to have size = " << Nd-1 << std::endl;
    exit(1);
  }
  for(int i=0;i<Nd-1;i++)
    if(user_params.GparityDirs[i] != 0 && user_params.GparityDirs[i] != 1){
      std::cerr << "Error in input parameters: expect GparityDirs values to be 0 (periodic) or 1 (G-parity)" << std::endl;
      exit(1);
    }
   // Typedefs to simplify notation
  typedef GparityDomainWallFermionD FermionActionD;
  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
  typedef typename FermionActionD::FermionField FermionFieldD;
  typedef GparityDomainWallFermionF FermionActionF;
  typedef typename FermionActionF::Impl_t FermionImplPolicyF;
  typedef typename FermionActionF::FermionField FermionFieldF;
  typedef GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD,FermionImplPolicyF> MixedPrecRHMC;
  typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicyD> DoublePrecRHMC;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
  typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
  typedef HMCWrapper::ImplPolicy GaugeImplPolicy;
  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps = user_params.Steps;
  MD.trajL   = 1.0;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = user_params.StartTrajectory;
  HMCparams.Trajectories     = user_params.Trajectories;
  HMCparams.NoMetropolisUntil= 0;
  HMCparams.StartingType     = user_params.StartingType;
  HMCparams.MetropolisTest = user_params.MetropolisTest;
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_lat";
  CPparams.rng_prefix    = "ckpoint_rng";
  CPparams.saveInterval  = user_params.SaveInterval;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  //Note that checkpointing saves the RNG state so that this initialization is required only for the very first configuration
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  typedef PlaquetteMod<GaugeImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  const int Ls      = 16;
  Real beta         = 2.13;
  Real light_mass   = 0.01;
  Real strange_mass = 0.032;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  //Setup the Grids (double precision from the HMC resources, single precision built by hand)
  auto GridPtrD   = TheHMC.Resources.GetCartesian();
  auto GridRBPtrD = TheHMC.Resources.GetRBCartesian();
  auto FGridD     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrD);
  auto FrbGridD   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrD);
  GridCartesian* GridPtrF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian* GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF);
  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF);
  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF);
  ConjugateIwasakiGaugeActionD GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeFieldD Ud(GridPtrD);
  LatticeGaugeFieldF Uf(GridPtrF);
  //Setup the BCs
  FermionActionD::ImplParams Params;
  for(int i=0;i<Nd-1;i++) Params.twists[i] = user_params.GparityDirs[i]; //G-parity directions
  Params.twists[Nd-1] = 1; //APBC in time direction
  std::vector<int> dirs4(Nd);
  for(int i=0;i<Nd-1;i++) dirs4[i] = user_params.GparityDirs[i];
  dirs4[Nd-1] = 0; //periodic gauge BC in time
  GaugeImplPolicy::setDirections(dirs4); //gauge BC
  //Run optional gauge field checksum checker and exit
  if(file_load_check){
    TheHMC.initializeGaugeFieldAndRNGs(Ud);
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1); //light quark + strange quark
  ActionLevel<HMCWrapper::Field> Level2(8); //gauge (8 increments per step)
  /////////////////////////////////////////////////////////////
  // Light action
  /////////////////////////////////////////////////////////////
  FermionActionD Numerator_lD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, light_mass,M5,Params);
  FermionActionD Denominator_lD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, pv_mass,M5,Params);
  FermionActionF Numerator_lF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, light_mass,M5,Params);
  FermionActionF Denominator_lF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, pv_mass,M5,Params);
  RationalActionParams rat_act_params_l;
  rat_act_params_l.inv_pow  = 2; // (M^dag M)^{1/2}
  rat_act_params_l.precision= 60;
  rat_act_params_l.MaxIter  = 10000;
  user_params.rat_quo_l.Export(rat_act_params_l); //fill bounds, degrees, tolerances and bounds-check frequency from the user input
  std::cout << GridLogMessage << " Light quark bounds check every " << rat_act_params_l.BoundsCheckFreq << " trajectories (avg)" << std::endl;
  MixedPrecRHMC Quotient_l(Denominator_lD, Numerator_lD, Denominator_lF, Numerator_lF, rat_act_params_l, user_params.rat_quo_l.reliable_update_freq);
  //DoublePrecRHMC Quotient_l(Denominator_lD, Numerator_lD, rat_act_params_l);
  Level1.push_back(&Quotient_l);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD,strange_mass,M5,Params);
  FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, pv_mass,M5,Params);
  FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,strange_mass,M5,Params);
  FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, pv_mass,M5,Params);
  RationalActionParams rat_act_params_s;
  rat_act_params_s.inv_pow  = 4; // (M^dag M)^{1/4}
  rat_act_params_s.precision= 60;
  rat_act_params_s.MaxIter  = 10000;
  user_params.rat_quo_s.Export(rat_act_params_s);
  //Report the strange-quark setting (rat_act_params_s), not the light-quark one
  std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_s.BoundsCheckFreq << " trajectories (avg)" << std::endl;
  MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq);
  //DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s);
  Level1.push_back(&Quotient_s);
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level2.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  //Action tuning: second pass over the command line for the tuning / eigenvalue-range options
  bool tune_rhmc_l=false, tune_rhmc_s=false, eigenrange_l=false, eigenrange_s=false;
  std::string lanc_params_l, lanc_params_s;
  for(int i=1;i<argc;i++){
    std::string sarg(argv[i]);
    if(sarg == "--tune_rhmc_l") tune_rhmc_l=true;
    else if(sarg == "--tune_rhmc_s") tune_rhmc_s=true;
    else if(sarg == "--eigenrange_l"){
      assert(i < argc-1); //the option needs a following Lanczos parameter file name
      eigenrange_l=true;
      lanc_params_l = argv[i+1];
    }
    else if(sarg == "--eigenrange_s"){
      assert(i < argc-1);
      eigenrange_s=true;
      lanc_params_s = argv[i+1];
    }
  }
  if(tune_rhmc_l || tune_rhmc_s || eigenrange_l || eigenrange_s){
    //Any tuning option replaces the HMC run: do the requested checks and exit
    TheHMC.initializeGaugeFieldAndRNGs(Ud);
    if(eigenrange_l) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_l, FGridD, FrbGridD, Ud, Numerator_lD, TheHMC.Resources.GetParallelRNG());
    if(eigenrange_s) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG());
    if(tune_rhmc_l) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_l)>(FGridD, FrbGridD, Ud, Numerator_lD, Denominator_lD, Quotient_l, TheHMC.Resources.GetParallelRNG(), 2, "light");
    if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_s)>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange");
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  //Run the HMC
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.Run();
  std::cout << GridLogMessage << " Done" << std::endl;
  Grid_finalize();
  return 0;
 } // main
--- a/HMC/DWF2p1fIwasakiGparityRHMCdouble.cc
+++ b/HMC/DWF2p1fIwasakiGparityRHMCdouble.cc
@ -0,0 +1,473 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./HMC/DWF2p1fIwasakiGparityRHMCdouble.cc
 Copyright (C) 2015-2016
 Author: Christopher Kelly <ckelly@bnl.gov>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 //2+1f DWF+I ensemble with G-parity BCs
 //designed to reproduce ensembles in https://arxiv.org/pdf/1908.08640.pdf
 // User-facing (serializable) settings for one rational quotient action.
 // Export() copies everything except reliable_update_freq into a RationalActionParams.
 struct RatQuoParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters,
                                  double,  bnd_lo,
                                  double,  bnd_hi,
                                  Integer, action_degree,
                                  double,  action_tolerance,
                                  Integer, md_degree,
                                  double,  md_tolerance,
                                  Integer, reliable_update_freq,
                                  Integer, bnd_check_freq);

  // Defaults used when no value is supplied (also what ends up in a written template)
  RatQuoParameters() {
    // approximation range
    bnd_hi = 30;
    bnd_lo = 1e-2;
    // approximation used for the action
    action_tolerance = 1e-10;
    action_degree = 10;
    // approximation used for the MD
    md_tolerance = 1e-8;
    md_degree = 10;
    // frequencies
    reliable_update_freq = 50;
    bnd_check_freq = 20;
  }

  // Copy the range, degrees, tolerances and bounds-check frequency into 'into'.
  // reliable_update_freq is not written here.
  void Export(RationalActionParams &into) const{
    into.BoundsCheckFreq  = bnd_check_freq;
    into.md_tolerance     = md_tolerance;
    into.md_degree        = md_degree;
    into.action_tolerance = action_tolerance;
    into.action_degree    = action_degree;
    into.hi               = bnd_hi;
    into.lo               = bnd_lo;
  }
 };
 // Top-level (serializable) run parameters: trajectory control, integrator steps,
 // G-parity directions and the settings of the light and strange rational quotients.
 struct EvolParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters,
                                  Integer,              StartTrajectory,
                                  Integer,              Trajectories,
                                  Integer,              SaveInterval,
                                  Integer,              Steps,
                                  bool,                 MetropolisTest,
                                  std::string,          StartingType,
                                  std::vector<Integer>, GparityDirs,
                                  RatQuoParameters,     rat_quo_l,
                                  RatQuoParameters,     rat_quo_s);

  // Defaults are chosen for initial thermalization; afterwards the user should
  // switch Metropolis on and use StartingType=CheckpointStart
  EvolParameters() {
    StartingType      = "ColdStart";
    MetropolisTest    = false;

    StartTrajectory   = 0;
    Trajectories      = 50;
    SaveInterval      = 5;
    Steps             = 5;

    GparityDirs.resize(3, 1); //1 for G-parity, 0 for periodic
  }
 };
 // Returns true when a file named 'fn' can be opened for reading.
 bool fileExists(const std::string &fn){
  std::ifstream probe(fn);
  const bool opened_ok = probe.good();
  return opened_ok;
 }
 // Serializable parameters read by computeEigenvalues: alpha, beta, mu and ord configure the
 // Chebyshev filter; n_stop, n_want, n_use and tolerance are handed to the Lanczos solver.
 struct LanczosParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
                                  double, alpha,
                                  double, beta,
                                  double, mu,
                                  int,    ord,
                                  int,    n_stop,
                                  int,    n_want,
                                  int,    n_use,
                                  double, tolerance);

  // Defaults used when no parameter file is found (also written to the template)
  LanczosParameters() {
    // solver settings
    tolerance = 1e-6;
    n_use  = 15;
    n_want = 10;
    n_stop = 10;
    // filter settings
    ord   = 100;
    mu    = 0;
    beta  = 5;
    alpha = 35;
  }
 };
 // Run an implicitly restarted Lanczos on the Schur (Mooee-diagonal) operator of 'action'
 // and print the resulting eigenvalues to stdout.
 //
 //  param_file : XML file holding a "LanczosParameters" record. If it does not exist, rank 0
 //               writes "<param_file>.templ" and the calculation proceeds with the defaults.
 //  Grid/rbGrid: grids on which the random source and the odd-checkerboard vectors live
 //  latt       : gauge field imported into 'action' (expected to be initialised by the caller)
 //  action     : fermion operator; its gauge field is overwritten via ImportGauge
 //  rng        : parallel RNG used to draw the Gaussian starting vector
 template<typename FermionActionD, typename FermionFieldD>
 void computeEigenvalues(std::string param_file,
 			GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
 			FermionActionD &action, GridParallelRNG &rng){
  LanczosParameters params;
  if(fileExists(param_file)){
    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
    Grid::XmlReader rd(param_file);
    read(rd, "LanczosParameters", params);
  }else if(!GlobalSharedMemory::WorldRank){
    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
    Grid::XmlWriter wr(param_file + ".templ");
    write(wr, "LanczosParameters", params);
  }
  //Gaussian random starting vector restricted to the odd checkerboard
  FermionFieldD gauss_o(rbGrid);
  FermionFieldD gauss(Grid);
  gaussian(rng, gauss);
  pickCheckerboard(Odd, gauss_o, gauss);
  action.ImportGauge(latt);
  SchurDiagMooeeOperator<FermionActionD, FermionFieldD> hermop(action);
  PlainHermOp<FermionFieldD> hermop_wrap(hermop);
  //ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
  assert(params.mu == 0.0); //the plain Chebyshev used below takes no mu
  Chebyshev<FermionFieldD> Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1);
  FunctionHermOp<FermionFieldD> Cheb_wrap(Cheb, hermop);
  std::cout << "IRL: alpha=" << params.alpha << " beta=" << params.beta << " mu=" << params.mu << " ord=" << params.ord << std::endl;
  ImplicitlyRestartedLanczos<FermionFieldD> IRL(Cheb_wrap, hermop_wrap, params.n_stop, params.n_want, params.n_use, params.tolerance, 10000);
  std::vector<RealD> eval(params.n_use);
  std::vector<FermionFieldD> evec(params.n_use, rbGrid);
  int Nconv;
  IRL.calc(eval, evec, gauss_o, Nconv);
  //eval is handed to calc by mutable reference, so its length afterwards need not be n_use any
  //more (presumably it is resized to the number of converged vectors - TODO confirm).
  //Never index past what is actually there.
  const int n_avail = static_cast<int>(eval.size());
  const int n_print = (params.n_want < n_avail) ? params.n_want : n_avail;
  std::cout << "Eigenvalues:" << std::endl;
  for(int i=0;i<n_print;i++){
    std::cout << i << " " << eval[i] << std::endl;
  }
 }
 //Check the quality of the RHMC approx
 // Tuning aid: run InversePowerBoundsCheck for the action and MD rational approximations of
 // 'rhmc' (powers -1/inv_pow and -1/(2*inv_pow)) against both the numerator and the
 // denominator operator, logging the start and end of every individual check.
 //
 //  Grid/rbGrid : grids for the Gaussian source and its odd-checkerboard part
 //  latt        : gauge field imported into numOp and denOp (expected to be initialised)
 //  numOp/denOp : numerator / denominator fermion operators
 //  rhmc        : object providing ApproxNegPowerAction, ApproxNegHalfPowerAction,
 //                ApproxNegPowerMD and ApproxNegHalfPowerMD
 //  rng         : parallel RNG used to draw the Gaussian source
 //  inv_pow     : inverse power of the approximation
 //  quark_descr : label inserted into the log messages
 template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
 void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
 	       FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
 	       int inv_pow, const std::string &quark_descr){
  typedef typename FermionActionD::Impl_t FermionImplPolicyD;

  //Settings shared by every check below
  const int    max_iter  = 10000;
  const double loose_tol = 1e16;      //use large tolerance to prevent exit on fail; we are trying to tune here!
  const int    half_pow  = 2*inv_pow; //inverse power of the "half power" approximations

  //Random source on the odd checkerboard
  FermionFieldD src_odd(rbGrid);
  FermionFieldD src_full(Grid);
  gaussian(rng, src_full);
  pickCheckerboard(Odd, src_odd, src_full);

  numOp.ImportGauge(latt);
  denOp.ImportGauge(latt);
  SchurDifferentiableOperator<FermionImplPolicyD> MdagM(numOp);
  SchurDifferentiableOperator<FermionImplPolicyD> VdagV(denOp);

  //---- approximations used for the action ----
  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
  InversePowerBoundsCheck(inv_pow, max_iter, loose_tol, MdagM, src_odd, rhmc.ApproxNegPowerAction);
  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;

  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << half_pow << std::endl;
  InversePowerBoundsCheck(half_pow, max_iter, loose_tol, MdagM, src_odd, rhmc.ApproxNegHalfPowerAction);
  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << half_pow << std::endl;

  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
  InversePowerBoundsCheck(inv_pow, max_iter, loose_tol, VdagV, src_odd, rhmc.ApproxNegPowerAction);
  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;

  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << half_pow << std::endl;
  InversePowerBoundsCheck(half_pow, max_iter, loose_tol, VdagV, src_odd, rhmc.ApproxNegHalfPowerAction);
  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << half_pow << std::endl;

  std::cout << "-------------------------------------------------------------------------------" << std::endl;

  //---- approximations used for the MD ----
  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
  InversePowerBoundsCheck(inv_pow, max_iter, loose_tol, MdagM, src_odd, rhmc.ApproxNegPowerMD);
  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;

  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << half_pow << std::endl;
  InversePowerBoundsCheck(half_pow, max_iter, loose_tol, MdagM, src_odd, rhmc.ApproxNegHalfPowerMD);
  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << half_pow << std::endl;

  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
  InversePowerBoundsCheck(inv_pow, max_iter, loose_tol, VdagV, src_odd, rhmc.ApproxNegPowerMD);
  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;

  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << half_pow << std::endl;
  InversePowerBoundsCheck(half_pow, max_iter, loose_tol, VdagV, src_odd, rhmc.ApproxNegHalfPowerMD);
  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << half_pow << std::endl;
 }
 // Driver for the 2+1 flavour G-parity domain-wall + Iwasaki HMC, using the double-precision
 // rational pseudofermion action (DoublePrecRHMC) for both the light and strange quark quotients.
 //
 // Command line options handled here (Grid_init also receives argc/argv):
 //   --param_file <xml>     parameter file to read (default "params.xml")
 //   --read_check           initialise the gauge field and RNGs, then exit
 //   --tune_rhmc_l / _s     run checkRHMC on the light / strange quotient, then exit
 //   --eigenrange_l <xml>   run computeEigenvalues on the light numerator, then exit
 //   --eigenrange_s <xml>   run computeEigenvalues on the strange numerator, then exit
 // With none of the exit-early options the HMC itself is run.
 int main(int argc, char **argv) {
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
  std::string param_file = "params.xml";
  bool file_load_check = false;
  for(int i=1;i<argc;i++){
    std::string sarg(argv[i]);
    if(sarg == "--param_file"){
      assert(i!=argc-1); //the option needs a following value
      param_file = argv[i+1];
    }else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
      file_load_check = true;
    }
  }
  //Read the user parameters
  EvolParameters user_params;
  if(fileExists(param_file)){
    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
    Grid::XmlReader rd(param_file);
    read(rd, "Params", user_params);
  }else if(!GlobalSharedMemory::WorldRank){
    //NOTE(review): only the rank with WorldRank==0 enters this branch and exits; every other rank
    //falls through and carries on with the default parameters - confirm this is intended
    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
    Grid::XmlWriter wr(param_file + ".templ");
    write(wr, "Params", user_params);
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  //Check the parameters
  if(user_params.GparityDirs.size() != Nd-1){
    std::cerr << "Error in input parameters: expect GparityDirs to have size = " << Nd-1 << std::endl;
    exit(1);
  }
  for(int i=0;i<Nd-1;i++)
    if(user_params.GparityDirs[i] != 0 && user_params.GparityDirs[i] != 1){
      std::cerr << "Error in input parameters: expect GparityDirs values to be 0 (periodic) or 1 (G-parity)" << std::endl;
      exit(1);
    }
   // Typedefs to simplify notation
  typedef GparityDomainWallFermionD FermionActionD;
  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
  typedef typename FermionActionD::FermionField FermionFieldD;
  typedef GparityDomainWallFermionF FermionActionF;
  typedef typename FermionActionF::Impl_t FermionImplPolicyF;
  typedef typename FermionActionF::FermionField FermionFieldF;
  typedef GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD,FermionImplPolicyF> MixedPrecRHMC;
  typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicyD> DoublePrecRHMC;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
  typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
  typedef HMCWrapper::ImplPolicy GaugeImplPolicy;
  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps = user_params.Steps;
  MD.trajL   = 1.0;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = user_params.StartTrajectory;
  HMCparams.Trajectories     = user_params.Trajectories;
  HMCparams.NoMetropolisUntil= 0;
  HMCparams.StartingType     = user_params.StartingType;
  HMCparams.MetropolisTest = user_params.MetropolisTest;
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_lat";
  CPparams.rng_prefix    = "ckpoint_rng";
  CPparams.saveInterval  = user_params.SaveInterval;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  //Note that checkpointing saves the RNG state so that this initialization is required only for the very first configuration
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  typedef PlaquetteMod<GaugeImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  const int Ls      = 16;
  Real beta         = 2.13;
  Real light_mass   = 0.01;
  Real strange_mass = 0.032;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  //Setup the Grids (double precision from the HMC resources, single precision built by hand)
  auto GridPtrD   = TheHMC.Resources.GetCartesian();
  auto GridRBPtrD = TheHMC.Resources.GetRBCartesian();
  auto FGridD     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrD);
  auto FrbGridD   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrD);
  GridCartesian* GridPtrF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian* GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF);
  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF);
  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF);
  ConjugateIwasakiGaugeActionD GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeFieldD Ud(GridPtrD);
  LatticeGaugeFieldF Uf(GridPtrF);
  //Setup the BCs
  FermionActionD::ImplParams Params;
  for(int i=0;i<Nd-1;i++) Params.twists[i] = user_params.GparityDirs[i]; //G-parity directions
  Params.twists[Nd-1] = 1; //APBC in time direction
  std::vector<int> dirs4(Nd);
  for(int i=0;i<Nd-1;i++) dirs4[i] = user_params.GparityDirs[i];
  dirs4[Nd-1] = 0; //periodic gauge BC in time
  GaugeImplPolicy::setDirections(dirs4); //gauge BC
  //Run optional gauge field checksum checker and exit
  if(file_load_check){
    TheHMC.initializeGaugeFieldAndRNGs(Ud);
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1); //light quark + strange quark
  ActionLevel<HMCWrapper::Field> Level2(8); //gauge (8 increments per step)
  /////////////////////////////////////////////////////////////
  // Light action
  /////////////////////////////////////////////////////////////
  //The single-precision (F) operators are only referenced by the commented-out MixedPrecRHMC line below
  FermionActionD Numerator_lD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, light_mass,M5,Params);
  FermionActionD Denominator_lD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, pv_mass,M5,Params);
  FermionActionF Numerator_lF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, light_mass,M5,Params);
  FermionActionF Denominator_lF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, pv_mass,M5,Params);
  RationalActionParams rat_act_params_l;
  rat_act_params_l.inv_pow  = 2; // (M^dag M)^{1/2}
  rat_act_params_l.precision= 60;
  rat_act_params_l.MaxIter  = 10000;
  user_params.rat_quo_l.Export(rat_act_params_l); //fill bounds, degrees, tolerances and bounds-check frequency from the user input
  std::cout << GridLogMessage << " Light quark bounds check every " << rat_act_params_l.BoundsCheckFreq << " trajectories (avg)" << std::endl;
  //MixedPrecRHMC Quotient_l(Denominator_lD, Numerator_lD, Denominator_lF, Numerator_lF, rat_act_params_l, user_params.rat_quo_l.reliable_update_freq);
  DoublePrecRHMC Quotient_l(Denominator_lD, Numerator_lD, rat_act_params_l);
  Level1.push_back(&Quotient_l);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD,strange_mass,M5,Params);
  FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, pv_mass,M5,Params);
  FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,strange_mass,M5,Params);
  FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, pv_mass,M5,Params);
  RationalActionParams rat_act_params_s;
  rat_act_params_s.inv_pow  = 4; // (M^dag M)^{1/4}
  rat_act_params_s.precision= 60;
  rat_act_params_s.MaxIter  = 10000;
  user_params.rat_quo_s.Export(rat_act_params_s);
  //Report the strange-quark setting (rat_act_params_s), not the light-quark one
  std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_s.BoundsCheckFreq << " trajectories (avg)" << std::endl;
  //MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq);
  DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s);
  Level1.push_back(&Quotient_s);
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level2.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  //Action tuning: second pass over the command line for the tuning / eigenvalue-range options
  bool tune_rhmc_l=false, tune_rhmc_s=false, eigenrange_l=false, eigenrange_s=false;
  std::string lanc_params_l, lanc_params_s;
  for(int i=1;i<argc;i++){
    std::string sarg(argv[i]);
    if(sarg == "--tune_rhmc_l") tune_rhmc_l=true;
    else if(sarg == "--tune_rhmc_s") tune_rhmc_s=true;
    else if(sarg == "--eigenrange_l"){
      assert(i < argc-1); //the option needs a following Lanczos parameter file name
      eigenrange_l=true;
      lanc_params_l = argv[i+1];
    }
    else if(sarg == "--eigenrange_s"){
      assert(i < argc-1);
      eigenrange_s=true;
      lanc_params_s = argv[i+1];
    }
  }
  if(tune_rhmc_l || tune_rhmc_s || eigenrange_l || eigenrange_s){
    //Any tuning option replaces the HMC run: do the requested checks and exit
    TheHMC.initializeGaugeFieldAndRNGs(Ud);
    if(eigenrange_l) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_l, FGridD, FrbGridD, Ud, Numerator_lD, TheHMC.Resources.GetParallelRNG());
    if(eigenrange_s) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG());
    if(tune_rhmc_l) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_l)>(FGridD, FrbGridD, Ud, Numerator_lD, Denominator_lD, Quotient_l, TheHMC.Resources.GetParallelRNG(), 2, "light");
    if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_s)>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange");
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  //Run the HMC
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.Run();
  std::cout << GridLogMessage << " Done" << std::endl;
  Grid_finalize();
  return 0;
 } // main
--- a/HMC/Mobius2p1fIDSDRGparityEOFA.cc
+++ b/HMC/Mobius2p1fIDSDRGparityEOFA.cc
@ -0,0 +1,765 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./HMC/Mobius2p1fIDSDRGparityEOFA.cc
 Copyright (C) 2015-2016
 Author: Christopher Kelly <ckelly@bnl.gov>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 //We try to reproduce with G-parity BCs the 246 MeV 1.37 GeV ensemble
 //To speed things up we will use Mobius DWF with b+c=32/12 and Ls=12 to match the Ls=32 of the original
 //These parameters match those used in the 2020 K->pipi paper
 struct RatQuoParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters,
 				  double, bnd_lo,
 				  double, bnd_hi,
 				  Integer, action_degree,
 				  double, action_tolerance,
 				  Integer, md_degree,
 				  double, md_tolerance,
 				  Integer, reliable_update_freq,
 				  Integer, bnd_check_freq);
  RatQuoParameters() { 
    bnd_lo = 1e-2;
    bnd_hi = 30;
    action_degree = 10;
    action_tolerance = 1e-10;
    md_degree = 10;
    md_tolerance = 1e-8;
    bnd_check_freq = 20;
    reliable_update_freq = 50;
  }
  void Export(RationalActionParams &into) const{
    into.lo = bnd_lo;
    into.hi = bnd_hi;
    into.action_degree = action_degree;
    into.action_tolerance = action_tolerance;
    into.md_degree = md_degree;
    into.md_tolerance = md_tolerance;
    into.BoundsCheckFreq = bnd_check_freq;
  }
 };
 struct EOFAparameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(EOFAparameters,
 				  OneFlavourRationalParams, rat_params,
 				  double, action_tolerance,
 				  double, action_mixcg_inner_tolerance,
 				  double, md_tolerance,
 				  double, md_mixcg_inner_tolerance);
  EOFAparameters() { 
    action_mixcg_inner_tolerance = 1e-8;
    action_tolerance = 1e-10;
    md_tolerance = 1e-8;
    md_mixcg_inner_tolerance = 1e-8;
    rat_params.lo = 0.1;
    rat_params.hi = 25.0;
    rat_params.MaxIter  = 10000;
    rat_params.tolerance= 1.0e-9;
    rat_params.degree   = 14;
    rat_params.precision= 50;
  }
 };
 struct EvolParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters,
                                  Integer, StartTrajectory,
                                  Integer, Trajectories,
 				  Integer, SaveInterval,
 				  Integer, Steps,
                                  bool, MetropolisTest,
 				  std::string, StartingType,
 				  std::vector<Integer>, GparityDirs,
 				  EOFAparameters, eofa_l,
 				  RatQuoParameters, rat_quo_s,
 				  RatQuoParameters, rat_quo_DSDR);
  EvolParameters() {
    //For initial thermalization; afterwards user should switch Metropolis on and use StartingType=CheckpointStart
    MetropolisTest    = false;
    StartTrajectory   = 0;
    Trajectories      = 50;
    SaveInterval = 5;
    StartingType      = "ColdStart";
    GparityDirs.resize(3, 1); //1 for G-parity, 0 for periodic
    Steps = 5;
  }
 };
 //Return true when the file named 'fn' can be opened for reading, false otherwise
 bool fileExists(const std::string &fn){
  return std::ifstream(fn).good();
 }
 struct LanczosParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
 				  double, alpha,
 				  double, beta,
 				  double, mu,
 				  int, ord,
 				  int, n_stop,
 				  int, n_want,
 				  int, n_use,
 				  double, tolerance);
  LanczosParameters() {
    alpha = 35;
    beta = 5;
    mu = 0;
    ord = 100;
    n_stop = 10;
    n_want = 10;
    n_use = 15;
    tolerance = 1e-6;
  }
 };
 //Run a Chebyshev-accelerated implicitly restarted Lanczos on the Schur (DiagMooee) operator of 'action' and print the eigenvalues found.
 //  param_file : XML file holding a "LanczosParameters" node. If it does not exist, rank 0 writes a template to
 //               <param_file>.templ and the calculation proceeds with the default parameters.
 //  Grid/rbGrid: full and red-black grids on which the fermion fields live
 //  latt       : gauge field imported into 'action' before the operator is built
 //  action     : fermion operator whose spectrum is probed (its gauge field is overwritten by ImportGauge)
 //  rng        : parallel RNG used to generate the Gaussian starting vector
 template<typename FermionActionD, typename FermionFieldD>
 void computeEigenvalues(std::string param_file,
                         GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
                         FermionActionD &action, GridParallelRNG &rng){
  LanczosParameters params;
  if(fileExists(param_file)){
    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
    Grid::XmlReader rd(param_file);
    read(rd, "LanczosParameters", params);
  }else if(!GlobalSharedMemory::WorldRank){
    //Only rank 0 writes the template; every rank then carries on with the defaults
    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
    Grid::XmlWriter wr(param_file + ".templ");
    write(wr, "LanczosParameters", params);
  }

  //Gaussian starting vector restricted to the odd checkerboard
  FermionFieldD gauss_o(rbGrid);
  FermionFieldD gauss(Grid);
  gaussian(rng, gauss);
  pickCheckerboard(Odd, gauss_o, gauss);

  action.ImportGauge(latt);

  SchurDiagMooeeOperator<FermionActionD, FermionFieldD> hermop(action);
  PlainHermOp<FermionFieldD> hermop_wrap(hermop);
  //ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
  assert(params.mu == 0.0);
  //The plain Chebyshev is built from beta^2, alpha^2 and ord+1 in place of the ChebyshevLanczos form above
  Chebyshev<FermionFieldD> Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1);
  FunctionHermOp<FermionFieldD> Cheb_wrap(Cheb, hermop);

  std::cout << "IRL: alpha=" << params.alpha << " beta=" << params.beta << " mu=" << params.mu << " ord=" << params.ord << std::endl;
  ImplicitlyRestartedLanczos<FermionFieldD> IRL(Cheb_wrap, hermop_wrap, params.n_stop, params.n_want, params.n_use, params.tolerance, 10000);

  std::vector<RealD> eval(params.n_use);
  std::vector<FermionFieldD> evec(params.n_use, rbGrid);
  int Nconv;
  IRL.calc(eval, evec, gauss_o, Nconv);

  //Never index past the end of eval: it holds n_use entries on entry to calc, which may be fewer than n_want for an
  //inconsistent parameter file (NOTE(review): calc may also resize eval - confirm against the IRL implementation)
  const int n_avail = static_cast<int>(eval.size());
  const int n_print = (params.n_want < n_avail) ? params.n_want : n_avail;
  if(n_print < params.n_want)
    std::cout << GridLogMessage << " Only " << n_print << " of the " << params.n_want << " requested eigenvalues are available" << std::endl;

  std::cout << "Eigenvalues:" << std::endl;
  for(int i=0;i<n_print;i++){
    std::cout << i << " " << eval[i] << std::endl;
  }
 }
 //Check the quality of the RHMC approx
 //action_or_md toggles checking the action (0), MD (1) or both (2) setups
 //For both the numerator and denominator operators the power method estimate of the largest eigenvalue is printed, after which
 //the inverse-power bounds check is run for powers -1/inv_pow and -1/(2*inv_pow) using the approximations stored in 'rhmc'.
 template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
 void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
                FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
                int inv_pow, const std::string &quark_descr, int action_or_md){
  assert(action_or_md == 0 || action_or_md == 1 || action_or_md == 2);
  const bool check_action = (action_or_md == 0) || (action_or_md == 2);
  const bool check_md     = (action_or_md == 1) || (action_or_md == 2);
  const int twice_inv_pow = 2*inv_pow;

  //Gaussian source on the odd checkerboard, shared by every check below
  FermionFieldD noise_o(rbGrid);
  FermionFieldD noise(Grid);
  gaussian(rng, noise);
  pickCheckerboard(Odd, noise_o, noise);

  numOp.ImportGauge(latt);
  denOp.ImportGauge(latt);

  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
  SchurDifferentiableOperator<FermionImplPolicyD> MdagM(numOp);
  SchurDifferentiableOperator<FermionImplPolicyD> VdagV(denOp);

  //Upper bounds from the power method
  PowerMethod<FermionFieldD> power_method;

  std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " numerator" << std::endl;
  const RealD lambda_max_num = power_method(MdagM,noise_o);
  std::cout << GridLogMessage << "Got lambda_max "<<lambda_max_num<<std::endl;

  std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " denominator" << std::endl;
  const RealD lambda_max_den = power_method(VdagV,noise_o);
  std::cout << GridLogMessage << "Got lambda_max "<<lambda_max_den<<std::endl;

  //In all checks below a large tolerance (1e16) is used to prevent exit on fail; we are trying to tune here!
  if(check_action){
    std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
    InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM,noise_o, rhmc.ApproxNegPowerAction);
    std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;

    std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << twice_inv_pow << std::endl;
    InversePowerBoundsCheck(twice_inv_pow, 10000, 1e16, MdagM,noise_o, rhmc.ApproxNegHalfPowerAction);
    std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << twice_inv_pow << std::endl;

    std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
    InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV,noise_o, rhmc.ApproxNegPowerAction);
    std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;

    std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << twice_inv_pow << std::endl;
    InversePowerBoundsCheck(twice_inv_pow, 10000, 1e16, VdagV,noise_o, rhmc.ApproxNegHalfPowerAction);
    std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << twice_inv_pow << std::endl;
  }

  std::cout << "-------------------------------------------------------------------------------" << std::endl;

  if(check_md){
    std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
    InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM,noise_o, rhmc.ApproxNegPowerMD);
    std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;

    std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << twice_inv_pow << std::endl;
    InversePowerBoundsCheck(twice_inv_pow, 10000, 1e16, MdagM,noise_o, rhmc.ApproxNegHalfPowerMD);
    std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << twice_inv_pow << std::endl;

    std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
    InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV,noise_o, rhmc.ApproxNegPowerMD);
    std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;

    std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << twice_inv_pow << std::endl;
    InversePowerBoundsCheck(twice_inv_pow, 10000, 1e16, VdagV,noise_o, rhmc.ApproxNegHalfPowerMD);
    std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << twice_inv_pow << std::endl;
  }
 }
 //Exercise the checks built into the EOFA action: draw a Gaussian field scaled by sqrt(1/2), perform a
 //heatbath refresh with it on the gauge field 'latt' and then evaluate the action.
 template<typename FermionImplPolicy>
 void checkEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
                GridCartesian* FGrid, GridParallelRNG &rng, const LatticeGaugeFieldD &latt){
  typedef typename FermionImplPolicy::FermionField FermionField;

  std::cout << GridLogMessage << "Starting EOFA action/bounds check" << std::endl;

  FermionField eta(FGrid);
  gaussian(rng,eta);
  RealD inv_sqrt2 = std::sqrt(0.5);
  eta = eta * inv_sqrt2;

  //Use the inbuilt check
  EOFA.refresh(latt, eta);
  EOFA.S(latt);

  std::cout << GridLogMessage << "Finished EOFA upper action/bounds check" << std::endl;
 }
 //Adapter presenting the EOFA operator (Meofa) on a fixed gauge field as a LinearOperatorBase.
 //Only HermOp is implemented; every other entry point aborts via assert(0).
 template<typename FermionImplPolicy>
 class EOFAlinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA; //action providing Meofa
  LatticeGaugeFieldD &U;                                             //gauge field passed to Meofa
 public:
  typedef typename FermionImplPolicy::FermionField Field;

  EOFAlinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &_EOFA, LatticeGaugeFieldD &_U)
    : EOFA(_EOFA), U(_U)
  {}

  //The only supported operation: out = Meofa(U) in
  void HermOp(const Field &in, Field &out){ EOFA.Meofa(U, in, out); }

  //Unsupported operations
  void Op     (const Field &in, Field &out){ assert(0); }
  void AdjOp  (const Field &in, Field &out){ assert(0); }
  void OpDiag (const Field &in, Field &out){ assert(0); }
  void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
  void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
 };
 //Estimate the largest eigenvalue of the EOFA operator by running the power method on Meofa with a Gaussian start vector
 template<typename FermionImplPolicy>
 void upperBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
                     GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
  typedef typename FermionImplPolicy::FermionField FermionField;

  std::cout << GridLogMessage << "Starting EOFA upper bound compute" << std::endl;

  EOFAlinop<FermionImplPolicy> meofa(EOFA, latt);

  FermionField start(FGrid);
  gaussian(rng,start);

  PowerMethod<FermionField> power_method;
  auto lambda_max = power_method(meofa,start);

  std::cout << GridLogMessage << "Upper bound of EOFA operator " << lambda_max << std::endl;
 }
 //Applications of M^{-1} cost the same as M for EOFA!
 //Adapter presenting the inverse EOFA operator (MeofaInv) on a fixed gauge field as a LinearOperatorBase.
 //Only HermOp is implemented; every other entry point aborts via assert(0).
 template<typename FermionImplPolicy>
 class EOFAinvLinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA; //action providing MeofaInv
  LatticeGaugeFieldD &U;                                             //gauge field passed to MeofaInv
 public:
  typedef typename FermionImplPolicy::FermionField Field;

  EOFAinvLinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &_EOFA, LatticeGaugeFieldD &_U)
    : EOFA(_EOFA), U(_U)
  {}

  //The only supported operation: out = MeofaInv(U) in
  void HermOp(const Field &in, Field &out){ EOFA.MeofaInv(U, in, out); }

  //Unsupported operations
  void Op     (const Field &in, Field &out){ assert(0); }
  void AdjOp  (const Field &in, Field &out){ assert(0); }
  void OpDiag (const Field &in, Field &out){ assert(0); }
  void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
  void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
 };
 //Estimate the smallest eigenvalue of the EOFA operator: the power method is run on M^{-1} and the reciprocal of the result is printed
 template<typename FermionImplPolicy>
 void lowerBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
                     GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
  typedef typename FermionImplPolicy::FermionField FermionField;

  std::cout << GridLogMessage << "Starting EOFA lower bound compute using power method on M^{-1}. Inverse of highest eigenvalue is the lowest eigenvalue of M" << std::endl;

  EOFAinvLinop<FermionImplPolicy> meofa_inv(EOFA, latt);

  FermionField start(FGrid);
  gaussian(rng,start);

  PowerMethod<FermionField> power_method;
  auto lambda_max = power_method(meofa_inv,start);

  std::cout << GridLogMessage << "Lower bound of EOFA operator " << 1./lambda_max << std::endl;
 }
 NAMESPACE_BEGIN(Grid);
  //OperatorFunction wrapper that solves with a mixed-precision conjugate gradient.
  //It holds references to a double and a single precision copy of the same fermion operator and of their Schur operators.
  //On every call the single precision operator's gauge field is refreshed from the double precision one, after which a
  //MixedPrecisionConjugateGradient is constructed and applied.
  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
  class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
  public:
    typedef typename FermionOperatorD::FermionField FieldD;
    typedef typename FermionOperatorF::FermionField FieldF;

    using OperatorFunction<FieldD>::operator();

    RealD   Tolerance;
    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
    Integer MaxOuterIterations;
    GridBase* SinglePrecGrid4; //Grid for single-precision fields
    GridBase* SinglePrecGrid5; //Grid for single-precision fields
    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance

    FermionOperatorF &FermOpF;
    FermionOperatorD &FermOpD;
    SchurOperatorF &LinOpF;
    SchurOperatorD &LinOpD;

    //NB: these counters are initialised to zero but are not updated by operator() below
    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step

    //tol        : outer solver tolerance; also the initial value of InnerTolerance
    //maxinnerit : maximum iterations of the inner CG
    //maxouterit : maximum number of outer iterations (restarts)
    //_sp_grid4/_sp_grid5 : single precision grids; only _sp_grid5 is handed to the solver in operator()
    //_FermOpF/_FermOpD   : single/double precision fermion operators
    //_LinOpF/_LinOpD     : single/double precision Schur operators
    //The initialisers are listed in member declaration order, which is the order in which they actually run.
    MixedPrecisionConjugateGradientOperatorFunction(RealD tol, 
                                                    Integer maxinnerit, 
                                                    Integer maxouterit, 
                                                    GridBase* _sp_grid4, 
                                                    GridBase* _sp_grid5, 
                                                    FermionOperatorF &_FermOpF,
                                                    FermionOperatorD &_FermOpD,
                                                    SchurOperatorF   &_LinOpF,
                                                    SchurOperatorD   &_LinOpD): 
      Tolerance(tol), 
      InnerTolerance(tol), 
      MaxInnerIterations(maxinnerit), 
      MaxOuterIterations(maxouterit), 
      SinglePrecGrid4(_sp_grid4),
      SinglePrecGrid5(_sp_grid5),
      OuterLoopNormMult(100.),
      FermOpF(_FermOpF),
      FermOpD(_FermOpD),
      LinOpF(_LinOpF),
      LinOpD(_LinOpD),
      TotalInnerIterations(0),
      TotalOuterIterations(0),
      TotalFinalStepIterations(0)
    { 
    }

    //Solve for psi given src. LinOpU must be a SchurOperatorD wrapping the same matrix as LinOpD (asserted);
    //the solve itself uses the stored LinOpF/LinOpD pair.
    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;

      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));

      //Refresh the single precision gauge field (and its checkerboarded copies) from the double precision operator
      precisionChange(FermOpF.Umu, FermOpD.Umu);
      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);

      ////////////////////////////////////////////////////////////////////////////////////
      // Make a mixed precision conjugate gradient
      ////////////////////////////////////////////////////////////////////////////////////
      MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
      MPCG.InnerTolerance = InnerTolerance;
      std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
      MPCG(src,psi);
    }
  };
 NAMESPACE_END(Grid);
 int main(int argc, char **argv) {
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
  std::string param_file = "params.xml";
  bool file_load_check = false;
  for(int i=1;i<argc;i++){
    std::string sarg(argv[i]);
    if(sarg == "--param_file"){
      assert(i!=argc-1);
      param_file = argv[i+1];
    }else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
      file_load_check = true;
    }
  }
  //Read the user parameters
  EvolParameters user_params;
  if(fileExists(param_file)){
    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
    Grid::XmlReader rd(param_file);
    read(rd, "Params", user_params);
  }else if(!GlobalSharedMemory::WorldRank){
    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
    {
      Grid::XmlWriter wr(param_file + ".templ");
      write(wr, "Params", user_params);
    }
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  //Check the parameters
  if(user_params.GparityDirs.size() != Nd-1){
    std::cerr << "Error in input parameters: expect GparityDirs to have size = " << Nd-1 << std::endl;
    exit(1);
  }
  for(int i=0;i<Nd-1;i++)
    if(user_params.GparityDirs[i] != 0 && user_params.GparityDirs[i] != 1){
      std::cerr << "Error in input parameters: expect GparityDirs values to be 0 (periodic) or 1 (G-parity)" << std::endl;
      exit(1);
    }
  typedef GparityMobiusEOFAFermionD EOFAactionD;
  typedef GparityMobiusFermionD FermionActionD;
  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
  typedef typename FermionActionD::FermionField FermionFieldD;
  typedef GparityMobiusEOFAFermionF EOFAactionF;
  typedef GparityMobiusFermionF FermionActionF;
  typedef typename FermionActionF::Impl_t FermionImplPolicyF;
  typedef typename FermionActionF::FermionField FermionFieldF;
  typedef GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD,FermionImplPolicyF> MixedPrecRHMC;
  typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicyD> DoublePrecRHMC;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
  typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
  typedef HMCWrapper::ImplPolicy GaugeImplPolicy;
  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps = user_params.Steps;
  MD.trajL   = 1.0;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = user_params.StartTrajectory;
  HMCparams.Trajectories     = user_params.Trajectories;
  HMCparams.NoMetropolisUntil= 0;
  HMCparams.StartingType     = user_params.StartingType;
  HMCparams.MetropolisTest = user_params.MetropolisTest;
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_lat";
  CPparams.rng_prefix    = "ckpoint_rng";
  CPparams.saveInterval  = user_params.SaveInterval;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  //Note that checkpointing saves the RNG state so that this initialization is required only for the very first configuration
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  typedef PlaquetteMod<GaugeImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  const int Ls      = 12;
  Real beta         = 1.75;
  Real light_mass   = 0.0042; //240 MeV
  Real strange_mass = 0.045;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  RealD mobius_scale = 32./12.; //b+c
  RealD mob_bmc = 1.0;
  RealD mob_b = (mobius_scale + mob_bmc)/2.;
  RealD mob_c = (mobius_scale - mob_bmc)/2.;
  //Setup the Grids
  auto UGridD   = TheHMC.Resources.GetCartesian();
  auto UrbGridD = TheHMC.Resources.GetRBCartesian();
  auto FGridD     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);
  auto FrbGridD   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);
  GridCartesian* UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian* UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
  ConjugateIwasakiGaugeActionD GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeFieldD Ud(UGridD);
  LatticeGaugeFieldF Uf(UGridF);
  //Setup the BCs
  FermionActionD::ImplParams Params;
  for(int i=0;i<Nd-1;i++) Params.twists[i] = user_params.GparityDirs[i]; //G-parity directions
  Params.twists[Nd-1] = 1; //APBC in time direction
  std::vector<int> dirs4(Nd);
  for(int i=0;i<Nd-1;i++) dirs4[i] = user_params.GparityDirs[i];
  dirs4[Nd-1] = 0; //periodic gauge BC in time
  GaugeImplPolicy::setDirections(dirs4); //gauge BC
  //Run optional gauge field checksum checker and exit
  if(file_load_check){
    TheHMC.initializeGaugeFieldAndRNGs(Ud);
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1); //light quark + strange quark
  ActionLevel<HMCWrapper::Field> Level2(1); //DSDR
  ActionLevel<HMCWrapper::Field> Level3(8); //gauge (8 increments per step)
  /////////////////////////////////////////////////////////////
  // Light EOFA action
  // have to be careful with the parameters, cf. Test_dwf_gpforce_eofa.cc
  /////////////////////////////////////////////////////////////
  EOFAactionD LopD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, light_mass, light_mass, pv_mass, 0.0, -1, M5, mob_b, mob_c, Params);
  EOFAactionF LopF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, light_mass, light_mass, pv_mass, 0.0, -1, M5, mob_b, mob_c, Params);
  EOFAactionD RopD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, pv_mass, light_mass, pv_mass, -1.0, 1, M5, mob_b, mob_c, Params);
  EOFAactionF RopF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, pv_mass, light_mass, pv_mass, -1.0, 1, M5, mob_b, mob_c, Params);
  typedef SchurDiagMooeeOperator<EOFAactionD,FermionFieldD> EOFAschuropD;
  typedef SchurDiagMooeeOperator<EOFAactionF,FermionFieldF> EOFAschuropF;
  EOFAschuropD linopL_D(LopD);
  EOFAschuropD linopR_D(RopD);
  EOFAschuropF linopL_F(LopF);
  EOFAschuropF linopR_F(RopF);
  typedef MixedPrecisionConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_mxCG;
  EOFA_mxCG ActionMCG_L(user_params.eofa_l.action_tolerance, 10000, 1000, UGridF, FrbGridF, LopF, LopD, linopL_F, linopL_D);
  ActionMCG_L.InnerTolerance = user_params.eofa_l.action_mixcg_inner_tolerance;
  EOFA_mxCG ActionMCG_R(user_params.eofa_l.action_tolerance, 10000, 1000, UGridF, FrbGridF, RopF, RopD, linopR_F, linopR_D);
  ActionMCG_R.InnerTolerance = user_params.eofa_l.action_mixcg_inner_tolerance;
  EOFA_mxCG DerivMCG_L(user_params.eofa_l.md_tolerance, 10000, 1000, UGridF, FrbGridF, LopF, LopD, linopL_F, linopL_D);
  DerivMCG_L.InnerTolerance = user_params.eofa_l.md_mixcg_inner_tolerance;
  EOFA_mxCG DerivMCG_R(user_params.eofa_l.md_tolerance, 10000, 1000, UGridF, FrbGridF, RopF, RopD, linopR_F, linopR_D);
  DerivMCG_R.InnerTolerance = user_params.eofa_l.md_mixcg_inner_tolerance;
  std::cout << GridLogMessage << "Set EOFA action solver action tolerance outer=" << ActionMCG_L.Tolerance << " inner=" << ActionMCG_L.InnerTolerance << std::endl;
  std::cout << GridLogMessage << "Set EOFA MD solver tolerance outer=" << DerivMCG_L.Tolerance << " inner=" << DerivMCG_L.InnerTolerance << std::endl;
  ConjugateGradient<FermionFieldD>      ActionCG(user_params.eofa_l.action_tolerance, 10000);
  ConjugateGradient<FermionFieldD>  DerivativeCG(user_params.eofa_l.md_tolerance, 10000);
  // ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicyD> EOFA(LopD, RopD, 
  // 								   ActionCG, ActionCG, ActionCG, 
  // 								   DerivativeCG, DerivativeCG, 
  // 								   user_params.eofa_l.rat_params, true);
  // ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicyD> EOFA(LopD, RopD, 
  // 								   ActionMCG_L, ActionMCG_R, 
  // 								   ActionMCG_L, ActionMCG_R, 
  // 								   DerivMCG_L, DerivMCG_R, 
  // 								   user_params.eofa_l.rat_params, true);
  ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction<FermionImplPolicyD, FermionImplPolicyF> EOFA(LopF, RopF,
 													LopD, RopD, 
 													ActionMCG_L, ActionMCG_R, 
 													ActionMCG_L, ActionMCG_R, 
 													DerivMCG_L, DerivMCG_R, 
 													user_params.eofa_l.rat_params, true);
  Level1.push_back(&EOFA);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD,strange_mass,M5,mob_b,mob_c,Params);
  FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD, pv_mass,M5,mob_b,mob_c,Params);
  FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF,strange_mass,M5,mob_b,mob_c,Params);
  FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF, pv_mass,M5,mob_b,mob_c,Params);
  RationalActionParams rat_act_params_s;
  rat_act_params_s.inv_pow  = 4; // (M^dag M)^{1/4}
  rat_act_params_s.precision= 60;
  rat_act_params_s.MaxIter  = 10000;
  user_params.rat_quo_s.Export(rat_act_params_s);
  std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_s.BoundsCheckFreq << " trajectories (avg)" << std::endl;
  //MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq); 
  DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s); 
  Level1.push_back(&Quotient_s);  
  ///////////////////////////////////
  // DSDR action
  ///////////////////////////////////
  RealD dsdr_mass=-1.8;   
  //Use same DSDR twists as https://arxiv.org/pdf/1208.4412.pdf
  RealD dsdr_epsilon_f = 0.02; //numerator (in determinant)
  RealD dsdr_epsilon_b = 0.5; 
  GparityWilsonTMFermionD Numerator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_f, Params);
  GparityWilsonTMFermionF Numerator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_f, Params);
  GparityWilsonTMFermionD Denominator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_b, Params);
  GparityWilsonTMFermionF Denominator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_b, Params);
  RationalActionParams rat_act_params_DSDR;
  rat_act_params_DSDR.inv_pow  = 2; // (M^dag M)^{1/2}
  rat_act_params_DSDR.precision= 60;
  rat_act_params_DSDR.MaxIter  = 10000;
  user_params.rat_quo_DSDR.Export(rat_act_params_DSDR);
  std::cout << GridLogMessage << "DSDR quark bounds check every " << rat_act_params_DSDR.BoundsCheckFreq << " trajectories (avg)" << std::endl;
  DoublePrecRHMC Quotient_DSDR(Denominator_DSDR_D, Numerator_DSDR_D, rat_act_params_DSDR);
  Level2.push_back(&Quotient_DSDR);
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level3.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  TheHMC.TheAction.push_back(Level3);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  //Action tuning
  bool 
    tune_rhmc_s=false, eigenrange_s=false, 
    tune_rhmc_DSDR=false, eigenrange_DSDR=false, 
    check_eofa=false, 
    upper_bound_eofa=false, lower_bound_eofa(false);
  std::string lanc_params_s;
  std::string lanc_params_DSDR;
  int tune_rhmc_s_action_or_md;
  int tune_rhmc_DSDR_action_or_md;
  for(int i=1;i<argc;i++){
    std::string sarg(argv[i]);
    if(sarg == "--tune_rhmc_s"){
      assert(i < argc-1);
      tune_rhmc_s=true;
      tune_rhmc_s_action_or_md = std::stoi(argv[i+1]);
    }
    else if(sarg == "--eigenrange_s"){
      assert(i < argc-1);
      eigenrange_s=true;
      lanc_params_s = argv[i+1];
    }
    else if(sarg == "--tune_rhmc_DSDR"){
      assert(i < argc-1);
      tune_rhmc_DSDR=true;
      tune_rhmc_DSDR_action_or_md = std::stoi(argv[i+1]);
    }
    else if(sarg == "--eigenrange_DSDR"){
      assert(i < argc-1);
      eigenrange_DSDR=true;
      lanc_params_DSDR = argv[i+1];
    }
    else if(sarg == "--check_eofa") check_eofa = true;
    else if(sarg == "--upper_bound_eofa") upper_bound_eofa = true;
    else if(sarg == "--lower_bound_eofa") lower_bound_eofa = true;
  }
  if(tune_rhmc_s || eigenrange_s || tune_rhmc_DSDR || eigenrange_DSDR ||check_eofa || upper_bound_eofa || lower_bound_eofa) {
    std::cout << GridLogMessage << "Running checks" << std::endl;
    TheHMC.initializeGaugeFieldAndRNGs(Ud);
    std::cout << GridLogMessage << "EOFA action solver action tolerance outer=" << ActionMCG_L.Tolerance << " inner=" << ActionMCG_L.InnerTolerance << std::endl;
    std::cout << GridLogMessage << "EOFA MD solver tolerance outer=" << DerivMCG_L.Tolerance << " inner=" << DerivMCG_L.InnerTolerance << std::endl;
    if(check_eofa) checkEOFA(EOFA, FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
    if(upper_bound_eofa) upperBoundEOFA(EOFA, FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
    if(lower_bound_eofa) lowerBoundEOFA(EOFA, FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
    if(eigenrange_s) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG());
    if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_s)>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange",  tune_rhmc_s_action_or_md);
    if(eigenrange_DSDR) computeEigenvalues<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField>(lanc_params_DSDR, UGridD, UrbGridD, Ud, Numerator_DSDR_D, TheHMC.Resources.GetParallelRNG());
    if(tune_rhmc_DSDR) checkRHMC<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField, decltype(Quotient_DSDR)>(UGridD, UrbGridD, Ud, Numerator_DSDR_D, Denominator_DSDR_D, Quotient_DSDR, TheHMC.Resources.GetParallelRNG(), 2, "DSDR", tune_rhmc_DSDR_action_or_md);
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  //Run the HMC
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.Run();
  std::cout << GridLogMessage << " Done" << std::endl;
  Grid_finalize();
  return 0;
 } // main
--- a/HMC/Mobius2p1fIDSDRGparityEOFA_40ID.cc
+++ b/HMC/Mobius2p1fIDSDRGparityEOFA_40ID.cc
@ -0,0 +1,918 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./HMC/Mobius2p1fIDSDRGparityEOFA_40ID.cc
 Copyright (C) 2015-2016
 Author: Christopher Kelly <ckelly@bnl.gov>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 //Production binary for the 40ID G-parity ensemble
 //User-settable parameters of a rational-quotient (RHMC) pseudofermion action.
 //The member list is handed to the serialization macro, so its order is kept exactly as it was.
 struct RatQuoParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters,
                                  double, bnd_lo,
                                  double, bnd_hi,
                                  Integer, action_degree,
                                  double, action_tolerance,
                                  Integer, md_degree,
                                  double, md_tolerance,
                                  Integer, reliable_update_freq,
                                  Integer, bnd_check_freq);

  //Default values, assigned in the same order as the members are declared
  RatQuoParameters() {
    //Bounds (exported as RationalActionParams::lo / ::hi)
    bnd_lo = 1e-2;
    bnd_hi = 30;

    //Degree and tolerance used for the action evaluation
    action_degree = 10;
    action_tolerance = 1e-10;

    //Degree and tolerance used for the MD evolution
    md_degree = 10;
    md_tolerance = 1e-8;

    //Frequencies
    reliable_update_freq = 50;
    bnd_check_freq = 20;
  }

  //Copy the bounds, degrees, tolerances and bounds-check frequency into 'into'.
  //reliable_update_freq is not written by this function; all other fields of 'into' are left untouched.
  void Export(RationalActionParams &into) const{
    into.lo = bnd_lo;
    into.hi = bnd_hi;

    into.action_degree    = action_degree;
    into.action_tolerance = action_tolerance;

    into.md_degree    = md_degree;
    into.md_tolerance = md_tolerance;

    into.BoundsCheckFreq = bnd_check_freq;
  }
 };
 //User-settable parameters of one EOFA pseudofermion action: the rational approximation parameters
 //together with the solver tolerances used for the action evaluation and for the MD evolution.
 //The member list is handed to the serialization macro, so its order is kept exactly as it was.
 struct EOFAparameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(EOFAparameters,
                                  OneFlavourRationalParams, rat_params,
                                  double, action_tolerance,
                                  double, action_mixcg_inner_tolerance,
                                  double, md_tolerance,
                                  double, md_mixcg_inner_tolerance);

  //Default values, assigned in the same order as the members are declared
  EOFAparameters() {
    //Rational approximation
    rat_params.lo       = 1.0;
    rat_params.hi       = 25.0;
    rat_params.MaxIter  = 50000;
    rat_params.tolerance= 1.0e-9;
    rat_params.degree   = 14;
    rat_params.precision= 50;

    //Solver tolerances for the action evaluation
    action_tolerance = 1e-10;
    action_mixcg_inner_tolerance = 1e-8;

    //Solver tolerances for the MD evolution
    md_tolerance = 1e-8;
    md_mixcg_inner_tolerance = 1e-8;
  }
 };
 //Top-level user parameters of the evolution, read from the "Params" block of the XML parameter file.
 //The member list is handed to the serialization macro, so its order is kept exactly as it was.
 struct EvolParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters,
                                  Integer, StartTrajectory,
                                  Integer, Trajectories,
                                  Integer, SaveInterval,
                                  Integer, Steps,
                                  RealD, TrajectoryLength,
                                  bool, MetropolisTest,
                                  std::string, StartingType,
                                  std::vector<Integer>, GparityDirs,
                                  std::vector<EOFAparameters>, eofa_l,
                                  RatQuoParameters, rat_quo_s,
                                  RatQuoParameters, rat_quo_DSDR);

  //Default values, assigned in the same order as the members are declared.
  //eofa_l, rat_quo_s and rat_quo_DSDR are not assigned here and keep their default-constructed state.
  EvolParameters() {
    StartTrajectory   = 0;
    Trajectories      = 50;
    SaveInterval      = 5;
    Steps             = 5;
    TrajectoryLength  = 1.0;

    //For initial thermalization; afterwards user should switch Metropolis on and use StartingType=CheckpointStart
    MetropolisTest    = false;
    StartingType      = "ColdStart";

    GparityDirs.resize(3, 1); //1 for G-parity, 0 for periodic
  }
 };
 //Return true if a file named 'fn' can be opened for reading, false otherwise
 bool fileExists(const std::string &fn){
  return std::ifstream(fn).good();
 }
 //Parameters of the Lanczos eigenvalue computation, read from the "LanczosParameters" XML block.
 //As consumed by computeEigenvalues in this file:
 //  alpha, beta : their squares are passed as the 2nd and 1st arguments of the Chebyshev constructor
 //  mu          : asserted to be zero
 //  ord         : ord+1 is passed as the 3rd argument of the Chebyshev constructor
 //  n_stop, n_want, n_use, tolerance : passed in this order to the ImplicitlyRestartedLanczos constructor;
 //                n_use is also the initial size of the eigenvalue/eigenvector containers
 //The member list is handed to the serialization macro, so its order is kept exactly as it was.
 struct LanczosParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
                                  double, alpha,
                                  double, beta,
                                  double, mu,
                                  int, ord,
                                  int, n_stop,
                                  int, n_want,
                                  int, n_use,
                                  double, tolerance);

  //Default values
  LanczosParameters() {
    //Chebyshev polynomial
    alpha = 35.0;
    beta  = 5.0;
    mu    = 0.0;
    ord   = 100;

    //Lanczos vector counts
    n_stop = 10;
    n_want = 10;
    n_use  = 15;

    tolerance = 1e-6;
  }
 };
 //Compute eigenvalues of the SchurDiagMooeeOperator built from 'action' with a Chebyshev-filtered
 //implicitly restarted Lanczos, starting from a Gaussian vector on the odd checkerboard, and print them.
 //  param_file  : XML file with a "LanczosParameters" block. If it does not exist, world rank 0 writes a
 //                template to <param_file>.templ and the calculation proceeds with the default parameters
 //  Grid/rbGrid : full and red-black grids on which the fermion fields are allocated
 //  latt        : gauge field imported into 'action'
 //  action      : fermion operator; its gauge field is overwritten by ImportGauge(latt)
 //  rng         : parallel RNG used to draw the Gaussian starting vector
 template<typename FermionActionD, typename FermionFieldD>
 void computeEigenvalues(std::string param_file,
                         GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
                         FermionActionD &action, GridParallelRNG &rng){
  LanczosParameters params;
  if(fileExists(param_file)){
    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
    Grid::XmlReader rd(param_file);
    read(rd, "LanczosParameters", params);
  }else if(!GlobalSharedMemory::WorldRank){
    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
    Grid::XmlWriter wr(param_file + ".templ");
    write(wr, "LanczosParameters", params);
  }

  //Gaussian starting vector restricted to the odd checkerboard
  FermionFieldD gauss_o(rbGrid);
  FermionFieldD gauss(Grid);
  gaussian(rng, gauss);
  pickCheckerboard(Odd, gauss_o, gauss);

  action.ImportGauge(latt);

  SchurDiagMooeeOperator<FermionActionD, FermionFieldD> hermop(action);
  PlainHermOp<FermionFieldD> hermop_wrap(hermop);
  //ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
  assert(params.mu == 0.0);
  Chebyshev<FermionFieldD> Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1);
  FunctionHermOp<FermionFieldD> Cheb_wrap(Cheb, hermop);

  std::cout << "IRL: alpha=" << params.alpha << " beta=" << params.beta << " mu=" << params.mu << " ord=" << params.ord << std::endl;
  ImplicitlyRestartedLanczos<FermionFieldD> IRL(Cheb_wrap, hermop_wrap, params.n_stop, params.n_want, params.n_use, params.tolerance, 50000);

  std::vector<RealD> eval(params.n_use);
  std::vector<FermionFieldD> evec(params.n_use, rbGrid);
  int Nconv = 0;
  IRL.calc(eval, evec, gauss_o, Nconv);

  //eval is handed to IRL.calc as a mutable output; presumably the solver may resize it (e.g. down to
  //n_stop), so bound the print loop by its actual size as well as by n_want to never read past the end
  const int n_avail = static_cast<int>(eval.size());
  std::cout << "Eigenvalues:" << std::endl;
  for(int i=0; i<params.n_want && i<n_avail; i++){
    std::cout << i << " " << eval[i] << std::endl;
  }
 }
 //Check the quality of the RHMC approx
 //action_or_md toggles checking the action (0), MD (1) or both (2) setups
 //
 //The gauge field 'latt' is imported into both numOp and denOp. A power-method estimate of the largest
 //eigenvalue is first printed for the numerator and the denominator operator; then, for each selected
 //setup, InversePowerBoundsCheck is run for powers -1/inv_pow and -1/(2*inv_pow) on both operators.
 template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
 void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
                FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
                int inv_pow, const std::string &quark_descr, int action_or_md){
  assert(action_or_md == 0 || action_or_md == 1 || action_or_md == 2);
  const bool do_action = (action_or_md == 0 || action_or_md == 2);
  const bool do_md     = (action_or_md == 1 || action_or_md == 2);

  //Gaussian test vector on the odd checkerboard
  FermionFieldD gauss_o(rbGrid);
  FermionFieldD gauss(Grid);
  gaussian(rng, gauss);
  pickCheckerboard(Odd, gauss_o, gauss);

  numOp.ImportGauge(latt);
  denOp.ImportGauge(latt);

  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
  SchurDifferentiableOperator<FermionImplPolicyD> MdagM(numOp);
  SchurDifferentiableOperator<FermionImplPolicyD> VdagV(denOp);

  //Power-method estimates of the upper bound
  PowerMethod<FermionFieldD> power_method;
  std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " numerator" << std::endl;
  const RealD lambda_max_num = power_method(MdagM,gauss_o);
  std::cout << GridLogMessage << "Got lambda_max "<<lambda_max_num<<std::endl;

  std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " denominator" << std::endl;
  const RealD lambda_max_den = power_method(VdagV,gauss_o);
  std::cout << GridLogMessage << "Got lambda_max "<<lambda_max_den<<std::endl;

  //Run a single bounds check bracketed by its Starting/Finished log lines.
  //'what' is the message text between the "Starting: "/"Finished: " prefix and the printed power.
  auto run_check = [&](const std::string &what, int pow, auto &linop, auto &approx){
    std::cout << "Starting: " << what << pow << std::endl;
    InversePowerBoundsCheck(pow, 50000, 1e16, linop, gauss_o, approx); //use large tolerance to prevent exit on fail; we are trying to tune here!
    std::cout << "Finished: " << what << pow << std::endl;
  };

  if(do_action){
    const std::string num_msg = "Checking quality of RHMC action approx for " + quark_descr + " quark numerator and power -1/";
    const std::string den_msg = "Checking quality of RHMC action approx for " + quark_descr + " quark denominator and power -1/";
    run_check(num_msg,   inv_pow, MdagM, rhmc.ApproxNegPowerAction);
    run_check(num_msg, 2*inv_pow, MdagM, rhmc.ApproxNegHalfPowerAction);
    run_check(den_msg,   inv_pow, VdagV, rhmc.ApproxNegPowerAction);
    run_check(den_msg, 2*inv_pow, VdagV, rhmc.ApproxNegHalfPowerAction);
  }

  std::cout << "-------------------------------------------------------------------------------" << std::endl;

  if(do_md){
    const std::string num_msg = "Checking quality of RHMC MD approx for " + quark_descr + " quark numerator and power -1/";
    const std::string den_msg = "Checking quality of RHMC MD approx for " + quark_descr + " quark denominator and power -1/";
    run_check(num_msg,   inv_pow, MdagM, rhmc.ApproxNegPowerMD);
    run_check(num_msg, 2*inv_pow, MdagM, rhmc.ApproxNegHalfPowerMD);
    run_check(den_msg,   inv_pow, VdagV, rhmc.ApproxNegPowerMD);
    run_check(den_msg, 2*inv_pow, VdagV, rhmc.ApproxNegHalfPowerMD);
  }
 }
 //Exercise the EOFA action's inbuilt action/bounds check: a Gaussian field scaled by sqrt(0.5) is passed
 //to EOFA.refresh and the action EOFA.S is then evaluated on the same gauge field.
 //  EOFA  : the pseudofermion action to check
 //  FGrid : grid on which the random field is allocated
 //  rng   : parallel RNG used to draw the Gaussian field
 //  latt  : gauge field on which refresh and S are evaluated
 template<typename FermionImplPolicy>
 void checkEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
                GridCartesian* FGrid, GridParallelRNG &rng, const LatticeGaugeFieldD &latt){
  std::cout << GridLogMessage << "Starting EOFA action/bounds check" << std::endl;
  typename FermionImplPolicy::FermionField eta(FGrid);
  RealD scale = std::sqrt(0.5);
  gaussian(rng,eta); eta = eta * scale;

  //Use the inbuilt check
  EOFA.refresh(latt, eta);
  EOFA.S(latt);
  //Closing message now mirrors the opening one (it previously contained a stray "upper")
  std::cout << GridLogMessage << "Finished EOFA action/bounds check" << std::endl;
 }
 //Wraps an EOFA pseudofermion action and a gauge field as a linear operator whose HermOp applies
 //EOFA.Meofa. Every other entry point is unsupported and hits assert(0).
 //Both the action and the gauge field are held by reference.
 template<typename FermionImplPolicy>
 class EOFAlinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA;
  LatticeGaugeFieldD &U;
 public:
  typedef typename FermionImplPolicy::FermionField Field;

  EOFAlinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &eofa_action, LatticeGaugeFieldD &gauge)
    : EOFA(eofa_action), U(gauge)
  {}

  //The only supported operation: out = Meofa(U) in
  void HermOp(const Field &in, Field &out){
    EOFA.Meofa(U, in, out);
  }

  //Unsupported operations
  void OpDiag(const Field &in, Field &out){ assert(0); }
  void OpDir(const Field &in, Field &out, int dir, int disp){ assert(0); }
  void OpDirAll(const Field &in, std::vector<Field> &out){ assert(0); }
  void Op(const Field &in, Field &out){ assert(0); }
  void AdjOp(const Field &in, Field &out){ assert(0); }
  void HermOpAndNorm(const Field &in, Field &out, RealD &n1, RealD &n2){ assert(0); }
 };
 //Print the value returned by the power method applied to the EOFA operator (EOFAlinop) on the gauge
 //field 'latt', starting from a Gaussian random vector allocated on FGrid.
 template<typename FermionImplPolicy>
 void upperBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
                     GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
  typedef typename FermionImplPolicy::FermionField FermionField;

  std::cout << GridLogMessage << "Starting EOFA upper bound compute" << std::endl;

  EOFAlinop<FermionImplPolicy> eofa_op(EOFA, latt);

  //Random starting vector
  FermionField start_vec(FGrid);
  gaussian(rng, start_vec);

  PowerMethod<FermionField> power_method;
  auto lambda_max = power_method(eofa_op, start_vec);

  std::cout << GridLogMessage << "Upper bound of EOFA operator " << lambda_max << std::endl;
 }
 //Applications of M^{-1} cost the same as M for EOFA!
 //Wraps an EOFA pseudofermion action and a gauge field as a linear operator whose HermOp applies
 //EOFA.MeofaInv. Every other entry point is unsupported and hits assert(0).
 //Both the action and the gauge field are held by reference.
 template<typename FermionImplPolicy>
 class EOFAinvLinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA;
  LatticeGaugeFieldD &U;
 public:
  typedef typename FermionImplPolicy::FermionField Field;

  EOFAinvLinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &eofa_action, LatticeGaugeFieldD &gauge)
    : EOFA(eofa_action), U(gauge)
  {}

  //The only supported operation: out = MeofaInv(U) in
  void HermOp(const Field &in, Field &out){
    EOFA.MeofaInv(U, in, out);
  }

  //Unsupported operations
  void OpDiag(const Field &in, Field &out){ assert(0); }
  void OpDir(const Field &in, Field &out, int dir, int disp){ assert(0); }
  void OpDirAll(const Field &in, std::vector<Field> &out){ assert(0); }
  void Op(const Field &in, Field &out){ assert(0); }
  void AdjOp(const Field &in, Field &out){ assert(0); }
  void HermOpAndNorm(const Field &in, Field &out, RealD &n1, RealD &n2){ assert(0); }
 };
 //Print the reciprocal of the value returned by the power method applied to the inverse EOFA operator
 //(EOFAinvLinop) on the gauge field 'latt', starting from a Gaussian random vector allocated on FGrid.
 template<typename FermionImplPolicy>
 void lowerBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
                     GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
  typedef typename FermionImplPolicy::FermionField FermionField;

  std::cout << GridLogMessage << "Starting EOFA lower bound compute using power method on M^{-1}. Inverse of highest eigenvalue is the lowest eigenvalue of M" << std::endl;

  EOFAinvLinop<FermionImplPolicy> eofa_inv_op(EOFA, latt);

  //Random starting vector
  FermionField start_vec(FGrid);
  gaussian(rng, start_vec);

  PowerMethod<FermionField> power_method;
  auto lambda_max_inv = power_method(eofa_inv_op, start_vec);

  std::cout << GridLogMessage << "Lower bound of EOFA operator " << 1./lambda_max_inv << std::endl;
 }
 NAMESPACE_BEGIN(Grid);
  //OperatorFunction wrapper that solves with a MixedPrecisionConjugateGradient.
  //On each call the double precision gauge field of FermOpD is converted to single precision and
  //installed (with its even/odd checkerboards) in FermOpF before the solver is invoked with the stored
  //single/double precision Schur operators.
  //All operators are held by reference.
  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
  class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
  public:
    typedef typename FermionOperatorD::FermionField FieldD;
    typedef typename FermionOperatorF::FermionField FieldF;

    using OperatorFunction<FieldD>::operator();

    RealD   Tolerance;
    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
    Integer MaxOuterIterations;
    GridBase* SinglePrecGrid4; //Grid for single-precision fields
    GridBase* SinglePrecGrid5; //Grid for single-precision fields
    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
                             //NB: stored (default 100) but not forwarded to the solver by operator() below

    FermionOperatorF &FermOpF;
    FermionOperatorD &FermOpD;
    SchurOperatorF &LinOpF;
    SchurOperatorD &LinOpD;

    //Iteration counters. They are zero-initialized by the constructor and are not updated by operator()
    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step

    //  tol                  : outer tolerance; also the initial value of InnerTolerance
    //  maxinnerit/maxouterit: iteration limits handed to the solver
    //  _sp_grid4/_sp_grid5  : single precision grids (only the 5d one is passed to the solver)
    //  _FermOpF/_FermOpD    : single/double precision fermion operators
    //  _LinOpF/_LinOpD      : single/double precision Schur operators
    //The initializers are listed in member declaration order, which is the order in which they execute
    MixedPrecisionConjugateGradientOperatorFunction(RealD tol, 
                                                    Integer maxinnerit, 
                                                    Integer maxouterit, 
                                                    GridBase* _sp_grid4, 
                                                    GridBase* _sp_grid5, 
                                                    FermionOperatorF &_FermOpF,
                                                    FermionOperatorD &_FermOpD,
                                                    SchurOperatorF   &_LinOpF,
                                                    SchurOperatorD   &_LinOpD): 
      Tolerance(tol), 
      InnerTolerance(tol), 
      MaxInnerIterations(maxinnerit), 
      MaxOuterIterations(maxouterit), 
      SinglePrecGrid4(_sp_grid4),
      SinglePrecGrid5(_sp_grid5),
      OuterLoopNormMult(100.),
      FermOpF(_FermOpF),
      FermOpD(_FermOpD),
      LinOpF(_LinOpF),
      LinOpD(_LinOpD),
      TotalInnerIterations(0),
      TotalOuterIterations(0),
      TotalFinalStepIterations(0)
    { 
    }

    //Solve for psi given src. LinOpU must wrap the same matrix as the stored LinOpD (checked by assert);
    //the solve itself uses the stored LinOpF/LinOpD rather than LinOpU.
    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;

      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));

      //Refresh the single precision gauge field from the double precision one
      precisionChange(FermOpF.Umu, FermOpD.Umu);
      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);

      ////////////////////////////////////////////////////////////////////////////////////
      // Make a mixed precision conjugate gradient
      ////////////////////////////////////////////////////////////////////////////////////
      MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
      MPCG.InnerTolerance = InnerTolerance;
      std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
      MPCG(src,psi);
    }
  };
  //OperatorFunction wrapper that solves with a ConjugateGradientReliableUpdate.
  //On each call the double precision gauge field of FermOpD is converted to single precision and
  //installed (with its even/odd checkerboards) in FermOpF before the solver is invoked with the stored
  //single/double precision Schur operators.
  //All operators are held by reference.
  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
  class MixedPrecisionReliableUpdateConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
  public:
    typedef typename FermionOperatorD::FermionField FieldD;
    typedef typename FermionOperatorF::FermionField FieldF;

    using OperatorFunction<FieldD>::operator();

    RealD Tolerance;
    Integer MaxIterations;
    RealD Delta; //reliable update parameter
    GridBase* SinglePrecGrid4; //Grid for single-precision fields
    GridBase* SinglePrecGrid5; //Grid for single-precision fields

    FermionOperatorF &FermOpF;
    FermionOperatorD &FermOpD;
    SchurOperatorF &LinOpF;
    SchurOperatorD &LinOpD;

    //  tol                 : solver tolerance
    //  delta               : reliable update parameter
    //  maxit               : iteration limit handed to the solver
    //  _sp_grid4/_sp_grid5 : single precision grids (only the 5d one is passed to the solver)
    //  _FermOpF/_FermOpD   : single/double precision fermion operators
    //  _LinOpF/_LinOpD     : single/double precision Schur operators
    //The initializers are listed in member declaration order, which is the order in which they execute
    MixedPrecisionReliableUpdateConjugateGradientOperatorFunction(RealD tol, 
                                                                  RealD delta,
                                                                  Integer maxit, 
                                                                  GridBase* _sp_grid4, 
                                                                  GridBase* _sp_grid5, 
                                                                  FermionOperatorF &_FermOpF,
                                                                  FermionOperatorD &_FermOpD,
                                                                  SchurOperatorF   &_LinOpF,
                                                                  SchurOperatorD   &_LinOpD): 
      Tolerance(tol), 
      MaxIterations(maxit), 
      Delta(delta),
      SinglePrecGrid4(_sp_grid4),
      SinglePrecGrid5(_sp_grid5),
      FermOpF(_FermOpF),
      FermOpD(_FermOpD),
      LinOpF(_LinOpF),
      LinOpD(_LinOpD)
    { 
    }

    //Solve for psi given src. LinOpU must wrap the same matrix as the stored LinOpD (checked by assert);
    //the solve itself uses the stored LinOpF/LinOpD rather than LinOpU.
    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
      std::cout << GridLogMessage << " Mixed precision reliable CG update wrapper operator() "<<std::endl;

      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));

      //Refresh the single precision gauge field from the double precision one
      precisionChange(FermOpF.Umu, FermOpD.Umu);
      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);

      ////////////////////////////////////////////////////////////////////////////////////
      // Make a mixed precision reliable update conjugate gradient
      ////////////////////////////////////////////////////////////////////////////////////
      ConjugateGradientReliableUpdate<FieldD,FieldF> MPCG(Tolerance,MaxIterations,Delta,SinglePrecGrid5,LinOpF,LinOpD);
      std::cout << GridLogMessage << "Calling mixed precision reliable update Conjugate Gradient" <<std::endl;
      MPCG(src,psi);
    }
  };
 NAMESPACE_END(Grid);
 int main(int argc, char **argv) {
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
  std::string param_file = "params.xml";
  bool file_load_check = false;
  std::string serial_seeds = "1 2 3 4 5";
  std::string parallel_seeds = "6 7 8 9 10";
  int i=1;
  while(i < argc){
    std::string sarg(argv[i]);
    if(sarg == "--param_file"){
      assert(i!=argc-1);
      param_file = argv[i+1];
      i+=2;
    }else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
      file_load_check = true;
      i++;
    }else if(sarg == "--set_seeds"){ //set the rng seeds. Expects two vector args, e.g.  --set_seeds 1.2.3.4 5.6.7.8
      assert(i < argc-2);
      std::vector<int> tmp;
      GridCmdOptionIntVector(argv[i+1],tmp);
      {
 	std::stringstream ss;
 	for(int j=0;j<tmp.size()-1;j++) ss << tmp[j] << " ";
 	ss << tmp.back();
 	serial_seeds = ss.str();
      }
      GridCmdOptionIntVector(argv[i+2],tmp);
      {
 	std::stringstream ss;
 	for(int j=0;j<tmp.size()-1;j++) ss << tmp[j] << " ";
 	ss << tmp.back();
 	parallel_seeds = ss.str();
      }
      i+=3;
      std::cout << GridLogMessage << "Set serial seeds to " << serial_seeds << std::endl;
      std::cout << GridLogMessage << "Set parallel seeds to " << parallel_seeds << std::endl;
    }else{
      i++;
    }
  }
  //Read the user parameters
  EvolParameters user_params;
  if(fileExists(param_file)){
    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
    Grid::XmlReader rd(param_file);
    read(rd, "Params", user_params);
  }else if(!GlobalSharedMemory::WorldRank){
    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
    {
      Grid::XmlWriter wr(param_file + ".templ");
      write(wr, "Params", user_params);
    }
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  //Check the parameters
  if(user_params.GparityDirs.size() != Nd-1){
    std::cerr << "Error in input parameters: expect GparityDirs to have size = " << Nd-1 << std::endl;
    exit(1);
  }
  for(int i=0;i<Nd-1;i++)
    if(user_params.GparityDirs[i] != 0 && user_params.GparityDirs[i] != 1){
      std::cerr << "Error in input parameters: expect GparityDirs values to be 0 (periodic) or 1 (G-parity)" << std::endl;
      exit(1);
    }
  typedef GparityMobiusEOFAFermionD EOFAactionD;
  typedef GparityMobiusFermionD FermionActionD;
  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
  typedef typename FermionActionD::FermionField FermionFieldD;
  typedef GparityMobiusEOFAFermionF EOFAactionF;
  typedef GparityMobiusFermionF FermionActionF;
  typedef typename FermionActionF::Impl_t FermionImplPolicyF;
  typedef typename FermionActionF::FermionField FermionFieldF;
  typedef GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD,FermionImplPolicyF> MixedPrecRHMC;
  typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicyD> DoublePrecRHMC;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
  typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
  MD.name    = std::string("MinimumNorm2");
  // typedef ConjugateHMCRunnerD<ForceGradient> HMCWrapper;
  // MD.name    = std::string("ForceGradient");
  MD.MDsteps = user_params.Steps;
  MD.trajL   = user_params.TrajectoryLength;
  typedef HMCWrapper::ImplPolicy GaugeImplPolicy;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = user_params.StartTrajectory;
  HMCparams.Trajectories     = user_params.Trajectories;
  HMCparams.NoMetropolisUntil= 0;
  HMCparams.StartingType     = user_params.StartingType;
  HMCparams.MetropolisTest = user_params.MetropolisTest;
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_lat";
  CPparams.rng_prefix    = "ckpoint_rng";
  CPparams.saveInterval  = user_params.SaveInterval;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  //Note that checkpointing saves the RNG state so that this initialization is required only for the very first configuration
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = serial_seeds;
  RNGpar.parallel_seeds = parallel_seeds;
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  typedef PlaquetteMod<GaugeImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  //aiming for ainv=1.723 GeV
  //                                  me         bob
  //Estimated  a(ml+mres) [40ID] = 0.001305    0.00131
  //           a(mh+mres) [40ID] = 0.035910    0.03529
  //Estimate Ls=12, b+c=2  mres~0.0011
  //1/24/2022 initial mres measurement gives mres=0.001,  adjusted light quark mass to 0.0003 from 0.0001
  const int Ls      = 12;
  Real beta         = 1.848;
  Real light_mass   = 0.0003;
  Real strange_mass = 0.0342;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  RealD mobius_scale = 2.; //b+c
  RealD mob_bmc = 1.0;
  RealD mob_b = (mobius_scale + mob_bmc)/2.;
  RealD mob_c = (mobius_scale - mob_bmc)/2.;
  std::cout << GridLogMessage
 	    << "Ensemble parameters:" << std::endl
 	    << "Ls=" << Ls << std::endl
 	    << "beta=" << beta << std::endl
 	    << "light_mass=" << light_mass << std::endl
 	    << "strange_mass=" << strange_mass << std::endl
 	    << "mobius_scale=" << mobius_scale << std::endl;
  //Setup the Grids
  auto UGridD   = TheHMC.Resources.GetCartesian();
  auto UrbGridD = TheHMC.Resources.GetRBCartesian();
  auto FGridD     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);
  auto FrbGridD   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);
  GridCartesian* UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian* UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
  ConjugateIwasakiGaugeActionD GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeFieldD Ud(UGridD);
  LatticeGaugeFieldF Uf(UGridF);
  //Setup the BCs
  FermionActionD::ImplParams Params;
  for(int i=0;i<Nd-1;i++) Params.twists[i] = user_params.GparityDirs[i]; //G-parity directions
  Params.twists[Nd-1] = 1; //APBC in time direction
  std::vector<int> dirs4(Nd);
  for(int i=0;i<Nd-1;i++) dirs4[i] = user_params.GparityDirs[i];
  dirs4[Nd-1] = 0; //periodic gauge BC in time
  GaugeImplPolicy::setDirections(dirs4); //gauge BC
  //Run optional gauge field checksum checker and exit
  if(file_load_check){
    TheHMC.initializeGaugeFieldAndRNGs(Ud);
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1); //light quark + strange quark
  ActionLevel<HMCWrapper::Field> Level2(4); //DSDR
  ActionLevel<HMCWrapper::Field> Level3(2); //gauge
  /////////////////////////////////////////////////////////////
  // Light EOFA action
  // have to be careful with the parameters, cf. Test_dwf_gpforce_eofa.cc
  /////////////////////////////////////////////////////////////
  typedef SchurDiagMooeeOperator<EOFAactionD,FermionFieldD> EOFAschuropD;
  typedef SchurDiagMooeeOperator<EOFAactionF,FermionFieldF> EOFAschuropF;
  typedef ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction<FermionImplPolicyD, FermionImplPolicyF> EOFAmixPrecPFaction;
  typedef MixedPrecisionConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_mxCG;
  typedef MixedPrecisionReliableUpdateConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_relupCG;
  std::vector<RealD> eofa_light_masses = { light_mass ,  0.004,   0.016,   0.064,   0.256    };
  std::vector<RealD> eofa_pv_masses =    { 0.004       , 0.016,   0.064,   0.256,   1.0      };
  int n_light_hsb = 5;
  assert(user_params.eofa_l.size() == n_light_hsb);
  EOFAmixPrecPFaction* EOFA_pfactions[n_light_hsb];
  for(int i=0;i<n_light_hsb;i++){
    RealD iml = eofa_light_masses[i];
    RealD ipv = eofa_pv_masses[i];
    EOFAactionD* LopD = new EOFAactionD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, iml, iml, ipv, 0.0, -1, M5, mob_b, mob_c, Params);
    EOFAactionF* LopF = new EOFAactionF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, iml, iml, ipv, 0.0, -1, M5, mob_b, mob_c, Params);
    EOFAactionD* RopD = new EOFAactionD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, ipv, iml, ipv, -1.0, 1, M5, mob_b, mob_c, Params);
    EOFAactionF* RopF = new EOFAactionF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, ipv, iml, ipv, -1.0, 1, M5, mob_b, mob_c, Params);
    EOFAschuropD* linopL_D = new EOFAschuropD(*LopD);
    EOFAschuropD* linopR_D = new EOFAschuropD(*RopD);
    EOFAschuropF* linopL_F = new EOFAschuropF(*LopF);
    EOFAschuropF* linopR_F = new EOFAschuropF(*RopF);
 #if 1
    //Note reusing user_params.eofa_l.action(|md)_mixcg_inner_tolerance  as Delta for now
    EOFA_relupCG* ActionMCG_L = new EOFA_relupCG(user_params.eofa_l[i].action_tolerance, user_params.eofa_l[i].action_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
    EOFA_relupCG* ActionMCG_R = new EOFA_relupCG(user_params.eofa_l[i].action_tolerance, user_params.eofa_l[i].action_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
    EOFA_relupCG* DerivMCG_L = new EOFA_relupCG(user_params.eofa_l[i].md_tolerance, user_params.eofa_l[i].md_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
    EOFA_relupCG* DerivMCG_R = new EOFA_relupCG(user_params.eofa_l[i].md_tolerance, user_params.eofa_l[i].md_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
 #else
    EOFA_mxCG* ActionMCG_L = new EOFA_mxCG(user_params.eofa_l[i].action_tolerance, 50000, 1000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
    ActionMCG_L->InnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance;
    EOFA_mxCG* ActionMCG_R = new EOFA_mxCG(user_params.eofa_l[i].action_tolerance, 50000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
    ActionMCG_R->InnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance;
    EOFA_mxCG* DerivMCG_L = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 50000, 1000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
    DerivMCG_L->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance;
    EOFA_mxCG* DerivMCG_R = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 50000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
    DerivMCG_R->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance;
    std::cout << GridLogMessage << "Set EOFA action solver action tolerance outer=" << ActionMCG_L->Tolerance << " inner=" << ActionMCG_L->InnerTolerance << std::endl;
    std::cout << GridLogMessage << "Set EOFA MD solver tolerance outer=" << DerivMCG_L->Tolerance << " inner=" << DerivMCG_L->InnerTolerance << std::endl;
 #endif
    EOFAmixPrecPFaction* EOFA = new EOFAmixPrecPFaction(*LopF, *RopF,
 							*LopD, *RopD, 
 							*ActionMCG_L, *ActionMCG_R, 
 							*ActionMCG_L, *ActionMCG_R, 
 							*DerivMCG_L, *DerivMCG_R, 
 							user_params.eofa_l[i].rat_params, true);
    EOFA_pfactions[i] = EOFA;
    Level1.push_back(EOFA);
  }
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD,strange_mass,M5,mob_b,mob_c,Params);
  FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD, pv_mass,M5,mob_b,mob_c,Params);
  FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF,strange_mass,M5,mob_b,mob_c,Params);
  FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF, pv_mass,M5,mob_b,mob_c,Params);
  RationalActionParams rat_act_params_s;
  rat_act_params_s.inv_pow  = 4; // (M^dag M)^{1/4}
  rat_act_params_s.precision= 60;
  rat_act_params_s.MaxIter  = 50000;
  user_params.rat_quo_s.Export(rat_act_params_s);
  std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_s.BoundsCheckFreq << " trajectories (avg)" << std::endl;
  //MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq); 
  DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s); 
  Level1.push_back(&Quotient_s);  
  ///////////////////////////////////
  // DSDR action
  ///////////////////////////////////
  RealD dsdr_mass=-1.8;   
  //Use same DSDR twists as https://arxiv.org/pdf/1208.4412.pdf
  RealD dsdr_epsilon_f = 0.02; //numerator (in determinant)
  RealD dsdr_epsilon_b = 0.5; 
  GparityWilsonTMFermionD Numerator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_f, Params);
  GparityWilsonTMFermionF Numerator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_f, Params);
  GparityWilsonTMFermionD Denominator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_b, Params);
  GparityWilsonTMFermionF Denominator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_b, Params);
  RationalActionParams rat_act_params_DSDR;
  rat_act_params_DSDR.inv_pow  = 2; // (M^dag M)^{1/2}
  rat_act_params_DSDR.precision= 60;
  rat_act_params_DSDR.MaxIter  = 50000;
  user_params.rat_quo_DSDR.Export(rat_act_params_DSDR);
  std::cout << GridLogMessage << "DSDR quark bounds check every " << rat_act_params_DSDR.BoundsCheckFreq << " trajectories (avg)" << std::endl;
  DoublePrecRHMC Quotient_DSDR(Denominator_DSDR_D, Numerator_DSDR_D, rat_act_params_DSDR);
  Level2.push_back(&Quotient_DSDR);
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level3.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  TheHMC.TheAction.push_back(Level3);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  //Action tuning
  bool 
    tune_rhmc_s=false, eigenrange_s=false, 
    tune_rhmc_DSDR=false, eigenrange_DSDR=false, 
    check_eofa=false, 
    upper_bound_eofa=false, lower_bound_eofa(false);
  std::string lanc_params_s;
  std::string lanc_params_DSDR;
  int tune_rhmc_s_action_or_md;
  int tune_rhmc_DSDR_action_or_md;
  int eofa_which_hsb;
  for(int i=1;i<argc;i++){
    std::string sarg(argv[i]);
    if(sarg == "--tune_rhmc_s"){
      assert(i < argc-1);
      tune_rhmc_s=true;
      tune_rhmc_s_action_or_md = std::stoi(argv[i+1]);
    }
    else if(sarg == "--eigenrange_s"){
      assert(i < argc-1);
      eigenrange_s=true;
      lanc_params_s = argv[i+1];
    }
    else if(sarg == "--tune_rhmc_DSDR"){
      assert(i < argc-1);
      tune_rhmc_DSDR=true;
      tune_rhmc_DSDR_action_or_md = std::stoi(argv[i+1]);
    }
    else if(sarg == "--eigenrange_DSDR"){
      assert(i < argc-1);
      eigenrange_DSDR=true;
      lanc_params_DSDR = argv[i+1];
    }
    else if(sarg == "--check_eofa"){
      assert(i < argc-1);
      check_eofa = true;
      eofa_which_hsb = std::stoi(argv[i+1]); //-1 indicates all hasenbusch
      assert(eofa_which_hsb == -1 || (eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb) );
    }
    else if(sarg == "--upper_bound_eofa"){
      assert(i < argc-1);
      upper_bound_eofa = true;
      eofa_which_hsb = std::stoi(argv[i+1]);
      assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
    }
    else if(sarg == "--lower_bound_eofa"){
      assert(i < argc-1);
      lower_bound_eofa = true;      
      eofa_which_hsb = std::stoi(argv[i+1]);
      assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
    }
  }
  if(tune_rhmc_s || eigenrange_s || tune_rhmc_DSDR || eigenrange_DSDR ||check_eofa || upper_bound_eofa || lower_bound_eofa) {
    std::cout << GridLogMessage << "Running checks" << std::endl;
    TheHMC.initializeGaugeFieldAndRNGs(Ud);
    //std::cout << GridLogMessage << "EOFA action solver action tolerance outer=" << ActionMCG_L.Tolerance << " inner=" << ActionMCG_L.InnerTolerance << std::endl;
    //std::cout << GridLogMessage << "EOFA MD solver tolerance outer=" << DerivMCG_L.Tolerance << " inner=" << DerivMCG_L.InnerTolerance << std::endl;
    if(check_eofa){
      if(eofa_which_hsb >= 0){
 	std::cout << GridLogMessage << "Starting checking EOFA Hasenbusch " << eofa_which_hsb << std::endl;
 	checkEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
 	std::cout << GridLogMessage << "Finished checking EOFA Hasenbusch " << eofa_which_hsb << std::endl;
      }else{
 	for(int i=0;i<n_light_hsb;i++){
 	  std::cout << GridLogMessage << "Starting checking EOFA Hasenbusch " << i << std::endl;
 	  checkEOFA(*EOFA_pfactions[i], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
 	  std::cout << GridLogMessage << "Finished checking EOFA Hasenbusch " << i << std::endl;
 	}
      }
    }	  
    if(upper_bound_eofa) upperBoundEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
    if(lower_bound_eofa) lowerBoundEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
    if(eigenrange_s) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG());
    if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_s)>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange",  tune_rhmc_s_action_or_md);
    if(eigenrange_DSDR) computeEigenvalues<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField>(lanc_params_DSDR, UGridD, UrbGridD, Ud, Numerator_DSDR_D, TheHMC.Resources.GetParallelRNG());
    if(tune_rhmc_DSDR) checkRHMC<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField, decltype(Quotient_DSDR)>(UGridD, UrbGridD, Ud, Numerator_DSDR_D, Denominator_DSDR_D, Quotient_DSDR, TheHMC.Resources.GetParallelRNG(), 2, "DSDR", tune_rhmc_DSDR_action_or_md);
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  //Run the HMC
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.Run();
  std::cout << GridLogMessage << " Done" << std::endl;
  Grid_finalize();
  return 0;
 } // main
--- a/HMC/Mobius2p1fIDSDRGparityEOFA_48ID.cc
+++ b/HMC/Mobius2p1fIDSDRGparityEOFA_48ID.cc
@ -0,0 +1,873 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./HMC/Mobius2p1fIDSDRGparityEOFA_48ID.cc
 Copyright (C) 2015-2016
 Author: Christopher Kelly <ckelly@bnl.gov>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 //Production binary for the 48ID G-parity ensemble
 struct RatQuoParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters,
 				  double, bnd_lo,
 				  double, bnd_hi,
 				  Integer, action_degree,
 				  double, action_tolerance,
 				  Integer, md_degree,
 				  double, md_tolerance,
 				  Integer, reliable_update_freq,
 				  Integer, bnd_check_freq);
  RatQuoParameters() { 
    bnd_lo = 1e-2;
    bnd_hi = 30;
    action_degree = 10;
    action_tolerance = 1e-10;
    md_degree = 10;
    md_tolerance = 1e-8;
    bnd_check_freq = 20;
    reliable_update_freq = 50;
  }
  void Export(RationalActionParams &into) const{
    into.lo = bnd_lo;
    into.hi = bnd_hi;
    into.action_degree = action_degree;
    into.action_tolerance = action_tolerance;
    into.md_degree = md_degree;
    into.md_tolerance = md_tolerance;
    into.BoundsCheckFreq = bnd_check_freq;
  }
 };
 struct EOFAparameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(EOFAparameters,
 				  OneFlavourRationalParams, rat_params,
 				  double, action_tolerance,
 				  double, action_mixcg_inner_tolerance,
 				  double, md_tolerance,
 				  double, md_mixcg_inner_tolerance);
  EOFAparameters() { 
    action_mixcg_inner_tolerance = 1e-8;
    action_tolerance = 1e-10;
    md_tolerance = 1e-8;
    md_mixcg_inner_tolerance = 1e-8;
    rat_params.lo = 1.0;
    rat_params.hi = 25.0;
    rat_params.MaxIter  = 10000;
    rat_params.tolerance= 1.0e-9;
    rat_params.degree   = 14;
    rat_params.precision= 50;
  }
 };
 struct EvolParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters,
                                  Integer, StartTrajectory,
                                  Integer, Trajectories,
 				  Integer, SaveInterval,
 				  Integer, Steps,
 				  RealD, TrajectoryLength,
                                  bool, MetropolisTest,
 				  std::string, StartingType,
 				  std::vector<Integer>, GparityDirs,
 				  std::vector<EOFAparameters>, eofa_l,
 				  RatQuoParameters, rat_quo_s,
 				  RatQuoParameters, rat_quo_DSDR);
  EvolParameters() {
    //For initial thermalization; afterwards user should switch Metropolis on and use StartingType=CheckpointStart
    MetropolisTest    = false;
    StartTrajectory   = 0;
    Trajectories      = 50;
    SaveInterval = 5;
    StartingType      = "ColdStart";
    GparityDirs.resize(3, 1); //1 for G-parity, 0 for periodic
    Steps = 5;
    TrajectoryLength = 1.0;
  }
 };
 //Report whether the file named fn could be opened for reading (stream is in a good state after opening).
 bool fileExists(const std::string &fn){
   return std::ifstream(fn).good();
 }
 struct LanczosParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
 				  double, alpha,
 				  double, beta,
 				  double, mu,
 				  int, ord,
 				  int, n_stop,
 				  int, n_want,
 				  int, n_use,
 				  double, tolerance);
  LanczosParameters() {
    alpha = 35;
    beta = 5;
    mu = 0;
    ord = 100;
    n_stop = 10;
    n_want = 10;
    n_use = 15;
    tolerance = 1e-6;
  }
 };
 //Run a Chebyshev-filtered implicitly restarted Lanczos on the SchurDiagMooee operator of 'action'
 //and print the resulting eigenvalues.
 //  param_file : XML file holding a "LanczosParameters" block. If it does not exist, world rank 0 writes
 //               a template to param_file + ".templ" and the run continues with the default parameters
 //  Grid/rbGrid: full and red-black grids on which the random source is built
 //  latt       : gauge field imported into 'action'
 //  action     : fermion operator (its gauge field is overwritten via ImportGauge)
 //  rng        : parallel RNG used for the gaussian starting vector
 template<typename FermionActionD, typename FermionFieldD>
 void computeEigenvalues(std::string param_file,
                         GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
                         FermionActionD &action, GridParallelRNG &rng){
   LanczosParameters params;
   if(fileExists(param_file)){
     std::cout << GridLogMessage << " Reading " << param_file << std::endl;
     Grid::XmlReader rd(param_file);
     read(rd, "LanczosParameters", params);
   }else if(!GlobalSharedMemory::WorldRank){
     std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
     std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
     Grid::XmlWriter wr(param_file + ".templ");
     write(wr, "LanczosParameters", params);
   }
   //Gaussian starting vector restricted to the odd checkerboard
   FermionFieldD gauss_o(rbGrid);
   FermionFieldD gauss(Grid);
   gaussian(rng, gauss);
   pickCheckerboard(Odd, gauss_o, gauss);
   action.ImportGauge(latt);
   SchurDiagMooeeOperator<FermionActionD, FermionFieldD> hermop(action);
   PlainHermOp<FermionFieldD> hermop_wrap(hermop);
   //ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
   //The plain Chebyshev used below has no mu argument, hence only mu==0 is accepted
   assert(params.mu == 0.0);
   //NOTE(review): arguments are presumably (lower edge, upper edge, order) of the filter window - confirm
   Chebyshev<FermionFieldD> Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1);
   FunctionHermOp<FermionFieldD> Cheb_wrap(Cheb, hermop);
   std::cout << "IRL: alpha=" << params.alpha << " beta=" << params.beta << " mu=" << params.mu << " ord=" << params.ord << std::endl;
   //NOTE(review): the trailing 10000 is presumably the restart/iteration cap - confirm
   ImplicitlyRestartedLanczos<FermionFieldD> IRL(Cheb_wrap, hermop_wrap, params.n_stop, params.n_want, params.n_use, params.tolerance, 10000);
   std::vector<RealD> eval(params.n_use);
   std::vector<FermionFieldD> evec(params.n_use, rbGrid);
   int Nconv;
   IRL.calc(eval, evec, gauss_o, Nconv);
   //eval is passed by non-const reference and may come back with a different size
   //(NOTE(review): IRL.calc is believed to shrink it to the converged set - confirm).
   //Never index past what is actually stored, e.g. when n_want exceeds n_use or the converged count.
   const int n_avail = static_cast<int>(eval.size());
   const int n_print = (params.n_want < n_avail) ? params.n_want : n_avail;
   if(n_print < params.n_want){
     std::cout << GridLogMessage << " Only " << n_print << " of the " << params.n_want << " requested eigenvalues are available" << std::endl;
   }
   std::cout << "Eigenvalues:" << std::endl;
   for(int i=0;i<n_print;i++){
     std::cout << i << " " << eval[i] << std::endl;
   }
 }
 //Check the quality of the RHMC approx
 //action_or_md toggles checking the action (0), MD (1) or both (2) setups
 //  Grid/rbGrid : full and red-black grids used to build the gaussian test vector (odd checkerboard)
 //  latt        : gauge field imported into both operators
 //  numOp/denOp : numerator and denominator fermion operators (their gauge fields are overwritten)
 //  rhmc        : action object providing ApproxNeg{,Half}Power{Action,MD}
 //  inv_pow     : the checks are run for powers -1/inv_pow and -1/(2*inv_pow)
 //  quark_descr : label used in the log output
 //First the power method is run on the numerator and denominator operators, then the
 //requested bounds checks are run in the order numerator, denominator.
 template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
 void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
                FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
                int inv_pow, const std::string &quark_descr, int action_or_md){
   assert(action_or_md == 0 || action_or_md == 1 || action_or_md == 2);
   FermionFieldD gauss_o(rbGrid);
   FermionFieldD gauss(Grid);
   gaussian(rng, gauss);
   pickCheckerboard(Odd, gauss_o, gauss);
   numOp.ImportGauge(latt);
   denOp.ImportGauge(latt);
   typedef typename FermionActionD::Impl_t FermionImplPolicyD;
   typedef SchurDifferentiableOperator<FermionImplPolicyD> SchurOp;
   typedef decltype(rhmc.ApproxNegPowerAction) ApproxType;
   SchurOp MdagM(numOp);
   SchurOp VdagV(denOp);
   PowerMethod<FermionFieldD> power_method;
   RealD lambda_max;
   std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " numerator" << std::endl;
   lambda_max = power_method(MdagM,gauss_o);
   std::cout << GridLogMessage << "Got lambda_max "<<lambda_max<<std::endl;
   std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " denominator" << std::endl;
   lambda_max = power_method(VdagV,gauss_o);
   std::cout << GridLogMessage << "Got lambda_max "<<lambda_max<<std::endl;
   //One bounds check bracketed by its Starting/Finished log lines.
   //approx_kind is "action" or "MD", num_or_den is "numerator" or "denominator".
   auto run_check = [&](const char* approx_kind, const char* num_or_den, int power, SchurOp &linop, ApproxType &approx){
     std::cout << "Starting: Checking quality of RHMC " << approx_kind << " approx for " << quark_descr << " quark " << num_or_den << " and power -1/" << power << std::endl;
     InversePowerBoundsCheck(power, 10000, 1e16, linop, gauss_o, approx); //use large tolerance to prevent exit on fail; we are trying to tune here!
     std::cout << "Finished: Checking quality of RHMC " << approx_kind << " approx for " << quark_descr << " quark " << num_or_den << " and power -1/" << power << std::endl;
   };
   if(action_or_md == 0 || action_or_md == 2){
     run_check("action", "numerator",   inv_pow,   MdagM, rhmc.ApproxNegPowerAction);
     run_check("action", "numerator",   2*inv_pow, MdagM, rhmc.ApproxNegHalfPowerAction);
     run_check("action", "denominator", inv_pow,   VdagV, rhmc.ApproxNegPowerAction);
     run_check("action", "denominator", 2*inv_pow, VdagV, rhmc.ApproxNegHalfPowerAction);
   }
   std::cout << "-------------------------------------------------------------------------------" << std::endl;
   if(action_or_md == 1 || action_or_md == 2){
     run_check("MD", "numerator",   inv_pow,   MdagM, rhmc.ApproxNegPowerMD);
     run_check("MD", "numerator",   2*inv_pow, MdagM, rhmc.ApproxNegHalfPowerMD);
     run_check("MD", "denominator", inv_pow,   VdagV, rhmc.ApproxNegPowerMD);
     run_check("MD", "denominator", 2*inv_pow, VdagV, rhmc.ApproxNegHalfPowerMD);
   }
 }
 //Exercise the EOFA action on a random pseudofermion source: a gaussian field scaled by sqrt(1/2)
 //is passed to EOFA.refresh, after which the action EOFA.S is evaluated on the same gauge field.
 //  EOFA  : the EOFA pseudofermion action under test
 //  FGrid : grid on which the random field is created
 //  rng   : parallel RNG for the gaussian field
 //  latt  : gauge field handed to refresh() and S()
 template<typename FermionImplPolicy>
 void checkEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
                GridCartesian* FGrid, GridParallelRNG &rng, const LatticeGaugeFieldD &latt){
   std::cout << GridLogMessage << "Starting EOFA action/bounds check" << std::endl;
   typename FermionImplPolicy::FermionField eta(FGrid);
   RealD scale = std::sqrt(0.5);
   gaussian(rng,eta); eta = eta * scale;
   //Use the inbuilt check
   EOFA.refresh(latt, eta);
   EOFA.S(latt);
   //Closing message mirrors the opening one (it previously carried a stray "upper")
   std::cout << GridLogMessage << "Finished EOFA action/bounds check" << std::endl;
 }
 //Adapter presenting the EOFA operator through the LinearOperatorBase interface.
 //Only HermOp is implemented: it forwards to EOFA.Meofa on the referenced gauge field.
 //All other entry points assert.
 template<typename FermionImplPolicy>
 class EOFAlinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
 public:
   typedef typename FermionImplPolicy::FermionField Field;
 private:
   ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA; //action supplying Meofa
   LatticeGaugeFieldD &U;                                            //gauge field handed to Meofa
 public:
   //Both arguments are held by reference
   EOFAlinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &_EOFA, LatticeGaugeFieldD &_U):
     EOFA(_EOFA), U(_U)
   {}
   //The only supported operation
   void HermOp(const Field &in, Field &out){ EOFA.Meofa(U, in, out); }
   //Unsupported parts of the interface
   void Op     (const Field &in, Field &out){ assert(0); }
   void AdjOp  (const Field &in, Field &out){ assert(0); }
   void OpDiag (const Field &in, Field &out){ assert(0); }
   void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
   void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
 };
 //Apply the power method to the EOFA operator (via EOFAlinop) starting from a gaussian vector
 //and log the value it returns as the upper bound.
 template<typename FermionImplPolicy>
 void upperBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
                     GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
   typedef typename FermionImplPolicy::FermionField Field_t;
   std::cout << GridLogMessage << "Starting EOFA upper bound compute" << std::endl;
   EOFAlinop<FermionImplPolicy> meofa(EOFA, latt);
   //Random starting vector
   Field_t start(FGrid);
   gaussian(rng, start);
   PowerMethod<Field_t> power_iter;
   auto largest = power_iter(meofa, start);
   std::cout << GridLogMessage << "Upper bound of EOFA operator " << largest << std::endl;
 }
 //Adapter presenting the inverse EOFA operator through the LinearOperatorBase interface.
 //Applications of M^{-1} cost the same as M for EOFA!
 //Only HermOp is implemented: it forwards to EOFA.MeofaInv on the referenced gauge field.
 //All other entry points assert.
 template<typename FermionImplPolicy>
 class EOFAinvLinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
 public:
   typedef typename FermionImplPolicy::FermionField Field;
 private:
   ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA; //action supplying MeofaInv
   LatticeGaugeFieldD &U;                                            //gauge field handed to MeofaInv
 public:
   //Both arguments are held by reference
   EOFAinvLinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &_EOFA, LatticeGaugeFieldD &_U):
     EOFA(_EOFA), U(_U)
   {}
   //The only supported operation
   void HermOp(const Field &in, Field &out){ EOFA.MeofaInv(U, in, out); }
   //Unsupported parts of the interface
   void Op     (const Field &in, Field &out){ assert(0); }
   void AdjOp  (const Field &in, Field &out){ assert(0); }
   void OpDiag (const Field &in, Field &out){ assert(0); }
   void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
   void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
 };
 //Apply the power method to the inverse EOFA operator (via EOFAinvLinop) starting from a gaussian
 //vector and log the reciprocal of the value it returns as the lower bound of the EOFA operator.
 template<typename FermionImplPolicy>
 void lowerBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
                     GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
   typedef typename FermionImplPolicy::FermionField Field_t;
   std::cout << GridLogMessage << "Starting EOFA lower bound compute using power method on M^{-1}. Inverse of highest eigenvalue is the lowest eigenvalue of M" << std::endl;
   EOFAinvLinop<FermionImplPolicy> meofa_inv(EOFA, latt);
   //Random starting vector
   Field_t start(FGrid);
   gaussian(rng, start);
   PowerMethod<Field_t> power_iter;
   auto largest_of_inverse = power_iter(meofa_inv, start);
   std::cout << GridLogMessage << "Lower bound of EOFA operator " << 1./largest_of_inverse << std::endl;
 }
 NAMESPACE_BEGIN(Grid);
  //OperatorFunction wrapper that solves with a mixed-precision conjugate gradient.
  //On every call the double precision gauge field of FermOpD is converted into FermOpF (including its
  //even/odd checkerboards) and a MixedPrecisionConjugateGradient is run with LinOpF / LinOpD.
  //The linear operator passed to operator() must wrap the same matrix as LinOpD (asserted).
  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
  class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
  public:
    typedef typename FermionOperatorD::FermionField FieldD;
    typedef typename FermionOperatorF::FermionField FieldF;
    using OperatorFunction<FieldD>::operator();
    RealD   Tolerance;
    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
    Integer MaxOuterIterations;
    GridBase* SinglePrecGrid4; //Grid for single-precision fields
    GridBase* SinglePrecGrid5; //Grid for single-precision fields
    //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
    //NOTE(review): this value is stored but not forwarded to the solver built in operator() - confirm intent
    RealD OuterLoopNormMult;
    FermionOperatorF &FermOpF;
    FermionOperatorD &FermOpD;
    SchurOperatorF &LinOpF;
    SchurOperatorD &LinOpD;
    //Iteration counters; zero-initialised and not updated by this wrapper
    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
    //tol is used for both the outer and the initial inner tolerance.
    //The mem-initializers follow the member declaration order, which is the order C++ initializes in.
    MixedPrecisionConjugateGradientOperatorFunction(RealD tol, 
                                                    Integer maxinnerit, 
                                                    Integer maxouterit, 
                                                    GridBase* _sp_grid4, 
                                                    GridBase* _sp_grid5, 
                                                    FermionOperatorF &_FermOpF,
                                                    FermionOperatorD &_FermOpD,
                                                    SchurOperatorF   &_LinOpF,
                                                    SchurOperatorD   &_LinOpD): 
      Tolerance(tol), 
      InnerTolerance(tol), 
      MaxInnerIterations(maxinnerit), 
      MaxOuterIterations(maxouterit), 
      SinglePrecGrid4(_sp_grid4),
      SinglePrecGrid5(_sp_grid5),
      OuterLoopNormMult(100.),
      FermOpF(_FermOpF),
      FermOpD(_FermOpD),
      LinOpF(_LinOpF),
      LinOpD(_LinOpD),
      TotalInnerIterations(0),
      TotalOuterIterations(0),
      TotalFinalStepIterations(0)
    { 
    }
    //Solve for psi given src. LinOpU is only used for the consistency check against LinOpD.
    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
      //Unchecked downcast: the caller is trusted to pass a SchurOperatorD
      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
      //Refresh the single precision operator's gauge field from the double precision one
      precisionChange(FermOpF.Umu, FermOpD.Umu);
      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
      ////////////////////////////////////////////////////////////////////////////////////
      // Make a mixed precision conjugate gradient
      ////////////////////////////////////////////////////////////////////////////////////
      MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
      MPCG.InnerTolerance = InnerTolerance;
      std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
      MPCG(src,psi);
    }
  };
  //OperatorFunction wrapper that solves with a reliable-update conjugate gradient.
  //On every call the double precision gauge field of FermOpD is converted into FermOpF (including its
  //even/odd checkerboards) and a ConjugateGradientReliableUpdate is run with LinOpF / LinOpD.
  //The linear operator passed to operator() must wrap the same matrix as LinOpD (asserted).
  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
  class MixedPrecisionReliableUpdateConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
  public:
    typedef typename FermionOperatorD::FermionField FieldD;
    typedef typename FermionOperatorF::FermionField FieldF;
    using OperatorFunction<FieldD>::operator();
    RealD Tolerance;
    Integer MaxIterations;
    RealD Delta; //reliable update parameter
    GridBase* SinglePrecGrid4; //Grid for single-precision fields
    GridBase* SinglePrecGrid5; //Grid for single-precision fields
    FermionOperatorF &FermOpF;
    FermionOperatorD &FermOpD;
    SchurOperatorF &LinOpF;
    SchurOperatorD &LinOpD;
    //The mem-initializers follow the member declaration order, which is the order C++ initializes in.
    MixedPrecisionReliableUpdateConjugateGradientOperatorFunction(RealD tol, 
                                                                  RealD delta,
                                                                  Integer maxit, 
                                                                  GridBase* _sp_grid4, 
                                                                  GridBase* _sp_grid5, 
                                                                  FermionOperatorF &_FermOpF,
                                                                  FermionOperatorD &_FermOpD,
                                                                  SchurOperatorF   &_LinOpF,
                                                                  SchurOperatorD   &_LinOpD): 
      Tolerance(tol), 
      MaxIterations(maxit), 
      Delta(delta),
      SinglePrecGrid4(_sp_grid4),
      SinglePrecGrid5(_sp_grid5),
      FermOpF(_FermOpF),
      FermOpD(_FermOpD),
      LinOpF(_LinOpF),
      LinOpD(_LinOpD)
    { 
    }
    //Solve for psi given src. LinOpU is only used for the consistency check against LinOpD.
    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
      std::cout << GridLogMessage << " Mixed precision reliable CG update wrapper operator() "<<std::endl;
      //Unchecked downcast: the caller is trusted to pass a SchurOperatorD
      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
      //Refresh the single precision operator's gauge field from the double precision one
      precisionChange(FermOpF.Umu, FermOpD.Umu);
      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
      ////////////////////////////////////////////////////////////////////////////////////
      // Make a reliable update conjugate gradient
      ////////////////////////////////////////////////////////////////////////////////////
      ConjugateGradientReliableUpdate<FieldD,FieldF> MPCG(Tolerance,MaxIterations,Delta,SinglePrecGrid5,LinOpF,LinOpD);
      std::cout << GridLogMessage << "Calling mixed precision reliable update Conjugate Gradient" <<std::endl;
      MPCG(src,psi);
    }
  };
 NAMESPACE_END(Grid);
 int main(int argc, char **argv) {
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
  std::string param_file = "params.xml";
  bool file_load_check = false;
  for(int i=1;i<argc;i++){
    std::string sarg(argv[i]);
    if(sarg == "--param_file"){
      assert(i!=argc-1);
      param_file = argv[i+1];
    }else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
      file_load_check = true;
    }
  }
  //Read the user parameters
  EvolParameters user_params;
  if(fileExists(param_file)){
    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
    Grid::XmlReader rd(param_file);
    read(rd, "Params", user_params);
  }else if(!GlobalSharedMemory::WorldRank){
    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
    {
      Grid::XmlWriter wr(param_file + ".templ");
      write(wr, "Params", user_params);
    }
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  //Check the parameters
  if(user_params.GparityDirs.size() != Nd-1){
    std::cerr << "Error in input parameters: expect GparityDirs to have size = " << Nd-1 << std::endl;
    exit(1);
  }
  for(int i=0;i<Nd-1;i++)
    if(user_params.GparityDirs[i] != 0 && user_params.GparityDirs[i] != 1){
      std::cerr << "Error in input parameters: expect GparityDirs values to be 0 (periodic) or 1 (G-parity)" << std::endl;
      exit(1);
    }
  typedef GparityMobiusEOFAFermionD EOFAactionD;
  typedef GparityMobiusFermionD FermionActionD;
  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
  typedef typename FermionActionD::FermionField FermionFieldD;
  typedef GparityMobiusEOFAFermionF EOFAactionF;
  typedef GparityMobiusFermionF FermionActionF;
  typedef typename FermionActionF::Impl_t FermionImplPolicyF;
  typedef typename FermionActionF::FermionField FermionFieldF;
  typedef GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD,FermionImplPolicyF> MixedPrecRHMC;
  typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicyD> DoublePrecRHMC;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
  typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
  typedef HMCWrapper::ImplPolicy GaugeImplPolicy;
  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps = user_params.Steps;
  MD.trajL   = user_params.TrajectoryLength;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = user_params.StartTrajectory;
  HMCparams.Trajectories     = user_params.Trajectories;
  HMCparams.NoMetropolisUntil= 0;
  HMCparams.StartingType     = user_params.StartingType;
  HMCparams.MetropolisTest = user_params.MetropolisTest;
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_lat";
  CPparams.rng_prefix    = "ckpoint_rng";
  CPparams.saveInterval  = user_params.SaveInterval;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  //Note that checkpointing saves the RNG state so that this initialization is required only for the very first configuration
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  typedef PlaquetteMod<GaugeImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  //aiming for ainv=2.068             me          Bob
  //Estimated  a(ml+mres) [48ID] = 0.001048    0.00104 
  //           a(mh+mres) [48ID] = 0.028847    0.02805
  //Estimate Ls=12, b+c=2  mres~0.0003
  const int Ls      = 12;
  Real beta         = 1.946;
  Real light_mass   = 0.00074;   //0.00104 - mres_approx;
  Real strange_mass = 0.02775;    //0.02805 - mres_approx
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  RealD mobius_scale = 2.; //b+c
  RealD mob_bmc = 1.0;
  RealD mob_b = (mobius_scale + mob_bmc)/2.;
  RealD mob_c = (mobius_scale - mob_bmc)/2.;
  //Setup the Grids
  auto UGridD   = TheHMC.Resources.GetCartesian();
  auto UrbGridD = TheHMC.Resources.GetRBCartesian();
  auto FGridD     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);
  auto FrbGridD   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);
  GridCartesian* UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian* UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
  ConjugateIwasakiGaugeActionD GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeFieldD Ud(UGridD);
  LatticeGaugeFieldF Uf(UGridF);
  //Setup the BCs
  FermionActionD::ImplParams Params;
  for(int i=0;i<Nd-1;i++) Params.twists[i] = user_params.GparityDirs[i]; //G-parity directions
  Params.twists[Nd-1] = 1; //APBC in time direction
  std::vector<int> dirs4(Nd);
  for(int i=0;i<Nd-1;i++) dirs4[i] = user_params.GparityDirs[i];
  dirs4[Nd-1] = 0; //periodic gauge BC in time
  GaugeImplPolicy::setDirections(dirs4); //gauge BC
  //Run optional gauge field checksum checker and exit
  if(file_load_check){
    TheHMC.initializeGaugeFieldAndRNGs(Ud);
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1); //light quark + strange quark
  ActionLevel<HMCWrapper::Field> Level2(4); //DSDR
  ActionLevel<HMCWrapper::Field> Level3(2); //gauge
  /////////////////////////////////////////////////////////////
  // Light EOFA action
  // have to be careful with the parameters, cf. Test_dwf_gpforce_eofa.cc
  /////////////////////////////////////////////////////////////
  typedef SchurDiagMooeeOperator<EOFAactionD,FermionFieldD> EOFAschuropD;
  typedef SchurDiagMooeeOperator<EOFAactionF,FermionFieldF> EOFAschuropF;
  typedef ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction<FermionImplPolicyD, FermionImplPolicyF> EOFAmixPrecPFaction;
  typedef MixedPrecisionConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_mxCG;
  typedef MixedPrecisionReliableUpdateConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_relupCG;
  std::vector<RealD> eofa_light_masses = { light_mass ,  0.004,   0.016,   0.064,   0.256    };
  std::vector<RealD> eofa_pv_masses =    { 0.004       , 0.016,   0.064,   0.256,   1.0      };
  int n_light_hsb = 5;
  assert(user_params.eofa_l.size() == n_light_hsb);
  EOFAmixPrecPFaction* EOFA_pfactions[n_light_hsb];
  for(int i=0;i<n_light_hsb;i++){
    RealD iml = eofa_light_masses[i];
    RealD ipv = eofa_pv_masses[i];
    EOFAactionD* LopD = new EOFAactionD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, iml, iml, ipv, 0.0, -1, M5, mob_b, mob_c, Params);
    EOFAactionF* LopF = new EOFAactionF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, iml, iml, ipv, 0.0, -1, M5, mob_b, mob_c, Params);
    EOFAactionD* RopD = new EOFAactionD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, ipv, iml, ipv, -1.0, 1, M5, mob_b, mob_c, Params);
    EOFAactionF* RopF = new EOFAactionF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, ipv, iml, ipv, -1.0, 1, M5, mob_b, mob_c, Params);
    EOFAschuropD* linopL_D = new EOFAschuropD(*LopD);
    EOFAschuropD* linopR_D = new EOFAschuropD(*RopD);
    EOFAschuropF* linopL_F = new EOFAschuropF(*LopF);
    EOFAschuropF* linopR_F = new EOFAschuropF(*RopF);
 #if 1
    //Note reusing user_params.eofa_l.action(|md)_mixcg_inner_tolerance  as Delta for now
    EOFA_relupCG* ActionMCG_L = new EOFA_relupCG(user_params.eofa_l[i].action_tolerance, user_params.eofa_l[i].action_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
    EOFA_relupCG* ActionMCG_R = new EOFA_relupCG(user_params.eofa_l[i].action_tolerance, user_params.eofa_l[i].action_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
    EOFA_relupCG* DerivMCG_L = new EOFA_relupCG(user_params.eofa_l[i].md_tolerance, user_params.eofa_l[i].md_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
    EOFA_relupCG* DerivMCG_R = new EOFA_relupCG(user_params.eofa_l[i].md_tolerance, user_params.eofa_l[i].md_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
 #else
    EOFA_mxCG* ActionMCG_L = new EOFA_mxCG(user_params.eofa_l[i].action_tolerance, 10000, 1000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
    ActionMCG_L->InnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance;
    EOFA_mxCG* ActionMCG_R = new EOFA_mxCG(user_params.eofa_l[i].action_tolerance, 10000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
    ActionMCG_R->InnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance;
    EOFA_mxCG* DerivMCG_L = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 10000, 1000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
    DerivMCG_L->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance;
    EOFA_mxCG* DerivMCG_R = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 10000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
    DerivMCG_R->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance;
    std::cout << GridLogMessage << "Set EOFA action solver action tolerance outer=" << ActionMCG_L->Tolerance << " inner=" << ActionMCG_L->InnerTolerance << std::endl;
    std::cout << GridLogMessage << "Set EOFA MD solver tolerance outer=" << DerivMCG_L->Tolerance << " inner=" << DerivMCG_L->InnerTolerance << std::endl;
 #endif
    EOFAmixPrecPFaction* EOFA = new EOFAmixPrecPFaction(*LopF, *RopF,
 							*LopD, *RopD, 
 							*ActionMCG_L, *ActionMCG_R, 
 							*ActionMCG_L, *ActionMCG_R, 
 							*DerivMCG_L, *DerivMCG_R, 
 							user_params.eofa_l[i].rat_params, true);
    EOFA_pfactions[i] = EOFA;
    Level1.push_back(EOFA);
  }
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD,strange_mass,M5,mob_b,mob_c,Params);
  FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD, pv_mass,M5,mob_b,mob_c,Params);
  FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF,strange_mass,M5,mob_b,mob_c,Params);
  FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF, pv_mass,M5,mob_b,mob_c,Params);
  RationalActionParams rat_act_params_s;
  rat_act_params_s.inv_pow  = 4; // (M^dag M)^{1/4}
  rat_act_params_s.precision= 60;
  rat_act_params_s.MaxIter  = 10000;
  user_params.rat_quo_s.Export(rat_act_params_s);
  std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_s.BoundsCheckFreq << " trajectories (avg)" << std::endl;
  //MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq); 
  DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s); 
  Level1.push_back(&Quotient_s);  
  ///////////////////////////////////
  // DSDR action
  ///////////////////////////////////
  RealD dsdr_mass=-1.8;   
  //Use same DSDR twists as https://arxiv.org/pdf/1208.4412.pdf
  RealD dsdr_epsilon_f = 0.02; //numerator (in determinant)
  RealD dsdr_epsilon_b = 0.5; 
  GparityWilsonTMFermionD Numerator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_f, Params);
  GparityWilsonTMFermionF Numerator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_f, Params);
  GparityWilsonTMFermionD Denominator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_b, Params);
  GparityWilsonTMFermionF Denominator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_b, Params);
  RationalActionParams rat_act_params_DSDR;
  rat_act_params_DSDR.inv_pow  = 2; // (M^dag M)^{1/2}
  rat_act_params_DSDR.precision= 60;
  rat_act_params_DSDR.MaxIter  = 10000;
  user_params.rat_quo_DSDR.Export(rat_act_params_DSDR);
  std::cout << GridLogMessage << "DSDR quark bounds check every " << rat_act_params_DSDR.BoundsCheckFreq << " trajectories (avg)" << std::endl;
  DoublePrecRHMC Quotient_DSDR(Denominator_DSDR_D, Numerator_DSDR_D, rat_act_params_DSDR);
  Level2.push_back(&Quotient_DSDR);
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level3.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  TheHMC.TheAction.push_back(Level3);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  //Action tuning
  bool 
    tune_rhmc_s=false, eigenrange_s=false, 
    tune_rhmc_DSDR=false, eigenrange_DSDR=false, 
    check_eofa=false, 
    upper_bound_eofa=false, lower_bound_eofa(false);
  std::string lanc_params_s;
  std::string lanc_params_DSDR;
  int tune_rhmc_s_action_or_md;
  int tune_rhmc_DSDR_action_or_md;
  int eofa_which_hsb;
  for(int i=1;i<argc;i++){
    std::string sarg(argv[i]);
    if(sarg == "--tune_rhmc_s"){
      assert(i < argc-1);
      tune_rhmc_s=true;
      tune_rhmc_s_action_or_md = std::stoi(argv[i+1]);
    }
    else if(sarg == "--eigenrange_s"){
      assert(i < argc-1);
      eigenrange_s=true;
      lanc_params_s = argv[i+1];
    }
    else if(sarg == "--tune_rhmc_DSDR"){
      assert(i < argc-1);
      tune_rhmc_DSDR=true;
      tune_rhmc_DSDR_action_or_md = std::stoi(argv[i+1]);
    }
    else if(sarg == "--eigenrange_DSDR"){
      assert(i < argc-1);
      eigenrange_DSDR=true;
      lanc_params_DSDR = argv[i+1];
    }
    else if(sarg == "--check_eofa"){
      assert(i < argc-1);
      check_eofa = true;
      eofa_which_hsb = std::stoi(argv[i+1]); //-1 indicates all hasenbusch
      assert(eofa_which_hsb == -1 || (eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb) );
    }
    else if(sarg == "--upper_bound_eofa"){
      assert(i < argc-1);
      upper_bound_eofa = true;
      eofa_which_hsb = std::stoi(argv[i+1]);
      assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
    }
    else if(sarg == "--lower_bound_eofa"){
      assert(i < argc-1);
      lower_bound_eofa = true;      
      eofa_which_hsb = std::stoi(argv[i+1]);
      assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
    }
  }
  if(tune_rhmc_s || eigenrange_s || tune_rhmc_DSDR || eigenrange_DSDR ||check_eofa || upper_bound_eofa || lower_bound_eofa) {
    std::cout << GridLogMessage << "Running checks" << std::endl;
    TheHMC.initializeGaugeFieldAndRNGs(Ud);
    //std::cout << GridLogMessage << "EOFA action solver action tolerance outer=" << ActionMCG_L.Tolerance << " inner=" << ActionMCG_L.InnerTolerance << std::endl;
    //std::cout << GridLogMessage << "EOFA MD solver tolerance outer=" << DerivMCG_L.Tolerance << " inner=" << DerivMCG_L.InnerTolerance << std::endl;
    if(check_eofa){
      if(eofa_which_hsb >= 0){
 	std::cout << GridLogMessage << "Starting checking EOFA Hasenbusch " << eofa_which_hsb << std::endl;
 	checkEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
 	std::cout << GridLogMessage << "Finished checking EOFA Hasenbusch " << eofa_which_hsb << std::endl;
      }else{
 	for(int i=0;i<n_light_hsb;i++){
 	  std::cout << GridLogMessage << "Starting checking EOFA Hasenbusch " << i << std::endl;
 	  checkEOFA(*EOFA_pfactions[i], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
 	  std::cout << GridLogMessage << "Finished checking EOFA Hasenbusch " << i << std::endl;
 	}
      }
    }	  
    if(upper_bound_eofa) upperBoundEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
    if(lower_bound_eofa) lowerBoundEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
    if(eigenrange_s) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG());
    if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_s)>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange",  tune_rhmc_s_action_or_md);
    if(eigenrange_DSDR) computeEigenvalues<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField>(lanc_params_DSDR, UGridD, UrbGridD, Ud, Numerator_DSDR_D, TheHMC.Resources.GetParallelRNG());
    if(tune_rhmc_DSDR) checkRHMC<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField, decltype(Quotient_DSDR)>(UGridD, UrbGridD, Ud, Numerator_DSDR_D, Denominator_DSDR_D, Quotient_DSDR, TheHMC.Resources.GetParallelRNG(), 2, "DSDR", tune_rhmc_DSDR_action_or_md);
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  //Run the HMC
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.Run();
  std::cout << GridLogMessage << " Done" << std::endl;
  Grid_finalize();
  return 0;
 } // main
--- a/benchmarks/Benchmark_mooee.cc
+++ b/benchmarks/Benchmark_mooee.cc
@ -81,8 +81,8 @@ int main (int argc, char ** argv)
    Vector<Coeff_t> diag = Dw.bs;
    Vector<Coeff_t> upper= Dw.cs;
    Vector<Coeff_t> lower= Dw.cs;
-    upper[Ls-1]=-Dw.mass_minus*upper[Ls-1];
+    upper[Ls-1]=-Dw.mass*upper[Ls-1];
-    lower[0]   =-Dw.mass_plus*lower[0];
+    lower[0]   =-Dw.mass*lower[0];
    LatticeFermion r_eo(FGrid);
    LatticeFermion src_e (FrbGrid);
--- a/benchmarks/Benchmark_wilson_sweep.cc
+++ b/benchmarks/Benchmark_wilson_sweep.cc
@ -44,13 +44,6 @@ void bench_wilson (
 		   double const     volume,
 		   int const           dag );
 void bench_wilson_eo (
       LatticeFermion &    src,
       LatticeFermion & result,
       WilsonFermionR &     Dw,
       double const     volume,
       int const           dag );
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
@ -117,8 +110,8 @@ int main (int argc, char ** argv)
 	  bench_wilson(src,result,Dw,volume,DaggerYes);
 	  std::cout << "\t";
    // EO
-    bench_wilson_eo(src_o,result_e,Dw,volume,DaggerNo);
+	  bench_wilson(src,result,Dw,volume,DaggerNo);
-    bench_wilson_eo(src_o,result_e,Dw,volume,DaggerYes);
+	  bench_wilson(src,result,Dw,volume,DaggerYes);
 	  std::cout << std::endl;
 	}
    }
--- a/configure.ac
+++ b/configure.ac
@ -159,7 +159,7 @@ case ${ac_ZMOBIUS} in
 esac
 ############### Nc
 AC_ARG_ENABLE([Nc],
-    [AC_HELP_STRING([--enable-Nc=2|3|4|5], [enable number of colours])],
+    [AC_HELP_STRING([--enable-Nc=2|3|4], [enable number of colours])],
    [ac_Nc=${enable_Nc}], [ac_Nc=3])
 case ${ac_Nc} in
--- a/scripts/hmc.sh
+++ b/scripts/hmc.sh
@ -1,19 +1,27 @@
 #!/bin/bash
 LOG=$1
-SWEEPS=`grep dH $LOG | wc -l`
+SWEEPS=`grep dH.= $LOG | wc -l`
-SWEEPS=`expr $SWEEPS - 80`
+SWEEPS=`expr $SWEEPS - 100`
 echo
 echo $SWEEPS thermalised sweeps
 echo
-plaq=`grep Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$10} END { print S/NR} ' `
+plaq=`grep Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$12} END { print S/NR} ' `
-plaqe=`grep Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$10 ; SS=SS+$10*$10 } END { print sqrt( (SS/NR - S*S/NR/NR)/NR) } ' `
+plaqe=`grep Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$12 ; SS=SS+$12*$12 } END { print sqrt( (SS/NR - S*S/NR/NR)/NR) } ' `
 echo "Plaquette: $plaq (${plaqe})"
 echo
-dHv=`grep dH $LOG | tail -n $SWEEPS | awk '{ S=S+$10 ; SS=SS+$10*$10 } END { print sqrt(SS/NR) } ' `
+grep  Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$12/20; if(NR%20==0){ print NR/20, " ", S; S=0;} } '  > plaq.binned
-edH=`grep dH $LOG | tail -n $SWEEPS | awk '{ S=S+exp(-$10)} END { print S/NR} '`
+
-echo "<e-dH>: $edH"
+plaq=`cat plaq.binned  | awk '{ S=S+$2} END { print S/NR} ' `
 plaqe=`cat plaq.binned | awk '{ S=S+$2 ; SS=SS+$2*$2 } END { print sqrt( (SS/NR - S*S/NR/NR)/NR) } ' `
 echo "Binned Plaquette: $plaq (${plaqe})"
 echo
 dHv=`grep dH.= $LOG | tail -n $SWEEPS | awk '{ S=S+$16 ; SS=SS+$16*$16 } END { print sqrt(SS/NR) } ' `
 edH=`grep dH.= $LOG | tail -n $SWEEPS | awk '{ S=S+exp(-$16)} END { print S/NR} '`
 dedH=`grep dH.= $LOG | tail -n $SWEEPS | awk '{ S=S+exp(-$16); SS=SS+exp(-$16)*exp(-$16)} END { print sqrt( (SS/NR - S*S/NR/NR)/NR) } '`
 echo "<e-dH>: $edH (${dedH})"
 echo "<rms dH>: $dHv"
 TRAJ=`grep Acc $LOG | wc -l`
@ -22,12 +30,13 @@ PACC=`expr  100 \* ${ACC} / ${TRAJ} `
 echo
 echo "Acceptance $PACC %  $ACC / $TRAJ "
-grep Plaq $LOG | awk '{ print $10 }' | uniq > plaq.dat
+grep Plaq $LOG | awk '{ print $12 }' | uniq > plaq.dat
-grep dH $LOG | awk '{ print $10 }' > dH.dat
+grep dH.= $LOG | awk '{ print $16 }' > dH.dat
-echo set yrange [-0.2:1.0] > plot.gnu
+echo set yrange [0.58:0.60] > plot.gnu
 echo set terminal 'pdf' >> plot.gnu
 echo "f(x) =0.588" >> plot.gnu
 echo "set output 'plaq.${LOG}.pdf'" >> plot.gnu
-echo "plot 'plaq.dat' w l, 'dH.dat' w l " >> plot.gnu
+echo "plot 'plaq.dat' w l, f(x) " >> plot.gnu
 echo
 gnuplot plot.gnu >& gnu.errs
 open plaq.${LOG}.pdf
--- a/systems/mac-arm/config-command-mpi
+++ b/systems/mac-arm/config-command-mpi
@ -1 +0,0 @@
 CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi
--- a/tests/IO/Test_field_array_io.cc
+++ b/tests/IO/Test_field_array_io.cc
@ -0,0 +1,184 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/IO/Test_field_array_io.cc
    Copyright (C) 2015
 Author: Christopher Kelly <ckelly@bnl.gov>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
 //This test demonstrates and checks a single-file write of an arbitrary array of fields
 // Write (or overwrite in place) the three-line text header at the start of `file`:
 //   line 1: number of fields, decimal, right-aligned in 10 characters
 //   line 2: checksum, hexadecimal, right-aligned in 10 characters
 //   line 3: the format string
 // Both numeric fields are 32-bit, so they never exceed the 10-character width; for a
 // given format string the header therefore always has the same length. That is what
 // lets the caller rewrite it with the final checksum without disturbing the data
 // that follows it.
 // Returns the stream position just past the header, i.e. the offset of the first data byte.
 // The file must already exist: it is opened with out|in so that it is NOT truncated,
 // and that open mode does not create a missing file.
 uint64_t writeHeader(const uint32_t size, const uint32_t checksum, const std::string &format, const std::string &file){
  std::ofstream fout(file,std::ios::out|std::ios::in);
  //Without this check a failed open silently writes nothing and tellp() returns -1 (a huge offset once cast)
  assert(fout.is_open() && "writeHeader: file must already exist and be writable");
  fout.seekp(0,std::ios::beg);
  fout << std::setw(10) << size << std::endl;
  fout << std::hex << std::setw(10) << checksum << std::endl;
  fout << format << std::endl;
  assert(fout.good() && "writeHeader: failed to write header");
  return fout.tellp();
 }
 // Parse the three-line text header at the start of `file`:
 //   line 1 -> size     (decimal)
 //   line 2 -> checksum (hexadecimal)
 //   line 3 -> format   (passed through removeWhitespace)
 // Returns the stream position just past the header, i.e. the offset of the first data byte.
 // Asserts if the file cannot be opened or the header cannot be parsed; otherwise the
 // outputs could be left unset and tellg() would return -1.
 uint64_t readHeader(uint32_t &size, uint32_t &checksum, std::string &format, const std::string &file){
  std::ifstream fin(file);
  assert(fin.is_open() && "readHeader: cannot open file");
  std::string line;
  getline(fin,line);
  {
    std::stringstream ss; ss <<line ; ss >> size;
    assert(!ss.fail() && "readHeader: cannot parse field count");
  }
  getline(fin,line);
  {
    std::stringstream ss; ss <<line ; ss >> std::hex >> checksum;
    assert(!ss.fail() && "readHeader: cannot parse checksum");
  }
  getline(fin,format);
  removeWhitespace(format);
  //tellg() reports -1 once failbit is set, which would become a bogus data offset
  assert(!fin.fail() && "readHeader: truncated or malformed header");
  return fin.tellg();
 }
 // Write every field in `data` back-to-back into the single file `file`, preceded by
 // the text header produced by writeHeader (field count, combined checksum, format string).
 // All fields are assumed to live on the same Grid as data[0]; `data` must be non-empty.
 template<typename FieldType>
 void writeFieldArray(const std::string &file, const std::vector<FieldType> &data){
  typedef typename FieldType::vector_object vobj;
  typedef typename FieldType::scalar_object sobj;
  assert(data.size() > 0); //data[0] is dereferenced below; mirrors the check in readFieldArray
  GridBase* grid = data[0].Grid(); //assume all fields have the same Grid
  BinarySimpleMunger<sobj, sobj> munge; //straight copy

  //We need a 2-pass header write, first to establish the size, the second pass writes the checksum
  std::string format = getFormatString<typename FieldType::vector_object>();

  uint64_t offset; //byte offset of the first field = header length; set by the boss node, then broadcast
  if ( grid->IsBoss() ) {
    NerscIO::truncate(file);
    offset = writeHeader(data.size(), 0, format, file); //placeholder checksum of 0 on the first pass
  }
  grid->Broadcast(0,(void *)&offset,sizeof(offset)); //use as a barrier

  std::cout << "Data offset write " << offset << std::endl;
  std::cout << "Data size write " << data.size() << std::endl;
  uint64_t field_size = uint64_t(grid->gSites()) * sizeof(sobj);
  std::cout << "Field size = " << field_size << " B" << std::endl;

  uint32_t checksum = 0;
  for(int i=0;i<data.size();i++){
    std::cout << "Data field write " << i << " offset " << offset << std::endl;
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    //const_cast: writeLatticeObject is called with a non-const field reference here
    BinaryIO::writeLatticeObject<vobj,sobj>(const_cast<FieldType &>(data[i]),file,munge,offset,format,
                                            nersc_csum,scidac_csuma,scidac_csumb);
    offset += field_size;
    //Fold this field's checksum into the running value; the result depends on field order
    checksum ^= nersc_csum + 0x9e3779b9 + (checksum<<6) + (checksum>>2);
  }
  std::cout << "Write checksum " << checksum << std::endl;

  //Second pass: rewrite the header in place, now carrying the final checksum
  if ( grid->IsBoss() ) {
    writeHeader(data.size(), checksum, format, file);
  }
 }
 // Load the fields stored in `file` into `data`, which must already hold exactly as many
 // fields (all on the Grid of data[0]) as the file header declares. The combined checksum
 // is recomputed while reading and compared against the one stored in the header.
 template<typename FieldType>
 void readFieldArray(std::vector<FieldType> &data, const std::string &file){
  typedef typename FieldType::scalar_object sobj;
  typedef typename FieldType::vector_object vobj;

  assert(data.size() > 0);
  GridBase* grid = data[0].Grid(); //assume all fields have the same Grid
  BinarySimpleUnmunger<sobj, sobj> munge; //straight copy

  //Header supplies the field count, the expected checksum, the format and the start of the data
  std::string format;
  uint32_t hdr_checksum, hdr_size;
  uint64_t offset = readHeader(hdr_size, hdr_checksum, format, file);

  std::cout << "Data offset read " << offset << std::endl;
  std::cout << "Data size read " << hdr_size << std::endl;
  assert(data.size() == hdr_size);

  const uint64_t bytes_per_field = uint64_t(grid->gSites()) * sizeof(sobj);

  uint32_t running = 0;
  int idx = 0;
  for(FieldType &field : data){
    std::cout << "Data field read " << idx << " offset " << offset << std::endl;
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    BinaryIO::readLatticeObject<vobj,sobj>(field,file,munge,offset,format,
                                           nersc_csum,scidac_csuma,scidac_csumb);
    offset += bytes_per_field;
    //Same order-dependent combination of per-field checksums as used when writing
    const uint32_t mixed = nersc_csum + 0x9e3779b9 + (running<<6) + (running>>2);
    running ^= mixed;
    ++idx;
  }

  std::cout << "Header checksum " << hdr_checksum << std::endl;
  std::cout << "Read checksum " << running << std::endl;
  assert( hdr_checksum == running );
 }
 // Round-trip test: fill an array of 5D fermion fields with Gaussian noise, write them to a
 // single file with writeFieldArray, read them back with readFieldArray and verify that
 // every field is recovered exactly.
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);

  Coordinate latt   = GridDefaultLatt();
  Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  Coordinate mpi_layout  = GridDefaultMpi();

  const int Ls=8;

  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt, simd_layout, mpi_layout);
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  GridParallelRNG RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);

  typedef DomainWallFermionD::FermionField FermionField;

  //Reference data: nfield random fields on the 5D grid
  int nfield = 20;
  std::vector<FermionField> data(nfield, FGrid);

  for(int i=0;i<data.size();i++)
    gaussian(RNG5, data[i]);

  std::string file = "test_field_array_io.0";
  writeFieldArray(file, data);

  std::vector<FermionField> data_r(nfield, FGrid);
  readFieldArray(data_r, file);

  for(int i=0;i<nfield;i++){
    FermionField diff = data_r[i] - data[i];
    RealD norm_diff = norm2(diff);
    std::cout << "Norm2 of difference between stored and loaded data index " << i << " : " << norm_diff << std::endl;
    //The munger is a straight copy at the field's own precision, so the round trip is
    //expected to be exact; without this check a data mismatch would only show up in the log
    assert(norm_diff == 0.);
  }

  std::cout << "Done" << std::endl;

  Grid_finalize();
 }
--- a/tests/IO/Test_nersc_io.cc
+++ b/tests/IO/Test_nersc_io.cc
@ -147,7 +147,7 @@ int main (int argc, char ** argv)
  Complex p  = TensorRemove(Tp);
  std::cout<<GridLogMessage << "calculated plaquettes " <<p*PlaqScale<<std::endl;
-  Complex LinkTraceScale(1.0/vol/4.0/(Real)Nc);
+  Complex LinkTraceScale(1.0/vol/4.0/3.0);
  TComplex Tl = sum(LinkTrace);
  Complex l  = TensorRemove(Tl);
  std::cout<<GridLogMessage << "calculated link trace " <<l*LinkTraceScale<<std::endl;
@ -157,10 +157,8 @@ int main (int argc, char ** argv)
  Complex ll= TensorRemove(TcP);
  std::cout<<GridLogMessage << "coarsened plaquettes sum to " <<ll*PlaqScale<<std::endl;
-  const string stNc   = to_string( Nc   ) ;
+  std::string clone2x3("./ckpoint_clone2x3.4000");
-  const string stNcM1 = to_string( Nc-1 ) ;
+  std::string clone3x3("./ckpoint_clone3x3.4000");
  std::string clone2x3("./ckpoint_clone"+stNcM1+"x"+stNc+".4000");
  std::string clone3x3("./ckpoint_clone"+stNc+"x"+stNc+".4000");
  NerscIO::writeConfiguration(Umu,clone3x3,0,precision32);
  NerscIO::writeConfiguration(Umu,clone2x3,1,precision32);
--- a/tests/core/Test_compact_wilson_clover_speedup.cc
+++ b/tests/core/Test_compact_wilson_clover_speedup.cc
@ -117,8 +117,8 @@ void runBenchmark(int* argc, char*** argv) {
  // type definitions
  typedef WilsonImpl<vCoeff_t, FundamentalRepresentation, CoeffReal> WImpl;
-  typedef WilsonCloverFermion<WImpl, CloverHelpers<WImpl>> WilsonCloverOperator;
+  typedef WilsonCloverFermion<WImpl> WilsonCloverOperator;
-  typedef CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>> CompactWilsonCloverOperator;
+  typedef CompactWilsonCloverFermion<WImpl> CompactWilsonCloverOperator;
  typedef typename WilsonCloverOperator::FermionField Fermion;
  typedef typename WilsonCloverOperator::GaugeField Gauge;
--- a/tests/core/Test_fft.cc
+++ b/tests/core/Test_fft.cc
@ -299,12 +299,12 @@ int main (int argc, char ** argv)
    SpinColourVectorD ferm; gaussian(sRNG,ferm);
    pokeSite(ferm,src,point);
-    const int Ls=32;
+    const int Ls=64;
    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,&GRID);
    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,&GRID);
-    RealD mass=0.01;
+    RealD mass=1.0;
-    RealD M5  =0.8;
+    RealD M5  =0.99;
    DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5);
    // Momentum space prop
@ -353,6 +353,12 @@ int main (int argc, char ** argv)
    std::cout << " Taking difference" <<std::endl;
    std::cout << "Ddwf result4 "<<norm2(result4)<<std::endl;
    std::cout << "Ddwf ref     "<<norm2(ref)<<std::endl;
    auto twopoint = localInnerProduct(result4,result4);
    std::vector<TComplex> pion_prop;
    sliceSum(twopoint,pion_prop,Nd-1);
    for(int t=0;t<pion_prop.size();t++){
      std::cout << "Pion_prop["<<t<<"]="<<pion_prop[t]<<std::endl;
    }
    diff = ref - result4;
    std::cout << "result - ref     "<<norm2(diff)<<std::endl;
@ -383,7 +389,7 @@ int main (int argc, char ** argv)
    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,&GRID);
    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,&GRID);
-    RealD mass=0.01;
+    RealD mass=1.0;
    RealD M5  =0.8;
    OverlapWilsonCayleyTanhFermionD Dov(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5,1.0);
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Christopher Kelly	4fefae1745	Test_evec_compression changes: Added ability to choose one of a variety of preselected basis sizes from the command line Fine lanczos now checks enough evecs are generated and resizes the output to Nstop and not the actual amount that converged (which can be larger)	2022-04-06 06:33:26 -07:00
Christopher Kelly	758e2edcad	Test_evec_compression enhancements: In testing the compressed evecs, a Chebyshev smoothing is now applied first to remove high mode noise Added a second test where the uncompressed evecs are compared directly to the original evecs Generalized the test to allow for either DWF or Mobius with or without GPBC, switched by command line options	2022-03-29 06:16:15 -07:00
Christopher Kelly	1538b15f3b	48ID evo main program now uses reliable update CG	2022-03-14 06:45:28 -07:00
Christopher Kelly	deac621c2c	Merge branch 'develop' into gparity_HMC_merge_develop	2022-02-22 14:25:27 -05:00
Christopher Kelly	ba974960e6	Added an HMC checkpoint start option that loads the fields and then reseeds the RNGs, suitable for creating new evolution streams Added option to choose RNG seeds in 40ID main binary	2022-02-14 08:09:01 -08:00
Christopher Kelly	6755dc57f8	Added methods to compute spatial plaquette and timeslice spatial plaquette to WilsonLoops	2022-01-24 13:57:39 -05:00
Christopher Kelly	aa620ca52c	Fixed compilation error in observables resulting from changes in Wilson flow code Modified light quark mass on 40ID HMC binary	2022-01-24 09:56:24 -08:00
Christopher Kelly	2c46c942cc	Reworked WilsonFlow: Both smear and smear_adaptive now maintain the Wilson flow time as a function variable rather than a class member variable. smear_adaptive does likewise for the current time step. This allows the evolve and smear functions to be const Fixed smear_adaptive setting initial time to epsilon rather than 0 Added ability to assign generic measurement actions at user specified frequencies during the smearing and reimplemented current energy density / topq output in this framework Reimplemented the "flowMeasure" methods using the above framework Fixed const correctness for WilsonLoops::TopologicalCharge	2022-01-24 12:06:05 -05:00
Christopher Kelly	adeba8059a	Added calculation of timeslice topological charge	2022-01-20 14:29:07 -05:00
Christopher Kelly	c4ac528126	Added cloverleaf energy density calculation to WilsonFlow	2021-12-27 10:33:33 -05:00
Christopher Kelly	551b93ba8e	To HMC/Mobius2p1fIDSDRGparityEOFA_40ID, added input param to change trajectory length and increased integrator steps for DSDR	2021-12-10 09:06:06 -08:00
Christopher Kelly	ddf7540510	Added calculation of 5Li topological charge WilsonFlow code now calls topological charge calculation with correct gauge implementation rather than assuming periodic Added version of WilsonFlow::flowMeasureEnergyDensityPlaquette that outputs the smeared gauge field at the end	2021-12-06 17:56:42 -05:00
Christopher Kelly	de68d12c3d	1x1 topological charge calculation now respects gauge boundary conditions	2021-12-06 13:42:09 -05:00
Christopher Kelly	6d26a2a1ad	Merge branch 'feature/gparity_HMC' of https://github.com/paboyle/Grid into gparity_HMC	2021-11-16 07:32:47 -08:00
Christopher Kelly	a1211cdcce	Gparity 48ID tuning and exposure of trajectory length as input variable	2021-11-16 07:31:41 -08:00
Christopher Kelly	e78acf77ff	To LocalCoherenceLanczos, added a method to reconstruct the fine eigenvector and added some comments to aid the user Added a test code for local coherence Lanczos with G-parity BCs Added a test code for block eigenvector compression	2021-11-08 07:26:35 -08:00
Christopher Kelly	f7e9621492	40ID ensemble tuning: now use 5 Hasenbusch steps, parameters now separately tunable in param file	2021-10-18 08:17:36 -07:00
Christopher Kelly	f14be15f8b	Updates to Gparity HMC main programs	2021-10-15 08:10:17 -07:00
Christopher Kelly	6a3aaa52ef	Test_dwf_lanczos can now run either G-parity Mobius or non-Gparity DWF according to cmdline switch Fixed copyStream initialization	2021-10-12 12:59:54 -07:00
Christopher Kelly	9ba47b4696	Merge branch 'develop' into gparity_HMC	2021-09-29 20:07:55 -07:00
Christopher Kelly	e85af80c39	Added return value checks on all cuda api calls Test_dwf_lanczos can now run with either regular DWF or Mobius+Gparity based on cmdline arg	2021-09-29 19:57:43 -07:00
Christopher Kelly	0b91e90dd4	Merge branch 'develop' into feature/gparity_HMC	2021-09-27 07:16:26 -07:00
Christopher Kelly	d184b8c921	Merge branch 'develop' into gparity_HMC	2021-09-08 06:14:08 -07:00
Christopher Kelly	c92e390b08	Added initial main binary code for 40ID and 48ID Gparity HMC	2021-09-08 09:00:13 -04:00
Christopher Kelly	5b36a8af54	Added a CshiftLink function to the GaugeImplementations and boundary condition classes that offers a boundary aware C-shift Modified gauge fixing code to use CshiftLink internally such that the steepest descent algorithm is universal Modified gauge transformation code to use CshiftLink for a universal definition Improved comprehensibility of Test_fft_gfix and generalized to use either periodic or charge conjugation BCs based on cmdline option Added cmdline options to Test_fft_gfix to tune alpha and optionally disable the Fourier acceleration tests	2021-07-12 17:13:40 -04:00
Christopher Kelly	75a1f85162	Added method to compute and return the Wilson flow energy density over some number of steps	2021-06-30 17:24:00 -04:00
Christopher Kelly	ac4f2d9798	Fixed EOFA approx test square rooting the result inappropriately thus failing when it shouldn't To MDWF+ID GPBC evol main program, added routine to compute the lower bound of the EOFA using the power method with a command line toggle	2021-06-09 09:08:37 -04:00
Christopher Kelly	c3b99de33f	In EOFA pseudofermion action, implemented M^{-1} (this costs the same as M for EOFA!) Added tests/solver/Test_eofa_inv.cc to test the above In MDWF+ID GPBC binary, tests of RHMC approx for the action / MD approxs can be performed separately using a cmdline toggle	2021-06-03 11:11:14 -04:00
Christopher Kelly	e1a02bb80a	Added main program to reproduce 32ID ensemble with 240MeV pions and GPBC Allowed EOFA to accept different solvers for the L and R operations in the heatbath step Fixed EOFA Meofa operating on member Phi rather than input field Added derived EOFA pseudofermion variant that allows for mixed prec CG to be used in the heatbath Added forces/Test_mobius_gparity_eofa_mixed testing the above reproduces the regular EOFA To Test_gamma, added checks for the various properties of the charge conjugation matrix C=-gamma2*gamma4 in Grid basis	2021-06-01 11:44:34 -04:00
Christopher Kelly	86f08c6b9a	Added a check that the initial EOFA action agrees with \|eta\|^2, thus checking the quality of the rational approximation in the heatbath	2021-05-18 13:57:44 -04:00
Christopher Kelly	9f0271039f	Completed implementation of Meofa method of ExactOneFlavourRatio pseudofermion action Added tests to tests/forces/Test_mobius_force_eofa.cc testing that the EOFA heatbath results in Phi = M^{-1/2} eta	2021-05-18 12:27:51 -04:00
Christopher Kelly	24df770f74	Added tests/IO/Test_field_array_io.cc testing/demonstrating parallel IO of an array of 5D fermion fields	2021-05-13 12:32:45 -04:00
Christopher Kelly	45b6c7effc	Added a test code forces/Test_gpdwf_force_1f_2f that compares the action and force for DWF, EOFA and DSDR actions between the 1f and 2f implementations of G-parity BCs Broke up ExactOneFlavourRatio refresh into a virtual routine that generates eta and one that uses it as with the ratio and RHMC actions Added accessors to the pseudofermion field to TwoFlavourEvenOddRatio and ExactOneFlavourRatio	2021-05-12 16:34:07 -04:00
Quadro	1c70d8c4d9	Warning remove	2021-05-05 19:56:04 -04:00
Quadro	f0e9a5299f	Happy on GCC I hope	2021-05-05 19:55:34 -04:00
Quadro	f1b8ba45e7	Warning on GCC suppress unrelated to my code so why doesn't it shut up about its ABI fix	2021-05-05 19:54:21 -04:00
Peter Boyle	fe998ab578	Merge branch 'feature/gparity_HMC' of https://github.com/paboyle/Grid into feature/gparity_HMC	2021-05-05 17:36:51 -04:00
Peter Boyle	c2ee2b5fd1	Random changes	2021-05-05 17:36:38 -04:00
Peter Boyle	3b734ee397	two point function example	2021-05-05 17:36:19 -04:00
Peter Boyle	8637a9512a	Freeze Gaussian implementation	2021-05-05 17:34:54 -04:00
Peter Boyle	7f6e2ee03e	Drop normal_distribution, standardise	2021-05-05 17:34:17 -04:00
Peter Boyle	7b02acb2bd	Merge branch 'feature/gparity_HMC' of https://github.com/paboyle/Grid into feature/gparity_HMC	2021-05-04 13:45:11 -04:00
Peter Boyle	86948c6ea0	CRC for finger print fields - aids debug / version diff	2021-05-04 13:44:38 -04:00
Peter Boyle	53d226924a	CRC added	2021-05-04 13:44:07 -04:00
Christopher Kelly	80176b1b39	RHMC now outputs some initial norms to the logs Fixed DWF+I Gparity binaries not correctly assigning twist directions (thanks Peter!)	2021-05-04 13:12:23 -04:00
Christopher Kelly	29ddafd0fc	Added variant of G-parity DWF+I ensemble gen code using double prec RHMC	2021-04-30 13:12:24 -04:00
Peter Boyle	0f08364e4f	Mom filter refresh sRNG	2021-04-26 23:18:11 +02:00
Peter Boyle	a198d59381	Merge branch 'feature/gparity_HMC' of https://github.com/paboyle/Grid into feature/gparity_HMC	2021-04-26 21:05:52 +02:00
Peter Boyle	3a4f5f2324	Merge develop, strengthen force tests	2021-04-22 18:54:00 -04:00
Peter Boyle	824d84473f	Merge branch 'develop' into feature/gparity_HMC	2021-04-22 16:32:41 -04:00
Peter Boyle	38964a4076	Switch twist direction	2021-04-22 15:57:37 -04:00
Peter Boyle	0d9aa87228	Reduce momentum to the GP plane	2021-04-22 15:56:59 -04:00
Peter Boyle	0e959d9b94	Update plaquette analysis	2021-04-22 15:55:47 -04:00
Peter Boyle	752f70cd48	Merge branch 'develop' into feature/gparity_HMC	2021-04-22 01:58:11 +02:00
Christopher Kelly	e0e42873c1	Const correctness for Lattice::Replicate Adapted GeneralEvenOddRationalRatio and Test_rhmc_EOWilsonRatio_doubleVsMixedPrec to recent changes that require passing in serial RNG For GeneralEvenOddRationalRatio and TwoFlavourEvenOddRatio, broke refresh into two stages, the first of which generates the random field and the second that computes the pseudofermion field. This allows derived classes to override the generation of the random field, for example in testing. Test_dwf_gpforce now uses Gparity in x-direction and APBC in time as opposed to G-parity in time Added Test_action_dwf_gparity2fvs1f that compares the DWF fermion action with the 2f and the 1f (doubled-lattice) implementations of Gparity	2021-04-14 16:41:27 -04:00
Christopher Kelly	0ff3bf6dc5	Merge branch 'develop' into feature/gparity_HMC	2021-03-22 15:33:13 -04:00
Christopher Kelly	351eab02ae	Comment fix	2021-03-22 14:39:17 -04:00
Christopher Kelly	feee5ccde2	Added Gparity flavour Pauli matrix algebra and associated tensor types mirroring strategy used for Gamma matrices Added test program for the above	2021-03-03 15:39:41 -05:00
Christopher Kelly	e0f6a146d8	To DWF+I G-parity evolution code, added ability to specify number of MD steps in params and an optional usage mode that reads the config and checks the plaq/checksum agree then exits	2021-02-16 10:41:52 -05:00
Christopher Kelly	daa095c519	Fixed an obscure but reproducible hang in the RHMC caused by the bounds check being activated by a random number that wasn't synchronized over the nodes HMC now also reports the "L-infinity norm" of the impulse, aka the largest site norm	2021-02-09 12:55:46 -05:00
Christopher Kelly	c2676853ca	Merge branch 'bugfix/maxnorm2' into feature/gparity_HMC	2021-02-08 12:17:33 -05:00
Christopher Kelly	6a824033f8	Merge branch 'develop' into feature/gparity_HMC	2021-02-08 09:31:49 -05:00
Christopher Kelly	cee6a37639	Added a logging tag for HMC As the integrator logger is active by default the cmdline option to activate had no effect. Changed option to deactivate on request ("NoIntegrator") Cleaned up generating rational approxs in the general RHMC code As the tolerance of the rational approx is not related to the CG tolerance, regenerating approxs for MD and MC if they differ only by the CG tolerance is not necessary; this has been fixed In DWF+I Gparity evolution code, added cmdline options to check the rational approximations and compute the lowest/highest eigenvalues of M^dagM for RHMC tuning In the above, changed the integrator layout to a much simpler one that completes much faster; may need additional tuning	2021-02-08 09:30:35 -05:00
Christopher Kelly	6cc3ad110c	Improved logging output for RHMC bounds checks In GenericHMCRunner, exposed functionality for initializing gauge fields and RNG for external use	2021-01-29 12:35:00 -05:00
Christopher Kelly	e6c6f82c52	Gparity DWF+I HMC main program now has option to specify parameter file	2021-01-27 11:18:41 -05:00
Christopher Kelly	d10d0c4e7f	Merge branch 'develop' into feature/gparity_HMC	2021-01-25 15:13:29 -05:00
Christopher Kelly	9c106d625a	Added HMC main program designed to reproduce the 16^3x32x16 DWF+I ensembles with beta=2.13 and Gparity BCs	2021-01-25 15:07:44 -05:00
Christopher Kelly	6795bbca31	Generalized GeneralEvenOddRatioRationalPseudoFermionAction such that the multi-shift CG algorithm can be overridden by derived classes Added a mixed-precision variant of GeneralEvenOddRatioRationalPseudoFermionAction and a verification test against double prec class Fixed non-const reference used in passing RHMC approx to multishift classes	2021-01-25 14:22:31 -05:00
Christopher Kelly	d161c2dc35	Improved formatting of timing output in mixed-prec multishift In test of mixed-prec multishift, added comparison against full double precision multishift both for timing and to cross-check the results	2021-01-20 15:42:06 -05:00
Christopher Kelly	7a06826cf1	Added option to NerscIO to disable exit on failing plaquette check allowing for circumvention of factor of 2 error in CPS-generated G-parity config headers Adapted mixed-prec multi-shift test to new way to pass gauge BC directions and added cmdline option to perform the G-parity plaquette comparison with the corrected plaquette when loading config	2021-01-20 13:31:50 -05:00
Christopher Kelly	c3712b8e06	Merge branch 'develop' into feature/gparity_HMC	2021-01-20 11:48:52 -05:00
Christopher Kelly	901ee77b84	Mixed precision multishift test can now be performed with/without G-parity using cmdline check and can load a pregenerated configuration	2021-01-20 11:45:44 -05:00
Christopher Kelly	1b84f59273	Added a mixed precision multishift algorithm for which the matrix multiplies are performed in single precision but the search directions are accumulated in double precision. A reliable update step is performed at a tunable frequency to correct the residual. A final mixed-prec single-shift solve is performed on each pole to perform cleanup if necessary. A test is provided to demonstrate the algorithm.	2021-01-06 12:24:44 -05:00
Christopher Kelly	1fb41a4300	Added copyLane function to Tensor_extract_merge.h which copies one lane of data from an input tensor object to a different lane of an output tensor object of potentially different precision precisionChange lattice function now uses copyLane to remove need for temporary scalar objects, reducing register footprint and significantly improving performance	2021-01-06 11:50:56 -05:00
Christopher Kelly	287bac946f	ConjugateGradientMixedPrec now stores final true residual and uses the precisionChange workspaces for improved efficiency	2021-01-06 09:50:41 -05:00
Christopher Kelly	80c14be65e	Added core test to check precision change	2021-01-06 09:34:44 -05:00
Christopher Kelly	d7a2a4852d	Reimplemented precisionChange to run on GPUs. A workspace containing the mapping table can be optionally precomputed and reused for improved performance.	2021-01-06 09:30:49 -05:00
Christopher Kelly	d185f2eaa7	OneFlavourEvenOddRatioRationalPseudoFermionAction now derives from GeneralEvenOddRatioRationalPseudoFermionAction, simply performs transcription of parameters	2020-12-23 16:26:10 -05:00
Christopher Kelly	813d4cd900	Added test program that ensures the generic checkerboarded RHMC (with parameters set appropriately) gives the same answer as the existing 1f code	2020-12-23 16:01:42 -05:00
Christopher Kelly	75c6c6b173	General RHMC pseudofermion action now allows for different rational approximations to be used in the MD and action evaluation	2020-12-23 11:19:26 -05:00
Christopher Kelly	220ad5e3ee	Added more verbose log output to GeneralEvenOddRatioRationalPseudoFermionAction In GeneralEvenOddRatioRationalPseudoFermionAction, setting the bounds check frequency to 0 now disables the check	2020-12-22 11:08:22 -05:00
Christopher Kelly	ba5dc670a5	Reimplemented GparityWilsonImpl::InsertForce5D to run efficiently on GPUs Swapped order of templated tensor code and c-number specializations in Tensor_outer.h to fix compile issue with type deduction on Summit	2020-12-22 10:10:07 -05:00
Christopher Kelly	a0ca362690	Added an RHMC pseudofermion action, GeneralEvenOddRatioRationalPseudoFermionAction, that works for an arbitrary fractional power, not just a square root Added a test evolution for the above, Test_rhmc_EOWilsonRatioPowQuarter, demonstrating conservation of Hamiltonian Fixed HMC ignoring the MetropolisTest parameter of HMCparameters	2020-12-17 16:21:58 -05:00
Christopher Kelly	249b6e61ec	For G-parity BCs the Nd-1 direction is now assumed to be the time direction and setting a twist in this direction will apply antiperiodic BCs Added option to run Test_gparity with antiperiodic time BCs	2020-12-17 14:09:00 -05:00
		`@ -1 +0,0 @@`
			`CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi`