Test_evec_compression changes:

Added the ability to choose one of a variety of preselected basis sizes from the command line. The fine Lanczos now checks that enough evecs are generated and resizes the output to Nstop rather than the actual number that converged (which can be larger).
Test_evec_compression enhancements:
2026-02-18 21:00:53 +00:00 · 2022-04-06 06:33:26 -07:00 · 2022-03-29 06:16:15 -07:00 · 2022-03-14 06:45:28 -07:00 · 2022-02-22 14:25:27 -05:00 · 2022-02-16 14:01:43 +00:00
168 changed files with 14029 additions and 1302 deletions
--- a/Grid/DisableWarnings.h
+++ b/Grid/DisableWarnings.h
@@ -34,6 +34,9 @@ directory
 #if defined __GNUC__ && __GNUC__>=6
 #pragma GCC diagnostic ignored "-Wignored-attributes"
 #endif
 #if defined __GNUC__ 
 #pragma GCC diagnostic ignored "-Wpsabi"
 #endif
 //disables and intel compiler specific warning (in json.hpp)
--- a/Grid/GridQCDcore.h
+++ b/Grid/GridQCDcore.h
@@ -36,6 +36,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridCore.h>
 #include <Grid/qcd/QCD.h>
 #include <Grid/qcd/spin/Spin.h>
 #include <Grid/qcd/gparity/Gparity.h>
 #include <Grid/qcd/utils/Utils.h>
 #include <Grid/qcd/representations/Representations.h>
 NAMESPACE_CHECK(GridQCDCore);
--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@@ -54,6 +54,7 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
 #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -358,7 +358,7 @@ public:
    autoView( in_v , in, AcceleratorRead);
    autoView( out_v , out, AcceleratorWrite);
    autoView( Stencil_v  , Stencil, AcceleratorRead);
-    auto& geom_v = geom;
+    int npoint = geom.npoint;
    typedef LatticeView<Cobj> Aview;
    Vector<Aview> AcceleratorViewContainer;
@@ -380,7 +380,7 @@ public:
      int ptype;
      StencilEntry *SE;
-      for(int point=0;point<geom_v.npoint;point++){
+      for(int point=0;point<npoint;point++){
 	SE=Stencil_v.GetEntry(ptype,point,ss);
@@ -424,7 +424,7 @@ public:
    autoView( in_v , in, AcceleratorRead);
    autoView( out_v , out, AcceleratorWrite);
    autoView( Stencil_v  , Stencil, AcceleratorRead);
-    auto& geom_v = geom;
+    int npoint = geom.npoint;
    typedef LatticeView<Cobj> Aview;
    Vector<Aview> AcceleratorViewContainer;
@@ -454,7 +454,7 @@ public:
      int ptype;
      StencilEntry *SE;
-      for(int p=0;p<geom_v.npoint;p++){
+      for(int p=0;p<npoint;p++){
        int point = points_p[p];
 	SE=Stencil_v.GetEntry(ptype,point,ss);
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@@ -52,6 +52,7 @@ public:
  virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
  virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
  virtual void HermOp(const Field &in, Field &out)=0;
  virtual ~LinearOperatorBase(){};
 };
@@ -507,7 +508,7 @@ class SchurStaggeredOperator :  public SchurOperatorBase<Field> {
  virtual  void MpcDag   (const Field &in, Field &out){
    Mpc(in,out);
  }
-  virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
+  virtual void MpcDagMpc(const Field &in, Field &out) {
    assert(0);// Never need with staggered
  }
 };
@@ -585,6 +586,7 @@ class HermOpOperatorFunction : public OperatorFunction<Field> {
 template<typename Field>
 class PlainHermOp : public LinearFunction<Field> {
 public:
  using LinearFunction<Field>::operator();
  LinearOperatorBase<Field> &_Linop;
  PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop) 
@@ -598,6 +600,7 @@ public:
 template<typename Field>
 class FunctionHermOp : public LinearFunction<Field> {
 public:
  using LinearFunction<Field>::operator(); 
  OperatorFunction<Field>   & _poly;
  LinearOperatorBase<Field> &_Linop;
--- a/Grid/algorithms/Preconditioner.h
+++ b/Grid/algorithms/Preconditioner.h
@@ -30,13 +30,19 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
-template<class Field> class Preconditioner :  public LinearFunction<Field> { 
+template<class Field> using Preconditioner =  LinearFunction<Field> ;
 /*
 template<class Field> class Preconditioner :  public LinearFunction<Field> {
  using LinearFunction<Field>::operator();
  virtual void operator()(const Field &src, Field & psi)=0;
 };
 */
 template<class Field> class TrivialPrecon :  public Preconditioner<Field> { 
 public:
-  void operator()(const Field &src, Field & psi){
+  using Preconditioner<Field>::operator();
  virtual void operator()(const Field &src, Field & psi){
    psi = src;
  }
  TrivialPrecon(void){};
--- a/Grid/algorithms/SparseMatrix.h
+++ b/Grid/algorithms/SparseMatrix.h
@@ -48,6 +48,7 @@ public:
  virtual  void Mdiag    (const Field &in, Field &out)=0;
  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0;
  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)=0;
  virtual ~SparseMatrixBase() {};
 };
 /////////////////////////////////////////////////////////////////////////////////////////////
@@ -72,7 +73,7 @@ public:
  virtual  void MeooeDag    (const Field &in, Field &out)=0;
  virtual  void MooeeDag    (const Field &in, Field &out)=0;
  virtual  void MooeeInvDag (const Field &in, Field &out)=0;
-
+  virtual ~CheckerBoardedSparseMatrixBase() {};
 };
 NAMESPACE_END(Grid);
--- a/Grid/algorithms/approx/Chebyshev.h
+++ b/Grid/algorithms/approx/Chebyshev.h
@@ -292,6 +292,7 @@ public:
 template<class Field>
 class ChebyshevLanczos : public Chebyshev<Field> {
 private:
  std::vector<RealD> Coeffs;
  int order;
  RealD alpha;
--- a/Grid/algorithms/iterative/BiCGSTABMixedPrec.h
+++ b/Grid/algorithms/iterative/BiCGSTABMixedPrec.h
@@ -36,7 +36,8 @@ NAMESPACE_BEGIN(Grid);
 template<class FieldD, class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
 class MixedPrecisionBiCGSTAB : public LinearFunction<FieldD> 
 {
-  public:                                                
+  public:
    using LinearFunction<FieldD>::operator();
    RealD   Tolerance;
    RealD   InnerTolerance; // Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
--- a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
@@ -35,7 +35,8 @@ NAMESPACE_BEGIN(Grid);
    typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
    typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
  class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
-  public:                                                
+  public:
    using LinearFunction<FieldD>::operator();
    RealD   Tolerance;
    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
@@ -48,6 +49,7 @@ NAMESPACE_BEGIN(Grid);
    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
    RealD TrueResidual;
    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
    LinearFunction<FieldF> *guesser;
@@ -67,6 +69,7 @@ NAMESPACE_BEGIN(Grid);
    }
  void operator() (const FieldD &src_d_in, FieldD &sol_d){
    std::cout << GridLogMessage << "MixedPrecisionConjugateGradient: Starting mixed precision CG with outer tolerance " << Tolerance << " and inner tolerance " << InnerTolerance << std::endl;
    TotalInnerIterations = 0;
    GridStopWatch TotalTimer;
@@ -79,6 +82,11 @@ NAMESPACE_BEGIN(Grid);
    RealD stop = src_norm * Tolerance*Tolerance;
    GridBase* DoublePrecGrid = src_d_in.Grid();
    //Generate precision change workspaces
    precisionChangeWorkspace wk_dp_from_sp(DoublePrecGrid, SinglePrecGrid);
    precisionChangeWorkspace wk_sp_from_dp(SinglePrecGrid, DoublePrecGrid);
    FieldD tmp_d(DoublePrecGrid);
    tmp_d.Checkerboard() = cb;
@@ -96,6 +104,7 @@ NAMESPACE_BEGIN(Grid);
    FieldF sol_f(SinglePrecGrid);
    sol_f.Checkerboard() = cb;
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting initial inner CG with tolerance " << inner_tol << std::endl;
    ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
    CG_f.ErrorOnNoConverge = false;
@@ -119,7 +128,7 @@ NAMESPACE_BEGIN(Grid);
      while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
      PrecChangeTimer.Start();
-      precisionChange(src_f, src_d);
+      precisionChange(src_f, src_d, wk_sp_from_dp);
      PrecChangeTimer.Stop();
      sol_f = Zero();
@@ -129,6 +138,7 @@ NAMESPACE_BEGIN(Grid);
 	(*guesser)(src_f, sol_f);
      //Inner CG
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " << outer_iter << " starting inner CG with tolerance " << inner_tol << std::endl;
      CG_f.Tolerance = inner_tol;
      InnerCGtimer.Start();
      CG_f(Linop_f, src_f, sol_f);
@@ -137,7 +147,7 @@ NAMESPACE_BEGIN(Grid);
      //Convert sol back to double and add to double prec solution
      PrecChangeTimer.Start();
-      precisionChange(tmp_d, sol_f);
+      precisionChange(tmp_d, sol_f, wk_dp_from_sp);
      PrecChangeTimer.Stop();
      axpy(sol_d, 1.0, tmp_d, sol_d);
@@ -149,6 +159,7 @@ NAMESPACE_BEGIN(Grid);
    ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
    CG_d(Linop_d, src_d_in, sol_d);
    TotalFinalStepIterations = CG_d.IterationsToComplete;
    TrueResidual = CG_d.TrueResidual;
    TotalTimer.Stop();
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
@@ -52,7 +52,7 @@ public:
  MultiShiftFunction shifts;
  std::vector<RealD> TrueResidualShift;
-  ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : 
+  ConjugateGradientMultiShift(Integer maxit, const MultiShiftFunction &_shifts) : 
    MaxIterations(maxit),
    shifts(_shifts)
  { 
@@ -182,6 +182,9 @@ public:
    for(int s=0;s<nshift;s++) {
      axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
    }
    std::cout << GridLogIterative << "ConjugateGradientMultiShift: initial rn (|src|^2) =" << rn << " qq (|MdagM src|^2) =" << qq << " d ( dot(src, [MdagM + m_0]src) ) =" << d << " c=" << c << std::endl;
  ///////////////////////////////////////
  // Timers
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
@@ -0,0 +1,411 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Christopher Kelly <ckelly@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
 #define GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
 NAMESPACE_BEGIN(Grid);
 //CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision. 
 //The residual is stored in single precision, but the search directions and solution are stored in double precision. 
 //Every update_freq iterations the residual is corrected in double precision. 
 //For safety a final regular CG is applied to clean up if necessary
 //Linop to add shift to input linop, used in cleanup CG
 namespace ConjugateGradientMultiShiftMixedPrecSupport{
 //Linear operator wrapper whose Hermitian application is that of the wrapped operator plus shift*in.
 //Only HermOp and HermOpAndNorm are implemented; every other entry point hits assert(0).
 template<typename Field>
 class ShiftedLinop: public LinearOperatorBase<Field>{
 public:
   LinearOperatorBase<Field> &linop_base; //wrapped operator, held by reference
   RealD shift;                           //coefficient of 'in' added after linop_base.HermOp
   ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift)
     : linop_base(_linop_base), shift(_shift)
   {}
   //Unsupported entry points
   void OpDiag (const Field &in, Field &out){ assert(0); }
   void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
   void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); }
   void Op     (const Field &in, Field &out){ assert(0); }
   void AdjOp  (const Field &in, Field &out){ assert(0); }
   //Apply the wrapped HermOp, then add shift*in to the result
   void HermOp(const Field &in, Field &out){
     linop_base.HermOp(in, out);
     axpy(out, shift, in, out);
   }
   //As HermOp, additionally returning n1 = real(innerProduct(in,out)) and n2 = norm2(out)
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
     HermOp(in,out);
     n1 = real(innerProduct(in,out));
     n2 = norm2(out);
   }
 };
 };
 //Multi-shift conjugate gradient in which the operator is applied in single precision (Linop_f).
 //The residual is carried in single precision while the search directions and the solutions are
 //kept in double precision. Every ReliableUpdateFreq iterations the residual of the primary shift
 //is recomputed with the double precision operator. After convergence each shift is checked with
 //the double precision operator and, if its target was missed, cleaned up with a
 //MixedPrecisionConjugateGradient applied to the shifted operator.
 template<class FieldD, class FieldF,
          typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
          typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
 class ConjugateGradientMultiShiftMixedPrec : public OperatorMultiFunction<FieldD>,
                                              public OperatorFunction<FieldD>
 {
 public:
   using OperatorFunction<FieldD>::operator();
   RealD   Tolerance; //Not read by this class: per-shift targets are taken from shifts.tolerances
   Integer MaxIterations;
   Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
   std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
   int verbose;
   MultiShiftFunction shifts;
   std::vector<RealD> TrueResidualShift;
   int ReliableUpdateFreq; //number of iterations between reliable updates; a value <= 0 disables them
   GridBase* SinglePrecGrid; //Grid for single-precision fields
   LinearOperatorBase<FieldF> &Linop_f; //single precision
   //maxit:              maximum number of multi-shift iterations (also passed to the cleanup CG)
   //_shifts:            poles, residues, norm and per-shift tolerances; copied into 'shifts'
   //_SinglePrecGrid:    grid on which the single precision work fields are allocated
   //_Linop_f:           single precision operator, held by reference
   //_ReliableUpdateFreq:number of iterations between double precision residual replacements
   //The initialiser list follows the member declaration order.
   ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
                                        GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
                                        int _ReliableUpdateFreq
                                        ) :
     Tolerance(0.), MaxIterations(maxit), IterationsToComplete(0), verbose(1), shifts(_shifts),
     ReliableUpdateFreq(_ReliableUpdateFreq), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f)
   {
     IterationsToCompleteShift.resize(_shifts.order);
     TrueResidualShift.resize(_shifts.order);
   }
   //Solve for all shifts and return only the combination psi (see the overload below)
   void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
   {
     GridBase *grid = src.Grid();
     int nshift = shifts.order;
     std::vector<FieldD> results(nshift,grid);
     (*this)(Linop,src,results,psi);
   }
   //Solve for all shifts into 'results' and form psi = shifts.norm*src + sum_i shifts.residues[i]*results[i]
   void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
   {
     int nshift = shifts.order;
     (*this)(Linop,src,results);
     psi = shifts.norm*src;
     for(int i=0;i<nshift;i++){
       psi = psi + shifts.residues[i]*results[i];
     }
     return;
   }
   //The solver proper. psi_d must have one entry per shift; psi_d[s] receives the solution for shifts.poles[s].
   //On return IterationsToCompleteShift / TrueResidualShift are filled per shift and IterationsToComplete is set
   //(to MaxIterations if the iteration limit was reached without all shifts converging).
   void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
   {
     GridBase *DoublePrecGrid = src_d.Grid();
     precisionChangeWorkspace wk_f_from_d(SinglePrecGrid, DoublePrecGrid);
     precisionChangeWorkspace wk_d_from_f(DoublePrecGrid, SinglePrecGrid);
     ////////////////////////////////////////////////////////////////////////
     // Convenience references to the info stored in "MultiShiftFunction"
     ////////////////////////////////////////////////////////////////////////
     int nshift = shifts.order;
     std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
     std::vector<RealD> &mresidual(shifts.tolerances);
     std::vector<RealD> alpha(nshift,1.0);
     //Double precision search directions
     FieldD p_d(DoublePrecGrid);
     std::vector<FieldD> ps_d(nshift, DoublePrecGrid);// Search directions (double precision)
     FieldD tmp_d(DoublePrecGrid);
     FieldD r_d(DoublePrecGrid);
     FieldD mmp_d(DoublePrecGrid);
     assert(psi_d.size()==nshift);
     assert(mass.size()==nshift);
     assert(mresidual.size()==nshift);
     // Per-shift recurrence state. Held in std::vector rather than runtime-sized stack arrays,
     // which are a compiler extension and not standard C++.
     std::vector<RealD> bs(nshift);
     std::vector<RealD> rsq(nshift);
     std::vector<std::vector<RealD> > z(nshift, std::vector<RealD>(2)); // two-deep history, toggled by iz
     std::vector<int>   converged(nshift);
     const int       primary =0;
     //Primary shift fields CG iteration
     RealD a,b,c,d;
     RealD cp,bp,qq; //prev
     // Matrix mult fields
     FieldF r_f(SinglePrecGrid);
     FieldF p_f(SinglePrecGrid);
     FieldF mmp_f(SinglePrecGrid);
     FieldF src_f(SinglePrecGrid);
     precisionChange(src_f, src_d, wk_f_from_d);
     // Check lightest mass
     for(int s=0;s<nshift;s++){
       assert( mass[s]>= mass[primary] );
       converged[s]=0;
     }
     // Wire guess to zero
     // Residuals "r" are src
     // First search direction "p" is also src
     cp = norm2(src_d);
     // Handle trivial case of zero src.
     if( cp == 0. ){
       for(int s=0;s<nshift;s++){
         psi_d[s] = Zero();
         IterationsToCompleteShift[s] = 1;
         TrueResidualShift[s] = 0.;
       }
       IterationsToComplete = 1; //agree with the per-shift counts rather than leaving a stale value
       return;
     }
     for(int s=0;s<nshift;s++){
       rsq[s] = cp * mresidual[s] * mresidual[s];
       std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
       ps_d[s] = src_d;
     }
     // r and p for primary
     r_f=src_f; //residual maintained in single
     p_f=src_f;
     p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
     //MdagM+m[0]
     Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
     axpy(mmp_f,mass[0],p_f,mmp_f);
     RealD rn = norm2(p_f);
     d += rn*mass[0];
     b = -cp /d;
     // Set up the various shift variables
     int       iz=0;
     z[0][1-iz] = 1.0;
     z[0][iz]   = 1.0;
     bs[0]      = b;
     for(int s=1;s<nshift;s++){
       z[s][1-iz] = 1.0;
       z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0]));
       bs[s]      = b*z[s][iz];
     }
     // r += b[0] A.p[0]
     // c= norm(r)
     c=axpy_norm(r_f,b,mmp_f,r_f);
     for(int s=0;s<nshift;s++) {
       axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
     }
     ///////////////////////////////////////
     // Timers
     ///////////////////////////////////////
     GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
     SolverTimer.Start();
     // Iteration loop
     int k;
     for (k=1;k<=MaxIterations;k++){
       a = c /cp;
       //Update double precision search direction by residual
       PrecChangeTimer.Start();
       precisionChange(r_d, r_f, wk_d_from_f);
       PrecChangeTimer.Stop();
       AXPYTimer.Start();
       axpy(p_d,a,p_d,r_d);
       for(int s=0;s<nshift;s++){
         if ( ! converged[s] ) {
           if (s==0){
             axpy(ps_d[s],a,ps_d[s],r_d);
           } else{
             RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
             axpby(ps_d[s],z[s][iz],as,r_d,ps_d[s]);
           }
         }
       }
       AXPYTimer.Stop();
       PrecChangeTimer.Start();
       precisionChange(p_f, p_d, wk_f_from_d); //get back single prec search direction for linop
       PrecChangeTimer.Stop();
       cp=c;
       MatrixTimer.Start();
       Linop_f.HermOp(p_f,mmp_f);
       d=real(innerProduct(p_f,mmp_f));
       MatrixTimer.Stop();
       AXPYTimer.Start();
       axpy(mmp_f,mass[0],p_f,mmp_f);
       AXPYTimer.Stop();
       RealD rn = norm2(p_f);
       d += rn*mass[0];
       bp=b;
       b=-cp/d;
       // Toggle the recurrence history
       bs[0] = b;
       iz = 1-iz;
       ShiftTimer.Start();
       for(int s=1;s<nshift;s++){
         if((!converged[s])){
           RealD z0 = z[s][1-iz];
           RealD z1 = z[s][iz];
           z[s][iz] = z0*z1*bp
             / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
           bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
         }
       }
       ShiftTimer.Stop();
       //Update double precision solutions
       AXPYTimer.Start();
       for(int s=0;s<nshift;s++){
         if( (!converged[s]) ) {
           axpy(psi_d[s],-bs[s]*alpha[s],ps_d[s],psi_d[s]);
         }
       }
       //Perform reliable update if necessary; otherwise update residual from single-prec mmp
       RealD c_f = axpy_norm(r_f,b,mmp_f,r_f);
       AXPYTimer.Stop();
       c = c_f;
       //The frequency is tested first so that a zero value cannot reach the modulo
       if(ReliableUpdateFreq > 0 && k % ReliableUpdateFreq == 0){
         //Replace r with true residual
         MatrixTimer.Start();
         Linop_d.HermOp(psi_d[0],mmp_d);
         MatrixTimer.Stop();
         AXPYTimer.Start();
         axpy(mmp_d,mass[0],psi_d[0],mmp_d);
         RealD c_d = axpy_norm(r_d, -1.0, mmp_d, src_d);
         AXPYTimer.Stop();
         std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_f <<" with |r|^2 = "<<c_d<<std::endl;
         PrecChangeTimer.Start();
         precisionChange(r_f, r_d, wk_f_from_d);
         PrecChangeTimer.Stop();
         c = c_d;
       }
       // Convergence checks
       int all_converged = 1;
       for(int s=0;s<nshift;s++){
         if ( (!converged[s]) ){
           IterationsToCompleteShift[s] = k;
           RealD css  = c * z[s][iz]* z[s][iz];
           if(css<rsq[s]){
             std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
             converged[s]=1;
           } else {
             all_converged=0;
           }
         }
       }
       if ( all_converged ){
         SolverTimer.Stop();
         std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
         std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
         // Check answers
         for(int s=0; s < nshift; s++) {
           Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
           axpy(tmp_d,mass[s],psi_d[s],mmp_d);
           axpy(r_d,-alpha[s],src_d,tmp_d);
           RealD rn = norm2(r_d);
           RealD cn = norm2(src_d);
           TrueResidualShift[s] = std::sqrt(rn/cn);
           std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
           //If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
           if(rn >= rsq[s]){
             CleanupTimer.Start();
             std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: performing cleanup step for shift " << s << std::endl;
             //Setup linear operators for final cleanup
             ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
             ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
             MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d);
             cg(src_d, psi_d[s]);
             TrueResidualShift[s] = cg.TrueResidual;
             CleanupTimer.Stop();
           }
         }
         std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrec: Time Breakdown for body"<<std::endl;
         std::cout << GridLogMessage << "\tSolver    " << SolverTimer.Elapsed()     <<std::endl;
         std::cout << GridLogMessage << "\t\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
         std::cout << GridLogMessage << "\t\tMatrix    " << MatrixTimer.Elapsed()     <<std::endl;
         std::cout << GridLogMessage << "\t\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
         std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed()     <<std::endl;
         std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed()     <<std::endl;
         std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
         IterationsToComplete = k;
         return;
       }
     }
     // ugly hack
     std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
     IterationsToComplete = MaxIterations; //all permitted iterations were used
     //  assert(0);
   }
 };
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/algorithms/iterative/Deflation.h
+++ b/Grid/algorithms/iterative/Deflation.h
@@ -33,16 +33,19 @@ namespace Grid {
 //Guesser that ignores the source and sets the guess to Zero()
 template<class Field>
 class ZeroGuesser: public LinearFunction<Field> {
 public:
   using LinearFunction<Field>::operator();
   virtual void operator()(const Field &src, Field &guess)
   {
     guess = Zero();
   }
 };
 //Guesser that leaves the incoming guess untouched
 template<class Field>
 class DoNothingGuesser: public LinearFunction<Field> {
 public:
   using LinearFunction<Field>::operator();
   virtual void operator()(const Field &src, Field &guess)
   {
   }
 };
 //Guesser that copies the source into the guess
 template<class Field>
 class SourceGuesser: public LinearFunction<Field> {
 public:
   using LinearFunction<Field>::operator();
   virtual void operator()(const Field &src, Field &guess)
   {
     guess = src;
   }
 };
@@ -57,6 +60,7 @@ private:
  const unsigned int       N;
 public:
  using LinearFunction<Field>::operator();
  DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval)
  : DeflatedGuesser(_evec, _eval, _evec.size())
@@ -87,6 +91,7 @@ private:
  const std::vector<RealD>       &eval_coarse;
 public:
  using LinearFunction<FineField>::operator();
  LocalCoherenceDeflatedGuesser(const std::vector<FineField>   &_subspace,
 				const std::vector<CoarseField> &_evec_coarse,
 				const std::vector<RealD>       &_eval_coarse)
--- a/Grid/algorithms/iterative/LocalCoherenceLanczos.h
+++ b/Grid/algorithms/iterative/LocalCoherenceLanczos.h
@@ -44,6 +44,7 @@ public:
 				  int, MinRes);    // Must restart
 };
 //This class is the input parameter class for some testing programs
 struct LocalCoherenceLanczosParams : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
@@ -67,6 +68,7 @@ public:
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
  using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
@@ -97,6 +99,7 @@ public:
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
  using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
@@ -153,6 +156,7 @@ public:
      _coarse_relax_tol(coarse_relax_tol)  
  {    };
  //evalMaxApprox: approximation of largest eval of the fine Chebyshev operator (suitably wrapped by block projection)
  int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  {
    CoarseField v(B);
@@ -179,8 +183,16 @@ public:
    if( (vv<eresid*eresid) ) conv = 1;
    return conv;
  }
-  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
+
  //This function is called at the end of the coarse grid Lanczos. It promotes the coarse eigenvector 'B' to the fine grid,
  //applies a smoother to the result then computes the *fine grid* eigenvalue (output as 'eval').
  //evalMaxApprox should be the approximation of the largest eval of the fine Hermop. However when this function is called by IRL it actually passes the largest eval of the *Chebyshev* operator (as this is the max approx used for the TestConvergence above)
  //As the largest eval of the Chebyshev is typically several orders of magnitude larger this makes the convergence test pass even when it should not.
  //We therefore ignore evalMaxApprox here and use a value of 1.0 (note this value is already used by TestCoarse)
  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)  
  {
    evalMaxApprox = 1.0; //cf above
    GridBase *FineGrid = _subspace[0].Grid();    
    int checkerboard   = _subspace[0].Checkerboard();
    FineField fB(FineGrid);fB.Checkerboard() =checkerboard;
@@ -199,13 +211,13 @@ public:
    eval   = vnum/vden;
    fv -= eval*fB;
    RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);
-
+    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
    std::cout.precision(13);
    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
 	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
-	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
+	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv << " target " << eresid*eresid
 	     <<std::endl;
    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
    if( (vv<eresid*eresid) ) return 1;
    return 0;
  }
@@ -283,6 +295,10 @@ public:
    evals_coarse.resize(0);
  };
  //The block inner product is the inner product on the fine grid locally summed over the blocks
  //to give a Lattice<Scalar> on the coarse grid. This function orthonormalizes the fine-grid subspace
  //vectors under the block inner product. This step must be performed after computing the fine grid
  //eigenvectors and before computing the coarse grid eigenvectors.    
  void Orthogonalise(void ) {
    CoarseScalar InnerProd(_CoarseGrid);
    std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
@@ -326,6 +342,8 @@ public:
    }
  }
  //While this method serves to check the coarse eigenvectors, it also recomputes the eigenvalues from the smoothed reconstructed eigenvectors
  //hence the smoother can be tuned after running the coarse Lanczos by using a different smoother here
  void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) 
  {
    assert(evals_fine.size() == nbasis);
@@ -374,25 +392,31 @@ public:
    evals_fine.resize(nbasis);
    subspace.resize(nbasis,_FineGrid);
  }
  //cheby_op: Parameters of the fine grid Chebyshev polynomial used for the Lanczos acceleration
  //cheby_smooth: Parameters of a separate Chebyshev polynomial used after the Lanczos has completed to smooth out high frequency noise in the reconstructed fine grid eigenvectors prior to computing the eigenvalue
  //relax: Reconstructed eigenvectors (post smoothing) are naturally not as precise as true eigenvectors. This factor acts as a multiplier on the stopping condition when determining whether the results satisfy the user provided stopping condition
  void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
 		  int Nstop, int Nk, int Nm,RealD resid, 
 		  RealD MaxIt, RealD betastp, int MinRes)
  {
-    Chebyshev<FineField>                          Cheby(cheby_op);
+    Chebyshev<FineField>                          Cheby(cheby_op); //Chebyshev of fine operator on fine grid
-    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace);
+    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace); //Fine operator on coarse grid with intermediate fine grid conversion
-    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace); //Chebyshev of fine operator on coarse grid with intermediate fine grid conversion
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
    //////////////////////////////////////////////////////////////////////////////////////////////////
-    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth);
+    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth); //lower order Chebyshev of fine operator on fine grid used to smooth regenerated eigenvectors
-    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax); 
    evals_coarse.resize(Nm);
    evec_coarse.resize(Nm,_CoarseGrid);
    CoarseField src(_CoarseGrid);     src=1.0; 
    //Note the "tester" here is also responsible for generating the fine grid eigenvalues which are output into the "evals_coarse" array
    ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
    int Nconv=0;
    IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
@@ -403,6 +427,14 @@ public:
      std::cout << i << " Coarse eval = " << evals_coarse[i]  << std::endl;
    }
  }
  //Reconstruct fine-grid eigenvector 'i' by promoting the stored coarse eigenvector through
  //the subspace basis, and return the matching entry of evals_coarse alongside it
  void getFineEvecEval(FineField &evec, RealD &eval, const int i) const{
    const auto &coarse_vec = evec_coarse[i];
    eval = evals_coarse[i];
    blockPromote(coarse_vec,evec,subspace);
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/algorithms/iterative/PowerMethod.h
+++ b/Grid/algorithms/iterative/PowerMethod.h
@@ -29,6 +29,8 @@ template<class Field> class PowerMethod
      RealD vnum = real(innerProduct(src_n,tmp)); // HermOp. 
      RealD vden = norm2(src_n); 
      RealD na = vnum/vden; 
      std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
      if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) { 
 	evalMaxApprox = na; 
--- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
+++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
@@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
 template<class Field>
 class PrecGeneralisedConjugateResidual : public LinearFunction<Field> {
 public:                                                
-
+  using LinearFunction<Field>::operator();
  RealD   Tolerance;
  Integer MaxIterations;
  int verbose;
--- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h
+++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h
@@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
 template<class Field>
 class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
 public:                                                
-
+  using LinearFunction<Field>::operator();
  RealD   Tolerance;
  Integer MaxIterations;
  int verbose;
@@ -119,7 +119,8 @@ public:
  RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
    RealD cp;
-    ComplexD a, b, zAz;
+    ComplexD a, b;
    //    ComplexD zAz;
    RealD zAAz;
    ComplexD rq;
@@ -146,7 +147,7 @@ public:
    //////////////////////////////////
    MatTimer.Start();
    Linop.Op(psi,Az);
-    zAz = innerProduct(Az,psi);
+    //    zAz = innerProduct(Az,psi);
    zAAz= norm2(Az);
    MatTimer.Stop();
@@ -170,7 +171,7 @@ public:
    LinalgTimer.Start();
-    zAz = innerProduct(Az,psi);
+    //    zAz = innerProduct(Az,psi);
    zAAz= norm2(Az);
    //p[0],q[0],qq[0] 
@@ -212,7 +213,7 @@ public:
      MatTimer.Start();
      Linop.Op(z,Az);
      MatTimer.Stop();
-      zAz = innerProduct(Az,psi);
+      //      zAz = innerProduct(Az,psi);
      zAAz= norm2(Az);
      LinalgTimer.Start();
--- a/Grid/allocator/MemoryManager.cc
+++ b/Grid/allocator/MemoryManager.cc
@@ -9,14 +9,30 @@ NAMESPACE_BEGIN(Grid);
 #define AccSmall (3)
 #define Shared   (4)
 #define SharedSmall (5)
 #undef GRID_MM_VERBOSE 
 uint64_t total_shared;
 uint64_t total_device;
 uint64_t total_host;;
 void MemoryManager::PrintBytes(void)
 {
-  std::cout << " MemoryManager : "<<total_shared<<" shared      bytes "<<std::endl;
+  std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
-  std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
+  std::cout << " MemoryManager : PrintBytes "<<std::endl;
-  std::cout << " MemoryManager : "<<total_host  <<" cpu         bytes "<<std::endl;
+  std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
  std::cout << " MemoryManager : "<<(total_shared>>20)<<" shared      Mbytes "<<std::endl;
  std::cout << " MemoryManager : "<<(total_device>>20)<<" accelerator Mbytes "<<std::endl;
  std::cout << " MemoryManager : "<<(total_host>>20)  <<" cpu         Mbytes "<<std::endl;
  uint64_t cacheBytes;
  cacheBytes = CacheBytes[Cpu];
  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" cpu cache Mbytes "<<std::endl;
  cacheBytes = CacheBytes[Acc];
  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" acc cache Mbytes "<<std::endl;
  cacheBytes = CacheBytes[Shared];
  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" shared cache Mbytes "<<std::endl;
 #ifdef GRID_CUDA
  cuda_mem();
 #endif
 }
 //////////////////////////////////////////////////////////////////////
@@ -24,86 +40,114 @@ void MemoryManager::PrintBytes(void)
 //////////////////////////////////////////////////////////////////////
 MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
 int MemoryManager::Victim[MemoryManager::NallocType];
-int MemoryManager::Ncache[MemoryManager::NallocType] = { 8, 32, 8, 32, 8, 32 };
+int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 };
-
+uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
 //////////////////////////////////////////////////////////////////////
 // Actual allocation and deallocation utils
 //////////////////////////////////////////////////////////////////////
 // Return a device buffer of `bytes` bytes, reusing a cached allocation of the same size
 // when Lookup finds one and otherwise allocating fresh device memory.
 // total_device counts the bytes currently handed out to callers, so it is incremented
 // exactly once per call (previously a cache miss incremented it twice).
 void *MemoryManager::AcceleratorAllocate(size_t bytes)
 {
  total_device+=bytes;
  void *ptr = (void *) Lookup(bytes,Acc);
  if ( ptr == (void *) NULL ) {
    // Cache miss: fall back to a real device allocation
    ptr = (void *) acceleratorAllocDevice(bytes);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"AcceleratorAllocate "<<std::endl;
  PrintBytes();
 #endif
  return ptr;
 }
 // Give back a device buffer of `bytes` bytes. The pointer is offered to the allocation
 // cache; Insert returns whichever pointer (if any) must actually be released.
 // total_device is decremented exactly once per call. The pointer returned by Insert may be
 // an evicted entry of a different size, so subtracting `bytes` a second time for it (as was
 // done previously) corrupted the counter.
 void  MemoryManager::AcceleratorFree    (void *ptr,size_t bytes)
 {
  total_device-=bytes;
  void *__freeme = Insert(ptr,bytes,Acc);
  if ( __freeme ) {
    acceleratorFreeDevice(__freeme);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"AcceleratorFree "<<std::endl;
  PrintBytes();
 #endif
 }
 // Return a shared (host/device visible) buffer of `bytes` bytes, reusing a cached
 // allocation of the same size when Lookup finds one.
 // total_shared counts the bytes currently handed out to callers, so it is incremented
 // exactly once per call (previously a cache miss incremented it twice).
 void *MemoryManager::SharedAllocate(size_t bytes)
 {
  total_shared+=bytes;
  void *ptr = (void *) Lookup(bytes,Shared);
  if ( ptr == (void *) NULL ) {
    // Cache miss: fall back to a real shared allocation
    ptr = (void *) acceleratorAllocShared(bytes);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"SharedAllocate "<<std::endl;
  PrintBytes();
 #endif
  return ptr;
 }
 // Give back a shared buffer of `bytes` bytes. The pointer is offered to the allocation
 // cache; Insert returns whichever pointer (if any) must actually be released.
 // total_shared is decremented exactly once per call. The pointer returned by Insert may be
 // an evicted entry of a different size, so subtracting `bytes` again for it was incorrect.
 void  MemoryManager::SharedFree    (void *ptr,size_t bytes)
 {
  total_shared-=bytes;
  void *__freeme = Insert(ptr,bytes,Shared);
  if ( __freeme ) {
    acceleratorFreeShared(__freeme);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"SharedFree "<<std::endl;
  PrintBytes();
 #endif
 }
 #ifdef GRID_UVM
 // GRID_UVM build: host allocations are served from shared memory.
 // Return a buffer of `bytes` bytes, reusing a cached allocation of the same size when
 // Lookup finds one. total_host counts bytes currently handed out, so it is incremented
 // exactly once per call (previously a cache miss incremented it twice).
 void *MemoryManager::CpuAllocate(size_t bytes)
 {
  total_host+=bytes;
  void *ptr = (void *) Lookup(bytes,Cpu);
  if ( ptr == (void *) NULL ) {
    // Cache miss: fall back to a real shared allocation
    ptr = (void *) acceleratorAllocShared(bytes);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"CpuAllocate "<<std::endl;
  PrintBytes();
 #endif
  return ptr;
 }
 // GRID_UVM build: give back a host buffer of `bytes` bytes that was served from shared memory.
 // NotifyDeletion is called on the pointer before it is offered to the allocation cache;
 // Insert returns whichever pointer (if any) must actually be released.
 // total_host is decremented exactly once per call. The pointer returned by Insert may be
 // an evicted entry of a different size, so subtracting `bytes` again for it was incorrect.
 void  MemoryManager::CpuFree    (void *_ptr,size_t bytes)
 {
  total_host-=bytes;
  NotifyDeletion(_ptr);
  void *__freeme = Insert(_ptr,bytes,Cpu);
  if ( __freeme ) { 
    acceleratorFreeShared(__freeme);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"CpuFree "<<std::endl;
  PrintBytes();
 #endif
 }
 #else
 // Non-UVM build: host allocations come from acceleratorAllocCpu.
 // Return a buffer of `bytes` bytes, reusing a cached allocation of the same size when
 // Lookup finds one. total_host counts bytes currently handed out, so it is incremented
 // exactly once per call (previously a cache miss incremented it twice).
 void *MemoryManager::CpuAllocate(size_t bytes)
 {
  total_host+=bytes;
  void *ptr = (void *) Lookup(bytes,Cpu);
  if ( ptr == (void *) NULL ) {
    // Cache miss: fall back to a real host allocation
    ptr = (void *) acceleratorAllocCpu(bytes);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"CpuAllocate "<<std::endl;
  PrintBytes();
 #endif
  return ptr;
 }
 // Non-UVM build: give back a host buffer of `bytes` bytes.
 // NotifyDeletion is called on the pointer before it is offered to the allocation cache;
 // Insert returns whichever pointer (if any) must actually be released.
 // total_host is decremented exactly once per call. The pointer returned by Insert may be
 // an evicted entry of a different size, so subtracting `bytes` again for it was incorrect.
 void  MemoryManager::CpuFree    (void *_ptr,size_t bytes)
 {
  total_host-=bytes;
  NotifyDeletion(_ptr);
  void *__freeme = Insert(_ptr,bytes,Cpu);
  if ( __freeme ) { 
    acceleratorFreeCpu(__freeme);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"CpuFree "<<std::endl;
  PrintBytes();
 #endif
 }
 #endif
@@ -115,7 +159,6 @@ void MemoryManager::Init(void)
  char * str;
  int Nc;
  int NcS;
  str= getenv("GRID_ALLOC_NCACHE_LARGE");
  if ( str ) {
@@ -181,13 +224,13 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
 #ifdef ALLOCATION_CACHE
  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
  int cache = type + small;
-  return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);  
+  return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);  
 #else
  return ptr;
 #endif
 }
-void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) 
+void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes) 
 {
  assert(ncache>0);
 #ifdef GRID_OMP
@@ -211,6 +254,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
  if ( entries[v].valid ) {
    ret = entries[v].address;
    cacheBytes -= entries[v].bytes;
    entries[v].valid = 0;
    entries[v].address = NULL;
    entries[v].bytes = 0;
@@ -219,6 +263,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
  entries[v].address=ptr;
  entries[v].bytes  =bytes;
  entries[v].valid  =1;
  cacheBytes += bytes;
  return ret;
 }
@@ -228,13 +273,13 @@ void *MemoryManager::Lookup(size_t bytes,int type)
 #ifdef ALLOCATION_CACHE
  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
  int cache = type+small;
-  return Lookup(bytes,Entries[cache],Ncache[cache]);
+  return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
 #else
  return NULL;
 #endif
 }
-void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) 
+void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) 
 {
  assert(ncache>0);
 #ifdef GRID_OMP
@@ -243,6 +288,7 @@ void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncach
  for(int e=0;e<ncache;e++){
    if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
      entries[e].valid = 0;
      cacheBytes -= entries[e].bytes;
      return entries[e].address;
    }
  }
--- a/Grid/allocator/MemoryManager.h
+++ b/Grid/allocator/MemoryManager.h
@@ -82,14 +82,15 @@ private:
  static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
  static int Victim[NallocType];
  static int Ncache[NallocType];
  static uint64_t CacheBytes[NallocType];
  /////////////////////////////////////////////////
  // Free pool
  /////////////////////////////////////////////////
  static void *Insert(void *ptr,size_t bytes,int type) ;
  static void *Lookup(size_t bytes,int type) ;
-  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
+  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
-  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
+  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;
  static void PrintBytes(void);
 public:
@@ -169,6 +170,7 @@ private:
 public:
  static void Print(void);
  static void PrintState( void* CpuPtr);
  static int   isOpen   (void* CpuPtr);
  static void  ViewClose(void* CpuPtr,ViewMode mode);
  static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
--- a/Grid/allocator/MemoryManagerCache.cc
+++ b/Grid/allocator/MemoryManagerCache.cc
@@ -3,7 +3,7 @@
 #warning "Using explicit device memory copies"
 NAMESPACE_BEGIN(Grid);
-//define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
+//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
 #define dprintf(...)
@@ -429,6 +429,7 @@ void  MemoryManager::NotifyDeletion(void *_ptr)
 }
 void  MemoryManager::Print(void)
 {
  PrintBytes();
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
  std::cout << GridLogDebug << "Memory Manager                             " << std::endl;
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
@@ -473,6 +474,32 @@ int   MemoryManager::isOpen   (void* _CpuPtr)
  }
 }
 // Print the AccCache table entry associated with the host pointer _CpuPtr:
 // both addresses, the cache state as text, the lock counters and the LRU flag.
 // Prints a one-line notice instead when the pointer has no entry.
 void MemoryManager::PrintState(void* _CpuPtr)
 {
  uint64_t key = (uint64_t)_CpuPtr;
  if ( !EntryPresent(key) ){
    std::cout << GridLogMessage << "No Entry in AccCache table." << std::endl; 
    return;
  }

  auto entryIt = EntryLookup(key);
  auto & entry = entryIt->second;

  // Map the state to its label. The tests run in the reverse order of the original
  // sequence of independent ifs so that the same (last) match determines the label;
  // an unrecognised state leaves the label empty.
  std::string label;
  if      ( entry.state==EvictNext ) label = std::string("EvictNext");
  else if ( entry.state==Consistent) label = std::string("Consistent");
  else if ( entry.state==AccDirty  ) label = std::string("AccDirty");
  else if ( entry.state==CpuDirty  ) label = std::string("CpuDirty");
  else if ( entry.state==Empty     ) label = std::string("Empty");

  std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
  std::cout << GridLogMessage << "0x"<<std::hex<<entry.CpuPtr<<std::dec
  << "\t0x"<<std::hex<<entry.AccPtr<<std::dec<<"\t" <<label
  << "\t" << entry.cpuLock
  << "\t" << entry.accLock
  << "\t" << entry.LRU_valid<<std::endl;
 }
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/allocator/MemoryManagerShared.cc
+++ b/Grid/allocator/MemoryManagerShared.cc
@@ -16,6 +16,10 @@ uint64_t  MemoryManager::DeviceToHostXfer;
 void  MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
 void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
 int   MemoryManager::isOpen   (void* CpuPtr) { return 0;}
 // Variant of PrintState for this (MemoryManagerShared) translation unit: the pointer
 // argument is not inspected; a fixed message is logged instead.
 void  MemoryManager::PrintState(void* CpuPtr)
 {
 std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl;
 };
 void  MemoryManager::Print(void){};
 void  MemoryManager::NotifyDeletion(void *ptr){};
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -388,8 +388,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
    // TODO : make a OMP loop on CPU, call threaded bcopy
    void *shm = (void *) this->ShmBufferTranslate(dest,recv);
    assert(shm!=NULL);
    //    std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl;
    acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
    acceleratorCopySynchronise(); // MPI prob slower
  }
  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
@@ -400,6 +400,9 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
  //   std::cout << "Copy Synchronised\n"<<std::endl;
  acceleratorCopySynchronise();
  int nreq=list.size();
  if (nreq==0) return;
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@@ -46,3 +46,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_unary.h>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
 #include <Grid/lattice/Lattice_crc.h>
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -88,6 +88,13 @@ public:
    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
    accessor.ViewClose();
  }
  // Helper function to print the state of this object in the AccCache.
  // Forwards this lattice's data pointer (_odata) to MemoryManager::PrintState.
  void PrintCacheState(void)
  {
    MemoryManager::PrintState(this->_odata);
  }
  /////////////////////////////////////////////////////////////////////////////////
  // Return a view object that may be dereferenced in site loops.
  // The view is trivially copy constructible and may be copied to an accelerator device
--- a/Grid/lattice/Lattice_crc.h
+++ b/Grid/lattice/Lattice_crc.h
@@ -0,0 +1,42 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_crc.h
    Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 // Fingerprint a lattice: opens a CpuRead view of the field and runs crc32 (seeded with 0)
 // over sizeof(vobj)*oSites() bytes starting at the first element of the view.
 template<class vobj> uint32_t crc(Lattice<vobj> & buf)
 {
  autoView( buf_v , buf, CpuRead);
  unsigned char *first_byte = (unsigned char *)&buf_v[0];
  return ::crc32(0L,first_byte,(size_t)sizeof(vobj)*buf.oSites());
 }
 #define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
 NAMESPACE_END(Grid);
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -42,7 +42,6 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
  std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl;
  std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl;
  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl;
  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << warpSize << std::endl;
  std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
  if (warpSize != WARP_SIZE) {
@@ -52,6 +51,10 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
  // let the number of threads in a block be a multiple of 2, starting from warpSize
  threads = warpSize;
  if ( threads*sizeofsobj > sharedMemPerBlock ) {
    std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
    exit(EXIT_FAILURE);
  }
  while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
  // keep all the streaming multiprocessors busy
  blocks = nextPow2(multiProcessorCount);
--- a/Grid/lattice/Lattice_rng.h
+++ b/Grid/lattice/Lattice_rng.h
@@ -32,8 +32,9 @@
 #include <random>
 #ifdef RNG_SITMO
-#include <Grid/sitmo_rng/sitmo_prng_engine.hpp>
+#include <Grid/random/sitmo_prng_engine.hpp>
 #endif 
 #include <Grid/random/gaussian.h>
 #if defined(RNG_SITMO)
 #define RNG_FAST_DISCARD
@@ -142,8 +143,8 @@ public:
  std::vector<RngEngine>                             _generators;
  std::vector<std::uniform_real_distribution<RealD> > _uniform;
-  std::vector<std::normal_distribution<RealD> >       _gaussian;
+  std::vector<Grid::gaussian_distribution<RealD> >       _gaussian;
-  std::vector<std::discrete_distribution<int32_t> >   _bernoulli;
+  //  std::vector<std::discrete_distribution<int32_t> >   _bernoulli;
  std::vector<std::uniform_int_distribution<uint32_t> > _uid;
  ///////////////////////
@@ -243,8 +244,8 @@ public:
  GridSerialRNG() : GridRNGbase() {
    _generators.resize(1);
    _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
-    _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
+    _gaussian.resize(1,gaussian_distribution<RealD>(0.0,1.0) );
-    _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
+    //    _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
    _uid.resize(1,std::uniform_int_distribution<uint32_t>() );
  }
@@ -357,8 +358,8 @@ public:
    _generators.resize(_vol);
    _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
-    _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
+    _gaussian.resize(_vol,gaussian_distribution<RealD>(0.0,1.0) );
-    _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
+    //    _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
    _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
  }
@@ -515,11 +516,11 @@ public:
 template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l)   { rng.fill(l,rng._uniform);  }
 template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); }
-template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}
+//template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}
 template <class sobj> inline void random(GridSerialRNG &rng,sobj &l)   { rng.fill(l,rng._uniform  ); }
 template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); }
-template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }
+//template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -85,6 +85,76 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
  });
 }
 // Accelerator version of checkerboard extraction: copies the sites of `full` whose parity
 // equals `cb` into `half`, and tags `half` with that checkerboard.
 //   cb               - parity (0 or 1) to select
 //   half             - destination field, opened for AcceleratorWrite
 //   full             - source field, opened for AcceleratorRead
 //   checker_dim_half - dimension whose coordinate is halved when computing the index into `half`
 template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0)
 {
  half.Checkerboard() = cb;
  autoView(half_v, half, AcceleratorWrite);
  autoView(full_v, full, AcceleratorRead);
  // Copy the grid geometry into locals so the loop body does not dereference the Grid objects
  Coordinate rdim_full             = full.Grid()->_rdimensions;
  Coordinate rdim_half             = half.Grid()->_rdimensions;
  unsigned long ndim_half          = half.Grid()->_ndimension;
  Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
  Coordinate ostride_half          = half.Grid()->_ostride;
  accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
    Coordinate coor;
    int cbos;
    int linear=0;
    // Coordinate of outer site ss within the reduced dimensions of the full grid
    Lexicographic::CoorFromIndex(coor,ss,rdim_full);
    assert(coor.size()==ndim_half);
    // Parity of the site: sum of the coordinates in the dimensions flagged by the half grid's mask
    for(int d=0;d<ndim_half;d++){ 
      if(checker_dim_mask_half[d]) linear += coor[d];
    }
    cbos = (linear&0x1);
    if (cbos==cb) {
      // Outer index in the half grid: the coordinate in checker_dim_half is divided by two
      int ssh=0;
      for(int d=0;d<ndim_half;d++) {
        if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
        else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
      }
      coalescedWrite(half_v[ssh],full_v(ss));
    }
  });
 }
 // Accelerator version of checkerboard insertion: writes the sites of `half` into the sites
 // of `full` whose parity equals half.Checkerboard(); other sites of `full` are not written.
 //   full             - destination field, opened for AcceleratorWrite
 //   half             - source field, opened for AcceleratorRead
 //   checker_dim_half - dimension whose coordinate is halved when computing the index into `half`
 template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0)
 {
  int cb = half.Checkerboard();
  autoView(half_v , half, AcceleratorRead);
  autoView(full_v , full, AcceleratorWrite);
  // Copy the grid geometry into locals so the loop body does not dereference the Grid objects
  Coordinate rdim_full             = full.Grid()->_rdimensions;
  Coordinate rdim_half             = half.Grid()->_rdimensions;
  unsigned long ndim_half          = half.Grid()->_ndimension;
  Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
  Coordinate ostride_half          = half.Grid()->_ostride;
  accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
    Coordinate coor;
    int cbos;
    int linear=0;
    // Coordinate of outer site ss within the reduced dimensions of the full grid
    Lexicographic::CoorFromIndex(coor,ss,rdim_full);
    assert(coor.size()==ndim_half);
    // Parity of the site: sum of the coordinates in the dimensions flagged by the half grid's mask
    for(int d=0;d<ndim_half;d++){ 
      if(checker_dim_mask_half[d]) linear += coor[d];
    }
    cbos = (linear&0x1);
    if (cbos==cb) {
      // Outer index in the half grid: the coordinate in checker_dim_half is divided by two
      int ssh=0;
      for(int d=0;d<ndim_half;d++){
        if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
        else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
      }
      coalescedWrite(full_v[ss],half_v(ssh));
    }
  });
 }
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Flexible Type Conversion for internal promotion to double as well as graceful
 // treatment of scalar-compatible types
@@ -785,7 +855,7 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
 template<class vobj>
-void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
+void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
 {
  typedef typename vobj::scalar_object sobj;
@@ -1010,54 +1080,96 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
  });
 }
-//Convert a Lattice from one precision to another
+//The workspace for a precision change operation allowing for the reuse of the mapping to save time on subsequent calls
-template<class VobjOut, class VobjIn>
+class precisionChangeWorkspace{
-void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
+  std::pair<Integer,Integer>* fmap_device; //device pointer
-{
+public:
-  assert(out.Grid()->Nd() == in.Grid()->Nd());
+  precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid){
-  for(int d=0;d<out.Grid()->Nd();d++){
+    //Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
-    assert(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
+    assert(out_grid->Nd() == in_grid->Nd());
-  }
+    for(int d=0;d<out_grid->Nd();d++){
-  out.Checkerboard() = in.Checkerboard();
+      assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
  GridBase *in_grid=in.Grid();
  GridBase *out_grid = out.Grid();
  typedef typename VobjOut::scalar_object SobjOut;
  typedef typename VobjIn::scalar_object SobjIn;
  int ndim = out.Grid()->Nd();
  int out_nsimd = out_grid->Nsimd();
  std::vector<Coordinate > out_icoor(out_nsimd);
  for(int lane=0; lane < out_nsimd; lane++){
    out_icoor[lane].resize(ndim);
    out_grid->iCoorFromIindex(out_icoor[lane], lane);
  }
  std::vector<SobjOut> in_slex_conv(in_grid->lSites());
  unvectorizeToLexOrdArray(in_slex_conv, in);
  autoView( out_v , out, CpuWrite);
  thread_for(out_oidx,out_grid->oSites(),{
    Coordinate out_ocoor(ndim);
    out_grid->oCoorFromOindex(out_ocoor, out_oidx);
    ExtractPointerArray<SobjOut> ptrs(out_nsimd);      
    Coordinate lcoor(out_grid->Nd());
    for(int lane=0; lane < out_nsimd; lane++){
      for(int mu=0;mu<ndim;mu++)
 	lcoor[mu] = out_ocoor[mu] + out_grid->_rdimensions[mu]*out_icoor[lane][mu];
      int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions);
      ptrs[lane] = &in_slex_conv[llex];
    }
-    merge(out_v[out_oidx], ptrs, 0);
+    int Nsimd_out = out_grid->Nsimd();
-  });
+
    std::vector<Coordinate> out_icorrs(out_grid->Nsimd()); //reuse these
    for(int lane=0; lane < out_grid->Nsimd(); lane++)
      out_grid->iCoorFromIindex(out_icorrs[lane], lane);
    std::vector<std::pair<Integer,Integer> > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd
    thread_for(out_oidx,out_grid->oSites(),{
 	Coordinate out_ocorr; 
 	out_grid->oCoorFromOindex(out_ocorr, out_oidx);
 	Coordinate lcorr; //the local coordinate (common to both in and out as full coordinate)
 	for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
 	  out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr);
 	  //int in_oidx = in_grid->oIndex(lcorr), in_lane = in_grid->iIndex(lcorr);
 	  //Note oIndex and OcorrFromOindex (and same for iIndex) are not inverse for checkerboarded lattice, the former coordinates being defined on the full lattice and the latter on the reduced lattice
 	  //Until this is fixed we need to circumvent the problem locally. Here I will use the coordinates defined on the reduced lattice for simplicity
 	  int in_oidx = 0, in_lane = 0;
 	  for(int d=0;d<in_grid->_ndimension;d++){
 	    in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] );
 	    in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] );
 	  }
 	  fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair<Integer,Integer>( in_oidx, in_lane );
 	}
      });
    //Copy the map to the device (if we had a way to tell if an accelerator is in use we could avoid this copy for CPU-only machines)
    size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair<Integer,Integer>);
    fmap_device = (std::pair<Integer,Integer>*)acceleratorAllocDevice(fmap_bytes);
    acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes); 
  }
  //Prevent moving or copying
  precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete;
  precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete;
  precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete;
  precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete;
  std::pair<Integer,Integer> const* getMap() const{ return fmap_device; }
  ~precisionChangeWorkspace(){
    acceleratorFreeDevice(fmap_device);
  }
 };
 //Convert a lattice of one precision to another. The input workspace contains the mapping data.
 //  out       - destination field; its checkerboard is set to that of `in`
 //  in        - source field
 //  workspace - provides the table, indexed by (output lane + Nsimd_out * output outer site),
 //              of the (input outer site, input lane) pair to copy from
 template<class VobjOut, class VobjIn>
 void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
  static_assert( std::is_same<typename VobjOut::DoublePrecision, typename VobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
  out.Checkerboard() = in.Checkerboard();
  constexpr int Nsimd_out = VobjOut::Nsimd();
  std::pair<Integer,Integer> const* fmap_device = workspace.getMap();
  //Do the copy/precision change
  autoView( out_v , out, AcceleratorWrite);
  autoView( in_v , in, AcceleratorRead);
  //One loop iteration per output outer site; the SIMD lanes of that site are filled one at a time
  accelerator_for(out_oidx, out.Grid()->oSites(), 1,{
      std::pair<Integer,Integer> const* fmap_osite = fmap_device + out_oidx*Nsimd_out;
      for(int out_lane=0; out_lane < Nsimd_out; out_lane++){      
 	int in_oidx = fmap_osite[out_lane].first;
 	int in_lane = fmap_osite[out_lane].second;
 	copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane);
      }
    });
 }
 //Convert a Lattice from one precision to another without a caller-supplied workspace.
 //A temporary mapping between the two grids is built on every call; when the same pair of
 //grids is converted repeatedly, construct a precisionChangeWorkspace once and pass it to
 //the three-argument overload instead.
 template<class VobjOut, class VobjIn>
 void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
  GridBase *grid_out = out.Grid();
  GridBase *grid_in  = in.Grid();
  precisionChangeWorkspace scratch(grid_out, grid_in);
  precisionChange(out, in, scratch);
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Communicate between grids
 ////////////////////////////////////////////////////////////////////////////////
--- a/Grid/log/Log.cc
+++ b/Grid/log/Log.cc
@@ -69,6 +69,7 @@ GridLogger GridLogDebug  (1, "Debug", GridLogColours, "PURPLE");
 GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
 GridLogger GridLogIterative  (1, "Iterative", GridLogColours, "BLUE");
 GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
 GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE");
 void GridLogConfigure(std::vector<std::string> &logstreams) {
  GridLogError.Active(0);
@@ -79,6 +80,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
  GridLogPerformance.Active(0);
  GridLogIntegrator.Active(1);
  GridLogColours.Active(0);
  GridLogHMC.Active(1);
  for (int i = 0; i < logstreams.size(); i++) {
    if (logstreams[i] == std::string("Error"))       GridLogError.Active(1);
@@ -87,7 +89,8 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
    if (logstreams[i] == std::string("Iterative"))   GridLogIterative.Active(1);
    if (logstreams[i] == std::string("Debug"))       GridLogDebug.Active(1);
    if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
-    if (logstreams[i] == std::string("Integrator"))  GridLogIntegrator.Active(1);
+    if (logstreams[i] == std::string("NoIntegrator"))  GridLogIntegrator.Active(0);
    if (logstreams[i] == std::string("NoHMC"))         GridLogHMC.Active(0);
    if (logstreams[i] == std::string("Colours"))     GridLogColours.Active(1);
  }
 }
--- a/Grid/log/Log.h
+++ b/Grid/log/Log.h
@@ -182,6 +182,7 @@ extern GridLogger GridLogDebug  ;
 extern GridLogger GridLogPerformance;
 extern GridLogger GridLogIterative  ;
 extern GridLogger GridLogIntegrator  ;
 extern GridLogger GridLogHMC;
 extern Colours    GridLogColours;
 std::string demangle(const char* name) ;
--- a/Grid/parallelIO/IldgIO.h
+++ b/Grid/parallelIO/IldgIO.h
@@ -576,6 +576,8 @@ class ScidacReader : public GridLimeReader {
    std::string rec_name(ILDG_BINARY_DATA);
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
      if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) )  ) {
  // in principle should do the line below, but that breaks backward compatibility with old data
  // skipPastObjectRecord(std::string(GRID_FIELD_NORM));
 	skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
 	return;
      }
--- a/Grid/parallelIO/NerscIO.h
+++ b/Grid/parallelIO/NerscIO.h
@@ -39,9 +39,11 @@ using namespace Grid;
 ////////////////////////////////////////////////////////////////////////////////
 class NerscIO : public BinaryIO { 
 public:
  typedef Lattice<vLorentzColourMatrixD> GaugeField;
  // Accessor for the switch that decides whether a disagreement between the plaquette stored
  // in the header and the recomputed plaquette is treated as fatal on read. The switch starts
  // out true; assign through the returned reference to change it.
  static bool & exitOnReadPlaquetteMismatch(){
    static bool enabled = true;
    return enabled;
  }
  // Opens 'file' for output and lets the stream close again when it goes out of scope.
  // Opening with std::ios::out alone creates the file if missing and discards existing contents.
  static inline void truncate(std::string file){
    std::ofstream emptied;
    emptied.open(file, std::ios::out);
  }
@@ -198,7 +200,7 @@ public:
      std::cerr << " nersc_csum  " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
      exit(0);
    }
-    assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
+    if(exitOnReadPlaquetteMismatch()) assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
    assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
    assert(nersc_csum == header.checksum );
--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@@ -63,6 +63,7 @@ static constexpr int Ngp=2; // gparity index range
 #define ColourIndex  (2)
 #define SpinIndex    (1)
 #define LorentzIndex (0)
 #define GparityFlavourIndex (0)
 // Also should make these a named enum type
 static constexpr int DaggerNo=0;
@@ -87,6 +88,8 @@ template<typename T> struct isCoarsened {
 template <typename T> using IfCoarsened    = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
 template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;
 const int GparityFlavourTensorIndex = 3; //TensorLevel counts from the bottom!
 // ChrisK very keen to add extra space for Gparity doubling.
 //
 // Also add domain wall index, in a way where Wilson operator 
@@ -110,8 +113,10 @@ template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVec
    template<typename vtype> using iSpinColourSpinColourMatrix  = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;
 template<typename vtype> using iGparityFlavourVector                = iVector<iScalar<iScalar<vtype> >, Ngp>;
 template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
 template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
 template<typename vtype> using iGparityFlavourMatrix = iMatrix<iScalar<iScalar<vtype> >, Ngp>;
 // Spin matrix
 typedef iSpinMatrix<Complex  >          SpinMatrix;
@@ -176,6 +181,16 @@ typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
 typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
 typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;
 //G-parity flavour matrix
 typedef iGparityFlavourMatrix<Complex> GparityFlavourMatrix;
 typedef iGparityFlavourMatrix<ComplexF> GparityFlavourMatrixF;
 typedef iGparityFlavourMatrix<ComplexD> GparityFlavourMatrixD;
 typedef iGparityFlavourMatrix<vComplex> vGparityFlavourMatrix;
 typedef iGparityFlavourMatrix<vComplexF> vGparityFlavourMatrixF;
 typedef iGparityFlavourMatrix<vComplexD> vGparityFlavourMatrixD;
 // Spin vector
 typedef iSpinVector<Complex >           SpinVector;
 typedef iSpinVector<ComplexF>           SpinVectorF;
@@ -220,6 +235,16 @@ typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
 typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
 typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
 typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
 //G-parity flavour vector
 typedef iGparityFlavourVector<Complex >         GparityFlavourVector;
 typedef iGparityFlavourVector<ComplexF>         GparityFlavourVectorF;
 typedef iGparityFlavourVector<ComplexD>         GparityFlavourVectorD;
 typedef iGparityFlavourVector<vComplex >         vGparityFlavourVector;
 typedef iGparityFlavourVector<vComplexF>         vGparityFlavourVectorF;
 typedef iGparityFlavourVector<vComplexD>         vGparityFlavourVectorD;
 // singlets
 typedef iSinglet<Complex >         TComplex;     // FIXME This is painful. Tensor singlet complex type.
--- a/Grid/qcd/action/ActionParams.h
+++ b/Grid/qcd/action/ActionParams.h
@@ -36,7 +36,8 @@ NAMESPACE_BEGIN(Grid);
 // These can move into a params header and be given MacroMagic serialisation
 struct GparityWilsonImplParams {
-  Coordinate twists;
+  Coordinate twists; //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
                     //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
  GparityWilsonImplParams() : twists(Nd, 0) {};
 };
@@ -65,7 +66,8 @@ struct StaggeredImplParams {
 				    RealD, tolerance, 
 				    int,   degree, 
 				    int,   precision,
-				    int,   BoundsCheckFreq);
+				    int,   BoundsCheckFreq,
 				    RealD, BoundsCheckTol);
  // MaxIter and tolerance, vectors??
@@ -76,15 +78,61 @@ struct StaggeredImplParams {
 				RealD tol      = 1.0e-8, 
                           	int _degree    = 10,
 				int _precision = 64,
-				int _BoundsCheckFreq=20)
+				int _BoundsCheckFreq=20,
 				double _BoundsCheckTol=1e-6)
      : lo(_lo),
 	hi(_hi),
 	MaxIter(_maxit),
 	tolerance(tol),
 	degree(_degree),
        precision(_precision),
-        BoundsCheckFreq(_BoundsCheckFreq){};
+        BoundsCheckFreq(_BoundsCheckFreq),
        BoundsCheckTol(_BoundsCheckTol){};
  };
  /*Action parameters for the generalized rational action
    The approximation is for (M^dag M)^{1/inv_pow}
    where inv_pow is the denominator of the fractional power.
    Default inv_pow=2 for square root, making this equivalent to 
    the OneFlavourRational action
  */
    struct RationalActionParams : Serializable {
    // Member list is declared through the serialisation macro; each entry is (type, name)
    GRID_SERIALIZABLE_CLASS_MEMBERS(RationalActionParams, 
 				    int, inv_pow, //denominator of the fractional power being approximated (2 = square root)
 				    RealD, lo, //low eigenvalue bound of rational approx
 				    RealD, hi, //high eigenvalue bound of rational approx
 				    int,   MaxIter,  //maximum iterations in msCG
 				    RealD, action_tolerance,  //msCG tolerance in action evaluation
 				    int,   action_degree, //rational approx degree in action evaluation
 				    RealD, md_tolerance,  //msCG tolerance in MD integration
 				    int,   md_degree, //rational approx degree in MD integration
 				    int,   precision, //precision of floating point arithmetic
 				    int,   BoundsCheckFreq); //frequency the approximation is tested (with Metropolis degree/tolerance); 0 disables the check
  // constructor: every argument is defaulted, and each one is copied verbatim into the member of the same name
  RationalActionParams(int _inv_pow = 2,
 		       RealD _lo      = 0.0, 
 		       RealD _hi      = 1.0, 
 		       int _maxit     = 1000,
 		       RealD _action_tolerance      = 1.0e-8, 
 		       int _action_degree    = 10,
 		       RealD _md_tolerance      = 1.0e-8, 
 		       int _md_degree    = 10,
 		       int _precision = 64,
 		       int _BoundsCheckFreq=20)
    : inv_pow(_inv_pow), 
      lo(_lo),
      hi(_hi),
      MaxIter(_maxit),
      action_tolerance(_action_tolerance),
      action_degree(_action_degree),
      md_tolerance(_md_tolerance),
      md_degree(_md_degree),
      precision(_precision),
      BoundsCheckFreq(_BoundsCheckFreq){};
  };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h
@@ -0,0 +1,240 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion.h
    Copyright (C) 2020 - 2022
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    Author: Nils Meyer <nils.meyer@ur.de>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 #include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
 // see Grid/qcd/action/fermion/WilsonCloverFermion.h for description
 //
 // Modifications done here:
 //
 // Original: clover term = 12x12 matrix per site
 //
 // But: Only two diagonal 6x6 hermitian blocks are non-zero (also true for original, verified by running)
 // Sufficient to store/transfer only the real parts of the diagonal and one triangular part
 // 2 * (6 + 15 * 2) = 72 real or 36 complex words to be stored/transferred
 //
 // Here: Above but diagonal as complex numbers, i.e., need to store/transfer
 // 2 * (6 * 2 + 15 * 2) = 84 real or 42 complex words
 //
 // Words per site and improvement compared to original (combined with the input and output spinors):
 //
 // - Original: 2*12 + 12*12 = 168 words -> 1.00 x less
 // - Minimal:  2*12 + 36    =  60 words -> 2.80 x less
 // - Here:     2*12 + 42    =  66 words -> 2.55 x less
 //
 // These improvements directly translate to wall-clock time
 //
 // Data layout:
 //
 // - diagonal and triangle part as separate lattice fields,
 //   this was faster than as 1 combined field on all tested machines
 // - diagonal: as expected
 // - triangle: store upper right triangle in row major order
 // - graphical:
 //        0  1  2  3  4
 //           5  6  7  8
 //              9 10 11 = upper right triangle indices
 //                12 13
 //                   14
 //     0
 //        1
 //           2
 //              3       = diagonal indices
 //                 4
 //                    5
 //     0
 //     1  5
 //     2  6  9          = lower left triangle indices
 //     3  7 10 12
 //     4  8 11 13 14
 //
 // Impact on total memory consumption:
 // - Original: (2 * 1 + 8 * 1/2) 12x12 matrices = 6 12x12 matrices = 864 complex words per site
 // - Here:     (2 * 1 + 4 * 1/2) diagonal parts = 4 diagonal parts =  24 complex words per site
 //           + (2 * 1 + 4 * 1/2) triangle parts = 4 triangle parts =  60 complex words per site
 //                                                                 =  84 complex words per site
 template<class Impl>
 class CompactWilsonCloverFermion : public WilsonFermion<Impl>,
                                   public WilsonCloverHelpers<Impl>,
                                   public CompactWilsonCloverHelpers<Impl> {
  /////////////////////////////////////////////
  // Sizes
  /////////////////////////////////////////////
 public:
  INHERIT_COMPACT_CLOVER_SIZES(Impl);
  /////////////////////////////////////////////
  // Type definitions
  /////////////////////////////////////////////
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);
  typedef WilsonFermion<Impl>              WilsonBase;
  typedef WilsonCloverHelpers<Impl>        Helpers;
  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
  /////////////////////////////////////////////
  // Constructors
  /////////////////////////////////////////////
 public:
  CompactWilsonCloverFermion(GaugeField& _Umu,
 			    GridCartesian& Fgrid,
 			    GridRedBlackCartesian& Hgrid,
 			    const RealD _mass,
 			    const RealD _csw_r = 0.0,
 			    const RealD _csw_t = 0.0,
 			    const RealD _cF = 1.0,
 			    const WilsonAnisotropyCoefficients& clover_anisotropy = WilsonAnisotropyCoefficients(),
 			    const ImplParams& impl_p = ImplParams());
  /////////////////////////////////////////////
  // Member functions (implementing interface)
  /////////////////////////////////////////////
 public:
  virtual void Instantiatable() {};
  int          ConstEE()     override { return 0; };
  int          isTrivialEE() override { return 0; };
  void Dhop(const FermionField& in, FermionField& out, int dag) override;
  void DhopOE(const FermionField& in, FermionField& out, int dag) override;
  void DhopEO(const FermionField& in, FermionField& out, int dag) override;
  void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
  void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;
  void M(const FermionField& in, FermionField& out) override;
  void Mdag(const FermionField& in, FermionField& out) override;
  void Meooe(const FermionField& in, FermionField& out) override;
  void MeooeDag(const FermionField& in, FermionField& out) override;
  void Mooee(const FermionField& in, FermionField& out) override;
  void MooeeDag(const FermionField& in, FermionField& out) override;
  void MooeeInv(const FermionField& in, FermionField& out) override;
  void MooeeInvDag(const FermionField& in, FermionField& out) override;
  void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
  void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;
  void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
  void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
  void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
  /////////////////////////////////////////////
  // Member functions (internals)
  /////////////////////////////////////////////
  void MooeeInternal(const FermionField&        in,
                     FermionField&              out,
                     const CloverDiagonalField& diagonal,
                     const CloverTriangleField& triangle);
  /////////////////////////////////////////////
  // Helpers
  /////////////////////////////////////////////
  void ImportGauge(const GaugeField& _Umu) override;
  /////////////////////////////////////////////
  // Helpers
  /////////////////////////////////////////////
 private:
  // Returns the address of the boundary mask member that matches the layout of 'in':
  // the full mask for a non-checkerboarded grid, otherwise the odd or even mask
  // according to the field's checkerboard.
  template<class Field>
  const MaskField* getCorrectMaskField(const Field &in) const {
    if(!in.Grid()->_isCheckerBoarded) return &this->BoundaryMask;
    return (in.Checkerboard() == Odd) ? &this->BoundaryMaskOdd
                                      : &this->BoundaryMaskEven;
  }
  // Looks up the mask matching f's grid/checkerboard via getCorrectMaskField and forwards
  // f together with that mask to CompactHelpers::ApplyBoundaryMask.
  template<class Field>
  void ApplyBoundaryMask(Field& f) {
    const MaskField* m = getCorrectMaskField(f);
    assert(m != nullptr); // guards the dereference below
    CompactHelpers::ApplyBoundaryMask(f, *m);
  }
  /////////////////////////////////////////////
  // Member Data
  /////////////////////////////////////////////
 public:
  RealD csw_r;
  RealD csw_t;
  RealD cF;
  bool open_boundaries;
  CloverDiagonalField Diagonal,    DiagonalEven,    DiagonalOdd;
  CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
  CloverTriangleField Triangle,    TriangleEven,    TriangleOdd;
  CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
  FermionField Tmp;
  MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/Fermion.h
+++ b/Grid/qcd/action/fermion/Fermion.h
@@ -53,6 +53,7 @@ NAMESPACE_CHECK(Wilson);
 #include <Grid/qcd/action/fermion/WilsonTMFermion.h>       // 4d wilson like
 NAMESPACE_CHECK(WilsonTM);
 #include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
 NAMESPACE_CHECK(WilsonClover);
 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
 NAMESPACE_CHECK(Wilson5D);
@@ -153,6 +154,23 @@ typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoInd
 typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
 typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
 // Compact Clover fermions
 typedef CompactWilsonCloverFermion<WilsonImplR> CompactWilsonCloverFermionR;
 typedef CompactWilsonCloverFermion<WilsonImplF> CompactWilsonCloverFermionF;
 typedef CompactWilsonCloverFermion<WilsonImplD> CompactWilsonCloverFermionD;
 typedef CompactWilsonCloverFermion<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
 typedef CompactWilsonCloverFermion<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
 typedef CompactWilsonCloverFermion<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
 typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
 typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
 typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
 typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
 typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
 typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
 // Domain Wall fermions
 typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
 typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
--- a/Grid/qcd/action/fermion/GparityWilsonImpl.h
+++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h
@@ -30,6 +30,18 @@ directory
 NAMESPACE_BEGIN(Grid);
 /*
  Policy implementation for G-parity boundary conditions
  Rather than treating the gauge field as a flavored field, the Grid implementation of G-parity treats the gauge field as a regular
  field with complex conjugate boundary conditions. In order to ensure the second flavor interacts with the conjugate links and the first
  with the regular links we overload the functionality of doubleStore, whose purpose is to store the gauge field and the barrel-shifted gauge field
  to avoid communicating links when applying the Dirac operator, such that the double-stored field contains also a flavor index which maps to
  either the link or the conjugate link. This flavored field is then used by multLink to apply the correct link to a spinor.
  Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
  mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
 */
 template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
 class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
 public:
@@ -113,7 +125,7 @@ public:
    || ((distance== 1)&&(icoor[direction]==1))
    || ((distance==-1)&&(icoor[direction]==0));
-    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world
+    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu] && mmu < Nd-1; //only if we are going around the world in a spatial direction
    //Apply the links
    int f_upper = permute_lane ? 1 : 0;
@@ -139,10 +151,10 @@ public:
    assert((distance == 1) || (distance == -1));  // nearest neighbour stencil hard code
    assert((sl == 1) || (sl == 2));
-    if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
+    //If this site is a global boundary site, perform the G-parity flavor twist
-
+    if ( mmu < Nd-1 && SE->_around_the_world && St.parameters.twists[mmu] ) {
      if ( sl == 2 ) {
-       
+	//Only do the twist for lanes on the edge of the physical node
 	ExtractBuffer<sobj> vals(Nsimd);
 	extract(chi,vals);
@@ -197,6 +209,19 @@ public:
    reg = memory;
  }
  //Write a pair of link fields into direction slot mu of the doubled gauge field Uds:
  //'poke_f0' goes to flavor index 0 and 'poke_f1' to flavor index 1, entry by entry
  inline void pokeGparityDoubledGaugeField(DoubledGaugeField &Uds, const GaugeLinkField &poke_f0, const GaugeLinkField &poke_f1, const int mu){
    autoView(flav0_v, poke_f0, CpuRead);
    autoView(flav1_v, poke_f1, CpuRead);
    autoView(doubled_v, Uds, CpuWrite);
    //The loop runs over the flavor-0 view; the other two views are indexed with the same site index
    thread_foreach(site,flav0_v,{
        doubled_v[site](0)(mu) = flav0_v[site]();
        doubled_v[site](1)(mu) = flav1_v[site]();
      });
  }
  inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
  {
    conformable(Uds.Grid(),GaugeGrid);
@@ -207,14 +232,19 @@ public:
    GaugeLinkField Uconj(GaugeGrid);
    Lattice<iScalar<vInteger> > coor(GaugeGrid);
-        
+
-    for(int mu=0;mu<Nd;mu++){
+    //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
-          
+    //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs        
-      LatticeCoordinate(coor,mu);
+    for(int mu=0;mu<Nd-1;mu++){
      if( Params.twists[mu] ){
 	LatticeCoordinate(coor,mu);
      }
      U     = PeekIndex<LorentzIndex>(Umu,mu);
      Uconj = conjugate(U);
      // Implement the isospin rotation sign on the boundary between f=1 and f=0
      // This phase could come from a simple bc 1,1,-1,1 ..
      int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
      if ( Params.twists[mu] ) { 
@@ -229,7 +259,7 @@ public:
 	thread_foreach(ss,U_v,{
 	    Uds_v[ss](0)(mu) = U_v[ss]();
 	    Uds_v[ss](1)(mu) = Uconj_v[ss]();
-	  });
+	});
      }
      U     = adj(Cshift(U    ,mu,-1));      // correct except for spanning the boundary
@@ -260,6 +290,38 @@ public:
        });
      }
    }
    { //periodic / antiperiodic temporal BCs
      int mu = Nd-1;
      int L   = GaugeGrid->GlobalDimensions()[mu];
      int Lmu = L - 1;
      LatticeCoordinate(coor, mu);
      U = PeekIndex<LorentzIndex>(Umu, mu); //Get t-directed links
      GaugeLinkField *Upoke = &U;
      if(Params.twists[mu]){ //antiperiodic
 	Utmp =  where(coor == Lmu, -U, U);
 	Upoke = &Utmp;
      }
      Uconj = conjugate(*Upoke); //second flavor interacts with conjugate links      
      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu);
      //Get the barrel-shifted field
      Utmp = adj(Cshift(U, mu, -1)); //is a forward shift!
      Upoke = &Utmp;
      if(Params.twists[mu]){
 	U = where(coor == 0, -Utmp, Utmp);  //boundary phase
 	Upoke = &U;
      }
      Uconj = conjugate(*Upoke);
      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4);
    }
  }
  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
@@ -298,28 +360,48 @@ public:
  //Not supported by this implementation: the body is an unconditional assert(0),
  //so neither 'mat' nor 'Uds' is read or written here
  inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
    assert(0);
  }
-  
+ 
  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
-
+    int Ls=Btilde.Grid()->_fdimensions[0];
-    int Ls = Btilde.Grid()->_fdimensions[0];
+    
    GaugeLinkField tmp(mat.Grid());
    tmp = Zero();
    {
-      autoView( tmp_v , tmp, CpuWrite);
+      GridBase *GaugeGrid = mat.Grid();
-      autoView( Atilde_v , Atilde, CpuRead);
+      Lattice<iScalar<vInteger> > coor(GaugeGrid);
-      autoView( Btilde_v , Btilde, CpuRead);
+
-      thread_for(ss,tmp.Grid()->oSites(),{
+      if( Params.twists[mu] ){
-	  for (int s = 0; s < Ls; s++) {
+	LatticeCoordinate(coor,mu);
-	    int sF = s + Ls * ss;
+      }
-	    auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
+
-	    tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
+      autoView( mat_v , mat, AcceleratorWrite);
-	  }
+      autoView( Btilde_v , Btilde, AcceleratorRead);
-	});
+      autoView( Atilde_v , Atilde, AcceleratorRead);
      accelerator_for(sss,mat.Grid()->oSites(), FermionField::vector_type::Nsimd(),{	  
  	  int sU=sss;
  	  typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
  	  ColorMatrixType sum;
  	  zeroit(sum);
  	  for(int s=0;s<Ls;s++){
  	    int sF = s+Ls*sU;
  	    for(int spn=0;spn<Ns;spn++){ //sum over spin
 	      //Flavor 0
  	      auto bb = coalescedRead(Btilde_v[sF](0)(spn) ); //color vector
  	      auto aa = coalescedRead(Atilde_v[sF](0)(spn) );
  	      sum = sum + outerProduct(bb,aa);
  	      //Flavor 1
  	      bb = coalescedRead(Btilde_v[sF](1)(spn) );
  	      aa = coalescedRead(Atilde_v[sF](1)(spn) );
  	      sum = sum + conjugate(outerProduct(bb,aa));
  	    }
  	  }	    
  	  coalescedWrite(mat_v[sU](mu)(), sum);
  	});
    }
    PokeIndex<LorentzIndex>(mat, tmp, mu);
    return;
  }
 };
--- a/Grid/qcd/action/fermion/WilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h
@@ -4,10 +4,11 @@
    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.h
-    Copyright (C) 2017
+    Copyright (C) 2017 - 2022
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: David Preti <>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -29,7 +30,8 @@
 #pragma once
-#include <Grid/Grid.h>
+#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
@@ -50,18 +52,15 @@ NAMESPACE_BEGIN(Grid);
 //////////////////////////////////////////////////////////////////
 template <class Impl>
-class WilsonCloverFermion : public WilsonFermion<Impl>
+class WilsonCloverFermion : public WilsonFermion<Impl>,
                            public WilsonCloverHelpers<Impl>
 {
 public:
  // Types definitions
  INHERIT_IMPL_TYPES(Impl);
-  template <typename vtype>
+  INHERIT_CLOVER_TYPES(Impl);
  using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
  typedef iImplClover<Simd> SiteCloverType;
  typedef Lattice<SiteCloverType> CloverFieldType;
-public:
+  typedef WilsonFermion<Impl>       WilsonBase;
-  typedef WilsonFermion<Impl> WilsonBase;
+  typedef WilsonCloverHelpers<Impl> Helpers;
  virtual int    ConstEE(void)     { return 0; };
  virtual void Instantiatable(void){};
@@ -72,42 +71,7 @@ public:
                      const RealD _csw_r = 0.0,
                      const RealD _csw_t = 0.0,
                      const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(),
-                      const ImplParams &impl_p = ImplParams()) : WilsonFermion<Impl>(_Umu,
+                      const ImplParams &impl_p = ImplParams());
                                                                                     Fgrid,
                                                                                     Hgrid,
                                                                                     _mass, impl_p, clover_anisotropy),
                                                                 CloverTerm(&Fgrid),
                                                                 CloverTermInv(&Fgrid),
                                                                 CloverTermEven(&Hgrid),
                                                                 CloverTermOdd(&Hgrid),
                                                                 CloverTermInvEven(&Hgrid),
                                                                 CloverTermInvOdd(&Hgrid),
                                                                 CloverTermDagEven(&Hgrid),
                                                                 CloverTermDagOdd(&Hgrid),
                                                                 CloverTermInvDagEven(&Hgrid),
                                                                 CloverTermInvDagOdd(&Hgrid)
  {
    assert(Nd == 4); // require 4 dimensions
    if (clover_anisotropy.isAnisotropic)
    {
      csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0;
      diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
    }
    else
    {
      csw_r = _csw_r * 0.5;
      diag_mass = 4.0 + _mass;
    }
    csw_t = _csw_t * 0.5;
    if (csw_r == 0)
      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
    if (csw_t == 0)
      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
    ImportGauge(_Umu);
  }
  virtual void M(const FermionField &in, FermionField &out);
  virtual void Mdag(const FermionField &in, FermionField &out);
@@ -124,250 +88,21 @@ public:
  void ImportGauge(const GaugeField &_Umu);
  // Derivative parts unpreconditioned pseudofermions
-  void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
+  void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag);
  {
    // Overwrites `force` with the derivative of the Wilson hopping term (DhopDeriv)
    // and then adds the clover-term contribution, which is assembled per direction mu
    // from Cmunu() applied to the spin trace of sigma_{mu nu} times the outer product of X and Y.
    // All three fields must live on the same grid.
    conformable(X.Grid(), Y.Grid());
    conformable(X.Grid(), force.Grid());
    GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
    GaugeField clover_force(force.Grid());
    PropagatorField Lambda(force.Grid());
-    // Guido: Here we are hitting some performance issues:
+public:
    // need to extract the components of the DoubledGaugeField
    // for each call
    // Possible solution
    // Create a vector object to store them? (cons: wasting space)
    std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
    Impl::extractLinkField(U, this->Umu);
    force = Zero();
    // Derivative of the Wilson hopping term
    this->DhopDeriv(force, X, Y, dag);
    ///////////////////////////////////////////////////////////
    // Clover term derivative
    ///////////////////////////////////////////////////////////
    Impl::outerProductImpl(Lambda, X, Y);
    //std::cout << "Lambda:" << Lambda << std::endl;
    // Flattened table of sigma_{mu nu}, stored row by row with the diagonal omitted;
    // `count` below walks it in the same (mu, nu) order as the double loop.
    Gamma::Algebra sigma[] = {
        Gamma::Algebra::SigmaXY,
        Gamma::Algebra::SigmaXZ,
        Gamma::Algebra::SigmaXT,
        Gamma::Algebra::MinusSigmaXY,
        Gamma::Algebra::SigmaYZ,
        Gamma::Algebra::SigmaYT,
        Gamma::Algebra::MinusSigmaXZ,
        Gamma::Algebra::MinusSigmaYZ,
        Gamma::Algebra::SigmaZT,
        Gamma::Algebra::MinusSigmaXT,
        Gamma::Algebra::MinusSigmaYT,
        Gamma::Algebra::MinusSigmaZT};
    /*
      sigma_{\mu \nu}=
      | 0         sigma[0]  sigma[1]  sigma[2] |
      | sigma[3]    0       sigma[4]  sigma[5] |
      | sigma[6]  sigma[7]     0      sigma[8] |
      | sigma[9]  sigma[10] sigma[11]   0      |
    */
    int count = 0;
    clover_force = Zero();
    for (int mu = 0; mu < 4; mu++)
    {
      force_mu = Zero();
      for (int nu = 0; nu < 4; nu++)
      {
        if (mu == nu)
        continue;
        RealD factor;
        // NOTE(review): mu and nu only take the values 0..3 in these loops, so the
        // condition below is never true and csw_t is never selected; every term uses
        // csw_r. Confirm whether the temporal direction (index 3?) was intended.
        if (nu == 4 || mu == 4)
        {
          factor = 2.0 * csw_t;
        }
        else
        {
          factor = 2.0 * csw_r;
        }
        PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
        Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
        force_mu -= factor*Cmunu(U, lambda, mu, nu);                   // checked
        count++;
      }
      // Store U[mu] * force_mu as the mu component of the clover force.
      pokeLorentz(clover_force, U[mu] * force_mu, mu);
    }
    //clover_force *= csw;
    force += clover_force;
  }
  // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
  // Returns the sum of eight staple-shaped link products in the (mu, nu) plane with
  // lambda inserted at different positions: four "upper" insertions are added and
  // four "lower" insertions are subtracted.
  //   U      : gauge links indexed by direction
  //   lambda : field inserted into the staples; must be conformable with U[0]
  //   mu, nu : directions spanning the plane
  GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
  {
    conformable(lambda.Grid(), U[0].Grid());
    GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());
    GaugeLinkField lower_staple(lambda.Grid());
    // insertion in upper staple
    // please check redundancy of shift operations
    // C1+
    tmp = lambda * U[nu];
    out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
    // C2+
    tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
    // C3+
    tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
    // C4+
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
    // insertion in lower staple
    // The bare lower staple (no lambda inserted) enters both C1- and C4-;
    // build it once instead of repeating the covariant shifts.
    lower_staple = Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
    // C1-
    out -= Impl::ShiftStaple(lambda, mu) * lower_staple;
    // C2-
    tmp = adj(lambda) * U[nu];
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
    // C3-
    tmp = lambda * U[nu];
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
    // C4-
    out -= lower_staple * lambda;
    return out;
  }
 protected:
  // here fixing the 4 dimensions, make it more general?
  RealD csw_r;                                               // Clover coefficient - spatial
  RealD csw_t;                                               // Clover coefficient - temporal
  RealD diag_mass;                                           // Mass term
-  CloverFieldType CloverTerm, CloverTermInv;                 // Clover term
+  CloverField CloverTerm, CloverTermInv;                     // Clover term
-  CloverFieldType CloverTermEven, CloverTermOdd;             // Clover term EO
+  CloverField CloverTermEven, CloverTermOdd;                 // Clover term EO
-  CloverFieldType CloverTermInvEven, CloverTermInvOdd;       // Clover term Inv EO
+  CloverField CloverTermInvEven, CloverTermInvOdd;           // Clover term Inv EO
-  CloverFieldType CloverTermDagEven, CloverTermDagOdd;       // Clover term Dag EO
+  CloverField CloverTermDagEven, CloverTermDagOdd;           // Clover term Dag EO
-  CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
+  CloverField CloverTermInvDagEven, CloverTermInvDagOdd;     // Clover term Inv Dag EO
 public:
  // eventually these can be compressed into 6x6 blocks instead of the 12x12
  // using the DeGrand-Rossi basis for the gamma matrices
  // Returns a field on F's grid that is zero except for entries (0,1), (1,0),
  // (2,3) and (3,2) of the second tensor level (presumably the spin indices),
  // each set to -i * F at every outer site.
  CloverFieldType fillCloverYZ(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView(T_v,T,AcceleratorWrite);
    autoView(F_v,F,AcceleratorRead);
    // Iterate over the sites of the field actually being written (T lives on
    // F's grid) rather than over the grid of the unrelated member CloverTerm.
    accelerator_for(i, T.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
      T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
      T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
    });
    return T;
  }
  // Returns a field on F's grid that is zero except for entries (0,1) and (2,3),
  // set to -F, and (1,0) and (3,2), set to +F, of the second tensor level
  // (presumably the spin indices) at every outer site.
  CloverFieldType fillCloverXZ(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView(T_v, T,AcceleratorWrite);
    autoView(F_v, F,AcceleratorRead);
    // Iterate over the sites of the field actually being written (T lives on
    // F's grid) rather than over the grid of the unrelated member CloverTerm.
    accelerator_for(i, T.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = -F_v[i]()();
      T_v[i]()(1, 0) = F_v[i]()();
      T_v[i]()(2, 3) = -F_v[i]()();
      T_v[i]()(3, 2) = F_v[i]()();
    });
    return T;
  }
  // Returns a field on F's grid that is zero except for the diagonal entries of
  // the second tensor level (presumably the spin indices): (0,0) and (2,2) are
  // set to -i * F, (1,1) and (3,3) to +i * F, at every outer site.
  CloverFieldType fillCloverXY(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView(T_v,T,AcceleratorWrite);
    autoView(F_v,F,AcceleratorRead);
    // Iterate over the sites of the field actually being written (T lives on
    // F's grid) rather than over the grid of the unrelated member CloverTerm.
    accelerator_for(i, T.Grid()->oSites(),1,
    {
      T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
      T_v[i]()(1, 1) = timesI(F_v[i]()());
      T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 3) = timesI(F_v[i]()());
    });
    return T;
  }
  // Returns a field on F's grid that is zero except for entries (0,1) and (1,0),
  // set to +i * F, and (2,3) and (3,2), set to -i * F, of the second tensor level
  // (presumably the spin indices) at every outer site.
  CloverFieldType fillCloverXT(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView( T_v , T, AcceleratorWrite);
    autoView( F_v , F, AcceleratorRead);
    // Iterate over the sites of the field actually being written (T lives on
    // F's grid) rather than over the grid of the unrelated member CloverTerm.
    accelerator_for(i, T.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = timesI(F_v[i]()());
      T_v[i]()(1, 0) = timesI(F_v[i]()());
      T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
    });
    return T;
  }
  // Returns a field on F's grid that is zero except for entries (0,1) and (3,2),
  // set to -F, and (1,0) and (2,3), set to +F, of the second tensor level
  // (presumably the spin indices) at every outer site.
  CloverFieldType fillCloverYT(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView( T_v ,T,AcceleratorWrite);
    autoView( F_v ,F,AcceleratorRead);
    // Iterate over the sites of the field actually being written (T lives on
    // F's grid) rather than over the grid of the unrelated member CloverTerm.
    accelerator_for(i, T.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = -(F_v[i]()());
      T_v[i]()(1, 0) = (F_v[i]()());
      T_v[i]()(2, 3) = (F_v[i]()());
      T_v[i]()(3, 2) = -(F_v[i]()());
    });
    return T;
  }
  // Returns a field on F's grid that is zero except for the diagonal entries of
  // the second tensor level (presumably the spin indices): (0,0) and (3,3) are
  // set to +i * F, (1,1) and (2,2) to -i * F, at every outer site.
  CloverFieldType fillCloverZT(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView( T_v , T,AcceleratorWrite);
    autoView( F_v , F,AcceleratorRead);
    // Iterate over the sites of the field actually being written (T lives on
    // F's grid) rather than over the grid of the unrelated member CloverTerm.
    accelerator_for(i, T.Grid()->oSites(),1,
    {
      T_v[i]()(0, 0) = timesI(F_v[i]()());
      T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
      T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 3) = timesI(F_v[i]()());
    });
    return T;
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonCloverHelpers.h
+++ b/Grid/qcd/action/fermion/WilsonCloverHelpers.h
@@ -0,0 +1,761 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/WilsonCloverHelpers.h
    Copyright (C) 2021 - 2022
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 // Helper routines that implement common clover functionality
 NAMESPACE_BEGIN(Grid);
 // Stateless collection of static helpers used to build and apply the clover term.
 // Every member is static, so nothing here requires an instance of the class.
 template<class Impl> class WilsonCloverHelpers {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
  // Sum of eight staple-shaped link products in the (mu, nu) plane with lambda
  // inserted at different positions: four "upper" insertions are added, four
  // "lower" insertions are subtracted. lambda must be conformable with U[0].
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
  {
    conformable(lambda.Grid(), U[0].Grid());
    GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());
    GaugeLinkField lower_staple(lambda.Grid());
    // insertion in upper staple
    // please check redundancy of shift operations
    // C1+
    tmp = lambda * U[nu];
    out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
    // C2+
    tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
    // C3+
    tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
    // C4+
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
    // insertion in lower staple
    // The bare lower staple (no lambda inserted) enters both C1- and C4-;
    // build it once instead of repeating the covariant shifts.
    lower_staple = Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
    // C1-
    out -= Impl::ShiftStaple(lambda, mu) * lower_staple;
    // C2-
    tmp = adj(lambda) * U[nu];
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
    // C3-
    tmp = lambda * U[nu];
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
    // C4-
    out -= lower_staple * lambda;
    return out;
  }
  // The six fillClover* routines below each return a field on F's grid that is
  // zero except for four entries of the second tensor level (presumably the spin
  // indices), which are set to +-F or +-i*F at every outer site.

  // Entries (0,1), (1,0), (2,3), (3,2) = -i * F.
  static CloverField fillCloverYZ(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView(T_v,T,AcceleratorWrite);
    autoView(F_v,F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 1), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 0), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 3), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 2), coalescedRead(timesMinusI(F_v[i]()())));
    });
    return T;
  }
  // Entries (0,1), (2,3) = -F; entries (1,0), (3,2) = +F.
  static CloverField fillCloverXZ(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView(T_v, T,AcceleratorWrite);
    autoView(F_v, F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 1), coalescedRead(-F_v[i]()()));
      coalescedWrite(T_v[i]()(1, 0), coalescedRead(F_v[i]()()));
      coalescedWrite(T_v[i]()(2, 3), coalescedRead(-F_v[i]()()));
      coalescedWrite(T_v[i]()(3, 2), coalescedRead(F_v[i]()()));
    });
    return T;
  }
  // Entries (0,0), (2,2) = -i * F; entries (1,1), (3,3) = +i * F.
  static CloverField fillCloverXY(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView(T_v,T,AcceleratorWrite);
    autoView(F_v,F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 0), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 1), coalescedRead(timesI(F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 2), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 3), coalescedRead(timesI(F_v[i]()())));
    });
    return T;
  }
  // Entries (0,1), (1,0) = +i * F; entries (2,3), (3,2) = -i * F.
  static CloverField fillCloverXT(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView( T_v , T, AcceleratorWrite);
    autoView( F_v , F, AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 1), coalescedRead(timesI(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 0), coalescedRead(timesI(F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 3), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 2), coalescedRead(timesMinusI(F_v[i]()())));
    });
    return T;
  }
  // Entries (0,1), (3,2) = -F; entries (1,0), (2,3) = +F.
  static CloverField fillCloverYT(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView( T_v ,T,AcceleratorWrite);
    autoView( F_v ,F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 1), coalescedRead(-(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 0), coalescedRead((F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 3), coalescedRead((F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 2), coalescedRead(-(F_v[i]()())));
    });
    return T;
  }
  // Entries (0,0), (3,3) = +i * F; entries (1,1), (2,2) = -i * F.
  static CloverField fillCloverZT(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView( T_v , T,AcceleratorWrite);
    autoView( F_v , F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 0), coalescedRead(timesI(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 1), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 2), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 3), coalescedRead(timesI(F_v[i]()())));
    });
    return T;
  }
  // Single-site product phi = C * chi; C is fetched through coalescedRead first.
  template<class _Spinor>
  static accelerator_inline void multClover(_Spinor& phi, const SiteClover& C, const _Spinor& chi) {
    auto CC = coalescedRead(C);
    mult(&phi, &CC, &chi);
  }
  // Applies multClover at every outer site of out's grid: out[s] = C[s] * phi[s].
  // Declared static like every other helper in this class: it touches no instance
  // state, and calls made through an object keep compiling unchanged.
  template<class _SpinorField>
  static inline void multCloverField(_SpinorField& out, const CloverField& C, const _SpinorField& phi) {
    const int Nsimd = SiteSpinor::Nsimd();
    autoView(out_v, out, AcceleratorWrite);
    autoView(phi_v, phi, AcceleratorRead);
    autoView(C_v,   C,   AcceleratorRead);
    typedef decltype(coalescedRead(out_v[0])) calcSpinor;
    accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
      calcSpinor tmp;
      multClover(tmp,C_v[sss],phi_v(sss));
      coalescedWrite(out_v[sss],tmp);
    });
  }
 };
 template<class Impl> class CompactWilsonCloverHelpers {
 public:
  INHERIT_COMPACT_CLOVER_SIZES(Impl);
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);
  #if 0
  static accelerator_inline typename SiteCloverTriangle::vector_type triangle_elem(const SiteCloverTriangle& triangle, int block, int i, int j) {
    assert(i != j);
    const int packed = triangle_index(i, j);
    if(i < j) return triangle()(block)(packed);
    return conjugate(triangle()(block)(packed)); // i > j
  }
  #else
  // Looks up off-diagonal element (i, j) of the given block in the packed
  // triangle storage. The stored value is returned as-is when i < j and
  // conjugated otherwise; i == j is rejected by the assert.
  template<typename vobj>
  static accelerator_inline vobj triangle_elem(const iImplCloverTriangle<vobj>& triangle, int block, int i, int j) {
    assert(i != j);
    const int packed = triangle_index(i, j);
    if(i < j) return triangle()(block)(packed);
    return conjugate(triangle()(block)(packed)); // i > j
  }
  #endif
  // Maps an (i, j) pair to its position in the packed triangle storage.
  // The mapping is symmetric in i and j; the diagonal (i == j) maps to 0.
  static accelerator_inline int triangle_index(int i, int j) {
    if(i == j) return 0;
    // Order the pair so a single formula serves both halves of the matrix.
    const int lo = (i < j) ? i : j;
    const int hi = (i < j) ? j : i;
    return Nred * (Nred - 1) / 2 - (Nred - lo) * (Nred - lo - 1) / 2 + hi - lo - 1;
  }
  static void MooeeKernel_gpu(int                        Nsite,
                              int                        Ls,
                              const FermionField&        in,
                              FermionField&              out,
                              const CloverDiagonalField& diagonal,
                              const CloverTriangleField& triangle) {
    autoView(diagonal_v, diagonal, AcceleratorRead);
    autoView(triangle_v, triangle, AcceleratorRead);
    autoView(in_v,       in,       AcceleratorRead);
    autoView(out_v,      out,      AcceleratorWrite);
    typedef decltype(coalescedRead(out_v[0])) CalcSpinor;
    const uint64_t NN = Nsite * Ls;
    accelerator_for(ss, NN, Simd::Nsimd(), {
      int sF = ss;
      int sU = ss/Ls;
      CalcSpinor res;
      CalcSpinor in_t = in_v(sF);
      auto diagonal_t = diagonal_v(sU);
      auto triangle_t = triangle_v(sU);
      for(int block=0; block<Nhs; block++) {
        int s_start = block*Nhs;
        for(int i=0; i<Nred; i++) {
          int si = s_start + i/Nc, ci = i%Nc;
          res()(si)(ci) = diagonal_t()(block)(i) * in_t()(si)(ci);
          for(int j=0; j<Nred; j++) {
            if (j == i) continue;
            int sj = s_start + j/Nc, cj = j%Nc;
            res()(si)(ci) = res()(si)(ci) + triangle_elem(triangle_t, block, i, j) * in_t()(sj)(cj);
          };
        };
      };
      coalescedWrite(out_v[sF], res);
    });
  }
  // CPU path: for every site ss in [0, Nsite*Ls) applies the packed clover term
  // (diagonal + triangle, read at site ss/Ls) to in[ss] and streams the result to
  // out[ss]. The product is written out by hand for two blocks of six components
  // each (spin pairs 0-1 and 2-3, three colour components per spin).
  // NOTE(review): PREFETCH_CLOVER is defined inside this function and is not
  // #undef'd within it, so the macro stays defined for whatever follows; confirm
  // that is intended.
  static void MooeeKernel_cpu(int                        Nsite,
                              int                        Ls,
                              const FermionField&        in,
                              FermionField&              out,
                              const CloverDiagonalField& diagonal,
                              const CloverTriangleField& triangle) {
    autoView(diagonal_v, diagonal, CpuRead);
    autoView(triangle_v, triangle, CpuRead);
    autoView(in_v,       in,       CpuRead);
    autoView(out_v,      out,      CpuWrite);
    typedef SiteSpinor CalcSpinor;
    // On A64FX builds PREFETCH_CLOVER issues SVE prefetches; elsewhere it expands
    // to nothing. With pf_dist_L2 negative the second (L2) branch never fires.
 #if defined(A64FX) || defined(A64FXFIXEDSIZE)
 #define PREFETCH_CLOVER(BASE) {                                     \
    uint64_t base;                                                  \
    int pf_dist_L1 = 1;                                             \
    int pf_dist_L2 = -5; /* -> penalty -> disable */                \
                                                                    \
    if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) {           \
      base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0);               \
      svprfd(svptrue_b64(), (int64_t*)(base +    0), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  256), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  512), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  768), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL1STRM); \
    }                                                               \
                                                                    \
    if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) {           \
      base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0);               \
      svprfd(svptrue_b64(), (int64_t*)(base +    0), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  256), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  512), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  768), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL2STRM); \
    }                                                               \
  }
 // TODO: Implement/generalize this for other architectures
 // I played around a bit on KNL (see below) but didn't bring anything
 // #elif defined(AVX512)
 // #define PREFETCH_CLOVER(BASE) {                              \
 //     uint64_t base;                                           \
 //     int pf_dist_L1 = 1;                                      \
 //     int pf_dist_L2 = +4;                                     \
 //                                                              \
 //     if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) {    \
 //       base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0);        \
 //       _mm_prefetch((const char*)(base +    0), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +   64), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +  128), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +  192), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +  256), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +  320), _MM_HINT_T0); \
 //     }                                                        \
 //                                                              \
 //     if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) {    \
 //       base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0);        \
 //       _mm_prefetch((const char*)(base +    0), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +   64), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +  128), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +  192), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +  256), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +  320), _MM_HINT_T1); \
 //     }                                                        \
 //   }
 #else
 #define PREFETCH_CLOVER(BASE)
 #endif
    const uint64_t NN = Nsite * Ls;
    thread_for(ss, NN, {
      int sF = ss;
      int sU = ss/Ls;
      CalcSpinor res;
      CalcSpinor in_t = in_v[sF];
      auto diag_t     = diagonal_v[sU]; // "diag" instead of "diagonal" here to make code below easier to read
      auto triangle_t = triangle_v[sU];
      // Every row after the first is built in two assignments. The first stores
      // sum(triangle * conjugate(in)) over the columns left of the diagonal; the
      // second adds the diagonal and right-of-diagonal terms to the conjugate of
      // that partial sum, which yields conjugate(triangle) * in for the left part.
      // The order of the two assignments per row must therefore be kept.
      // upper half
      PREFETCH_CLOVER(0);
      auto in_cc_0_0 = conjugate(in_t()(0)(0)); // Nils: reduces number
      auto in_cc_0_1 = conjugate(in_t()(0)(1)); // of conjugates from
      auto in_cc_0_2 = conjugate(in_t()(0)(2)); // 30 to 20
      auto in_cc_1_0 = conjugate(in_t()(1)(0));
      auto in_cc_1_1 = conjugate(in_t()(1)(1));
      res()(0)(0) =               diag_t()(0)( 0) * in_t()(0)(0)
                  +           triangle_t()(0)( 0) * in_t()(0)(1)
                  +           triangle_t()(0)( 1) * in_t()(0)(2)
                  +           triangle_t()(0)( 2) * in_t()(1)(0)
                  +           triangle_t()(0)( 3) * in_t()(1)(1)
                  +           triangle_t()(0)( 4) * in_t()(1)(2);
      res()(0)(1) =           triangle_t()(0)( 0) * in_cc_0_0;
      res()(0)(1) =               diag_t()(0)( 1) * in_t()(0)(1)
                  +           triangle_t()(0)( 5) * in_t()(0)(2)
                  +           triangle_t()(0)( 6) * in_t()(1)(0)
                  +           triangle_t()(0)( 7) * in_t()(1)(1)
                  +           triangle_t()(0)( 8) * in_t()(1)(2)
                  + conjugate(       res()(0)( 1));
      res()(0)(2) =           triangle_t()(0)( 1) * in_cc_0_0
                  +           triangle_t()(0)( 5) * in_cc_0_1;
      res()(0)(2) =               diag_t()(0)( 2) * in_t()(0)(2)
                  +           triangle_t()(0)( 9) * in_t()(1)(0)
                  +           triangle_t()(0)(10) * in_t()(1)(1)
                  +           triangle_t()(0)(11) * in_t()(1)(2)
                  + conjugate(       res()(0)( 2));
      res()(1)(0) =           triangle_t()(0)( 2) * in_cc_0_0
                  +           triangle_t()(0)( 6) * in_cc_0_1
                  +           triangle_t()(0)( 9) * in_cc_0_2;
      res()(1)(0) =               diag_t()(0)( 3) * in_t()(1)(0)
                  +           triangle_t()(0)(12) * in_t()(1)(1)
                  +           triangle_t()(0)(13) * in_t()(1)(2)
                  + conjugate(       res()(1)( 0));
      res()(1)(1) =           triangle_t()(0)( 3) * in_cc_0_0
                  +           triangle_t()(0)( 7) * in_cc_0_1
                  +           triangle_t()(0)(10) * in_cc_0_2
                  +           triangle_t()(0)(12) * in_cc_1_0;
      res()(1)(1) =               diag_t()(0)( 4) * in_t()(1)(1)
                  +           triangle_t()(0)(14) * in_t()(1)(2)
                  + conjugate(       res()(1)( 1));
      res()(1)(2) =           triangle_t()(0)( 4) * in_cc_0_0
                  +           triangle_t()(0)( 8) * in_cc_0_1
                  +           triangle_t()(0)(11) * in_cc_0_2
                  +           triangle_t()(0)(13) * in_cc_1_0
                  +           triangle_t()(0)(14) * in_cc_1_1;
      res()(1)(2) =               diag_t()(0)( 5) * in_t()(1)(2)
                  + conjugate(       res()(1)( 2));
      vstream(out_v[sF]()(0)(0), res()(0)(0));
      vstream(out_v[sF]()(0)(1), res()(0)(1));
      vstream(out_v[sF]()(0)(2), res()(0)(2));
      vstream(out_v[sF]()(1)(0), res()(1)(0));
      vstream(out_v[sF]()(1)(1), res()(1)(1));
      vstream(out_v[sF]()(1)(2), res()(1)(2));
      // lower half
      // Same scheme as the upper half, using block 1 of diag/triangle and spins 2-3.
      PREFETCH_CLOVER(1);
      auto in_cc_2_0 = conjugate(in_t()(2)(0));
      auto in_cc_2_1 = conjugate(in_t()(2)(1));
      auto in_cc_2_2 = conjugate(in_t()(2)(2));
      auto in_cc_3_0 = conjugate(in_t()(3)(0));
      auto in_cc_3_1 = conjugate(in_t()(3)(1));
      res()(2)(0) =               diag_t()(1)( 0) * in_t()(2)(0)
                  +           triangle_t()(1)( 0) * in_t()(2)(1)
                  +           triangle_t()(1)( 1) * in_t()(2)(2)
                  +           triangle_t()(1)( 2) * in_t()(3)(0)
                  +           triangle_t()(1)( 3) * in_t()(3)(1)
                  +           triangle_t()(1)( 4) * in_t()(3)(2);
      res()(2)(1) =           triangle_t()(1)( 0) * in_cc_2_0;
      res()(2)(1) =               diag_t()(1)( 1) * in_t()(2)(1)
                  +           triangle_t()(1)( 5) * in_t()(2)(2)
                  +           triangle_t()(1)( 6) * in_t()(3)(0)
                  +           triangle_t()(1)( 7) * in_t()(3)(1)
                  +           triangle_t()(1)( 8) * in_t()(3)(2)
                  + conjugate(       res()(2)( 1));
      res()(2)(2) =           triangle_t()(1)( 1) * in_cc_2_0
                  +           triangle_t()(1)( 5) * in_cc_2_1;
      res()(2)(2) =               diag_t()(1)( 2) * in_t()(2)(2)
                  +           triangle_t()(1)( 9) * in_t()(3)(0)
                  +           triangle_t()(1)(10) * in_t()(3)(1)
                  +           triangle_t()(1)(11) * in_t()(3)(2)
                  + conjugate(       res()(2)( 2));
      res()(3)(0) =           triangle_t()(1)( 2) * in_cc_2_0
                  +           triangle_t()(1)( 6) * in_cc_2_1
                  +           triangle_t()(1)( 9) * in_cc_2_2;
      res()(3)(0) =               diag_t()(1)( 3) * in_t()(3)(0)
                  +           triangle_t()(1)(12) * in_t()(3)(1)
                  +           triangle_t()(1)(13) * in_t()(3)(2)
                  + conjugate(       res()(3)( 0));
      res()(3)(1) =           triangle_t()(1)( 3) * in_cc_2_0
                  +           triangle_t()(1)( 7) * in_cc_2_1
                  +           triangle_t()(1)(10) * in_cc_2_2
                  +           triangle_t()(1)(12) * in_cc_3_0;
      res()(3)(1) =               diag_t()(1)( 4) * in_t()(3)(1)
                  +           triangle_t()(1)(14) * in_t()(3)(2)
                  + conjugate(       res()(3)( 1));
      res()(3)(2) =           triangle_t()(1)( 4) * in_cc_2_0
                  +           triangle_t()(1)( 8) * in_cc_2_1
                  +           triangle_t()(1)(11) * in_cc_2_2
                  +           triangle_t()(1)(13) * in_cc_3_0
                  +           triangle_t()(1)(14) * in_cc_3_1;
      res()(3)(2) =               diag_t()(1)( 5) * in_t()(3)(2)
                  + conjugate(       res()(3)( 2));
      vstream(out_v[sF]()(2)(0), res()(2)(0));
      vstream(out_v[sF]()(2)(1), res()(2)(1));
      vstream(out_v[sF]()(2)(2), res()(2)(2));
      vstream(out_v[sF]()(3)(0), res()(3)(0));
      vstream(out_v[sF]()(3)(1), res()(3)(1));
      vstream(out_v[sF]()(3)(2), res()(3)(2));
    });
  }
  // Build-time dispatcher: forwards to the accelerator kernel when GRID_CUDA or
  // GRID_HIP is defined, and to the hand-unrolled CPU kernel otherwise.
  static void MooeeKernel(int                        Nsite,
                          int                        Ls,
                          const FermionField&        in,
                          FermionField&              out,
                          const CloverDiagonalField& diagonal,
                          const CloverTriangleField& triangle) {
 #if !defined(GRID_CUDA) && !defined(GRID_HIP)
    MooeeKernel_cpu(Nsite, Ls, in, out, diagonal, triangle);
 #else
    MooeeKernel_gpu(Nsite, Ls, in, out, diagonal, triangle);
 #endif
  }
  // Invert the clover term site by site, working in the compact layout.
  //
  // For every local site the compact (diagonal + triangle) representation is
  // expanded into a dense (Ns*Nc) x (Ns*Nc) complex matrix, inverted with Eigen
  // and written back as diagonal entries plus the entries with i < j.
  //
  // diagonal, triangle       - input clover term; must be conformable with each other
  // diagonalInv, triangleInv - outputs; their checkerboards are set from the inputs
  static void Invert(const CloverDiagonalField& diagonal,
                     const CloverTriangleField& triangle,
                     CloverDiagonalField&       diagonalInv,
                     CloverTriangleField&       triangleInv) {
    conformable(diagonal, diagonalInv);
    conformable(triangle, triangleInv);
    conformable(diagonal, triangle);
    diagonalInv.Checkerboard() = diagonal.Checkerboard();
    triangleInv.Checkerboard() = triangle.Checkerboard();
    GridBase* grid = diagonal.Grid();
    long lsites = grid->lSites();
    typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
    typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
    autoView(diagonal_v,  diagonal,  CpuRead);
    autoView(triangle_v,  triangle,  CpuRead);
    autoView(diagonalInv_v, diagonalInv, CpuWrite);
    autoView(triangleInv_v, triangleInv, CpuWrite);
    thread_for(site, lsites, { // NOTE: Not on GPU because of Eigen & (peek/poke)LocalSite
      // Per-site scratch: dense matrices for Eigen and scalar copies of the compact objects
      Eigen::MatrixXcd clover_inv_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
      Eigen::MatrixXcd clover_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
      scalar_object_diagonal diagonal_tmp     = Zero();
      scalar_object_diagonal diagonal_inv_tmp = Zero();
      scalar_object_triangle triangle_tmp     = Zero();
      scalar_object_triangle triangle_inv_tmp = Zero();
      Coordinate lcoor;
      grid->LocalIndexToLocalCoor(site, lcoor);
      peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
      peekLocalSite(triangle_tmp, triangle_v, lcoor);
      // TODO: can we save time here by inverting the two 6x6 hermitian matrices separately?
      // Expand compact -> dense. Entries outside the selected spin pairs stay zero.
      for (long s_row=0;s_row<Ns;s_row++) {
        for (long s_col=0;s_col<Ns;s_col++) {
          // For Ns == 4 this keeps exactly the spin pairs with s_row / Nhs == s_col / Nhs,
          // i.e. the two diagonal spin blocks
          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
          int block       = s_row / Nhs;
          int s_row_block = s_row % Nhs;
          int s_col_block = s_col % Nhs;
          for (long c_row=0;c_row<Nc;c_row++) {
            for (long c_col=0;c_col<Nc;c_col++) {
              // (i, j) is the position inside the current block
              int i = s_row_block * Nc + c_row;
              int j = s_col_block * Nc + c_col;
              if(i == j)
                clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
              else
                // triangle_elem is defined outside this view; presumably it reconstructs
                // the i > j entries from the stored i < j ones -- TODO confirm
                clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(triangle_elem(triangle_tmp, block, i, j)));
            }
          }
        }
      }
      clover_inv_eigen = clover_eigen.inverse();
      // Compress dense inverse -> compact: only the diagonal and the i < j entries are stored
      for (long s_row=0;s_row<Ns;s_row++) {
        for (long s_col=0;s_col<Ns;s_col++) {
          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
          int block       = s_row / Nhs;
          int s_row_block = s_row % Nhs;
          int s_col_block = s_col % Nhs;
          for (long c_row=0;c_row<Nc;c_row++) {
            for (long c_col=0;c_col<Nc;c_col++) {
              int i = s_row_block * Nc + c_row;
              int j = s_col_block * Nc + c_col;
              if(i == j)
                diagonal_inv_tmp()(block)(i) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
              else if(i < j)
                triangle_inv_tmp()(block)(triangle_index(i, j)) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
              else
                continue;
            }
          }
        }
      }
      pokeLocalSite(diagonal_inv_tmp, diagonalInv_v, lcoor);
      pokeLocalSite(triangle_inv_tmp, triangleInv_v, lcoor);
    });
  }
  // Convert the clover term from the full spin-colour matrix layout into the
  // compact layout.
  //
  // Per outer site, the diagonal entries of each selected spin block go to
  // `diagonal` and the entries with i < j go to `triangle` (at triangle_index(i, j)).
  // Entries with i > j and entries outside the selected spin pairs are not stored.
  // The checkerboard of both outputs is set from `full`.
  static void ConvertLayout(const CloverField&   full,
                            CloverDiagonalField& diagonal,
                            CloverTriangleField& triangle) {
    conformable(full, diagonal);
    conformable(full, triangle);
    diagonal.Checkerboard() = full.Checkerboard();
    triangle.Checkerboard() = full.Checkerboard();
    autoView(full_v,     full,     AcceleratorRead);
    autoView(diagonal_v, diagonal, AcceleratorWrite);
    autoView(triangle_v, triangle, AcceleratorWrite);
    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
    accelerator_for(ss, full.Grid()->oSites(), 1, {
      for(int s_row = 0; s_row < Ns; s_row++) {
        for(int s_col = 0; s_col < Ns; s_col++) {
          // For Ns == 4 this keeps exactly the spin pairs with s_row / Nhs == s_col / Nhs
          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
          int block       = s_row / Nhs;
          int s_row_block = s_row % Nhs;
          int s_col_block = s_col % Nhs;
          for(int c_row = 0; c_row < Nc; c_row++) {
            for(int c_col = 0; c_col < Nc; c_col++) {
              // (i, j) is the position inside the current block
              int i = s_row_block * Nc + c_row;
              int j = s_col_block * Nc + c_col;
              if(i == j)
                diagonal_v[ss]()(block)(i) = full_v[ss]()(s_row, s_col)(c_row, c_col);
              else if(i < j)
                triangle_v[ss]()(block)(triangle_index(i, j)) = full_v[ss]()(s_row, s_col)(c_row, c_col);
              else
                continue;
            }
          }
        }
      }
    });
  }
  // Expand the clover term from the compact layout back into the full
  // spin-colour matrix layout.
  //
  // `full` is zeroed first, so entries outside the selected spin pairs remain
  // zero. Its checkerboard is set from `diagonal`.
  static void ConvertLayout(const CloverDiagonalField& diagonal,
                            const CloverTriangleField& triangle,
                            CloverField&               full) {
    conformable(full, diagonal);
    conformable(full, triangle);
    full.Checkerboard() = diagonal.Checkerboard();
    full = Zero();
    autoView(diagonal_v, diagonal, AcceleratorRead);
    autoView(triangle_v, triangle, AcceleratorRead);
    autoView(full_v,     full,     AcceleratorWrite);
    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
    accelerator_for(ss, full.Grid()->oSites(), 1, {
      for(int s_row = 0; s_row < Ns; s_row++) {
        for(int s_col = 0; s_col < Ns; s_col++) {
          // For Ns == 4 this keeps exactly the spin pairs with s_row / Nhs == s_col / Nhs
          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
          int block       = s_row / Nhs;
          int s_row_block = s_row % Nhs;
          int s_col_block = s_col % Nhs;
          for(int c_row = 0; c_row < Nc; c_row++) {
            for(int c_col = 0; c_col < Nc; c_col++) {
              int i = s_row_block * Nc + c_row;
              int j = s_col_block * Nc + c_col;
              if(i == j)
                full_v[ss]()(s_row, s_col)(c_row, c_col) = diagonal_v[ss]()(block)(i);
              else
                // triangle_elem is defined outside this view; presumably it also yields
                // the i > j entries from the stored i < j ones -- TODO confirm
                full_v[ss]()(s_row, s_col)(c_row, c_col) = triangle_elem(triangle_v[ss], block, i, j);
            }
          }
        }
      }
    });
  }
  // Adjust the compact clover term on and next to the temporal boundaries.
  //
  // On the time slices t == 0 and t == T-1 the triangle part is set to zero and
  // the diagonal part is set to the constant (-1.0 * csw_t + diag_mass).
  // If cF != 1.0, (cF - 1.0) is additionally added to the diagonal on the time
  // slices t == 1 and t == T-2.
  //
  // diagonal, triangle - clover term, modified in place
  // csw_t, cF, diag_mass - scalars entering the boundary values as described above
  static void ModifyBoundaries(CloverDiagonalField& diagonal, CloverTriangleField& triangle, RealD csw_t, RealD cF, RealD diag_mass) {
    // Checks/grid
    double t0 = usecond();
    conformable(diagonal, triangle);
    GridBase* grid = diagonal.Grid();
    // Determine the boundary coordinates/sites
    double t1 = usecond();
    int t_dir = Nd - 1;
    Lattice<iScalar<vInteger>> t_coor(grid);
    LatticeCoordinate(t_coor, t_dir);
    int T = grid->GlobalDimensions()[t_dir];
    // Set off-diagonal parts at boundary to zero -- OK
    double t2 = usecond();
    CloverTriangleField zeroTriangle(grid);
    zeroTriangle.Checkerboard() = triangle.Checkerboard();
    zeroTriangle = Zero();
    triangle = where(t_coor == 0,   zeroTriangle, triangle);
    triangle = where(t_coor == T-1, zeroTriangle, triangle);
    // Set diagonal to unity (scaled correctly) -- OK
    double t3 = usecond();
    CloverDiagonalField tmp(grid);
    tmp.Checkerboard() = diagonal.Checkerboard();
    tmp                = -1.0 * csw_t + diag_mass;
    diagonal           = where(t_coor == 0,   tmp, diagonal);
    diagonal           = where(t_coor == T-1, tmp, diagonal);
    // Correct values next to boundary
    double t4 = usecond();
    if(cF != 1.0) {
      // tmp is built once from the already boundary-modified diagonal and then
      // selected on both neighbouring slices
      tmp = cF - 1.0;
      tmp += diagonal;
      diagonal = where(t_coor == 1,   tmp, diagonal);
      diagonal = where(t_coor == T-2, tmp, diagonal);
    }
    // Report timings (t0..t5 are only consumed by the disabled block below)
    double t5 = usecond();
 #if 0
    std::cout << GridLogMessage << "CompactWilsonCloverHelpers::ModifyBoundaries timings:"
              << " checks = "          << (t1 - t0) / 1e6
              << ", coordinate = "     << (t2 - t1) / 1e6
              << ", off-diag zero = "  << (t3 - t2) / 1e6
              << ", diagonal unity = " << (t4 - t3) / 1e6
              << ", near-boundary = "  << (t5 - t4) / 1e6
              << ", total = "          << (t5 - t0) / 1e6
              << std::endl;
 #endif
  }
  // Multiply the field `f` in place by the mask `m`, site by site.
  // `f` and `m` must be conformable.
  template<class Field, class Mask>
  static strong_inline void ApplyBoundaryMask(Field& f, const Mask& m) {
    conformable(f, m);
    auto grid  = f.Grid();
    const uint32_t Nsite = grid->oSites();
    const uint32_t Nsimd = grid->Nsimd();
    // f is both read and written inside the kernel through the same view
    autoView(f_v, f, AcceleratorWrite);
    autoView(m_v, m, AcceleratorRead);
    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
    accelerator_for(ss, Nsite, Nsimd, {
      coalescedWrite(f_v[ss], m_v(ss) * f_v(ss));
    });
  }
  template<class MaskField>
  static void SetupMasks(MaskField& full, MaskField& even, MaskField& odd) {
    assert(even.Grid()->_isCheckerBoarded && even.Checkerboard() == Even);
    assert(odd.Grid()->_isCheckerBoarded  && odd.Checkerboard()  == Odd);
    assert(!full.Grid()->_isCheckerBoarded);
    GridBase* grid = full.Grid();
    int t_dir = Nd-1;
    Lattice<iScalar<vInteger>> t_coor(grid);
    LatticeCoordinate(t_coor, t_dir);
    int T = grid->GlobalDimensions()[t_dir];
    MaskField zeroMask(grid); zeroMask = Zero();
    full = 1.0;
    full = where(t_coor == 0,   zeroMask, full);
    full = where(t_coor == T-1, zeroMask, full);
    pickCheckerboard(Even, even, full);
    pickCheckerboard(Odd,  odd,  full);
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonCloverTypes.h
+++ b/Grid/qcd/action/fermion/WilsonCloverTypes.h
@@ -0,0 +1,92 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/WilsonCloverTypes.h
    Copyright (C) 2021 - 2022
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 // Type bundle for the clover term stored as a full spin-colour matrix.
 template<class Impl>
 class WilsonCloverTypes {
 public:
  INHERIT_IMPL_TYPES(Impl);
  // Per-site object: Ns x Ns spin matrix of Impl::Dimension x Impl::Dimension colour matrices
  template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
  typedef iImplClover<Simd> SiteClover;
  typedef Lattice<SiteClover> CloverField;
 };
 // Type bundle for the clover term in the compact layout: per site, Nblock
 // blocks, each stored as Ndiagonal diagonal entries plus Ntriangle
 // off-diagonal entries.
 template<class Impl>
 class CompactWilsonCloverTypes {
 public:
  INHERIT_IMPL_TYPES(Impl);
  // The sizes below are only valid for this exact combination of dimensions
  static_assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3, "Wrong dimensions");
  static constexpr int Nred      = Nc * Nhs;        // 6
  static constexpr int Nblock    = Nhs;             // 2
  static constexpr int Ndiagonal = Nred;            // 6
  // 15 == Nred * (Nred - 1) / 2, the number of (i, j) pairs with i < j in an Nred x Nred block
  static constexpr int Ntriangle = (Nred - 1) * Nc; // 15
  template<typename vtype> using iImplCloverDiagonal = iScalar<iVector<iVector<vtype, Ndiagonal>, Nblock>>;
  template<typename vtype> using iImplCloverTriangle = iScalar<iVector<iVector<vtype, Ntriangle>, Nblock>>;
  typedef iImplCloverDiagonal<Simd> SiteCloverDiagonal;
  typedef iImplCloverTriangle<Simd> SiteCloverTriangle;
  typedef iSinglet<Simd>            SiteMask;
  typedef Lattice<SiteCloverDiagonal> CloverDiagonalField;
  typedef Lattice<SiteCloverTriangle> CloverTriangleField;
  typedef Lattice<SiteMask>           MaskField;
 };
 // Re-export SiteClover and CloverField of WilsonCloverTypes<Impl> into the scope where the macro is expanded.
 #define INHERIT_CLOVER_TYPES(Impl)                                 \
  typedef typename WilsonCloverTypes<Impl>::SiteClover SiteClover; \
  typedef typename WilsonCloverTypes<Impl>::CloverField CloverField;
 // Re-export the site, field and mask types of CompactWilsonCloverTypes<Impl> into the scope where
 // the macro is expanded, and re-declare the two alias templates (they cannot be typedef'ed).
 #define INHERIT_COMPACT_CLOVER_TYPES(Impl) \
  typedef typename CompactWilsonCloverTypes<Impl>::SiteCloverDiagonal  SiteCloverDiagonal; \
  typedef typename CompactWilsonCloverTypes<Impl>::SiteCloverTriangle  SiteCloverTriangle; \
  typedef typename CompactWilsonCloverTypes<Impl>::SiteMask            SiteMask; \
  typedef typename CompactWilsonCloverTypes<Impl>::CloverDiagonalField CloverDiagonalField; \
  typedef typename CompactWilsonCloverTypes<Impl>::CloverTriangleField CloverTriangleField; \
  typedef typename CompactWilsonCloverTypes<Impl>::MaskField           MaskField; \
  /* ugly duplication but needed inside functionality classes */ \
  template<typename vtype> using iImplCloverDiagonal = \
    iScalar<iVector<iVector<vtype, CompactWilsonCloverTypes<Impl>::Ndiagonal>, CompactWilsonCloverTypes<Impl>::Nblock>>; \
  template<typename vtype> using iImplCloverTriangle = \
    iScalar<iVector<iVector<vtype, CompactWilsonCloverTypes<Impl>::Ntriangle>, CompactWilsonCloverTypes<Impl>::Nblock>>;
 // Re-export the compile-time sizes of CompactWilsonCloverTypes<Impl> into the scope where the macro is expanded.
 #define INHERIT_COMPACT_CLOVER_SIZES(Impl)                                    \
  static constexpr int Nred      = CompactWilsonCloverTypes<Impl>::Nred;      \
  static constexpr int Nblock    = CompactWilsonCloverTypes<Impl>::Nblock;    \
  static constexpr int Ndiagonal = CompactWilsonCloverTypes<Impl>::Ndiagonal; \
  static constexpr int Ntriangle = CompactWilsonCloverTypes<Impl>::Ntriangle;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -828,6 +828,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 #if (!defined(GRID_HIP))
  int tshift = (mu == Nd-1) ? 1 : 0;
  unsigned int LLt    = GridDefaultLatt()[Tp];
  ////////////////////////////////////////////////
  // GENERAL CAYLEY CASE
  ////////////////////////////////////////////////
@@ -880,7 +881,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  }
  std::vector<RealD> G_s(Ls,1.0);
-  RealD sign = 1; // sign flip for vector/tadpole
+  RealD sign = 1.0; // sign flip for vector/tadpole
  if ( curr_type == Current::Axial ) {
    for(int s=0;s<Ls/2;s++){
      G_s[s] = -1.0;
@@ -890,7 +891,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
    auto b=this->_b;
    auto c=this->_c;
    if ( b == 1 && c == 0 ) {
-      sign = -1;    
+      sign = -1.0;    
    }
    else {
      std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl;
@@ -934,7 +935,13 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
    tmp    = Cshift(tmp,mu,-1);
    Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
    tmp = -G_s[s]*( Utmp + gmu*Utmp );
-    tmp    = where((lcoor>=tmin+tshift),tmp,zz); // Mask the time 
+    // Mask the time
    if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
      unsigned int t0 = 0;
      tmp    = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
    } else {
      tmp    = where((lcoor>=tmin+tshift),tmp,zz);
    }
    L_Q   += where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
    InsertSlice(L_Q, q_out, s , 0);
--- a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
@@ -0,0 +1,363 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermionImplementation.h
    Copyright (C) 2017 - 2022
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
 NAMESPACE_BEGIN(Grid);
 // Construct the compact clover operator.
 //
 // _Umu              - gauge field, imported via ImportGauge
 // Fgrid / Hgrid     - full and red-black grids; the full-lattice clover fields, Tmp and
 //                     BoundaryMask live on Fgrid, the Even/Odd variants on Hgrid
 // _mass             - forwarded to the Wilson base class
 // _csw_r, _csw_t    - clover coefficients; stored halved (csw_r additionally divided
 //                     by xi_0 when the anisotropy is enabled)
 // _cF               - stored as cF and passed to ModifyBoundaries in ImportGauge
 // clover_anisotropy - anisotropy parameters, also forwarded to the base class
 // impl_p            - implementation parameters; open boundaries are enabled when the
 //                     boundary phase of the last direction compares equal to 0.0
 template<class Impl>
 CompactWilsonCloverFermion<Impl>::CompactWilsonCloverFermion(GaugeField& _Umu,
                                                             GridCartesian& Fgrid,
                                                             GridRedBlackCartesian& Hgrid,
                                                             const RealD _mass,
                                                             const RealD _csw_r,
                                                             const RealD _csw_t,
                                                             const RealD _cF,
                                                             const WilsonAnisotropyCoefficients& clover_anisotropy,
                                                             const ImplParams& impl_p)
  : WilsonBase(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
  , csw_r(_csw_r)
  , csw_t(_csw_t)
  , cF(_cF)
  , open_boundaries(impl_p.boundary_phases[Nd-1] == 0.0)
  , Diagonal(&Fgrid),        Triangle(&Fgrid)
  , DiagonalEven(&Hgrid),    TriangleEven(&Hgrid)
  , DiagonalOdd(&Hgrid),     TriangleOdd(&Hgrid)
  , DiagonalInv(&Fgrid),     TriangleInv(&Fgrid)
  , DiagonalInvEven(&Hgrid), TriangleInvEven(&Hgrid)
  , DiagonalInvOdd(&Hgrid),  TriangleInvOdd(&Hgrid)
  , Tmp(&Fgrid)
  , BoundaryMask(&Fgrid)
  , BoundaryMaskEven(&Hgrid), BoundaryMaskOdd(&Hgrid)
 {
  // The coefficients must be rescaled before ImportGauge, which builds the clover term from them
  csw_r *= 0.5;
  csw_t *= 0.5;
  if (clover_anisotropy.isAnisotropic)
    csw_r /= clover_anisotropy.xi_0;
  ImportGauge(_Umu);
  // The masks are only set up when open boundaries are in use
  if (open_boundaries)
    CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
 }
 // Hopping term of the Wilson base class, followed by the boundary mask when open boundaries are enabled.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::Dhop(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::Dhop(in, out, dag);
  if(open_boundaries) ApplyBoundaryMask(out);
 }
 // DhopOE of the Wilson base class, followed by the boundary mask when open boundaries are enabled.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::DhopOE(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::DhopOE(in, out, dag);
  if(open_boundaries) ApplyBoundaryMask(out);
 }
 // DhopEO of the Wilson base class, followed by the boundary mask when open boundaries are enabled.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::DhopEO(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::DhopEO(in, out, dag);
  if(open_boundaries) ApplyBoundaryMask(out);
 }
 // DhopDir of the Wilson base class for (dir, disp), followed by the boundary mask when open boundaries are enabled.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
  WilsonBase::DhopDir(in, out, dir, disp);
  if(this->open_boundaries) ApplyBoundaryMask(out);
 }
 // DhopDirAll of the Wilson base class; with open boundaries the mask is applied to every output field.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
  WilsonBase::DhopDirAll(in, out);
  if(this->open_boundaries) {
    for(auto& o : out) ApplyBoundaryMask(o);
  }
 }
 // Full operator: out = Dhop(in) + Mooee(in), masked once at the end when open boundaries are enabled.
 // Uses the member field Tmp as scratch space.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::M(const FermionField& in, FermionField& out) {
  out.Checkerboard() = in.Checkerboard();
  WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
  Mooee(in, Tmp);
  axpy(out, 1.0, out, Tmp); // out = 1.0 * out + Tmp
  if(open_boundaries) ApplyBoundaryMask(out);
 }
 // Daggered full operator: out = Dhop^dag(in) + MooeeDag(in), masked once at the end when open
 // boundaries are enabled. Uses the member field Tmp as scratch space.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::Mdag(const FermionField& in, FermionField& out) {
  out.Checkerboard() = in.Checkerboard();
  WilsonBase::Dhop(in, out, DaggerYes);  // call base to save applying bc
  MooeeDag(in, Tmp);
  axpy(out, 1.0, out, Tmp); // out = 1.0 * out + Tmp
  if(open_boundaries) ApplyBoundaryMask(out);
 }
 // Meooe of the Wilson base class, followed by the boundary mask when open boundaries are enabled.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::Meooe(const FermionField& in, FermionField& out) {
  WilsonBase::Meooe(in, out);
  if(open_boundaries) ApplyBoundaryMask(out);
 }
 // MeooeDag of the Wilson base class, followed by the boundary mask when open boundaries are enabled.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::MeooeDag(const FermionField& in, FermionField& out) {
  WilsonBase::MeooeDag(in, out);
  if(open_boundaries) ApplyBoundaryMask(out);
 }
 // Apply the clover term to `in`. The stored term is chosen to match the input:
 // the full-lattice fields for a non-checkerboarded grid, otherwise the Odd or
 // Even fields according to the checkerboard of `in`.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::Mooee(const FermionField& in, FermionField& out) {
  const bool onFullGrid = !in.Grid()->_isCheckerBoarded;
  // The checkerboard is only queried for checkerboarded inputs
  const bool onOddSites = !onFullGrid && in.Checkerboard() == Odd;

  const auto& diag = onFullGrid ? Diagonal : (onOddSites ? DiagonalOdd : DiagonalEven);
  const auto& tri  = onFullGrid ? Triangle : (onOddSites ? TriangleOdd : TriangleEven);

  MooeeInternal(in, out, diag, tri);

  if(open_boundaries) {
    ApplyBoundaryMask(out);
  }
 }
 // Daggered clover term: forwards to Mooee.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::MooeeDag(const FermionField& in, FermionField& out) {
  Mooee(in, out); // blocks are hermitian
 }
 // Apply the inverse clover term to `in`. The stored inverse is chosen to match
 // the input: the full-lattice fields for a non-checkerboarded grid, otherwise
 // the Odd or Even fields according to the checkerboard of `in`.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::MooeeInv(const FermionField& in, FermionField& out) {
  const bool onFullGrid = !in.Grid()->_isCheckerBoarded;
  // The checkerboard is only queried for checkerboarded inputs
  const bool onOddSites = !onFullGrid && in.Checkerboard() == Odd;

  const auto& diagInv = onFullGrid ? DiagonalInv : (onOddSites ? DiagonalInvOdd : DiagonalInvEven);
  const auto& triInv  = onFullGrid ? TriangleInv : (onOddSites ? TriangleInvOdd : TriangleInvEven);

  MooeeInternal(in, out, diagInv, triInv);

  if(open_boundaries) {
    ApplyBoundaryMask(out);
  }
 }
 // Daggered inverse clover term: forwards to MooeeInv.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::MooeeInvDag(const FermionField& in, FermionField& out) {
  MooeeInv(in, out); // blocks are hermitian
 }
 // Single-direction operator: forwards to DhopDir (which applies the boundary mask if needed).
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
  DhopDir(in, out, dir, disp);
 }
 // All-directions operator: forwards to DhopDirAll (which applies the boundary mask if needed).
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
  DhopDirAll(in, out);
 }
 // Derivative of the operator with respect to the gauge field.
 //
 // force - output; overwritten with the Wilson hopping-term derivative plus the
 //         clover-term derivative
 // X, Y  - fermion fields entering the outer product
 // dag   - forwarded to DhopDeriv
 //
 // Not available with open boundaries (asserted).
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
  assert(!open_boundaries); // TODO check for changes required for open bc

  // NOTE: code copied from original clover term
  conformable(X.Grid(), Y.Grid());
  conformable(X.Grid(), force.Grid());
  GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
  GaugeField clover_force(force.Grid());
  PropagatorField Lambda(force.Grid());

  // Guido: Here we are hitting some performance issues:
  // need to extract the components of the DoubledGaugeField
  // for each call
  // Possible solution
  // Create a vector object to store them? (cons: wasting space)
  std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
  Impl::extractLinkField(U, this->Umu);

  force = Zero();
  // Derivative of the Wilson hopping term
  this->DhopDeriv(force, X, Y, dag);

  ///////////////////////////////////////////////////////////
  // Clover term derivative
  ///////////////////////////////////////////////////////////
  Impl::outerProductImpl(Lambda, X, Y);
  //std::cout << "Lambda:" << Lambda << std::endl;

  Gamma::Algebra sigma[] = {
      Gamma::Algebra::SigmaXY,
      Gamma::Algebra::SigmaXZ,
      Gamma::Algebra::SigmaXT,
      Gamma::Algebra::MinusSigmaXY,
      Gamma::Algebra::SigmaYZ,
      Gamma::Algebra::SigmaYT,
      Gamma::Algebra::MinusSigmaXZ,
      Gamma::Algebra::MinusSigmaYZ,
      Gamma::Algebra::SigmaZT,
      Gamma::Algebra::MinusSigmaXT,
      Gamma::Algebra::MinusSigmaYT,
      Gamma::Algebra::MinusSigmaZT};

  /*
    sigma_{\mu \nu}=
    | 0         sigma[0]  sigma[1]  sigma[2] |
    | sigma[3]    0       sigma[4]  sigma[5] |
    | sigma[6]  sigma[7]     0      sigma[8] |
    | sigma[9]  sigma[10] sigma[11]   0      |
  */

  int count = 0; // running index into sigma[], advanced once per (mu, nu) pair with mu != nu
  clover_force = Zero();
  for (int mu = 0; mu < 4; mu++)
  {
    force_mu = Zero();
    for (int nu = 0; nu < 4; nu++)
    {
      if (mu == nu)
        continue;

      // Planes containing the time direction carry csw_t, purely spatial planes csw_r,
      // matching how ImportGauge builds the clover term. The direction indices run
      // 0..3, so the time direction is Tdir (the former comparison against 4 could
      // never be true and csw_t was silently ignored).
      RealD factor;
      if (nu == Tdir || mu == Tdir)
      {
        factor = 2.0 * csw_t;
      }
      else
      {
        factor = 2.0 * csw_r;
      }

      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
      force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu);   // checked
      count++;
    }

    pokeLorentz(clover_force, U[mu] * force_mu, mu);
  }
  //clover_force *= csw;
  force += clover_force;
 }
 // Not implemented: always trips assert(0). With assertions compiled out the call does nothing.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
  assert(0);
 }
 // Not implemented: always trips assert(0). With assertions compiled out the call does nothing.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
  assert(0);
 }
 // Common back end of Mooee / MooeeInv: checks the inputs and hands them to
 // CompactHelpers::MooeeKernel together with the given compact clover fields.
 //
 // in                 - input field; its checkerboard must be Odd or Even
 // out                - output field; receives the checkerboard of `in`
 // diagonal, triangle - compact clover term (or its inverse), conformable with `in`
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::MooeeInternal(const FermionField&        in,
                    FermionField&              out,
                    const CloverDiagonalField& diagonal,
                    const CloverTriangleField& triangle) {
  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
  out.Checkerboard() = in.Checkerboard();
  conformable(in, out);
  conformable(in, diagonal);
  conformable(in, triangle);
  CompactHelpers::MooeeKernel(diagonal.oSites(), 1, in, out, diagonal, triangle);
 }
 // Import a gauge field and rebuild every clover field derived from it.
 //
 // Steps: import into the Wilson base class, compute the six field-strength
 // components, assemble the clover term in the full layout (including diag_mass),
 // convert it to the compact layout, optionally modify the temporal boundaries,
 // invert it, and finally split term and inverse into Even/Odd checkerboards.
 template<class Impl>
 void CompactWilsonCloverFermion<Impl>::ImportGauge(const GaugeField& _Umu) {
  // NOTE: parts copied from original implementation
  // Import gauge into base class
  double t0 = usecond();
  WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that
  // Initialize temporary variables
  double t1 = usecond();
  conformable(_Umu.Grid(), this->GaugeGrid());
  GridBase* grid = _Umu.Grid();
  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
  CloverField TmpOriginal(grid);
  // Compute the field strength terms mu>nu
  double t2 = usecond();
  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
  WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
  WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
  // Compute the Clover Operator acting on Colour and Spin
  // multiply here by the clover coefficients for the anisotropy
  // (spatial planes use csw_r, planes containing the time direction use csw_t)
  double t3 = usecond();
  TmpOriginal  = Helpers::fillCloverYZ(Bx) * csw_r;
  TmpOriginal += Helpers::fillCloverXZ(By) * csw_r;
  TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r;
  TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
  TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
  TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
  TmpOriginal += this->diag_mass;
  // Convert the data layout of the clover term
  double t4 = usecond();
  CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
  // Possibly modify the boundary values (must happen before the inversion below)
  double t5 = usecond();
  if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
  // Invert the clover term in the improved layout
  double t6 = usecond();
  CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv);
  // Fill the remaining clover fields
  double t7 = usecond();
  pickCheckerboard(Even, DiagonalEven,    Diagonal);
  pickCheckerboard(Even, TriangleEven,    Triangle);
  pickCheckerboard(Odd,  DiagonalOdd,     Diagonal);
  pickCheckerboard(Odd,  TriangleOdd,     Triangle);
  pickCheckerboard(Even, DiagonalInvEven, DiagonalInv);
  pickCheckerboard(Even, TriangleInvEven, TriangleInv);
  pickCheckerboard(Odd,  DiagonalInvOdd,  DiagonalInv);
  pickCheckerboard(Odd,  TriangleInvOdd,  TriangleInv);
  // Report timings (t0..t8 are only consumed by the disabled block below)
  double t8 = usecond();
 #if 0
  std::cout << GridLogMessage << "CompactWilsonCloverFermion::ImportGauge timings:"
            << " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6
            << ", allocations = "               << (t2 - t1) / 1e6
            << ", field strength = "            << (t3 - t2) / 1e6
            << ", fill clover = "               << (t4 - t3) / 1e6
            << ", convert = "                   << (t5 - t4) / 1e6
            << ", boundaries = "                << (t6 - t5) / 1e6
            << ", inversions = "                << (t7 - t6) / 1e6
            << ", pick cbs = "                  << (t8 - t7) / 1e6
            << ", total = "                     << (t8 - t0) / 1e6
            << std::endl;
 #endif
 }
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
@@ -2,12 +2,13 @@
    Grid physics library, www.github.com/paboyle/Grid
-    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
+    Source file: ./lib/qcd/action/fermion/WilsonCloverFermionImplementation.h
-    Copyright (C) 2017
+    Copyright (C) 2017 - 2022
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -33,6 +34,45 @@
 NAMESPACE_BEGIN(Grid);
 // Construct the (non-compact) Wilson clover operator.
 //
 // The clover coefficients are stored halved; with anisotropy enabled csw_r is
 // additionally divided by xi_0 and diag_mass is built from nu / xi_0.
 // A warning is logged when either stored coefficient is zero. The gauge field
 // is imported at the end, after the coefficients and diag_mass are set.
 template<class Impl>
 WilsonCloverFermion<Impl>::WilsonCloverFermion(GaugeField&                         _Umu,
                                               GridCartesian&                      Fgrid,
                                               GridRedBlackCartesian&              Hgrid,
                                               const RealD                         _mass,
                                               const RealD                         _csw_r,
                                               const RealD                         _csw_t,
                                               const WilsonAnisotropyCoefficients& clover_anisotropy,
                                               const ImplParams&                   impl_p)
  : WilsonFermion<Impl>(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
  , CloverTerm(&Fgrid)
  , CloverTermInv(&Fgrid)
  , CloverTermEven(&Hgrid)
  , CloverTermOdd(&Hgrid)
  , CloverTermInvEven(&Hgrid)
  , CloverTermInvOdd(&Hgrid)
  , CloverTermDagEven(&Hgrid)
  , CloverTermDagOdd(&Hgrid)
  , CloverTermInvDagEven(&Hgrid)
  , CloverTermInvDagOdd(&Hgrid) {
  assert(Nd == 4); // require 4 dimensions
  if(clover_anisotropy.isAnisotropic) {
    csw_r     = _csw_r * 0.5 / clover_anisotropy.xi_0;
    diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
  } else {
    csw_r     = _csw_r * 0.5;
    diag_mass = 4.0 + _mass;
  }
  csw_t = _csw_t * 0.5;
  // A zero coefficient is allowed but reported
  if(csw_r == 0)
    std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
  if(csw_t == 0)
    std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
  ImportGauge(_Umu);
 }
 // *NOT* EO
 template <class Impl>
 void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
@@ -67,10 +107,13 @@ void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
 template <class Impl>
 void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
 {
  double t0 = usecond();
  WilsonFermion<Impl>::ImportGauge(_Umu);
  double t1 = usecond();
  GridBase *grid = _Umu.Grid();
  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
  double t2 = usecond();
  // Compute the field strength terms mu>nu
  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
@@ -79,19 +122,22 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
  double t3 = usecond();
  // Compute the Clover Operator acting on Colour and Spin
  // multiply here by the clover coefficients for the anisotropy
-  CloverTerm  = fillCloverYZ(Bx) * csw_r;
+  CloverTerm  = Helpers::fillCloverYZ(Bx) * csw_r;
-  CloverTerm += fillCloverXZ(By) * csw_r;
+  CloverTerm += Helpers::fillCloverXZ(By) * csw_r;
-  CloverTerm += fillCloverXY(Bz) * csw_r;
+  CloverTerm += Helpers::fillCloverXY(Bz) * csw_r;
-  CloverTerm += fillCloverXT(Ex) * csw_t;
+  CloverTerm += Helpers::fillCloverXT(Ex) * csw_t;
-  CloverTerm += fillCloverYT(Ey) * csw_t;
+  CloverTerm += Helpers::fillCloverYT(Ey) * csw_t;
-  CloverTerm += fillCloverZT(Ez) * csw_t;
+  CloverTerm += Helpers::fillCloverZT(Ez) * csw_t;
  CloverTerm += diag_mass;
  double t4 = usecond();
  int lvol = _Umu.Grid()->lSites();
  int DimRep = Impl::Dimension;
  double t5 = usecond();
  {
    autoView(CTv,CloverTerm,CpuRead);
    autoView(CTIv,CloverTermInv,CpuWrite);
@@ -100,7 +146,7 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
      grid->LocalIndexToLocalCoor(site, lcoor);
      Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
      Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
-      typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
+      typename SiteClover::scalar_object Qx = Zero(), Qxinv = Zero();
      peekLocalSite(Qx, CTv, lcoor);
      //if (csw!=0){
      for (int j = 0; j < Ns; j++)
@@ -125,6 +171,7 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
    });
  }
  double t6 = usecond();
  // Separate the even and odd parts
  pickCheckerboard(Even, CloverTermEven, CloverTerm);
  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
@@ -137,6 +184,20 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
  double t7 = usecond();
 #if 0
  std::cout << GridLogMessage << "WilsonCloverFermion::ImportGauge timings:"
            << " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6
            << ", allocations = "               << (t2 - t1) / 1e6
            << ", field strength = "            << (t3 - t2) / 1e6
            << ", fill clover = "               << (t4 - t3) / 1e6
            << ", misc = "                      << (t5 - t4) / 1e6
            << ", inversions = "                << (t6 - t5) / 1e6
            << ", pick cbs = "                  << (t7 - t6) / 1e6
            << ", total = "                     << (t7 - t0) / 1e6
            << std::endl;
 #endif
 }
 template <class Impl>
@@ -167,7 +228,7 @@ template <class Impl>
 void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
 {
  out.Checkerboard() = in.Checkerboard();
-  CloverFieldType *Clover;
+  CloverField *Clover;
  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
  if (dag)
@@ -182,12 +243,12 @@ void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionFie
      {
        Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
      }
-      out = *Clover * in;
+      Helpers::multCloverField(out, *Clover, in);
    }
    else
    {
      Clover = (inv) ? &CloverTermInv : &CloverTerm;
-      out = adj(*Clover) * in;
+      Helpers::multCloverField(out, *Clover, in); // don't bother with adj, hermitian anyway
    }
  }
  else
@@ -205,18 +266,98 @@ void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionFie
        //  std::cout << "Calling clover term Even" << std::endl;
        Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
      }
-      out = *Clover * in;
+      Helpers::multCloverField(out, *Clover, in);
      //  std::cout << GridLogMessage << "*Clover.Checkerboard() "  << (*Clover).Checkerboard() << std::endl;
    }
    else
    {
      Clover = (inv) ? &CloverTermInv : &CloverTerm;
-      out = *Clover * in;
+      Helpers::multCloverField(out, *Clover, in);
    }
  }
 } // MooeeInternal
 // Derivative parts unpreconditioned pseudofermions
 //
 // Overwrites `force` with the sum of
 //   (a) the Wilson hopping-term derivative, obtained from DhopDeriv(force, X, Y, dag), and
 //   (b) the clover-term derivative, accumulated plane by plane from the spin trace of
 //       sigma_{mu nu} applied to Lambda = outerProductImpl(X, Y).
 // X, Y and force must be conformable (checked below); `dag` is only forwarded to DhopDeriv.
 template <class Impl>
 void WilsonCloverFermion<Impl>::MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
 {
  conformable(X.Grid(), Y.Grid());
  conformable(X.Grid(), force.Grid());
  GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
  GaugeField clover_force(force.Grid());
  PropagatorField Lambda(force.Grid());
  // Guido: Here we are hitting some performance issues:
  // need to extract the components of the DoubledGaugeField
  // for each call
  // Possible solution
  // Create a vector object to store them? (cons: wasting space)
  std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
  Impl::extractLinkField(U, this->Umu);
  force = Zero();
  // Derivative of the Wilson hopping term
  this->DhopDeriv(force, X, Y, dag);
  ///////////////////////////////////////////////////////////
  // Clover term derivative
  ///////////////////////////////////////////////////////////
  Impl::outerProductImpl(Lambda, X, Y);
  //std::cout << "Lambda:" << Lambda << std::endl;
  // Row-major list of the off-diagonal sigma_{mu nu}; it is walked with a running
  // counter that advances only when mu != nu (see the table below).
  Gamma::Algebra sigma[] = {
      Gamma::Algebra::SigmaXY,
      Gamma::Algebra::SigmaXZ,
      Gamma::Algebra::SigmaXT,
      Gamma::Algebra::MinusSigmaXY,
      Gamma::Algebra::SigmaYZ,
      Gamma::Algebra::SigmaYT,
      Gamma::Algebra::MinusSigmaXZ,
      Gamma::Algebra::MinusSigmaYZ,
      Gamma::Algebra::SigmaZT,
      Gamma::Algebra::MinusSigmaXT,
      Gamma::Algebra::MinusSigmaYT,
      Gamma::Algebra::MinusSigmaZT};
  /*
    sigma_{\mu \nu}=
    | 0         sigma[0]  sigma[1]  sigma[2] |
    | sigma[3]    0       sigma[4]  sigma[5] |
    | sigma[6]  sigma[7]     0      sigma[8] |
    | sigma[9]  sigma[10] sigma[11]   0      |
  */
  int count = 0;
  clover_force = Zero();
  for (int mu = 0; mu < 4; mu++)
  {
    force_mu = Zero();
    for (int nu = 0; nu < 4; nu++)
    {
      if (mu == nu)
        continue;
      // Planes that contain the time direction take the temporal coefficient, as in
      // ImportGauge where the Tdir field-strength terms are scaled by csw_t.
      // mu and nu only run over 0..3, so the previous comparison against 4 could
      // never select csw_t.
      RealD factor;
      if (nu == Tdir || mu == Tdir)
      {
        factor = 2.0 * csw_t;
      }
      else
      {
        factor = 2.0 * csw_r;
      }
      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
      force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu);                   // checked
      count++;
    }
    pokeLorentz(clover_force, U[mu] * force_mu, mu);
  }
  //clover_force *= csw;
  force += clover_force;
 }
 // Derivative parts
 template <class Impl>
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h
@@ -77,23 +77,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define REGISTER
 #ifdef GRID_SIMT
-#define LOAD_CHIMU(ptype)		\
+#define LOAD_CHIMU(Ptype)		\
  {const SiteSpinor & ref (in[offset]);	\
-    Chimu_00=coalescedReadPermute<ptype>(ref()(0)(0),perm,lane);	\
+    Chimu_00=coalescedReadPermute<Ptype>(ref()(0)(0),perm,lane);	\
-    Chimu_01=coalescedReadPermute<ptype>(ref()(0)(1),perm,lane);		\
+    Chimu_01=coalescedReadPermute<Ptype>(ref()(0)(1),perm,lane);		\
-    Chimu_02=coalescedReadPermute<ptype>(ref()(0)(2),perm,lane);		\
+    Chimu_02=coalescedReadPermute<Ptype>(ref()(0)(2),perm,lane);		\
-    Chimu_10=coalescedReadPermute<ptype>(ref()(1)(0),perm,lane);		\
+    Chimu_10=coalescedReadPermute<Ptype>(ref()(1)(0),perm,lane);		\
-    Chimu_11=coalescedReadPermute<ptype>(ref()(1)(1),perm,lane);		\
+    Chimu_11=coalescedReadPermute<Ptype>(ref()(1)(1),perm,lane);		\
-    Chimu_12=coalescedReadPermute<ptype>(ref()(1)(2),perm,lane);		\
+    Chimu_12=coalescedReadPermute<Ptype>(ref()(1)(2),perm,lane);		\
-    Chimu_20=coalescedReadPermute<ptype>(ref()(2)(0),perm,lane);		\
+    Chimu_20=coalescedReadPermute<Ptype>(ref()(2)(0),perm,lane);		\
-    Chimu_21=coalescedReadPermute<ptype>(ref()(2)(1),perm,lane);		\
+    Chimu_21=coalescedReadPermute<Ptype>(ref()(2)(1),perm,lane);		\
-    Chimu_22=coalescedReadPermute<ptype>(ref()(2)(2),perm,lane);		\
+    Chimu_22=coalescedReadPermute<Ptype>(ref()(2)(2),perm,lane);		\
-    Chimu_30=coalescedReadPermute<ptype>(ref()(3)(0),perm,lane);		\
+    Chimu_30=coalescedReadPermute<Ptype>(ref()(3)(0),perm,lane);		\
-    Chimu_31=coalescedReadPermute<ptype>(ref()(3)(1),perm,lane);		\
+    Chimu_31=coalescedReadPermute<Ptype>(ref()(3)(1),perm,lane);		\
-    Chimu_32=coalescedReadPermute<ptype>(ref()(3)(2),perm,lane);	}
+    Chimu_32=coalescedReadPermute<Ptype>(ref()(3)(2),perm,lane);	}
 #define PERMUTE_DIR(dir) ;
 #else
-#define LOAD_CHIMU(ptype)		\
+#define LOAD_CHIMU(Ptype)		\
  {const SiteSpinor & ref (in[offset]);	\
    Chimu_00=ref()(0)(0);\
    Chimu_01=ref()(0)(1);\
@@ -109,12 +109,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    Chimu_32=ref()(3)(2);}
 #define PERMUTE_DIR(dir)			\
-  permute##dir(Chi_00,Chi_00);	\
+  permute##dir(Chi_00,Chi_00);			\
-      permute##dir(Chi_01,Chi_01);\
+  permute##dir(Chi_01,Chi_01);			\
-      permute##dir(Chi_02,Chi_02);\
+  permute##dir(Chi_02,Chi_02);			\
-      permute##dir(Chi_10,Chi_10);	\
+  permute##dir(Chi_10,Chi_10);			\
-      permute##dir(Chi_11,Chi_11);\
+  permute##dir(Chi_11,Chi_11);			\
-      permute##dir(Chi_12,Chi_12);
+  permute##dir(Chi_12,Chi_12);
 #endif
@@ -371,88 +371,91 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  result_32-= UChi_12;
 #define HAND_STENCIL_LEGB(PROJ,PERM,DIR,RECON)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
+  {int ptype;					\
-  offset = SE->_offset;				\
+   SE=st.GetEntry(ptype,DIR,ss);		\
-  local  = SE->_is_local;			\
+   auto offset = SE->_offset;			\
-  perm   = SE->_permute;			\
+   auto local  = SE->_is_local;			\
-  if ( local ) {				\
+   auto perm   = SE->_permute;			\
-    LOAD_CHIMU(PERM);				\
+   if ( local ) {				\
-    PROJ;					\
+     LOAD_CHIMU(PERM);				\
-    if ( perm) {				\
+     PROJ;					\
-      PERMUTE_DIR(PERM);			\
+     if ( perm) {				\
-    }						\
+       PERMUTE_DIR(PERM);			\
-  } else {					\
+     }						\
-    LOAD_CHI;					\
+   } else {					\
-  }						\
+     LOAD_CHI;					\
-  acceleratorSynchronise();			\
+   }						\
-  MULT_2SPIN(DIR);				\
+   acceleratorSynchronise();			\
-  RECON;					
+   MULT_2SPIN(DIR);				\
   RECON;					}
-#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)	\
+#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)		\
-  SE=&st_p[DIR+8*ss];				\
+  { SE=&st_p[DIR+8*ss];						\
-  ptype=st_perm[DIR];				\
+  auto ptype=st_perm[DIR];					\
-  offset = SE->_offset;				\
+  auto offset = SE->_offset;					\
-  local  = SE->_is_local;			\
+  auto local  = SE->_is_local;					\
-  perm   = SE->_permute;			\
+  auto perm   = SE->_permute;					\
-  if ( local ) {				\
+  if ( local ) {						\
-    LOAD_CHIMU(PERM);				\
+    LOAD_CHIMU(PERM);						\
-    PROJ;					\
+    PROJ;							\
-    if ( perm) {				\
+    if ( perm) {						\
-      PERMUTE_DIR(PERM);			\
+      PERMUTE_DIR(PERM);					\
-    }						\
+    }								\
-  } else {					\
+  } else {							\
-    LOAD_CHI;					\
+    LOAD_CHI;							\
-  }						\
+  }								\
-  acceleratorSynchronise();			\
+  acceleratorSynchronise();					\
-  MULT_2SPIN(DIR);				\
+  MULT_2SPIN(DIR);						\
-  RECON;					
+  RECON;					}
 #define HAND_STENCIL_LEGA(PROJ,PERM,DIR,RECON)				\
-  SE=&st_p[DIR+8*ss];							\
+  { SE=&st_p[DIR+8*ss];							\
-  ptype=st_perm[DIR];							\
+    auto ptype=st_perm[DIR];						\
- /*SE=st.GetEntry(ptype,DIR,ss);*/					\
+    /*SE=st.GetEntry(ptype,DIR,ss);*/					\
-  offset = SE->_offset;				\
+    auto offset = SE->_offset;						\
-  perm   = SE->_permute;			\
+    auto perm   = SE->_permute;						\
-  LOAD_CHIMU(PERM);				\
+    LOAD_CHIMU(PERM);							\
-  PROJ;						\
+    PROJ;								\
-  MULT_2SPIN(DIR);				\
+    MULT_2SPIN(DIR);							\
-  RECON;					
+    RECON;					}
 #define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
+  { int ptype;						\
-  offset = SE->_offset;				\
+  SE=st.GetEntry(ptype,DIR,ss);				\
-  local  = SE->_is_local;			\
+  auto offset = SE->_offset;					\
-  perm   = SE->_permute;			\
+  auto local  = SE->_is_local;					\
-  if ( local ) {				\
+  auto perm   = SE->_permute;					\
-    LOAD_CHIMU(PERM);				\
+  if ( local ) {						\
-    PROJ;					\
+    LOAD_CHIMU(PERM);						\
-    if ( perm) {				\
+    PROJ;							\
-      PERMUTE_DIR(PERM);			\
+    if ( perm) {						\
-    }						\
+      PERMUTE_DIR(PERM);					\
-  } else if ( st.same_node[DIR] ) {		\
+    }								\
-    LOAD_CHI;					\
+  } else if ( st.same_node[DIR] ) {				\
-  }						\
+    LOAD_CHI;							\
-  acceleratorSynchronise();			\
+  }								\
-  if (local || st.same_node[DIR] ) {		\
+  acceleratorSynchronise();					\
-    MULT_2SPIN(DIR);				\
+  if (local || st.same_node[DIR] ) {				\
-    RECON;					\
+    MULT_2SPIN(DIR);						\
-  }						\
+    RECON;							\
-  acceleratorSynchronise();			
+  }								\
  acceleratorSynchronise();			}
 #define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
+  { int ptype;						\
-  offset = SE->_offset;				\
+  SE=st.GetEntry(ptype,DIR,ss);				\
-  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
+  auto offset = SE->_offset;				\
-    LOAD_CHI;					\
+  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {		\
-    MULT_2SPIN(DIR);				\
+    LOAD_CHI;						\
-    RECON;					\
+    MULT_2SPIN(DIR);					\
-    nmu++;					\
+    RECON;						\
-  }						\
+    nmu++;						\
-  acceleratorSynchronise();			
+  }							\
  acceleratorSynchronise();			}
-#define HAND_RESULT(ss)				\
+#define HAND_RESULT(ss)					\
-  {						\
+  {							\
-    SiteSpinor & ref (out[ss]);			\
+    SiteSpinor & ref (out[ss]);				\
    coalescedWrite(ref()(0)(0),result_00,lane);		\
    coalescedWrite(ref()(0)(1),result_01,lane);		\
    coalescedWrite(ref()(0)(2),result_02,lane);		\
@@ -563,7 +566,6 @@ WilsonKernels<Impl>::HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p,
  HAND_DECLARATIONS(Simt);
  int offset,local,perm, ptype;
  StencilEntry *SE;
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
@@ -593,9 +595,7 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
  HAND_DECLARATIONS(Simt);
  int offset,local,perm, ptype;
  StencilEntry *SE;
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
@@ -623,8 +623,6 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
  HAND_DECLARATIONS(Simt);
  StencilEntry *SE;
  int offset,local,perm, ptype;
  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
@@ -640,8 +638,8 @@ template<class Impl>  accelerator_inline void
 WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
+  //  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_perm = st._permute_type;					
 // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@@ -652,7 +650,6 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si
  HAND_DECLARATIONS(Simt);
  int offset,local,perm, ptype;
  StencilEntry *SE;
  ZERO_RESULT;
  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
@@ -670,8 +667,8 @@ template<class Impl> accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
+  //  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_perm = st._permute_type;					
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
@@ -682,7 +679,6 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
  HAND_DECLARATIONS(Simt);
  StencilEntry *SE;
  int offset,local,perm, ptype;
  ZERO_RESULT;
  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
@@ -699,8 +695,8 @@ template<class Impl>  accelerator_inline void
 WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
+  //  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_perm = st._permute_type;					
 // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@@ -711,7 +707,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si
  HAND_DECLARATIONS(Simt);
-  int offset, ptype;
+  //  int offset, ptype;
  StencilEntry *SE;
  int nmu=0;
  ZERO_RESULT;
@@ -730,8 +726,8 @@ template<class Impl>  accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
+  //  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_perm = st._permute_type;					
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
@@ -742,7 +738,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldVi
  HAND_DECLARATIONS(Simt);
  StencilEntry *SE;
-  int offset, ptype;
+  //  int offset, ptype;
  int nmu=0;
  ZERO_RESULT;
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
--- a/Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master
+++ b/Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master
@@ -0,0 +1,41 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/ qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master
    Copyright (C) 2017 - 2022
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
 #include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h>
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class CompactWilsonCloverFermion<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplD/CompactWilsonCloverFermionInstantiationWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD/CompactWilsonCloverFermionInstantiationWilsonImplD.cc
@@ -0,0 +1 @@
 ../CompactWilsonCloverFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplF/CompactWilsonCloverFermionInstantiationWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplF/CompactWilsonCloverFermionInstantiationWilsonImplF.cc
@@ -0,0 +1 @@
 ../CompactWilsonCloverFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh
+++ b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh
@@ -40,7 +40,7 @@ EOF
 done
-CC_LIST="WilsonCloverFermionInstantiation WilsonFermionInstantiation WilsonKernelsInstantiation WilsonTMFermionInstantiation"
+CC_LIST="WilsonCloverFermionInstantiation CompactWilsonCloverFermionInstantiation WilsonFermionInstantiation WilsonKernelsInstantiation WilsonTMFermionInstantiation"
 for impl in $WILSON_IMPL_LIST
 do
--- a/Grid/qcd/action/gauge/GaugeImplementations.h
+++ b/Grid/qcd/action/gauge/GaugeImplementations.h
@@ -69,6 +69,11 @@ public:
    return PeriodicBC::ShiftStaple(Link,mu);
  }
  //Same as Cshift for periodic BCs
  //Forwards Link, mu and shift unchanged to PeriodicBC::CshiftLink and returns its result
  static inline GaugeLinkField CshiftLink(const GaugeLinkField &Link, int mu, int shift){
    return PeriodicBC::CshiftLink(Link,mu,shift);
  }
  static inline bool isPeriodicGaugeField(void) { return true; }
 };
@@ -110,6 +115,11 @@ public:
      return PeriodicBC::CovShiftBackward(Link, mu, field);
  }
  //If mu is a conjugate BC direction
  //Out(x) = U^dag_\mu(x-mu)  | x_\mu != 0
  //       = U^T_\mu(L-1)  | x_\mu == 0
  //else
  //Out(x) = U^dag_\mu(x-mu mod L)
  static inline GaugeLinkField
  CovShiftIdentityBackward(const GaugeLinkField &Link, int mu)
  {
@@ -129,6 +139,13 @@ public:
      return PeriodicBC::CovShiftIdentityForward(Link,mu);
  }
  //If mu is a conjugate BC direction
  //Out(x) = S_\mu(x+mu)  | x_\mu != L-1
  //       = S*_\mu(x+mu)  | x_\mu == L-1
  //else
  //Out(x) = S_\mu(x+mu mod L)
  //Note: While this is used for Staples it is also applicable for shifting gauge links or gauge transformation matrices
  static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu)
  {
    assert(_conjDirs.size() == Nd);
@@ -138,6 +155,27 @@ public:
      return PeriodicBC::ShiftStaple(Link,mu);
  }
  //Boundary-aware C-shift of gauge links / gauge transformation matrices
  //For conjugate BC direction
  //shift = 1
  //Out(x) = U_\mu(x+\hat\mu)  | x_\mu != L-1
  //       = U*_\mu(0)  | x_\mu == L-1
  //shift = -1
  //Out(x) = U_\mu(x-mu)  | x_\mu != 0
  //       = U*_\mu(L-1)  | x_\mu == 0
  //else
  //shift = 1
  //Out(x) = U_\mu(x+\hat\mu mod L)
  //shift = -1
  //Out(x) = U_\mu(x-\hat\mu mod L)
  static inline GaugeLinkField CshiftLink(const GaugeLinkField &Link, int mu, int shift){
    assert(_conjDirs.size() == Nd);
    // Directions not flagged in _conjDirs take the plain periodic shift;
    // flagged directions go through the conjugate-BC shift.
    const bool conjugateDir = (_conjDirs[mu] != 0);
    if(!conjugateDir){
      return PeriodicBC::CshiftLink(Link,mu,shift);
    }
    return ConjugateBC::CshiftLink(Link,mu,shift);
  }
  static inline void       setDirections(std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
  static inline std::vector<int> getDirections(void) { return _conjDirs; }
  static inline bool isPeriodicGaugeField(void) { return false; }
--- a/Grid/qcd/action/pseudofermion/Bounds.h
+++ b/Grid/qcd/action/pseudofermion/Bounds.h
@@ -40,13 +40,66 @@ NAMESPACE_BEGIN(Grid);
      X=X-Y;
      RealD Nd = norm2(X);
      std::cout << "************************* "<<std::endl;
-      std::cout << " noise                         = "<<Nx<<std::endl;
+      std::cout << " | noise |^2                         = "<<Nx<<std::endl;
-      std::cout << " (MdagM^-1/2)^2  noise         = "<<Nz<<std::endl;
+      std::cout << " | (MdagM^-1/2)^2  noise |^2         = "<<Nz<<std::endl;
-      std::cout << " MdagM (MdagM^-1/2)^2  noise   = "<<Ny<<std::endl;
+      std::cout << " | MdagM (MdagM^-1/2)^2  noise |^2   = "<<Ny<<std::endl;
-      std::cout << " noise - MdagM (MdagM^-1/2)^2  noise   = "<<Nd<<std::endl;
+      std::cout << " | noise - MdagM (MdagM^-1/2)^2  noise |^2  = "<<Nd<<std::endl;
      std::cout << " | noise - MdagM (MdagM^-1/2)^2  noise|/|noise| = " << std::sqrt(Nd/Nx) << std::endl;
      std::cout << "************************* "<<std::endl;
      assert( (std::sqrt(Nd/Nx)<tol) && " InverseSqrtBoundsCheck ");
    }
    /* For a HermOp = M^dag M, check the approximation of  HermOp^{-1/inv_pow}
       by computing   |X -    HermOp * [ Hermop^{-1/inv_pow} ]^{inv_pow} X|  < tol  
       for noise X (aka GaussNoise).
       ApproxNegPow should be the rational approximation for   X^{-1/inv_pow}
    */
    template<class Field> void InversePowerBoundsCheck(int inv_pow,
 						       int MaxIter,double tol,
 						       LinearOperatorBase<Field> &HermOp,
 						       Field &GaussNoise,
 						       MultiShiftFunction &ApproxNegPow) 
    {
      GridBase *FermionGrid = GaussNoise.Grid();
      Field X(FermionGrid);
      Field Y(FermionGrid);
      Field Z(FermionGrid);
      Field tmp1(FermionGrid), tmp2(FermionGrid);
      X=GaussNoise;
      RealD Nx = norm2(X);
      ConjugateGradientMultiShift<Field> msCG(MaxIter,ApproxNegPow);
      tmp1 = X;
      Field* in = &tmp1;
      Field* out = &tmp2;
      for(int i=0;i<inv_pow;i++){ //apply  [ Hermop^{-1/inv_pow}  ]^{inv_pow} X =   HermOp^{-1} X
 	msCG(HermOp, *in, *out); //backwards conventions!
 	if(i!=inv_pow-1) std::swap(in, out);
      }
      Z = *out;
      RealD Nz = norm2(Z);
      HermOp.HermOp(Z,Y);
      RealD Ny = norm2(Y);
      X=X-Y;
      RealD Nd = norm2(X);
      std::cout << "************************* "<<std::endl;
      std::cout << " | noise |^2                         = "<<Nx<<std::endl;
      std::cout << " | (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2        = "<<Nz<<std::endl;
      std::cout << " | MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2   = "<<Ny<<std::endl;
      std::cout << " | noise - MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2  = "<<Nd<<std::endl;
      std::cout << " | noise - MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |/| noise |  = "<<std::sqrt(Nd/Nx)<<std::endl;
      std::cout << "************************* "<<std::endl;
      assert( (std::sqrt(Nd/Nx)<tol) && " InversePowerBoundsCheck ");
    }
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h
+++ b/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h
@@ -44,6 +44,10 @@ NAMESPACE_BEGIN(Grid);
  // Exact one flavour implementation of DWF determinant ratio //
  ///////////////////////////////////////////////////////////////
  //Note: using mixed prec CG for the heatbath solver in this action class will not work
  //      because the L, R operators must have their shift coefficients updated throughout the heatbath step
  //      You will find that the heatbath solver simply won't converge.
  //      To use mixed precision here use the ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction variant below
  template<class Impl>
  class ExactOneFlavourRatioPseudoFermionAction : public Action<typename Impl::GaugeField>
  {
@@ -57,37 +61,60 @@ NAMESPACE_BEGIN(Grid);
      bool use_heatbath_forecasting;
      AbstractEOFAFermion<Impl>& Lop; // the basic LH operator
      AbstractEOFAFermion<Impl>& Rop; // the basic RH operator
-      SchurRedBlackDiagMooeeSolve<FermionField> SolverHB;
+      SchurRedBlackDiagMooeeSolve<FermionField> SolverHBL;
      SchurRedBlackDiagMooeeSolve<FermionField> SolverHBR;
      SchurRedBlackDiagMooeeSolve<FermionField> SolverL;
      SchurRedBlackDiagMooeeSolve<FermionField> SolverR;
      SchurRedBlackDiagMooeeSolve<FermionField> DerivativeSolverL;
      SchurRedBlackDiagMooeeSolve<FermionField> DerivativeSolverR;
      FermionField Phi; // the pseudofermion field for this trajectory
      RealD norm2_eta; //|eta|^2 where eta is the random gaussian field used to generate the pseudofermion field
      bool initial_action; //true for the first call to S after refresh, for which the identity S = |eta|^2 holds provided the rational approx is good
    public:
      //Heatbath hook: set the shift coefficient 'to' on the L operator when LorR==0,
      //and on the R operator for any other value of LorR
      virtual void heatbathRefreshShiftCoefficients(int LorR, RealD to){
        if(LorR == 0){
          Lop.RefreshShiftCoefficients(to);
        }else{
          Rop.RefreshShiftCoefficients(to);
        }
      }
      //Use the same solver for L,R in all cases
      ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, 
 					      AbstractEOFAFermion<Impl>& _Rop,
 					      OperatorFunction<FermionField>& CG, 
 					      Params& p, 
 					      bool use_fc=false) 
-	: ExactOneFlavourRatioPseudoFermionAction(_Lop,_Rop,CG,CG,CG,CG,CG,p,use_fc) {};
+	: ExactOneFlavourRatioPseudoFermionAction(_Lop,_Rop,CG,CG,CG,CG,CG,CG,p,use_fc) {};
-	
+
      //Use the same solver for L,R in the heatbath but different solvers elsewhere
      ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, 
 					      AbstractEOFAFermion<Impl>& _Rop,
-					      OperatorFunction<FermionField>& HeatbathCG, 
+					      OperatorFunction<FermionField>& HeatbathCG,
 					      OperatorFunction<FermionField>& ActionCGL, OperatorFunction<FermionField>& ActionCGR, 
 					      OperatorFunction<FermionField>& DerivCGL , OperatorFunction<FermionField>& DerivCGR, 
 					      Params& p, 
 					      bool use_fc=false)
 	: ExactOneFlavourRatioPseudoFermionAction(_Lop,_Rop,HeatbathCG,HeatbathCG, ActionCGL, ActionCGR, DerivCGL,DerivCGR,p,use_fc) {};
      //Use different solvers for L,R in all cases
      ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, 
 					      AbstractEOFAFermion<Impl>& _Rop,
 					      OperatorFunction<FermionField>& HeatbathCGL, OperatorFunction<FermionField>& HeatbathCGR,
 					      OperatorFunction<FermionField>& ActionCGL, OperatorFunction<FermionField>& ActionCGR, 
 					      OperatorFunction<FermionField>& DerivCGL , OperatorFunction<FermionField>& DerivCGR, 
 					      Params& p, 
 					      bool use_fc=false) : 
        Lop(_Lop), 
 	Rop(_Rop), 
-	SolverHB(HeatbathCG,false,true),
+	SolverHBL(HeatbathCGL,false,true), SolverHBR(HeatbathCGR,false,true),
 	SolverL(ActionCGL, false, true), SolverR(ActionCGR, false, true), 
 	DerivativeSolverL(DerivCGL, false, true), DerivativeSolverR(DerivCGR, false, true), 
 	Phi(_Lop.FermionGrid()), 
 	param(p), 
-        use_heatbath_forecasting(use_fc)
+	use_heatbath_forecasting(use_fc),
 	initial_action(false)
      {
        AlgRemez remez(param.lo, param.hi, param.precision);
@@ -97,6 +124,8 @@ NAMESPACE_BEGIN(Grid);
        PowerNegHalf.Init(remez, param.tolerance, true);
      };
      const FermionField &getPhi() const{ return Phi; }
      virtual std::string action_name() { return "ExactOneFlavourRatioPseudoFermionAction"; }
      virtual std::string LogParameters() {
@@ -117,6 +146,19 @@ NAMESPACE_BEGIN(Grid);
        else{ for(int s=0; s<Ls; ++s){ axpby_ssp_pminus(out, 0.0, in, 1.0, in, s, s); } }
      }
      // Heatbath entry point: fills a fermion field eta with noise from pRNG, rescales it
      // by sqrt(0.5), and passes it on to refresh(U, eta).  sRNG is not used in this overload.
      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
        // Target distribution: P(eta_o) = e^{- eta_o^dag eta_o}
        //
        // Matching the Gaussian form e^{-x^2/(2 sig^2)} to it gives sig^2 = 0.5,
        // hence the sqrt(0.5) rescaling below.
        // (assumes gaussian() produces unit-variance noise - TODO confirm)
        RealD scale = std::sqrt(0.5);
        FermionField eta    (Lop.FermionGrid());
        gaussian(pRNG,eta); eta = eta * scale;
 	refresh(U,eta);
      }
      // EOFA heatbath: see Eqn. (29) of arXiv:1706.05843
      // We generate a Gaussian noise vector \eta, and then compute
      //  \Phi = M_{\rm EOFA}^{-1/2} * \eta
@@ -124,12 +166,10 @@ NAMESPACE_BEGIN(Grid);
      //
      // As a check of rational require \Phi^dag M_{EOFA} \Phi == eta^dag M^-1/2^dag M M^-1/2 eta = eta^dag eta
      //
-      virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
+     void refresh(const GaugeField &U, const FermionField &eta) {
      {
        Lop.ImportGauge(U);
        Rop.ImportGauge(U);
        FermionField eta         (Lop.FermionGrid());
        FermionField CG_src      (Lop.FermionGrid());
        FermionField CG_soln     (Lop.FermionGrid());
        FermionField Forecast_src(Lop.FermionGrid());
@@ -140,11 +180,6 @@ NAMESPACE_BEGIN(Grid);
        if(use_heatbath_forecasting){ prev_solns.reserve(param.degree); }
        ChronoForecast<AbstractEOFAFermion<Impl>, FermionField> Forecast;
        // Seed with Gaussian noise vector (var = 0.5)
        RealD scale = std::sqrt(0.5);
        gaussian(pRNG,eta);
        eta = eta * scale;
        // \Phi = ( \alpha_{0} + \sum_{k=1}^{N_{p}} \alpha_{l} * \gamma_{l} ) * \eta
        RealD N(PowerNegHalf.norm);
        for(int k=0; k<param.degree; ++k){ N += PowerNegHalf.residues[k] / ( 1.0 + PowerNegHalf.poles[k] ); }
@@ -160,15 +195,16 @@ NAMESPACE_BEGIN(Grid);
        tmp[1] = Zero();
        for(int k=0; k<param.degree; ++k){
          gamma_l = 1.0 / ( 1.0 + PowerNegHalf.poles[k] );
-          Lop.RefreshShiftCoefficients(-gamma_l);
+          heatbathRefreshShiftCoefficients(0, -gamma_l);
 	  //Lop.RefreshShiftCoefficients(-gamma_l);
          if(use_heatbath_forecasting){ // Forecast CG guess using solutions from previous poles
            Lop.Mdag(CG_src, Forecast_src);
            CG_soln = Forecast(Lop, Forecast_src, prev_solns);
-            SolverHB(Lop, CG_src, CG_soln);
+            SolverHBL(Lop, CG_src, CG_soln);
            prev_solns.push_back(CG_soln);
          } else {
            CG_soln = Zero(); // Just use zero as the initial guess
-            SolverHB(Lop, CG_src, CG_soln);
+	    SolverHBL(Lop, CG_src, CG_soln);
          }
          Lop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
          tmp[1] = tmp[1] + ( PowerNegHalf.residues[k]*gamma_l*gamma_l*Lop.k ) * tmp[0];
@@ -187,15 +223,16 @@ NAMESPACE_BEGIN(Grid);
        if(use_heatbath_forecasting){ prev_solns.clear(); } // empirically, LH solns don't help for RH solves
        for(int k=0; k<param.degree; ++k){
          gamma_l = 1.0 / ( 1.0 + PowerNegHalf.poles[k] );
-          Rop.RefreshShiftCoefficients(-gamma_l*PowerNegHalf.poles[k]);
+	  heatbathRefreshShiftCoefficients(1, -gamma_l*PowerNegHalf.poles[k]);
          //Rop.RefreshShiftCoefficients(-gamma_l*PowerNegHalf.poles[k]);
          if(use_heatbath_forecasting){
            Rop.Mdag(CG_src, Forecast_src);
            CG_soln = Forecast(Rop, Forecast_src, prev_solns);
-            SolverHB(Rop, CG_src, CG_soln);
+            SolverHBR(Rop, CG_src, CG_soln);
            prev_solns.push_back(CG_soln);
          } else {
            CG_soln = Zero();
-            SolverHB(Rop, CG_src, CG_soln);
+            SolverHBR(Rop, CG_src, CG_soln);
          }
          Rop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
          tmp[1] = tmp[1] - ( PowerNegHalf.residues[k]*gamma_l*gamma_l*Rop.k ) * tmp[0];
@@ -205,49 +242,119 @@ NAMESPACE_BEGIN(Grid);
        Phi = Phi + tmp[1];
        // Reset shift coefficients for energy and force evals
-        Lop.RefreshShiftCoefficients(0.0);
+        //Lop.RefreshShiftCoefficients(0.0);
-        Rop.RefreshShiftCoefficients(-1.0);
+        //Rop.RefreshShiftCoefficients(-1.0);
 	heatbathRefreshShiftCoefficients(0, 0.0);
 	heatbathRefreshShiftCoefficients(1, -1.0);
 	//Mark that the next call to S is the first after refresh
 	initial_action = true;
 	// Bounds check
 	RealD EtaDagEta = norm2(eta);
 	norm2_eta = EtaDagEta;
 	//	RealD PhiDagMPhi= norm2(eta);
      };
-      void Meofa(const GaugeField& U,const FermionField &phi, FermionField & Mphi) 
+      void Meofa(const GaugeField& U,const FermionField &in, FermionField & out) 
      {
 #if 0
        Lop.ImportGauge(U);
        Rop.ImportGauge(U);
-        FermionField spProj_Phi(Lop.FermionGrid());
+        FermionField spProj_in(Lop.FermionGrid());
 	FermionField mPhi(Lop.FermionGrid());
        std::vector<FermionField> tmp(2, Lop.FermionGrid());
-	mPhi = phi;
+	out = in;
        // LH term: S = S - k <\Phi| P_{-} \Omega_{-}^{\dagger} H(mf)^{-1} \Omega_{-} P_{-} |\Phi>
-        spProj(Phi, spProj_Phi, -1, Lop.Ls);
+        spProj(in, spProj_in, -1, Lop.Ls);
-        Lop.Omega(spProj_Phi, tmp[0], -1, 0);
+        Lop.Omega(spProj_in, tmp[0], -1, 0);
        G5R5(tmp[1], tmp[0]);
        tmp[0] = Zero();
        SolverL(Lop, tmp[1], tmp[0]);
        Lop.Dtilde(tmp[0], tmp[1]); // We actually solved Cayley preconditioned system: transform back
        Lop.Omega(tmp[1], tmp[0], -1, 1);
-	mPhi = mPhi -  Lop.k * innerProduct(spProj_Phi, tmp[0]).real();
+	spProj(tmp[0], tmp[1], -1, Lop.Ls);
 	out = out -  Lop.k * tmp[1];
        // RH term: S = S + k <\Phi| P_{+} \Omega_{+}^{\dagger} ( H(mb)
-        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{-} P_{-} |\Phi>
+        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} |\Phi>
-        spProj(Phi, spProj_Phi, 1, Rop.Ls);
+        spProj(in, spProj_in, 1, Rop.Ls);
-        Rop.Omega(spProj_Phi, tmp[0], 1, 0);
+        Rop.Omega(spProj_in, tmp[0], 1, 0);
        G5R5(tmp[1], tmp[0]);
        tmp[0] = Zero();
        SolverR(Rop, tmp[1], tmp[0]);
        Rop.Dtilde(tmp[0], tmp[1]);
        Rop.Omega(tmp[1], tmp[0], 1, 1);
-        action += Rop.k * innerProduct(spProj_Phi, tmp[0]).real();
+	spProj(tmp[0], tmp[1], 1, Rop.Ls);
-#endif
+
        out = out + Rop.k * tmp[1];
      }
      //Due to the structure of EOFA, it is no more expensive to compute the inverse of Meofa
      //To ensure correctness we can simply reuse the heatbath code but use the rational approx
      //f(x) = 1/x   which corresponds to alpha_0=0,  alpha_1=1,  beta_1=0 => gamma_1=1
      void MeofaInv(const GaugeField &U, const FermionField &in, FermionField &out) {
        Lop.ImportGauge(U);
        Rop.ImportGauge(U);
        FermionField CG_src      (Lop.FermionGrid());
        FermionField CG_soln     (Lop.FermionGrid());
        std::vector<FermionField> tmp(2, Lop.FermionGrid());
        // \Phi = ( \alpha_{0} + \sum_{k=1}^{N_{p}} \alpha_{l} * \gamma_{l} ) * \eta
 	// = 1 * \eta
        out = in;
        // LH terms:
        // \Phi = \Phi + k \sum_{k=1}^{N_{p}} P_{-} \Omega_{-}^{\dagger} ( H(mf)
        //          - \gamma_{l} \Delta_{-}(mf,mb) P_{-} )^{-1} \Omega_{-} P_{-} \eta
        spProj(in, tmp[0], -1, Lop.Ls);
        Lop.Omega(tmp[0], tmp[1], -1, 0);
        G5R5(CG_src, tmp[1]);
        {
          heatbathRefreshShiftCoefficients(0, -1.); //-gamma_1 = -1.
 	  CG_soln = Zero(); // Just use zero as the initial guess
 	  SolverHBL(Lop, CG_src, CG_soln);
          Lop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
          tmp[1] = Lop.k * tmp[0];
        }
        Lop.Omega(tmp[1], tmp[0], -1, 1);
        spProj(tmp[0], tmp[1], -1, Lop.Ls);
        out = out + tmp[1];
        // RH terms:
        // \Phi = \Phi - k \sum_{k=1}^{N_{p}} P_{+} \Omega_{+}^{\dagger} ( H(mb)
        //          - \beta_l\gamma_{l} \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} \eta
        spProj(in, tmp[0], 1, Rop.Ls);
        Rop.Omega(tmp[0], tmp[1], 1, 0);
        G5R5(CG_src, tmp[1]);
        {
 	  heatbathRefreshShiftCoefficients(1, 0.); //-gamma_1 * beta_1 = 0
 	  CG_soln = Zero();
 	  SolverHBR(Rop, CG_src, CG_soln);
          Rop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
          tmp[1] = - Rop.k * tmp[0];
        }
        Rop.Omega(tmp[1], tmp[0], 1, 1);
        spProj(tmp[0], tmp[1], 1, Rop.Ls);
        out = out + tmp[1];
        // Reset shift coefficients for energy and force evals
 	heatbathRefreshShiftCoefficients(0, 0.0);
 	heatbathRefreshShiftCoefficients(1, -1.0);
      };
      // EOFA action: see Eqn. (10) of arXiv:1706.05843
      virtual RealD S(const GaugeField& U)
      {
@@ -271,7 +378,7 @@ NAMESPACE_BEGIN(Grid);
        action -= Lop.k * innerProduct(spProj_Phi, tmp[0]).real();
        // RH term: S = S + k <\Phi| P_{+} \Omega_{+}^{\dagger} ( H(mb)
-        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{-} P_{-} |\Phi>
+        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} |\Phi>
        spProj(Phi, spProj_Phi, 1, Rop.Ls);
        Rop.Omega(spProj_Phi, tmp[0], 1, 0);
        G5R5(tmp[1], tmp[0]);
@@ -281,6 +388,26 @@ NAMESPACE_BEGIN(Grid);
        Rop.Omega(tmp[1], tmp[0], 1, 1);
        action += Rop.k * innerProduct(spProj_Phi, tmp[0]).real();
 	if(initial_action){
 	  //For the first call to S after refresh,  S = |eta|^2. We can use this to ensure the rational approx is good
 	  RealD diff = action - norm2_eta;
 	  //S_init = eta^dag M^{-1/2} M M^{-1/2} eta
 	  //S_init - eta^dag eta =  eta^dag ( M^{-1/2} M M^{-1/2} - 1 ) eta
 	  //If approximate solution
 	  //S_init - eta^dag eta =  eta^dag ( [M^{-1/2}+\delta M^{-1/2}] M [M^{-1/2}+\delta M^{-1/2}] - 1 ) eta
 	  //               \approx  eta^dag ( \delta M^{-1/2} M^{1/2} + M^{1/2}\delta M^{-1/2} ) eta
 	  // We divide out |eta|^2 to remove source scaling but the tolerance on this check should still be somewhat higher than the actual approx tolerance
 	  RealD test = fabs(diff)/norm2_eta; //test the quality of the rational approx
 	  std::cout << GridLogMessage << action_name() << " initial action " << action << " expect " << norm2_eta << "; diff " << diff << std::endl;
 	  std::cout << GridLogMessage << action_name() << "[ eta^dag ( M^{-1/2} M M^{-1/2} - 1 ) eta ]/|eta^2| = " << test << "  expect 0 (tol " << param.BoundsCheckTol << ")" << std::endl;
 	  assert( ( test < param.BoundsCheckTol ) && " Initial action check failed" );
 	  initial_action = false;
 	}
        return action;
      };
@@ -329,6 +456,40 @@ NAMESPACE_BEGIN(Grid);
      };
  };
  // EOFA pseudofermion action that carries a second pair of operators (LopF / RopF, built on ImplF)
  // alongside the ImplD operators owned by the base class. The only behaviour added here is that
  // heatbath shift-coefficient refreshes are applied to the ImplF operators as well as the ImplD ones.
  // NOTE(review): ImplD / ImplF are presumably the double / single precision implementations - confirm.
  template<class ImplD, class ImplF>
  class ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction : public ExactOneFlavourRatioPseudoFermionAction<ImplD>{
  public:
    INHERIT_IMPL_TYPES(ImplD);
    typedef OneFlavourRationalParams Params;
  private:
    AbstractEOFAFermion<ImplF>& LopF; // the basic LH operator
    AbstractEOFAFermion<ImplF>& RopF; // the basic RH operator
  public:
    virtual std::string action_name() { return "ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction"; }
    //Used in the heatbath, refresh the shift coefficients of the L (LorR=0) or R (LorR=1) operator
    //The ImplF operator is refreshed first, then the call is forwarded to the base class implementation
    virtual void heatbathRefreshShiftCoefficients(int LorR, RealD to){
      AbstractEOFAFermion<ImplF> &op = LorR == 0 ? LopF : RopF;
      op.RefreshShiftCoefficients(to);
      this->ExactOneFlavourRatioPseudoFermionAction<ImplD>::heatbathRefreshShiftCoefficients(LorR,to);
    }
    //_LopF/_RopF : ImplF LH/RH operators, held by reference
    //All remaining arguments are forwarded unchanged to the ExactOneFlavourRatioPseudoFermionAction<ImplD> constructor
    ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction(AbstractEOFAFermion<ImplF>& _LopF, 
 							     AbstractEOFAFermion<ImplF>& _RopF,
 							     AbstractEOFAFermion<ImplD>& _LopD, 
 							     AbstractEOFAFermion<ImplD>& _RopD,
 							     OperatorFunction<FermionField>& HeatbathCGL, OperatorFunction<FermionField>& HeatbathCGR,
 							     OperatorFunction<FermionField>& ActionCGL, OperatorFunction<FermionField>& ActionCGR, 
 							     OperatorFunction<FermionField>& DerivCGL , OperatorFunction<FermionField>& DerivCGR, 
 							     Params& p, 
 							     bool use_fc=false) : 
    //The base class is always initialised before the members, so it is listed first to match the real order
    ExactOneFlavourRatioPseudoFermionAction<ImplD>(_LopD, _RopD, HeatbathCGL, HeatbathCGR, ActionCGL, ActionCGR, DerivCGL, DerivCGR, p, use_fc),
    LopF(_LopF), RopF(_RopF){}
  };
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
@@ -0,0 +1,372 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
    Copyright (C) 2015
    Author: Christopher Kelly <ckelly@bnl.gov>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_H
 #define QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_H
 NAMESPACE_BEGIN(Grid);
    /////////////////////////////////////////////////////////
    // Generic rational approximation for ratios of operators
    /////////////////////////////////////////////////////////
    /* S_f = -log( det(  [M^dag M]/[V^dag V] )^{1/inv_pow}  )
           = chi^dag ( [M^dag M]/[V^dag V] )^{-1/inv_pow} chi\
 	   = chi^dag ( [V^dag V]^{-1/2} [M^dag M] [V^dag V]^{-1/2} )^{-1/inv_pow} chi\
 	   = chi^dag [V^dag V]^{1/(2*inv_pow)} [M^dag M]^{-1/inv_pow} [V^dag V]^{1/(2*inv_pow)} chi\
 	   S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
       BIG WARNING:	   
       Here V^dag V is referred to in this code as the "numerator" operator and M^dag M is the *denominator* operator.
       this refers to their position in the pseudofermion action, which is the *inverse* of what appears in the determinant
       Thus for DWF the numerator operator is the Pauli-Villars operator
       Here P/Q \sim R_{1/(2*inv_pow)}  ~ (V^dagV)^{1/(2*inv_pow)}  
       Here N/D \sim R_{-1/inv_pow} ~ (M^dagM)^{-1/inv_pow}  
    */
    // Pseudofermion action for a rational power of a ratio of operators, acting on the odd checkerboard:
    //   S_f = Phi^dag (V^dag V)^{1/(2*inv_pow)} (M^dag M)^{-1/inv_pow} (V^dag V)^{1/(2*inv_pow)} Phi
    // V is the "numerator" operator (NumOp) and M the "denominator" operator (DenOp).
    // The fractional powers are applied through rational approximations evaluated with multishift CG.
    // Separate approximations are held for the action evaluation and for the MD force.
    template<class Impl>
    class GeneralEvenOddRatioRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
    public:
      INHERIT_IMPL_TYPES(Impl);
      typedef RationalActionParams Params;
      Params param;
      //For action evaluation
      MultiShiftFunction ApproxPowerAction   ;  //rational approx for X^{1/inv_pow}
      MultiShiftFunction ApproxNegPowerAction;  //rational approx for X^{-1/inv_pow}
      MultiShiftFunction ApproxHalfPowerAction;   //rational approx for X^{1/(2*inv_pow)}
      MultiShiftFunction ApproxNegHalfPowerAction; //rational approx for X^{-1/(2*inv_pow)}
      //For the MD integration
      MultiShiftFunction ApproxPowerMD   ;  //rational approx for X^{1/inv_pow}
      MultiShiftFunction ApproxNegPowerMD;  //rational approx for X^{-1/inv_pow}
      MultiShiftFunction ApproxHalfPowerMD;   //rational approx for X^{1/(2*inv_pow)}
      MultiShiftFunction ApproxNegHalfPowerMD; //rational approx for X^{-1/(2*inv_pow)}
    private:
      FermionOperator<Impl> & NumOp;// the numerator operator (V)
      FermionOperator<Impl> & DenOp;// the denominator operator (M)
      FermionField PhiEven; // the pseudo fermion field for this trajectory
      FermionField PhiOdd; // the pseudo fermion field for this trajectory
      //Generate the approximation to x^{1/inv_pow} (->approx)   and x^{-1/inv_pow} (-> approx_inv)  by an approx_degree degree rational approximation
      //CG_tolerance is used to issue a warning if the approximation error is larger than the tolerance of the CG and is otherwise just stored in the MultiShiftFunction for use by the multi-shift
      static void generateApprox(MultiShiftFunction &approx, MultiShiftFunction &approx_inv, int inv_pow, int approx_degree, double CG_tolerance, AlgRemez &remez){
 	std::cout<<GridLogMessage << "Generating degree "<< approx_degree<<" approximation for x^(1/" << inv_pow << ")"<<std::endl;
 	double error = remez.generateApprox(approx_degree,1,inv_pow);	
 	if(error > CG_tolerance)
 	  std::cout<<GridLogMessage << "WARNING: Remez approximation has a larger error " << error << " than the CG tolerance " << CG_tolerance << "! Try increasing the number of poles" << std::endl;
 	approx.Init(remez, CG_tolerance,false);
 	approx_inv.Init(remez, CG_tolerance,true);
      }
    protected:
      //Selector values for the 'numerator' argument of multiShiftInverse
      static constexpr bool Numerator = true;
      static constexpr bool Denominator = false;
      //Allow derived classes to override the multishift CG
      //Applies the rational function 'approx' of the Schur operator built from NumOp (numerator=true) or DenOp (numerator=false) to 'in'
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionField &in, FermionField &out){
 	SchurDifferentiableOperator<Impl> schurOp(numerator ? NumOp : DenOp);
 	ConjugateGradientMultiShift<FermionField> msCG(MaxIter, approx);
 	msCG(schurOp,in, out);
      }
      //As above, additionally returning the individual shifted solutions in out_elems
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionField &in, std::vector<FermionField> &out_elems, FermionField &out){
 	SchurDifferentiableOperator<Impl> schurOp(numerator ? NumOp : DenOp);
 	ConjugateGradientMultiShift<FermionField> msCG(MaxIter, approx);
 	msCG(schurOp,in, out_elems, out);
      }
      //Allow derived classes to override the gauge import
      virtual void ImportGauge(const GaugeField &U){
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
      }
    public:
      //Store the operators and parameters and build the rational approximations.
      //When the MD degree equals the action degree the action approximations are reused for the MD,
      //with only the per-pole tolerances replaced by param.md_tolerance.
      GeneralEvenOddRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
 						     FermionOperator<Impl>  &_DenOp, 
 						     const Params & p
 						     ) : 
 	//Initializers listed in member declaration order (the order in which they actually run)
 	param(p),
 	NumOp(_NumOp), 
 	DenOp(_DenOp), 
 	PhiEven(_NumOp.FermionRedBlackGrid()),
 	PhiOdd (_NumOp.FermionRedBlackGrid())
      {
 	std::cout<<GridLogMessage << action_name() << " initialize: starting" << std::endl;
 	AlgRemez remez(param.lo,param.hi,param.precision);
 	//Generate approximations for action eval
 	generateApprox(ApproxPowerAction, ApproxNegPowerAction, param.inv_pow, param.action_degree, param.action_tolerance, remez);
 	generateApprox(ApproxHalfPowerAction, ApproxNegHalfPowerAction, 2*param.inv_pow, param.action_degree, param.action_tolerance, remez);
 	//Generate approximations for MD
 	if(param.md_degree != param.action_degree){ //note the CG tolerance is unrelated to the stopping condition of the Remez algorithm
 	  generateApprox(ApproxPowerMD, ApproxNegPowerMD, param.inv_pow, param.md_degree, param.md_tolerance, remez);
 	  generateApprox(ApproxHalfPowerMD, ApproxNegHalfPowerMD, 2*param.inv_pow, param.md_degree, param.md_tolerance, remez);
 	}else{
 	  std::cout<<GridLogMessage << "Using same rational approximations for MD as for action evaluation" << std::endl;
 	  ApproxPowerMD = ApproxPowerAction; 
 	  ApproxNegPowerMD = ApproxNegPowerAction;
 	  for(int i=0;i<ApproxPowerMD.tolerances.size();i++)
 	    ApproxNegPowerMD.tolerances[i] = ApproxPowerMD.tolerances[i] = param.md_tolerance; //used for multishift
 	  ApproxHalfPowerMD = ApproxHalfPowerAction;
 	  ApproxNegHalfPowerMD = ApproxNegHalfPowerAction;
 	  //Bound the loop by the half-power array that is being written
 	  for(int i=0;i<ApproxHalfPowerMD.tolerances.size();i++)
 	    ApproxNegHalfPowerMD.tolerances[i] = ApproxHalfPowerMD.tolerances[i] = param.md_tolerance;
 	}
 	std::cout<<GridLogMessage << action_name() << " initialize: complete" << std::endl;
      };
      virtual std::string action_name(){return "GeneralEvenOddRatioRationalPseudoFermionAction";}
      //Return a multi-line, log-prefixed summary of the parameters
      virtual std::string LogParameters(){
 	std::stringstream sstream;
 	sstream << GridLogMessage << "["<<action_name()<<"] Power              : 1/" << param.inv_pow <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Low                :" << param.lo <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] High               :" << param.hi <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Max iterations     :" << param.MaxIter <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Tolerance (Action) :" << param.action_tolerance <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Degree (Action)    :" << param.action_degree <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Tolerance (MD)     :" << param.md_tolerance <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Degree (MD)        :" << param.md_degree <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Precision          :" << param.precision <<  std::endl;
 	return sstream.str();
      }
      //Access the fermion field
      const FermionField &getPhiOdd() const{ return PhiOdd; }
      //Heatbath: draw a Gaussian eta and generate the pseudofermion field from it
      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
 	std::cout<<GridLogMessage << action_name() << " refresh: starting" << std::endl;
 	FermionField eta(NumOp.FermionGrid());	
 	// P(eta) \propto e^{- eta^dag eta}
 	//	
 	// The gaussian function draws from  P(x) \propto e^{- x^2 / 2 }    [i.e. sigma=1]
 	// Thus eta = x/sqrt{2} = x * sqrt(1/2)
 	RealD scale = std::sqrt(0.5);
 	gaussian(pRNG,eta);	eta=eta*scale;
 	refresh(U,eta);
      }
      //Allow for manual specification of random field for testing
      //Only the odd checkerboard of eta is used; PhiEven is set to zero
      void refresh(const GaugeField &U, const FermionField &eta) {
 	// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
 	//
 	// P(phi) = e^{- phi^dag (VdagV)^1/(2*inv_pow) (MdagM)^-1/inv_pow (VdagV)^1/(2*inv_pow) phi}
 	//        = e^{- phi^dag  (VdagV)^1/(2*inv_pow) (MdagM)^-1/(2*inv_pow) (MdagM)^-1/(2*inv_pow)  (VdagV)^1/(2*inv_pow) phi}
 	//
 	// Phi =  (VdagV)^-1/(2*inv_pow) (MdagM)^{1/(2*inv_pow)} eta 
 	std::cout<<GridLogMessage << action_name() << " refresh: starting" << std::endl;
 	FermionField etaOdd (NumOp.FermionRedBlackGrid());
 	FermionField etaEven(NumOp.FermionRedBlackGrid());
 	FermionField     tmp(NumOp.FermionRedBlackGrid());
 	pickCheckerboard(Even,etaEven,eta);
 	pickCheckerboard(Odd,etaOdd,eta);
 	ImportGauge(U);
 	// MdagM^1/(2*inv_pow) eta
 	std::cout<<GridLogMessage << action_name() << " refresh: doing (M^dag M)^{1/" << 2*param.inv_pow << "} eta" << std::endl;
 	multiShiftInverse(Denominator, ApproxHalfPowerAction, param.MaxIter, etaOdd, tmp);
 	// VdagV^-1/(2*inv_pow) MdagM^1/(2*inv_pow) eta
 	std::cout<<GridLogMessage << action_name() << " refresh: doing (V^dag V)^{-1/" << 2*param.inv_pow << "} ( (M^dag M)^{1/" << 2*param.inv_pow << "} eta)" << std::endl;
 	multiShiftInverse(Numerator, ApproxNegHalfPowerAction, param.MaxIter, tmp, PhiOdd);
 	assert(NumOp.ConstEE() == 1);
 	assert(DenOp.ConstEE() == 1);
 	PhiEven = Zero();
 	std::cout<<GridLogMessage << action_name() << " refresh: complete" << std::endl;
      };
      //////////////////////////////////////////////////////
      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
      //////////////////////////////////////////////////////
      //Evaluate the action as |Y|^2 with Y = (MdagM)^{-1/(2*inv_pow)} (VdagV)^{1/(2*inv_pow)} PhiOdd
      virtual RealD S(const GaugeField &U) {
 	std::cout<<GridLogMessage << action_name() << " compute action: starting" << std::endl;
 	ImportGauge(U);
 	FermionField X(NumOp.FermionRedBlackGrid());
 	FermionField Y(NumOp.FermionRedBlackGrid());
 	// VdagV^1/(2*inv_pow) Phi
 	std::cout<<GridLogMessage << action_name() << " compute action: doing (V^dag V)^{1/" << 2*param.inv_pow << "} Phi" << std::endl;
 	multiShiftInverse(Numerator, ApproxHalfPowerAction, param.MaxIter, PhiOdd,X);
 	// MdagM^-1/(2*inv_pow) VdagV^1/(2*inv_pow) Phi
 	std::cout<<GridLogMessage << action_name() << " compute action: doing (M^dag M)^{-1/" << 2*param.inv_pow << "} ( (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
 	multiShiftInverse(Denominator, ApproxNegHalfPowerAction, param.MaxIter, X,Y);
 	// Randomly apply rational bounds checks.
 	// NOTE(review): rcheck is never read; the call is retained because it advances the rand() sequence that r is drawn from
 	int rcheck = rand();
 	auto grid = NumOp.FermionGrid();
        auto r=rand();
        // Broadcast from rank 0 so every rank takes the same branch below
        grid->Broadcast(0,r);
 	if ( param.BoundsCheckFreq != 0 && (r % param.BoundsCheckFreq)==0 ) { 
 	  std::cout<<GridLogMessage << action_name() << " compute action: doing bounds check" << std::endl;
 	  FermionField gauss(NumOp.FermionRedBlackGrid());
 	  gauss = PhiOdd;
 	  SchurDifferentiableOperator<Impl> MdagM(DenOp);
 	  std::cout<<GridLogMessage << action_name() << " compute action: checking high bounds" << std::endl;
 	  HighBoundCheck(MdagM,gauss,param.hi);
 	  std::cout<<GridLogMessage << action_name() << " compute action: full approximation" << std::endl;
 	  InversePowerBoundsCheck(param.inv_pow,param.MaxIter,param.action_tolerance*100,MdagM,gauss,ApproxNegPowerAction);
 	  std::cout<<GridLogMessage << action_name() << " compute action: bounds check complete" << std::endl;
 	}
 	//  Phidag VdagV^1/(2*inv_pow) MdagM^-1/(2*inv_pow)  MdagM^-1/(2*inv_pow) VdagV^1/(2*inv_pow) Phi
 	RealD action = norm2(Y);
 	std::cout<<GridLogMessage << action_name() << " compute action: complete" << std::endl;
 	return action;
      };
      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
      //
      // Here, M is some 5D operator and V is the Pauli-Villars field
      // N and D make up the rat. poly of the M term and P and Q make up the rat. poly of the V term
      //
      // Need  
      // dS_f/dU =  chi^dag d[P/Q]  N/D   P/Q  chi 
      //         +  chi^dag   P/Q d[N/D]  P/Q  chi 
      //         +  chi^dag   P/Q   N/D d[P/Q] chi 
      //
      // P/Q is expressed as partial fraction expansion: 
      // 
      //           a0 + \sum_k ak/(V^dagV + bk) 
      //  
      // d[P/Q] is then  
      //
      //          \sum_k -ak [V^dagV+bk]^{-1}  [ dV^dag V + V^dag dV ] [V^dag V + bk]^{-1} 
      //  
      // and similar for N/D. 
      // 
      // Need   
      //       MpvPhi_k   = [Vdag V + bk]^{-1} chi  
      //       MpvPhi     = {a0 +  \sum_k ak [Vdag V + bk]^{-1} }chi   
      //   
      //       MfMpvPhi_k = [MdagM+bk]^{-1} MpvPhi  
      //       MfMpvPhi   = {a0 +  \sum_k ak [Mdag M + bk]^{-1} } MpvPhi
      // 
      //       MpvMfMpvPhi_k = [Vdag V + bk]^{-1} MfMpvchi   
      //  
      //Compute the force dS/dU using the MD rational approximations; dSdU is overwritten
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
 	std::cout<<GridLogMessage << action_name() << " deriv: starting" << std::endl;
 	const int n_f  = ApproxNegPowerMD.poles.size();
 	const int n_pv = ApproxHalfPowerMD.poles.size();
 	std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionRedBlackGrid());
 	std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionRedBlackGrid());
 	std::vector<FermionField> MfMpvPhi_k   (n_f ,NumOp.FermionRedBlackGrid());
 	FermionField      MpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField    MfMpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField MpvMfMpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField           Y(NumOp.FermionRedBlackGrid());
 	GaugeField   tmp(NumOp.GaugeGrid());
 	ImportGauge(U);
 	std::cout<<GridLogMessage << action_name() << " deriv: doing (V^dag V)^{1/" << 2*param.inv_pow << "} Phi" << std::endl;
 	multiShiftInverse(Numerator, ApproxHalfPowerMD, param.MaxIter, PhiOdd,MpvPhi_k,MpvPhi);
 	std::cout<<GridLogMessage << action_name() << " deriv: doing (M^dag M)^{-1/" << param.inv_pow << "} ( (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
 	multiShiftInverse(Denominator, ApproxNegPowerMD, param.MaxIter, MpvPhi,MfMpvPhi_k,MfMpvPhi);
 	std::cout<<GridLogMessage << action_name() << " deriv: doing (V^dag V)^{1/" << 2*param.inv_pow << "} ( (M^dag M)^{-1/" << param.inv_pow << "} (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
 	multiShiftInverse(Numerator, ApproxHalfPowerMD, param.MaxIter, MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi);
 	SchurDifferentiableOperator<Impl> MdagM(DenOp);
 	SchurDifferentiableOperator<Impl> VdagV(NumOp);
 	RealD ak;
 	dSdU = Zero();
 	// With these building blocks  
 	//  
 	//       dS/dU = 
 	//                 \sum_k -ak MfMpvPhi_k^dag      [ dM^dag M + M^dag dM ] MfMpvPhi_k         (1)
 	//             +   \sum_k -ak MpvMfMpvPhi_k^\dag  [ dV^dag V + V^dag dV ] MpvPhi_k           (2)
 	//                        -ak MpvPhi_k^dag        [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k      (3)
 	//(1)	
 	std::cout<<GridLogMessage << action_name() << " deriv: doing dS/dU part (1)" << std::endl;
 	for(int k=0;k<n_f;k++){
 	  ak = ApproxNegPowerMD.residues[k];
 	  MdagM.Mpc(MfMpvPhi_k[k],Y);
 	  MdagM.MpcDagDeriv(tmp , MfMpvPhi_k[k], Y );  dSdU=dSdU+ak*tmp;
 	  MdagM.MpcDeriv(tmp , Y, MfMpvPhi_k[k] );  dSdU=dSdU+ak*tmp;
 	}
 	//(2)
 	//(3)
 	std::cout<<GridLogMessage << action_name() << " deriv: doing dS/dU part (2)+(3)" << std::endl;
 	for(int k=0;k<n_pv;k++){
          ak = ApproxHalfPowerMD.residues[k];
 	  VdagV.Mpc(MpvPhi_k[k],Y);
 	  VdagV.MpcDagDeriv(tmp,MpvMfMpvPhi_k[k],Y); dSdU=dSdU+ak*tmp;
 	  VdagV.MpcDeriv   (tmp,Y,MpvMfMpvPhi_k[k]);  dSdU=dSdU+ak*tmp;     
 	  VdagV.Mpc(MpvMfMpvPhi_k[k],Y);                // V as we take Ydag 
 	  VdagV.MpcDeriv   (tmp,Y, MpvPhi_k[k]); dSdU=dSdU+ak*tmp;
 	  VdagV.MpcDagDeriv(tmp,MpvPhi_k[k], Y); dSdU=dSdU+ak*tmp;
 	}
 	//dSdU = Ta(dSdU);
 	std::cout<<GridLogMessage << action_name() << " deriv: complete" << std::endl;
      };
    };
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
+++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
@@ -0,0 +1,93 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
    Copyright (C) 2015
    Author: Christopher Kelly <ckelly@bnl.gov>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
 #define QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
 NAMESPACE_BEGIN(Grid);
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Generic rational approximation for ratios of operators utilizing the mixed precision multishift algorithm
    // cf. GeneralEvenOddRationalRatio.h for details
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Variant of GeneralEvenOddRatioRationalPseudoFermionAction<ImplD> whose multishift solves use
    // ConjugateGradientMultiShiftMixedPrec with a second set of operators built on ImplF.
    // The gauge import is overridden so that the ImplF operators receive a precision-changed copy of the field.
    template<class ImplD, class ImplF>
    class GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<ImplD> {
    private:
      typedef typename ImplD::FermionField FermionFieldD;
      typedef typename ImplF::FermionField FermionFieldF;
      FermionOperator<ImplD> & NumOpD; // numerator operator, ImplD
      FermionOperator<ImplD> & DenOpD; // denominator operator, ImplD
      FermionOperator<ImplF> & NumOpF; // numerator operator, ImplF
      FermionOperator<ImplF> & DenOpF; // denominator operator, ImplF
      Integer ReliableUpdateFreq; // passed through to the mixed precision multishift CG
    protected:
      //Override of the base class multishift solve: apply 'approx' of the numerator or denominator Schur operator to 'in'
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, FermionFieldD &out){
 	SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
 	SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
 	ConjugateGradientMultiShiftMixedPrec<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
 	msCG(schurOpD, in, out);
      }
      //As above, additionally returning the individual shifted solutions in out_elems
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector<FermionFieldD> &out_elems, FermionFieldD &out){
 	SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
 	SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
 	ConjugateGradientMultiShiftMixedPrec<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
 	msCG(schurOpD, in, out_elems, out);
      }
      //Override of the base class gauge import: the ImplD operators import Ud directly, the ImplF operators import a precision-changed copy
      virtual void ImportGauge(const typename ImplD::GaugeField &Ud){
 	typename ImplF::GaugeField Uf(NumOpF.GaugeGrid());
 	precisionChange(Uf, Ud);
 	NumOpD.ImportGauge(Ud);
 	DenOpD.ImportGauge(Ud);
 	NumOpF.ImportGauge(Uf);
 	DenOpF.ImportGauge(Uf);
      }
    public:
      //The ImplD operators and the parameters are forwarded to the base class; references to all four operators are also kept here
      GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction(FermionOperator<ImplD>  &_NumOpD, FermionOperator<ImplD>  &_DenOpD, 
 							      FermionOperator<ImplF>  &_NumOpF, FermionOperator<ImplF>  &_DenOpF, 
 							      const RationalActionParams & p, Integer _ReliableUpdateFreq
 							      ) : GeneralEvenOddRatioRationalPseudoFermionAction<ImplD>(_NumOpD, _DenOpD, p),
 								  //Members listed in declaration order (the order in which they are initialised)
 								  NumOpD(_NumOpD), DenOpD(_DenOpD), NumOpF(_NumOpF), DenOpF(_DenOpF), ReliableUpdateFreq(_ReliableUpdateFreq){}
      virtual std::string action_name(){return "GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction";}
    };
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
@@ -40,249 +40,31 @@ NAMESPACE_BEGIN(Grid);
    // Here N/D \sim R_{-1/2} ~ (M^dagM)^{-1/2}  
    template<class Impl>
-    class OneFlavourEvenOddRatioRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
+    class OneFlavourEvenOddRatioRationalPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<Impl> {
    public:
      INHERIT_IMPL_TYPES(Impl);
      typedef OneFlavourRationalParams Params;
      Params param;
      MultiShiftFunction PowerHalf   ;
      MultiShiftFunction PowerNegHalf;
      MultiShiftFunction PowerQuarter;
      MultiShiftFunction PowerNegQuarter;
    private:
-     
+      static RationalActionParams transcribe(const Params &in){
-      FermionOperator<Impl> & NumOp;// the basic operator
+	RationalActionParams out;
-      FermionOperator<Impl> & DenOp;// the basic operator
+	out.inv_pow = 2;
-      FermionField PhiEven; // the pseudo fermion field for this trajectory
+	out.lo = in.lo;
-      FermionField PhiOdd; // the pseudo fermion field for this trajectory
+	out.hi = in.hi;
 	out.MaxIter = in.MaxIter;
 	out.action_tolerance = out.md_tolerance = in.tolerance;
 	out.action_degree = out.md_degree = in.degree;
 	out.precision = in.precision;
 	out.BoundsCheckFreq = in.BoundsCheckFreq;
 	return out;
      }
    public:
      OneFlavourEvenOddRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
-					    FermionOperator<Impl>  &_DenOp, 
+							FermionOperator<Impl>  &_DenOp, 
-					    Params & p
+							const Params & p
-					    ) : 
+							) : 
-      NumOp(_NumOp), 
+	GeneralEvenOddRatioRationalPseudoFermionAction<Impl>(_NumOp, _DenOp, transcribe(p)){}
      DenOp(_DenOp), 
      PhiOdd (_NumOp.FermionRedBlackGrid()),
      PhiEven(_NumOp.FermionRedBlackGrid()),
      param(p) 
      {
 	AlgRemez remez(param.lo,param.hi,param.precision);
-	// MdagM^(+- 1/2)
+      virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}      
 	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
 	remez.generateApprox(param.degree,1,2);
 	PowerHalf.Init(remez,param.tolerance,false);
 	PowerNegHalf.Init(remez,param.tolerance,true);
 	// MdagM^(+- 1/4)
 	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
 	remez.generateApprox(param.degree,1,4);
   	PowerQuarter.Init(remez,param.tolerance,false);
 	PowerNegQuarter.Init(remez,param.tolerance,true);
      };
      //Name of this action as a string; also used as the tag in LogParameters
      virtual std::string action_name(){
 	std::string label("OneFlavourEvenOddRatioRationalPseudoFermionAction");
 	return label;
      }
      //Format the rational-approximation parameters held in param, one log line per setting,
      //each line tagged with "[<action_name>] ". Returns the assembled text.
      virtual std::string LogParameters(){
 	const std::string tag = "[" + action_name() + "] ";
 	std::stringstream report;
 	report << GridLogMessage << tag << "Low            :" << param.lo <<  std::endl;
 	report << GridLogMessage << tag << "High           :" << param.hi <<  std::endl;
 	report << GridLogMessage << tag << "Max iterations :" << param.MaxIter <<  std::endl;
 	report << GridLogMessage << tag << "Tolerance      :" << param.tolerance <<  std::endl;
 	report << GridLogMessage << tag << "Degree         :" << param.degree <<  std::endl;
 	report << GridLogMessage << tag << "Precision      :" << param.precision <<  std::endl;
 	return report.str();
      }
      // Draw a new pseudofermion field for the gauge configuration U.
      //
      // A gaussian field eta is drawn from pRNG and scaled by sqrt(0.5); its odd checkerboard is
      // passed through the PowerQuarter multishift solve of the DenOp Schur operator and then the
      // PowerNegQuarter multishift solve of the NumOp Schur operator, and the result is stored in PhiOdd.
      // PhiEven is set to zero. sRNG is not used in this routine.
      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
 	// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
 	//
 	// P(phi) = e^{- phi^dag (VdagV)^1/4 (MdagM)^-1/2 (VdagV)^1/4 phi}
 	//        = e^{- phi^dag  (VdagV)^1/4 (MdagM)^-1/4 (MdagM)^-1/4  (VdagV)^1/4 phi}
 	//
 	// Phi =  (VdagV)^-1/4 Mdag^{1/4} eta 
 	//
 	// P(eta) = e^{- eta^dag eta}
 	//
 	// e^{x^2/2 sig^2} => sig^2 = 0.5.
 	// 
 	// So eta should be of width sig = 1/sqrt(2).
 	RealD scale = std::sqrt(0.5);
 	FermionField eta(NumOp.FermionGrid());
 	FermionField etaOdd (NumOp.FermionRedBlackGrid());
 	FermionField etaEven(NumOp.FermionRedBlackGrid());
 	FermionField     tmp(NumOp.FermionRedBlackGrid());
 	gaussian(pRNG,eta);	eta=eta*scale;
 	// etaEven is extracted here but only etaOdd is used below
 	pickCheckerboard(Even,etaEven,eta);
 	pickCheckerboard(Odd,etaOdd,eta);
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
 	// MdagM^1/4 eta
 	SchurDifferentiableOperator<Impl> MdagM(DenOp);
 	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerQuarter);
 	msCG_M(MdagM,etaOdd,tmp);
 	// VdagV^-1/4 MdagM^1/4 eta
 	SchurDifferentiableOperator<Impl> VdagV(NumOp);
 	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerNegQuarter);
 	msCG_V(VdagV,tmp,PhiOdd);
 	// Both operators are required to report ConstEE()==1 before the even field is zeroed
 	assert(NumOp.ConstEE() == 1);
 	assert(DenOp.ConstEE() == 1);
 	PhiEven = Zero();
      };
      //////////////////////////////////////////////////////
      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
      //////////////////////////////////////////////////////
      // Evaluate the pseudofermion action on gauge configuration U.
      //
      // Applies the PowerQuarter multishift solve of the NumOp Schur operator to PhiOdd, then the
      // PowerNegQuarter multishift solve of the DenOp Schur operator, and returns the squared norm
      // of the result. With frequency param.BoundsCheckFreq a spectral-bounds check of the DenOp
      // Schur operator is also run; setting param.BoundsCheckFreq to 0 disables that check.
      virtual RealD S(const GaugeField &U) {
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
 	FermionField X(NumOp.FermionRedBlackGrid());
 	FermionField Y(NumOp.FermionRedBlackGrid());
 	// VdagV^1/4 Phi
 	SchurDifferentiableOperator<Impl> VdagV(NumOp);
 	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
 	msCG_V(VdagV,PhiOdd,X);
 	// MdagM^-1/4 VdagV^1/4 Phi
 	SchurDifferentiableOperator<Impl> MdagM(DenOp);
 	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegQuarter);
 	msCG_M(MdagM,X,Y);
 	// Randomly apply rational bounds checks.
 	// The draw and the Broadcast are kept unconditional so the rand() sequence and the
 	// communication pattern do not depend on BoundsCheckFreq.
 	auto grid = NumOp.FermionGrid();
        auto r=rand();
        grid->Broadcast(0,r);
 	// Guard against BoundsCheckFreq==0: r%0 is undefined behaviour, so treat 0 as "never check"
        if ( param.BoundsCheckFreq != 0 && (r%param.BoundsCheckFreq)==0 ) { 
 	  FermionField gauss(NumOp.FermionRedBlackGrid());
 	  gauss = PhiOdd;
 	  HighBoundCheck(MdagM,gauss,param.hi);
 	  InverseSqrtBoundsCheck(param.MaxIter,param.tolerance*100,MdagM,gauss,PowerNegHalf);
 	}
 	//  Phidag VdagV^1/4 MdagM^-1/4  MdagM^-1/4 VdagV^1/4 Phi
 	RealD action = norm2(Y);
 	return action;
      };
      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
      //
      // Here, M is some 5D operator and V is the Pauli-Villars field
      // N and D make up the rational polynomial of the M term, and P and Q make up the rational polynomial of the denom term
      //
      // Need  
      // dS_f/dU =  chi^dag d[P/Q]  N/D   P/Q  chi 
      //         +  chi^dag   P/Q d[N/D]  P/Q  chi 
      //         +  chi^dag   P/Q   N/D d[P/Q] chi 
      //
      // P/Q is expressed as partial fraction expansion: 
      // 
      //           a0 + \sum_k ak/(V^dagV + bk) 
      //  
      // d[P/Q] is then  
      //
      //          \sum_k -ak [V^dagV+bk]^{-1}  [ dV^dag V + V^dag dV ] [V^dag V + bk]^{-1} 
      //  
      // and similar for N/D. 
      // 
      // Need   
      //       MpvPhi_k   = [Vdag V + bk]^{-1} chi  
      //       MpvPhi     = {a0 +  \sum_k ak [Vdag V + bk]^{-1} }chi   
      //   
      //       MfMpvPhi_k = [MdagM+bk]^{-1} MpvPhi  
      //       MfMpvPhi   = {a0 +  \sum_k ak [Mdag M + bk]^{-1} } MpvPhi
      // 
      //       MpvMfMpvPhi_k = [Vdag V + bk]^{-1} MfMpvchi   
      //  
      // Compute the derivative of the action with respect to the gauge field and store it in dSdU.
      //
      // Three multishift solves are performed (PowerQuarter on the NumOp Schur operator, PowerNegHalf
      // on the DenOp Schur operator, PowerQuarter on the NumOp Schur operator again), keeping the
      // per-pole solution vectors. dSdU is zeroed and then accumulated pole by pole, each
      // contribution weighted by the corresponding residue of the rational approximation.
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
 	// Number of poles in the DenOp (n_f) and NumOp (n_pv) approximations
 	const int n_f  = PowerNegHalf.poles.size();
 	const int n_pv = PowerQuarter.poles.size();
 	std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionRedBlackGrid());
 	std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionRedBlackGrid());
 	std::vector<FermionField> MfMpvPhi_k   (n_f ,NumOp.FermionRedBlackGrid());
 	FermionField      MpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField    MfMpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField MpvMfMpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField           Y(NumOp.FermionRedBlackGrid());
 	GaugeField   tmp(NumOp.GaugeGrid());
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
 	SchurDifferentiableOperator<Impl> VdagV(NumOp);
 	SchurDifferentiableOperator<Impl> MdagM(DenOp);
 	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
 	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegHalf);
 	// Each solve fills both the per-pole vectors (*_k) and the summed result
 	msCG_V(VdagV,PhiOdd,MpvPhi_k,MpvPhi);
 	msCG_M(MdagM,MpvPhi,MfMpvPhi_k,MfMpvPhi);
 	msCG_V(VdagV,MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi);
 	RealD ak;
 	dSdU = Zero();
 	// With these building blocks  
 	//  
 	//       dS/dU = 
 	//                 \sum_k -ak MfMpvPhi_k^dag      [ dM^dag M + M^dag dM ] MfMpvPhi_k         (1)
 	//             +   \sum_k -ak MpvMfMpvPhi_k^\dag  [ dV^dag V + V^dag dV ] MpvPhi_k           (2)
 	//                        -ak MpvPhi_k^dag        [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k      (3)
 	//(1)
 	for(int k=0;k<n_f;k++){
 	  ak = PowerNegHalf.residues[k];
 	  MdagM.Mpc(MfMpvPhi_k[k],Y);
 	  MdagM.MpcDagDeriv(tmp , MfMpvPhi_k[k], Y );  dSdU=dSdU+ak*tmp;
 	  MdagM.MpcDeriv(tmp , Y, MfMpvPhi_k[k] );  dSdU=dSdU+ak*tmp;
 	}
 	//(2)
 	//(3)
 	// Terms (2) and (3) are accumulated together in this single loop over the n_pv poles
 	for(int k=0;k<n_pv;k++){
          ak = PowerQuarter.residues[k];
 	  VdagV.Mpc(MpvPhi_k[k],Y);
 	  VdagV.MpcDagDeriv(tmp,MpvMfMpvPhi_k[k],Y); dSdU=dSdU+ak*tmp;
 	  VdagV.MpcDeriv   (tmp,Y,MpvMfMpvPhi_k[k]);  dSdU=dSdU+ak*tmp;     
 	  VdagV.Mpc(MpvMfMpvPhi_k[k],Y);                // V as we take Ydag 
 	  VdagV.MpcDeriv   (tmp,Y, MpvPhi_k[k]); dSdU=dSdU+ak*tmp;
 	  VdagV.MpcDagDeriv(tmp,MpvPhi_k[k], Y); dSdU=dSdU+ak*tmp;
 	}
 	// The Ta() projection is intentionally left disabled here
 	//dSdU = Ta(dSdU);
      };
    };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/pseudofermion/PseudoFermion.h
+++ b/Grid/qcd/action/pseudofermion/PseudoFermion.h
@@ -40,6 +40,8 @@ directory
 #include <Grid/qcd/action/pseudofermion/OneFlavourRational.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
 #include <Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h>
--- a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
+++ b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
@@ -83,16 +83,10 @@ NAMESPACE_BEGIN(Grid);
 	return sstream.str();
      } 
-      
+      //Access the fermion field
-      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
+      const FermionField &getPhiOdd() const{ return PhiOdd; }
-        // P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
+      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
        //
        // NumOp == V
        // DenOp == M
        //
        // Take phi_o = Vpcdag^{-1} Mpcdag eta_o  ; eta_o = Mpcdag^{-1} Vpcdag Phi
        //
        // P(eta_o) = e^{- eta_o^dag eta_o}
        //
        // e^{x^2/2 sig^2} => sig^2 = 0.5.
@@ -100,12 +94,22 @@ NAMESPACE_BEGIN(Grid);
        RealD scale = std::sqrt(0.5);
        FermionField eta    (NumOp.FermionGrid());
        gaussian(pRNG,eta); eta = eta * scale;
 	refresh(U,eta);
      }
      void refresh(const GaugeField &U, const FermionField &eta) {
        // P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
        //
        // NumOp == V
        // DenOp == M
        //
        // Take phi_o = Vpcdag^{-1} Mpcdag eta_o  ; eta_o = Mpcdag^{-1} Vpcdag Phi
        FermionField etaOdd (NumOp.FermionRedBlackGrid());
        FermionField etaEven(NumOp.FermionRedBlackGrid());
        FermionField tmp    (NumOp.FermionRedBlackGrid());
        gaussian(pRNG,eta);
        pickCheckerboard(Even,etaEven,eta);
        pickCheckerboard(Odd,etaOdd,eta);
@@ -125,8 +129,8 @@ NAMESPACE_BEGIN(Grid);
        DenOp.MooeeDag(etaEven,tmp);
        NumOp.MooeeInvDag(tmp,PhiEven);
-        PhiOdd =PhiOdd*scale;
+        //PhiOdd =PhiOdd*scale;
-        PhiEven=PhiEven*scale;
+        //PhiEven=PhiEven*scale;
      };
--- a/Grid/qcd/gparity/Gparity.h
+++ b/Grid/qcd/gparity/Gparity.h
@@ -0,0 +1,6 @@
 #ifndef GRID_GPARITY_H_
 #define GRID_GPARITY_H_
 #include<Grid/qcd/gparity/GparityFlavour.h>
 #endif
--- a/Grid/qcd/gparity/GparityFlavour.cc
+++ b/Grid/qcd/gparity/GparityFlavour.cc
@@ -0,0 +1,34 @@
 #include <Grid/Grid.h>
 NAMESPACE_BEGIN(Grid);
 // The three Pauli matrices acting on the flavour index, in the order SigmaX, SigmaY, SigmaZ
 const std::array<const GparityFlavour, 3> GparityFlavour::sigma_mu = {{
    GparityFlavour(GparityFlavour::Algebra::SigmaX),
    GparityFlavour(GparityFlavour::Algebra::SigmaY),
    GparityFlavour(GparityFlavour::Algebra::SigmaZ)
    }};
 // The six positive-sign flavour matrices: Identity, the three Pauli matrices and the two projectors.
 // The Minus* variants of the algebra are not included in this table.
 const std::array<const GparityFlavour, 6> GparityFlavour::sigma_all = {{
  GparityFlavour(GparityFlavour::Algebra::Identity),
  GparityFlavour(GparityFlavour::Algebra::SigmaX),
  GparityFlavour(GparityFlavour::Algebra::SigmaY),
  GparityFlavour(GparityFlavour::Algebra::SigmaZ),
  GparityFlavour(GparityFlavour::Algebra::ProjPlus),
  GparityFlavour(GparityFlavour::Algebra::ProjMinus)
 }};
 // Printable names of the nSigma algebra elements.
 // The entries are listed in the same order as the integer values assigned in
 // GparityFlavour::Algebra (SigmaX=0 ... MinusProjMinus=11); keep the two in sync.
 const std::array<const char *, GparityFlavour::nSigma> GparityFlavour::name = {{
    "SigmaX",
    "MinusSigmaX",
    "SigmaY",
    "MinusSigmaY",
    "SigmaZ",
    "MinusSigmaZ",
    "Identity",
    "MinusIdentity",
    "ProjPlus",
    "MinusProjPlus",
    "ProjMinus",
    "MinusProjMinus"}};
 NAMESPACE_END(Grid);
--- a/Grid/qcd/gparity/GparityFlavour.h
+++ b/Grid/qcd/gparity/GparityFlavour.h
@@ -0,0 +1,475 @@
 #ifndef GRID_QCD_GPARITY_FLAVOUR_H
 #define GRID_QCD_GPARITY_FLAVOUR_H
 //Support for flavour-matrix operations acting on the G-parity flavour index
 #include <array>
 NAMESPACE_BEGIN(Grid);
 // Tag object identifying one element of the 2x2 flavour-matrix algebra
 // (Pauli matrices, identity, the 1/2(1 +- sigma_2) projectors, and their negatives).
 // The operator* overloads in this header switch on the stored tag g to pick the
 // matching multiplication kernel.
 class GparityFlavour {
  public:
    // Each element is immediately followed by its negative, so values come in (X, -X) pairs
    GRID_SERIALIZABLE_ENUM(Algebra, undef,
                           SigmaX, 0,
 			   MinusSigmaX, 1,
                           SigmaY, 2,
 			   MinusSigmaY, 3,
                           SigmaZ, 4,
 			   MinusSigmaZ, 5,
 			   Identity, 6,
 			   MinusIdentity, 7,
 			   ProjPlus, 8,
 			   MinusProjPlus, 9,
 			   ProjMinus, 10,
 			   MinusProjMinus, 11
 			   );
    // Number of algebra elements listed above
    static constexpr unsigned int nSigma = 12;
    // Static lookup tables; their definitions live outside this class body
    static const std::array<const char *, nSigma>                name;
    static const std::array<const GparityFlavour, 3>             sigma_mu;
    static const std::array<const GparityFlavour, 6>            sigma_all;
    // The algebra element this object represents
    Algebra                                                      g;
  public:
  // Intentionally non-explicit: allows an Algebra value to be used where a GparityFlavour is expected
  accelerator GparityFlavour(Algebra initg): g(initg) {}  
 };
 // sigma_x =  0 1
 //            1 0
 // In all three kernels ret must not alias rhs: entries of rhs are read after ret has been written.

 // ret = sigma_x * rhs for a flavour vector (exchanges the two components)
 template<class vtype>
 accelerator_inline void multFlavourSigmaX(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = rhs(1);
  ret(1) = rhs(0);
 }
 // ret = sigma_x * rhs for a flavour matrix (exchanges the two rows)
 template<class vtype>
 accelerator_inline void lmultFlavourSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = rhs(1,0);
  ret(0,1) = rhs(1,1);
  ret(1,0) = rhs(0,0);
  ret(1,1) = rhs(0,1);
 }
 // ret = rhs * sigma_x for a flavour matrix (exchanges the two columns)
 template<class vtype>
 accelerator_inline void rmultFlavourSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = rhs(0,1);
  ret(0,1) = rhs(0,0);
  ret(1,0) = rhs(1,1);
  ret(1,1) = rhs(1,0);
 }
 // -sigma_x =   0 -1
 //             -1  0
 // In all three kernels ret must not alias rhs: entries of rhs are read after ret has been written.

 // ret = -sigma_x * rhs for a flavour vector
 template<class vtype>
 accelerator_inline void multFlavourMinusSigmaX(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = -rhs(1);
  ret(1) = -rhs(0);
 }
 // ret = -sigma_x * rhs for a flavour matrix (rows exchanged and negated)
 template<class vtype>
 accelerator_inline void lmultFlavourMinusSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -rhs(1,0);
  ret(0,1) = -rhs(1,1);
  ret(1,0) = -rhs(0,0);
  ret(1,1) = -rhs(0,1);
 }
 // ret = rhs * (-sigma_x) for a flavour matrix (columns exchanged and negated)
 template<class vtype>
 accelerator_inline void rmultFlavourMinusSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -rhs(0,1);
  ret(0,1) = -rhs(0,0);
  ret(1,0) = -rhs(1,1);
  ret(1,1) = -rhs(1,0);
 }
 // sigma_y =  0 -i
 //            i  0
 // In all three kernels ret must not alias rhs: entries of rhs are read after ret has been written.

 // ret = sigma_y * rhs for a flavour vector
 template<class vtype>
 accelerator_inline void multFlavourSigmaY(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = timesMinusI(rhs(1));
  ret(1) = timesI(rhs(0));
 }
 // ret = sigma_y * rhs for a flavour matrix
 template<class vtype>
 accelerator_inline void lmultFlavourSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = timesMinusI(rhs(1,0));
  ret(0,1) = timesMinusI(rhs(1,1));
  ret(1,0) = timesI(rhs(0,0));
  ret(1,1) = timesI(rhs(0,1));
 }
 // ret = rhs * sigma_y for a flavour matrix
 template<class vtype>
 accelerator_inline void rmultFlavourSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = timesI(rhs(0,1));
  ret(0,1) = timesMinusI(rhs(0,0));
  ret(1,0) = timesI(rhs(1,1));
  ret(1,1) = timesMinusI(rhs(1,0));
 }
 // -sigma_y =   0  i
 //             -i  0
 // In all three kernels ret must not alias rhs: entries of rhs are read after ret has been written.

 // ret = -sigma_y * rhs for a flavour vector
 template<class vtype>
 accelerator_inline void multFlavourMinusSigmaY(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = timesI(rhs(1));
  ret(1) = timesMinusI(rhs(0));
 }
 // ret = -sigma_y * rhs for a flavour matrix
 template<class vtype>
 accelerator_inline void lmultFlavourMinusSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = timesI(rhs(1,0));
  ret(0,1) = timesI(rhs(1,1));
  ret(1,0) = timesMinusI(rhs(0,0));
  ret(1,1) = timesMinusI(rhs(0,1));
 }
 // ret = rhs * (-sigma_y) for a flavour matrix
 template<class vtype>
 accelerator_inline void rmultFlavourMinusSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = timesMinusI(rhs(0,1));
  ret(0,1) = timesI(rhs(0,0));
  ret(1,0) = timesMinusI(rhs(1,1));
  ret(1,1) = timesI(rhs(1,0));
 }
 // sigma_z =  1  0
 //            0 -1

 // ret = sigma_z * rhs for a flavour vector (negates component 1)
 template<class vtype>
 accelerator_inline void multFlavourSigmaZ(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = rhs(0);
  ret(1) = -rhs(1);
 }
 // ret = sigma_z * rhs for a flavour matrix (negates row 1)
 template<class vtype>
 accelerator_inline void lmultFlavourSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = rhs(0,0);
  ret(0,1) = rhs(0,1);
  ret(1,0) = -rhs(1,0);
  ret(1,1) = -rhs(1,1);
 }
 // ret = rhs * sigma_z for a flavour matrix (negates column 1)
 template<class vtype>
 accelerator_inline void rmultFlavourSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = rhs(0,0);
  ret(0,1) = -rhs(0,1);
  ret(1,0) = rhs(1,0);
  ret(1,1) = -rhs(1,1);
 }
 // -sigma_z =  -1 0
 //              0 1

 // ret = -sigma_z * rhs for a flavour vector (negates component 0)
 template<class vtype>
 accelerator_inline void multFlavourMinusSigmaZ(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = -rhs(0);
  ret(1) = rhs(1);
 }
 // ret = -sigma_z * rhs for a flavour matrix (negates row 0)
 template<class vtype>
 accelerator_inline void lmultFlavourMinusSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -rhs(0,0);
  ret(0,1) = -rhs(0,1);
  ret(1,0) = rhs(1,0);
  ret(1,1) = rhs(1,1);
 }
 // ret = rhs * (-sigma_z) for a flavour matrix (negates column 0)
 template<class vtype>
 accelerator_inline void rmultFlavourMinusSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -rhs(0,0);
  ret(0,1) = rhs(0,1);
  ret(1,0) = -rhs(1,0);
  ret(1,1) = rhs(1,1);
 }
 // Identity element: each kernel copies rhs into ret entry by entry.

 // ret = rhs for a flavour vector
 template<class vtype>
 accelerator_inline void multFlavourIdentity(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = rhs(0);
  ret(1) = rhs(1);
 }
 // ret = 1 * rhs for a flavour matrix
 template<class vtype>
 accelerator_inline void lmultFlavourIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = rhs(0,0);
  ret(0,1) = rhs(0,1);
  ret(1,0) = rhs(1,0);
  ret(1,1) = rhs(1,1);
 }
 // ret = rhs * 1 for a flavour matrix
 template<class vtype>
 accelerator_inline void rmultFlavourIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = rhs(0,0);
  ret(0,1) = rhs(0,1);
  ret(1,0) = rhs(1,0);
  ret(1,1) = rhs(1,1);
 }
 // Minus identity: each kernel writes the negated entries of rhs into ret.

 // ret = -rhs for a flavour vector
 template<class vtype>
 accelerator_inline void multFlavourMinusIdentity(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = -rhs(0);
  ret(1) = -rhs(1);
 }
 // ret = (-1) * rhs for a flavour matrix
 template<class vtype>
 accelerator_inline void lmultFlavourMinusIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -rhs(0,0);
  ret(0,1) = -rhs(0,1);
  ret(1,0) = -rhs(1,0);
  ret(1,1) = -rhs(1,1);
 }
 // ret = rhs * (-1) for a flavour matrix
 template<class vtype>
 accelerator_inline void rmultFlavourMinusIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -rhs(0,0);
  ret(0,1) = -rhs(0,1);
  ret(1,0) = -rhs(1,0);
  ret(1,1) = -rhs(1,1);
 }
 //G-parity flavour projection P+ = 1/2(1+\sigma_2)
 //       1 -i
 // 1/2 * i  1
 // In all three kernels ret must not alias rhs: entries of rhs are read after ret has been written.

 // ret = P+ * rhs for a flavour vector
 template<class vtype>
 accelerator_inline void multFlavourProjPlus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = 0.5*rhs(0) + 0.5*timesMinusI(rhs(1));
  ret(1) = 0.5*timesI(rhs(0)) + 0.5*rhs(1);
 }
 // ret = P+ * rhs for a flavour matrix
 template<class vtype>
 accelerator_inline void lmultFlavourProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesMinusI(rhs(1,0));
  ret(0,1) = 0.5*rhs(0,1) + 0.5*timesMinusI(rhs(1,1));
  ret(1,0) = 0.5*timesI(rhs(0,0)) + 0.5*rhs(1,0);
  ret(1,1) = 0.5*timesI(rhs(0,1)) + 0.5*rhs(1,1);
 }
 // ret = rhs * P+ for a flavour matrix
 template<class vtype>
 accelerator_inline void rmultFlavourProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesI(rhs(0,1));
  ret(0,1) = 0.5*timesMinusI(rhs(0,0)) + 0.5*rhs(0,1);
  ret(1,0) = 0.5*rhs(1,0) + 0.5*timesI(rhs(1,1));
  ret(1,1) = 0.5*timesMinusI(rhs(1,0)) + 0.5*rhs(1,1);
 }
 // Negated projector -P+ = -1/2(1+\sigma_2)
 //       -1  i
 // 1/2 * -i -1
 // In all three kernels ret must not alias rhs: entries of rhs are read after ret has been written.

 // ret = -P+ * rhs for a flavour vector
 template<class vtype>
 accelerator_inline void multFlavourMinusProjPlus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = -0.5*rhs(0) + 0.5*timesI(rhs(1));
  ret(1) = 0.5*timesMinusI(rhs(0)) - 0.5*rhs(1);
 }
 // ret = -P+ * rhs for a flavour matrix
 template<class vtype>
 accelerator_inline void lmultFlavourMinusProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesI(rhs(1,0));
  ret(0,1) = -0.5*rhs(0,1) + 0.5*timesI(rhs(1,1));
  ret(1,0) = 0.5*timesMinusI(rhs(0,0)) - 0.5*rhs(1,0);
  ret(1,1) = 0.5*timesMinusI(rhs(0,1)) - 0.5*rhs(1,1);
 }
 // ret = rhs * (-P+) for a flavour matrix
 template<class vtype>
 accelerator_inline void rmultFlavourMinusProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesMinusI(rhs(0,1));
  ret(0,1) = 0.5*timesI(rhs(0,0)) - 0.5*rhs(0,1);
  ret(1,0) = -0.5*rhs(1,0) + 0.5*timesMinusI(rhs(1,1));
  ret(1,1) = 0.5*timesI(rhs(1,0)) - 0.5*rhs(1,1);
 }
 //G-parity flavour projection P- = 1/2(1-\sigma_2)
 //        1  i
 // 1/2 * -i  1
 // In all three kernels ret must not alias rhs: entries of rhs are read after ret has been written.

 // ret = P- * rhs for a flavour vector
 template<class vtype>
 accelerator_inline void multFlavourProjMinus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = 0.5*rhs(0) + 0.5*timesI(rhs(1));
  ret(1) = 0.5*timesMinusI(rhs(0)) + 0.5*rhs(1);
 }
 // ret = P- * rhs for a flavour matrix
 template<class vtype>
 accelerator_inline void lmultFlavourProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesI(rhs(1,0));
  ret(0,1) = 0.5*rhs(0,1) + 0.5*timesI(rhs(1,1));
  ret(1,0) = 0.5*timesMinusI(rhs(0,0)) + 0.5*rhs(1,0);
  ret(1,1) = 0.5*timesMinusI(rhs(0,1)) + 0.5*rhs(1,1);
 }
 // ret = rhs * P- for a flavour matrix
 template<class vtype>
 accelerator_inline void rmultFlavourProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesMinusI(rhs(0,1));
  ret(0,1) = 0.5*timesI(rhs(0,0)) + 0.5*rhs(0,1);
  ret(1,0) = 0.5*rhs(1,0) + 0.5*timesMinusI(rhs(1,1));
  ret(1,1) = 0.5*timesI(rhs(1,0)) + 0.5*rhs(1,1);
 }
 // Negated projector -P- = -1/2(1-\sigma_2)
 //       -1 -i
 // 1/2 *  i -1
 // In all three kernels ret must not alias rhs: entries of rhs are read after ret has been written.

 // ret = -P- * rhs for a flavour vector
 template<class vtype>
 accelerator_inline void multFlavourMinusProjMinus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = -0.5*rhs(0) + 0.5*timesMinusI(rhs(1));
  ret(1) = 0.5*timesI(rhs(0)) - 0.5*rhs(1);
 }
 // ret = -P- * rhs for a flavour matrix
 template<class vtype>
 accelerator_inline void lmultFlavourMinusProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesMinusI(rhs(1,0));
  ret(0,1) = -0.5*rhs(0,1) + 0.5*timesMinusI(rhs(1,1));
  ret(1,0) = 0.5*timesI(rhs(0,0)) - 0.5*rhs(1,0);
  ret(1,1) = 0.5*timesI(rhs(0,1)) - 0.5*rhs(1,1);
 }
 // ret = rhs * (-P-) for a flavour matrix
 template<class vtype>
 accelerator_inline void rmultFlavourMinusProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesI(rhs(0,1));
  ret(0,1) = 0.5*timesMinusI(rhs(0,0)) - 0.5*rhs(0,1);
  ret(1,0) = -0.5*rhs(1,0) + 0.5*timesI(rhs(1,1));
  ret(1,1) = 0.5*timesMinusI(rhs(1,0)) - 0.5*rhs(1,1);
 }
 // Flavour matrix acting on a flavour vector: returns G * arg.
 // The tag G.g selects the matching multFlavour* kernel; an unlisted tag hits assert(0).
 template<class vtype> 
 accelerator_inline auto operator*(const GparityFlavour &G, const iVector<vtype, Ngp> &arg)
 ->typename std::enable_if<matchGridTensorIndex<iVector<vtype, Ngp>, GparityFlavourTensorIndex>::value, iVector<vtype, Ngp>>::type
 {
  typedef GparityFlavour::Algebra Alg;
  iVector<vtype, Ngp> result;
  switch (G.g) 
  {
  case Alg::SigmaX:
    multFlavourSigmaX(result, arg);
    break;
  case Alg::MinusSigmaX:
    multFlavourMinusSigmaX(result, arg);
    break;
  case Alg::SigmaY:
    multFlavourSigmaY(result, arg);
    break;
  case Alg::MinusSigmaY:
    multFlavourMinusSigmaY(result, arg);
    break;
  case Alg::SigmaZ:
    multFlavourSigmaZ(result, arg);
    break;
  case Alg::MinusSigmaZ:
    multFlavourMinusSigmaZ(result, arg);
    break;
  case Alg::Identity:
    multFlavourIdentity(result, arg);
    break;
  case Alg::MinusIdentity:
    multFlavourMinusIdentity(result, arg);
    break;
  case Alg::ProjPlus:
    multFlavourProjPlus(result, arg);
    break;
  case Alg::MinusProjPlus:
    multFlavourMinusProjPlus(result, arg);
    break;
  case Alg::ProjMinus:
    multFlavourProjMinus(result, arg);
    break;
  case Alg::MinusProjMinus:
    multFlavourMinusProjMinus(result, arg);
    break;
  default:
    assert(0);
  }
  return result;
 }
 // Flavour matrix multiplying a flavour-space matrix from the left: returns G * arg.
 // The tag G.g selects the matching lmultFlavour* kernel; an unlisted tag hits assert(0).
 template<class vtype> 
 accelerator_inline auto operator*(const GparityFlavour &G, const iMatrix<vtype, Ngp> &arg)
 ->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Ngp>, GparityFlavourTensorIndex>::value, iMatrix<vtype, Ngp>>::type
 {
  typedef GparityFlavour::Algebra Alg;
  iMatrix<vtype, Ngp> result;
  switch (G.g) 
  {
  case Alg::SigmaX:
    lmultFlavourSigmaX(result, arg);
    break;
  case Alg::MinusSigmaX:
    lmultFlavourMinusSigmaX(result, arg);
    break;
  case Alg::SigmaY:
    lmultFlavourSigmaY(result, arg);
    break;
  case Alg::MinusSigmaY:
    lmultFlavourMinusSigmaY(result, arg);
    break;
  case Alg::SigmaZ:
    lmultFlavourSigmaZ(result, arg);
    break;
  case Alg::MinusSigmaZ:
    lmultFlavourMinusSigmaZ(result, arg);
    break;
  case Alg::Identity:
    lmultFlavourIdentity(result, arg);
    break;
  case Alg::MinusIdentity:
    lmultFlavourMinusIdentity(result, arg);
    break;
  case Alg::ProjPlus:
    lmultFlavourProjPlus(result, arg);
    break;
  case Alg::MinusProjPlus:
    lmultFlavourMinusProjPlus(result, arg);
    break;
  case Alg::ProjMinus:
    lmultFlavourProjMinus(result, arg);
    break;
  case Alg::MinusProjMinus:
    lmultFlavourMinusProjMinus(result, arg);
    break;
  default:
    assert(0);
  }
  return result;
 }
 // Flavour matrix multiplying a flavour-space matrix from the right: returns arg * G.
 // The tag G.g selects the matching rmultFlavour* kernel; an unlisted tag hits assert(0).
 template<class vtype> 
 accelerator_inline auto operator*(const iMatrix<vtype, Ngp> &arg, const GparityFlavour &G)
 ->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Ngp>, GparityFlavourTensorIndex>::value, iMatrix<vtype, Ngp>>::type
 {
  typedef GparityFlavour::Algebra Alg;
  iMatrix<vtype, Ngp> result;
  switch (G.g) 
  {
  case Alg::SigmaX:
    rmultFlavourSigmaX(result, arg);
    break;
  case Alg::MinusSigmaX:
    rmultFlavourMinusSigmaX(result, arg);
    break;
  case Alg::SigmaY:
    rmultFlavourSigmaY(result, arg);
    break;
  case Alg::MinusSigmaY:
    rmultFlavourMinusSigmaY(result, arg);
    break;
  case Alg::SigmaZ:
    rmultFlavourSigmaZ(result, arg);
    break;
  case Alg::MinusSigmaZ:
    rmultFlavourMinusSigmaZ(result, arg);
    break;
  case Alg::Identity:
    rmultFlavourIdentity(result, arg);
    break;
  case Alg::MinusIdentity:
    rmultFlavourMinusIdentity(result, arg);
    break;
  case Alg::ProjPlus:
    rmultFlavourProjPlus(result, arg);
    break;
  case Alg::MinusProjPlus:
    rmultFlavourMinusProjPlus(result, arg);
    break;
  case Alg::ProjMinus:
    rmultFlavourProjMinus(result, arg);
    break;
  case Alg::MinusProjMinus:
    rmultFlavourMinusProjMinus(result, arg);
    break;
  default:
    assert(0);
  }
  return result;
 }
 NAMESPACE_END(Grid);
 #endif // include guard
--- a/Grid/qcd/hmc/GenericHMCrunner.h
+++ b/Grid/qcd/hmc/GenericHMCrunner.h
@@ -129,18 +129,10 @@ public:
    Runner(S);
  }
-  //////////////////////////////////////////////////////////////////
+  //Use the checkpointer to initialize the RNGs and the gauge field, writing the resulting gauge field into U.
-
+  //This is called automatically by Run but may be useful elsewhere, e.g. for integrator tuning experiments
-private:
+  void initializeGaugeFieldAndRNGs(Field &U){
-  template <class SmearingPolicy>
+    if(!Resources.haveRNGs()) Resources.AddRNGs();
  void Runner(SmearingPolicy &Smearing) {
    auto UGrid = Resources.GetCartesian();
    Resources.AddRNGs();
    Field U(UGrid);
    // Can move this outside?
    typedef IntegratorType<SmearingPolicy> TheIntegrator;
    TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing);
    if (Parameters.StartingType == "HotStart") {
      // Hot start
@@ -159,14 +151,40 @@ private:
      Resources.GetCheckPointer()->CheckpointRestore(Parameters.StartTrajectory, U,
 						     Resources.GetSerialRNG(),
 						     Resources.GetParallelRNG());
    } else if (Parameters.StartingType == "CheckpointStartReseed") {
      // Same as CheckpointRestart but reseed the RNGs using the fixed integer seeding used for ColdStart and HotStart
      // Useful for creating new evolution streams from an existing stream
      // WARNING: Unfortunately because the checkpointer doesn't presently allow us to separately restore the RNG and gauge fields we have to load
      // an existing RNG checkpoint first; make sure one is available and named correctly
      Resources.GetCheckPointer()->CheckpointRestore(Parameters.StartTrajectory, U,
 						     Resources.GetSerialRNG(),
 						     Resources.GetParallelRNG());
      Resources.SeedFixedIntegers();      
    } else {
      // others
      std::cout << GridLogError << "Unrecognized StartingType\n";
      std::cout
 	<< GridLogError
-	<< "Valid [HotStart, ColdStart, TepidStart, CheckpointStart]\n";
+	<< "Valid [HotStart, ColdStart, TepidStart, CheckpointStart, CheckpointStartReseed]\n";
      exit(1);
    }
  }
  //////////////////////////////////////////////////////////////////
 private:
  template <class SmearingPolicy>
  void Runner(SmearingPolicy &Smearing) {
    auto UGrid = Resources.GetCartesian();
    Field U(UGrid);
    initializeGaugeFieldAndRNGs(U);
    typedef IntegratorType<SmearingPolicy> TheIntegrator;
    TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing);
    Smearing.set_Field(U);
--- a/Grid/qcd/hmc/HMC.h
+++ b/Grid/qcd/hmc/HMC.h
@@ -115,21 +115,21 @@ private:
    random(sRNG, rn_test);
-    std::cout << GridLogMessage
+    std::cout << GridLogHMC
              << "--------------------------------------------------\n";
-    std::cout << GridLogMessage << "exp(-dH) = " << prob
+    std::cout << GridLogHMC << "exp(-dH) = " << prob
              << "  Random = " << rn_test << "\n";
-    std::cout << GridLogMessage
+    std::cout << GridLogHMC
              << "Acc. Probability = " << ((prob < 1.0) ? prob : 1.0) << "\n";
    if ((prob > 1.0) || (rn_test <= prob)) {  // accepted
-      std::cout << GridLogMessage << "Metropolis_test -- ACCEPTED\n";
+      std::cout << GridLogHMC << "Metropolis_test -- ACCEPTED\n";
-      std::cout << GridLogMessage
+      std::cout << GridLogHMC
                << "--------------------------------------------------\n";
      return true;
    } else {  // rejected
-      std::cout << GridLogMessage << "Metropolis_test -- REJECTED\n";
+      std::cout << GridLogHMC << "Metropolis_test -- REJECTED\n";
-      std::cout << GridLogMessage
+      std::cout << GridLogHMC
                << "--------------------------------------------------\n";
      return false;
    }
@@ -145,7 +145,7 @@ private:
    std::streamsize current_precision = std::cout.precision();
    std::cout.precision(15);
-    std::cout << GridLogMessage << "Total H before trajectory = " << H0 << "\n";
+    std::cout << GridLogHMC << "Total H before trajectory = " << H0 << "\n";
    std::cout.precision(current_precision);
    TheIntegrator.integrate(U);
@@ -165,7 +165,7 @@ private:
    std::cout.precision(15);
-    std::cout << GridLogMessage << "Total H after trajectory  = " << H1
+    std::cout << GridLogHMC << "Total H after trajectory  = " << H1
 	      << "  dH = " << H1 - H0 << "\n";
    std::cout.precision(current_precision);
@@ -196,9 +196,9 @@ public:
    // Actual updates (evolve a copy Ucopy then copy back eventually)
    unsigned int FinalTrajectory = Params.Trajectories + Params.NoMetropolisUntil + Params.StartTrajectory;
    for (int traj = Params.StartTrajectory; traj < FinalTrajectory; ++traj) {
-      std::cout << GridLogMessage << "-- # Trajectory = " << traj << "\n";
+      std::cout << GridLogHMC << "-- # Trajectory = " << traj << "\n";
      if (traj < Params.StartTrajectory + Params.NoMetropolisUntil) {
-      	std::cout << GridLogMessage << "-- Thermalization" << std::endl;
+      	std::cout << GridLogHMC << "-- Thermalization" << std::endl;
      }
      double t0=usecond();
@@ -207,10 +207,10 @@ public:
      DeltaH = evolve_hmc_step(Ucopy);
      // Metropolis-Hastings test
      bool accept = true;
-      if (traj >= Params.StartTrajectory + Params.NoMetropolisUntil) {
+      if (Params.MetropolisTest && traj >= Params.StartTrajectory + Params.NoMetropolisUntil) {
        accept = metropolis_test(DeltaH);
      } else {
-      	std::cout << GridLogMessage << "Skipping Metropolis test" << std::endl;
+      	std::cout << GridLogHMC << "Skipping Metropolis test" << std::endl;
      }
      if (accept)
@@ -219,7 +219,7 @@ public:
      double t1=usecond();
-      std::cout << GridLogMessage << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl;
+      std::cout << GridLogHMC << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl;
      for (int obs = 0; obs < Observables.size(); obs++) {
@@ -228,7 +228,7 @@ public:
      	std::cout << GridLogDebug << "Observables pointer " << Observables[obs] << std::endl;
        Observables[obs]->TrajectoryComplete(traj + 1, Ucur, sRNG, pRNG);
      }
-      std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl;
+      std::cout << GridLogHMC << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl;
    }
  }
--- a/Grid/qcd/hmc/HMCModules.h
+++ b/Grid/qcd/hmc/HMCModules.h
@@ -80,7 +80,9 @@ public:
      std::cout << GridLogError << "Seeds not initialized" << std::endl;
      exit(1);
    }
    std::cout << GridLogMessage << "Reseeding serial RNG with seed vector " << SerialSeeds << std::endl;
    sRNG_.SeedFixedIntegers(SerialSeeds);
    std::cout << GridLogMessage << "Reseeding parallel RNG with seed vector " << ParallelSeeds << std::endl;
    pRNG_->SeedFixedIntegers(ParallelSeeds);
  }
 };
--- a/Grid/qcd/hmc/HMCResourceManager.h
+++ b/Grid/qcd/hmc/HMCResourceManager.h
@@ -226,6 +226,9 @@ public:
  //////////////////////////////////////////////////////
  // Random number generators
  //////////////////////////////////////////////////////
  //Query whether the RNG objects have been instantiated (reports the have_RNG flag)
  bool haveRNGs() const {
    return have_RNG;
  }
  void AddRNGs(std::string s = "") {
    // Couple the RNGs to the GridModule tagged by s
--- a/Grid/qcd/hmc/UsingHMC.md
+++ b/Grid/qcd/hmc/UsingHMC.md
@@ -1,61 +1,63 @@
-Using HMC in Grid version 0.5.1
+# Using HMC in Grid
-These are the instructions to use the Generalised HMC on Grid version 0.5.1.
+These are the instructions to use the Generalised HMC on Grid as of commit `749b802`.
-Disclaimer: GRID is still under active development so any information here can be changed in future releases.
+Disclaimer: Grid is still under active development so any information here can be changed in future releases.
-Command line options
+## Command line options
-===================
+
-(relevant file GenericHMCrunner.h)
+(relevant file `GenericHMCrunner.h`)
 The initial configuration can be changed at the command line using 
--StartType <your choice>
+`--StartingType STARTING_TYPE`, where `STARTING_TYPE` is one of
-valid choices, one among these
+`HotStart`, `ColdStart`, `TepidStart`, and `CheckpointStart`.
-HotStart, ColdStart, TepidStart, CheckpointStart
+Default: `--StartingType HotStart`
 default: HotStart
-example
+Example:
-./My_hmc_exec  --StartType HotStart
+```
 ./My_hmc_exec  --StartingType HotStart
 ```
-The CheckpointStart option uses the prefix for the configurations and rng seed files defined in your executable and the initial configuration is specified by
+The `CheckpointStart` option uses the prefix for the configurations and rng seed files defined in your executable and the initial configuration is specified by
--StartTrajectory <integer>
+`--StartingTrajectory STARTING_TRAJECTORY`, where `STARTING_TRAJECTORY` is an integer.
-default: 0
+Default: `--StartingTrajectory 0`
 The number of trajectories for a specific run is specified at the command line by
--Trajectories <integer>
+`--Trajectories TRAJECTORIES`, where `TRAJECTORIES` is an integer.
-default: 1
+Default: `--Trajectories 1`
 The number of thermalization steps (i.e. steps when the Metropolis acceptance check is turned off) is specified by
--Thermalizations <integer>
+`--Thermalizations THERMALIZATIONS`, where `THERMALIZATIONS` is an integer.
-default: 10
+Default: `--Thermalizations 10`
 Any other parameter is defined in the source for the executable.
-HMC controls
+## HMC controls
 ===========
 The lines 
 ```
  std::vector<int> SerSeed({1, 2, 3, 4, 5});
  std::vector<int> ParSeed({6, 7, 8, 9, 10});
 ```
 define the seeds for the serial and the parallel RNG.
 The line 
 ```
  TheHMC.MDparameters.set(20, 1.0);// MDsteps, traj length
 ```
 declares the number of molecular dynamics steps and the total trajectory length.
-Actions
+## Actions
 ======
-Action names are defined in the file
+Action names are defined in the directory `Grid/qcd/action`.
 lib/qcd/Actions.h
-Gauge actions list:
+Gauge actions list (from `Grid/qcd/action/gauge/Gauge.h`):
 ```
 WilsonGaugeActionR;
 WilsonGaugeActionF;
 WilsonGaugeActionD;
@@ -68,8 +70,9 @@ IwasakiGaugeActionD;
 SymanzikGaugeActionR;
 SymanzikGaugeActionF;
 SymanzikGaugeActionD;
 ```
-
+```
 ConjugateWilsonGaugeActionR;
 ConjugateWilsonGaugeActionF;
 ConjugateWilsonGaugeActionD;
@@ -82,26 +85,23 @@ ConjugateIwasakiGaugeActionD;
 ConjugateSymanzikGaugeActionR;
 ConjugateSymanzikGaugeActionF;
 ConjugateSymanzikGaugeActionD;
 ```
 Each of these actions accepts a single parameter at creation time (beta).
 Example for creating a Symanzik action with beta=4.0
 ```
  SymanzikGaugeActionR(4.0)
 ```
 Scalar actions list (from `Grid/qcd/action/scalar/Scalar.h`):
 ```
 ScalarActionR;
 ScalarActionF;
 ScalarActionD;
 ```
-
+The suffixes `R`, `F`, `D` in the action names refer to the `Real`
-each of these action accept one single parameter at creation time (beta).
+(the precision is defined at compile time by the `--enable-precision` flag in the configure),
-Example for creating a Symanzik action with beta=4.0
+`Float` and `Double`, which force the precision of the action to be 32 and 64 bit respectively.
 	SymanzikGaugeActionR(4.0)
 The suffixes R,F,D in the action names refer to the Real
 (the precision is defined at compile time by the --enable-precision flag in the configure),
 Float and Double, that force the precision of the action to be 32, 64 bit respectively.
--- a/Grid/qcd/hmc/integrators/Integrator.h
+++ b/Grid/qcd/hmc/integrators/Integrator.h
@@ -136,8 +136,14 @@ protected:
      if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
      force = FieldImplementation::projectForce(force); // Ta for gauge fields
      double end_force = usecond();
-      Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites());
+
-      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << std::endl;
+      Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x]) 
      Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    
      Real max_force_abs = std::sqrt(maxLocalNorm2(force));
      Real max_impulse_abs = max_force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    
      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << " Max force: " << max_force_abs << " Time step: " << ep << " Impulse average: " << impulse_abs << " Max impulse: " << max_impulse_abs << std::endl;
      Mom -= force * ep* HMC_MOMENTUM_DENOMINATOR;; 
      double end_full = usecond();
      double time_full  = (end_full - start_full) / 1e3;
@@ -249,15 +255,19 @@ public:
  void refresh(Field& U,  GridSerialRNG & sRNG, GridParallelRNG& pRNG) 
  {
    assert(P.Grid() == U.Grid());
-    std::cout << GridLogIntegrator << "Integrator refresh\n";
+    std::cout << GridLogIntegrator << "Integrator refresh" << std::endl;
    std::cout << GridLogIntegrator << "Generating momentum" << std::endl;
    FieldImplementation::generate_momenta(P, sRNG, pRNG);
    // Update the smeared fields, can be implemented as observer
    // necessary to keep the fields updated even after a reject
    // of the Metropolis
    std::cout << GridLogIntegrator << "Updating smeared fields" << std::endl;
    Smearer.set_Field(U);
    // Set the (eventual) representations gauge fields
    std::cout << GridLogIntegrator << "Updating representations" << std::endl;
    Representations.update(U);
    // The Smearer is attached to a pointer of the gauge field
@@ -267,6 +277,7 @@ public:
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
        // get gauge field from the SmearingPolicy and
        // based on the boolean is_smeared in actionID
 	std::cout << GridLogIntegrator << "Refreshing integrator level " << level << " index " << actionID << std::endl;
        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
        as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG);
      }
--- a/Grid/qcd/observables/topological_charge.h
+++ b/Grid/qcd/observables/topological_charge.h
@@ -99,7 +99,7 @@ public:
 	// using wilson flow by default here
 	WilsonFlow<PeriodicGimplR> WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval);
 	WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau);
-	Real T0   = WF.energyDensityPlaquette(Usmear);
+	Real T0   = WF.energyDensityPlaquette(Pars.Smearing.maxTau, Usmear);
 	std::cout << GridLogMessage << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
 		  << "T0                : [ " << traj << " ] "<< T0 << std::endl;
      }
--- a/Grid/qcd/smearing/WilsonFlow.h
+++ b/Grid/qcd/smearing/WilsonFlow.h
@@ -7,6 +7,7 @@ Source file: ./lib/qcd/modules/plaquette.h
 Copyright (C) 2017
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
 Author: Christopher Kelly <ckelly@bnl.gov>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -33,28 +34,44 @@ NAMESPACE_BEGIN(Grid);
 template <class Gimpl>
 class WilsonFlow: public Smear<Gimpl>{
 public:
  //Store generic measurements to take during smearing process using std::function
  typedef std::function<void(int, RealD, const typename Gimpl::GaugeField &)> FunctionType;  //int: step,  RealD: flow time,  GaugeField : the gauge field
 private:
  unsigned int Nstep;
-  unsigned int measure_interval;
+  RealD epsilon; //for regular smearing this is the time step, for adaptive it is the initial time step
-  mutable RealD epsilon, taus;
+ 
-
+  std::vector< std::pair<int, FunctionType> > functions; //The int maps to the measurement frequency
  mutable WilsonGaugeAction<Gimpl> SG;
-  void evolve_step(typename Gimpl::GaugeField&) const;
+  //Evolve the gauge field by 1 step and update tau
-  void evolve_step_adaptive(typename Gimpl::GaugeField&, RealD);
+  void evolve_step(typename Gimpl::GaugeField &U, RealD &tau) const;
-  RealD tau(unsigned int t)const {return epsilon*(t+1.0); }
+  //Evolve the gauge field by 1 step and update tau and the current time step eps
  void evolve_step_adaptive(typename Gimpl::GaugeField&U, RealD &tau, RealD &eps, RealD maxTau) const;
 public:
  INHERIT_GIMPL_TYPES(Gimpl)
  //Discard every measurement currently registered in 'functions'
  void resetActions(){
    functions.clear();
  }
  //Register a measurement callback 'meas' to be invoked on every step that is a multiple of 'meas_interval'.
  //The callback receives (step, flow time, current gauge field); measurements are appended, not replaced.
  void addMeasurement(int meas_interval, FunctionType meas){
    assert(meas_interval != 0); //the smearing loops evaluate 'step % meas_interval'; zero would be a modulo-by-zero
    functions.push_back({meas_interval, meas});
  }
  //Set the class to perform the default measurements: 
  //the plaquette energy density every step
  //the plaquette topological charge every 'topq_meas_interval' steps
  //and output to stdout
  void setDefaultMeasurements(int topq_meas_interval = 1);
  //Construct a flow of Nstep steps with (initial) step size epsilon, which must be positive.
  //'interval' is passed to setDefaultMeasurements as the topological charge measurement interval;
  //the default measurements are installed on construction.
  explicit WilsonFlow(unsigned int Nstep, RealD epsilon, unsigned int interval = 1):
    Nstep(Nstep),
    epsilon(epsilon),
    SG(WilsonGaugeAction<Gimpl>(3.0)) { // WilsonGaugeAction with beta 3.0
    assert(epsilon > 0.0);
    LogMessage();
    setDefaultMeasurements(interval);
  }
  void LogMessage() {
@@ -73,9 +90,29 @@ public:
    // undefined for WilsonFlow
  }
-  void smear_adaptive(GaugeField&, const GaugeField&, RealD maxTau);
+  void smear_adaptive(GaugeField&, const GaugeField&, RealD maxTau) const;
-  RealD energyDensityPlaquette(unsigned int step, const GaugeField& U) const;
+
-  RealD energyDensityPlaquette(const GaugeField& U) const;
+  //Compute t^2 <E(t)> for time t from the plaquette
  static RealD energyDensityPlaquette(const RealD t, const GaugeField& U);
  //Compute t^2 <E(t)> for time t from the 1x1 cloverleaf form
  //t is the Wilson flow time
  static RealD energyDensityCloverleaf(const RealD t, const GaugeField& U);
  //Evolve the gauge field by Nstep steps of epsilon and return the energy density computed every interval steps
  //The smeared field is output as V
  std::vector<RealD> flowMeasureEnergyDensityPlaquette(GaugeField &V, const GaugeField& U, int measure_interval = 1);
  //Version that does not return the smeared field
  std::vector<RealD> flowMeasureEnergyDensityPlaquette(const GaugeField& U, int measure_interval = 1);
  //Evolve the gauge field by Nstep steps of epsilon and return the Cloverleaf energy density computed every interval steps
  //The smeared field is output as V
  std::vector<RealD> flowMeasureEnergyDensityCloverleaf(GaugeField &V, const GaugeField& U, int measure_interval = 1);
  //Version that does not return the smeared field
  std::vector<RealD> flowMeasureEnergyDensityCloverleaf(const GaugeField& U, int measure_interval = 1);
 };
@@ -83,7 +120,7 @@ public:
 // Implementations
 ////////////////////////////////////////////////////////////////////////////////
 template <class Gimpl>
-void WilsonFlow<Gimpl>::evolve_step(typename Gimpl::GaugeField &U) const{
+void WilsonFlow<Gimpl>::evolve_step(typename Gimpl::GaugeField &U, RealD &tau) const{
  GaugeField Z(U.Grid());
  GaugeField tmp(U.Grid());
  SG.deriv(U, Z);
@@ -99,12 +136,13 @@ void WilsonFlow<Gimpl>::evolve_step(typename Gimpl::GaugeField &U) const{
  SG.deriv(U, tmp); Z += tmp;                 // 4/3*(17/36*Z0 -8/9*Z1) +Z2
  Z *= 3.0/4.0;                               // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2
  Gimpl::update_field(Z, U, -2.0*epsilon);    // V(t+e) = exp(ep*Z)*W2
  tau += epsilon;
 }
 template <class Gimpl>
-void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, RealD maxTau) {
+void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, RealD &tau, RealD &eps, RealD maxTau) const{
-  if (maxTau - taus < epsilon){
+  if (maxTau - tau < eps){
-    epsilon = maxTau-taus;
+    eps = maxTau-tau;
  }
  //std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
  GaugeField Z(U.Grid());
@@ -114,95 +152,151 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
  SG.deriv(U, Z);
  Zprime = -Z;
  Z *= 0.25;                                  // Z0 = 1/4 * F(U)
-  Gimpl::update_field(Z, U, -2.0*epsilon);    // U = W1 = exp(ep*Z0)*W0
+  Gimpl::update_field(Z, U, -2.0*eps);    // U = W1 = exp(ep*Z0)*W0
  Z *= -17.0/8.0;
  SG.deriv(U, tmp); Z += tmp;                 // -17/32*Z0 +Z1
  Zprime += 2.0*tmp;
  Z *= 8.0/9.0;                               // Z = -17/36*Z0 +8/9*Z1
-  Gimpl::update_field(Z, U, -2.0*epsilon);    // U_= W2 = exp(ep*Z)*W1
+  Gimpl::update_field(Z, U, -2.0*eps);    // U_= W2 = exp(ep*Z)*W1
  Z *= -4.0/3.0;
  SG.deriv(U, tmp); Z += tmp;                 // 4/3*(17/36*Z0 -8/9*Z1) +Z2
  Z *= 3.0/4.0;                               // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2
-  Gimpl::update_field(Z, U, -2.0*epsilon);    // V(t+e) = exp(ep*Z)*W2
+  Gimpl::update_field(Z, U, -2.0*eps);    // V(t+e) = exp(ep*Z)*W2
  // Ramos 
-  Gimpl::update_field(Zprime, Uprime, -2.0*epsilon); // V'(t+e) = exp(ep*Z')*W0
+  Gimpl::update_field(Zprime, Uprime, -2.0*eps); // V'(t+e) = exp(ep*Z')*W0
  // Compute distance as norm^2 of the difference
  GaugeField diffU = U - Uprime;
  RealD diff = norm2(diffU);
  // adjust integration step
-  taus += epsilon;
+  tau += eps;
  //std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
-  epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.);
+  eps = eps*0.95*std::pow(1e-4/diff,1./3.);
  //std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
 }
 template <class Gimpl>
-RealD WilsonFlow<Gimpl>::energyDensityPlaquette(unsigned int step, const GaugeField& U) const {
+RealD WilsonFlow<Gimpl>::energyDensityPlaquette(const RealD t, const GaugeField& U){
-  RealD td = tau(step);
+  static WilsonGaugeAction<Gimpl> SG(3.0);
-  return 2.0 * td * td * SG.S(U)/U.Grid()->gSites();
+  return 2.0 * t * t * SG.S(U)/U.Grid()->gSites();
 }
 //Compute t^2 <E(t)> at Wilson flow time t, with the field strength taken from the 1x1 cloverleaf form
 template <class Gimpl>
 RealD WilsonFlow<Gimpl>::energyDensityCloverleaf(const RealD t, const GaugeField& U){
  typedef typename Gimpl::GaugeLinkField GaugeMat;
  typedef typename Gimpl::GaugeField GaugeLorentz;

  assert(Nd == 4);

  //E = 1/2 tr( F_munu F_munu )
  //Because F_numu = -F_munu the factor 1/2 is absorbed by summing tr(F^2) over the 6 planes with mu < nu only:
  //F_01 F_02 F_03   F_12 F_13  F_23
  GridBase *grid = U.Grid();
  GaugeMat Fmunu(grid);
  LatticeComplexD trFF(grid);
  trFF = Zero();

  for(int mu=0;mu<3;mu++){
    for(int nu=mu+1;nu<4;nu++){
      WilsonLoops<Gimpl>::FieldStrength(Fmunu, U, mu, nu);
      trFF = trFF + trace(Fmunu*Fmunu);
    }
  }

  //Global sum, scaled by t^2 and normalised by the number of lattice sites
  const RealD volume = RealD(grid->gSites());
  ComplexD total = sum(trFF);
  total = t*t*total / volume;
  return -real(total); //minus sign necessary for +ve energy
 }
 //Flow U with smear(), writing the flowed field to V, and return the plaquette energy density
 //t^2 <E(t)> recorded every 'measure_interval' steps.
 //Any previously registered measurements are removed, and none remain registered on return.
 template <class Gimpl>
 std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityPlaquette(GaugeField &V, const GaugeField& U, int measure_interval){
  std::vector<RealD> out;
  resetActions();
  addMeasurement(measure_interval, [&out](int step, RealD t, const typename Gimpl::GaugeField &U){ 
      std::cout << GridLogMessage << "[WilsonFlow] Computing plaquette energy density for step " << step << std::endl;
      out.push_back( energyDensityPlaquette(t,U) );
    });      
  smear(V,U);
  //The callback holds a reference to the local 'out'; drop it so that a later smear() on this
  //object cannot write through a dangling reference
  resetActions();
  return out;
 }
 template <class Gimpl>
-RealD WilsonFlow<Gimpl>::energyDensityPlaquette(const GaugeField& U) const {
+std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityPlaquette(const GaugeField& U, int measure_interval){
-  return 2.0 * taus * taus * SG.S(U)/U.Grid()->gSites();
+  GaugeField V(U);
  return flowMeasureEnergyDensityPlaquette(V,U, measure_interval);
 }
 //Flow U with smear(), writing the flowed field to V, and return the cloverleaf energy density
 //t^2 <E(t)> recorded every 'measure_interval' steps.
 //Any previously registered measurements are removed, and none remain registered on return.
 template <class Gimpl>
 std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityCloverleaf(GaugeField &V, const GaugeField& U, int measure_interval){
  std::vector<RealD> out;
  resetActions();
  addMeasurement(measure_interval, [&out](int step, RealD t, const typename Gimpl::GaugeField &U){ 
      std::cout << GridLogMessage << "[WilsonFlow] Computing Cloverleaf energy density for step " << step << std::endl;
      out.push_back( energyDensityCloverleaf(t,U) );
    });      
  smear(V,U);
  //The callback holds a reference to the local 'out'; drop it so that a later smear() on this
  //object cannot write through a dangling reference
  resetActions();
  return out;
 }
 //Version that does not return the smeared field: the flowed field lives in a temporary and is discarded
 template <class Gimpl>
 std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityCloverleaf(const GaugeField& U, int measure_interval){
  //Workspace only: smear() begins by assigning its output from U, so copying U here would be redundant
  GaugeField V(U.Grid());
  return flowMeasureEnergyDensityCloverleaf(V,U, measure_interval);
 }
 //#define WF_TIMING 
 template <class Gimpl>
-void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const {
+void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const{
  out = in;
-  for (unsigned int step = 1; step <= Nstep; step++) {
+  RealD taus = 0.;
  for (unsigned int step = 1; step <= Nstep; step++) { //step indicates the number of smearing steps applied at the time of measurement
    auto start = std::chrono::high_resolution_clock::now();
-    evolve_step(out);
+    evolve_step(out, taus);
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
 #ifdef WF_TIMING
    std::cout << "Time to evolve " << diff.count() << " s\n";
 #endif
-    std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
+    //Perform measurements
-		  << step << "  " << tau(step) << "  " 
+    for(auto const &meas : functions)
-	      << energyDensityPlaquette(step,out) << std::endl;
+      if( step % meas.first == 0 ) meas.second(step,taus,out);
    if( step % measure_interval == 0){
      std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "
 		<< step << "  " 
 		<< WilsonLoops<PeriodicGimplR>::TopologicalCharge(out) << std::endl;
    }
  }
 }
 template <class Gimpl>
-void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, RealD maxTau){
+void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, RealD maxTau) const{
  out = in;
-  taus = epsilon;
+  RealD taus = 0.;
  RealD eps = epsilon;
  unsigned int step = 0;
  do{
    step++;
    //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
-    evolve_step_adaptive(out, maxTau);
+    evolve_step_adaptive(out, taus, eps, maxTau);
-    std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
+    //Perform measurements
-		  << step << "  " << taus << "  "
+    for(auto const &meas : functions)
-	      << energyDensityPlaquette(out) << std::endl;
+      if( step % meas.first == 0 ) meas.second(step,taus,out);
    if( step % measure_interval == 0){
      std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "
 		<< step << "  " 
 		<< WilsonLoops<PeriodicGimplR>::TopologicalCharge(out) << std::endl;
    }
  } while (taus < maxTau);
 }
 //Append the default measurements, both printed to stdout:
 // - the plaquette energy density after every step
 // - the topological charge every 'topq_meas_interval' steps
 //Existing measurements are left in place (nothing is cleared here).
 template <class Gimpl>
 void WilsonFlow<Gimpl>::setDefaultMeasurements(int topq_meas_interval){
  typedef typename Gimpl::GaugeField FlowedField;

  auto printEnergyDensity = [](int step, RealD t, const FlowedField &U){
    std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "  << step << "  " << t << "  " << energyDensityPlaquette(t,U) << std::endl;
  };
  auto printTopCharge = [](int step, RealD t, const FlowedField &U){
    std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "  << step << "  " << WilsonLoops<Gimpl>::TopologicalCharge(U) << std::endl;
  };

  addMeasurement(1, printEnergyDensity);
  addMeasurement(topq_meas_interval, printTopCharge);
 }
 NAMESPACE_END(Grid);
--- a/Grid/qcd/utils/CovariantCshift.h
+++ b/Grid/qcd/utils/CovariantCshift.h
@@ -88,6 +88,12 @@ namespace PeriodicBC {
    return CovShiftBackward(Link,mu,arg);
  }
  //Boundary-aware C-shift of gauge links / gauge transformation matrices
  //In this namespace it forwards directly to the plain Cshift with the same arguments
  template<class gauge> Lattice<gauge>
  CshiftLink(const Lattice<gauge> &Link, int mu, int shift)
  {
    Lattice<gauge> shifted = Cshift(Link, mu, shift);
    return shifted;
  }
 }
@@ -158,6 +164,9 @@ namespace ConjugateBC {
    //    std::cout<<"Gparity::CovCshiftBackward mu="<<mu<<std::endl;
    return Cshift(tmp,mu,-1);// moves towards positive mu
  }
  //Out(x) = U^dag_\mu(x-mu)  | x_\mu != 0
  //       = U^T_\mu(L-1)  | x_\mu == 0
  template<class gauge> Lattice<gauge>
  CovShiftIdentityBackward(const Lattice<gauge> &Link, int mu) {
    GridBase *grid = Link.Grid();
@@ -176,6 +185,9 @@ namespace ConjugateBC {
    return Link;
  }
  //Out(x) = S_\mu(x+\hat\mu)  | x_\mu != L-1
  //       = S*_\mu(0)  | x_\mu == L-1
  //Note: While this is used for Staples it is also applicable for shifting gauge links or gauge transformation matrices
  template<class gauge> Lattice<gauge>
  ShiftStaple(const Lattice<gauge> &Link, int mu)
  {
@@ -208,6 +220,35 @@ namespace ConjugateBC {
    return CovShiftBackward(Link,mu,arg);
  }
  //Boundary-aware C-shift of gauge links / gauge transformation matrices
  //(L denotes the global extent in direction mu)
  //shift = 1
  //Out(x) = U_\mu(x+\hat\mu)  | x_\mu != L-1
  //       = U*_\mu(0)  | x_\mu == L-1
  //shift = -1
  //Out(x) = U_\mu(x-mu)  | x_\mu != 0
  //       = U*_\mu(L-1)  | x_\mu == 0
  //Any other shift value trips an assert
  template<class gauge> Lattice<gauge>
  CshiftLink(const Lattice<gauge> &Link, int mu, int shift)
  {
    GridBase *grid = Link.Grid();
    int lastSite = grid->GlobalDimensions()[mu] - 1; //coordinate L-1, the final site in direction mu
    Lattice<iScalar<vInteger>> xmu(grid);
    LatticeCoordinate(xmu, mu);

    Lattice<gauge> shifted(grid);
    switch(shift){
    case 1:
      //Shift first, then conjugate the values that landed on x_mu = L-1
      shifted = Cshift(Link, mu, 1);
      shifted = where(xmu == lastSite, conjugate(shifted), shifted);
      return shifted;
    case -1:
      //Conjugate the x_mu = L-1 values first; the shift then moves them to x_mu = 0
      shifted = Link;
      shifted = where(xmu == lastSite, conjugate(shifted), shifted);
      return Cshift(shifted, mu, -1);
    default:
      assert(0 && "Invalid shift value");
    }
    return shifted; //shuts up the compiler fussing about the return type
  }
 }
--- a/Grid/qcd/utils/GaugeFix.h
+++ b/Grid/qcd/utils/GaugeFix.h
@@ -40,27 +40,46 @@ public:
  typedef typename Gimpl::GaugeLinkField GaugeMat;
  typedef typename Gimpl::GaugeField GaugeLorentz;
-  static void GaugeLinkToLieAlgebraField(const std::vector<GaugeMat> &U,std::vector<GaugeMat> &A) {
+  //A_\mu(x) = -i Ta(U_\mu(x) )   where Ta(U) = 1/2( U - U^dag ) - 1/2N tr(U - U^dag)  is the traceless antihermitian part. This is an O(A^3) approximation to the logarithm of U
-    for(int mu=0;mu<Nd;mu++){
+  static void GaugeLinkToLieAlgebraField(const GaugeMat &U, GaugeMat &A) {
-      Complex cmi(0.0,-1.0);
+    Complex cmi(0.0,-1.0);
-      A[mu] = Ta(U[mu]) * cmi;
+    A = Ta(U) * cmi;
    }
  }
-  static void DmuAmu(const std::vector<GaugeMat> &A,GaugeMat &dmuAmu,int orthog) {
+  
  //The derivative of the Lie algebra field
  static void DmuAmu(const std::vector<GaugeMat> &U, GaugeMat &dmuAmu,int orthog) {
    GridBase* grid = U[0].Grid();
    GaugeMat Ax(grid);
    GaugeMat Axm1(grid);
    GaugeMat Utmp(grid);
    dmuAmu=Zero();
    for(int mu=0;mu<Nd;mu++){
      if ( mu != orthog ) {
-	dmuAmu = dmuAmu + A[mu] - Cshift(A[mu],mu,-1);
+	//Rather than define functionality to work out how the BCs apply to A_\mu we simply use the BC-aware Cshift to the gauge links and compute A_\mu(x) and A_\mu(x-1) separately
 	//Ax = A_\mu(x)
 	GaugeLinkToLieAlgebraField(U[mu], Ax);
 	//Axm1 = A_\mu(x_\mu-1)
 	Utmp = Gimpl::CshiftLink(U[mu], mu, -1);
 	GaugeLinkToLieAlgebraField(Utmp, Axm1);
 	//Derivative
 	dmuAmu = dmuAmu + Ax - Axm1;
      }
    }
  }  
-  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {
+  //Fix the gauge field Umu
  //0 < alpha < 1 is related to the step size, cf https://arxiv.org/pdf/1405.5812.pdf
  static void SteepestDescentGaugeFix(GaugeLorentz &Umu, Real alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {
    GridBase *grid = Umu.Grid();
    GaugeMat xform(grid);
    SteepestDescentGaugeFix(Umu,xform,alpha,maxiter,Omega_tol,Phi_tol,Fourier,orthog);
  }
-  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {
+
  //Fix the gauge field Umu and also return the gauge transformation from the original gauge field, xform
  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform, Real alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {
    GridBase *grid = Umu.Grid();
@@ -122,27 +141,24 @@ public:
      }
    }
    assert(0 && "Gauge fixing did not converge within the specified number of iterations");
  };
-  static Real SteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform,Real & alpha, GaugeMat & dmuAmu,int orthog) {
+  static Real SteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform, Real alpha, GaugeMat & dmuAmu,int orthog) {
    GridBase *grid = U[0].Grid();
    std::vector<GaugeMat> A(Nd,grid);
    GaugeMat g(grid);
-
+    ExpiAlphaDmuAmu(U,g,alpha,dmuAmu,orthog);
    GaugeLinkToLieAlgebraField(U,A);
    ExpiAlphaDmuAmu(A,g,alpha,dmuAmu,orthog);
    Real vol = grid->gSites();
    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
    xform = g*xform ;
-    SU<Nc>::GaugeTransform(U,g);
+    SU<Nc>::GaugeTransform<Gimpl>(U,g);
    return trG;
  }
-  static Real FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform,Real & alpha, GaugeMat & dmuAmu,int orthog) {
+  static Real FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform, Real alpha, GaugeMat & dmuAmu,int orthog) {
    GridBase *grid = U[0].Grid();
@@ -157,11 +173,7 @@ public:
    GaugeMat g(grid);
    GaugeMat dmuAmu_p(grid);
-    std::vector<GaugeMat> A(Nd,grid);
+    DmuAmu(U,dmuAmu,orthog);
    GaugeLinkToLieAlgebraField(U,A);
    DmuAmu(A,dmuAmu,orthog);
    std::vector<int> mask(Nd,1);
    for(int mu=0;mu<Nd;mu++) if (mu==orthog) mask[mu]=0;
@@ -205,16 +217,16 @@ public:
    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
    xform = g*xform ;
-    SU<Nc>::GaugeTransform(U,g);
+    SU<Nc>::GaugeTransform<Gimpl>(U,g);
    return trG;
  }
-  static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &A,GaugeMat &g,Real & alpha, GaugeMat &dmuAmu,int orthog) {
+  static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &U,GaugeMat &g, Real alpha, GaugeMat &dmuAmu,int orthog) {
    GridBase *grid = g.Grid();
    Complex cialpha(0.0,-alpha);
    GaugeMat ciadmam(grid);
-    DmuAmu(A,dmuAmu,orthog);
+    DmuAmu(U,dmuAmu,orthog);
    ciadmam = dmuAmu*cialpha;
    SU<Nc>::taExp(ciadmam,g);
  }  
--- a/Grid/qcd/utils/SUn.h
+++ b/Grid/qcd/utils/SUn.h
@@ -694,32 +694,32 @@ public:
 * Adjoint rep gauge xform
 */
-  template<typename GaugeField,typename GaugeMat>
+  template<typename Gimpl>
-  static void GaugeTransform( GaugeField &Umu, GaugeMat &g){
+  static void GaugeTransform(typename Gimpl::GaugeField &Umu, typename Gimpl::GaugeLinkField &g){
    GridBase *grid = Umu.Grid();
    conformable(grid,g.Grid());
-    GaugeMat U(grid);
+    typename Gimpl::GaugeLinkField U(grid);
-    GaugeMat ag(grid); ag = adj(g);
+    typename Gimpl::GaugeLinkField ag(grid); ag = adj(g);
    for(int mu=0;mu<Nd;mu++){
      U= PeekIndex<LorentzIndex>(Umu,mu);
-      U = g*U*Cshift(ag, mu, 1);
+      U = g*U*Gimpl::CshiftLink(ag, mu, 1); //BC-aware
      PokeIndex<LorentzIndex>(Umu,U,mu);
    }
  }
-  template<typename GaugeMat>
+  template<typename Gimpl>
-  static void GaugeTransform( std::vector<GaugeMat> &U, GaugeMat &g){
+  static void GaugeTransform( std::vector<typename Gimpl::GaugeLinkField> &U, typename Gimpl::GaugeLinkField &g){
    GridBase *grid = g.Grid();
-    GaugeMat ag(grid); ag = adj(g);
+    typename Gimpl::GaugeLinkField ag(grid); ag = adj(g);
    for(int mu=0;mu<Nd;mu++){
-      U[mu] = g*U[mu]*Cshift(ag, mu, 1);
+      U[mu] = g*U[mu]*Gimpl::CshiftLink(ag, mu, 1); //BC-aware
    }
  }
-  template<typename GaugeField,typename GaugeMat>
+  template<typename Gimpl>
-  static void RandomGaugeTransform(GridParallelRNG &pRNG, GaugeField &Umu, GaugeMat &g){
+  static void RandomGaugeTransform(GridParallelRNG &pRNG, typename Gimpl::GaugeField &Umu, typename Gimpl::GaugeLinkField &g){
    LieRandomize(pRNG,g,1.0);
-    GaugeTransform(Umu,g);
+    GaugeTransform<Gimpl>(Umu,g);
  }
  // Projects the algebra components a lattice matrix (of dimension ncol*ncol -1 )
--- a/Grid/qcd/utils/WilsonLoops.h
+++ b/Grid/qcd/utils/WilsonLoops.h
@@ -125,6 +125,56 @@ public:
    return sumplaq / vol / faces / Nc; // Nd , Nc dependent... FIXME
  }
  //////////////////////////////////////////////////
  // sum over all spatial planes of plaquette
  //////////////////////////////////////////////////
  static void siteSpatialPlaquette(ComplexField &Plaq,
                                   const std::vector<GaugeMat> &U) {
    // Accumulate the traced plaquette of every plane (mu,nu) with nu < mu < Nd-1,
    // i.e. every plane that does not involve the last (Nd-1) direction.
    const int nSpatial = Nd - 1;
    ComplexField planePlaq(U[0].Grid());
    Plaq = Zero();
    for (int mu = 1; mu < nSpatial; ++mu)
      for (int nu = 0; nu < mu; ++nu) {
        traceDirPlaquette(planePlaq, U, mu, nu);
        Plaq = Plaq + planePlaq;
      }
  }
  ////////////////////////////////////
  // sum over all x,y,z and over all spatial planes of plaquette
  //////////////////////////////////////////////////
  static std::vector<RealD> timesliceSumSpatialPlaquette(const GaugeLorentz &Umu) {
    // Split the Lorentz-indexed field into one link field per direction
    std::vector<GaugeMat> U(Nd, Umu.Grid());
    // inefficient here
    for (int mu = 0; mu < Nd; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }
    ComplexField Plaq(Umu.Grid());
    siteSpatialPlaquette(Plaq, U);
    // Reduce the site-local sum onto slices orthogonal to direction Nd-1
    typedef typename ComplexField::scalar_object sobj;
    std::vector<sobj> Tq;
    sliceSum(Plaq, Tq, Nd-1);
    // The element type matches the declared RealD return type, so the function
    // no longer depends on Real happening to alias RealD
    const int nslices = static_cast<int>(Tq.size());
    std::vector<RealD> out(nslices);
    for(int t=0;t<nslices;t++) out[t] = TensorRemove(Tq[t]).real();
    return out;
  }
  //////////////////////////////////////////////////
  // average over all x,y,z and over all spatial planes of plaquette
  //////////////////////////////////////////////////
  static std::vector<RealD> timesliceAvgSpatialPlaquette(const GaugeLorentz &Umu) {
    // Normalisation: number of planes built from the first Nd-1 directions
    double faces = (1.0 * (Nd - 1)* (Nd - 2)) / 2.0;
    // Extent of the last direction and the number of sites on each of its slices
    int Lt = Umu.Grid()->FullDimensions()[Nd-1];
    double vol = Umu.Grid()->gSites() / Lt;
    std::vector<RealD> sumplaq = timesliceSumSpatialPlaquette(Umu);
    assert(sumplaq.size() == Lt);
    // Turn each slice sum into an average, in place
    for(int t=0;t!=Lt;++t){
      sumplaq[t] = sumplaq[t] / vol / faces / Nc; // Nd , Nc dependent... FIXME
    }
    return sumplaq;
  }
  //////////////////////////////////////////////////
  // average over all x,y,z the temporal loop
@@ -363,11 +413,11 @@ public:
    GaugeMat u = PeekIndex<LorentzIndex>(Umu, mu);  // some redundant copies
    GaugeMat vu = v*u;
      //FS = 0.25*Ta(u*v + Cshift(vu, mu, -1));
-      FS = (u*v + Cshift(vu, mu, -1));
+      FS = (u*v + Gimpl::CshiftLink(vu, mu, -1));
      FS = 0.125*(FS - adj(FS));
  }
-  static Real TopologicalCharge(GaugeLorentz &U){
+  static Real TopologicalCharge(const GaugeLorentz &U){
    // 4d topological charge
    assert(Nd==4);
    // Bx = -iF(y,z), By = -iF(z,y), Bz = -iF(x,y)
@@ -390,6 +440,203 @@ public:
  }
  //Clover-leaf Wilson loop combination for arbitrary mu-extent M and nu extent N,  mu >= nu
  //cf  https://arxiv.org/pdf/hep-lat/9701012.pdf Eq 7  for 1x2 Wilson loop    
  //Clockwise ordering
  static void CloverleafMxN(GaugeMat &FS, const GaugeMat &Umu, const GaugeMat &Unu, int mu, int nu, int M, int N){  
    //FS is overwritten with the sum of the four loops assembled below.
    //Umu and Unu are the link fields handed to the covariant shifts in directions mu and nu;
    //each loop applies M shifts along mu and N shifts along nu in each sense.
    //Shorthand, local to this function (all eight macros are #undef'd before it ends):
    // Fmu/Bmu/Fnu/Bnu apply one forward/backward covariant shift to an existing product A
    // FmuI/BmuI/FnuI/BnuI produce the first step of a loop through the CovShiftIdentity* calls
 #define Fmu(A) Gimpl::CovShiftForward(Umu, mu, A)
 #define Bmu(A) Gimpl::CovShiftBackward(Umu, mu, A)
 #define Fnu(A) Gimpl::CovShiftForward(Unu, nu, A)
 #define Bnu(A) Gimpl::CovShiftBackward(Unu, nu, A)
 #define FmuI Gimpl::CovShiftIdentityForward(Umu, mu)
 #define BmuI Gimpl::CovShiftIdentityBackward(Umu, mu)
 #define FnuI Gimpl::CovShiftIdentityForward(Unu, nu)
 #define BnuI Gimpl::CovShiftIdentityBackward(Unu, nu)
    //In every loop the first for starts at 1 because the *I macro already supplied the first step
    //Upper right loop
    GaugeMat tmp = BmuI;
    for(int i=1;i<M;i++)
      tmp = Bmu(tmp);
    for(int j=0;j<N;j++)
      tmp = Bnu(tmp);
    for(int i=0;i<M;i++)
      tmp = Fmu(tmp);
    for(int j=0;j<N;j++)
      tmp = Fnu(tmp);
    FS = tmp;
    //Upper left loop
    tmp = BnuI;
    for(int j=1;j<N;j++)
      tmp = Bnu(tmp);
    for(int i=0;i<M;i++)
      tmp = Fmu(tmp);
    for(int j=0;j<N;j++)
      tmp = Fnu(tmp);
    for(int i=0;i<M;i++)
      tmp = Bmu(tmp);
    FS = FS + tmp;
    //Lower right loop
    tmp = FnuI;
    for(int j=1;j<N;j++)
      tmp = Fnu(tmp);
    for(int i=0;i<M;i++)
      tmp = Bmu(tmp);
    for(int j=0;j<N;j++)
      tmp = Bnu(tmp);
    for(int i=0;i<M;i++)
      tmp = Fmu(tmp);
    FS = FS + tmp;
    //Lower left loop
    tmp = FmuI;
    for(int i=1;i<M;i++)
      tmp = Fmu(tmp);
    for(int j=0;j<N;j++)
      tmp = Fnu(tmp);
    for(int i=0;i<M;i++)
      tmp = Bmu(tmp);
    for(int j=0;j<N;j++)
      tmp = Bnu(tmp);
    FS = FS + tmp;
    //Drop the shorthand so it cannot leak into the rest of the header
 #undef Fmu
 #undef Bmu
 #undef Fnu
 #undef Bnu
 #undef FmuI
 #undef BmuI
 #undef FnuI
 #undef BnuI
  }
  //Field strength from MxN Wilson loop
  //Note F_numu = - F_munu
  static void FieldStrengthMxN(GaugeMat &FS, const GaugeLorentz &U, int mu, int nu, int M, int N){  
    //Link fields of the two directions spanning the plane
    GaugeMat linkMu = PeekIndex<LorentzIndex>(U, mu);
    GaugeMat linkNu = PeekIndex<LorentzIndex>(U, nu);
    if(M != N){
      //Rectangular loops: average over both orientations (MxN and NxM)
      GaugeMat loopsMN(linkMu.Grid()), loopsNM(linkMu.Grid());
      CloverleafMxN(loopsMN, linkMu, linkNu, mu, nu, M, N);
      CloverleafMxN(loopsNM, linkMu, linkNu, mu, nu, N, M);
      FS = 0.0625 * ( loopsMN - adj(loopsMN) + loopsNM - adj(loopsNM) );
      return;
    }
    //Square loops: a single orientation suffices
    GaugeMat leaves(linkMu.Grid());
    CloverleafMxN(leaves, linkMu, linkNu, mu, nu, M, N);
    FS = 0.125 * ( leaves - adj(leaves) );
  }
  //Topological charge contribution from MxN Wilson loops
  //cf  https://arxiv.org/pdf/hep-lat/9701012.pdf  Eq 6
  //output is the charge by timeslice: sum over timeslices to obtain the total
  static std::vector<Real> TimesliceTopologicalChargeMxN(const GaugeLorentz &U, int M, int N){
    assert(Nd == 4);
    std::vector<std::vector<GaugeMat*> > F(Nd,std::vector<GaugeMat*>(Nd,nullptr));
    //Note F_numu = - F_munu
    //hence we only need to loop over mu,nu,rho,sigma that aren't related by permuting mu,nu  or rho,sigma
    //Use nu > mu
    for(int mu=0;mu<Nd-1;mu++){
      for(int nu=mu+1; nu<Nd; nu++){
 	F[mu][nu] = new GaugeMat(U.Grid());
 	FieldStrengthMxN(*F[mu][nu], U, mu, nu, M, N);
      }
    }
    Real coeff = -1./(32 * M_PI*M_PI * M*M * N*N); //overall sign to match CPS and Grid conventions, possibly related to time direction = 3 vs 0
    static const int combs[3][4] = { {0,1,2,3}, {0,2,1,3}, {0,3,1,2} };
    static const int signs[3] = { 1, -1, 1 }; //epsilon_{mu nu rho sigma}
    ComplexField fsum(U.Grid());
    fsum = Zero();
    for(int c=0;c<3;c++){
      int mu = combs[c][0], nu = combs[c][1], rho = combs[c][2], sigma = combs[c][3];
      int eps = signs[c];
      fsum = fsum + (8. * coeff * eps) * trace( (*F[mu][nu]) * (*F[rho][sigma]) ); 
    }
    for(int mu=0;mu<Nd-1;mu++)
      for(int nu=mu+1; nu<Nd; nu++)
 	delete F[mu][nu];
    typedef typename ComplexField::scalar_object sobj;
    std::vector<sobj> Tq;
    sliceSum(fsum, Tq, Nd-1);
    std::vector<Real> out(Tq.size());
    for(int t=0;t<Tq.size();t++) out[t] = TensorRemove(Tq[t]).real();
    return out;
  }
  static Real TopologicalChargeMxN(const GaugeLorentz &U, int M, int N){
    //Total charge = sum of the per-timeslice contributions, added in slice order
    Real total(0);
    for(const Real &slice : TimesliceTopologicalChargeMxN(U,M,N)) total += slice;
    return total;
  }
  //Generate the contributions to the 5Li topological charge from Wilson loops of the following sizes
  //Use coefficients from hep-lat/9701012
  //1x1 : c1=(19.-55.*c5)/9.
  //2x2 : c2=(1-64.*c5)/9.
  //1x2 : c3=(-64.+640.*c5)/45.
  //1x3 : c4=1./5.-2.*c5
  //3x3 : c5=1./20.
  //Output array outer index contains the loops in the above order
  //Inner index is the time coordinate
  static std::vector<std::vector<Real> > TimesliceTopologicalCharge5LiContributions(const GaugeLorentz &U){
    static const int exts[5][2] = { {1,1}, {2,2}, {1,2}, {1,3}, {3,3} };       
    std::vector<std::vector<Real> > out(5);
    for(int i=0;i<5;i++){	
      out[i] = TimesliceTopologicalChargeMxN(U,exts[i][0],exts[i][1]);
    }
    return out;
  }   
  static std::vector<Real> TopologicalCharge5LiContributions(const GaugeLorentz &U){   
    //Loop extents (M,N) in the order 1x1, 2x2, 1x2, 1x3, 3x3
    static const int exts[5][2] = { {1,1}, {2,2}, {1,2}, {1,3}, {3,3} };
    std::cout << GridLogMessage << "Computing topological charge" << std::endl;
    std::vector<Real> out(5);
    int slot = 0;
    for(const auto &ext : exts){
      const int M = ext[0], N = ext[1];
      out[slot] = TopologicalChargeMxN(U,M,N);
      //Log each loop size together with its total contribution
      std::cout << GridLogMessage << M << "x" << N << " Wilson loop contribution " << out[slot] << std::endl;
      ++slot;
    }
    return out;
  }
  //Compute the 5Li topological charge
  static std::vector<Real> TimesliceTopologicalCharge5Li(const GaugeLorentz &U){
    std::vector<std::vector<Real> > loops = TimesliceTopologicalCharge5LiContributions(U);
    double c5=1./20.;
    double c4=1./5.-2.*c5;
    double c3=(-64.+640.*c5)/45.;
    double c2=(1-64.*c5)/9.;
    double c1=(19.-55.*c5)/9.;
    int Lt = loops[0].size();
    std::vector<Real> out(Lt,0.);
    for(int t=0;t<Lt;t++)
      out[t] += c1*loops[0][t] + c2*loops[1][t] + c3*loops[2][t] + c4*loops[3][t] + c5*loops[4][t];
    return out;
  }
  static Real TopologicalCharge5Li(const GaugeLorentz &U){
    //Sum the per-timeslice 5Li charge in slice order, log the total and return it
    Real Q = 0.;
    for(const Real &slice : TimesliceTopologicalCharge5Li(U)) Q += slice;
    std::cout << GridLogMessage << "5Li Topological charge: " << Q << std::endl;
    return Q;
  }
  //////////////////////////////////////////////////////
  // Similar to above for rectangle is required
  //////////////////////////////////////////////////////
--- a/Grid/sitmo_rng/README
+++ b/Grid/sitmo_rng/README
--- a/Grid/random/gaussian.h
+++ b/Grid/random/gaussian.h
@@ -0,0 +1,200 @@
 // -*- C++ -*-
 //===--------------------------- random -----------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 // Peter Boyle: Taken from libc++ in Clang/LLVM.
 // Reason: libstdc++ and clang differ in the order in which they return values in the normal_distribution / Box-Muller type step.
 // Standardise on one implementation and call it "gaussian_distribution".
 #pragma once
 #include <cstddef>
 #include <cstdint>
 #include <cmath>
 #include <type_traits>
 #include <initializer_list>
 #include <limits>
 #include <algorithm>
 #include <numeric>
 #include <vector>
 #include <string>
 #include <istream>
 #include <ostream>
 #include <random>
 // normal_distribution -> gaussian distribution
 namespace Grid {
 // Normal (Gaussian) random number distribution with a fixed, library-independent algorithm.
 // Interface mirrors std::normal_distribution: parameters are a mean and a standard deviation,
 // and operator() draws from a uniform random bit generator supplied by the caller.
 template<class _RealType = double>
 class  gaussian_distribution
 {
 public:
    // types
    typedef _RealType result_type;
    // Parameter set of the distribution: (mean, stddev), defaulting to (0, 1)
    class param_type
    {
        result_type __mean_;
        result_type __stddev_;
    public:
        typedef gaussian_distribution distribution_type;
        strong_inline
        explicit param_type(result_type __mean = 0, result_type __stddev = 1)
            : __mean_(__mean), __stddev_(__stddev) {}
        strong_inline
        result_type mean() const {return __mean_;}
        strong_inline
        result_type stddev() const {return __stddev_;}
        // Two parameter sets compare equal when both mean and stddev are equal
        friend strong_inline
            bool operator==(const param_type& __x, const param_type& __y)
            {return __x.__mean_ == __y.__mean_ && __x.__stddev_ == __y.__stddev_;}
        friend strong_inline
            bool operator!=(const param_type& __x, const param_type& __y)
            {return !(__x == __y);}
    };
 private:
    param_type __p_;   // current parameters
    result_type _V_;   // second deviate saved by operator() for the next call
    bool _V_hot_;      // true while _V_ holds a deviate that has not been returned yet
 public:
    // constructors and reset functions
    strong_inline
    explicit gaussian_distribution(result_type __mean = 0, result_type __stddev = 1)
        : __p_(param_type(__mean, __stddev)), _V_hot_(false) {}
    strong_inline
    explicit gaussian_distribution(const param_type& __p)
        : __p_(__p), _V_hot_(false) {}
    // Discards any saved deviate so the next draw starts from fresh generator output
    strong_inline
    void reset() {_V_hot_ = false;}
    // generating functions
    // Draw using the stored parameters
    template<class _URNG>
        strong_inline
        result_type operator()(_URNG& __g)
        {return (*this)(__g, __p_);}
    // Draw using the supplied parameters (defined out of class below)
    template<class _URNG> result_type operator()(_URNG& __g, const param_type& __p);
    // property functions
    strong_inline
    result_type mean() const {return __p_.mean();}
    strong_inline
    result_type stddev() const {return __p_.stddev();}
    strong_inline
    param_type param() const {return __p_;}
    strong_inline
    void param(const param_type& __p) {__p_ = __p;}
    // Reported output range is unbounded: (-infinity, +infinity)
    strong_inline
    result_type min() const {return -std::numeric_limits<result_type>::infinity();}
    strong_inline
    result_type max() const {return std::numeric_limits<result_type>::infinity();}
    // Equality compares the parameters and the saved-deviate state; the saved value itself
    // is only compared when one is actually held
    friend strong_inline
        bool operator==(const gaussian_distribution& __x,
                        const gaussian_distribution& __y)
        {return __x.__p_ == __y.__p_ && __x._V_hot_ == __y._V_hot_ &&
                (!__x._V_hot_ || __x._V_ == __y._V_);}
    friend strong_inline
        bool operator!=(const gaussian_distribution& __x,
                        const gaussian_distribution& __y)
        {return !(__x == __y);}
    // Stream operators are friends because they read/write _V_ and _V_hot_ directly
    template <class _CharT, class _Traits, class _RT>
    friend
    std::basic_ostream<_CharT, _Traits>&
    operator<<(std::basic_ostream<_CharT, _Traits>& __os,
               const gaussian_distribution<_RT>& __x);
    template <class _CharT, class _Traits, class _RT>
    friend
    std::basic_istream<_CharT, _Traits>&
    operator>>(std::basic_istream<_CharT, _Traits>& __is,
               gaussian_distribution<_RT>& __x);
 };
 template <class _RealType>
 template<class _URNG>
 _RealType
 gaussian_distribution<_RealType>::operator()(_URNG& __g, const param_type& __p)
 {
    // A deviate saved by the previous call is handed out first, without touching __g
    if (_V_hot_)
    {
        _V_hot_ = false;
        const result_type __saved = _V_;
        return __saved * __p.stddev() + __p.mean();
    }
    // Otherwise draw pairs uniformly from (-1,1)^2 until one falls inside the unit circle
    // (origin excluded); the first draw of each pair is always taken before the second
    std::uniform_real_distribution<result_type> __unit(-1, 1);
    result_type __x;
    result_type __y;
    result_type __r2;
    for (;;)
    {
        __x = __unit(__g);
        __y = __unit(__g);
        __r2 = __x * __x + __y * __y;
        if (!(__r2 > 1 || __r2 == 0))
            break;
    }
    const result_type __scale = std::sqrt(-2 * std::log(__r2) / __r2);
    // Keep the second deviate for the next call and return the first one now
    _V_ = __y * __scale;
    _V_hot_ = true;
    const result_type __first = __x * __scale;
    return __first * __p.stddev() + __p.mean();
 }
 // Writes the distribution state as: mean stddev hot-flag [saved deviate], space separated.
 // The stream's format flags and fill character are both restored before returning.
 template <class _CharT, class _Traits, class _RT>
 std::basic_ostream<_CharT, _Traits>&
 operator<<(std::basic_ostream<_CharT, _Traits>& __os,
           const gaussian_distribution<_RT>& __x)
 {
    auto __save_flags = __os.flags();
    auto __save_fill = __os.fill();
    __os.flags(std::ios_base::dec | std::ios_base::left | std::ios_base::fixed |
               std::ios_base::scientific);
    _CharT __sp = __os.widen(' ');
    __os.fill(__sp);
    __os << __x.mean() << __sp << __x.stddev() << __sp << __x._V_hot_;
    // The saved deviate is only part of the state while it is pending
    if (__x._V_hot_)
        __os << __sp << __x._V_;
    // Previously only the flags were restored, leaving the caller's fill character clobbered
    __os.fill(__save_fill);
    __os.flags(__save_flags);
    return __os;
 }
 template <class _CharT, class _Traits, class _RT>
 std::basic_istream<_CharT, _Traits>&
 operator>>(std::basic_istream<_CharT, _Traits>& __is,
           gaussian_distribution<_RT>& __x)
 {
    typedef gaussian_distribution<_RT> _Eng;
    typedef typename _Eng::result_type result_type;
    typedef typename _Eng::param_type param_type;
    auto __save_flags = __is.flags();
    __is.flags(std::ios_base::dec | std::ios_base::skipws);
    result_type __mean;
    result_type __stddev;
    result_type _Vp = 0;
    bool _V_hot = false;
    __is >> __mean >> __stddev >> _V_hot;
    if (_V_hot)
        __is >> _Vp;
    if (!__is.fail())
    {
        __x.param(param_type(__mean, __stddev));
        __x._V_hot_ = _V_hot;
        __x._V_ = _Vp;
    }
    __is.flags(__save_flags);
    return __is;
 }
 }
--- a/Grid/sitmo_rng/sitmo_prng_engine.hpp
+++ b/Grid/sitmo_rng/sitmo_prng_engine.hpp
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -322,8 +322,8 @@ public:
    int simd_layout     = _grid->_simd_layout[dimension];
    int comm_dim        = _grid->_processors[dimension] >1 ;
-    int recv_from_rank;
+    //    int recv_from_rank;
-    int xmit_to_rank;
+    //    int xmit_to_rank;
    if ( ! comm_dim ) return 1;
    if ( displacement == 0 ) return 1;
--- a/Grid/tensors/Tensor_extract_merge.h
+++ b/Grid/tensors/Tensor_extract_merge.h
@@ -208,5 +208,46 @@ void merge(vobj &vec,const ExtractPointerArray<sobj> &extracted, int offset)
 }
 //////////////////////////////////////////////////////////////////////////////////
 //Copy a single lane of a SIMD tensor type from one object to another
 //Output object must be of the same tensor type but may be of a different precision (i.e. it can have a different root data type)
 ///////////////////////////////////////////////////////////////////////////////////
 template<class vobjOut, class vobjIn>
 accelerator_inline 
 void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __restrict__ vecIn, int lane_in)
 {
  //if tensor types are same the DoublePrecision type must be the same
  static_assert( std::is_same<typename vobjOut::DoublePrecision, typename vobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" );
  using out_vector  = typename vobjOut::vector_type;
  using in_vector   = typename vobjIn::vector_type;
  using out_scalar  = typename vobjOut::scalar_type;
  using in_scalar   = typename vobjIn::scalar_type;
  using out_extract = typename ExtractTypeMap<out_scalar>::extract_type;
  using in_extract  = typename ExtractTypeMap<in_scalar>::extract_type;
  //Number of SIMD vector words making up each object; must agree between input and output
  constexpr int owords=sizeof(vobjOut)/sizeof(out_vector);
  constexpr int iwords=sizeof(vobjIn)/sizeof(in_vector);
  static_assert( owords == iwords, "copyLane: Expected number of vector words in input and output objects to be equal" );
  //Distance, in scalar elements, between the same lane of consecutive vector words
  constexpr int out_stride=out_vector::Nsimd();
  constexpr int in_stride=in_vector::Nsimd();
  out_extract * __restrict__ dst = (out_extract *)&vecOut;
  in_extract  * __restrict__ src = (in_extract *)&vecIn;
  in_scalar  word_in;
  out_scalar word_out;
  for(int w=0;w<owords;w++){
    //Pull one scalar out of the input lane, convert, and drop it into the output lane
    memcpy( (char*)&word_in, (char*)(src + lane_in + in_stride*w), sizeof(in_scalar) );
    word_out = word_in; //potential precision change
    memcpy( (char*)(dst + lane_out + out_stride*w), (char*)&word_out, sizeof(out_scalar) );
  }
 }
 NAMESPACE_END(Grid);
--- a/Grid/tensors/Tensor_traits.h
+++ b/Grid/tensors/Tensor_traits.h
@@ -47,20 +47,20 @@ NAMESPACE_BEGIN(Grid);
  class TypePair {
  public:
    T _internal[2];
-    TypePair<T>& operator=(const Grid::Zero& o) {
+    accelerator TypePair<T>& operator=(const Grid::Zero& o) {
      _internal[0] = Zero();
      _internal[1] = Zero();
      return *this;
    }
-    TypePair<T> operator+(const TypePair<T>& o) const {
+    accelerator TypePair<T> operator+(const TypePair<T>& o) const {
      TypePair<T> r;
      r._internal[0] = _internal[0] + o._internal[0];
      r._internal[1] = _internal[1] + o._internal[1];
      return r;
    }
-    TypePair<T>& operator+=(const TypePair<T>& o) {
+    accelerator TypePair<T>& operator+=(const TypePair<T>& o) {
      _internal[0] += o._internal[0];
      _internal[1] += o._internal[1];
      return *this;
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -74,29 +74,43 @@ void acceleratorInit(void)
      //      GPU_PROP(singleToDoublePrecisionPerfRatio);
    }
  }
  MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
 #undef GPU_PROP_FMT    
 #undef GPU_PROP
 #ifdef GRID_DEFAULT_GPU
  int device = 0;
  // IBM Jsrun makes cuda Device numbering screwy and not match rank
  if ( world_rank == 0 ) {
    printf("AcceleratorCudaInit: using default device \n");
-    printf("AcceleratorCudaInit: assume user either uses a) IBM jsrun, or \n");
+    printf("AcceleratorCudaInit: assume user either uses\n");
    printf("AcceleratorCudaInit: a) IBM jsrun, or \n");
    printf("AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
    printf("AcceleratorCudaInit: Configure options --enable-setdevice=no \n");
  }
 #else
  int device = rank;
  printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank);
  printf("AcceleratorCudaInit: Configure options --enable-setdevice=yes \n");
  cudaSetDevice(rank);
 #endif
  cudaSetDevice(device);
  cudaStreamCreate(&copyStream);
  const int len=64;
  char busid[len];
  if( rank == world_rank ) { 
    cudaDeviceGetPCIBusId(busid, len, device);
    printf("local rank %d device %d bus id: %s\n", rank, device, busid);
  }
  if ( world_rank == 0 )  printf("AcceleratorCudaInit: ================================================\n");
 }
 #endif
 #ifdef GRID_HIP
 hipDeviceProp_t *gpu_props;
 hipStream_t copyStream;
 void acceleratorInit(void)
 {
  int nDevices = 1;
@@ -154,16 +168,25 @@ void acceleratorInit(void)
 #ifdef GRID_DEFAULT_GPU
  if ( world_rank == 0 ) {
    printf("AcceleratorHipInit: using default device \n");
-    printf("AcceleratorHipInit: assume user either uses a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
+    printf("AcceleratorHipInit: assume user or srun sets ROCR_VISIBLE_DEVICES and numa binding \n");
-    printf("AcceleratorHipInit: Configure options --enable-summit, --enable-select-gpu=no \n");
+    printf("AcceleratorHipInit: Configure options --enable-setdevice=no \n");
  }
  int device = 0;
 #else
  if ( world_rank == 0 ) {
    printf("AcceleratorHipInit: rank %d setting device to node rank %d\n",world_rank,rank);
-    printf("AcceleratorHipInit: Configure options --enable-select-gpu=yes \n");
+    printf("AcceleratorHipInit: Configure options --enable-setdevice=yes \n");
  }
-  hipSetDevice(rank);
+  int device = rank;
 #endif
  hipSetDevice(device);
  hipStreamCreate(&copyStream);
  const int len=64;
  char busid[len];
  if( rank == world_rank ) { 
    hipDeviceGetPCIBusId(busid, len, device);
    printf("local rank %d device %d bus id: %s\n", rank, device, busid);
  }
  if ( world_rank == 0 )  printf("AcceleratorHipInit: ================================================\n");
 }
 #endif
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -95,6 +95,7 @@ void     acceleratorInit(void);
 //////////////////////////////////////////////
 #ifdef GRID_CUDA
 #include <cuda.h>
 #ifdef __CUDA_ARCH__
@@ -115,6 +116,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #endif
 } // CUDA specific
 // Prints the device memory currently used, free and in total, as reported by cudaMemGetInfo
 inline void cuda_mem(void)
 {
  size_t bytes_free;
  size_t bytes_total;
  cudaMemGetInfo(&bytes_free,&bytes_total);
  const size_t bytes_used = bytes_total-bytes_free;
  std::cout << " MemoryManager : GPU used "<<bytes_used<<" free "<<bytes_free<< " total "<<bytes_total<<std::endl;
 }
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  {									\
    int nt=acceleratorThreads();					\
@@ -197,7 +206,8 @@ inline void *acceleratorAllocShared(size_t bytes)
  auto err = cudaMallocManaged((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
-    printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
+    printf(" cudaMallocManaged failed for %lu %s \n",bytes,cudaGetErrorString(err)); fflush(stdout);
    if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);
  }
  return ptr;
 };
@@ -207,20 +217,53 @@ inline void *acceleratorAllocDevice(size_t bytes)
  auto err = cudaMalloc((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
-    printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
+    printf(" cudaMalloc failed for %lu %s \n",bytes,cudaGetErrorString(err)); fflush(stdout);
    if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);
  }
  return ptr;
 };
-inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
+inline void acceleratorFreeShared(void *ptr){
-inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
+  auto err = cudaFree(ptr);
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
+  if( err != cudaSuccess ) {
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
+    printf(" cudaFree(Shared) failed %s \n",cudaGetErrorString(err)); fflush(stdout);
-inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
+    if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);
  }
 };
 // Releases memory with cudaFree; on failure the error is printed and, when
 // acceleratorAbortOnGpuError is set, the assert fires
 inline void acceleratorFreeDevice(void *ptr){
  auto err = cudaFree(ptr);
  if( err == cudaSuccess ) return;
  printf(" cudaFree(Device) failed %s \n",cudaGetErrorString(err));
  fflush(stdout);
  if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);
 };
 // Blocking copy of `bytes` bytes from `from` to `to` using cudaMemcpyHostToDevice.
 // A failure is printed and, when acceleratorAbortOnGpuError is set, the assert fires.
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  {
  auto err = cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);
  if( err != cudaSuccess ) {
    // %zu is the printf conversion for size_t; %lu only matches where size_t is unsigned long
    printf(" cudaMemcpy(host->device) failed for %zu %s \n",bytes,cudaGetErrorString(err)); fflush(stdout);
    if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);
  }
 }
 // Blocking copy of `bytes` bytes from `from` to `to` using cudaMemcpyDeviceToHost.
 // A failure is printed and, when acceleratorAbortOnGpuError is set, the assert fires.
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){
  auto err = cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);
  if( err != cudaSuccess ) {
    // %zu is the printf conversion for size_t; %lu only matches where size_t is unsigned long
    printf(" cudaMemcpy(device->host) failed for %zu %s \n",bytes,cudaGetErrorString(err)); fflush(stdout);
    if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);
  }
 }
 // Fills `bytes` bytes starting at `base` with `value` through cudaMemset.
 // A failure is printed and, when acceleratorAbortOnGpuError is set, the assert fires.
 inline void acceleratorMemSet(void *base,int value,size_t bytes) {
  auto err = cudaMemset(base,value,bytes);
  if( err != cudaSuccess ) {
    // %zu is the printf conversion for size_t; %lu only matches where size_t is unsigned long
    printf(" cudaMemSet failed for %zu %s \n",bytes,cudaGetErrorString(err)); fflush(stdout);
    if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);
  }
 }
 // Queues a device-to-device copy of `bytes` bytes on copyStream via cudaMemcpyAsync.
 // The status returned by cudaMemcpyAsync is not checked here.
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
  cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
 }
 // Calls cudaStreamSynchronize on copyStream, the stream used by acceleratorCopyDeviceToDeviceAsynch
 inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
 inline int  acceleratorIsCommunicable(void *ptr)
 {
  //  int uvm=0;
@@ -297,7 +340,7 @@ inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  {
  theGridAccelerator->memcpy(to,from,bytes);
 }
-inline void acceleratorCopySynchronise(void) {  theGridAccelerator->wait(); }
+inline void acceleratorCopySynchronise(void) {  theGridAccelerator->wait(); std::cout<<"acceleratorCopySynchronise() wait "<<std::endl; }
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();}
@@ -328,10 +371,11 @@ NAMESPACE_BEGIN(Grid);
 #define accelerator        __host__ __device__
 #define accelerator_inline __host__ __device__ inline
 extern hipStream_t copyStream;
 /*These routines define mapping from thread grid to loop & vector lane indexing */
 accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #ifdef GRID_SIMT
-  return hipThreadIdx_z; 
+  return hipThreadIdx_x; 
 #else
  return 0;
 #endif
@@ -345,19 +389,41 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
      { __VA_ARGS__;}							\
    };									\
    int nt=acceleratorThreads();					\
-    dim3 hip_threads(nt,1,nsimd);					\
+    dim3 hip_threads(nsimd, nt, 1);					 \
-    dim3 hip_blocks ((num1+nt-1)/nt,num2,1);				\
+    dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
-    hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads,		\
+    if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \
-		       0,0,						\
+      hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads,		\
-		       num1,num2,nsimd,lambda);				\
+            0,0,						\
            num1,num2,nsimd, lambda);				\
    } else { \
      hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads,		\
            0,0,						\
            num1,num2,nsimd, lambda);				\
    } \
  }
 // Kernel entry point compiled with __launch_bounds__(64,1); otherwise uses the same
 // thread-to-index mapping: threadIdx.x is the third index, threadIdx.y/blockIdx.x the first,
 // threadIdx.z/blockIdx.y the second
 template<typename lambda>  __global__
 __launch_bounds__(64,1)
 void LambdaApply64(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
 {
  // Following the same scheme as CUDA for now
  const uint64_t ix = threadIdx.y + blockDim.y*blockIdx.x;
  const uint64_t iy = threadIdx.z + blockDim.z*blockIdx.y;
  const uint64_t iz = threadIdx.x;
  // Threads that fall outside the requested extents do nothing
  if ( (ix >= numx) || (iy >= numy) || (iz >= numz) ) return;
  Lambda(ix,iy,iz);
 }
 template<typename lambda>  __global__
 __launch_bounds__(1024,1)
 void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
 {
-  uint64_t x = hipThreadIdx_x + hipBlockDim_x*hipBlockIdx_x;
+  // Following the same scheme as CUDA for now
-  uint64_t y = hipThreadIdx_y + hipBlockDim_y*hipBlockIdx_y;
+  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
-  uint64_t z = hipThreadIdx_z ;//+ hipBlockDim_z*hipBlockIdx_z;
+  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
  uint64_t z = threadIdx.x;
  if ( (x < numx) && (y<numy) && (z<numz) ) {
    Lambda(x,y,z);
  }
@@ -402,10 +468,16 @@ inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
+//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
-inline void acceleratorCopySynchronise(void) {  }
+//inline void acceleratorCopySynchronise(void) {  }
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);}
 // Queues a device-to-device copy of `bytes` bytes on copyStream via hipMemcpyAsync.
 // The status returned by hipMemcpyAsync is not checked here.
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
  hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToDevice,copyStream);
 }
 // Calls hipStreamSynchronize on copyStream, the stream used by acceleratorCopyDeviceToDeviceAsynch
 inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); };
 #endif
 //////////////////////////////////////////////
@@ -476,18 +548,12 @@ inline void acceleratorFreeCpu  (void *ptr){free(ptr);};
 ///////////////////////////////////////////////////
 // Synchronise across local threads for divergence resynch
 ///////////////////////////////////////////////////
-accelerator_inline void acceleratorSynchronise(void) 
+accelerator_inline void acceleratorSynchronise(void)  // Only Nvidia needs 
 {
 #ifdef GRID_SIMT
 #ifdef GRID_CUDA
  __syncwarp();
 #endif
 #ifdef GRID_SYCL
  //cl::sycl::detail::workGroupBarrier();
 #endif
 #ifdef GRID_HIP
  __syncthreads();
 #endif
 #endif
  return;
 }
--- a/Grid/util/Coordinate.h
+++ b/Grid/util/Coordinate.h
@@ -88,7 +88,7 @@ public:
 // Coordinate class, maxdims = 8 for now.
 ////////////////////////////////////////////////////////////////
 #define GRID_MAX_LATTICE_DIMENSION (8)
-#define GRID_MAX_SIMD              (16)
+#define GRID_MAX_SIMD              (32)
 static constexpr int MaxDims = GRID_MAX_LATTICE_DIMENSION;
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -167,6 +167,13 @@ void GridCmdOptionInt(std::string &str,int & val)
  return;
 }
 // Parses the leading floating-point token of str into val using stream extraction
 void GridCmdOptionFloat(std::string &str,float & val)
 {
  std::stringstream parser(str);
  parser >> val;
 }
 void GridParseLayout(char **argv,int argc,
 		     Coordinate &latt_c,
@@ -527,6 +534,7 @@ void Grid_init(int *argc,char ***argv)
 void Grid_finalize(void)
 {
 #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();
  Grid_unquiesce_nodes();
 #endif
--- a/Grid/util/Init.h
+++ b/Grid/util/Init.h
@@ -57,6 +57,7 @@ void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
 template<class VectorInt>
 void GridCmdOptionIntVector(const std::string &str,VectorInt & vec);
 void GridCmdOptionInt(std::string &str,int & val);
 void GridCmdOptionFloat(std::string &str,float & val);
 void GridParseLayout(char **argv,int argc,
--- a/HMC/DWF2p1fIwasakiGparity.cc
+++ b/HMC/DWF2p1fIwasakiGparity.cc
@@ -0,0 +1,473 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./HMC/DWF2p1fIwasakiGparity.cc
 Copyright (C) 2015-2016
 Author: Christopher Kelly <ckelly@bnl.gov>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 //2+1f DWF+I ensemble with G-parity BCs
 //designed to reproduce ensembles in https://arxiv.org/pdf/1908.08640.pdf
 // Per-quark settings for one rational quotient action. The members are declared
 // through GRID_SERIALIZABLE_CLASS_MEMBERS; Export() copies every member except
 // reliable_update_freq into a RationalActionParams.
 struct RatQuoParameters: Serializable {
   GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters,
                                   double,  bnd_lo,
                                   double,  bnd_hi,
                                   Integer, action_degree,
                                   double,  action_tolerance,
                                   Integer, md_degree,
                                   double,  md_tolerance,
                                   Integer, reliable_update_freq,
                                   Integer, bnd_check_freq);
   // Default values, listed in member declaration order.
   RatQuoParameters() {
     bnd_lo               = 1e-2;
     bnd_hi               = 30;
     action_degree        = 10;
     action_tolerance     = 1e-10;
     md_degree            = 10;
     md_tolerance         = 1e-8;
     reliable_update_freq = 50;
     bnd_check_freq       = 20;
   }
   // Copy the bounds, the bounds-check frequency and the action / MD degrees and
   // tolerances into `into`. Other fields of `into` are left untouched.
   void Export(RationalActionParams &into) const{
     into.lo               = bnd_lo;
     into.hi               = bnd_hi;
     into.BoundsCheckFreq  = bnd_check_freq;
     into.action_degree    = action_degree;
     into.action_tolerance = action_tolerance;
     into.md_degree        = md_degree;
     into.md_tolerance     = md_tolerance;
   }
 };
 // Top-level run parameters, (de)serialised through GRID_SERIALIZABLE_CLASS_MEMBERS.
 // rat_quo_l and rat_quo_s keep the defaults set by RatQuoParameters' own constructor.
 struct EvolParameters: Serializable {
   GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters,
                                   Integer, StartTrajectory,
                                   Integer, Trajectories,
                                   Integer, SaveInterval,
                                   Integer, Steps,
                                   bool, MetropolisTest,
                                   std::string, StartingType,
                                   std::vector<Integer>, GparityDirs,
                                   RatQuoParameters, rat_quo_l,
                                   RatQuoParameters, rat_quo_s);
   // Defaults are aimed at initial thermalization; afterwards the user should
   // switch Metropolis on and use StartingType=CheckpointStart.
   EvolParameters() {
     StartTrajectory = 0;
     Trajectories    = 50;
     SaveInterval    = 5;
     Steps           = 5;
     MetropolisTest  = false;
     StartingType    = "ColdStart";
     GparityDirs.resize(3, 1); //1 for G-parity, 0 for periodic
   }
 };
 // Report whether `fn` can be opened for reading: true when an input file stream
 // constructed on it is in the good() state.
 bool fileExists(const std::string &fn){
   return std::ifstream(fn).good();
 }
 // Inputs for the Lanczos run in computeEigenvalues, (de)serialised through
 // GRID_SERIALIZABLE_CLASS_MEMBERS. There, alpha, beta and ord feed the Chebyshev
 // constructor (alpha and beta squared, ord plus one), mu is asserted to be zero,
 // and n_stop, n_want, n_use and tolerance are passed to ImplicitlyRestartedLanczos.
 struct LanczosParameters: Serializable {
   GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
                                   double, alpha,
                                   double, beta,
                                   double, mu,
                                   int,    ord,
                                   int,    n_stop,
                                   int,    n_want,
                                   int,    n_use,
                                   double, tolerance);
   // Default values: Chebyshev settings first, then the Lanczos counts and tolerance.
   LanczosParameters() {
     alpha     = 35;
     beta      = 5;
     mu        = 0;
     ord       = 100;
     n_stop    = 10;
     n_want    = 10;
     n_use     = 15;
     tolerance = 1e-6;
   }
 };
 // Run an implicitly restarted Lanczos on the SchurDiagMooeeOperator built from
 // `action` and print the resulting eigenvalues.
 //
 //  param_file : XML file holding a "LanczosParameters" block. If it does not exist,
 //               the process with WorldRank 0 writes a template to "<param_file>.templ"
 //               and the run continues with the default parameters.
 //  Grid/rbGrid: grids used for the full Gaussian source and its odd-checkerboard part.
 //  latt       : gauge field imported into `action` before the operator is built.
 //  rng        : parallel RNG used to draw the Gaussian starting vector.
 template<typename FermionActionD, typename FermionFieldD>
 void computeEigenvalues(std::string param_file,
                         GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
                         FermionActionD &action, GridParallelRNG &rng){
   LanczosParameters params;
   if(fileExists(param_file)){
     std::cout << GridLogMessage << " Reading " << param_file << std::endl;
     Grid::XmlReader rd(param_file);
     read(rd, "LanczosParameters", params);
   }else if(!GlobalSharedMemory::WorldRank){
     std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
     std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
     Grid::XmlWriter wr(param_file + ".templ");
     write(wr, "LanczosParameters", params);
   }
   // Gaussian starting vector restricted to the odd checkerboard
   FermionFieldD gauss_o(rbGrid);
   FermionFieldD gauss(Grid);
   gaussian(rng, gauss);
   pickCheckerboard(Odd, gauss_o, gauss);
   action.ImportGauge(latt);
   SchurDiagMooeeOperator<FermionActionD, FermionFieldD> hermop(action);
   PlainHermOp<FermionFieldD> hermop_wrap(hermop);
   //ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
   assert(params.mu == 0.0);
   Chebyshev<FermionFieldD> Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1);
   FunctionHermOp<FermionFieldD> Cheb_wrap(Cheb, hermop);
   std::cout << "IRL: alpha=" << params.alpha << " beta=" << params.beta << " mu=" << params.mu << " ord=" << params.ord << std::endl;
   ImplicitlyRestartedLanczos<FermionFieldD> IRL(Cheb_wrap, hermop_wrap, params.n_stop, params.n_want, params.n_use, params.tolerance, 10000);
   std::vector<RealD> eval(params.n_use);
   std::vector<FermionFieldD> evec(params.n_use, rbGrid);
   int Nconv = 0;
   IRL.calc(eval, evec, gauss_o, Nconv);
   // eval was allocated with n_use entries, so a parameter file with n_want > n_use
   // would index past its end; never print more entries than eval holds.
   // NOTE(review): calc() receives eval by name and may also change its size - confirm.
   const int n_avail = static_cast<int>(eval.size());
   const int n_print = params.n_want < n_avail ? params.n_want : n_avail;
   std::cout << "Eigenvalues:" << std::endl;
   for(int i=0;i<n_print;i++){
     std::cout << i << " " << eval[i] << std::endl;
   }
 }
 // Check the quality of the RHMC approximations held by `rhmc`.
 // A Gaussian source on the odd checkerboard is drawn, the gauge field `latt` is
 // imported into both operators, and InversePowerBoundsCheck is run for the action
 // approximations and then the MD approximations, each on the numerator and the
 // denominator operator, for the powers -1/inv_pow and -1/(2*inv_pow).
 // `quark_descr` only labels the log output.
 template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
 void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
                FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
                int inv_pow, const std::string &quark_descr){
   using ImplD = typename FermionActionD::Impl_t;
   const int half_pow = 2*inv_pow; // denominator of the "half" power, -1/(2*inv_pow)
   FermionFieldD noise_odd(rbGrid);
   FermionFieldD noise_full(Grid);
   gaussian(rng, noise_full);
   pickCheckerboard(Odd, noise_odd, noise_full);
   numOp.ImportGauge(latt);
   denOp.ImportGauge(latt);
   SchurDifferentiableOperator<ImplD> MdagM(numOp);
   SchurDifferentiableOperator<ImplD> VdagV(denOp);
   // ---- action approximations ----
   std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
   InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM, noise_odd, rhmc.ApproxNegPowerAction); //use large tolerance to prevent exit on fail; we are trying to tune here!
   std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
   std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << half_pow << std::endl;
   InversePowerBoundsCheck(half_pow, 10000, 1e16, MdagM, noise_odd, rhmc.ApproxNegHalfPowerAction);
   std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << half_pow << std::endl;
   std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
   InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV, noise_odd, rhmc.ApproxNegPowerAction);
   std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
   std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << half_pow << std::endl;
   InversePowerBoundsCheck(half_pow, 10000, 1e16, VdagV, noise_odd, rhmc.ApproxNegHalfPowerAction);
   std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << half_pow << std::endl;
   std::cout << "-------------------------------------------------------------------------------" << std::endl;
   // ---- MD approximations ----
   std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
   InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM, noise_odd, rhmc.ApproxNegPowerMD);
   std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
   std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << half_pow << std::endl;
   InversePowerBoundsCheck(half_pow, 10000, 1e16, MdagM, noise_odd, rhmc.ApproxNegHalfPowerMD);
   std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << half_pow << std::endl;
   std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
   InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV, noise_odd, rhmc.ApproxNegPowerMD);
   std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
   std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << half_pow << std::endl;
   InversePowerBoundsCheck(half_pow, 10000, 1e16, VdagV, noise_odd, rhmc.ApproxNegHalfPowerMD);
   std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << half_pow << std::endl;
 }
 int main(int argc, char **argv) {
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
  std::string param_file = "params.xml";
  bool file_load_check = false;
  for(int i=1;i<argc;i++){
    std::string sarg(argv[i]);
    if(sarg == "--param_file"){
      assert(i!=argc-1);
      param_file = argv[i+1];
    }else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
      file_load_check = true;
    }
  }
  //Read the user parameters
  EvolParameters user_params;
  if(fileExists(param_file)){
    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
    Grid::XmlReader rd(param_file);
    read(rd, "Params", user_params);
  }else if(!GlobalSharedMemory::WorldRank){
    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
    Grid::XmlWriter wr(param_file + ".templ");
    write(wr, "Params", user_params);
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  //Check the parameters
  if(user_params.GparityDirs.size() != Nd-1){
    std::cerr << "Error in input parameters: expect GparityDirs to have size = " << Nd-1 << std::endl;
    exit(1);
  }
  for(int i=0;i<Nd-1;i++)
    if(user_params.GparityDirs[i] != 0 && user_params.GparityDirs[i] != 1){
      std::cerr << "Error in input parameters: expect GparityDirs values to be 0 (periodic) or 1 (G-parity)" << std::endl;
      exit(1);
    }
   // Typedefs to simplify notation
  typedef GparityDomainWallFermionD FermionActionD;
  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
  typedef typename FermionActionD::FermionField FermionFieldD;
  typedef GparityDomainWallFermionF FermionActionF;
  typedef typename FermionActionF::Impl_t FermionImplPolicyF;
  typedef typename FermionActionF::FermionField FermionFieldF;
  typedef GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD,FermionImplPolicyF> MixedPrecRHMC;
  typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicyD> DoublePrecRHMC;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
  typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
  typedef HMCWrapper::ImplPolicy GaugeImplPolicy;
  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps = user_params.Steps;
  MD.trajL   = 1.0;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = user_params.StartTrajectory;
  HMCparams.Trajectories     = user_params.Trajectories;
  HMCparams.NoMetropolisUntil= 0;
  HMCparams.StartingType     = user_params.StartingType;
  HMCparams.MetropolisTest = user_params.MetropolisTest;
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_lat";
  CPparams.rng_prefix    = "ckpoint_rng";
  CPparams.saveInterval  = user_params.SaveInterval;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  //Note that checkpointing saves the RNG state so that this initialization is required only for the very first configuration
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  typedef PlaquetteMod<GaugeImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  const int Ls      = 16;
  Real beta         = 2.13;
  Real light_mass   = 0.01;
  Real strange_mass = 0.032;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  //Setup the Grids
  auto GridPtrD   = TheHMC.Resources.GetCartesian();
  auto GridRBPtrD = TheHMC.Resources.GetRBCartesian();
  auto FGridD     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrD);
  auto FrbGridD   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrD);
  GridCartesian* GridPtrF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian* GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF);
  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF);
  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF);
  ConjugateIwasakiGaugeActionD GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeFieldD Ud(GridPtrD);
  LatticeGaugeFieldF Uf(GridPtrF);
  //Setup the BCs
  FermionActionD::ImplParams Params;
  for(int i=0;i<Nd-1;i++) Params.twists[i] = user_params.GparityDirs[i]; //G-parity directions
  Params.twists[Nd-1] = 1; //APBC in time direction
  std::vector<int> dirs4(Nd);
  for(int i=0;i<Nd-1;i++) dirs4[i] = user_params.GparityDirs[i];
  dirs4[Nd-1] = 0; //periodic gauge BC in time
  GaugeImplPolicy::setDirections(dirs4); //gauge BC
  //Run optional gauge field checksum checker and exit
  if(file_load_check){
    TheHMC.initializeGaugeFieldAndRNGs(Ud);
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1); //light quark + strange quark
  ActionLevel<HMCWrapper::Field> Level2(8); //gauge (8 increments per step)
  /////////////////////////////////////////////////////////////
  // Light action
  /////////////////////////////////////////////////////////////
  FermionActionD Numerator_lD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, light_mass,M5,Params);
  FermionActionD Denominator_lD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, pv_mass,M5,Params);
  FermionActionF Numerator_lF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, light_mass,M5,Params);
  FermionActionF Denominator_lF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, pv_mass,M5,Params);
  RationalActionParams rat_act_params_l;
  rat_act_params_l.inv_pow  = 2; // (M^dag M)^{1/2}
  rat_act_params_l.precision= 60;
  rat_act_params_l.MaxIter  = 10000;
  user_params.rat_quo_l.Export(rat_act_params_l);
  std::cout << GridLogMessage << " Light quark bounds check every " << rat_act_params_l.BoundsCheckFreq << " trajectories (avg)" << std::endl;
  MixedPrecRHMC Quotient_l(Denominator_lD, Numerator_lD, Denominator_lF, Numerator_lF, rat_act_params_l, user_params.rat_quo_l.reliable_update_freq);
  //DoublePrecRHMC Quotient_l(Denominator_lD, Numerator_lD, rat_act_params_l);
  Level1.push_back(&Quotient_l);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD,strange_mass,M5,Params);
  FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, pv_mass,M5,Params);
  FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,strange_mass,M5,Params);
  FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, pv_mass,M5,Params);
  RationalActionParams rat_act_params_s;
  rat_act_params_s.inv_pow  = 4; // (M^dag M)^{1/4}
  rat_act_params_s.precision= 60;
  rat_act_params_s.MaxIter  = 10000;
  user_params.rat_quo_s.Export(rat_act_params_s);
  std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_l.BoundsCheckFreq << " trajectories (avg)" << std::endl;
  MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq); 
  //DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s); 
  Level1.push_back(&Quotient_s);  
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level2.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  //Action tuning
  bool tune_rhmc_l=false, tune_rhmc_s=false, eigenrange_l=false, eigenrange_s=false; 
  std::string lanc_params_l, lanc_params_s;
  for(int i=1;i<argc;i++){
    std::string sarg(argv[i]);
    if(sarg == "--tune_rhmc_l") tune_rhmc_l=true;
    else if(sarg == "--tune_rhmc_s") tune_rhmc_s=true;
    else if(sarg == "--eigenrange_l"){
      assert(i < argc-1);
      eigenrange_l=true;
      lanc_params_l = argv[i+1];
    }
    else if(sarg == "--eigenrange_s"){
      assert(i < argc-1);
      eigenrange_s=true;
      lanc_params_s = argv[i+1];
    }
  }
  if(tune_rhmc_l || tune_rhmc_s || eigenrange_l || eigenrange_s){
    TheHMC.initializeGaugeFieldAndRNGs(Ud);
    if(eigenrange_l) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_l, FGridD, FrbGridD, Ud, Numerator_lD, TheHMC.Resources.GetParallelRNG());
    if(eigenrange_s) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG());
    if(tune_rhmc_l) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_l)>(FGridD, FrbGridD, Ud, Numerator_lD, Denominator_lD, Quotient_l, TheHMC.Resources.GetParallelRNG(), 2, "light");
    if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_s)>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange");
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  //Run the HMC
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.Run();
  std::cout << GridLogMessage << " Done" << std::endl;
  Grid_finalize();
  return 0;
 } // main
--- a/HMC/DWF2p1fIwasakiGparityRHMCdouble.cc
+++ b/HMC/DWF2p1fIwasakiGparityRHMCdouble.cc
@@ -0,0 +1,473 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./HMC/DWF2p1fIwasakiGparityRHMCdouble.cc
 Copyright (C) 2015-2016
 Author: Christopher Kelly <ckelly@bnl.gov>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 //2+1f DWF+I ensemble with G-parity BCs
 //designed to reproduce ensembles in https://arxiv.org/pdf/1908.08640.pdf
 // Per-quark settings for one rational quotient action. The members are declared
 // through GRID_SERIALIZABLE_CLASS_MEMBERS; Export() copies every member except
 // reliable_update_freq into a RationalActionParams.
 struct RatQuoParameters: Serializable {
   GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters,
                                   double,  bnd_lo,
                                   double,  bnd_hi,
                                   Integer, action_degree,
                                   double,  action_tolerance,
                                   Integer, md_degree,
                                   double,  md_tolerance,
                                   Integer, reliable_update_freq,
                                   Integer, bnd_check_freq);
   // Default values, listed in member declaration order.
   RatQuoParameters() {
     bnd_lo               = 1e-2;
     bnd_hi               = 30;
     action_degree        = 10;
     action_tolerance     = 1e-10;
     md_degree            = 10;
     md_tolerance         = 1e-8;
     reliable_update_freq = 50;
     bnd_check_freq       = 20;
   }
   // Copy the bounds, the bounds-check frequency and the action / MD degrees and
   // tolerances into `into`. Other fields of `into` are left untouched.
   void Export(RationalActionParams &into) const{
     into.lo               = bnd_lo;
     into.hi               = bnd_hi;
     into.BoundsCheckFreq  = bnd_check_freq;
     into.action_degree    = action_degree;
     into.action_tolerance = action_tolerance;
     into.md_degree        = md_degree;
     into.md_tolerance     = md_tolerance;
   }
 };
 // Top-level run parameters, (de)serialised through GRID_SERIALIZABLE_CLASS_MEMBERS.
 // rat_quo_l and rat_quo_s keep the defaults set by RatQuoParameters' own constructor.
 struct EvolParameters: Serializable {
   GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters,
                                   Integer, StartTrajectory,
                                   Integer, Trajectories,
                                   Integer, SaveInterval,
                                   Integer, Steps,
                                   bool, MetropolisTest,
                                   std::string, StartingType,
                                   std::vector<Integer>, GparityDirs,
                                   RatQuoParameters, rat_quo_l,
                                   RatQuoParameters, rat_quo_s);
   // Defaults are aimed at initial thermalization; afterwards the user should
   // switch Metropolis on and use StartingType=CheckpointStart.
   EvolParameters() {
     StartTrajectory = 0;
     Trajectories    = 50;
     SaveInterval    = 5;
     Steps           = 5;
     MetropolisTest  = false;
     StartingType    = "ColdStart";
     GparityDirs.resize(3, 1); //1 for G-parity, 0 for periodic
   }
 };
 // Report whether `fn` can be opened for reading: true when an input file stream
 // constructed on it is in the good() state.
 bool fileExists(const std::string &fn){
   return std::ifstream(fn).good();
 }
 // Inputs for the Lanczos run in computeEigenvalues, (de)serialised through
 // GRID_SERIALIZABLE_CLASS_MEMBERS. There, alpha, beta and ord feed the Chebyshev
 // constructor (alpha and beta squared, ord plus one), mu is asserted to be zero,
 // and n_stop, n_want, n_use and tolerance are passed to ImplicitlyRestartedLanczos.
 struct LanczosParameters: Serializable {
   GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
                                   double, alpha,
                                   double, beta,
                                   double, mu,
                                   int,    ord,
                                   int,    n_stop,
                                   int,    n_want,
                                   int,    n_use,
                                   double, tolerance);
   // Default values: Chebyshev settings first, then the Lanczos counts and tolerance.
   LanczosParameters() {
     alpha     = 35;
     beta      = 5;
     mu        = 0;
     ord       = 100;
     n_stop    = 10;
     n_want    = 10;
     n_use     = 15;
     tolerance = 1e-6;
   }
 };
 // Run an implicitly restarted Lanczos on the SchurDiagMooeeOperator built from
 // `action` and print the resulting eigenvalues.
 //
 //  param_file : XML file holding a "LanczosParameters" block. If it does not exist,
 //               the process with WorldRank 0 writes a template to "<param_file>.templ"
 //               and the run continues with the default parameters.
 //  Grid/rbGrid: grids used for the full Gaussian source and its odd-checkerboard part.
 //  latt       : gauge field imported into `action` before the operator is built.
 //  rng        : parallel RNG used to draw the Gaussian starting vector.
 template<typename FermionActionD, typename FermionFieldD>
 void computeEigenvalues(std::string param_file,
                         GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
                         FermionActionD &action, GridParallelRNG &rng){
   LanczosParameters params;
   if(fileExists(param_file)){
     std::cout << GridLogMessage << " Reading " << param_file << std::endl;
     Grid::XmlReader rd(param_file);
     read(rd, "LanczosParameters", params);
   }else if(!GlobalSharedMemory::WorldRank){
     std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
     std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
     Grid::XmlWriter wr(param_file + ".templ");
     write(wr, "LanczosParameters", params);
   }
   // Gaussian starting vector restricted to the odd checkerboard
   FermionFieldD gauss_o(rbGrid);
   FermionFieldD gauss(Grid);
   gaussian(rng, gauss);
   pickCheckerboard(Odd, gauss_o, gauss);
   action.ImportGauge(latt);
   SchurDiagMooeeOperator<FermionActionD, FermionFieldD> hermop(action);
   PlainHermOp<FermionFieldD> hermop_wrap(hermop);
   //ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
   assert(params.mu == 0.0);
   Chebyshev<FermionFieldD> Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1);
   FunctionHermOp<FermionFieldD> Cheb_wrap(Cheb, hermop);
   std::cout << "IRL: alpha=" << params.alpha << " beta=" << params.beta << " mu=" << params.mu << " ord=" << params.ord << std::endl;
   ImplicitlyRestartedLanczos<FermionFieldD> IRL(Cheb_wrap, hermop_wrap, params.n_stop, params.n_want, params.n_use, params.tolerance, 10000);
   std::vector<RealD> eval(params.n_use);
   std::vector<FermionFieldD> evec(params.n_use, rbGrid);
   int Nconv = 0;
   IRL.calc(eval, evec, gauss_o, Nconv);
   // eval was allocated with n_use entries, so a parameter file with n_want > n_use
   // would index past its end; never print more entries than eval holds.
   // NOTE(review): calc() receives eval by name and may also change its size - confirm.
   const int n_avail = static_cast<int>(eval.size());
   const int n_print = params.n_want < n_avail ? params.n_want : n_avail;
   std::cout << "Eigenvalues:" << std::endl;
   for(int i=0;i<n_print;i++){
     std::cout << i << " " << eval[i] << std::endl;
   }
 }
 // Check the quality of the RHMC approximations held by `rhmc`.
 // A Gaussian source on the odd checkerboard is drawn, the gauge field `latt` is
 // imported into both operators, and InversePowerBoundsCheck is run for the action
 // approximations and then the MD approximations, each on the numerator and the
 // denominator operator, for the powers -1/inv_pow and -1/(2*inv_pow).
 // `quark_descr` only labels the log output.
 template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
 void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
                FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
                int inv_pow, const std::string &quark_descr){
   using ImplD = typename FermionActionD::Impl_t;
   const int half_pow = 2*inv_pow; // denominator of the "half" power, -1/(2*inv_pow)
   FermionFieldD noise_odd(rbGrid);
   FermionFieldD noise_full(Grid);
   gaussian(rng, noise_full);
   pickCheckerboard(Odd, noise_odd, noise_full);
   numOp.ImportGauge(latt);
   denOp.ImportGauge(latt);
   SchurDifferentiableOperator<ImplD> MdagM(numOp);
   SchurDifferentiableOperator<ImplD> VdagV(denOp);
   // ---- action approximations ----
   std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
   InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM, noise_odd, rhmc.ApproxNegPowerAction); //use large tolerance to prevent exit on fail; we are trying to tune here!
   std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
   std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << half_pow << std::endl;
   InversePowerBoundsCheck(half_pow, 10000, 1e16, MdagM, noise_odd, rhmc.ApproxNegHalfPowerAction);
   std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << half_pow << std::endl;
   std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
   InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV, noise_odd, rhmc.ApproxNegPowerAction);
   std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
   std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << half_pow << std::endl;
   InversePowerBoundsCheck(half_pow, 10000, 1e16, VdagV, noise_odd, rhmc.ApproxNegHalfPowerAction);
   std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << half_pow << std::endl;
   std::cout << "-------------------------------------------------------------------------------" << std::endl;
   // ---- MD approximations ----
   std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
   InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM, noise_odd, rhmc.ApproxNegPowerMD);
   std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
   std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << half_pow << std::endl;
   InversePowerBoundsCheck(half_pow, 10000, 1e16, MdagM, noise_odd, rhmc.ApproxNegHalfPowerMD);
   std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << half_pow << std::endl;
   std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
   InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV, noise_odd, rhmc.ApproxNegPowerMD);
   std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
   std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << half_pow << std::endl;
   InversePowerBoundsCheck(half_pow, 10000, 1e16, VdagV, noise_odd, rhmc.ApproxNegHalfPowerMD);
   std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << half_pow << std::endl;
 }
 int main(int argc, char **argv) {
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
  std::string param_file = "params.xml";
  bool file_load_check = false;
  for(int i=1;i<argc;i++){
    std::string sarg(argv[i]);
    if(sarg == "--param_file"){
      assert(i!=argc-1);
      param_file = argv[i+1];
    }else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
      file_load_check = true;
    }
  }
  //Read the user parameters
  EvolParameters user_params;
  if(fileExists(param_file)){
    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
    Grid::XmlReader rd(param_file);
    read(rd, "Params", user_params);
  }else if(!GlobalSharedMemory::WorldRank){
    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
    Grid::XmlWriter wr(param_file + ".templ");
    write(wr, "Params", user_params);
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  //Check the parameters
  if(user_params.GparityDirs.size() != Nd-1){
    std::cerr << "Error in input parameters: expect GparityDirs to have size = " << Nd-1 << std::endl;
    exit(1);
  }
  for(int i=0;i<Nd-1;i++)
    if(user_params.GparityDirs[i] != 0 && user_params.GparityDirs[i] != 1){
      std::cerr << "Error in input parameters: expect GparityDirs values to be 0 (periodic) or 1 (G-parity)" << std::endl;
      exit(1);
    }
   // Typedefs to simplify notation
  typedef GparityDomainWallFermionD FermionActionD;
  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
  typedef typename FermionActionD::FermionField FermionFieldD;
  typedef GparityDomainWallFermionF FermionActionF;
  typedef typename FermionActionF::Impl_t FermionImplPolicyF;
  typedef typename FermionActionF::FermionField FermionFieldF;
  typedef GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD,FermionImplPolicyF> MixedPrecRHMC;
  typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicyD> DoublePrecRHMC;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
  typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
  typedef HMCWrapper::ImplPolicy GaugeImplPolicy;
  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps = user_params.Steps;
  MD.trajL   = 1.0;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = user_params.StartTrajectory;
  HMCparams.Trajectories     = user_params.Trajectories;
  HMCparams.NoMetropolisUntil= 0;
  HMCparams.StartingType     = user_params.StartingType;
  HMCparams.MetropolisTest = user_params.MetropolisTest;
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_lat";
  CPparams.rng_prefix    = "ckpoint_rng";
  CPparams.saveInterval  = user_params.SaveInterval;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  //Note that checkpointing saves the RNG state so that this initialization is required only for the very first configuration
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  typedef PlaquetteMod<GaugeImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  const int Ls      = 16;
  Real beta         = 2.13;
  Real light_mass   = 0.01;
  Real strange_mass = 0.032;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  //Setup the Grids
  auto GridPtrD   = TheHMC.Resources.GetCartesian();
  auto GridRBPtrD = TheHMC.Resources.GetRBCartesian();
  auto FGridD     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrD);
  auto FrbGridD   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrD);
  GridCartesian* GridPtrF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian* GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF);
  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF);
  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF);
  ConjugateIwasakiGaugeActionD GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeFieldD Ud(GridPtrD);
  LatticeGaugeFieldF Uf(GridPtrF);
  //Setup the BCs
  FermionActionD::ImplParams Params;
  for(int i=0;i<Nd-1;i++) Params.twists[i] = user_params.GparityDirs[i]; //G-parity directions
  Params.twists[Nd-1] = 1; //APBC in time direction
  std::vector<int> dirs4(Nd);
  for(int i=0;i<Nd-1;i++) dirs4[i] = user_params.GparityDirs[i];
  dirs4[Nd-1] = 0; //periodic gauge BC in time
  GaugeImplPolicy::setDirections(dirs4); //gauge BC
  //Run optional gauge field checksum checker and exit
  if(file_load_check){
    TheHMC.initializeGaugeFieldAndRNGs(Ud);
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1); //light quark + strange quark
  ActionLevel<HMCWrapper::Field> Level2(8); //gauge (8 increments per step)
  /////////////////////////////////////////////////////////////
  // Light action
  /////////////////////////////////////////////////////////////
  FermionActionD Numerator_lD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, light_mass,M5,Params);
  FermionActionD Denominator_lD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, pv_mass,M5,Params);
  FermionActionF Numerator_lF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, light_mass,M5,Params);
  FermionActionF Denominator_lF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, pv_mass,M5,Params);
  RationalActionParams rat_act_params_l;
  rat_act_params_l.inv_pow  = 2; // (M^dag M)^{1/2}
  rat_act_params_l.precision= 60;
  rat_act_params_l.MaxIter  = 10000;
  user_params.rat_quo_l.Export(rat_act_params_l);
  std::cout << GridLogMessage << " Light quark bounds check every " << rat_act_params_l.BoundsCheckFreq << " trajectories (avg)" << std::endl;
  //MixedPrecRHMC Quotient_l(Denominator_lD, Numerator_lD, Denominator_lF, Numerator_lF, rat_act_params_l, user_params.rat_quo_l.reliable_update_freq);
  DoublePrecRHMC Quotient_l(Denominator_lD, Numerator_lD, rat_act_params_l);
  Level1.push_back(&Quotient_l);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD,strange_mass,M5,Params);
  FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, pv_mass,M5,Params);
  FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,strange_mass,M5,Params);
  FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, pv_mass,M5,Params);
  RationalActionParams rat_act_params_s;
  rat_act_params_s.inv_pow  = 4; // (M^dag M)^{1/4}
  rat_act_params_s.precision= 60;
  rat_act_params_s.MaxIter  = 10000;
  user_params.rat_quo_s.Export(rat_act_params_s);
  std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_l.BoundsCheckFreq << " trajectories (avg)" << std::endl;
  //MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq); 
  DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s); 
  Level1.push_back(&Quotient_s);  
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level2.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  //Action tuning
  bool tune_rhmc_l=false, tune_rhmc_s=false, eigenrange_l=false, eigenrange_s=false; 
  std::string lanc_params_l, lanc_params_s;
  for(int i=1;i<argc;i++){
    std::string sarg(argv[i]);
    if(sarg == "--tune_rhmc_l") tune_rhmc_l=true;
    else if(sarg == "--tune_rhmc_s") tune_rhmc_s=true;
    else if(sarg == "--eigenrange_l"){
      assert(i < argc-1);
      eigenrange_l=true;
      lanc_params_l = argv[i+1];
    }
    else if(sarg == "--eigenrange_s"){
      assert(i < argc-1);
      eigenrange_s=true;
      lanc_params_s = argv[i+1];
    }
  }
  if(tune_rhmc_l || tune_rhmc_s || eigenrange_l || eigenrange_s){
    TheHMC.initializeGaugeFieldAndRNGs(Ud);
    if(eigenrange_l) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_l, FGridD, FrbGridD, Ud, Numerator_lD, TheHMC.Resources.GetParallelRNG());
    if(eigenrange_s) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG());
    if(tune_rhmc_l) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_l)>(FGridD, FrbGridD, Ud, Numerator_lD, Denominator_lD, Quotient_l, TheHMC.Resources.GetParallelRNG(), 2, "light");
    if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_s)>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange");
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  //Run the HMC
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.Run();
  std::cout << GridLogMessage << " Done" << std::endl;
  Grid_finalize();
  return 0;
 } // main
--- a/HMC/Mobius2p1fIDSDRGparityEOFA.cc
+++ b/HMC/Mobius2p1fIDSDRGparityEOFA.cc
@@ -0,0 +1,765 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./HMC/Mobius2p1fIDSDRGparityEOFA.cc
 Copyright (C) 2015-2016
 Author: Christopher Kelly <ckelly@bnl.gov>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 //We try to reproduce with G-parity BCs the 246 MeV 1.37 GeV ensemble
 //To speed things up we will use Mobius DWF with b+c=32/12 and Ls=12 to match the Ls=32 of the original
 //These parameters match those used in the 2020 K->pipi paper
 //User-serializable settings for one rational-quotient pseudofermion term.
 //The member list order below defines the serialized layout and must not be changed.
 struct RatQuoParameters: Serializable {
   GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters,
                                   double, bnd_lo,
                                   double, bnd_hi,
                                   Integer, action_degree,
                                   double, action_tolerance,
                                   Integer, md_degree,
                                   double, md_tolerance,
                                   Integer, reliable_update_freq,
                                   Integer, bnd_check_freq);
   //Defaults, listed in declaration order
   RatQuoParameters() {
     bnd_lo               = 1e-2;
     bnd_hi               = 30;
     action_degree        = 10;
     action_tolerance     = 1e-10;
     md_degree            = 10;
     md_tolerance         = 1e-8;
     reliable_update_freq = 50;
     bnd_check_freq       = 20;
   }
   //Copy the approximation bounds, degrees, tolerances and bounds-check frequency into 'into'.
   //Note: reliable_update_freq is not transferred here; callers needing it read it directly.
   void Export(RationalActionParams &into) const{
     //Spectral range of the approximation
     into.lo = bnd_lo;
     into.hi = bnd_hi;
     //Degrees of the action and MD approximations
     into.action_degree = action_degree;
     into.md_degree     = md_degree;
     //Tolerances of the action and MD approximations
     into.action_tolerance = action_tolerance;
     into.md_tolerance     = md_tolerance;
     into.BoundsCheckFreq  = bnd_check_freq;
   }
 };
 //User-serializable settings for the EOFA term: the rational parameters plus the
 //outer tolerances and mixed-precision inner tolerances for the action and MD solves.
 //The member list order below defines the serialized layout and must not be changed.
 struct EOFAparameters: Serializable {
   GRID_SERIALIZABLE_CLASS_MEMBERS(EOFAparameters,
                                   OneFlavourRationalParams, rat_params,
                                   double, action_tolerance,
                                   double, action_mixcg_inner_tolerance,
                                   double, md_tolerance,
                                   double, md_mixcg_inner_tolerance);
   //Defaults
   EOFAparameters() {
     //Rational approximation settings
     rat_params.lo       = 0.1;
     rat_params.hi       = 25.0;
     rat_params.MaxIter  = 10000;
     rat_params.tolerance= 1.0e-9;
     rat_params.degree   = 14;
     rat_params.precision= 50;
     //Solver tolerances (outer, then mixed-precision inner) for the action ...
     action_tolerance             = 1e-10;
     action_mixcg_inner_tolerance = 1e-8;
     //... and for the MD force
     md_tolerance             = 1e-8;
     md_mixcg_inner_tolerance = 1e-8;
   }
 };
 //Top-level user parameters for the evolution, read from / written to XML under the tag "Params".
 //The member list order below defines the serialized layout and must not be changed.
 struct EvolParameters: Serializable {
   GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters,
                                   Integer, StartTrajectory,
                                   Integer, Trajectories,
                                   Integer, SaveInterval,
                                   Integer, Steps,
                                   bool, MetropolisTest,
                                   std::string, StartingType,
                                   std::vector<Integer>, GparityDirs,
                                   EOFAparameters, eofa_l,
                                   RatQuoParameters, rat_quo_s,
                                   RatQuoParameters, rat_quo_DSDR);
   //Defaults; the nested eofa_l / rat_quo_s / rat_quo_DSDR members keep their own default-constructed values
   EvolParameters() {
     StartTrajectory   = 0;
     Trajectories      = 50;
     SaveInterval      = 5;
     Steps             = 5;
     //For initial thermalization; afterwards user should switch Metropolis on and use StartingType=CheckpointStart
     MetropolisTest    = false;
     StartingType      = "ColdStart";
     GparityDirs.resize(3, 1); //1 for G-parity, 0 for periodic
   }
 };
 //Return true when 'fn' can be opened for reading (the stream is in a good state right after opening)
 bool fileExists(const std::string &fn){
   return std::ifstream(fn).good();
 }
 //User-serializable settings consumed by computeEigenvalues: the Chebyshev inputs
 //(alpha, beta, mu, ord) and the Lanczos sizes and tolerance (n_stop, n_want, n_use, tolerance).
 //The member list order below defines the serialized layout and must not be changed.
 struct LanczosParameters: Serializable {
   GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
                                   double, alpha,
                                   double, beta,
                                   double, mu,
                                   int, ord,
                                   int, n_stop,
                                   int, n_want,
                                   int, n_use,
                                   double, tolerance);
   //Defaults
   LanczosParameters() {
     //Chebyshev inputs
     alpha = 35;
     beta  = 5;
     mu    = 0;
     ord   = 100;
     //Lanczos sizes and convergence tolerance
     n_stop    = 10;
     n_want    = 10;
     n_use     = 15;
     tolerance = 1e-6;
   }
 };
 //Run an implicitly restarted Lanczos on the SchurDiagMooee operator built from 'action' and print the eigenvalues.
 //  param_file : XML file holding a "LanczosParameters" record. If it does not exist, world rank 0 writes a
 //               template to <param_file>.templ and the calculation proceeds with the built-in defaults.
 //  Grid/rbGrid: full and red-black grids on which the Gaussian start vector is generated / checkerboarded
 //  latt       : gauge field imported into 'action' before the operator is used
 //  rng        : parallel RNG used for the Gaussian start vector
 //Requires params.mu == 0 (asserted).
 template<typename FermionActionD, typename FermionFieldD>
 void computeEigenvalues(std::string param_file,
                         GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
                         FermionActionD &action, GridParallelRNG &rng){
   LanczosParameters params;
   if(fileExists(param_file)){
     std::cout << GridLogMessage << " Reading " << param_file << std::endl;
     Grid::XmlReader rd(param_file);
     read(rd, "LanczosParameters", params);
   }else if(!GlobalSharedMemory::WorldRank){
     std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
     std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
     Grid::XmlWriter wr(param_file + ".templ");
     write(wr, "LanczosParameters", params);
   }
   //Gaussian start vector restricted to the odd checkerboard
   FermionFieldD gauss_o(rbGrid);
   FermionFieldD gauss(Grid);
   gaussian(rng, gauss);
   pickCheckerboard(Odd, gauss_o, gauss);
   action.ImportGauge(latt);
   SchurDiagMooeeOperator<FermionActionD, FermionFieldD> hermop(action);
   PlainHermOp<FermionFieldD> hermop_wrap(hermop);
   //ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
   assert(params.mu == 0.0);
   //The Chebyshev is built from the squares of beta and alpha and order ord+1
   Chebyshev<FermionFieldD> Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1);
   FunctionHermOp<FermionFieldD> Cheb_wrap(Cheb, hermop);
   std::cout << "IRL: alpha=" << params.alpha << " beta=" << params.beta << " mu=" << params.mu << " ord=" << params.ord << std::endl;
   ImplicitlyRestartedLanczos<FermionFieldD> IRL(Cheb_wrap, hermop_wrap, params.n_stop, params.n_want, params.n_use, params.tolerance, 10000);
   std::vector<RealD> eval(params.n_use);
   std::vector<FermionFieldD> evec(params.n_use, rbGrid);
   int Nconv = 0; //written by IRL.calc; initialized so it never holds an indeterminate value
   IRL.calc(eval, evec, gauss_o, Nconv);
   //eval is handed to IRL.calc by non-const reference and n_want comes from user input, so
   //never index beyond what the vector actually holds when printing
   const int n_avail = static_cast<int>(eval.size());
   const int n_print = (params.n_want < n_avail) ? params.n_want : n_avail;
   std::cout << "Eigenvalues:" << std::endl;
   for(int i=0;i<n_print;i++){
     std::cout << i << " " << eval[i] << std::endl;
   }
 }
 //Check the quality of the RHMC approx
 //Prints power-method estimates of the largest eigenvalue of the numerator and denominator Schur operators,
 //then runs InversePowerBoundsCheck on the rational approximations held by 'rhmc' for powers -1/inv_pow and -1/(2*inv_pow).
 //action_or_md toggles checking the action (0), MD (1) or both (2) setups
 template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
 void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
                FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
                int inv_pow, const std::string &quark_descr, int action_or_md){
   assert(action_or_md == 0 || action_or_md == 1 || action_or_md == 2);
   const bool do_action = (action_or_md != 1);
   const bool do_md     = (action_or_md != 0);
   //Gaussian noise vector restricted to the odd checkerboard
   FermionFieldD gauss_o(rbGrid);
   FermionFieldD gauss(Grid);
   gaussian(rng, gauss);
   pickCheckerboard(Odd, gauss_o, gauss);
   numOp.ImportGauge(latt);
   denOp.ImportGauge(latt);
   typedef typename FermionActionD::Impl_t FermionImplPolicyD;
   SchurDifferentiableOperator<FermionImplPolicyD> MdagM(numOp);
   SchurDifferentiableOperator<FermionImplPolicyD> VdagV(denOp);
   //Upper end of the spectrum for both operators
   PowerMethod<FermionFieldD> power_method;
   RealD lambda_max;
   std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " numerator" << std::endl;
   lambda_max = power_method(MdagM,gauss_o);
   std::cout << GridLogMessage << "Got lambda_max "<<lambda_max<<std::endl;
   std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " denominator" << std::endl;
   lambda_max = power_method(VdagV,gauss_o);
   std::cout << GridLogMessage << "Got lambda_max "<<lambda_max<<std::endl;
   //One bracketed bounds check: announce, run, announce.
   //use large tolerance to prevent exit on fail; we are trying to tune here!
   auto bounds_check = [&](const char *approx_kind, const char *which_op, int power, auto &op, auto &approx){
     std::cout << "Starting: Checking quality of RHMC " << approx_kind << " approx for " << quark_descr
               << " quark " << which_op << " and power -1/" << power << std::endl;
     InversePowerBoundsCheck(power, 10000, 1e16, op, gauss_o, approx);
     std::cout << "Finished: Checking quality of RHMC " << approx_kind << " approx for " << quark_descr
               << " quark " << which_op << " and power -1/" << power << std::endl;
   };
   if(do_action){
     bounds_check("action", "numerator",   inv_pow,   MdagM, rhmc.ApproxNegPowerAction);
     bounds_check("action", "numerator",   2*inv_pow, MdagM, rhmc.ApproxNegHalfPowerAction);
     bounds_check("action", "denominator", inv_pow,   VdagV, rhmc.ApproxNegPowerAction);
     bounds_check("action", "denominator", 2*inv_pow, VdagV, rhmc.ApproxNegHalfPowerAction);
   }
   //The separator is printed irrespective of which checks are enabled
   std::cout << "-------------------------------------------------------------------------------" << std::endl;
   if(do_md){
     bounds_check("MD", "numerator",   inv_pow,   MdagM, rhmc.ApproxNegPowerMD);
     bounds_check("MD", "numerator",   2*inv_pow, MdagM, rhmc.ApproxNegHalfPowerMD);
     bounds_check("MD", "denominator", inv_pow,   VdagV, rhmc.ApproxNegPowerMD);
     bounds_check("MD", "denominator", 2*inv_pow, VdagV, rhmc.ApproxNegHalfPowerMD);
   }
 }
 //Exercise the EOFA action once on a Gaussian source scaled by sqrt(1/2): refresh the pseudofermion
 //from 'eta' and evaluate the action on 'latt'.
 //NOTE(review): presumably refresh()/S() perform the bounds checks internally - confirm against the action class
 template<typename FermionImplPolicy>
 void checkEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
                GridCartesian* FGrid, GridParallelRNG &rng, const LatticeGaugeFieldD &latt){
   typedef typename FermionImplPolicy::FermionField Field;
   std::cout << GridLogMessage << "Starting EOFA action/bounds check" << std::endl;
   Field eta(FGrid);
   RealD scale = std::sqrt(0.5);
   gaussian(rng,eta);
   eta = eta * scale;
   //Use the inbuilt check
   EOFA.refresh(latt, eta);
   EOFA.S(latt);
   std::cout << GridLogMessage << "Finished EOFA upper action/bounds check" << std::endl;
 }
 //Adapter exposing EOFA.Meofa(U, in, out) through the LinearOperatorBase interface.
 //Only HermOp is usable; every other entry point asserts.
 template<typename FermionImplPolicy>
 class EOFAlinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
   typedef ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> ActionType;
   ActionType &eofa_action;     //held by reference; must outlive this object
   LatticeGaugeFieldD &gauge;   //held by reference; must outlive this object
  public:
   typedef typename FermionImplPolicy::FermionField Field;
   EOFAlinop(ActionType &EOFA, LatticeGaugeFieldD &U): eofa_action(EOFA), gauge(U){}
   //Unsupported entry points
   void OpDiag (const Field &in, Field &out){
     assert(0);
   }
   void OpDir  (const Field &in, Field &out,int dir,int disp){
     assert(0);
   }
   void OpDirAll  (const Field &in, std::vector<Field> &out){
     assert(0);
   }
   void Op     (const Field &in, Field &out){
     assert(0);
   }
   void AdjOp  (const Field &in, Field &out){
     assert(0);
   }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
     assert(0);
   }
   //The one supported operation: out = Meofa applied to in on the stored gauge field
   void HermOp(const Field &in, Field &out){
     eofa_action.Meofa(gauge, in, out);
   }
 };
 //Apply the power method to the EOFA operator (via EOFAlinop) starting from a Gaussian vector
 //and print the result as the operator's upper bound.
 template<typename FermionImplPolicy>
 void upperBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
                     GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
   typedef typename FermionImplPolicy::FermionField Field;
   std::cout << GridLogMessage << "Starting EOFA upper bound compute" << std::endl;
   EOFAlinop<FermionImplPolicy> meofa_op(EOFA, latt);
   //Random start vector
   Field start_vec(FGrid);
   gaussian(rng,start_vec);
   PowerMethod<Field> power_method;
   auto largest = power_method(meofa_op,start_vec);
   std::cout << GridLogMessage << "Upper bound of EOFA operator " << largest << std::endl;
 }
 //Applications of M^{-1} cost the same as M for EOFA!
 //Adapter exposing EOFA.MeofaInv(U, in, out) through the LinearOperatorBase interface.
 //Only HermOp is usable; every other entry point asserts.
 template<typename FermionImplPolicy>
 class EOFAinvLinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
   typedef ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> ActionType;
   ActionType &eofa_action;     //held by reference; must outlive this object
   LatticeGaugeFieldD &gauge;   //held by reference; must outlive this object
  public:
   typedef typename FermionImplPolicy::FermionField Field;
   EOFAinvLinop(ActionType &EOFA, LatticeGaugeFieldD &U): eofa_action(EOFA), gauge(U){}
   //Unsupported entry points
   void OpDiag (const Field &in, Field &out){
     assert(0);
   }
   void OpDir  (const Field &in, Field &out,int dir,int disp){
     assert(0);
   }
   void OpDirAll  (const Field &in, std::vector<Field> &out){
     assert(0);
   }
   void Op     (const Field &in, Field &out){
     assert(0);
   }
   void AdjOp  (const Field &in, Field &out){
     assert(0);
   }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
     assert(0);
   }
   //The one supported operation: out = MeofaInv applied to in on the stored gauge field
   void HermOp(const Field &in, Field &out){
     eofa_action.MeofaInv(gauge, in, out);
   }
 };
 //Apply the power method to the inverse EOFA operator (via EOFAinvLinop) starting from a Gaussian vector
 //and print the reciprocal of the result as the lower bound of the EOFA operator.
 template<typename FermionImplPolicy>
 void lowerBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
                     GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
   typedef typename FermionImplPolicy::FermionField Field;
   std::cout << GridLogMessage << "Starting EOFA lower bound compute using power method on M^{-1}. Inverse of highest eigenvalue is the lowest eigenvalue of M" << std::endl;
   EOFAinvLinop<FermionImplPolicy> inverse_op(EOFA, latt);
   //Random start vector
   Field start_vec(FGrid);
   gaussian(rng,start_vec);
   PowerMethod<Field> power_method;
   auto largest_of_inverse = power_method(inverse_op,start_vec);
   std::cout << GridLogMessage << "Lower bound of EOFA operator " << 1./largest_of_inverse << std::endl;
 }
 NAMESPACE_BEGIN(Grid);
   //OperatorFunction wrapper that solves with a mixed-precision conjugate gradient.
   //On each call the double-precision gauge links of FermOpD are converted into FermOpF (and its even/odd
   //checkerboards are re-extracted), then MixedPrecisionConjugateGradient is run with the stored
   //single- and double-precision Schur operators.
   //All operator/grid members are references or raw pointers supplied by the caller and must outlive this object.
   template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
   class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
   public:
     typedef typename FermionOperatorD::FermionField FieldD;
     typedef typename FermionOperatorF::FermionField FieldF;
     using OperatorFunction<FieldD>::operator();
     RealD   Tolerance;
     RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
     Integer MaxInnerIterations;
     Integer MaxOuterIterations;
     GridBase* SinglePrecGrid4; //Grid for single-precision fields
     GridBase* SinglePrecGrid5; //Grid for single-precision fields
     RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
     FermionOperatorF &FermOpF;
     FermionOperatorD &FermOpD;
     SchurOperatorF &LinOpF;
     SchurOperatorD &LinOpD;
     //Iteration counters. They are not updated anywhere in this class; they are zero-initialized
     //so that reading them never yields an indeterminate value.
     Integer TotalInnerIterations; //Number of inner CG iterations
     Integer TotalOuterIterations; //Number of restarts
     Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
     //tol        : outer tolerance; also the initial value of InnerTolerance
     //maxinnerit : maximum inner iterations; maxouterit : maximum outer iterations
     //_sp_grid4 / _sp_grid5 : single-precision grids (only the second is passed on to the solver here)
     //The initializer list follows the member declaration order, which is the order C++ initializes in.
     MixedPrecisionConjugateGradientOperatorFunction(RealD tol, 
                                                     Integer maxinnerit, 
                                                     Integer maxouterit, 
                                                     GridBase* _sp_grid4, 
                                                     GridBase* _sp_grid5, 
                                                     FermionOperatorF &_FermOpF,
                                                     FermionOperatorD &_FermOpD,
                                                     SchurOperatorF   &_LinOpF,
                                                     SchurOperatorD   &_LinOpD): 
       Tolerance(tol), 
       InnerTolerance(tol), 
       MaxInnerIterations(maxinnerit), 
       MaxOuterIterations(maxouterit), 
       SinglePrecGrid4(_sp_grid4),
       SinglePrecGrid5(_sp_grid5),
       OuterLoopNormMult(100.),
       FermOpF(_FermOpF),
       FermOpD(_FermOpD),
       LinOpF(_LinOpF),
       LinOpD(_LinOpD),
       TotalInnerIterations(0),
       TotalOuterIterations(0),
       TotalFinalStepIterations(0)
     { 
     }
     //Solve for psi given src. LinOpU must be a SchurOperatorD wrapping the same matrix as LinOpD.
     void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
       std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
       //Unchecked downcast; the assert below verifies the caller's operator wraps the same matrix as LinOpD
       SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
       assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
       //Bring the single-precision operator's gauge links up to date with the double-precision ones
       precisionChange(FermOpF.Umu, FermOpD.Umu);
       pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
       pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
       ////////////////////////////////////////////////////////////////////////////////////
       // Make a mixed precision conjugate gradient
       ////////////////////////////////////////////////////////////////////////////////////
       MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
       MPCG.InnerTolerance = InnerTolerance;
       std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
       MPCG(src,psi);
     }
   };
 NAMESPACE_END(Grid);
 int main(int argc, char **argv) {
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
  std::string param_file = "params.xml";
  bool file_load_check = false;
  for(int i=1;i<argc;i++){
    std::string sarg(argv[i]);
    if(sarg == "--param_file"){
      assert(i!=argc-1);
      param_file = argv[i+1];
    }else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
      file_load_check = true;
    }
  }
  //Read the user parameters
  EvolParameters user_params;
  if(fileExists(param_file)){
    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
    Grid::XmlReader rd(param_file);
    read(rd, "Params", user_params);
  }else if(!GlobalSharedMemory::WorldRank){
    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
    {
      Grid::XmlWriter wr(param_file + ".templ");
      write(wr, "Params", user_params);
    }
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  //Check the parameters
  if(user_params.GparityDirs.size() != Nd-1){
    std::cerr << "Error in input parameters: expect GparityDirs to have size = " << Nd-1 << std::endl;
    exit(1);
  }
  for(int i=0;i<Nd-1;i++)
    if(user_params.GparityDirs[i] != 0 && user_params.GparityDirs[i] != 1){
      std::cerr << "Error in input parameters: expect GparityDirs values to be 0 (periodic) or 1 (G-parity)" << std::endl;
      exit(1);
    }
  typedef GparityMobiusEOFAFermionD EOFAactionD;
  typedef GparityMobiusFermionD FermionActionD;
  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
  typedef typename FermionActionD::FermionField FermionFieldD;
  typedef GparityMobiusEOFAFermionF EOFAactionF;
  typedef GparityMobiusFermionF FermionActionF;
  typedef typename FermionActionF::Impl_t FermionImplPolicyF;
  typedef typename FermionActionF::FermionField FermionFieldF;
  typedef GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD,FermionImplPolicyF> MixedPrecRHMC;
  typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicyD> DoublePrecRHMC;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
  typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
  typedef HMCWrapper::ImplPolicy GaugeImplPolicy;
  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps = user_params.Steps;
  MD.trajL   = 1.0;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = user_params.StartTrajectory;
  HMCparams.Trajectories     = user_params.Trajectories;
  HMCparams.NoMetropolisUntil= 0;
  HMCparams.StartingType     = user_params.StartingType;
  HMCparams.MetropolisTest = user_params.MetropolisTest;
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_lat";
  CPparams.rng_prefix    = "ckpoint_rng";
  CPparams.saveInterval  = user_params.SaveInterval;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  //Note that checkpointing saves the RNG state so that this initialization is required only for the very first configuration
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  typedef PlaquetteMod<GaugeImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  const int Ls      = 12;
  Real beta         = 1.75;
  Real light_mass   = 0.0042; //240 MeV
  Real strange_mass = 0.045;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  RealD mobius_scale = 32./12.; //b+c
  RealD mob_bmc = 1.0;
  RealD mob_b = (mobius_scale + mob_bmc)/2.;
  RealD mob_c = (mobius_scale - mob_bmc)/2.;
  //Setup the Grids
  auto UGridD   = TheHMC.Resources.GetCartesian();
  auto UrbGridD = TheHMC.Resources.GetRBCartesian();
  auto FGridD     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);
  auto FrbGridD   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);
  GridCartesian* UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian* UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
  ConjugateIwasakiGaugeActionD GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeFieldD Ud(UGridD);
  LatticeGaugeFieldF Uf(UGridF);
  //Setup the BCs
  FermionActionD::ImplParams Params;
  for(int i=0;i<Nd-1;i++) Params.twists[i] = user_params.GparityDirs[i]; //G-parity directions
  Params.twists[Nd-1] = 1; //APBC in time direction
  std::vector<int> dirs4(Nd);
  for(int i=0;i<Nd-1;i++) dirs4[i] = user_params.GparityDirs[i];
  dirs4[Nd-1] = 0; //periodic gauge BC in time
  GaugeImplPolicy::setDirections(dirs4); //gauge BC
  //Run optional gauge field checksum checker and exit
  if(file_load_check){
    TheHMC.initializeGaugeFieldAndRNGs(Ud);
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1); //light quark + strange quark
  ActionLevel<HMCWrapper::Field> Level2(1); //DSDR
  ActionLevel<HMCWrapper::Field> Level3(8); //gauge (8 increments per step)
  /////////////////////////////////////////////////////////////
  // Light EOFA action
  // have to be careful with the parameters, cf. Test_dwf_gpforce_eofa.cc
  /////////////////////////////////////////////////////////////
  EOFAactionD LopD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, light_mass, light_mass, pv_mass, 0.0, -1, M5, mob_b, mob_c, Params);
  EOFAactionF LopF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, light_mass, light_mass, pv_mass, 0.0, -1, M5, mob_b, mob_c, Params);
  EOFAactionD RopD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, pv_mass, light_mass, pv_mass, -1.0, 1, M5, mob_b, mob_c, Params);
  EOFAactionF RopF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, pv_mass, light_mass, pv_mass, -1.0, 1, M5, mob_b, mob_c, Params);
  typedef SchurDiagMooeeOperator<EOFAactionD,FermionFieldD> EOFAschuropD;
  typedef SchurDiagMooeeOperator<EOFAactionF,FermionFieldF> EOFAschuropF;
  EOFAschuropD linopL_D(LopD);
  EOFAschuropD linopR_D(RopD);
  EOFAschuropF linopL_F(LopF);
  EOFAschuropF linopR_F(RopF);
  typedef MixedPrecisionConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_mxCG;
  EOFA_mxCG ActionMCG_L(user_params.eofa_l.action_tolerance, 10000, 1000, UGridF, FrbGridF, LopF, LopD, linopL_F, linopL_D);
  ActionMCG_L.InnerTolerance = user_params.eofa_l.action_mixcg_inner_tolerance;
  EOFA_mxCG ActionMCG_R(user_params.eofa_l.action_tolerance, 10000, 1000, UGridF, FrbGridF, RopF, RopD, linopR_F, linopR_D);
  ActionMCG_R.InnerTolerance = user_params.eofa_l.action_mixcg_inner_tolerance;
  EOFA_mxCG DerivMCG_L(user_params.eofa_l.md_tolerance, 10000, 1000, UGridF, FrbGridF, LopF, LopD, linopL_F, linopL_D);
  DerivMCG_L.InnerTolerance = user_params.eofa_l.md_mixcg_inner_tolerance;
  EOFA_mxCG DerivMCG_R(user_params.eofa_l.md_tolerance, 10000, 1000, UGridF, FrbGridF, RopF, RopD, linopR_F, linopR_D);
  DerivMCG_R.InnerTolerance = user_params.eofa_l.md_mixcg_inner_tolerance;
  std::cout << GridLogMessage << "Set EOFA action solver action tolerance outer=" << ActionMCG_L.Tolerance << " inner=" << ActionMCG_L.InnerTolerance << std::endl;
  std::cout << GridLogMessage << "Set EOFA MD solver tolerance outer=" << DerivMCG_L.Tolerance << " inner=" << DerivMCG_L.InnerTolerance << std::endl;
  ConjugateGradient<FermionFieldD>      ActionCG(user_params.eofa_l.action_tolerance, 10000);
  ConjugateGradient<FermionFieldD>  DerivativeCG(user_params.eofa_l.md_tolerance, 10000);
  // ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicyD> EOFA(LopD, RopD, 
  // 								   ActionCG, ActionCG, ActionCG, 
  // 								   DerivativeCG, DerivativeCG, 
  // 								   user_params.eofa_l.rat_params, true);
  // ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicyD> EOFA(LopD, RopD, 
  // 								   ActionMCG_L, ActionMCG_R, 
  // 								   ActionMCG_L, ActionMCG_R, 
  // 								   DerivMCG_L, DerivMCG_R, 
  // 								   user_params.eofa_l.rat_params, true);
  ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction<FermionImplPolicyD, FermionImplPolicyF> EOFA(LopF, RopF,
 													LopD, RopD, 
 													ActionMCG_L, ActionMCG_R, 
 													ActionMCG_L, ActionMCG_R, 
 													DerivMCG_L, DerivMCG_R, 
 													user_params.eofa_l.rat_params, true);
  Level1.push_back(&EOFA);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD,strange_mass,M5,mob_b,mob_c,Params);
  FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD, pv_mass,M5,mob_b,mob_c,Params);
  FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF,strange_mass,M5,mob_b,mob_c,Params);
  FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF, pv_mass,M5,mob_b,mob_c,Params);
  RationalActionParams rat_act_params_s;
  rat_act_params_s.inv_pow  = 4; // (M^dag M)^{1/4}
  rat_act_params_s.precision= 60;
  rat_act_params_s.MaxIter  = 10000;
  user_params.rat_quo_s.Export(rat_act_params_s);
  std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_s.BoundsCheckFreq << " trajectories (avg)" << std::endl;
  //MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq); 
  DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s); 
  Level1.push_back(&Quotient_s);  
  ///////////////////////////////////
  // DSDR action
  ///////////////////////////////////
  RealD dsdr_mass=-1.8;   
  //Use same DSDR twists as https://arxiv.org/pdf/1208.4412.pdf
  RealD dsdr_epsilon_f = 0.02; //numerator (in determinant)
  RealD dsdr_epsilon_b = 0.5; 
  GparityWilsonTMFermionD Numerator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_f, Params);
  GparityWilsonTMFermionF Numerator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_f, Params);
  GparityWilsonTMFermionD Denominator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_b, Params);
  GparityWilsonTMFermionF Denominator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_b, Params);
  RationalActionParams rat_act_params_DSDR;
  rat_act_params_DSDR.inv_pow  = 2; // (M^dag M)^{1/2}
  rat_act_params_DSDR.precision= 60;
  rat_act_params_DSDR.MaxIter  = 10000;
  user_params.rat_quo_DSDR.Export(rat_act_params_DSDR);
  std::cout << GridLogMessage << "DSDR quark bounds check every " << rat_act_params_DSDR.BoundsCheckFreq << " trajectories (avg)" << std::endl;
  DoublePrecRHMC Quotient_DSDR(Denominator_DSDR_D, Numerator_DSDR_D, rat_act_params_DSDR);
  Level2.push_back(&Quotient_DSDR);
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level3.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  TheHMC.TheAction.push_back(Level3);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  //Action tuning
  bool 
    tune_rhmc_s=false, eigenrange_s=false, 
    tune_rhmc_DSDR=false, eigenrange_DSDR=false, 
    check_eofa=false, 
    upper_bound_eofa=false, lower_bound_eofa(false);
  std::string lanc_params_s;
  std::string lanc_params_DSDR;
  int tune_rhmc_s_action_or_md;
  int tune_rhmc_DSDR_action_or_md;
  for(int i=1;i<argc;i++){
    std::string sarg(argv[i]);
    if(sarg == "--tune_rhmc_s"){
      assert(i < argc-1);
      tune_rhmc_s=true;
      tune_rhmc_s_action_or_md = std::stoi(argv[i+1]);
    }
    else if(sarg == "--eigenrange_s"){
      assert(i < argc-1);
      eigenrange_s=true;
      lanc_params_s = argv[i+1];
    }
    else if(sarg == "--tune_rhmc_DSDR"){
      assert(i < argc-1);
      tune_rhmc_DSDR=true;
      tune_rhmc_DSDR_action_or_md = std::stoi(argv[i+1]);
    }
    else if(sarg == "--eigenrange_DSDR"){
      assert(i < argc-1);
      eigenrange_DSDR=true;
      lanc_params_DSDR = argv[i+1];
    }
    else if(sarg == "--check_eofa") check_eofa = true;
    else if(sarg == "--upper_bound_eofa") upper_bound_eofa = true;
    else if(sarg == "--lower_bound_eofa") lower_bound_eofa = true;
  }
  if(tune_rhmc_s || eigenrange_s || tune_rhmc_DSDR || eigenrange_DSDR ||check_eofa || upper_bound_eofa || lower_bound_eofa) {
    std::cout << GridLogMessage << "Running checks" << std::endl;
    TheHMC.initializeGaugeFieldAndRNGs(Ud);
    std::cout << GridLogMessage << "EOFA action solver action tolerance outer=" << ActionMCG_L.Tolerance << " inner=" << ActionMCG_L.InnerTolerance << std::endl;
    std::cout << GridLogMessage << "EOFA MD solver tolerance outer=" << DerivMCG_L.Tolerance << " inner=" << DerivMCG_L.InnerTolerance << std::endl;
    if(check_eofa) checkEOFA(EOFA, FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
    if(upper_bound_eofa) upperBoundEOFA(EOFA, FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
    if(lower_bound_eofa) lowerBoundEOFA(EOFA, FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
    if(eigenrange_s) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG());
    if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_s)>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange",  tune_rhmc_s_action_or_md);
    if(eigenrange_DSDR) computeEigenvalues<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField>(lanc_params_DSDR, UGridD, UrbGridD, Ud, Numerator_DSDR_D, TheHMC.Resources.GetParallelRNG());
    if(tune_rhmc_DSDR) checkRHMC<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField, decltype(Quotient_DSDR)>(UGridD, UrbGridD, Ud, Numerator_DSDR_D, Denominator_DSDR_D, Quotient_DSDR, TheHMC.Resources.GetParallelRNG(), 2, "DSDR", tune_rhmc_DSDR_action_or_md);
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  //Run the HMC
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.Run();
  std::cout << GridLogMessage << " Done" << std::endl;
  Grid_finalize();
  return 0;
 } // main
--- a/HMC/Mobius2p1fIDSDRGparityEOFA_40ID.cc
+++ b/HMC/Mobius2p1fIDSDRGparityEOFA_40ID.cc
@@ -0,0 +1,918 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./HMC/Mobius2p1fIDSDRGparityEOFA_40ID.cc
 Copyright (C) 2015-2016
 Author: Christopher Kelly <ckelly@bnl.gov>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 //Production binary for the 40ID G-parity ensemble
 //Serializable user inputs for one rational-quotient term: approximation bounds, the degree
 //and tolerance used for the action and for the MD, and two frequency settings.
 struct RatQuoParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters,
                                  double, bnd_lo,
                                  double, bnd_hi,
                                  Integer, action_degree,
                                  double, action_tolerance,
                                  Integer, md_degree,
                                  double, md_tolerance,
                                  Integer, reliable_update_freq,
                                  Integer, bnd_check_freq);
  //Fill every member with its default value
  RatQuoParameters() {
    //approximation range
    bnd_lo = 1e-2;
    bnd_hi = 30;
    //action evaluation
    action_degree = 10;
    action_tolerance = 1e-10;
    //molecular dynamics
    md_degree = 10;
    md_tolerance = 1e-8;
    //frequencies
    reliable_update_freq = 50;
    bnd_check_freq = 20;
  }
  //Copy the bounds, degrees, tolerances and bounds-check frequency into 'into'.
  //reliable_update_freq has no counterpart here and is not copied.
  void Export(RationalActionParams &into) const{
    into.lo = bnd_lo;
    into.hi = bnd_hi;
    into.BoundsCheckFreq = bnd_check_freq;
    into.action_degree = action_degree;
    into.md_degree = md_degree;
    into.action_tolerance = action_tolerance;
    into.md_tolerance = md_tolerance;
  }
 };
 //Serializable user inputs for an EOFA term: the rational approximation parameters together
 //with the outer tolerances and the mixed-CG inner tolerances for the action and the MD.
 struct EOFAparameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(EOFAparameters,
                                  OneFlavourRationalParams, rat_params,
                                  double, action_tolerance,
                                  double, action_mixcg_inner_tolerance,
                                  double, md_tolerance,
                                  double, md_mixcg_inner_tolerance);
  //Fill every member with its default value
  EOFAparameters() {
    //rational approximation
    rat_params.lo = 1.0;
    rat_params.hi = 25.0;
    rat_params.degree   = 14;
    rat_params.precision= 50;
    rat_params.tolerance= 1.0e-9;
    rat_params.MaxIter  = 50000;
    //action solver
    action_tolerance = 1e-10;
    action_mixcg_inner_tolerance = 1e-8;
    //MD solver
    md_tolerance = 1e-8;
    md_mixcg_inner_tolerance = 1e-8;
  }
 };
 //Top-level serializable user inputs for the evolution: trajectory bookkeeping, integrator
 //step count and length, start type, G-parity directions and the per-action parameter sets.
 struct EvolParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters,
                                  Integer, StartTrajectory,
                                  Integer, Trajectories,
                                  Integer, SaveInterval,
                                  Integer, Steps,
                                  RealD, TrajectoryLength,
                                  bool, MetropolisTest,
                                  std::string, StartingType,
                                  std::vector<Integer>, GparityDirs,
                                  std::vector<EOFAparameters>, eofa_l,
                                  RatQuoParameters, rat_quo_s,
                                  RatQuoParameters, rat_quo_DSDR);
  //Defaults are chosen for initial thermalization; afterwards the user should switch
  //Metropolis on and use StartingType=CheckpointStart
  EvolParameters() {
    StartTrajectory   = 0;
    Trajectories      = 50;
    SaveInterval      = 5;
    Steps             = 5;
    TrajectoryLength  = 1.0;
    MetropolisTest    = false;
    StartingType      = "ColdStart";
    //three entries, all defaulting to 1: 1 for G-parity, 0 for periodic
    GparityDirs.resize(3, 1);
  }
 };
 //Report whether the file named fn can be opened for reading.
 //The probe stream is a temporary, so it is closed again before the function returns.
 bool fileExists(const std::string &fn){
  return std::ifstream(fn).good();
 }
 //Serializable user inputs for the Lanczos eigenvalue computation: the values alpha, beta, mu
 //and ord fed to the Chebyshev setup, the n_stop/n_want/n_use counts and the tolerance.
 struct LanczosParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
                                  double, alpha,
                                  double, beta,
                                  double, mu,
                                  int, ord,
                                  int, n_stop,
                                  int, n_want,
                                  int, n_use,
                                  double, tolerance);
  //Fill every member with its default value
  LanczosParameters() {
    //Chebyshev inputs
    ord = 100;
    alpha = 35;
    beta = 5;
    mu = 0;
    //vector counts
    n_use = 15;
    n_want = 10;
    n_stop = 10;
    //convergence
    tolerance = 1e-6;
  }
 };
 //Run an implicitly restarted Lanczos on the SchurDiagMooee operator built from 'action'
 //and print the resulting eigenvalues.
 //  param_file : XML file holding a "LanczosParameters" node. If it does not exist, rank 0
 //               writes a template to param_file + ".templ" and the run continues with the
 //               default parameters.
 //  Grid/rbGrid: full and red-black grids on which the random start vector is created
 //  latt       : gauge field imported into 'action' (expected to be initialized already)
 //  action     : fermion operator; its gauge field is overwritten via ImportGauge
 //  rng        : parallel RNG used to draw the Gaussian start vector
 template<typename FermionActionD, typename FermionFieldD>
 void computeEigenvalues(std::string param_file,
                         GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
                         FermionActionD &action, GridParallelRNG &rng){
  LanczosParameters params;
  if(fileExists(param_file)){
    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
    Grid::XmlReader rd(param_file);
    read(rd, "LanczosParameters", params);
  }else if(!GlobalSharedMemory::WorldRank){
    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
    Grid::XmlWriter wr(param_file + ".templ");
    write(wr, "LanczosParameters", params);
  }
  //Gaussian start vector restricted to the odd checkerboard
  FermionFieldD gauss_o(rbGrid);
  FermionFieldD gauss(Grid);
  gaussian(rng, gauss);
  pickCheckerboard(Odd, gauss_o, gauss);
  action.ImportGauge(latt);
  SchurDiagMooeeOperator<FermionActionD, FermionFieldD> hermop(action);
  PlainHermOp<FermionFieldD> hermop_wrap(hermop);
  //ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
  //The plain Chebyshev used below takes no mu, so only mu == 0 is accepted
  assert(params.mu == 0.0);
  Chebyshev<FermionFieldD> Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1);
  FunctionHermOp<FermionFieldD> Cheb_wrap(Cheb, hermop);
  std::cout << "IRL: alpha=" << params.alpha << " beta=" << params.beta << " mu=" << params.mu << " ord=" << params.ord << std::endl;
  ImplicitlyRestartedLanczos<FermionFieldD> IRL(Cheb_wrap, hermop_wrap, params.n_stop, params.n_want, params.n_use, params.tolerance, 50000);
  std::vector<RealD> eval(params.n_use);
  std::vector<FermionFieldD> evec(params.n_use, rbGrid);
  int Nconv;
  IRL.calc(eval, evec, gauss_o, Nconv);
  //eval was allocated with n_use entries and has been handed to calc, while n_want is an
  //independent user input: never index past the number of entries actually held
  const int n_avail = static_cast<int>(eval.size());
  if(n_avail < params.n_want){
    std::cout << GridLogMessage << "Warning: only " << n_avail << " of the " << params.n_want << " requested eigenvalues are available" << std::endl;
  }
  std::cout << "Eigenvalues:" << std::endl;
  for(int i=0; i<params.n_want && i<n_avail; i++){
    std::cout << i << " " << eval[i] << std::endl;
  }
 }
 //Check the quality of the RHMC approx
 //First the power method is run on the numerator and the denominator operators, then
 //InversePowerBoundsCheck is run for powers -1/inv_pow and -1/(2*inv_pow) on both operators.
 //action_or_md toggles checking the action (0), MD (1) or both (2) setups
 //The gauge field latt is imported into both numOp and denOp (expected to be initialized already).
 template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
 void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
                FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
                int inv_pow, const std::string &quark_descr, int action_or_md){
  assert(action_or_md == 0 || action_or_md == 1 || action_or_md == 2);
  const bool check_action = (action_or_md == 0 || action_or_md == 2);
  const bool check_md     = (action_or_md == 1 || action_or_md == 2);
  const int half_pow = 2*inv_pow;
  //Gaussian source restricted to the odd checkerboard
  FermionFieldD src_o(rbGrid);
  FermionFieldD src_full(Grid);
  gaussian(rng, src_full);
  pickCheckerboard(Odd, src_o, src_full);
  numOp.ImportGauge(latt);
  denOp.ImportGauge(latt);
  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
  SchurDifferentiableOperator<FermionImplPolicyD> MdagM(numOp);
  SchurDifferentiableOperator<FermionImplPolicyD> VdagV(denOp);
  //Largest eigenvalue estimates from the power method
  PowerMethod<FermionFieldD> power_method;
  const std::string bound_hdr = "Starting: Get RHMC high bound approx for ";
  std::cout << bound_hdr << quark_descr << " numerator" << std::endl;
  RealD lambda_max_num = power_method(MdagM,src_o);
  std::cout << GridLogMessage << "Got lambda_max "<<lambda_max_num<<std::endl;
  std::cout << bound_hdr << quark_descr << " denominator" << std::endl;
  RealD lambda_max_den = power_method(VdagV,src_o);
  std::cout << GridLogMessage << "Got lambda_max "<<lambda_max_den<<std::endl;
  //Shared pieces of the progress messages
  const std::string num_label = quark_descr + " quark numerator and power -1/";
  const std::string den_label = quark_descr + " quark denominator and power -1/";
  if(check_action){
    const std::string what = "Checking quality of RHMC action approx for ";
    //use large tolerance to prevent exit on fail; we are trying to tune here!
    std::cout << "Starting: " << what << num_label << inv_pow << std::endl;
    InversePowerBoundsCheck(inv_pow, 50000, 1e16, MdagM,src_o, rhmc.ApproxNegPowerAction);
    std::cout << "Finished: " << what << num_label << inv_pow << std::endl;
    std::cout << "Starting: " << what << num_label << half_pow << std::endl;
    InversePowerBoundsCheck(half_pow, 50000, 1e16, MdagM,src_o, rhmc.ApproxNegHalfPowerAction);
    std::cout << "Finished: " << what << num_label << half_pow << std::endl;
    std::cout << "Starting: " << what << den_label << inv_pow << std::endl;
    InversePowerBoundsCheck(inv_pow, 50000, 1e16, VdagV,src_o, rhmc.ApproxNegPowerAction);
    std::cout << "Finished: " << what << den_label << inv_pow << std::endl;
    std::cout << "Starting: " << what << den_label << half_pow << std::endl;
    InversePowerBoundsCheck(half_pow, 50000, 1e16, VdagV,src_o, rhmc.ApproxNegHalfPowerAction);
    std::cout << "Finished: " << what << den_label << half_pow << std::endl;
  }
  std::cout << "-------------------------------------------------------------------------------" << std::endl;
  if(check_md){
    const std::string what = "Checking quality of RHMC MD approx for ";
    std::cout << "Starting: " << what << num_label << inv_pow << std::endl;
    InversePowerBoundsCheck(inv_pow, 50000, 1e16, MdagM,src_o, rhmc.ApproxNegPowerMD);
    std::cout << "Finished: " << what << num_label << inv_pow << std::endl;
    std::cout << "Starting: " << what << num_label << half_pow << std::endl;
    InversePowerBoundsCheck(half_pow, 50000, 1e16, MdagM,src_o, rhmc.ApproxNegHalfPowerMD);
    std::cout << "Finished: " << what << num_label << half_pow << std::endl;
    std::cout << "Starting: " << what << den_label << inv_pow << std::endl;
    InversePowerBoundsCheck(inv_pow, 50000, 1e16, VdagV,src_o, rhmc.ApproxNegPowerMD);
    std::cout << "Finished: " << what << den_label << inv_pow << std::endl;
    std::cout << "Starting: " << what << den_label << half_pow << std::endl;
    InversePowerBoundsCheck(half_pow, 50000, 1e16, VdagV,src_o, rhmc.ApproxNegHalfPowerMD);
    std::cout << "Finished: " << what << den_label << half_pow << std::endl;
  }
 }
 //Exercise the EOFA action on the gauge field latt: draw a Gaussian field eta scaled by
 //sqrt(0.5), then call EOFA.refresh followed by EOFA.S, which run the action's inbuilt check.
 //  FGrid : grid on which eta is created
 //  rng   : parallel RNG used to draw eta
 template<typename FermionImplPolicy>
 void checkEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
                GridCartesian* FGrid, GridParallelRNG &rng, const LatticeGaugeFieldD &latt){
  std::cout << GridLogMessage << "Starting EOFA action/bounds check" << std::endl;
  typename FermionImplPolicy::FermionField eta(FGrid);
  RealD scale = std::sqrt(0.5);
  gaussian(rng,eta); eta = eta * scale;
  //Use the inbuilt check
  EOFA.refresh(latt, eta);
  EOFA.S(latt);
  //Closing message now mirrors the opening one (it previously said "upper action/bounds check")
  std::cout << GridLogMessage << "Finished EOFA action/bounds check" << std::endl;
 }
 //Linear-operator adaptor exposing EOFA.Meofa on a fixed gauge field as HermOp.
 //Only HermOp is implemented; every other entry point asserts.
 template<typename FermionImplPolicy>
 class EOFAlinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &action_;
  LatticeGaugeFieldD &gauge_;
 public:
  typedef typename FermionImplPolicy::FermionField Field;
  EOFAlinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &_EOFA, LatticeGaugeFieldD &_U)
    : action_(_EOFA), gauge_(_U)
  {}
  //The one supported operation: out = Meofa(U) in
  void HermOp(const Field &in, Field &out){ action_.Meofa(gauge_, in, out); }
  //Not implemented
  void Op     (const Field &in, Field &out){ assert(0); }
  void AdjOp  (const Field &in, Field &out){ assert(0); }
  void OpDiag (const Field &in, Field &out){ assert(0); }
  void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
  void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
 };
 //Run the power method on the EOFA operator (via EOFAlinop) starting from a Gaussian
 //vector on FGrid, and log the result as the operator's upper bound.
 template<typename FermionImplPolicy>
 void upperBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
                     GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
  typedef typename FermionImplPolicy::FermionField FermField;
  std::cout << GridLogMessage << "Starting EOFA upper bound compute" << std::endl;
  EOFAlinop<FermionImplPolicy> meofa(EOFA, latt);
  //random start vector
  FermField start(FGrid);
  gaussian(rng,start);
  PowerMethod<FermField> pm;
  auto top_eval = pm(meofa,start);
  std::cout << GridLogMessage << "Upper bound of EOFA operator " << top_eval << std::endl;
 }
 //Linear-operator adaptor exposing EOFA.MeofaInv on a fixed gauge field as HermOp.
 //Applications of M^{-1} cost the same as M for EOFA!
 //Only HermOp is implemented; every other entry point asserts.
 template<typename FermionImplPolicy>
 class EOFAinvLinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &action_;
  LatticeGaugeFieldD &gauge_;
 public:
  typedef typename FermionImplPolicy::FermionField Field;
  EOFAinvLinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &_EOFA, LatticeGaugeFieldD &_U)
    : action_(_EOFA), gauge_(_U)
  {}
  //The one supported operation: out = MeofaInv(U) in
  void HermOp(const Field &in, Field &out){ action_.MeofaInv(gauge_, in, out); }
  //Not implemented
  void Op     (const Field &in, Field &out){ assert(0); }
  void AdjOp  (const Field &in, Field &out){ assert(0); }
  void OpDiag (const Field &in, Field &out){ assert(0); }
  void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
  void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
 };
 //Run the power method on the inverse EOFA operator (via EOFAinvLinop) starting from a
 //Gaussian vector on FGrid, and log the reciprocal of the result as the lower bound.
 template<typename FermionImplPolicy>
 void lowerBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
                     GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
  typedef typename FermionImplPolicy::FermionField FermField;
  std::cout << GridLogMessage << "Starting EOFA lower bound compute using power method on M^{-1}. Inverse of highest eigenvalue is the lowest eigenvalue of M" << std::endl;
  EOFAinvLinop<FermionImplPolicy> meofa_inv(EOFA, latt);
  //random start vector
  FermField start(FGrid);
  gaussian(rng,start);
  PowerMethod<FermField> pm;
  auto top_eval_inv = pm(meofa_inv,start);
  std::cout << GridLogMessage << "Lower bound of EOFA operator " << 1./top_eval_inv << std::endl;
 }
 NAMESPACE_BEGIN(Grid);
  //OperatorFunction that solves with MixedPrecisionConjugateGradient.
  //Each call first copies the double-precision gauge field of FermOpD into FermOpF (and
  //rebuilds its even/odd checkerboards), then runs the mixed-precision CG with LinOpF/LinOpD.
  //The operator passed to operator() must wrap the same matrix object as LinOpD.
  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
  class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
  public:
    typedef typename FermionOperatorD::FermionField FieldD;
    typedef typename FermionOperatorF::FermionField FieldF;
    using OperatorFunction<FieldD>::operator();
    RealD   Tolerance;
    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
    Integer MaxOuterIterations;
    GridBase* SinglePrecGrid4; //Grid for single-precision fields (stored; not used by operator() below)
    GridBase* SinglePrecGrid5; //Grid for single-precision fields
    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
    FermionOperatorF &FermOpF;
    FermionOperatorD &FermOpD;
    SchurOperatorF &LinOpF;
    SchurOperatorD &LinOpD;
    //The counters below start at zero; operator() in this class does not update them
    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
    //tol sets both Tolerance and the initial InnerTolerance.
    //Members are initialised in declaration order; the list below follows that order.
    MixedPrecisionConjugateGradientOperatorFunction(RealD tol, 
                                                    Integer maxinnerit, 
                                                    Integer maxouterit, 
                                                    GridBase* _sp_grid4, 
                                                    GridBase* _sp_grid5, 
                                                    FermionOperatorF &_FermOpF,
                                                    FermionOperatorD &_FermOpD,
                                                    SchurOperatorF   &_LinOpF,
                                                    SchurOperatorD   &_LinOpD): 
      Tolerance(tol), 
      InnerTolerance(tol), 
      MaxInnerIterations(maxinnerit), 
      MaxOuterIterations(maxouterit), 
      SinglePrecGrid4(_sp_grid4),
      SinglePrecGrid5(_sp_grid5),
      OuterLoopNormMult(100.),
      FermOpF(_FermOpF),
      FermOpD(_FermOpD),
      LinOpF(_LinOpF),
      LinOpD(_LinOpD),
      TotalInnerIterations(0),
      TotalOuterIterations(0),
      TotalFinalStepIterations(0)
    { 
    }
    //Solve for psi given src. LinOpU is only used to check that it wraps the same matrix
    //as LinOpD; the solve itself uses the stored LinOpF and LinOpD.
    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
      //Bring the single-precision gauge field and its checkerboards up to date
      precisionChange(FermOpF.Umu, FermOpD.Umu);
      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
      ////////////////////////////////////////////////////////////////////////////////////
      // Make a mixed precision conjugate gradient
      ////////////////////////////////////////////////////////////////////////////////////
      MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
      MPCG.InnerTolerance = InnerTolerance;
      std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
      MPCG(src,psi);
    }
  };
  //OperatorFunction that solves with ConjugateGradientReliableUpdate.
  //Each call first copies the double-precision gauge field of FermOpD into FermOpF (and
  //rebuilds its even/odd checkerboards), then runs the reliable-update CG with LinOpF/LinOpD.
  //The operator passed to operator() must wrap the same matrix object as LinOpD.
  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
  class MixedPrecisionReliableUpdateConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
  public:
    typedef typename FermionOperatorD::FermionField FieldD;
    typedef typename FermionOperatorF::FermionField FieldF;
    using OperatorFunction<FieldD>::operator();
    RealD Tolerance;
    Integer MaxIterations;
    RealD Delta; //reliable update parameter
    GridBase* SinglePrecGrid4; //Grid for single-precision fields
    GridBase* SinglePrecGrid5; //Grid for single-precision fields
    FermionOperatorF &FermOpF;
    FermionOperatorD &FermOpD;
    SchurOperatorF &LinOpF;
    SchurOperatorD &LinOpD;
    //Store the solver settings, grids and operator references.
    //The initialiser list is written in member declaration order.
    MixedPrecisionReliableUpdateConjugateGradientOperatorFunction(RealD tol, 
                                                                  RealD delta,
                                                                  Integer maxit, 
                                                                  GridBase* _sp_grid4, 
                                                                  GridBase* _sp_grid5, 
                                                                  FermionOperatorF &_FermOpF,
                                                                  FermionOperatorD &_FermOpD,
                                                                  SchurOperatorF   &_LinOpF,
                                                                  SchurOperatorD   &_LinOpD): 
      Tolerance(tol), 
      MaxIterations(maxit), 
      Delta(delta),
      SinglePrecGrid4(_sp_grid4),
      SinglePrecGrid5(_sp_grid5),
      FermOpF(_FermOpF),
      FermOpD(_FermOpD),
      LinOpF(_LinOpF),
      LinOpD(_LinOpD)
    { 
    }
    //Solve for psi given src. LinOpU is only used to check that it wraps the same matrix
    //as LinOpD; the solve itself uses the stored LinOpF and LinOpD.
    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
      std::cout << GridLogMessage << " Mixed precision reliable CG update wrapper operator() "<<std::endl;
      SchurOperatorD * passed_op = static_cast<SchurOperatorD *>(&LinOpU);
      assert(&(passed_op->_Mat)==&(LinOpD._Mat));
      //Bring the single-precision gauge field and its checkerboards up to date
      precisionChange(FermOpF.Umu, FermOpD.Umu);
      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
      ////////////////////////////////////////////////////////////////////////////////////
      // Make a reliable update conjugate gradient
      ////////////////////////////////////////////////////////////////////////////////////
      ConjugateGradientReliableUpdate<FieldD,FieldF> solver(Tolerance,MaxIterations,Delta,SinglePrecGrid5,LinOpF,LinOpD);
      std::cout << GridLogMessage << "Calling mixed precision reliable update Conjugate Gradient" <<std::endl;
      solver(src,psi);
    }
  };
 NAMESPACE_END(Grid);
 int main(int argc, char **argv) {
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
  std::string param_file = "params.xml";
  bool file_load_check = false;
  std::string serial_seeds = "1 2 3 4 5";
  std::string parallel_seeds = "6 7 8 9 10";
  int i=1;
  while(i < argc){
    std::string sarg(argv[i]);
    if(sarg == "--param_file"){
      assert(i!=argc-1);
      param_file = argv[i+1];
      i+=2;
    }else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
      file_load_check = true;
      i++;
    }else if(sarg == "--set_seeds"){ //set the rng seeds. Expects two vector args, e.g.  --set_seeds 1.2.3.4 5.6.7.8
      assert(i < argc-2);
      std::vector<int> tmp;
      GridCmdOptionIntVector(argv[i+1],tmp);
      {
 	std::stringstream ss;
 	for(int j=0;j<tmp.size()-1;j++) ss << tmp[j] << " ";
 	ss << tmp.back();
 	serial_seeds = ss.str();
      }
      GridCmdOptionIntVector(argv[i+2],tmp);
      {
 	std::stringstream ss;
 	for(int j=0;j<tmp.size()-1;j++) ss << tmp[j] << " ";
 	ss << tmp.back();
 	parallel_seeds = ss.str();
      }
      i+=3;
      std::cout << GridLogMessage << "Set serial seeds to " << serial_seeds << std::endl;
      std::cout << GridLogMessage << "Set parallel seeds to " << parallel_seeds << std::endl;
    }else{
      i++;
    }
  }
  //Read the user parameters
  EvolParameters user_params;
  if(fileExists(param_file)){
    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
    Grid::XmlReader rd(param_file);
    read(rd, "Params", user_params);
  }else if(!GlobalSharedMemory::WorldRank){
    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
    {
      Grid::XmlWriter wr(param_file + ".templ");
      write(wr, "Params", user_params);
    }
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  //Check the parameters
  if(user_params.GparityDirs.size() != Nd-1){
    std::cerr << "Error in input parameters: expect GparityDirs to have size = " << Nd-1 << std::endl;
    exit(1);
  }
  for(int i=0;i<Nd-1;i++)
    if(user_params.GparityDirs[i] != 0 && user_params.GparityDirs[i] != 1){
      std::cerr << "Error in input parameters: expect GparityDirs values to be 0 (periodic) or 1 (G-parity)" << std::endl;
      exit(1);
    }
  typedef GparityMobiusEOFAFermionD EOFAactionD;
  typedef GparityMobiusFermionD FermionActionD;
  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
  typedef typename FermionActionD::FermionField FermionFieldD;
  typedef GparityMobiusEOFAFermionF EOFAactionF;
  typedef GparityMobiusFermionF FermionActionF;
  typedef typename FermionActionF::Impl_t FermionImplPolicyF;
  typedef typename FermionActionF::FermionField FermionFieldF;
  typedef GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD,FermionImplPolicyF> MixedPrecRHMC;
  typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicyD> DoublePrecRHMC;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
  typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
  MD.name    = std::string("MinimumNorm2");
  // typedef ConjugateHMCRunnerD<ForceGradient> HMCWrapper;
  // MD.name    = std::string("ForceGradient");
  MD.MDsteps = user_params.Steps;
  MD.trajL   = user_params.TrajectoryLength;
  typedef HMCWrapper::ImplPolicy GaugeImplPolicy;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = user_params.StartTrajectory;
  HMCparams.Trajectories     = user_params.Trajectories;
  HMCparams.NoMetropolisUntil= 0;
  HMCparams.StartingType     = user_params.StartingType;
  HMCparams.MetropolisTest = user_params.MetropolisTest;
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_lat";
  CPparams.rng_prefix    = "ckpoint_rng";
  CPparams.saveInterval  = user_params.SaveInterval;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  //Note that checkpointing saves the RNG state so that this initialization is required only for the very first configuration
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = serial_seeds;
  RNGpar.parallel_seeds = parallel_seeds;
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  typedef PlaquetteMod<GaugeImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  //aiming for ainv=1.723 GeV
  //                                  me         bob
  //Estimated  a(ml+mres) [40ID] = 0.001305    0.00131
  //           a(mh+mres) [40ID] = 0.035910    0.03529
  //Estimate Ls=12, b+c=2  mres~0.0011
  //1/24/2022 initial mres measurement gives mres=0.001,  adjusted light quark mass to 0.0003 from 0.0001
  const int Ls      = 12;
  Real beta         = 1.848;
  Real light_mass   = 0.0003;
  Real strange_mass = 0.0342;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  RealD mobius_scale = 2.; //b+c
  RealD mob_bmc = 1.0;
  RealD mob_b = (mobius_scale + mob_bmc)/2.;
  RealD mob_c = (mobius_scale - mob_bmc)/2.;
  std::cout << GridLogMessage
 	    << "Ensemble parameters:" << std::endl
 	    << "Ls=" << Ls << std::endl
 	    << "beta=" << beta << std::endl
 	    << "light_mass=" << light_mass << std::endl
 	    << "strange_mass=" << strange_mass << std::endl
 	    << "mobius_scale=" << mobius_scale << std::endl;
  //Setup the Grids
  auto UGridD   = TheHMC.Resources.GetCartesian();
  auto UrbGridD = TheHMC.Resources.GetRBCartesian();
  auto FGridD     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);
  auto FrbGridD   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);
  GridCartesian* UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian* UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
  ConjugateIwasakiGaugeActionD GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeFieldD Ud(UGridD);
  LatticeGaugeFieldF Uf(UGridF);
  //Setup the BCs
  FermionActionD::ImplParams Params;
  for(int i=0;i<Nd-1;i++) Params.twists[i] = user_params.GparityDirs[i]; //G-parity directions
  Params.twists[Nd-1] = 1; //APBC in time direction
  std::vector<int> dirs4(Nd);
  for(int i=0;i<Nd-1;i++) dirs4[i] = user_params.GparityDirs[i];
  dirs4[Nd-1] = 0; //periodic gauge BC in time
  GaugeImplPolicy::setDirections(dirs4); //gauge BC
  //Run optional gauge field checksum checker and exit
  if(file_load_check){
    TheHMC.initializeGaugeFieldAndRNGs(Ud);
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1); //light quark + strange quark
  ActionLevel<HMCWrapper::Field> Level2(4); //DSDR
  ActionLevel<HMCWrapper::Field> Level3(2); //gauge
  /////////////////////////////////////////////////////////////
  // Light EOFA action
  // have to be careful with the parameters, cf. Test_dwf_gpforce_eofa.cc
  /////////////////////////////////////////////////////////////
  typedef SchurDiagMooeeOperator<EOFAactionD,FermionFieldD> EOFAschuropD;
  typedef SchurDiagMooeeOperator<EOFAactionF,FermionFieldF> EOFAschuropF;
  typedef ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction<FermionImplPolicyD, FermionImplPolicyF> EOFAmixPrecPFaction;
  typedef MixedPrecisionConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_mxCG;
  typedef MixedPrecisionReliableUpdateConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_relupCG;
  std::vector<RealD> eofa_light_masses = { light_mass ,  0.004,   0.016,   0.064,   0.256    };
  std::vector<RealD> eofa_pv_masses =    { 0.004       , 0.016,   0.064,   0.256,   1.0      };
  int n_light_hsb = 5;
  assert(user_params.eofa_l.size() == n_light_hsb);
  EOFAmixPrecPFaction* EOFA_pfactions[n_light_hsb];
  for(int i=0;i<n_light_hsb;i++){
    RealD iml = eofa_light_masses[i];
    RealD ipv = eofa_pv_masses[i];
    EOFAactionD* LopD = new EOFAactionD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, iml, iml, ipv, 0.0, -1, M5, mob_b, mob_c, Params);
    EOFAactionF* LopF = new EOFAactionF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, iml, iml, ipv, 0.0, -1, M5, mob_b, mob_c, Params);
    EOFAactionD* RopD = new EOFAactionD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, ipv, iml, ipv, -1.0, 1, M5, mob_b, mob_c, Params);
    EOFAactionF* RopF = new EOFAactionF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, ipv, iml, ipv, -1.0, 1, M5, mob_b, mob_c, Params);
    EOFAschuropD* linopL_D = new EOFAschuropD(*LopD);
    EOFAschuropD* linopR_D = new EOFAschuropD(*RopD);
    EOFAschuropF* linopL_F = new EOFAschuropF(*LopF);
    EOFAschuropF* linopR_F = new EOFAschuropF(*RopF);
 #if 1
    //Note reusing user_params.eofa_l.action(|md)_mixcg_inner_tolerance  as Delta for now
    EOFA_relupCG* ActionMCG_L = new EOFA_relupCG(user_params.eofa_l[i].action_tolerance, user_params.eofa_l[i].action_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
    EOFA_relupCG* ActionMCG_R = new EOFA_relupCG(user_params.eofa_l[i].action_tolerance, user_params.eofa_l[i].action_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
    EOFA_relupCG* DerivMCG_L = new EOFA_relupCG(user_params.eofa_l[i].md_tolerance, user_params.eofa_l[i].md_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
    EOFA_relupCG* DerivMCG_R = new EOFA_relupCG(user_params.eofa_l[i].md_tolerance, user_params.eofa_l[i].md_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
 #else
    EOFA_mxCG* ActionMCG_L = new EOFA_mxCG(user_params.eofa_l[i].action_tolerance, 50000, 1000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
    ActionMCG_L->InnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance;
    EOFA_mxCG* ActionMCG_R = new EOFA_mxCG(user_params.eofa_l[i].action_tolerance, 50000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
    ActionMCG_R->InnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance;
    EOFA_mxCG* DerivMCG_L = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 50000, 1000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
    DerivMCG_L->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance;
    EOFA_mxCG* DerivMCG_R = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 50000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
    DerivMCG_R->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance;
    std::cout << GridLogMessage << "Set EOFA action solver action tolerance outer=" << ActionMCG_L->Tolerance << " inner=" << ActionMCG_L->InnerTolerance << std::endl;
    std::cout << GridLogMessage << "Set EOFA MD solver tolerance outer=" << DerivMCG_L->Tolerance << " inner=" << DerivMCG_L->InnerTolerance << std::endl;
 #endif
    EOFAmixPrecPFaction* EOFA = new EOFAmixPrecPFaction(*LopF, *RopF,
 							*LopD, *RopD, 
 							*ActionMCG_L, *ActionMCG_R, 
 							*ActionMCG_L, *ActionMCG_R, 
 							*DerivMCG_L, *DerivMCG_R, 
 							user_params.eofa_l[i].rat_params, true);
    EOFA_pfactions[i] = EOFA;
    Level1.push_back(EOFA);
  }
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD,strange_mass,M5,mob_b,mob_c,Params);
  FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD, pv_mass,M5,mob_b,mob_c,Params);
  FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF,strange_mass,M5,mob_b,mob_c,Params);
  FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF, pv_mass,M5,mob_b,mob_c,Params);
  RationalActionParams rat_act_params_s;
  rat_act_params_s.inv_pow  = 4; // (M^dag M)^{1/4}
  rat_act_params_s.precision= 60;
  rat_act_params_s.MaxIter  = 50000;
  user_params.rat_quo_s.Export(rat_act_params_s);
  std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_s.BoundsCheckFreq << " trajectories (avg)" << std::endl;
  //MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq); 
  DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s); 
  Level1.push_back(&Quotient_s);  
  ///////////////////////////////////
  // DSDR action
  ///////////////////////////////////
  RealD dsdr_mass=-1.8;   
  //Use same DSDR twists as https://arxiv.org/pdf/1208.4412.pdf
  RealD dsdr_epsilon_f = 0.02; //numerator (in determinant)
  RealD dsdr_epsilon_b = 0.5; 
  GparityWilsonTMFermionD Numerator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_f, Params);
  GparityWilsonTMFermionF Numerator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_f, Params);
  GparityWilsonTMFermionD Denominator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_b, Params);
  GparityWilsonTMFermionF Denominator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_b, Params);
  RationalActionParams rat_act_params_DSDR;
  rat_act_params_DSDR.inv_pow  = 2; // (M^dag M)^{1/2}
  rat_act_params_DSDR.precision= 60;
  rat_act_params_DSDR.MaxIter  = 50000;
  user_params.rat_quo_DSDR.Export(rat_act_params_DSDR);
  std::cout << GridLogMessage << "DSDR quark bounds check every " << rat_act_params_DSDR.BoundsCheckFreq << " trajectories (avg)" << std::endl;
  DoublePrecRHMC Quotient_DSDR(Denominator_DSDR_D, Numerator_DSDR_D, rat_act_params_DSDR);
  Level2.push_back(&Quotient_DSDR);
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level3.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  TheHMC.TheAction.push_back(Level3);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  //Action tuning
  bool 
    tune_rhmc_s=false, eigenrange_s=false, 
    tune_rhmc_DSDR=false, eigenrange_DSDR=false, 
    check_eofa=false, 
    upper_bound_eofa=false, lower_bound_eofa(false);
  std::string lanc_params_s;
  std::string lanc_params_DSDR;
  int tune_rhmc_s_action_or_md;
  int tune_rhmc_DSDR_action_or_md;
  int eofa_which_hsb;
  for(int i=1;i<argc;i++){
    std::string sarg(argv[i]);
    if(sarg == "--tune_rhmc_s"){
      assert(i < argc-1);
      tune_rhmc_s=true;
      tune_rhmc_s_action_or_md = std::stoi(argv[i+1]);
    }
    else if(sarg == "--eigenrange_s"){
      assert(i < argc-1);
      eigenrange_s=true;
      lanc_params_s = argv[i+1];
    }
    else if(sarg == "--tune_rhmc_DSDR"){
      assert(i < argc-1);
      tune_rhmc_DSDR=true;
      tune_rhmc_DSDR_action_or_md = std::stoi(argv[i+1]);
    }
    else if(sarg == "--eigenrange_DSDR"){
      assert(i < argc-1);
      eigenrange_DSDR=true;
      lanc_params_DSDR = argv[i+1];
    }
    else if(sarg == "--check_eofa"){
      assert(i < argc-1);
      check_eofa = true;
      eofa_which_hsb = std::stoi(argv[i+1]); //-1 indicates all hasenbusch
      assert(eofa_which_hsb == -1 || (eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb) );
    }
    else if(sarg == "--upper_bound_eofa"){
      assert(i < argc-1);
      upper_bound_eofa = true;
      eofa_which_hsb = std::stoi(argv[i+1]);
      assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
    }
    else if(sarg == "--lower_bound_eofa"){
      assert(i < argc-1);
      lower_bound_eofa = true;      
      eofa_which_hsb = std::stoi(argv[i+1]);
      assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
    }
  }
  if(tune_rhmc_s || eigenrange_s || tune_rhmc_DSDR || eigenrange_DSDR ||check_eofa || upper_bound_eofa || lower_bound_eofa) {
    std::cout << GridLogMessage << "Running checks" << std::endl;
    TheHMC.initializeGaugeFieldAndRNGs(Ud);
    //std::cout << GridLogMessage << "EOFA action solver action tolerance outer=" << ActionMCG_L.Tolerance << " inner=" << ActionMCG_L.InnerTolerance << std::endl;
    //std::cout << GridLogMessage << "EOFA MD solver tolerance outer=" << DerivMCG_L.Tolerance << " inner=" << DerivMCG_L.InnerTolerance << std::endl;
    if(check_eofa){
      if(eofa_which_hsb >= 0){
 	std::cout << GridLogMessage << "Starting checking EOFA Hasenbusch " << eofa_which_hsb << std::endl;
 	checkEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
 	std::cout << GridLogMessage << "Finished checking EOFA Hasenbusch " << eofa_which_hsb << std::endl;
      }else{
 	for(int i=0;i<n_light_hsb;i++){
 	  std::cout << GridLogMessage << "Starting checking EOFA Hasenbusch " << i << std::endl;
 	  checkEOFA(*EOFA_pfactions[i], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
 	  std::cout << GridLogMessage << "Finished checking EOFA Hasenbusch " << i << std::endl;
 	}
      }
    }	  
    if(upper_bound_eofa) upperBoundEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
    if(lower_bound_eofa) lowerBoundEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
    if(eigenrange_s) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG());
    if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_s)>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange",  tune_rhmc_s_action_or_md);
    if(eigenrange_DSDR) computeEigenvalues<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField>(lanc_params_DSDR, UGridD, UrbGridD, Ud, Numerator_DSDR_D, TheHMC.Resources.GetParallelRNG());
    if(tune_rhmc_DSDR) checkRHMC<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField, decltype(Quotient_DSDR)>(UGridD, UrbGridD, Ud, Numerator_DSDR_D, Denominator_DSDR_D, Quotient_DSDR, TheHMC.Resources.GetParallelRNG(), 2, "DSDR", tune_rhmc_DSDR_action_or_md);
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  //Run the HMC
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.Run();
  std::cout << GridLogMessage << " Done" << std::endl;
  Grid_finalize();
  return 0;
 } // main
--- a/HMC/Mobius2p1fIDSDRGparityEOFA_48ID.cc
+++ b/HMC/Mobius2p1fIDSDRGparityEOFA_48ID.cc
@@ -0,0 +1,873 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./HMC/Mobius2p1fIDSDRGparityEOFA_48ID.cc
 Copyright (C) 2015-2016
 Author: Christopher Kelly <ckelly@bnl.gov>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 //Production binary for the 48ID G-parity ensemble
 // Serializable bundle of user-tunable settings for one rational-quotient pseudofermion term.
 // Export() copies the spectral bounds, degrees, tolerances and bounds-check frequency into a
 // RationalActionParams; reliable_update_freq is stored here but is not copied by Export().
 struct RatQuoParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters,
                                  double,  bnd_lo,
                                  double,  bnd_hi,
                                  Integer, action_degree,
                                  double,  action_tolerance,
                                  Integer, md_degree,
                                  double,  md_tolerance,
                                  Integer, reliable_update_freq,
                                  Integer, bnd_check_freq);

  // Defaults used when a field is absent from the input / when the XML template is written.
  RatQuoParameters() {
    // bounds handed to RationalActionParams::lo / ::hi by Export()
    bnd_lo = 1e-2;
    bnd_hi = 30;

    // settings for the action evaluation
    action_degree    = 10;
    action_tolerance = 1e-10;

    // settings for the molecular-dynamics evolution
    md_degree    = 10;
    md_tolerance = 1e-8;

    // bookkeeping frequencies
    reliable_update_freq = 50;
    bnd_check_freq       = 20;
  }

  // Copy the fields that RationalActionParams understands into 'into'; other fields of 'into' are left untouched.
  void Export(RationalActionParams &into) const{
    into.lo               = bnd_lo;
    into.hi               = bnd_hi;
    into.action_degree    = action_degree;
    into.md_degree        = md_degree;
    into.action_tolerance = action_tolerance;
    into.md_tolerance     = md_tolerance;
    into.BoundsCheckFreq  = bnd_check_freq;
  }
 };
 // Serializable per-Hasenbusch-step settings for an EOFA pseudofermion term:
 // the rational approximation parameters plus outer/inner solver tolerances for the
 // action and MD solves.
 struct EOFAparameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(EOFAparameters,
                                  OneFlavourRationalParams, rat_params,
                                  double, action_tolerance,
                                  double, action_mixcg_inner_tolerance,
                                  double, md_tolerance,
                                  double, md_mixcg_inner_tolerance);

  // Defaults used when a field is absent from the input / when the XML template is written.
  EOFAparameters() {
    // rational approximation
    rat_params.lo        = 1.0;
    rat_params.hi        = 25.0;
    rat_params.degree    = 14;
    rat_params.precision = 50;
    rat_params.tolerance = 1.0e-9;
    rat_params.MaxIter   = 10000;

    // solver tolerances for the action evaluation
    action_tolerance             = 1e-10;
    action_mixcg_inner_tolerance = 1e-8;

    // solver tolerances for the MD force
    md_tolerance             = 1e-8;
    md_mixcg_inner_tolerance = 1e-8;
  }
 };
 // Serializable top-level run parameters: trajectory bookkeeping, integrator step count and
 // length, boundary-condition choice per spatial direction, and the per-action parameter blocks.
 struct EvolParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters,
                                  Integer, StartTrajectory,
                                  Integer, Trajectories,
                                  Integer, SaveInterval,
                                  Integer, Steps,
                                  RealD, TrajectoryLength,
                                  bool, MetropolisTest,
                                  std::string, StartingType,
                                  std::vector<Integer>, GparityDirs,
                                  std::vector<EOFAparameters>, eofa_l,
                                  RatQuoParameters, rat_quo_s,
                                  RatQuoParameters, rat_quo_DSDR);

  // Defaults are chosen for initial thermalization; afterwards the user should switch
  // Metropolis on and use StartingType=CheckpointStart.
  // eofa_l, rat_quo_s and rat_quo_DSDR are left as default-constructed.
  EvolParameters() {
    StartTrajectory  = 0;
    Trajectories     = 50;
    SaveInterval     = 5;
    Steps            = 5;
    TrajectoryLength = 1.0;
    MetropolisTest   = false;
    StartingType     = "ColdStart";
    GparityDirs.resize(3, 1); //1 for G-parity, 0 for periodic
  }
 };
 // Report whether the file named 'fn' can be opened for reading.
 // The probe stream is a temporary, so the file is closed again before returning.
 bool fileExists(const std::string &fn){
  return std::ifstream(fn).good();
 }
 // Serializable settings consumed by computeEigenvalues(): Chebyshev parameters
 // (alpha, beta, mu, ord) and the Lanczos sizes / tolerance (n_stop, n_want, n_use, tolerance).
 struct LanczosParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
                                  double, alpha,
                                  double, beta,
                                  double, mu,
                                  int, ord,
                                  int, n_stop,
                                  int, n_want,
                                  int, n_use,
                                  double, tolerance);

  // Defaults used when a field is absent from the input / when the XML template is written.
  LanczosParameters() {
    // Chebyshev polynomial
    alpha = 35;
    beta  = 5;
    mu    = 0;
    ord   = 100;

    // Lanczos sizes and convergence criterion
    n_stop    = 10;
    n_want    = 10;
    n_use     = 15;
    tolerance = 1e-6;
  }
 };
 // Run an implicitly restarted Lanczos on the Schur (diag-Mooee) operator of 'action' and print
 // the resulting eigenvalues.
 //
 //  param_file : XML file holding a "LanczosParameters" block. If it does not exist, world rank 0
 //               writes a template to param_file + ".templ" and every rank proceeds with the
 //               built-in defaults.
 //  Grid/rbGrid: full and red-black grids on which the random source / eigenvectors live.
 //  latt       : gauge field imported into 'action' (expected to have been initialized already).
 //  action     : fermion operator; its gauge field is overwritten via ImportGauge(latt).
 //  rng        : parallel RNG used to draw the Gaussian starting vector.
 //
 // Only mu == 0 is supported (asserted below).
 template<typename FermionActionD, typename FermionFieldD>
 void computeEigenvalues(std::string param_file,
 			GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
 			FermionActionD &action, GridParallelRNG &rng){
  LanczosParameters params;
  if(fileExists(param_file)){
    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
    Grid::XmlReader rd(param_file);
    read(rd, "LanczosParameters", params);
  }else if(!GlobalSharedMemory::WorldRank){
    //Only world rank 0 writes the template so that ranks do not race on the same file
    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
    Grid::XmlWriter wr(param_file + ".templ");
    write(wr, "LanczosParameters", params);
  }

  //Gaussian starting vector restricted to the odd checkerboard
  FermionFieldD gauss_o(rbGrid);
  FermionFieldD gauss(Grid);
  gaussian(rng, gauss);
  pickCheckerboard(Odd, gauss_o, gauss);

  action.ImportGauge(latt);

  SchurDiagMooeeOperator<FermionActionD, FermionFieldD> hermop(action);
  PlainHermOp<FermionFieldD> hermop_wrap(hermop);
  //ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
  assert(params.mu == 0.0);
  Chebyshev<FermionFieldD> Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1);
  FunctionHermOp<FermionFieldD> Cheb_wrap(Cheb, hermop);

  std::cout << "IRL: alpha=" << params.alpha << " beta=" << params.beta << " mu=" << params.mu << " ord=" << params.ord << std::endl;
  ImplicitlyRestartedLanczos<FermionFieldD> IRL(Cheb_wrap, hermop_wrap, params.n_stop, params.n_want, params.n_use, params.tolerance, 10000);

  std::vector<RealD> eval(params.n_use);
  std::vector<FermionFieldD> evec(params.n_use, rbGrid);
  int Nconv;
  IRL.calc(eval, evec, gauss_o, Nconv);

  //Never index past the end of eval: it was allocated with n_use entries (and may have been
  //resized by calc), whereas n_want is an independent user input.
  const int n_avail = static_cast<int>(eval.size());
  const int n_print = params.n_want < n_avail ? params.n_want : n_avail;
  if(n_print < params.n_want)
    std::cout << GridLogMessage << "Only " << n_print << " of the requested " << params.n_want << " eigenvalues are available" << std::endl;

  std::cout << "Eigenvalues:" << std::endl;
  for(int i=0;i<n_print;i++){
    std::cout << i << " " << eval[i] << std::endl;
  }
 }
 //Check the quality of the RHMC approx
 //action_or_md toggles checking the action (0), MD (1) or both (2) setups
 //
 //For both the numerator (numOp) and denominator (denOp) operators this first runs the power
 //method to report lambda_max, then runs InversePowerBoundsCheck for powers -1/inv_pow and
 //-1/(2*inv_pow) against the rational approximations stored in 'rhmc'.
 //Both operators have their gauge field overwritten with 'latt'.
 template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
 void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
 	       FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
 	       int inv_pow, const std::string &quark_descr, int action_or_md){
  assert(action_or_md == 0 || action_or_md == 1 || action_or_md == 2);
  const bool do_action = (action_or_md == 0 || action_or_md == 2);
  const bool do_md     = (action_or_md == 1 || action_or_md == 2);

  //Gaussian source restricted to the odd checkerboard
  FermionFieldD gauss_o(rbGrid);
  FermionFieldD gauss(Grid);
  gaussian(rng, gauss);
  pickCheckerboard(Odd, gauss_o, gauss);

  numOp.ImportGauge(latt);
  denOp.ImportGauge(latt);

  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
  SchurDifferentiableOperator<FermionImplPolicyD> MdagM(numOp);
  SchurDifferentiableOperator<FermionImplPolicyD> VdagV(denOp);

  //Upper spectral bound estimate for one operator; 'which' is " numerator" or " denominator"
  PowerMethod<FermionFieldD> power_method;
  auto reportHighBound = [&](auto &op, const char* which){
    std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << which << std::endl;
    RealD lambda_max = power_method(op,gauss_o);
    std::cout << GridLogMessage << "Got lambda_max "<<lambda_max<<std::endl;
  };
  reportHighBound(MdagM, " numerator");
  reportHighBound(VdagV, " denominator");

  const char* const num_tail = " quark numerator and power -1/";
  const char* const den_tail = " quark denominator and power -1/";

  //Run the four bounds checks (numerator/denominator x power/half-power) for one approximation set
  auto runStage = [&](const char* start_head, const char* finish_head, auto &approxPow, auto &approxHalfPow){
    auto boundsCheck = [&](auto &op, const char* tail, int pow, auto &approx){
      std::cout << start_head << quark_descr << tail << pow << std::endl;
      InversePowerBoundsCheck(pow, 10000, 1e16, op,gauss_o, approx); //use large tolerance to prevent exit on fail; we are trying to tune here!
      std::cout << finish_head << quark_descr << tail << pow << std::endl;
    };
    boundsCheck(MdagM, num_tail, inv_pow,   approxPow);
    boundsCheck(MdagM, num_tail, 2*inv_pow, approxHalfPow);
    boundsCheck(VdagV, den_tail, inv_pow,   approxPow);
    boundsCheck(VdagV, den_tail, 2*inv_pow, approxHalfPow);
  };

  if(do_action){
    runStage("Starting: Checking quality of RHMC action approx for ",
             "Finished: Checking quality of RHMC action approx for ",
             rhmc.ApproxNegPowerAction, rhmc.ApproxNegHalfPowerAction);
  }

  std::cout << "-------------------------------------------------------------------------------" << std::endl;

  if(do_md){
    runStage("Starting: Checking quality of RHMC MD approx for ",
             "Finished: Checking quality of RHMC MD approx for ",
             rhmc.ApproxNegPowerMD, rhmc.ApproxNegHalfPowerMD);
  }
 }
 // Draw a Gaussian field scaled by sqrt(0.5), then call EOFA.refresh() followed by EOFA.S()
 // on the gauge field 'latt'. Per the original author's note this relies on the action's
 // inbuilt check; nothing is returned, results appear only in the log.
 template<typename FermionImplPolicy>
 void checkEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
 	       GridCartesian* FGrid, GridParallelRNG &rng, const LatticeGaugeFieldD &latt){
  typedef typename FermionImplPolicy::FermionField Field;

  std::cout << GridLogMessage << "Starting EOFA action/bounds check" << std::endl;

  Field eta(FGrid);
  gaussian(rng,eta);
  RealD scale = std::sqrt(0.5);
  eta = eta * scale;

  //Use the inbuilt check
  EOFA.refresh(latt, eta);
  EOFA.S(latt);

  std::cout << GridLogMessage << "Finished EOFA upper action/bounds check" << std::endl;
 }
 // Adaptor presenting an EOFA pseudofermion action as a LinearOperatorBase.
 // Only HermOp is implemented (it forwards to Meofa on the stored gauge field);
 // every other entry point is assert(0).
 template<typename FermionImplPolicy>
 class EOFAlinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
 public:
  typedef typename FermionImplPolicy::FermionField Field;

 private:
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &action_; //not owned
  LatticeGaugeFieldD &gauge_;                                          //not owned

 public:
  EOFAlinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA, LatticeGaugeFieldD &U)
    : action_(EOFA), gauge_(U)
  {}

  //Unsupported entry points
  void OpDiag (const Field &in, Field &out){
    assert(0);
  }
  void OpDir  (const Field &in, Field &out,int dir,int disp){
    assert(0);
  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){
    assert(0);
  }
  void Op     (const Field &in, Field &out){
    assert(0);
  }
  void AdjOp  (const Field &in, Field &out){
    assert(0);
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    assert(0);
  }

  //The one supported operation: out = Meofa(U) in
  void HermOp(const Field &in, Field &out){
    action_.Meofa(gauge_, in, out);
  }
 };
 // Apply the power method to the EOFA operator (via EOFAlinop) starting from a Gaussian
 // random field, and log the resulting estimate as the operator's upper bound.
 template<typename FermionImplPolicy>
 void upperBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
 		    GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
  typedef typename FermionImplPolicy::FermionField Field;

  std::cout << GridLogMessage << "Starting EOFA upper bound compute" << std::endl;

  EOFAlinop<FermionImplPolicy> linop(EOFA, latt);

  Field eta(FGrid);
  gaussian(rng,eta);

  PowerMethod<Field> power_method;
  auto lambda_max = power_method(linop,eta);

  std::cout << GridLogMessage << "Upper bound of EOFA operator " << lambda_max << std::endl;
 }
 //Applications of M^{-1} cost the same as M for EOFA!
 //
 // Adaptor presenting the inverse EOFA operator as a LinearOperatorBase.
 // Only HermOp is implemented (it forwards to MeofaInv on the stored gauge field);
 // every other entry point is assert(0).
 template<typename FermionImplPolicy>
 class EOFAinvLinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
 public:
  typedef typename FermionImplPolicy::FermionField Field;

 private:
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &action_; //not owned
  LatticeGaugeFieldD &gauge_;                                          //not owned

 public:
  EOFAinvLinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA, LatticeGaugeFieldD &U)
    : action_(EOFA), gauge_(U)
  {}

  //Unsupported entry points
  void OpDiag (const Field &in, Field &out){
    assert(0);
  }
  void OpDir  (const Field &in, Field &out,int dir,int disp){
    assert(0);
  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){
    assert(0);
  }
  void Op     (const Field &in, Field &out){
    assert(0);
  }
  void AdjOp  (const Field &in, Field &out){
    assert(0);
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    assert(0);
  }

  //The one supported operation: out = MeofaInv(U) in
  void HermOp(const Field &in, Field &out){
    action_.MeofaInv(gauge_, in, out);
  }
 };
 // Apply the power method to the inverse EOFA operator (via EOFAinvLinop) starting from a
 // Gaussian random field, and log 1/lambda_max as the lower bound of the EOFA operator.
 template<typename FermionImplPolicy>
 void lowerBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
 		    GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
  typedef typename FermionImplPolicy::FermionField Field;

  std::cout << GridLogMessage << "Starting EOFA lower bound compute using power method on M^{-1}. Inverse of highest eigenvalue is the lowest eigenvalue of M" << std::endl;

  EOFAinvLinop<FermionImplPolicy> linop(EOFA, latt);

  Field eta(FGrid);
  gaussian(rng,eta);

  PowerMethod<Field> power_method;
  auto lambda_max = power_method(linop,eta);

  std::cout << GridLogMessage << "Lower bound of EOFA operator " << 1./lambda_max << std::endl;
 }
 NAMESPACE_BEGIN(Grid);
  // OperatorFunction that solves with a mixed-precision conjugate gradient.
  //
  // On every call the single-precision fermion operator's gauge links (Umu, UmuEven, UmuOdd) are
  // refreshed from the double-precision operator, then a MixedPrecisionConjugateGradient is built
  // from the stored single/double Schur operators and applied to the source.
  //
  // The operator passed to operator() must wrap the same matrix as the stored double-precision
  // Schur operator (asserted). All operators and grids are held by reference/pointer and are not owned.
  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
  class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
  public:
    typedef typename FermionOperatorD::FermionField FieldD;
    typedef typename FermionOperatorF::FermionField FieldF;

    using OperatorFunction<FieldD>::operator();

    RealD   Tolerance;
    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
    Integer MaxOuterIterations;
    GridBase* SinglePrecGrid4; //Grid for single-precision fields
    GridBase* SinglePrecGrid5; //Grid for single-precision fields
    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance

    FermionOperatorF &FermOpF;
    FermionOperatorD &FermOpD;
    SchurOperatorF &LinOpF;
    SchurOperatorD &LinOpD;

    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step

    // tol        : outer tolerance; also the initial value of InnerTolerance
    // maxinnerit : maximum inner iterations handed to the mixed-precision CG
    // maxouterit : maximum outer iterations handed to the mixed-precision CG
    // _sp_grid4/_sp_grid5 : single-precision grids (only the 5d grid is used by operator())
    //
    // The initializer list follows the member declaration order, which is the order in which
    // C++ actually initializes members. The iteration counters start at zero so that they never
    // hold indeterminate values.
    MixedPrecisionConjugateGradientOperatorFunction(RealD tol, 
                                                    Integer maxinnerit, 
                                                    Integer maxouterit, 
                                                    GridBase* _sp_grid4, 
                                                    GridBase* _sp_grid5, 
                                                    FermionOperatorF &_FermOpF,
                                                    FermionOperatorD &_FermOpD,
                                                    SchurOperatorF   &_LinOpF,
                                                    SchurOperatorD   &_LinOpD): 
      Tolerance(tol), 
      InnerTolerance(tol), 
      MaxInnerIterations(maxinnerit), 
      MaxOuterIterations(maxouterit), 
      SinglePrecGrid4(_sp_grid4),
      SinglePrecGrid5(_sp_grid5),
      OuterLoopNormMult(100.),
      FermOpF(_FermOpF),
      FermOpD(_FermOpD),
      LinOpF(_LinOpF),
      LinOpD(_LinOpD),
      TotalInnerIterations(0),
      TotalOuterIterations(0),
      TotalFinalStepIterations(0)
    { 
    }

    // Solve for psi given src using the stored operators; LinOpU is only used for the identity check below.
    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {

      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;

      // The caller's operator must be a SchurOperatorD wrapping the very same matrix as LinOpD
      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));

      // Refresh the single-precision gauge links from the double-precision operator
      precisionChange(FermOpF.Umu, FermOpD.Umu);
      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);

      ////////////////////////////////////////////////////////////////////////////////////
      // Make a mixed precision conjugate gradient
      ////////////////////////////////////////////////////////////////////////////////////
      MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
      MPCG.InnerTolerance = InnerTolerance;
      std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
      MPCG(src,psi);
    }
  };
  /**
   * OperatorFunction adaptor around ConjugateGradientReliableUpdate<FieldD,FieldF>.
   *
   * Holds references to a double- and a single-precision fermion operator together with their
   * Schur operators (all must outlive this object). Each call to operator() refreshes the
   * single-precision gauge links from the double-precision ones before running the solver.
   * SinglePrecGrid4 is stored but only SinglePrecGrid5 is passed to the solver.
   */
  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
  class MixedPrecisionReliableUpdateConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
  public:
    using FieldD = typename FermionOperatorD::FermionField;
    using FieldF = typename FermionOperatorF::FermionField;
    using OperatorFunction<FieldD>::operator();

    RealD     Tolerance;       // solver tolerance
    Integer   MaxIterations;   // iteration cap handed to the solver
    RealD     Delta;           // reliable update parameter
    GridBase* SinglePrecGrid4; // Grid for single-precision fields
    GridBase* SinglePrecGrid5; // Grid for single-precision fields
    FermionOperatorF &FermOpF;
    FermionOperatorD &FermOpD;
    SchurOperatorF   &LinOpF;
    SchurOperatorD   &LinOpD;

    // Members are initialised in declaration order.
    MixedPrecisionReliableUpdateConjugateGradientOperatorFunction(RealD tol, 
                                                                  RealD delta,
                                                                  Integer maxit, 
                                                                  GridBase* _sp_grid4, 
                                                                  GridBase* _sp_grid5, 
                                                                  FermionOperatorF &_FermOpF,
                                                                  FermionOperatorD &_FermOpD,
                                                                  SchurOperatorF   &_LinOpF,
                                                                  SchurOperatorD   &_LinOpD)
      : Tolerance(tol)
      , MaxIterations(maxit)
      , Delta(delta)
      , SinglePrecGrid4(_sp_grid4)
      , SinglePrecGrid5(_sp_grid5)
      , FermOpF(_FermOpF)
      , FermOpD(_FermOpD)
      , LinOpF(_LinOpF)
      , LinOpD(_LinOpD)
    {}

    /**
     * Solve for psi given src.
     * LinOpU is cast (unchecked) to SchurOperatorD; an assert verifies that it wraps the same
     * matrix object as the stored LinOpD.
     */
    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
      std::cout << GridLogMessage << " Mixed precision reliable CG update wrapper operator() "<<std::endl;

      SchurOperatorD &callerSchur = static_cast<SchurOperatorD &>(LinOpU);
      assert(&(callerSchur._Mat)==&(LinOpD._Mat));

      // Bring the single-precision gauge links (full, even and odd) up to date.
      precisionChange(FermOpF.Umu, FermOpD.Umu);
      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);

      // Build the reliable-update solver and run it.
      ConjugateGradientReliableUpdate<FieldD,FieldF> solver(Tolerance,MaxIterations,Delta,SinglePrecGrid5,LinOpF,LinOpD);
      std::cout << GridLogMessage << "Calling mixed precision reliable update Conjugate Gradient" <<std::endl;
      solver(src,psi);
    }
  };
 NAMESPACE_END(Grid);
 /**
  * HMC driver: G-parity Mobius domain-wall evolution with
  *   - a chain of 5 light-quark EOFA pseudofermion actions (mixed-precision solvers),
  *   - a strange-quark rational quotient action,
  *   - a DSDR (twisted-mass Wilson) rational quotient action,
  *   - a conjugate Iwasaki gauge action.
  *
  * Options parsed here (each value is the argument following the flag):
  *   --param_file <xml>         parameter file to read (default "params.xml")
  *   --read_check               initialise the gauge field and RNGs, then exit
  *   --tune_rhmc_s <int>        run checkRHMC on the strange action; the int is forwarded to checkRHMC
  *   --eigenrange_s <str>       run computeEigenvalues on the strange numerator; the string is forwarded
  *   --tune_rhmc_DSDR <int>     as --tune_rhmc_s, for the DSDR action
  *   --eigenrange_DSDR <str>    as --eigenrange_s, for the DSDR numerator
  *   --check_eofa <idx>         run checkEOFA on Hasenbusch term idx, or on all of them if idx == -1
  *   --upper_bound_eofa <idx>   run upperBoundEOFA on Hasenbusch term idx
  *   --lower_bound_eofa <idx>   run lowerBoundEOFA on Hasenbusch term idx
  * If any of the tuning/check options is given, the checks run and the program exits without
  * running the HMC.
  *
  * @return 0 on normal completion; exits with status 1 on invalid GparityDirs.
  */
 int main(int argc, char **argv) {
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;

  // First pass over the command line: parameter file location and the read-check switch.
  std::string param_file = "params.xml";
  bool file_load_check = false;
  for(int i=1;i<argc;i++){
    std::string sarg(argv[i]);
    if(sarg == "--param_file"){
      assert(i!=argc-1);
      param_file = argv[i+1];
    }else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
      file_load_check = true;
    }
  }

  //Read the user parameters
  EvolParameters user_params;
  if(fileExists(param_file)){
    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
    Grid::XmlReader rd(param_file);
    read(rd, "Params", user_params);
  }else if(!GlobalSharedMemory::WorldRank){
    // NOTE(review): only the process with WorldRank == 0 enters this branch and exits; any other
    // process falls through and continues with default-constructed user_params. Confirm intended.
    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
    {
      // Scoped so the writer is destroyed before "Done" is printed.
      Grid::XmlWriter wr(param_file + ".templ");
      write(wr, "Params", user_params);
    }
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }

  //Check the parameters
  if(user_params.GparityDirs.size() != Nd-1){
    std::cerr << "Error in input parameters: expect GparityDirs to have size = " << Nd-1 << std::endl;
    exit(1);
  }
  for(int i=0;i<Nd-1;i++)
    if(user_params.GparityDirs[i] != 0 && user_params.GparityDirs[i] != 1){
      std::cerr << "Error in input parameters: expect GparityDirs values to be 0 (periodic) or 1 (G-parity)" << std::endl;
      exit(1);
    }

  // Double (D) and single (F) precision action / field types.
  typedef GparityMobiusEOFAFermionD EOFAactionD;
  typedef GparityMobiusFermionD FermionActionD;
  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
  typedef typename FermionActionD::FermionField FermionFieldD;
  typedef GparityMobiusEOFAFermionF EOFAactionF;
  typedef GparityMobiusFermionF FermionActionF;
  typedef typename FermionActionF::Impl_t FermionImplPolicyF;
  typedef typename FermionActionF::FermionField FermionFieldF;
  typedef GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD,FermionImplPolicyF> MixedPrecRHMC;
  typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicyD> DoublePrecRHMC;

  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  // Integrator and HMC parameters, taken from the user parameter file.
  IntegratorParameters MD;
  typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
  typedef HMCWrapper::ImplPolicy GaugeImplPolicy;
  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps = user_params.Steps;
  MD.trajL   = user_params.TrajectoryLength;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = user_params.StartTrajectory;
  HMCparams.Trajectories     = user_params.Trajectories;
  HMCparams.NoMetropolisUntil= 0;
  HMCparams.StartingType     = user_params.StartingType;
  HMCparams.MetropolisTest = user_params.MetropolisTest;
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);

  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition

  // Checkpointing
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_lat";
  CPparams.rng_prefix    = "ckpoint_rng";
  CPparams.saveInterval  = user_params.SaveInterval;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);

  //Note that checkpointing saves the RNG state so that this initialization is required only for the very first configuration
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);

  typedef PlaquetteMod<GaugeImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();

  //////////////////////////////////////////////
  //aiming for ainv=2.068             me          Bob
  //Estimated  a(ml+mres) [48ID] = 0.001048    0.00104 
  //           a(mh+mres) [48ID] = 0.028847    0.02805
  //Estimate Ls=12, b+c=2  mres~0.0003
  const int Ls      = 12;
  Real beta         = 1.946;
  Real light_mass   = 0.00074;   //0.00104 - mres_approx;
  Real strange_mass = 0.02775;    //0.02805 - mres_approx
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  RealD mobius_scale = 2.; //b+c
  RealD mob_bmc = 1.0;
  RealD mob_b = (mobius_scale + mob_bmc)/2.;
  RealD mob_c = (mobius_scale - mob_bmc)/2.;

  //Setup the Grids
  auto UGridD   = TheHMC.Resources.GetCartesian();
  auto UrbGridD = TheHMC.Resources.GetRBCartesian();
  auto FGridD     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);
  auto FrbGridD   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);
  GridCartesian* UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian* UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);

  ConjugateIwasakiGaugeActionD GaugeAction(beta);

  // temporarily need a gauge field
  LatticeGaugeFieldD Ud(UGridD);
  LatticeGaugeFieldF Uf(UGridF);

  //Setup the BCs
  FermionActionD::ImplParams Params;
  for(int i=0;i<Nd-1;i++) Params.twists[i] = user_params.GparityDirs[i]; //G-parity directions
  Params.twists[Nd-1] = 1; //APBC in time direction
  std::vector<int> dirs4(Nd);
  for(int i=0;i<Nd-1;i++) dirs4[i] = user_params.GparityDirs[i];
  dirs4[Nd-1] = 0; //periodic gauge BC in time
  GaugeImplPolicy::setDirections(dirs4); //gauge BC

  //Run optional gauge field checksum checker and exit
  if(file_load_check){
    TheHMC.initializeGaugeFieldAndRNGs(Ud);
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }

  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1); //light quark + strange quark
  ActionLevel<HMCWrapper::Field> Level2(4); //DSDR
  ActionLevel<HMCWrapper::Field> Level3(2); //gauge

  /////////////////////////////////////////////////////////////
  // Light EOFA action
  // have to be careful with the parameters, cf. Test_dwf_gpforce_eofa.cc
  /////////////////////////////////////////////////////////////
  typedef SchurDiagMooeeOperator<EOFAactionD,FermionFieldD> EOFAschuropD;
  typedef SchurDiagMooeeOperator<EOFAactionF,FermionFieldF> EOFAschuropF;
  typedef ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction<FermionImplPolicyD, FermionImplPolicyF> EOFAmixPrecPFaction;
  typedef MixedPrecisionConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_mxCG;
  typedef MixedPrecisionReliableUpdateConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_relupCG;

  // Hasenbusch mass chain: term i uses (eofa_light_masses[i], eofa_pv_masses[i]); each pv mass
  // is the next term's light mass, ending at 1.0.
  std::vector<RealD> eofa_light_masses = { light_mass ,  0.004,   0.016,   0.064,   0.256    };
  std::vector<RealD> eofa_pv_masses =    { 0.004       , 0.016,   0.064,   0.256,   1.0      };

  // const so that EOFA_pfactions below is an ordinary fixed-size array rather than a
  // variable-length array (a compiler extension in C++).
  const int n_light_hsb = 5;
  assert(static_cast<int>(eofa_light_masses.size()) == n_light_hsb);
  assert(static_cast<int>(eofa_pv_masses.size()) == n_light_hsb);
  assert(static_cast<int>(user_params.eofa_l.size()) == n_light_hsb);

  EOFAmixPrecPFaction* EOFA_pfactions[n_light_hsb];
  for(int i=0;i<n_light_hsb;i++){
    RealD iml = eofa_light_masses[i];
    RealD ipv = eofa_pv_masses[i];

    // The operators, solvers and action created with new in this loop are passed by reference /
    // pointer to the objects below and are never deleted in this function.
    EOFAactionD* LopD = new EOFAactionD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, iml, iml, ipv, 0.0, -1, M5, mob_b, mob_c, Params);
    EOFAactionF* LopF = new EOFAactionF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, iml, iml, ipv, 0.0, -1, M5, mob_b, mob_c, Params);
    EOFAactionD* RopD = new EOFAactionD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, ipv, iml, ipv, -1.0, 1, M5, mob_b, mob_c, Params);
    EOFAactionF* RopF = new EOFAactionF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, ipv, iml, ipv, -1.0, 1, M5, mob_b, mob_c, Params);
    EOFAschuropD* linopL_D = new EOFAschuropD(*LopD);
    EOFAschuropD* linopR_D = new EOFAschuropD(*RopD);
    EOFAschuropF* linopL_F = new EOFAschuropF(*LopF);
    EOFAschuropF* linopR_F = new EOFAschuropF(*RopF);
 #if 1
    //Note reusing user_params.eofa_l.action(|md)_mixcg_inner_tolerance  as Delta for now
    EOFA_relupCG* ActionMCG_L = new EOFA_relupCG(user_params.eofa_l[i].action_tolerance, user_params.eofa_l[i].action_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
    EOFA_relupCG* ActionMCG_R = new EOFA_relupCG(user_params.eofa_l[i].action_tolerance, user_params.eofa_l[i].action_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
    EOFA_relupCG* DerivMCG_L = new EOFA_relupCG(user_params.eofa_l[i].md_tolerance, user_params.eofa_l[i].md_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
    EOFA_relupCG* DerivMCG_R = new EOFA_relupCG(user_params.eofa_l[i].md_tolerance, user_params.eofa_l[i].md_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
 #else
    EOFA_mxCG* ActionMCG_L = new EOFA_mxCG(user_params.eofa_l[i].action_tolerance, 10000, 1000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
    ActionMCG_L->InnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance;
    EOFA_mxCG* ActionMCG_R = new EOFA_mxCG(user_params.eofa_l[i].action_tolerance, 10000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
    ActionMCG_R->InnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance;
    EOFA_mxCG* DerivMCG_L = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 10000, 1000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
    DerivMCG_L->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance;
    EOFA_mxCG* DerivMCG_R = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 10000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
    DerivMCG_R->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance;
    std::cout << GridLogMessage << "Set EOFA action solver action tolerance outer=" << ActionMCG_L->Tolerance << " inner=" << ActionMCG_L->InnerTolerance << std::endl;
    std::cout << GridLogMessage << "Set EOFA MD solver tolerance outer=" << DerivMCG_L->Tolerance << " inner=" << DerivMCG_L->InnerTolerance << std::endl;
 #endif
    // The action-tolerance solvers are passed twice (two consecutive L/R solver pairs); the
    // MD-tolerance solvers are passed as the final pair.
    EOFAmixPrecPFaction* EOFA = new EOFAmixPrecPFaction(*LopF, *RopF,
                                                        *LopD, *RopD, 
                                                        *ActionMCG_L, *ActionMCG_R, 
                                                        *ActionMCG_L, *ActionMCG_R, 
                                                        *DerivMCG_L, *DerivMCG_R, 
                                                        user_params.eofa_l[i].rat_params, true);
    EOFA_pfactions[i] = EOFA;
    Level1.push_back(EOFA);
  }

  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD,strange_mass,M5,mob_b,mob_c,Params);
  FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD, pv_mass,M5,mob_b,mob_c,Params);
  // The single-precision operators are only referenced by the commented-out MixedPrecRHMC line below.
  FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF,strange_mass,M5,mob_b,mob_c,Params);
  FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF, pv_mass,M5,mob_b,mob_c,Params);
  RationalActionParams rat_act_params_s;
  rat_act_params_s.inv_pow  = 4; // (M^dag M)^{1/4}
  rat_act_params_s.precision= 60;
  rat_act_params_s.MaxIter  = 10000;
  user_params.rat_quo_s.Export(rat_act_params_s);
  std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_s.BoundsCheckFreq << " trajectories (avg)" << std::endl;
  //MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq); 
  DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s); 
  Level1.push_back(&Quotient_s);  

  ///////////////////////////////////
  // DSDR action
  ///////////////////////////////////
  RealD dsdr_mass=-1.8;   
  //Use same DSDR twists as https://arxiv.org/pdf/1208.4412.pdf
  RealD dsdr_epsilon_f = 0.02; //numerator (in determinant)
  RealD dsdr_epsilon_b = 0.5; 
  GparityWilsonTMFermionD Numerator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_f, Params);
  GparityWilsonTMFermionF Numerator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_f, Params);
  GparityWilsonTMFermionD Denominator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_b, Params);
  GparityWilsonTMFermionF Denominator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_b, Params);
  RationalActionParams rat_act_params_DSDR;
  rat_act_params_DSDR.inv_pow  = 2; // (M^dag M)^{1/2}
  rat_act_params_DSDR.precision= 60;
  rat_act_params_DSDR.MaxIter  = 10000;
  user_params.rat_quo_DSDR.Export(rat_act_params_DSDR);
  std::cout << GridLogMessage << "DSDR quark bounds check every " << rat_act_params_DSDR.BoundsCheckFreq << " trajectories (avg)" << std::endl;
  DoublePrecRHMC Quotient_DSDR(Denominator_DSDR_D, Numerator_DSDR_D, rat_act_params_DSDR);
  Level2.push_back(&Quotient_DSDR);

  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level3.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  TheHMC.TheAction.push_back(Level3);
  std::cout << GridLogMessage << " Action complete "<< std::endl;

  //Action tuning
  // Second pass over the command line: optional tuning / check modes.
  bool 
    tune_rhmc_s=false, eigenrange_s=false, 
    tune_rhmc_DSDR=false, eigenrange_DSDR=false, 
    check_eofa=false, 
    upper_bound_eofa=false, lower_bound_eofa=false;
  std::string lanc_params_s;
  std::string lanc_params_DSDR;
  // Given defined starting values so they are never read uninitialised; each is overwritten by
  // the option that enables its use. eofa_which_hsb is shared by the three EOFA options, so the
  // last one on the command line wins.
  int tune_rhmc_s_action_or_md = 0;
  int tune_rhmc_DSDR_action_or_md = 0;
  int eofa_which_hsb = -1;
  for(int i=1;i<argc;i++){
    std::string sarg(argv[i]);
    if(sarg == "--tune_rhmc_s"){
      assert(i < argc-1);
      tune_rhmc_s=true;
      tune_rhmc_s_action_or_md = std::stoi(argv[i+1]);
    }
    else if(sarg == "--eigenrange_s"){
      assert(i < argc-1);
      eigenrange_s=true;
      lanc_params_s = argv[i+1];
    }
    else if(sarg == "--tune_rhmc_DSDR"){
      assert(i < argc-1);
      tune_rhmc_DSDR=true;
      tune_rhmc_DSDR_action_or_md = std::stoi(argv[i+1]);
    }
    else if(sarg == "--eigenrange_DSDR"){
      assert(i < argc-1);
      eigenrange_DSDR=true;
      lanc_params_DSDR = argv[i+1];
    }
    else if(sarg == "--check_eofa"){
      assert(i < argc-1);
      check_eofa = true;
      eofa_which_hsb = std::stoi(argv[i+1]); //-1 indicates all hasenbusch
      assert(eofa_which_hsb == -1 || (eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb) );
    }
    else if(sarg == "--upper_bound_eofa"){
      assert(i < argc-1);
      upper_bound_eofa = true;
      eofa_which_hsb = std::stoi(argv[i+1]);
      assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
    }
    else if(sarg == "--lower_bound_eofa"){
      assert(i < argc-1);
      lower_bound_eofa = true;      
      eofa_which_hsb = std::stoi(argv[i+1]);
      assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
    }
  }

  // Any tuning/check option: run the requested checks and exit instead of running the HMC.
  if(tune_rhmc_s || eigenrange_s || tune_rhmc_DSDR || eigenrange_DSDR ||check_eofa || upper_bound_eofa || lower_bound_eofa) {
    std::cout << GridLogMessage << "Running checks" << std::endl;
    TheHMC.initializeGaugeFieldAndRNGs(Ud);
    //std::cout << GridLogMessage << "EOFA action solver action tolerance outer=" << ActionMCG_L.Tolerance << " inner=" << ActionMCG_L.InnerTolerance << std::endl;
    //std::cout << GridLogMessage << "EOFA MD solver tolerance outer=" << DerivMCG_L.Tolerance << " inner=" << DerivMCG_L.InnerTolerance << std::endl;
    if(check_eofa){
      if(eofa_which_hsb >= 0){
        std::cout << GridLogMessage << "Starting checking EOFA Hasenbusch " << eofa_which_hsb << std::endl;
        checkEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
        std::cout << GridLogMessage << "Finished checking EOFA Hasenbusch " << eofa_which_hsb << std::endl;
      }else{
        for(int i=0;i<n_light_hsb;i++){
          std::cout << GridLogMessage << "Starting checking EOFA Hasenbusch " << i << std::endl;
          checkEOFA(*EOFA_pfactions[i], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
          std::cout << GridLogMessage << "Finished checking EOFA Hasenbusch " << i << std::endl;
        }
      }
    }
    if(upper_bound_eofa) upperBoundEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
    if(lower_bound_eofa) lowerBoundEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
    if(eigenrange_s) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG());
    if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_s)>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange",  tune_rhmc_s_action_or_md);
    if(eigenrange_DSDR) computeEigenvalues<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField>(lanc_params_DSDR, UGridD, UrbGridD, Ud, Numerator_DSDR_D, TheHMC.Resources.GetParallelRNG());
    if(tune_rhmc_DSDR) checkRHMC<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField, decltype(Quotient_DSDR)>(UGridD, UrbGridD, Ud, Numerator_DSDR_D, Denominator_DSDR_D, Quotient_DSDR, TheHMC.Resources.GetParallelRNG(), 2, "DSDR", tune_rhmc_DSDR_action_or_md);
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }

  //Run the HMC
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.Run();
  std::cout << GridLogMessage << " Done" << std::endl;
  Grid_finalize();
  return 0;
 } // main
--- a/benchmarks/Benchmark_IO.cc
+++ b/benchmarks/Benchmark_IO.cc
@@ -137,7 +137,7 @@ int main (int argc, char ** argv)
  Eigen::MatrixXd mean(nVol, 4), stdDev(nVol, 4), rob(nVol, 4);
  Eigen::VectorXd avMean(4), avStdDev(4), avRob(4);
-  double          n = BENCH_IO_NPASS;
+  //  double          n = BENCH_IO_NPASS;
  stats(mean, stdDev, perf);
  stats(avMean, avStdDev, avPerf);
@@ -164,7 +164,7 @@ int main (int argc, char ** argv)
                mean(volInd(l), gWrite), stdDev(volInd(l), gWrite));
  }
  MSG << std::endl;
-  MSG << "Robustness of individual results, in \%. (rob = 100\% - std dev / mean)" << std::endl;
+  MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)" << std::endl;
  MSG << std::endl;
  grid_printf("%4s %12s %12s %12s %12s\n",
              "L", "std read", "std write", "Grid read", "Grid write");
@@ -185,7 +185,7 @@ int main (int argc, char ** argv)
              avMean(sRead), avStdDev(sRead), avMean(sWrite), avStdDev(sWrite),
              avMean(gRead), avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
  MSG << std::endl;
-  MSG << "Robustness of volume-averaged results, in \%. (rob = 100\% - std dev / mean)" << std::endl;
+  MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)" << std::endl;
  MSG << std::endl;
  grid_printf("%12s %12s %12s %12s\n",
              "std read", "std write", "Grid read", "Grid write");
--- a/benchmarks/Benchmark_ITT.cc
+++ b/benchmarks/Benchmark_ITT.cc
@@ -142,7 +142,7 @@ public:
 	  //	  bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	}
-	int ncomm;
+	//	int ncomm;
 	double dbytes;
        for(int dir=0;dir<8;dir++) {
@@ -290,7 +290,7 @@ public:
      LatticeSU4 z(&Grid); z=Zero();
      LatticeSU4 x(&Grid); x=Zero();
      LatticeSU4 y(&Grid); y=Zero();
-      double a=2.0;
+      //      double a=2.0;
      uint64_t Nloop=NLOOP;
--- a/benchmarks/Benchmark_comms_host_device.cc
+++ b/benchmarks/Benchmark_comms_host_device.cc
@@ -72,7 +72,7 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
  std::vector<double> t_time(Nloop);
-  time_statistics timestat;
+  //  time_statistics timestat;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange from host memory "<<std::endl;
--- a/benchmarks/Benchmark_dwf_fp32.cc
+++ b/benchmarks/Benchmark_dwf_fp32.cc
@@ -126,19 +126,10 @@ int main (int argc, char ** argv)
  // Naive wilson implementation
  ////////////////////////////////////
  // replicate across fifth dimension
-  LatticeGaugeFieldF Umu5d(FGrid);
+  //  LatticeGaugeFieldF Umu5d(FGrid);
-  std::vector<LatticeColourMatrixF> U(4,FGrid);
+  std::vector<LatticeColourMatrixF> U(4,UGrid);
  {
    autoView( Umu5d_v, Umu5d, CpuWrite);
    autoView( Umu_v  , Umu  , CpuRead);
    for(int ss=0;ss<Umu.Grid()->oSites();ss++){
      for(int s=0;s<Ls;s++){
 	Umu5d_v[Ls*ss+s] = Umu_v[ss];
      }
    }
  }
  for(int mu=0;mu<Nd;mu++){
-    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
+    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
  }
  std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
@@ -147,10 +138,28 @@ int main (int argc, char ** argv)
    ref = Zero();
    for(int mu=0;mu<Nd;mu++){
-      tmp = U[mu]*Cshift(src,mu+1,1);
+      tmp = Cshift(src,mu+1,1);
      {
 	autoView( tmp_v  , tmp  , CpuWrite);
 	autoView( U_v  , U[mu]  , CpuRead);
 	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
 	  for(int s=0;s<Ls;s++){
 	    tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
 	  }
 	}
      }
      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
-      tmp =adj(U[mu])*src;
+      {
 	autoView( tmp_v  , tmp  , CpuWrite);
 	autoView( U_v  , U[mu]  , CpuRead);
 	autoView( src_v, src    , CpuRead);
 	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
 	  for(int s=0;s<Ls;s++){
 	    tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
 	  }
 	}
      }
      tmp =Cshift(tmp,mu+1,-1);
      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
    }
@@ -182,7 +191,7 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-  int ncall =3000;
+  int ncall =300;
  if (1) {
    FGrid->Barrier();
@@ -242,16 +251,30 @@ int main (int argc, char ** argv)
    for(int mu=0;mu<Nd;mu++){
      //    ref =  src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
-      tmp = U[mu]*Cshift(src,mu+1,1);
+      tmp = Cshift(src,mu+1,1);
      {
 	autoView( ref_v, ref, CpuWrite);
 	autoView( tmp_v, tmp, CpuRead);
-	for(int i=0;i<ref_v.size();i++){
+	autoView( U_v  , U[mu]  , CpuRead);
-	  ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
+	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
 	  for(int s=0;s<Ls;s++){
 	    int i=s+Ls*ss;
 	    ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); ;
 	  }
 	}
      }
-
+      
-      tmp =adj(U[mu])*src;
+      {
 	autoView( tmp_v  , tmp  , CpuWrite);
 	autoView( U_v  , U[mu]  , CpuRead);
 	autoView( src_v, src    , CpuRead);
 	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
 	  for(int s=0;s<Ls;s++){
 	    tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
 	  }
 	}
      }
      //      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu+1,-1);
      {
 	autoView( ref_v, ref, CpuWrite);
--- a/benchmarks/Benchmark_memory_bandwidth.cc
+++ b/benchmarks/Benchmark_memory_bandwidth.cc
@@ -184,8 +184,10 @@ int main (int argc, char ** argv)
      double bytes=1.0*vol*Nvec*sizeof(Real);
      double flops=vol*Nvec*2;// mul,add
-      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"<<(stop-start)/1000./1000.<< "\t\t " <<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"
-
+	       <<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"
 	       <<(stop-start)/1000./1000.<< "\t\t " <<std::endl;
      assert(nn==nn);
  }    
  Grid_finalize();
--- a/examples/Example_Laplacian_solver.cc
+++ b/examples/Example_Laplacian_solver.cc
@@ -4,7 +4,7 @@ using namespace Grid;
 template<class Field>
 void SimpleConjugateGradient(LinearOperatorBase<Field> &HPDop,const Field &b, Field &x)
 {
-    RealD cp, c, alpha, d, beta, ssq, qq;
+    RealD cp, c, alpha, d, beta, ssq;
    RealD Tolerance=1.0e-10;
    int MaxIterations=10000;
--- a/examples/Example_wall_wall_3pt.cc
+++ b/examples/Example_wall_wall_3pt.cc
@@ -0,0 +1,539 @@
 /*
 * Warning: This code illustrative only: not well tested, and not meant for production use
 * without regression / tests being applied
 */
 #include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
 typedef SpinColourMatrix Propagator;
 typedef SpinColourVector Fermion;
 typedef PeriodicGimplR   GimplR;
 /// Gauge-covariant Laplacian built from the covariant shifts provided by Gimpl.
 /// M() accumulates 2*in - forward - backward over the directions mu < Nd-1 only;
 /// the remaining SparseMatrixBase hooks abort via assert(0).
 template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
 {
 public:
   INHERIT_GIMPL_TYPES(Gimpl);

   GridBase *grid;   // grid of the gauge field handed to the constructor
   GaugeField U;     // private copy of that gauge field

   CovariantLaplacianCshift(GaugeField &_U) : grid(_U.Grid()), U(_U) {}

   virtual GridBase *Grid(void) { return grid; }

   /// out = sum over dir < Nd-1 of ( 2*in - CovShiftForward - CovShiftBackward ).
   virtual void  M    (const Field &in, Field &out)
   {
     out=Zero();
     for(int dir=0;dir<Nd-1;dir++) {
       // NB: inefficient, the link field is peeked out again on every application
       GaugeLinkField link = PeekIndex<LorentzIndex>(U, dir);
       out = out - Gimpl::CovShiftForward(link,dir,in);
       out = out - Gimpl::CovShiftBackward(link,dir,in);
       out = out + 2.0*in;
     }
   }

   /// Laplacian is hermitian, so Mdag simply forwards to M.
   virtual void  Mdag (const Field &in, Field &out) { M(in,out); }

   // Unimplemented: needed only for multigrid
   virtual  void Mdiag    (const Field &in, Field &out)                  { assert(0); }
   virtual  void Mdir     (const Field &in, Field &out,int dir, int disp){ assert(0); }
   virtual  void MdirAll  (const Field &in, std::vector<Field> &out)     { assert(0); }
 };
 /// Fill `phase` with exp( i * sum_mu (2*pi/L_mu) * mom[mu] * x_mu ),
 /// where L_mu are the global dimensions of the grid that `phase` lives on.
 void MakePhase(Coordinate mom,LatticeComplex &phase)
 {
   GridBase *lattice = phase.Grid();
   auto dims = lattice->GlobalDimensions();
   ComplexD imag_unit(0.0,1.0);
   LatticeComplex x_mu(phase.Grid());

   phase=Zero();
   for(int mu=0;mu<Nd;mu++){
     RealD two_pi_over_L =  M_PI * 2.0/ dims[mu];
     LatticeCoordinate(x_mu,mu);   // x_mu now holds the site coordinate along mu
     phase = phase + (two_pi_over_L * mom[mu]) * x_mu;
   }
   // The accumulated angle is exponentiated once, after the direction loop.
   phase = exp(phase*imag_unit);
 }
 /// Apply `nstep` successive Stout smearing steps with parameter `rho` to `Uin`,
 /// leaving the result in `Usmr`.
 ///
 /// @param nstep number of smearing steps; for nstep <= 0 the output is a plain copy of Uin
 /// @param rho   parameter handed to the Smear_Stout constructor
 /// @param Uin   input gauge field (not modified)
 /// @param Usmr  output gauge field
 void LinkSmear(int nstep, RealD rho,LatticeGaugeField &Uin,LatticeGaugeField &Usmr)
 {
   Smear_Stout<GimplR> Stout(rho);
   LatticeGaugeField Utmp(Uin.Grid());
   Utmp = Uin;
   // Seed the output so that nstep <= 0 yields the unsmeared field instead of
   // leaving Usmr untouched (previously it was only written inside the loop).
   Usmr = Utmp;
   for(int i=0;i<nstep;i++){
     Stout.smear(Usmr,Utmp);
     Utmp = Usmr;   // the next step smears the result of this one
   }
 }
 /// Zero `source`, then poke a SpinColourMatrix set to 1.0 into the single site `coor`.
 void PointSource(Coordinate &coor,LatticePropagator &source)
 {
   SpinColourMatrix unit_matrix;
   unit_matrix=1.0;

   source=Zero();
   pokeSite(unit_matrix,source,coor);
 }
 void GFWallSource(int tslice,LatticePropagator &source)
 {
  GridBase *grid = source.Grid();
  LatticeComplex one(grid); one = ComplexD(1.0,0.0);
  LatticeComplex zz(grid); zz=Zero();
  LatticeInteger t(grid);
  LatticeCoordinate(t,Tdir);
  one = where(t==Integer(tslice), one, zz);
  source = 1.0;
  source = source * one;
 }
 /// Noise wall source on timeslice `tslice`: bernoulli noise drawn from `RNG` is
 /// mapped through (2*noise - (1,1))/sqrt(2), masked to the timeslice and
 /// multiplied into a propagator set to 1.0. The norm of the result is printed.
 void Z2WallSource(GridParallelRNG &RNG,int tslice,LatticePropagator &source)
 {
   GridBase *grid = source.Grid();
   RealD inv_sqrt2=1.0/sqrt(2);

   LatticeComplex eta(grid);
   bernoulli(RNG, eta); // 0,1 50:50
   eta = (2.*eta - Complex(1,1))*inv_sqrt2;

   // Keep the noise only on the requested timeslice.
   LatticeComplex zero(grid);
   zero=Zero();
   LatticeInteger tcoor(grid);
   LatticeCoordinate(tcoor,Tdir);
   eta = where(tcoor==Integer(tslice), eta, zero);

   source = 1.0;
   source = source*eta;
   std::cout << " Z2 wall " << norm2(source) << std::endl;
 }
 /// Copy `U` into `Ufix` and run the Fourier-accelerated steepest-descent gauge
 /// fixer on it with orthogonal direction Nd-1, printing the average plaquette
 /// before and after.
 void GaugeFix(LatticeGaugeField &U,LatticeGaugeField &Ufix)
 {
   Real step=0.05;
   int orthog_dir=Nd-1;

   Real plaquette=WilsonLoops<GimplR>::avgPlaquette(U);
   std::cout << " Initial plaquette "<<plaquette << std::endl;

   LatticeColourMatrix   gauge_xform(U.Grid());
   Ufix = U;
   // NOTE(review): 100000 and the two 1.0e-14 values look like an iteration cap and
   // stopping tolerances; confirm against the SteepestDescentGaugeFix signature.
   FourierAcceleratedGaugeFixer<GimplR>::SteepestDescentGaugeFix(Ufix,gauge_xform,step,100000,1.0e-14, 1.0e-14,true,orthog_dir);

   plaquette=WilsonLoops<GimplR>::avgPlaquette(Ufix);
   std::cout << " Final plaquette "<<plaquette << std::endl;
 }
 /// Iterative smearing: starting from `unsmeared`, apply
 ///   smeared <- smeared - coeff * Laplacian(smeared)
 /// 40 times with coeff = width^2 / (4 * Iterations), width = 2.0.
 /// The norm of the field is printed after every iteration.
 template<class Field>
 void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
 {
   typedef CovariantLaplacianCshift <GimplR,Field> Laplacian_t;

   Integer Iterations = 40;
   Real width = 2.0;
   Real coeff = (width*width) / Real(4*Iterations);

   Laplacian_t Laplacian(U);
   Field lap(U.Grid());

   smeared=unsmeared;
   //  chi = (1-p^2/2N)^N kronecker
   for(int iter = 0; iter < Iterations; ++iter) {
     Laplacian.M(smeared,lap);
     smeared = smeared - coeff*lap;
     std::cout << " smear iter " << iter<<" " <<norm2(smeared)<<std::endl;
   }
 }
 /// Build a point source at `site` and smear it with GaussianSmear, printing the
 /// norm before and after smearing. The smeared result is left in `source`.
 void GaussianSource(Coordinate &site,LatticeGaugeField &U,LatticePropagator &source)
 {
   PointSource(site,source);
   std::cout << " GaussianSource Kronecker "<< norm2(source)<<std::endl;

   // GaussianSmear reads from one field and writes another, so smear a copy.
   LatticePropagator unsmeared(source.Grid());
   unsmeared = source;
   GaussianSmear(U,unsmeared,source);
   std::cout << " GaussianSource Smeared "<< norm2(source)<<std::endl;
 }
 /// Build a noise wall source on `tslice` with Z2WallSource, then smear it with
 /// GaussianSmear; the smeared field is left in `source`.
 void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,LatticePropagator &source)
 {
   Z2WallSource(RNG,tslice,source);
   auto unsmeared = source;   // copy: the smearing input and output are distinct fields
   GaussianSmear(U,unsmeared,source);
 }
 void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
 {
  assert(mom.size()==Nd);
  assert(mom[Tdir] == 0);
  GridBase * grid = spectator.Grid();
  LatticeInteger ts(grid);
  LatticeCoordinate(ts,Tdir);
  source = Zero();
  source = where(ts==Integer(tslice),spectator,source); // Stick in a slice of the spectator, zero everywhere else
  LatticeComplex phase(grid);
  MakePhase(mom,phase);
  source = source *phase;
 }
 /// Solve for a full propagator, one (spin, colour) source column at a time.
 ///
 /// For every column the 4d source is extracted with PropToFerm, imported to the
 /// 5d grid of the action, solved with a red-black Schur CG (tolerance 1.0e-12,
 /// at most 100000 iterations, zero initial guess), exported back to 4d and
 /// stored into `propagator` with FermToProp.
 ///
 /// @param D          fermion action providing GaugeGrid()/FermionGrid() and the import/export hooks
 /// @param source     4d propagator-valued source
 /// @param propagator 4d solution, filled column by column
 template<class Action>
 void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
 {
   GridBase *UGrid = D.GaugeGrid();
   GridBase *FGrid = D.FermionGrid();
   // Zero-initialise the 4d source column, matching the sibling Solve in
   // Example_wall_wall_spectrum.cc, so PropToFerm never touches uninitialised data.
   LatticeFermion src4  (UGrid); src4 = Zero();
   LatticeFermion src5  (FGrid); 
   LatticeFermion result5(FGrid);
   LatticeFermion result4(UGrid);
   ConjugateGradient<LatticeFermion> CG(1.0e-12,100000);
   SchurRedBlackDiagTwoSolve<LatticeFermion> schur(CG);
   ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
   // NOTE(review): `s` is used as the spin index but the bound is Nd (number of
   // dimensions); presumably this relies on the spin count equalling Nd - confirm.
   for(int s=0;s<Nd;s++){
     for(int c=0;c<Nc;c++){
       PropToFerm<Action>(src4,source,s,c);
       D.ImportPhysicalFermionSource(src4,src5);
       result5=Zero();   // fresh zero start vector for every column
       schur(D,src5,result5,ZG);
       std::cout<<GridLogMessage
  	       <<"spin "<<s<<" color "<<c
  	       <<" norm2(src5d) "   <<norm2(src5)
                <<" norm2(result5d) "<<norm2(result5)<<std::endl;
       D.ExportPhysicalFermionSolution(result5,result4);
       FermToProp<Action>(propagator,result4,s,c);
     }
   }
 }
 // Serializable container for the correlator output: `data` gets one
 // std::vector<Complex> pushed per channel by the trace routines in this file,
 // which then write it out under the tag "MesonFile".
 class MesonFile: Serializable {
 public:
   GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector<std::vector<Complex> >, data);
 };
 void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
 {
  const int nchannel=4;
  Gamma::Algebra Gammas[nchannel][2] = {
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::Gamma5},
    {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::GammaTGamma5},
    {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::Gamma5},
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::GammaTGamma5}
  };
  Gamma G5(Gamma::Algebra::Gamma5);
  LatticeComplex meson_CF(q1.Grid());
  MesonFile MF;
  for(int ch=0;ch<nchannel;ch++){
    Gamma Gsrc(Gammas[ch][0]);
    Gamma Gsnk(Gammas[ch][1]);
    meson_CF = trace(G5*adj(q1)*G5*Gsnk*q2*adj(Gsrc));
    std::vector<TComplex> meson_T;
    sliceSum(meson_CF,meson_T, Tdir);
    int nt=meson_T.size();
    std::vector<Complex> corr(nt);
    for(int t=0;t<nt;t++){
      corr[t] = TensorRemove(meson_T[t]); // Yes this is ugly, not figured a work around
      std::cout << " channel "<<ch<<" t "<<t<<" " <<corr[t]<<std::endl;
    }
    MF.data.push_back(corr);
  }
  {
    XmlWriter WR(file);
    write(WR,"MesonFile",MF);
  }
 }
 void Meson3pt(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
 {
  const int nchannel=4;
  Gamma::Algebra Gammas[nchannel][2] = {
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::GammaX},
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::GammaY},
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::GammaZ},
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::GammaT}
  };
  Gamma G5(Gamma::Algebra::Gamma5);
  LatticeComplex meson_CF(q1.Grid());
  MesonFile MF;
  for(int ch=0;ch<nchannel;ch++){
    Gamma Gsrc(Gammas[ch][0]);
    Gamma Gsnk(Gammas[ch][1]);
    meson_CF = trace(G5*adj(q1)*G5*Gsnk*q2*adj(Gsrc));
    std::vector<TComplex> meson_T;
    sliceSum(meson_CF,meson_T, Tdir);
    int nt=meson_T.size();
    std::vector<Complex> corr(nt);
    for(int t=0;t<nt;t++){
      corr[t] = TensorRemove(meson_T[t]); // Yes this is ugly, not figured a work around
      std::cout << " channel "<<ch<<" t "<<t<<" " <<corr[t]<<std::endl;
    }
    MF.data.push_back(corr);
  }
  {
    XmlWriter WR(file);
    write(WR,"MesonFile",MF);
  }
 }
 void WallSinkMesonTrace(std::string file,std::vector<Propagator> &q1,std::vector<Propagator> &q2)
 {
  const int nchannel=4;
  Gamma::Algebra Gammas[nchannel][2] = {
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::Gamma5},
    {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::GammaTGamma5},
    {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::Gamma5},
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::GammaTGamma5}
  };
  Gamma G5(Gamma::Algebra::Gamma5);
  int nt=q1.size();
  std::vector<Complex> meson_CF(nt);
  MesonFile MF;
  for(int ch=0;ch<nchannel;ch++){
    Gamma Gsrc(Gammas[ch][0]);
    Gamma Gsnk(Gammas[ch][1]);
    std::vector<Complex> corr(nt);
    for(int t=0;t<nt;t++){
      meson_CF[t] = trace(G5*adj(q1[t])*G5*Gsnk*q2[t]*adj(Gsrc));
      corr[t] = TensorRemove(meson_CF[t]); // Yes this is ugly, not figured a work around
      std::cout << " channel "<<ch<<" t "<<t<<" " <<corr[t]<<std::endl;
    }
    MF.data.push_back(corr);
  }
  {
    XmlWriter WR(file);
    write(WR,"MesonFile",MF);
  }
 }
 /// Flat index into the propagator arrays.
 /// Mass index 0 maps momentum p to slot p; a non-zero mass index m requires
 /// p == 0 (asserted) and maps to slot nmom + m - 1.
 int make_idx(int p, int m,int nmom)
 {
   if (m!=0) {
     assert(p==0);
     return nmom + m - 1;
   }
   return p;
 }
 /// Driver: loads (or generates) a gauge configuration, stout-smears it, builds
 /// one Mobius action per quark mass, solves wall-source propagators for every
 /// momentum (mass 0) and every heavier mass (zero momentum), writes point-sink
 /// and wall-sink two-point correlators, then performs sequential solves through
 /// the last ("charm") action and writes the resulting three-point correlators.
 ///
 /// argv[1], when present and not starting with '-', is read as a NERSC
 /// configuration file; otherwise a cold configuration is used.
 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
   // Double precision grids
   auto latt = GridDefaultLatt();
   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), 
  								   GridDefaultSimd(Nd,vComplex::Nsimd()),
  								   GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
   LatticeGaugeField Umu(UGrid);
   LatticeGaugeField Utmp(UGrid);
   LatticeGaugeField Usmr(UGrid);
   std::string config;   // prefix used for every output file name
   if( argc > 1 && argv[1][0] != '-' )
   {
     std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl;
     FieldMetaData header;
     NerscIO::readConfiguration(Umu, header, argv[1]);
     config=argv[1];
   }
   else
   {
     // Message corrected: the field generated here is a cold start, not a hot one.
     std::cout<<GridLogMessage <<"Using cold configuration"<<std::endl;
     SU<Nc>::ColdConfiguration(Umu);
     config="ColdConfig";
   }
   //  GaugeFix(Umu,Utmp);
   //  Umu=Utmp;
   int nsmr=3;
   RealD rho=0.1;
   LinkSmear(nsmr,rho,Umu,Usmr);
   // Per-flavour parameters; entry m of each vector describes quark m.
   std::vector<int>   smeared_link({ 0,0,1} ); 
   std::vector<RealD> masses({ 0.004,0.02477,0.447} ); // u/d, s, c ??
   std::vector<RealD> M5s   ({ 1.8,1.8,1.0} ); 
   std::vector<RealD> bs   ({ 1.0,1.0,1.5} );  // DDM
   std::vector<RealD> cs   ({ 0.0,0.0,0.5} );  // DDM
   std::vector<int>   Ls_s ({ 16,16,12} );
   std::vector<GridCartesian *> FGrids;
   std::vector<GridRedBlackCartesian *> FrbGrids;
   std::vector<Coordinate> momenta;
   momenta.push_back(Coordinate({0,0,0,0}));
   momenta.push_back(Coordinate({1,0,0,0}));
   momenta.push_back(Coordinate({2,0,0,0}));
   int nmass = masses.size();
   int nmom  = momenta.size();
   std::vector<MobiusFermionR *> FermActs;
   std::cout<<GridLogMessage <<"======================"<<std::endl;
   std::cout<<GridLogMessage <<"MobiusFermion action as Scaled Shamir kernel"<<std::endl;
   std::cout<<GridLogMessage <<"======================"<<std::endl;
   std::vector<Complex> boundary = {1,1,1,-1};
   typedef MobiusFermionR FermionAction;
   FermionAction::ImplParams Params(boundary);
   for(int m=0;m<nmass;m++) {
     RealD mass = masses[m];
     RealD M5   = M5s[m];
     RealD b    = bs[m];
     RealD c    = cs[m];
     int   Ls   = Ls_s[m];
     // Flavours flagged in smeared_link see the stout-smeared links, the rest the thin links.
     if ( smeared_link[m] ) Utmp = Usmr;
     else                   Utmp = Umu;
     FGrids.push_back(SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid));
     FrbGrids.push_back(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid));
     FermActs.push_back(new MobiusFermionR(Utmp,*FGrids[m],*FrbGrids[m],*UGrid,*UrbGrid,mass,M5,b,c,Params));
   }
   LatticePropagator z2wall_source(UGrid);
   LatticePropagator gfwall_source(UGrid);
   LatticePropagator phased_prop(UGrid);
   int tslice = 0;
   int tseq=(tslice+16)%latt[Nd-1];   // timeslice of the sequential source, 16 slices after the wall
   //////////////////////////////////////////////////////////////////////
   // RNG seeded for Z2 wall
   //////////////////////////////////////////////////////////////////////
   // You can manage seeds however you like.
   // Recommend SeedUniqueString.
   //////////////////////////////////////////////////////////////////////
   GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString("Study2-Source_Z2_p_0_0_0_t_0-880");
   Z2WallSource  (RNG4,tslice,z2wall_source);
   GFWallSource  (tslice,gfwall_source);
   std::vector<LatticeComplex> phase(nmom,UGrid);
   for(int m=0;m<nmom;m++){
     MakePhase(momenta[m],phase[m]);
   }
   // Propagator storage is indexed through make_idx(p,m,nmom):
   // slots [0,nmom) hold mass 0 at each momentum, the rest hold masses 1.. at zero momentum.
   std::vector<LatticePropagator> Z2Props   (nmom+nmass-1,UGrid);
   std::vector<LatticePropagator> GFProps   (nmom+nmass-1,UGrid);
   for(int p=0;p<nmom;p++) {
     int m=0;
     int idx = make_idx(p,m,nmom);
     phased_prop = z2wall_source * phase[p];
     Solve(*FermActs[m],phased_prop  ,Z2Props[idx]);
     phased_prop = gfwall_source * phase[p];
     Solve(*FermActs[m],phased_prop  ,GFProps[idx]);
   }
   for(int m=1;m<nmass;m++) {
     int p=0;
     int idx = make_idx(p,m,nmom);
     phased_prop = z2wall_source;
     Solve(*FermActs[m],phased_prop  ,Z2Props[idx]);
     phased_prop = gfwall_source;
     Solve(*FermActs[m],phased_prop  ,GFProps[idx]);
   }
   std::vector<std::vector<Propagator> > wsnk_z2Props(nmom+nmass-1);
   std::vector<std::vector<Propagator> > wsnk_gfProps(nmom+nmass-1);
   // Non-zero kaon and point and D two point
   // WW stick momentum on m1 (lighter)
   //     zero momentum on m2
   for(int m1=0;m1<nmass;m1++) {
   for(int m2=m1;m2<nmass;m2++) {
     int pmax = (m1==0)? nmom:1;
     for(int p=0;p<pmax;p++){
       std::stringstream ssg,ssz;
       std::stringstream wssg,wssz;
       int idx1 = make_idx(p,m1,nmom);
       int idx2 = make_idx(0,m2,nmom);
       /// Point sinks
       ssg<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_p_gf_meson.xml";
       ssz<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_p_z2_meson.xml";
       MesonTrace(ssz.str(),Z2Props[idx1],Z2Props[idx2],phase[p]); // Q1 is conjugated
       MesonTrace(ssg.str(),GFProps[idx1],GFProps[idx2],phase[p]); 
       /// Wall sinks
       wssg<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_w_gf_meson.xml";
       wssz<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_w_z2_meson.xml";
       // NOTE(review): the wall-sink code below indexes the propagator arrays with the
       // raw mass indices m1/m2, whereas the point-sink code above uses idx1/idx2 from
       // make_idx. For m2 >= 1 slot m2 holds a mass-0 momentum propagator, and for
       // m1 == m2 the second sliceSum overwrites the first. Left unchanged pending
       // confirmation of the intended contraction.
       phased_prop = GFProps[m2] * phase[p];
       sliceSum(phased_prop,wsnk_gfProps[m1],Tdir);
       sliceSum(GFProps[m1],wsnk_gfProps[m2],Tdir);
       WallSinkMesonTrace(wssg.str(),wsnk_gfProps[m1],wsnk_gfProps[m2]);
       // Fix: the Z2 slice sums must land in wsnk_z2Props. They were previously written
       // to wsnk_gfProps, so the trace below ran on empty vectors and wrote no data.
       phased_prop = Z2Props[m2] * phase[p];
       sliceSum(phased_prop,wsnk_z2Props[m1],Tdir);
       sliceSum(Z2Props[m1],wsnk_z2Props[m2],Tdir);
       WallSinkMesonTrace(wssz.str(),wsnk_z2Props[m1],wsnk_z2Props[m2]);
     }
   }}
   /////////////////////////////////////
   // Sequential solves
   /////////////////////////////////////
   LatticePropagator  seq_wsnk_z2src(UGrid);
   LatticePropagator  seq_wsnk_gfsrc(UGrid);
   LatticePropagator  seq_psnk_z2src(UGrid);
   LatticePropagator  seq_psnk_gfsrc(UGrid);
   LatticePropagator source(UGrid);
   for(int m=0;m<nmass-1;m++){
     int spect_idx = make_idx(0,m,nmom);
     int charm=nmass-1;   // the last entry of the flavour tables is used for the sequential solve
     SequentialSource(tseq,momenta[0],GFProps[spect_idx],source);
     Solve(*FermActs[charm],source,seq_psnk_gfsrc);
     SequentialSource(tseq,momenta[0],Z2Props[spect_idx],source);
     Solve(*FermActs[charm],source,seq_psnk_z2src);
     // Todo need wall sequential solve
     for(int p=0;p<nmom;p++){
       int active_idx = make_idx(p,0,nmom);
       std::stringstream seq_3pt_p_z2;
       std::stringstream seq_3pt_p_gf;
       std::stringstream seq_3pt_w_z2;
       std::stringstream seq_3pt_w_gf;
       seq_3pt_p_z2  <<config<<"_3pt_p"<<p<< "_m" << m << "_p_z2_meson.xml";
       seq_3pt_p_gf  <<config<<"_3pt_p"<<p<< "_m" << m << "_p_gf_meson.xml";
       seq_3pt_w_z2  <<config<<"_3pt_p"<<p<< "_m" << m << "_w_z2_meson.xml";
       seq_3pt_w_gf  <<config<<"_3pt_p"<<p<< "_m" << m << "_w_gf_meson.xml";
       Meson3pt(seq_3pt_p_gf.str(),GFProps[active_idx],seq_psnk_gfsrc,phase[p]);
       Meson3pt(seq_3pt_p_z2.str(),Z2Props[active_idx],seq_psnk_z2src,phase[p]);
     }    
   }
   Grid_finalize();
 }
--- a/examples/Example_wall_wall_spectrum.cc
+++ b/examples/Example_wall_wall_spectrum.cc
@@ -9,6 +9,7 @@ using namespace std;
 using namespace Grid;
 typedef SpinColourMatrix Propagator;
 typedef SpinColourVector Fermion;
 typedef PeriodicGimplR   GimplR;
 template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
 {
@@ -55,6 +56,16 @@ void MakePhase(Coordinate mom,LatticeComplex &phase)
  }
  phase = exp(phase*ci);
 }
 /// Apply `nstep` successive Stout smearing steps with parameter `rho` to `Uin`,
 /// leaving the result in `Usmr`. For nstep <= 0 the output is a plain copy of Uin.
 void LinkSmear(int nstep, RealD rho,LatticeGaugeField &Uin,LatticeGaugeField &Usmr)
 {
   Smear_Stout<GimplR> Stout(rho);
   LatticeGaugeField Utmp(Uin.Grid());
   Utmp = Uin;
   // Seed the output so that nstep <= 0 yields the unsmeared field instead of
   // leaving Usmr untouched (previously it was only written inside the loop).
   Usmr = Utmp;
   for(int i=0;i<nstep;i++){
     Stout.smear(Usmr,Utmp);
     Utmp = Usmr;   // the next step smears the result of this one
   }
 }
 void PointSource(Coordinate &coor,LatticePropagator &source)
 {
  //  Coordinate coor({0,0,0,0});
@@ -97,23 +108,23 @@ void GaugeFix(LatticeGaugeField &U,LatticeGaugeField &Ufix)
 {
  Real alpha=0.05;
-  Real plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(U);
+  Real plaq=WilsonLoops<GimplR>::avgPlaquette(U);
  std::cout << " Initial plaquette "<<plaq << std::endl;
  LatticeColourMatrix   xform(U.Grid()); 
  Ufix = U;
  int orthog=Nd-1;
-  FourierAcceleratedGaugeFixer<PeriodicGimplR>::SteepestDescentGaugeFix(Ufix,xform,alpha,10000,1.0e-12, 1.0e-12,true,orthog);
+  FourierAcceleratedGaugeFixer<GimplR>::SteepestDescentGaugeFix(Ufix,xform,alpha,100000,1.0e-14, 1.0e-14,true,orthog);
-  plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Ufix);
+  plaq=WilsonLoops<GimplR>::avgPlaquette(Ufix);
  std::cout << " Final plaquette "<<plaq << std::endl;
 }
 template<class Field>
 void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
 {
-  typedef CovariantLaplacianCshift <PeriodicGimplR,Field> Laplacian_t;
+  typedef CovariantLaplacianCshift <GimplR,Field> Laplacian_t;
  Laplacian_t Laplacian(U);
  Integer Iterations = 40;
@@ -167,19 +178,21 @@ void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
  GridBase *UGrid = D.GaugeGrid();
  GridBase *FGrid = D.FermionGrid();
-  LatticeFermion src4  (UGrid); 
+  LatticeFermion src4  (UGrid); src4 = Zero();
  LatticeFermion src5  (FGrid); 
  LatticeFermion result5(FGrid);
  LatticeFermion result4(UGrid);
-  ConjugateGradient<LatticeFermion> CG(1.0e-8,100000);
+  ConjugateGradient<LatticeFermion> CG(1.0e-12,100000);
-  SchurRedBlackDiagMooeeSolve<LatticeFermion> schur(CG);
+  SchurRedBlackDiagTwoSolve<LatticeFermion> schur(CG);
  ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
  std::cout<<GridLogMessage<< " source4 "<<norm2(source)<<std::endl;
  for(int s=0;s<Nd;s++){
    for(int c=0;c<Nc;c++){
      PropToFerm<Action>(src4,source,s,c);
-
+      std::cout<<GridLogMessage<< s<<c<<" src4 "<<norm2(src4)<<std::endl;
      D.ImportPhysicalFermionSource(src4,src5);
      std::cout<<GridLogMessage<< s<<c<<" src5 "<<norm2(src5)<<std::endl;
      result5=Zero();
      schur(D,src5,result5,ZG);
@@ -287,15 +300,10 @@ int main (int argc, char ** argv)
 								   GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  //////////////////////////////////////////////////////////////////////
  // You can manage seeds however you like.
  // Recommend SeedUniqueString.
  //////////////////////////////////////////////////////////////////////
  std::vector<int> seeds4({1,2,3,4}); 
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
  LatticeGaugeField Umu(UGrid);
-  LatticeGaugeField Ufixed(UGrid);
+  LatticeGaugeField Utmp(UGrid);
  LatticeGaugeField Usmr(UGrid);
  std::string config;
  if( argc > 1 && argv[1][0] != '-' )
  {
@@ -308,13 +316,20 @@ int main (int argc, char ** argv)
  {
    std::cout<<GridLogMessage <<"Using hot configuration"<<std::endl;
    SU<Nc>::ColdConfiguration(Umu);
-    //    SU<Nc>::HotConfiguration(RNG4,Umu);
+    config="ColdConfig";
    config="HotConfig";
  }
-  GaugeFix(Umu,Ufixed);
+  //  GaugeFix(Umu,Utmp);
-  Umu=Ufixed;
+  //  Umu=Utmp;
  int nsmr=3;
  RealD rho=0.1;
  RealD plaq_gf =WilsonLoops<GimplR>::avgPlaquette(Umu);
  LinkSmear(nsmr,rho,Umu,Usmr);
  RealD plaq_smr=WilsonLoops<GimplR>::avgPlaquette(Usmr);
  std::cout << GridLogMessage << " GF Plaquette " <<plaq_gf<<std::endl;
  std::cout << GridLogMessage << " SM Plaquette " <<plaq_smr<<std::endl;
  std::vector<int>   smeared_link({ 0,0,1} ); 
  std::vector<RealD> masses({ 0.004,0.02477,0.447} ); // u/d, s, c ??
  std::vector<RealD> M5s   ({ 1.8,1.8,1.0} ); 
  std::vector<RealD> bs   ({ 1.0,1.0,1.5} );  // DDM
@@ -330,6 +345,9 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage <<"======================"<<std::endl;
  std::cout<<GridLogMessage <<"MobiusFermion action as Scaled Shamir kernel"<<std::endl;
  std::cout<<GridLogMessage <<"======================"<<std::endl;
  std::vector<Complex> boundary = {1,1,1,-1};
  typedef MobiusFermionR FermionAction;
  FermionAction::ImplParams Params(boundary);
  for(int m=0;m<masses.size();m++) {
@@ -339,30 +357,40 @@ int main (int argc, char ** argv)
    RealD c    = cs[m];
    int   Ls   = Ls_s[m];
    if ( smeared_link[m] ) Utmp = Usmr;
    else                   Utmp = Umu;
    FGrids.push_back(SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid));
    FrbGrids.push_back(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid));
-    FermActs.push_back(new MobiusFermionR(Umu,*FGrids[m],*FrbGrids[m],*UGrid,*UrbGrid,mass,M5,b,c));
+    FermActs.push_back(new MobiusFermionR(Utmp,*FGrids[m],*FrbGrids[m],*UGrid,*UrbGrid,mass,M5,b,c,Params));
  }
  LatticePropagator point_source(UGrid);
  LatticePropagator z2wall_source(UGrid);
  LatticePropagator gfwall_source(UGrid);
-  Coordinate Origin({0,0,0,0});
+  int tslice = 0;
-  PointSource   (Origin,point_source);
+  //////////////////////////////////////////////////////////////////////
-  Z2WallSource  (RNG4,0,z2wall_source);
+  // RNG seeded for Z2 wall
-  GFWallSource  (0,gfwall_source);
+  //////////////////////////////////////////////////////////////////////
-  
+  // You can manage seeds however you like.
-  std::vector<LatticePropagator> PointProps(nmass,UGrid);
+  // Recommend SeedUniqueString.
-  std::vector<LatticePropagator> GaussProps(nmass,UGrid);
+  //////////////////////////////////////////////////////////////////////
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString("Study2-Source_Z2_p_0_0_0_t_0-880");
  Z2WallSource  (RNG4,tslice,z2wall_source);
  GFWallSource  (tslice,gfwall_source);
  std::vector<LatticePropagator> Z2Props   (nmass,UGrid);
  std::vector<LatticePropagator> GFProps   (nmass,UGrid);
  for(int m=0;m<nmass;m++) {
    std::cout << GridLogMessage << " Mass " <<m << " z2wall source "<<norm2(z2wall_source)<<std::endl;
    Solve(*FermActs[m],z2wall_source    ,Z2Props[m]);
    std::cout << GridLogMessage << " Mass " <<m << " gfwall source "<<norm2(gfwall_source)<<std::endl;
    Solve(*FermActs[m],gfwall_source    ,GFProps[m]);
    std::cout << GridLogMessage << " Mass " <<m << " z2wall source "<<norm2(z2wall_source)<< " " << norm2(gfwall_source)<<std::endl;
  }
@@ -383,14 +411,15 @@ int main (int argc, char ** argv)
    std::stringstream wssg,wssz;
    /// Point sinks
-    ssg<<config<< "_m" << m1 << "_m"<< m2 << "p_gf_meson.xml";
+    ssg<<config<< "_m" << m1 << "_m"<< m2 << "_p_gf_meson.xml";
-    ssz<<config<< "_m" << m1 << "_m"<< m2 << "p_z2_meson.xml";
+    ssz<<config<< "_m" << m1 << "_m"<< m2 << "_p_z2_meson.xml";
    MesonTrace(ssz.str(),Z2Props[m1],Z2Props[m2],phase);
    MesonTrace(ssg.str(),GFProps[m1],GFProps[m2],phase);
    /// Wall sinks
-    wssg<<config<< "_m" << m1 << "_m"<< m2 << "w_gf_meson.xml";
+    wssg<<config<< "_m" << m1 << "_m"<< m2 << "_w_gf_meson.xml";
-    wssz<<config<< "_m" << m1 << "_m"<< m2 << "w_z2_meson.xml";
+    wssz<<config<< "_m" << m1 << "_m"<< m2 << "_w_z2_meson.xml";
    WallSinkMesonTrace(wssg.str(),wsnk_gfProps[m1],wsnk_gfProps[m2]);
    WallSinkMesonTrace(wssz.str(),wsnk_z2Props[m1],wsnk_z2Props[m2]);
--- a/scripts/hmc.sh
+++ b/scripts/hmc.sh
@@ -1,19 +1,27 @@
 #!/bin/bash
 LOG=$1
-SWEEPS=`grep dH $LOG | wc -l`
+SWEEPS=`grep dH.= $LOG | wc -l`
-SWEEPS=`expr $SWEEPS - 80`
+SWEEPS=`expr $SWEEPS - 100`
 echo
 echo $SWEEPS thermalised sweeps
 echo
-plaq=`grep Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$10} END { print S/NR} ' `
+plaq=`grep Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$12} END { print S/NR} ' `
-plaqe=`grep Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$10 ; SS=SS+$10*$10 } END { print sqrt( (SS/NR - S*S/NR/NR)/NR) } ' `
+plaqe=`grep Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$12 ; SS=SS+$12*$12 } END { print sqrt( (SS/NR - S*S/NR/NR)/NR) } ' `
 echo "Plaquette: $plaq (${plaqe})"
 echo
-dHv=`grep dH $LOG | tail -n $SWEEPS | awk '{ S=S+$10 ; SS=SS+$10*$10 } END { print sqrt(SS/NR) } ' `
+grep  Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$12/20; if(NR%20==0){ print NR/20, " ", S; S=0;} } '  > plaq.binned
-edH=`grep dH $LOG | tail -n $SWEEPS | awk '{ S=S+exp(-$10)} END { print S/NR} '`
+
-echo "<e-dH>: $edH"
+plaq=`cat plaq.binned  | awk '{ S=S+$2} END { print S/NR} ' `
 plaqe=`cat plaq.binned | awk '{ S=S+$2 ; SS=SS+$2*$2 } END { print sqrt( (SS/NR - S*S/NR/NR)/NR) } ' `
 echo "Binned Plaquette: $plaq (${plaqe})"
 echo
 dHv=`grep dH.= $LOG | tail -n $SWEEPS | awk '{ S=S+$16 ; SS=SS+$16*$16 } END { print sqrt(SS/NR) } ' `
 edH=`grep dH.= $LOG | tail -n $SWEEPS | awk '{ S=S+exp(-$16)} END { print S/NR} '`
 dedH=`grep dH.= $LOG | tail -n $SWEEPS | awk '{ S=S+exp(-$16); SS=SS+exp(-$16)*exp(-$16)} END { print sqrt( (SS/NR - S*S/NR/NR)/NR) } '`
 echo "<e-dH>: $edH (${dedH})"
 echo "<rms dH>: $dHv"
 TRAJ=`grep Acc $LOG | wc -l`
@@ -22,12 +30,13 @@ PACC=`expr  100 \* ${ACC} / ${TRAJ} `
 echo
 echo "Acceptance $PACC %  $ACC / $TRAJ "
-grep Plaq $LOG | awk '{ print $10 }' | uniq > plaq.dat
+grep Plaq $LOG | awk '{ print $12 }' | uniq > plaq.dat
-grep dH $LOG | awk '{ print $10 }' > dH.dat
+grep dH.= $LOG | awk '{ print $16 }' > dH.dat
-echo set yrange [-0.2:1.0] > plot.gnu
+echo set yrange [0.58:0.60] > plot.gnu
 echo set terminal 'pdf' >> plot.gnu
 echo "f(x) =0.588" >> plot.gnu
 echo "set output 'plaq.${LOG}.pdf'" >> plot.gnu
-echo "plot 'plaq.dat' w l, 'dH.dat' w l " >> plot.gnu
+echo "plot 'plaq.dat' w l, f(x) " >> plot.gnu
 echo
 gnuplot plot.gnu >& gnu.errs
 open plaq.${LOG}.pdf
--- a/systems/Crusher/config-command
+++ b/systems/Crusher/config-command
@@ -0,0 +1,12 @@
 # Configure command for a HIP build using hipcc and MPICH; invokes the configure
 # script two directories above the current one.
 ../../configure --enable-comms=mpi-auto \
 --enable-unified=no \
 --enable-shm=nvlink \
 --enable-accelerator=hip \
 --enable-gen-simd-width=64 \
 --enable-simd=GPU \
 --disable-fermion-reps \
 --disable-gparity \
 CXX=hipcc MPICXX=mpicxx \
 CXXFLAGS="-fPIC -I/opt/rocm-4.5.0/include/ -std=c++14 -I${MPICH_DIR}/include " \
 LDFLAGS=" -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa "
 # NOTE(review): the setting below used to be a bare statement. With spaces around
 # '=' the shell runs it as a command named HIPFLAGS (which fails) and it was never
 # passed to configure. Kept as a note; confirm how the GPU target should be
 # supplied (for example as part of CXXFLAGS) before relying on it.
 # HIPFLAGS = --amdgpu-target=gfx90a
--- a/systems/Crusher/dwf.slurm
+++ b/systems/Crusher/dwf.slurm
@@ -0,0 +1,30 @@
 #!/bin/bash
 # Begin SLURM directives (sbatch options); lines starting with ## are disabled
 #SBATCH -A LGT104
 #SBATCH -t 01:00:00
 ##SBATCH -U openmpThu
 ##SBATCH -p ecp
 #SBATCH -J DWF
 #SBATCH -o DWF.%J
 #SBATCH -e DWF.%J
 #SBATCH -N 1
 #SBATCH -n 1
 #SBATCH --exclusive  
 # Single-task job: runs Benchmark_dwf_fp32 on one node with one GPU.
 DIR=.
 module list
 #export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
 export MPICH_GPU_SUPPORT_ENABLED=1
 # Active single-copy mode is XPMEM; the alternatives are kept commented out below.
 export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
 #export MPICH_SMP_SINGLE_COPY_MODE=NONE
 #export MPICH_SMP_SINGLE_COPY_MODE=CMA
 export OMP_NUM_THREADS=1
 # Value passed to the benchmark as --accelerator-threads
 AT=8
 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
 PARAMS=" --accelerator-threads ${AT} --grid 24.24.24.24 --shm-mpi 0 --mpi 1.1.1.1"
 srun --gpus-per-task 1 -n1 ./benchmarks/Benchmark_dwf_fp32 $PARAMS
--- a/systems/Crusher/dwf4.slurm
+++ b/systems/Crusher/dwf4.slurm
@@ -0,0 +1,27 @@
 #!/bin/bash
 # Begin SLURM directives (sbatch options); lines starting with ## are disabled
 #SBATCH -A LGT104
 #SBATCH -t 01:00:00
 ##SBATCH -U openmpThu
 #SBATCH -J DWF
 #SBATCH -o DWF.%J
 #SBATCH -e DWF.%J
 #SBATCH -N 1
 #SBATCH -n 4
 #SBATCH --exclusive
 # Four-task job on one node: runs Benchmark_dwf_fp32 through mpiwrapper.sh, one GPU per task.
 DIR=.
 module list
 export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
 export MPICH_GPU_SUPPORT_ENABLED=1
 # Active single-copy mode is NONE; the alternatives are kept commented out.
 #export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
 export MPICH_SMP_SINGLE_COPY_MODE=NONE
 #export MPICH_SMP_SINGLE_COPY_MODE=CMA
 export OMP_NUM_THREADS=4
 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
 # The --mpi decomposition 1.1.2.2 multiplies out to the 4 tasks launched by srun.
 PARAMS=" --accelerator-threads 8 --grid 32.32.64.64 --mpi 1.1.2.2 --comms-overlap --shm 2048 --shm-mpi 0"
 srun --gpus-per-task 1 -n4 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS
--- a/systems/Crusher/dwf8.slurm
+++ b/systems/Crusher/dwf8.slurm
@@ -0,0 +1,27 @@
 #!/bin/bash
 # Begin SLURM directives (sbatch options); lines starting with ## are disabled
 #SBATCH -A LGT104
 #SBATCH -t 01:00:00
 ##SBATCH -U openmpThu
 #SBATCH -J DWF
 #SBATCH -o DWF.%J
 #SBATCH -e DWF.%J
 #SBATCH -N 1
 #SBATCH -n 8
 #SBATCH --exclusive
 # Eight-task job on one node: runs Benchmark_dwf_fp32 through mpiwrapper.sh, one GPU per task.
 DIR=.
 module list
 export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
 export MPICH_GPU_SUPPORT_ENABLED=1
 # Active single-copy mode is XPMEM; the alternatives are kept commented out below.
 export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
 #export MPICH_SMP_SINGLE_COPY_MODE=NONE
 #export MPICH_SMP_SINGLE_COPY_MODE=CMA
 export OMP_NUM_THREADS=1
 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
 # The --mpi decomposition 1.2.2.2 multiplies out to the 8 tasks launched by srun.
 PARAMS=" --accelerator-threads 8 --grid 32.64.64.64 --mpi 1.2.2.2 --comms-overlap --shm 2048 --shm-mpi 0"
 srun --gpus-per-task 1 -n8 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS
--- a/Show More
+++ b/Show More
		`@@ -0,0 +1 @@`
							`../CompactWilsonCloverFermionInstantiation.cc.master`