Test_evec_compression changes:

Added ability to choose one of a variety of preselected basis sizes from the command line. Fine Lanczos now checks that enough evecs are generated and resizes the output to Nstop rather than the actual number that converged (which can be larger).
Test_evec_compression enhancements:
2026-02-12 01:40:54 +00:00 · 2022-04-06 06:33:26 -07:00 · 2022-03-29 06:16:15 -07:00 · 2022-03-14 06:45:28 -07:00 · 2022-02-22 14:25:27 -05:00 · 2022-02-16 14:01:43 +00:00
168 changed files with 14029 additions and 1302 deletions
--- a/Grid/DisableWarnings.h
+++ b/Grid/DisableWarnings.h
@@ -34,6 +34,9 @@ directory

 #if defined __GNUC__ && __GNUC__>=6
 #pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
+#if defined __GNUC__ 
+#pragma GCC diagnostic ignored "-Wpsabi"
 #endif

 //disables and intel compiler specific warning (in json.hpp)
--- a/Grid/GridQCDcore.h
+++ b/Grid/GridQCDcore.h
@@ -36,6 +36,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridCore.h>
 #include <Grid/qcd/QCD.h>
 #include <Grid/qcd/spin/Spin.h>
+#include <Grid/qcd/gparity/Gparity.h>
 #include <Grid/qcd/utils/Utils.h>
 #include <Grid/qcd/representations/Representations.h>
 NAMESPACE_CHECK(GridQCDCore);
--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@@ -54,6 +54,7 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
+#include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
 #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -358,7 +358,7 @@ public:
    autoView( in_v , in, AcceleratorRead);
    autoView( out_v , out, AcceleratorWrite);
    autoView( Stencil_v  , Stencil, AcceleratorRead);
-    auto& geom_v = geom;
+    int npoint = geom.npoint;
    typedef LatticeView<Cobj> Aview;
      
    Vector<Aview> AcceleratorViewContainer;
@@ -380,7 +380,7 @@ public:
      int ptype;
      StencilEntry *SE;

-      for(int point=0;point<geom_v.npoint;point++){
+      for(int point=0;point<npoint;point++){

 	SE=Stencil_v.GetEntry(ptype,point,ss);
 	  
@@ -424,7 +424,7 @@ public:
    autoView( in_v , in, AcceleratorRead);
    autoView( out_v , out, AcceleratorWrite);
    autoView( Stencil_v  , Stencil, AcceleratorRead);
-    auto& geom_v = geom;
+    int npoint = geom.npoint;
    typedef LatticeView<Cobj> Aview;

    Vector<Aview> AcceleratorViewContainer;
@@ -454,7 +454,7 @@ public:
      int ptype;
      StencilEntry *SE;

-      for(int p=0;p<geom_v.npoint;p++){
+      for(int p=0;p<npoint;p++){
        int point = points_p[p];

 	SE=Stencil_v.GetEntry(ptype,point,ss);
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@@ -52,6 +52,7 @@ public:
  virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
  virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
  virtual void HermOp(const Field &in, Field &out)=0;
+  virtual ~LinearOperatorBase(){};
 };


@@ -507,7 +508,7 @@ class SchurStaggeredOperator :  public SchurOperatorBase<Field> {
  virtual  void MpcDag   (const Field &in, Field &out){
    Mpc(in,out);
  }
-  virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
+  virtual void MpcDagMpc(const Field &in, Field &out) {
    assert(0);// Never need with staggered
  }
 };
@@ -585,6 +586,7 @@ class HermOpOperatorFunction : public OperatorFunction<Field> {
 template<typename Field>
 class PlainHermOp : public LinearFunction<Field> {
 public:
+  using LinearFunction<Field>::operator();
  LinearOperatorBase<Field> &_Linop;
      
  PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop) 
@@ -598,6 +600,7 @@ public:
 template<typename Field>
 class FunctionHermOp : public LinearFunction<Field> {
 public:
+  using LinearFunction<Field>::operator(); 
  OperatorFunction<Field>   & _poly;
  LinearOperatorBase<Field> &_Linop;
      
--- a/Grid/algorithms/Preconditioner.h
+++ b/Grid/algorithms/Preconditioner.h
@@ -30,13 +30,19 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

+template<class Field> using Preconditioner =  LinearFunction<Field> ;
+
+/*
 template<class Field> class Preconditioner :  public LinearFunction<Field> {
+  using LinearFunction<Field>::operator();
  virtual void operator()(const Field &src, Field & psi)=0;
 };
+*/

 template<class Field> class TrivialPrecon :  public Preconditioner<Field> { 
 public:
-  void operator()(const Field &src, Field & psi){
+  using Preconditioner<Field>::operator();
+  virtual void operator()(const Field &src, Field & psi){
    psi = src;
  }
  TrivialPrecon(void){};
--- a/Grid/algorithms/SparseMatrix.h
+++ b/Grid/algorithms/SparseMatrix.h
@@ -48,6 +48,7 @@ public:
  virtual  void Mdiag    (const Field &in, Field &out)=0;
  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0;
  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)=0;
+  virtual ~SparseMatrixBase() {};
 };

 /////////////////////////////////////////////////////////////////////////////////////////////
@@ -72,7 +73,7 @@ public:
  virtual  void MeooeDag    (const Field &in, Field &out)=0;
  virtual  void MooeeDag    (const Field &in, Field &out)=0;
  virtual  void MooeeInvDag (const Field &in, Field &out)=0;
-
+  virtual ~CheckerBoardedSparseMatrixBase() {};
 };

 NAMESPACE_END(Grid);
--- a/Grid/algorithms/approx/Chebyshev.h
+++ b/Grid/algorithms/approx/Chebyshev.h
@@ -292,6 +292,7 @@ public:
 template<class Field>
 class ChebyshevLanczos : public Chebyshev<Field> {
 private:
+
  std::vector<RealD> Coeffs;
  int order;
  RealD alpha;
--- a/Grid/algorithms/iterative/BiCGSTABMixedPrec.h
+++ b/Grid/algorithms/iterative/BiCGSTABMixedPrec.h
@@ -37,6 +37,7 @@ template<class FieldD, class FieldF, typename std::enable_if< getPrecision<Field
 class MixedPrecisionBiCGSTAB : public LinearFunction<FieldD> 
 {
  public:
+    using LinearFunction<FieldD>::operator();
    RealD   Tolerance;
    RealD   InnerTolerance; // Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
--- a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
@@ -36,6 +36,7 @@ NAMESPACE_BEGIN(Grid);
    typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
  class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
  public:
+    using LinearFunction<FieldD>::operator();
    RealD   Tolerance;
    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
@@ -48,6 +49,7 @@ NAMESPACE_BEGIN(Grid);
    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
+    RealD TrueResidual;

    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
    LinearFunction<FieldF> *guesser;
@@ -67,6 +69,7 @@ NAMESPACE_BEGIN(Grid);
    }
  
  void operator() (const FieldD &src_d_in, FieldD &sol_d){
+    std::cout << GridLogMessage << "MixedPrecisionConjugateGradient: Starting mixed precision CG with outer tolerance " << Tolerance << " and inner tolerance " << InnerTolerance << std::endl;
    TotalInnerIterations = 0;
 	
    GridStopWatch TotalTimer;
@@ -79,6 +82,11 @@ NAMESPACE_BEGIN(Grid);
    RealD stop = src_norm * Tolerance*Tolerance;

    GridBase* DoublePrecGrid = src_d_in.Grid();
+
+    //Generate precision change workspaces
+    precisionChangeWorkspace wk_dp_from_sp(DoublePrecGrid, SinglePrecGrid);
+    precisionChangeWorkspace wk_sp_from_dp(SinglePrecGrid, DoublePrecGrid);
+
    FieldD tmp_d(DoublePrecGrid);
    tmp_d.Checkerboard() = cb;
    
@@ -96,6 +104,7 @@ NAMESPACE_BEGIN(Grid);
    FieldF sol_f(SinglePrecGrid);
    sol_f.Checkerboard() = cb;
    
+    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting initial inner CG with tolerance " << inner_tol << std::endl;
    ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
    CG_f.ErrorOnNoConverge = false;

@@ -119,7 +128,7 @@ NAMESPACE_BEGIN(Grid);
      while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??

      PrecChangeTimer.Start();
-      precisionChange(src_f, src_d);
+      precisionChange(src_f, src_d, wk_sp_from_dp);
      PrecChangeTimer.Stop();
      
      sol_f = Zero();
@@ -129,6 +138,7 @@ NAMESPACE_BEGIN(Grid);
 	(*guesser)(src_f, sol_f);

      //Inner CG
+      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " << outer_iter << " starting inner CG with tolerance " << inner_tol << std::endl;
      CG_f.Tolerance = inner_tol;
      InnerCGtimer.Start();
      CG_f(Linop_f, src_f, sol_f);
@@ -137,7 +147,7 @@ NAMESPACE_BEGIN(Grid);
      
      //Convert sol back to double and add to double prec solution
      PrecChangeTimer.Start();
-      precisionChange(tmp_d, sol_f);
+      precisionChange(tmp_d, sol_f, wk_dp_from_sp);
      PrecChangeTimer.Stop();
      
      axpy(sol_d, 1.0, tmp_d, sol_d);
@@ -149,6 +159,7 @@ NAMESPACE_BEGIN(Grid);
    ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
    CG_d(Linop_d, src_d_in, sol_d);
    TotalFinalStepIterations = CG_d.IterationsToComplete;
+    TrueResidual = CG_d.TrueResidual;

    TotalTimer.Stop();
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
@@ -52,7 +52,7 @@ public:
  MultiShiftFunction shifts;
  std::vector<RealD> TrueResidualShift;

-  ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : 
+  ConjugateGradientMultiShift(Integer maxit, const MultiShiftFunction &_shifts) : 
    MaxIterations(maxit),
    shifts(_shifts)
  { 
@@ -183,6 +183,9 @@ public:
      axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
    }

+    std::cout << GridLogIterative << "ConjugateGradientMultiShift: initial rn (|src|^2) =" << rn << " qq (|MdagM src|^2) =" << qq << " d ( dot(src, [MdagM + m_0]src) ) =" << d << " c=" << c << std::endl;
+    
+  
  ///////////////////////////////////////
  // Timers
  ///////////////////////////////////////
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
@@ -0,0 +1,411 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Christopher Kelly <ckelly@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
+#define GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
+
+NAMESPACE_BEGIN(Grid);
+
+//CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision. 
+//The residual is stored in single precision, but the search directions and solution are stored in double precision. 
+//Every update_freq iterations the residual is corrected in double precision. 
+    
+//For safety a final regular CG is applied to clean up if necessary
+
+//Linop to add shift to input linop, used in cleanup CG
+namespace ConjugateGradientMultiShiftMixedPrecSupport{
+template<typename Field>
+class ShiftedLinop: public LinearOperatorBase<Field>{
+public:
+  LinearOperatorBase<Field> &linop_base;
+  RealD shift;
+
+  ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift): linop_base(_linop_base), shift(_shift){}
+
+  void OpDiag (const Field &in, Field &out){ assert(0); }
+  void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); }
+  
+  void Op     (const Field &in, Field &out){ assert(0); }
+  void AdjOp  (const Field &in, Field &out){ assert(0); }
+
+  void HermOp(const Field &in, Field &out){
+    linop_base.HermOp(in, out);
+    axpy(out, shift, in, out);
+  }    
+
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+    HermOp(in,out);
+    ComplexD dot = innerProduct(in,out);
+    n1=real(dot);
+    n2=norm2(out);
+  }
+};
+};
+
+
+template<class FieldD, class FieldF,
+	 typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
+	 typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
+class ConjugateGradientMultiShiftMixedPrec : public OperatorMultiFunction<FieldD>,
+					     public OperatorFunction<FieldD>
+{
+public:                                                
+
+  using OperatorFunction<FieldD>::operator();
+
+  RealD   Tolerance;
+  Integer MaxIterations;
+  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
+  std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
+  int verbose;
+  MultiShiftFunction shifts;
+  std::vector<RealD> TrueResidualShift;
+
+  int ReliableUpdateFreq; //number of iterations between reliable updates
+
+  GridBase* SinglePrecGrid; //Grid for single-precision fields
+  LinearOperatorBase<FieldF> &Linop_f; //single precision
+
+  ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
+				       GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
+				       int _ReliableUpdateFreq
+				       ) : 
+    MaxIterations(maxit),  shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq)
+  { 
+    verbose=1;
+    IterationsToCompleteShift.resize(_shifts.order);
+    TrueResidualShift.resize(_shifts.order);
+  }
+
+  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
+  {
+    GridBase *grid = src.Grid();
+    int nshift = shifts.order;
+    std::vector<FieldD> results(nshift,grid);
+    (*this)(Linop,src,results,psi);
+  }
+  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
+  {
+    int nshift = shifts.order;
+
+    (*this)(Linop,src,results);
+  
+    psi = shifts.norm*src;
+    for(int i=0;i<nshift;i++){
+      psi = psi + shifts.residues[i]*results[i];
+    }
+
+    return;
+  }
+
+  void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
+  { 
+    GridBase *DoublePrecGrid = src_d.Grid();
+    precisionChangeWorkspace wk_f_from_d(SinglePrecGrid, DoublePrecGrid);
+    precisionChangeWorkspace wk_d_from_f(DoublePrecGrid, SinglePrecGrid);
+
+    ////////////////////////////////////////////////////////////////////////
+    // Convenience references to the info stored in "MultiShiftFunction"
+    ////////////////////////////////////////////////////////////////////////
+    int nshift = shifts.order;
+
+    std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
+    std::vector<RealD> &mresidual(shifts.tolerances);
+    std::vector<RealD> alpha(nshift,1.0);
+
+    //Double precision search directions
+    FieldD p_d(DoublePrecGrid);
+    std::vector<FieldD> ps_d(nshift, DoublePrecGrid);// Search directions (double precision)
+
+    FieldD tmp_d(DoublePrecGrid);
+    FieldD r_d(DoublePrecGrid);
+    FieldD mmp_d(DoublePrecGrid);
+
+    assert(psi_d.size()==nshift);
+    assert(mass.size()==nshift);
+    assert(mresidual.size()==nshift);
+  
+    // dynamic sized arrays on stack; 2d is a pain with vector
+    RealD  bs[nshift];
+    RealD  rsq[nshift];
+    RealD  z[nshift][2];
+    int     converged[nshift];
+  
+    const int       primary =0;
+  
+    //Primary shift fields CG iteration
+    RealD a,b,c,d;
+    RealD cp,bp,qq; //prev
+  
+    // Matrix mult fields
+    FieldF r_f(SinglePrecGrid);
+    FieldF p_f(SinglePrecGrid);
+    FieldF tmp_f(SinglePrecGrid);
+    FieldF mmp_f(SinglePrecGrid);
+    FieldF src_f(SinglePrecGrid);
+    precisionChange(src_f, src_d, wk_f_from_d);
+
+    // Check lightest mass
+    for(int s=0;s<nshift;s++){
+      assert( mass[s]>= mass[primary] );
+      converged[s]=0;
+    }
+  
+    // Wire guess to zero
+    // Residuals "r" are src
+    // First search direction "p" is also src
+    cp = norm2(src_d);
+
+    // Handle trivial case of zero src.
+    if( cp == 0. ){
+      for(int s=0;s<nshift;s++){
+	psi_d[s] = Zero();
+	IterationsToCompleteShift[s] = 1;
+	TrueResidualShift[s] = 0.;
+      }
+      return;
+    }
+
+    for(int s=0;s<nshift;s++){
+      rsq[s] = cp * mresidual[s] * mresidual[s];
+      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
+      ps_d[s] = src_d;
+    }
+    // r and p for primary
+    r_f=src_f; //residual maintained in single
+    p_f=src_f;
+    p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
+  
+    //MdagM+m[0]
+    Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
+    axpy(mmp_f,mass[0],p_f,mmp_f);
+    RealD rn = norm2(p_f);
+    d += rn*mass[0];
+
+    b = -cp /d;
+  
+    // Set up the various shift variables
+    int       iz=0;
+    z[0][1-iz] = 1.0;
+    z[0][iz]   = 1.0;
+    bs[0]      = b;
+    for(int s=1;s<nshift;s++){
+      z[s][1-iz] = 1.0;
+      z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0]));
+      bs[s]      = b*z[s][iz]; 
+    }
+  
+    // r += b[0] A.p[0]
+    // c= norm(r)
+    c=axpy_norm(r_f,b,mmp_f,r_f);
+  
+    for(int s=0;s<nshift;s++) {
+      axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
+    }
+  
+    ///////////////////////////////////////
+    // Timers
+    ///////////////////////////////////////
+    GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
+
+    SolverTimer.Start();
+  
+    // Iteration loop
+    int k;
+  
+    for (k=1;k<=MaxIterations;k++){    
+      a = c /cp;
+
+      //Update double precision search direction by residual
+      PrecChangeTimer.Start();
+      precisionChange(r_d, r_f, wk_d_from_f);
+      PrecChangeTimer.Stop();
+
+      AXPYTimer.Start();
+      axpy(p_d,a,p_d,r_d); 
+
+      for(int s=0;s<nshift;s++){
+	if ( ! converged[s] ) { 
+	  if (s==0){
+	    axpy(ps_d[s],a,ps_d[s],r_d);
+	  } else{
+	    RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
+	    axpby(ps_d[s],z[s][iz],as,r_d,ps_d[s]);
+	  }
+	}
+      }
+      AXPYTimer.Stop();
+
+      PrecChangeTimer.Start();
+      precisionChange(p_f, p_d, wk_f_from_d); //get back single prec search direction for linop
+      PrecChangeTimer.Stop();
+
+      cp=c;
+      MatrixTimer.Start();  
+      Linop_f.HermOp(p_f,mmp_f); 
+      d=real(innerProduct(p_f,mmp_f));    
+      MatrixTimer.Stop();  
+
+      AXPYTimer.Start();
+      axpy(mmp_f,mass[0],p_f,mmp_f);
+      AXPYTimer.Stop();
+      RealD rn = norm2(p_f);
+      d += rn*mass[0];
+    
+      bp=b;
+      b=-cp/d;
+    
+      // Toggle the recurrence history
+      bs[0] = b;
+      iz = 1-iz;
+      ShiftTimer.Start();
+      for(int s=1;s<nshift;s++){
+	if((!converged[s])){
+	  RealD z0 = z[s][1-iz];
+	  RealD z1 = z[s][iz];
+	  z[s][iz] = z0*z1*bp
+	    / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b)); 
+	  bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
+	}
+      }
+      ShiftTimer.Stop();
+
+      //Update double precision solutions
+      AXPYTimer.Start();
+      for(int s=0;s<nshift;s++){
+	int ss = s;
+	if( (!converged[s]) ) { 
+	  axpy(psi_d[ss],-bs[s]*alpha[s],ps_d[s],psi_d[ss]);
+	}
+      }
+
+      //Perform reliable update if necessary; otherwise update residual from single-prec mmp
+      RealD c_f = axpy_norm(r_f,b,mmp_f,r_f);
+      AXPYTimer.Stop();
+
+      c = c_f;
+
+      if(k % ReliableUpdateFreq == 0){
+	//Replace r with true residual
+	MatrixTimer.Start();  
+	Linop_d.HermOp(psi_d[0],mmp_d); 
+	MatrixTimer.Stop();  
+
+	AXPYTimer.Start();
+	axpy(mmp_d,mass[0],psi_d[0],mmp_d);
+
+	RealD c_d = axpy_norm(r_d, -1.0, mmp_d, src_d);
+	AXPYTimer.Stop();
+
+	std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_f <<" with |r|^2 = "<<c_d<<std::endl;
+	
+	PrecChangeTimer.Start();
+	precisionChange(r_f, r_d, wk_f_from_d);
+	PrecChangeTimer.Stop();
+	c = c_d;
+      }
+    
+      // Convergence checks
+      int all_converged = 1;
+      for(int s=0;s<nshift;s++){
+      
+	if ( (!converged[s]) ){
+	  IterationsToCompleteShift[s] = k;
+	
+	  RealD css  = c * z[s][iz]* z[s][iz];
+	
+	  if(css<rsq[s]){
+	    if ( ! converged[s] )
+	      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
+	    converged[s]=1;
+	  } else {
+	    all_converged=0;
+	  }
+
+	}
+      }
+
+      if ( all_converged ){
+
+	SolverTimer.Stop();
+	std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
+	std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
+      
+	// Check answers 
+	for(int s=0; s < nshift; s++) { 
+	  Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
+	  axpy(tmp_d,mass[s],psi_d[s],mmp_d);
+	  axpy(r_d,-alpha[s],src_d,tmp_d);
+	  RealD rn = norm2(r_d);
+	  RealD cn = norm2(src_d);
+	  TrueResidualShift[s] = std::sqrt(rn/cn);
+	  std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
+
+	  //If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
+	  if(rn >= rsq[s]){
+	    CleanupTimer.Start();
+	    std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: performing cleanup step for shift " << s << std::endl;
+
+	    //Setup linear operators for final cleanup
+	    ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
+	    ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
+					       
+	    MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d); 
+	    cg(src_d, psi_d[s]);
+	    
+	    TrueResidualShift[s] = cg.TrueResidual;
+	    CleanupTimer.Stop();
+	  }
+	}
+
+	std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrec: Time Breakdown for body"<<std::endl;
+	std::cout << GridLogMessage << "\tSolver    " << SolverTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\t\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\t\tMatrix    " << MatrixTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\t\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
+
+	IterationsToComplete = k;	
+
+	return;
+      }
+
+   
+    }
+    // ugly hack
+    std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
+    //  assert(0);
+  }
+
+};
+NAMESPACE_END(Grid);
+#endif
--- a/Grid/algorithms/iterative/Deflation.h
+++ b/Grid/algorithms/iterative/Deflation.h
@@ -33,16 +33,19 @@ namespace Grid {
 template<class Field>
 class ZeroGuesser: public LinearFunction<Field> {
 public:
+  using LinearFunction<Field>::operator();
    virtual void operator()(const Field &src, Field &guess) { guess = Zero(); };
 };
 template<class Field>
 class DoNothingGuesser: public LinearFunction<Field> {
 public:
+  using LinearFunction<Field>::operator();
  virtual void operator()(const Field &src, Field &guess) {  };
 };
 template<class Field>
 class SourceGuesser: public LinearFunction<Field> {
 public:
+  using LinearFunction<Field>::operator();
  virtual void operator()(const Field &src, Field &guess) { guess = src; };
 };

@@ -57,6 +60,7 @@ private:
  const unsigned int       N;

 public:
+  using LinearFunction<Field>::operator();

  DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval)
  : DeflatedGuesser(_evec, _eval, _evec.size())
@@ -87,6 +91,7 @@ private:
  const std::vector<RealD>       &eval_coarse;
 public:
  
+  using LinearFunction<FineField>::operator();
  LocalCoherenceDeflatedGuesser(const std::vector<FineField>   &_subspace,
 				const std::vector<CoarseField> &_evec_coarse,
 				const std::vector<RealD>       &_eval_coarse)
--- a/Grid/algorithms/iterative/LocalCoherenceLanczos.h
+++ b/Grid/algorithms/iterative/LocalCoherenceLanczos.h
@@ -44,6 +44,7 @@ public:
 				  int, MinRes);    // Must restart
 };

+//This class is the input parameter class for some testing programs
 struct LocalCoherenceLanczosParams : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
@@ -67,6 +68,7 @@ public:
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
+  using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
@@ -97,6 +99,7 @@ public:
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
+  using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
@@ -153,6 +156,7 @@ public:
      _coarse_relax_tol(coarse_relax_tol)  
  {    };

+  //evalMaxApprox: approximation of largest eval of the fine Chebyshev operator (suitably wrapped by block projection)
  int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  {
    CoarseField v(B);
@@ -179,8 +183,16 @@ public:
    if( (vv<eresid*eresid) ) conv = 1;
    return conv;
  }
+
+  //This function is called at the end of the coarse grid Lanczos. It promotes the coarse eigenvector 'B' to the fine grid,
+  //applies a smoother to the result, then computes the *fine grid* eigenvalue (output as 'eval').
+
+  //evalMaxApprox should be the approximation of the largest eval of the fine Hermop. However when this function is called by IRL it actually passes the largest eval of the *Chebyshev* operator (as this is the max approx used for the TestConvergence above)
+  //As the largest eval of the Chebyshev is typically several orders of magnitude larger this makes the convergence test pass even when it should not.
+  //We therefore ignore evalMaxApprox here and use a value of 1.0 (note this value is already used by TestCoarse)
  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)  
  {
+    evalMaxApprox = 1.0; //cf above
    GridBase *FineGrid = _subspace[0].Grid();    
    int checkerboard   = _subspace[0].Checkerboard();
    FineField fB(FineGrid);fB.Checkerboard() =checkerboard;
@@ -199,13 +211,13 @@ public:
    eval   = vnum/vden;
    fv -= eval*fB;
    RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);
+    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
    
    std::cout.precision(13);
    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
 	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
-	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
+	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv << " target " << eresid*eresid
 	     <<std::endl;
-    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
    if( (vv<eresid*eresid) ) return 1;
    return 0;
  }
@@ -283,6 +295,10 @@ public:
    evals_coarse.resize(0);
  };

+  //The block inner product is the inner product on the fine grid locally summed over the blocks
+  //to give a Lattice<Scalar> on the coarse grid. This function orthonormalizes the fine-grid subspace
+  //vectors under the block inner product. This step must be performed after computing the fine grid
+  //eigenvectors and before computing the coarse grid eigenvectors.    
  void Orthogonalise(void ) {
    CoarseScalar InnerProd(_CoarseGrid);
    std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
@@ -326,6 +342,8 @@ public:
    }
  }

+  //While this method serves to check the coarse eigenvectors, it also recomputes the eigenvalues from the smoothed reconstructed eigenvectors
+  //hence the smoother can be tuned after running the coarse Lanczos by using a different smoother here
  void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) 
  {
    assert(evals_fine.size() == nbasis);
@@ -374,18 +392,23 @@ public:
    evals_fine.resize(nbasis);
    subspace.resize(nbasis,_FineGrid);
  }
+
+
+  //cheby_op: Parameters of the fine grid Chebyshev polynomial used for the Lanczos acceleration
+  //cheby_smooth: Parameters of a separate Chebyshev polynomial used after the Lanczos has completed to smooth out high frequency noise in the reconstructed fine grid eigenvectors prior to computing the eigenvalue
+  //relax: Reconstructed eigenvectors (post smoothing) are naturally not as precise as true eigenvectors. This factor acts as a multiplier on the stopping condition when determining whether the results satisfy the user provided stopping condition
  void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
 		  int Nstop, int Nk, int Nm,RealD resid, 
 		  RealD MaxIt, RealD betastp, int MinRes)
  {
-    Chebyshev<FineField>                          Cheby(cheby_op);
-    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace);
-    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
+    Chebyshev<FineField>                          Cheby(cheby_op); //Chebyshev of fine operator on fine grid
+    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace); //Fine operator on coarse grid with intermediate fine grid conversion
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace); //Chebyshev of fine operator on coarse grid with intermediate fine grid conversion
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
    //////////////////////////////////////////////////////////////////////////////////////////////////

-    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth);
+    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth); //lower order Chebyshev of fine operator on fine grid used to smooth regenerated eigenvectors
    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax); 

    evals_coarse.resize(Nm);
@@ -393,6 +416,7 @@ public:

    CoarseField src(_CoarseGrid);     src=1.0; 

+    //Note the "tester" here is also responsible for generating the fine grid eigenvalues which are output into the "evals_coarse" array
    ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
    int Nconv=0;
    IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
@@ -403,6 +427,14 @@ public:
      std::cout << i << " Coarse eval = " << evals_coarse[i]  << std::endl;
    }
  }
+
+  //Get the fine eigenvector 'i' by reconstruction
+  void getFineEvecEval(FineField &evec, RealD &eval, const int i) const{
+    blockPromote(evec_coarse[i],evec,subspace);  
+    eval = evals_coarse[i];
+  }
+    
+    
 };

 NAMESPACE_END(Grid);
--- a/Grid/algorithms/iterative/PowerMethod.h
+++ b/Grid/algorithms/iterative/PowerMethod.h
@@ -30,6 +30,8 @@ template<class Field> class PowerMethod
      RealD vden = norm2(src_n); 
      RealD na = vnum/vden; 

+      std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
+      
      if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) { 
 	evalMaxApprox = na; 
 	std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
--- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
+++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
@@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
 template<class Field>
 class PrecGeneralisedConjugateResidual : public LinearFunction<Field> {
 public:                                                
-
+  using LinearFunction<Field>::operator();
  RealD   Tolerance;
  Integer MaxIterations;
  int verbose;
--- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h
+++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h
@@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
 template<class Field>
 class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
 public:                                                
-
+  using LinearFunction<Field>::operator();
  RealD   Tolerance;
  Integer MaxIterations;
  int verbose;
@@ -119,7 +119,8 @@ public:
  RealD GCRnStep(const Field &src, Field &psi,RealD rsq){

    RealD cp;
-    ComplexD a, b, zAz;
+    ComplexD a, b;
+    //    ComplexD zAz;
    RealD zAAz;
    ComplexD rq;

@@ -146,7 +147,7 @@ public:
    //////////////////////////////////
    MatTimer.Start();
    Linop.Op(psi,Az);
-    zAz = innerProduct(Az,psi);
+    //    zAz = innerProduct(Az,psi);
    zAAz= norm2(Az);
    MatTimer.Stop();
    
@@ -170,7 +171,7 @@ public:

    LinalgTimer.Start();

-    zAz = innerProduct(Az,psi);
+    //    zAz = innerProduct(Az,psi);
    zAAz= norm2(Az);

    //p[0],q[0],qq[0] 
@@ -212,7 +213,7 @@ public:
      MatTimer.Start();
      Linop.Op(z,Az);
      MatTimer.Stop();
-      zAz = innerProduct(Az,psi);
+      //      zAz = innerProduct(Az,psi);
      zAAz= norm2(Az);

      LinalgTimer.Start();
--- a/Grid/allocator/MemoryManager.cc
+++ b/Grid/allocator/MemoryManager.cc
@@ -9,14 +9,30 @@ NAMESPACE_BEGIN(Grid);
 #define AccSmall (3)
 #define Shared   (4)
 #define SharedSmall (5)
+#undef GRID_MM_VERBOSE 
 uint64_t total_shared;
 uint64_t total_device;
 uint64_t total_host;;
 void MemoryManager::PrintBytes(void)
 {
-  std::cout << " MemoryManager : "<<total_shared<<" shared      bytes "<<std::endl;
-  std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
-  std::cout << " MemoryManager : "<<total_host  <<" cpu         bytes "<<std::endl;
+  std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
+  std::cout << " MemoryManager : PrintBytes "<<std::endl;
+  std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
+  std::cout << " MemoryManager : "<<(total_shared>>20)<<" shared      Mbytes "<<std::endl;
+  std::cout << " MemoryManager : "<<(total_device>>20)<<" accelerator Mbytes "<<std::endl;
+  std::cout << " MemoryManager : "<<(total_host>>20)  <<" cpu         Mbytes "<<std::endl;
+  uint64_t cacheBytes;
+  cacheBytes = CacheBytes[Cpu];
+  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" cpu cache Mbytes "<<std::endl;
+  cacheBytes = CacheBytes[Acc];
+  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" acc cache Mbytes "<<std::endl;
+  cacheBytes = CacheBytes[Shared];
+  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" shared cache Mbytes "<<std::endl;
+  
+#ifdef GRID_CUDA
+  cuda_mem();
+#endif
+  
 }

 //////////////////////////////////////////////////////////////////////
@@ -24,86 +40,114 @@ void MemoryManager::PrintBytes(void)
 //////////////////////////////////////////////////////////////////////
 MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
 int MemoryManager::Victim[MemoryManager::NallocType];
-int MemoryManager::Ncache[MemoryManager::NallocType] = { 8, 32, 8, 32, 8, 32 };
-
+int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 };
+uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
 //////////////////////////////////////////////////////////////////////
 // Actual allocation and deallocation utils
 //////////////////////////////////////////////////////////////////////
 void *MemoryManager::AcceleratorAllocate(size_t bytes)
 {
+  total_device+=bytes;
  void *ptr = (void *) Lookup(bytes,Acc);
  if ( ptr == (void *) NULL ) {
    ptr = (void *) acceleratorAllocDevice(bytes);
-    total_device+=bytes;
  }
+#ifdef GRID_MM_VERBOSE
+  std::cout <<"AcceleratorAllocate "<<std::endl;
+  PrintBytes();
+#endif
  return ptr;
 }
 void  MemoryManager::AcceleratorFree    (void *ptr,size_t bytes)
 {
+  total_device-=bytes;
  void *__freeme = Insert(ptr,bytes,Acc);
  if ( __freeme ) {
    acceleratorFreeDevice(__freeme);
-    total_device-=bytes;
-    //    PrintBytes();
  }
+#ifdef GRID_MM_VERBOSE
+  std::cout <<"AcceleratorFree "<<std::endl;
+  PrintBytes();
+#endif
 }
 void *MemoryManager::SharedAllocate(size_t bytes)
 {
+  total_shared+=bytes;
  void *ptr = (void *) Lookup(bytes,Shared);
  if ( ptr == (void *) NULL ) {
    ptr = (void *) acceleratorAllocShared(bytes);
-    total_shared+=bytes;
-    //    std::cout <<"AcceleratorAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
-    //    PrintBytes();
  }
+#ifdef GRID_MM_VERBOSE
+  std::cout <<"SharedAllocate "<<std::endl;
+  PrintBytes();
+#endif
  return ptr;
 }
 void  MemoryManager::SharedFree    (void *ptr,size_t bytes)
 {
+  total_shared-=bytes;
  void *__freeme = Insert(ptr,bytes,Shared);
  if ( __freeme ) {
    acceleratorFreeShared(__freeme);
-    total_shared-=bytes;
-    //    PrintBytes();
  }
+#ifdef GRID_MM_VERBOSE
+  std::cout <<"SharedFree "<<std::endl;
+  PrintBytes();
+#endif
 }
 #ifdef GRID_UVM
 void *MemoryManager::CpuAllocate(size_t bytes)
 {
+  total_host+=bytes;
  void *ptr = (void *) Lookup(bytes,Cpu);
  if ( ptr == (void *) NULL ) {
    ptr = (void *) acceleratorAllocShared(bytes);
-    total_host+=bytes;
  }
+#ifdef GRID_MM_VERBOSE
+  std::cout <<"CpuAllocate "<<std::endl;
+  PrintBytes();
+#endif
  return ptr;
 }
 void  MemoryManager::CpuFree    (void *_ptr,size_t bytes)
 {
+  total_host-=bytes;
  NotifyDeletion(_ptr);
  void *__freeme = Insert(_ptr,bytes,Cpu);
  if ( __freeme ) { 
    acceleratorFreeShared(__freeme);
-    total_host-=bytes;
  }
+#ifdef GRID_MM_VERBOSE
+  std::cout <<"CpuFree "<<std::endl;
+  PrintBytes();
+#endif
 }
 #else
 void *MemoryManager::CpuAllocate(size_t bytes)
 {
+  total_host+=bytes;
  void *ptr = (void *) Lookup(bytes,Cpu);
  if ( ptr == (void *) NULL ) {
    ptr = (void *) acceleratorAllocCpu(bytes);
-    total_host+=bytes;
  }
+#ifdef GRID_MM_VERBOSE
+  std::cout <<"CpuAllocate "<<std::endl;
+  PrintBytes();
+#endif
  return ptr;
 }
 void  MemoryManager::CpuFree    (void *_ptr,size_t bytes)
 {
+  total_host-=bytes;
  NotifyDeletion(_ptr);
  void *__freeme = Insert(_ptr,bytes,Cpu);
  if ( __freeme ) { 
    acceleratorFreeCpu(__freeme);
-    total_host-=bytes;
  }
+#ifdef GRID_MM_VERBOSE
+  std::cout <<"CpuFree "<<std::endl;
+  PrintBytes();
+#endif
 }
 #endif

@@ -115,7 +159,6 @@ void MemoryManager::Init(void)

  char * str;
  int Nc;
-  int NcS;
  
  str= getenv("GRID_ALLOC_NCACHE_LARGE");
  if ( str ) {
@@ -181,13 +224,13 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
 #ifdef ALLOCATION_CACHE
  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
  int cache = type + small;
-  return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);  
+  return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);  
 #else
  return ptr;
 #endif
 }

-void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) 
+void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes) 
 {
  assert(ncache>0);
 #ifdef GRID_OMP
@@ -211,6 +254,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries

  if ( entries[v].valid ) {
    ret = entries[v].address;
+    cacheBytes -= entries[v].bytes;
    entries[v].valid = 0;
    entries[v].address = NULL;
    entries[v].bytes = 0;
@@ -219,6 +263,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
  entries[v].address=ptr;
  entries[v].bytes  =bytes;
  entries[v].valid  =1;
+  cacheBytes += bytes;

  return ret;
 }
@@ -228,13 +273,13 @@ void *MemoryManager::Lookup(size_t bytes,int type)
 #ifdef ALLOCATION_CACHE
  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
  int cache = type+small;
-  return Lookup(bytes,Entries[cache],Ncache[cache]);
+  return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
 #else
  return NULL;
 #endif
 }

-void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) 
+void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) 
 {
  assert(ncache>0);
 #ifdef GRID_OMP
@@ -243,6 +288,7 @@ void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncach
  for(int e=0;e<ncache;e++){
    if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
      entries[e].valid = 0;
+      cacheBytes -= entries[e].bytes;
      return entries[e].address;
    }
  }
--- a/Grid/allocator/MemoryManager.h
+++ b/Grid/allocator/MemoryManager.h
@@ -82,14 +82,15 @@ private:
  static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
  static int Victim[NallocType];
  static int Ncache[NallocType];
+  static uint64_t CacheBytes[NallocType];

  /////////////////////////////////////////////////
  // Free pool
  /////////////////////////////////////////////////
  static void *Insert(void *ptr,size_t bytes,int type) ;
  static void *Lookup(size_t bytes,int type) ;
-  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
-  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
+  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
+  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;

  static void PrintBytes(void);
 public:
@@ -169,6 +170,7 @@ private:

 public:
  static void Print(void);
+  static void PrintState( void* CpuPtr);
  static int   isOpen   (void* CpuPtr);
  static void  ViewClose(void* CpuPtr,ViewMode mode);
  static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
--- a/Grid/allocator/MemoryManagerCache.cc
+++ b/Grid/allocator/MemoryManagerCache.cc
@@ -3,7 +3,7 @@

 #warning "Using explicit device memory copies"
 NAMESPACE_BEGIN(Grid);
-//define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
+//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
 #define dprintf(...)


@@ -429,6 +429,7 @@ void  MemoryManager::NotifyDeletion(void *_ptr)
 }
 void  MemoryManager::Print(void)
 {
+  PrintBytes();
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
  std::cout << GridLogDebug << "Memory Manager                             " << std::endl;
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
@@ -473,6 +474,32 @@ int   MemoryManager::isOpen   (void* _CpuPtr)
  }
 }

+void MemoryManager::PrintState(void* _CpuPtr)
+{
+  uint64_t CpuPtr = (uint64_t)_CpuPtr;
+
+  if ( EntryPresent(CpuPtr) ){
+    auto AccCacheIterator = EntryLookup(CpuPtr);
+    auto & AccCache = AccCacheIterator->second;
+    std::string str;
+    if ( AccCache.state==Empty    ) str = std::string("Empty");
+    if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
+    if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
+    if ( AccCache.state==Consistent)str = std::string("Consistent");
+    if ( AccCache.state==EvictNext) str = std::string("EvictNext");
+
+    std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
+    std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
+    << "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
+    << "\t" << AccCache.cpuLock
+    << "\t" << AccCache.accLock
+    << "\t" << AccCache.LRU_valid<<std::endl;
+
+  } else {
+    std::cout << GridLogMessage << "No Entry in AccCache table." << std::endl; 
+  }
+}
+
 NAMESPACE_END(Grid);

 #endif
--- a/Grid/allocator/MemoryManagerShared.cc
+++ b/Grid/allocator/MemoryManagerShared.cc
@@ -16,6 +16,10 @@ uint64_t  MemoryManager::DeviceToHostXfer;
 void  MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
 void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
 int   MemoryManager::isOpen   (void* CpuPtr) { return 0;}
+void  MemoryManager::PrintState(void* CpuPtr)
+{
+std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl;
+};
 void  MemoryManager::Print(void){};
 void  MemoryManager::NotifyDeletion(void *ptr){};

--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -388,8 +388,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
    // TODO : make a OMP loop on CPU, call threaded bcopy
    void *shm = (void *) this->ShmBufferTranslate(dest,recv);
    assert(shm!=NULL);
+    //    std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl;
    acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
-    acceleratorCopySynchronise(); // MPI prob slower
  }

  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
@@ -400,6 +400,9 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
+  //   std::cout << "Copy Synchronised\n"<<std::endl;
+  acceleratorCopySynchronise();
+
  int nreq=list.size();

  if (nreq==0) return;
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@@ -46,3 +46,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_unary.h>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
+#include <Grid/lattice/Lattice_crc.h>
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -88,6 +88,13 @@ public:
    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
    accessor.ViewClose();
  }
+
+  // Helper function to print the state of this object in the AccCache
+  void PrintCacheState(void)
+  {
+    MemoryManager::PrintState(this->_odata);
+  }
+
  /////////////////////////////////////////////////////////////////////////////////
  // Return a view object that may be dereferenced in site loops.
  // The view is trivially copy constructible and may be copied to an accelerator device
--- a/Grid/lattice/Lattice_crc.h
+++ b/Grid/lattice/Lattice_crc.h
@@ -0,0 +1,42 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_crc.h
+
+    Copyright (C) 2021
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+template<class vobj> uint32_t crc(Lattice<vobj> & buf)
+{
+  autoView( buf_v , buf, CpuRead);
+  return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
+}
+
+#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
+
+NAMESPACE_END(Grid);
+
+
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -42,7 +42,6 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
  std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl;
  std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl;
  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl;
-  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << warpSize << std::endl;
  std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
  
  if (warpSize != WARP_SIZE) {
@@ -52,6 +51,10 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
  
  // let the number of threads in a block be a multiple of 2, starting from warpSize
  threads = warpSize;
+  if ( threads*sizeofsobj > sharedMemPerBlock ) {
+    std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
+    exit(EXIT_FAILURE);
+  }
  while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
  // keep all the streaming multiprocessors busy
  blocks = nextPow2(multiProcessorCount);
--- a/Grid/lattice/Lattice_rng.h
+++ b/Grid/lattice/Lattice_rng.h
@@ -32,8 +32,9 @@
 #include <random>

 #ifdef RNG_SITMO
-#include <Grid/sitmo_rng/sitmo_prng_engine.hpp>
+#include <Grid/random/sitmo_prng_engine.hpp>
 #endif 
+#include <Grid/random/gaussian.h>

 #if defined(RNG_SITMO)
 #define RNG_FAST_DISCARD
@@ -142,8 +143,8 @@ public:

  std::vector<RngEngine>                             _generators;
  std::vector<std::uniform_real_distribution<RealD> > _uniform;
-  std::vector<std::normal_distribution<RealD> >       _gaussian;
-  std::vector<std::discrete_distribution<int32_t> >   _bernoulli;
+  std::vector<Grid::gaussian_distribution<RealD> >       _gaussian;
+  //  std::vector<std::discrete_distribution<int32_t> >   _bernoulli;
  std::vector<std::uniform_int_distribution<uint32_t> > _uid;

  ///////////////////////
@@ -243,8 +244,8 @@ public:
  GridSerialRNG() : GridRNGbase() {
    _generators.resize(1);
    _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
-    _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
-    _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
+    _gaussian.resize(1,gaussian_distribution<RealD>(0.0,1.0) );
+    //    _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
    _uid.resize(1,std::uniform_int_distribution<uint32_t>() );
  }

@@ -357,8 +358,8 @@ public:

    _generators.resize(_vol);
    _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
-    _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
-    _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
+    _gaussian.resize(_vol,gaussian_distribution<RealD>(0.0,1.0) );
+    //    _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
    _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
  }

@@ -515,11 +516,11 @@ public:

 template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l)   { rng.fill(l,rng._uniform);  }
 template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); }
-template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}
+//template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}

 template <class sobj> inline void random(GridSerialRNG &rng,sobj &l)   { rng.fill(l,rng._uniform  ); }
 template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); }
-template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }
+//template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }

 NAMESPACE_END(Grid);
 #endif
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -85,6 +85,76 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
  });
 }

+template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0)
+{
+  half.Checkerboard() = cb;
+  autoView(half_v, half, AcceleratorWrite);
+  autoView(full_v, full, AcceleratorRead);
+  Coordinate rdim_full             = full.Grid()->_rdimensions;
+  Coordinate rdim_half             = half.Grid()->_rdimensions;
+  unsigned long ndim_half          = half.Grid()->_ndimension;
+  Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
+  Coordinate ostride_half          = half.Grid()->_ostride;
+  accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
+    
+    Coordinate coor;
+    int cbos;
+    int linear=0;
+
+    Lexicographic::CoorFromIndex(coor,ss,rdim_full);
+    assert(coor.size()==ndim_half);
+
+    for(int d=0;d<ndim_half;d++){ 
+      if(checker_dim_mask_half[d]) linear += coor[d];
+    }
+    cbos = (linear&0x1);
+
+    if (cbos==cb) {
+      int ssh=0;
+      for(int d=0;d<ndim_half;d++) {
+        if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
+        else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
+      }
+      coalescedWrite(half_v[ssh],full_v(ss));
+    }
+  });
+}
+template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0)
+{
+  int cb = half.Checkerboard();
+  autoView(half_v , half, AcceleratorRead);
+  autoView(full_v , full, AcceleratorWrite);
+  Coordinate rdim_full             = full.Grid()->_rdimensions;
+  Coordinate rdim_half             = half.Grid()->_rdimensions;
+  unsigned long ndim_half          = half.Grid()->_ndimension;
+  Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
+  Coordinate ostride_half          = half.Grid()->_ostride;
+  accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
+
+    Coordinate coor;
+    int cbos;
+    int linear=0;
+  
+    Lexicographic::CoorFromIndex(coor,ss,rdim_full);
+    assert(coor.size()==ndim_half);
+
+    for(int d=0;d<ndim_half;d++){ 
+      if(checker_dim_mask_half[d]) linear += coor[d];
+    }
+    cbos = (linear&0x1);
+
+    if (cbos==cb) {
+      int ssh=0;
+      for(int d=0;d<ndim_half;d++){
+        if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
+        else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
+      }
+      coalescedWrite(full_v[ss],half_v(ssh));
+    }
+
+  });
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Flexible Type Conversion for internal promotion to double as well as graceful
 // treatment of scalar-compatible types
@@ -785,7 +855,7 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int


 template<class vobj>
-void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
+void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
 {
  typedef typename vobj::scalar_object sobj;

@@ -1010,54 +1080,96 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
  });
 }

-//Convert a Lattice from one precision to another
-template<class VobjOut, class VobjIn>
-void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
-{
-  assert(out.Grid()->Nd() == in.Grid()->Nd());
-  for(int d=0;d<out.Grid()->Nd();d++){
-    assert(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
+//The workspace for a precision change operation allowing for the reuse of the mapping to save time on subsequent calls
+class precisionChangeWorkspace{
+  std::pair<Integer,Integer>* fmap_device; //device pointer
+public:
+  precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid){
+    //Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
+    assert(out_grid->Nd() == in_grid->Nd());
+    for(int d=0;d<out_grid->Nd();d++){
+      assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
    }
-  out.Checkerboard() = in.Checkerboard();
-  GridBase *in_grid=in.Grid();
-  GridBase *out_grid = out.Grid();
+    int Nsimd_out = out_grid->Nsimd();

-  typedef typename VobjOut::scalar_object SobjOut;
-  typedef typename VobjIn::scalar_object SobjIn;
+    std::vector<Coordinate> out_icorrs(out_grid->Nsimd()); //reuse these
+    for(int lane=0; lane < out_grid->Nsimd(); lane++)
+      out_grid->iCoorFromIindex(out_icorrs[lane], lane);
  
-  int ndim = out.Grid()->Nd();
-  int out_nsimd = out_grid->Nsimd();
-    
-  std::vector<Coordinate > out_icoor(out_nsimd);
-      
-  for(int lane=0; lane < out_nsimd; lane++){
-    out_icoor[lane].resize(ndim);
-    out_grid->iCoorFromIindex(out_icoor[lane], lane);
-  }
-        
-  std::vector<SobjOut> in_slex_conv(in_grid->lSites());
-  unvectorizeToLexOrdArray(in_slex_conv, in);
-    
-  autoView( out_v , out, CpuWrite);
+    std::vector<std::pair<Integer,Integer> > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd
    thread_for(out_oidx,out_grid->oSites(),{
-    Coordinate out_ocoor(ndim);
-    out_grid->oCoorFromOindex(out_ocoor, out_oidx);
+	Coordinate out_ocorr; 
+	out_grid->oCoorFromOindex(out_ocorr, out_oidx);
      
-    ExtractPointerArray<SobjOut> ptrs(out_nsimd);      
+	Coordinate lcorr; //the local coordinate (common to both in and out as full coordinate)
+	for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
+	  out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr);
 	
-    Coordinate lcoor(out_grid->Nd());
-      
-    for(int lane=0; lane < out_nsimd; lane++){
-      for(int mu=0;mu<ndim;mu++)
-	lcoor[mu] = out_ocoor[mu] + out_grid->_rdimensions[mu]*out_icoor[lane][mu];
-	
-      int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions);
-      ptrs[lane] = &in_slex_conv[llex];
+	  //int in_oidx = in_grid->oIndex(lcorr), in_lane = in_grid->iIndex(lcorr);
+	  //Note oIndex and oCoorFromOindex (and same for iIndex) are not inverses for a checkerboarded lattice, the former coordinates being defined on the full lattice and the latter on the reduced lattice
+	  //Until this is fixed we need to circumvent the problem locally. Here I will use the coordinates defined on the reduced lattice for simplicity
+	  int in_oidx = 0, in_lane = 0;
+	  for(int d=0;d<in_grid->_ndimension;d++){
+	    in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] );
+	    in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] );
+	  }
+	  fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair<Integer,Integer>( in_oidx, in_lane );
+	}
+      });
+
+    //Copy the map to the device (if we had a way to tell if an accelerator is in use we could avoid this copy for CPU-only machines)
+    size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair<Integer,Integer>);
+    fmap_device = (std::pair<Integer,Integer>*)acceleratorAllocDevice(fmap_bytes);
+    acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes); 
+  }
+
+  //Prevent moving or copying
+  precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete;
+  precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete;
+  precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete;
+  precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete;
+  
+  std::pair<Integer,Integer> const* getMap() const{ return fmap_device; }
+
+  ~precisionChangeWorkspace(){
+    acceleratorFreeDevice(fmap_device);
+  }
+};
+
+
+//Convert a lattice of one precision to another. The input workspace contains the mapping data.
+template<class VobjOut, class VobjIn>
+void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
+  static_assert( std::is_same<typename VobjOut::DoublePrecision, typename VobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
+
+  out.Checkerboard() = in.Checkerboard();
+  constexpr int Nsimd_out = VobjOut::Nsimd();
+
+  std::pair<Integer,Integer> const* fmap_device = workspace.getMap();
+
+  //Do the copy/precision change
+  autoView( out_v , out, AcceleratorWrite);
+  autoView( in_v , in, AcceleratorRead);
+
+  accelerator_for(out_oidx, out.Grid()->oSites(), 1,{
+      std::pair<Integer,Integer> const* fmap_osite = fmap_device + out_oidx*Nsimd_out;
+      for(int out_lane=0; out_lane < Nsimd_out; out_lane++){      
+	int in_oidx = fmap_osite[out_lane].first;
+	int in_lane = fmap_osite[out_lane].second;
+	copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane);
      }
-    merge(out_v[out_oidx], ptrs, 0);
    });
 }

+//Convert a Lattice from one precision to another
+//Generate the workspace in place; if multiple calls with the same mapping are performed, consider pregenerating the workspace and reusing
+template<class VobjOut, class VobjIn>
+void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
+  precisionChangeWorkspace workspace(out.Grid(), in.Grid());
+  precisionChange(out, in, workspace);
+}
+
+
 ////////////////////////////////////////////////////////////////////////////////
 // Communicate between grids
 ////////////////////////////////////////////////////////////////////////////////
--- a/Grid/log/Log.cc
+++ b/Grid/log/Log.cc
@@ -69,6 +69,7 @@ GridLogger GridLogDebug  (1, "Debug", GridLogColours, "PURPLE");
 GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
 GridLogger GridLogIterative  (1, "Iterative", GridLogColours, "BLUE");
 GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
+GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE");

 void GridLogConfigure(std::vector<std::string> &logstreams) {
  GridLogError.Active(0);
@@ -79,6 +80,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
  GridLogPerformance.Active(0);
  GridLogIntegrator.Active(1);
  GridLogColours.Active(0);
+  GridLogHMC.Active(1);

  for (int i = 0; i < logstreams.size(); i++) {
    if (logstreams[i] == std::string("Error"))       GridLogError.Active(1);
@@ -87,7 +89,8 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
    if (logstreams[i] == std::string("Iterative"))   GridLogIterative.Active(1);
    if (logstreams[i] == std::string("Debug"))       GridLogDebug.Active(1);
    if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
-    if (logstreams[i] == std::string("Integrator"))  GridLogIntegrator.Active(1);
+    if (logstreams[i] == std::string("NoIntegrator"))  GridLogIntegrator.Active(0);
+    if (logstreams[i] == std::string("NoHMC"))         GridLogHMC.Active(0);
    if (logstreams[i] == std::string("Colours"))     GridLogColours.Active(1);
  }
 }
--- a/Grid/log/Log.h
+++ b/Grid/log/Log.h
@@ -182,6 +182,7 @@ extern GridLogger GridLogDebug  ;
 extern GridLogger GridLogPerformance;
 extern GridLogger GridLogIterative  ;
 extern GridLogger GridLogIntegrator  ;
+extern GridLogger GridLogHMC;
 extern Colours    GridLogColours;

 std::string demangle(const char* name) ;
--- a/Grid/parallelIO/IldgIO.h
+++ b/Grid/parallelIO/IldgIO.h
@@ -576,6 +576,8 @@ class ScidacReader : public GridLimeReader {
    std::string rec_name(ILDG_BINARY_DATA);
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
      if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) )  ) {
+  // in principle should do the line below, but that breaks backward compatibility with old data
+  // skipPastObjectRecord(std::string(GRID_FIELD_NORM));
 	skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
 	return;
      }
--- a/Grid/parallelIO/NerscIO.h
+++ b/Grid/parallelIO/NerscIO.h
@@ -39,9 +39,11 @@ using namespace Grid;
 ////////////////////////////////////////////////////////////////////////////////
 class NerscIO : public BinaryIO { 
 public:
-
  typedef Lattice<vLorentzColourMatrixD> GaugeField;

+  // Flag (default true) enabling the header-vs-computed plaquette check on read; returned by reference, so assign to the result to disable/enable it
+  static bool & exitOnReadPlaquetteMismatch(){ static bool exit_on_mismatch=true; return exit_on_mismatch; }
+
  static inline void truncate(std::string file){
    std::ofstream fout(file,std::ios::out);
  }
@@ -198,7 +200,7 @@ public:
      std::cerr << " nersc_csum  " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
      exit(0);
    }
-    assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
+    if(exitOnReadPlaquetteMismatch()) assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
    assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
    assert(nersc_csum == header.checksum );
      
--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@@ -63,6 +63,7 @@ static constexpr int Ngp=2; // gparity index range
 #define ColourIndex  (2)
 #define SpinIndex    (1)
 #define LorentzIndex (0)
+#define GparityFlavourIndex (0)

 // Also should make these a named enum type
 static constexpr int DaggerNo=0;
@@ -87,6 +88,8 @@ template<typename T> struct isCoarsened {
 template <typename T> using IfCoarsened    = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
 template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;

+const int GparityFlavourTensorIndex = 3; //TensorLevel counts from the bottom!
+
 // ChrisK very keen to add extra space for Gparity doubling.
 //
 // Also add domain wall index, in a way where Wilson operator 
@@ -110,8 +113,10 @@ template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVec
    template<typename vtype> using iSpinColourSpinColourMatrix  = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;


+template<typename vtype> using iGparityFlavourVector                = iVector<iScalar<iScalar<vtype> >, Ngp>;
 template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
 template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
+template<typename vtype> using iGparityFlavourMatrix = iMatrix<iScalar<iScalar<vtype> >, Ngp>;

 // Spin matrix
 typedef iSpinMatrix<Complex  >          SpinMatrix;
@@ -176,6 +181,16 @@ typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
 typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
 typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;

+//G-parity flavour matrix
+typedef iGparityFlavourMatrix<Complex> GparityFlavourMatrix;
+typedef iGparityFlavourMatrix<ComplexF> GparityFlavourMatrixF;
+typedef iGparityFlavourMatrix<ComplexD> GparityFlavourMatrixD;
+
+typedef iGparityFlavourMatrix<vComplex> vGparityFlavourMatrix;
+typedef iGparityFlavourMatrix<vComplexF> vGparityFlavourMatrixF;
+typedef iGparityFlavourMatrix<vComplexD> vGparityFlavourMatrixD;
+
+
 // Spin vector
 typedef iSpinVector<Complex >           SpinVector;
 typedef iSpinVector<ComplexF>           SpinVectorF;
@@ -221,6 +236,16 @@ typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
 typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
 typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;

+//G-parity flavour vector
+typedef iGparityFlavourVector<Complex >         GparityFlavourVector;
+typedef iGparityFlavourVector<ComplexF>         GparityFlavourVectorF;
+typedef iGparityFlavourVector<ComplexD>         GparityFlavourVectorD;
+
+typedef iGparityFlavourVector<vComplex >         vGparityFlavourVector;
+typedef iGparityFlavourVector<vComplexF>         vGparityFlavourVectorF;
+typedef iGparityFlavourVector<vComplexD>         vGparityFlavourVectorD;
+
+    
 // singlets
 typedef iSinglet<Complex >         TComplex;     // FIXME This is painful. Tensor singlet complex type.
 typedef iSinglet<ComplexF>         TComplexF;    // FIXME This is painful. Tensor singlet complex type.
--- a/Grid/qcd/action/ActionParams.h
+++ b/Grid/qcd/action/ActionParams.h
@@ -36,7 +36,8 @@ NAMESPACE_BEGIN(Grid);

 // These can move into a params header and be given MacroMagic serialisation
 struct GparityWilsonImplParams {
-  Coordinate twists;
+  Coordinate twists; //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
+                     //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
  GparityWilsonImplParams() : twists(Nd, 0) {};
 };
  
@@ -65,7 +66,8 @@ struct StaggeredImplParams {
 				    RealD, tolerance, 
 				    int,   degree, 
 				    int,   precision,
-				    int,   BoundsCheckFreq);
+				    int,   BoundsCheckFreq,
+				    RealD, BoundsCheckTol);
    
  // MaxIter and tolerance, vectors??
    
@@ -76,16 +78,62 @@ struct StaggeredImplParams {
 				RealD tol      = 1.0e-8, 
                           	int _degree    = 10,
 				int _precision = 64,
-				int _BoundsCheckFreq=20)
+				int _BoundsCheckFreq=20,
+				double _BoundsCheckTol=1e-6)
      : lo(_lo),
 	hi(_hi),
 	MaxIter(_maxit),
 	tolerance(tol),
 	degree(_degree),
        precision(_precision),
+        BoundsCheckFreq(_BoundsCheckFreq),
+        BoundsCheckTol(_BoundsCheckTol){};
+  };
+
+
+  /*Action parameters for the generalized rational action
+    The approximation is for (M^dag M)^{1/inv_pow}
+    where inv_pow is the denominator of the fractional power.
+    Default inv_pow=2 for square root, making this equivalent to 
+    the OneFlavourRational action
+  */
+    struct RationalActionParams : Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(RationalActionParams, 
+				    int, inv_pow, //denominator of the fractional power: the approximation is for (M^dag M)^{1/inv_pow}
+				    RealD, lo, //low eigenvalue bound of rational approx
+				    RealD, hi, //high eigenvalue bound of rational approx
+				    int,   MaxIter,  //maximum iterations in msCG
+				    RealD, action_tolerance,  //msCG tolerance in action evaluation
+				    int,   action_degree, //rational approx degree in action evaluation
+				    RealD, md_tolerance,  //msCG tolerance in MD integration
+				    int,   md_degree, //rational approx degree in MD integration
+				    int,   precision, //precision of floating point arithmetic
+				    int,   BoundsCheckFreq); //frequency the approximation is tested (with Metropolis degree/tolerance); 0 disables the check
+  // constructor: every argument has a default and initialises the member of the same name
+  RationalActionParams(int _inv_pow = 2,
+		       RealD _lo      = 0.0, 
+		       RealD _hi      = 1.0, 
+		       int _maxit     = 1000,
+		       RealD _action_tolerance      = 1.0e-8, 
+		       int _action_degree    = 10,
+		       RealD _md_tolerance      = 1.0e-8, 
+		       int _md_degree    = 10,
+		       int _precision = 64,
+		       int _BoundsCheckFreq=20)
+    : inv_pow(_inv_pow), 
+      lo(_lo),
+      hi(_hi),
+      MaxIter(_maxit),
+      action_tolerance(_action_tolerance),
+      action_degree(_action_degree),
+      md_tolerance(_md_tolerance),
+      md_degree(_md_degree),
+      precision(_precision),
      BoundsCheckFreq(_BoundsCheckFreq){};
  };

+
+  
 NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h
@@ -0,0 +1,240 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion.h
+
+    Copyright (C) 2020 - 2022
+
+    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
+    Author: Nils Meyer <nils.meyer@ur.de>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+/*  END LEGAL */
+
+#pragma once
+
+#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
+#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// see Grid/qcd/action/fermion/WilsonCloverFermion.h for description
+//
+// Modifications done here:
+//
+// Original: clover term = 12x12 matrix per site
+//
+// But: Only two diagonal 6x6 hermitian blocks are non-zero (also true for original, verified by running)
+// Sufficient to store/transfer only the real parts of the diagonal and one triangular part
+// 2 * (6 + 15 * 2) = 72 real or 36 complex words to be stored/transferred
+//
+// Here: Above but diagonal as complex numbers, i.e., need to store/transfer
+// 2 * (6 * 2 + 15 * 2) = 84 real or 42 complex words
+//
+// Words per site and improvement compared to original (combined with the input and output spinors):
+//
+// - Original: 2*12 + 12*12 = 168 words -> 1.00 x less
+// - Minimal:  2*12 + 36    =  60 words -> 2.80 x less
+// - Here:     2*12 + 42    =  66 words -> 2.55 x less
+//
+// These improvements directly translate to wall-clock time
+//
+// Data layout:
+//
+// - diagonal and triangle part as separate lattice fields,
+//   this was faster than as 1 combined field on all tested machines
+// - diagonal: as expected
+// - triangle: store upper right triangle in row major order
+// - graphical:
+//        0  1  2  3  4
+//           5  6  7  8
+//              9 10 11 = upper right triangle indices
+//                12 13
+//                   14
+//     0
+//        1
+//           2
+//              3       = diagonal indices
+//                 4
+//                    5
+//     0
+//     1  5
+//     2  6  9          = lower left triangle indices
+//     3  7 10 12
+//     4  8 11 13 14
+//
+// Impact on total memory consumption:
+// - Original: (2 * 1 + 8 * 1/2) 12x12 matrices = 6 12x12 matrices = 864 complex words per site
+// - Here:     (2 * 1 + 4 * 1/2) diagonal parts = 4 diagonal parts =  24 complex words per site
+//           + (2 * 1 + 4 * 1/2) triangle parts = 4 triangle parts =  60 complex words per site
+//                                                                 =  84 complex words per site
+
+template<class Impl>
+class CompactWilsonCloverFermion : public WilsonFermion<Impl>,
+                                   public WilsonCloverHelpers<Impl>,
+                                   public CompactWilsonCloverHelpers<Impl> {
+  /////////////////////////////////////////////
+  // Sizes
+  /////////////////////////////////////////////
+
+public:
+
+  INHERIT_COMPACT_CLOVER_SIZES(Impl);
+
+  /////////////////////////////////////////////
+  // Type definitions
+  /////////////////////////////////////////////
+
+public:
+
+  INHERIT_IMPL_TYPES(Impl);
+  INHERIT_CLOVER_TYPES(Impl);
+  INHERIT_COMPACT_CLOVER_TYPES(Impl);
+
+  typedef WilsonFermion<Impl>              WilsonBase;
+  typedef WilsonCloverHelpers<Impl>        Helpers;
+  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
+
+  /////////////////////////////////////////////
+  // Constructors
+  /////////////////////////////////////////////
+
+public:
+
+  CompactWilsonCloverFermion(GaugeField& _Umu,
+			    GridCartesian& Fgrid,
+			    GridRedBlackCartesian& Hgrid,
+			    const RealD _mass,
+			    const RealD _csw_r = 0.0,
+			    const RealD _csw_t = 0.0,
+			    const RealD _cF = 1.0,
+			    const WilsonAnisotropyCoefficients& clover_anisotropy = WilsonAnisotropyCoefficients(),
+			    const ImplParams& impl_p = ImplParams());
+
+  /////////////////////////////////////////////
+  // Member functions (implementing interface)
+  /////////////////////////////////////////////
+
+public:
+
+  virtual void Instantiatable() {};
+  int          ConstEE()     override { return 0; };
+  int          isTrivialEE() override { return 0; };
+
+  void Dhop(const FermionField& in, FermionField& out, int dag) override;
+
+  void DhopOE(const FermionField& in, FermionField& out, int dag) override;
+
+  void DhopEO(const FermionField& in, FermionField& out, int dag) override;
+
+  void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
+
+  void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;
+
+  void M(const FermionField& in, FermionField& out) override;
+
+  void Mdag(const FermionField& in, FermionField& out) override;
+
+  void Meooe(const FermionField& in, FermionField& out) override;
+
+  void MeooeDag(const FermionField& in, FermionField& out) override;
+
+  void Mooee(const FermionField& in, FermionField& out) override;
+
+  void MooeeDag(const FermionField& in, FermionField& out) override;
+
+  void MooeeInv(const FermionField& in, FermionField& out) override;
+
+  void MooeeInvDag(const FermionField& in, FermionField& out) override;
+
+  void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
+
+  void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;
+
+  void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
+
+  void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
+
+  void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
+
+  /////////////////////////////////////////////
+  // Member functions (internals)
+  /////////////////////////////////////////////
+
+  void MooeeInternal(const FermionField&        in,
+                     FermionField&              out,
+                     const CloverDiagonalField& diagonal,
+                     const CloverTriangleField& triangle);
+
+  /////////////////////////////////////////////
+  // Helpers
+  /////////////////////////////////////////////
+
+  void ImportGauge(const GaugeField& _Umu) override;
+
+  /////////////////////////////////////////////
+  // Helpers
+  /////////////////////////////////////////////
+
+private:
+
+  // Returns a pointer to the boundary mask member that corresponds to the grid/checkerboard of 'in'
+  template<class Field>
+  const MaskField* getCorrectMaskField(const Field &in) const {
+    // Not a checkerboarded grid: use BoundaryMask (Checkerboard() is not consulted)
+    if(!in.Grid()->_isCheckerBoarded) {
+      return &this->BoundaryMask;
+    }
+    // Checkerboarded grid: Odd selects BoundaryMaskOdd, anything else BoundaryMaskEven
+    const bool onOddSites = (in.Checkerboard() == Odd);
+    return onOddSites ? &this->BoundaryMaskOdd
+                      : &this->BoundaryMaskEven;
+  }
+
+  template<class Field>
+  void ApplyBoundaryMask(Field& f) { // forwards f and the mask chosen for it to CompactHelpers::ApplyBoundaryMask
+    const MaskField* m = getCorrectMaskField(f); // mask member selected from f's grid/checkerboard
+    assert(m != nullptr); // checked once; the pointer is dereferenced on the next line
+    CompactHelpers::ApplyBoundaryMask(f, *m);
+  }
+
+  /////////////////////////////////////////////
+  // Member Data
+  /////////////////////////////////////////////
+
+public:
+
+  RealD csw_r;
+  RealD csw_t;
+  RealD cF;
+
+  bool open_boundaries;
+
+  CloverDiagonalField Diagonal,    DiagonalEven,    DiagonalOdd;
+  CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
+
+  CloverTriangleField Triangle,    TriangleEven,    TriangleOdd;
+  CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
+
+  FermionField Tmp;
+
+  MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
+};
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/Fermion.h
+++ b/Grid/qcd/action/fermion/Fermion.h
@@ -53,6 +53,7 @@ NAMESPACE_CHECK(Wilson);
 #include <Grid/qcd/action/fermion/WilsonTMFermion.h>       // 4d wilson like
 NAMESPACE_CHECK(WilsonTM);
 #include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
+#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
 NAMESPACE_CHECK(WilsonClover);
 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
 NAMESPACE_CHECK(Wilson5D);
@@ -153,6 +154,23 @@ typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoInd
 typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
 typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;

+// Compact Clover fermions
+typedef CompactWilsonCloverFermion<WilsonImplR> CompactWilsonCloverFermionR;
+typedef CompactWilsonCloverFermion<WilsonImplF> CompactWilsonCloverFermionF;
+typedef CompactWilsonCloverFermion<WilsonImplD> CompactWilsonCloverFermionD;
+
+typedef CompactWilsonCloverFermion<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
+typedef CompactWilsonCloverFermion<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
+typedef CompactWilsonCloverFermion<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
+
+typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
+typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
+typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
+
+typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
+typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
+typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
+
 // Domain Wall fermions
 typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
 typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
--- a/Grid/qcd/action/fermion/GparityWilsonImpl.h
+++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h
@@ -30,6 +30,18 @@ directory

 NAMESPACE_BEGIN(Grid);

+/*
+  Policy implementation for G-parity boundary conditions
+
+  Rather than treating the gauge field as a flavored field, the Grid implementation of G-parity treats the gauge field as a regular
+  field with complex conjugate boundary conditions. In order to ensure the second flavor interacts with the conjugate links and the first
+  with the regular links we overload the functionality of doubleStore, whose purpose is to store the gauge field and the barrel-shifted gauge field
+  to avoid communicating links when applying the Dirac operator, such that the double-stored field contains also a flavor index which maps to
+  either the link or the conjugate link. This flavored field is then used by multLink to apply the correct link to a spinor.
+
+  Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
+  mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
+ */
 template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
 class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
 public:
@@ -113,7 +125,7 @@ public:
    || ((distance== 1)&&(icoor[direction]==1))
    || ((distance==-1)&&(icoor[direction]==0));

-    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world
+    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu] && mmu < Nd-1; //only if we are going around the world in a spatial direction

    //Apply the links
    int f_upper = permute_lane ? 1 : 0;
@@ -139,10 +151,10 @@ public:
    assert((distance == 1) || (distance == -1));  // nearest neighbour stencil hard code
    assert((sl == 1) || (sl == 2));

-    if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
-
+    //If this site is an global boundary site, perform the G-parity flavor twist
+    if ( mmu < Nd-1 && SE->_around_the_world && St.parameters.twists[mmu] ) {
      if ( sl == 2 ) {
-       
+	//Only do the twist for lanes on the edge of the physical node
 	ExtractBuffer<sobj> vals(Nsimd);

 	extract(chi,vals);
@@ -197,6 +209,19 @@ public:
    reg = memory;
  }

+
+  //Fill slot mu of the doubled gauge field Uds site by site: flavor 0 receives 'poke_f0', flavor 1 receives 'poke_f1'
+  inline void pokeGparityDoubledGaugeField(DoubledGaugeField &Uds, const GaugeLinkField &poke_f0, const GaugeLinkField &poke_f1, const int mu){
+    autoView(flav0_v, poke_f0, CpuRead);
+    autoView(flav1_v, poke_f1, CpuRead);
+    autoView(doubled_v, Uds, CpuWrite);
+    thread_foreach(site,flav0_v,{
+	doubled_v[site](1)(mu) = flav1_v[site]();
+	doubled_v[site](0)(mu) = flav0_v[site]();
+      });
+  }
+    
+
  inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
  {
    conformable(Uds.Grid(),GaugeGrid);
@@ -208,13 +233,18 @@ public:
   
    Lattice<iScalar<vInteger> > coor(GaugeGrid);

-    for(int mu=0;mu<Nd;mu++){
+    //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
+    //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs        
+    for(int mu=0;mu<Nd-1;mu++){

+      if( Params.twists[mu] ){
 	LatticeCoordinate(coor,mu);
+      }
          
      U     = PeekIndex<LorentzIndex>(Umu,mu);
      Uconj = conjugate(U);
     
+      // Implement the isospin rotation sign on the boundary between f=1 and f=0
      // This phase could come from a simple bc 1,1,-1,1 ..
      int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
      if ( Params.twists[mu] ) { 
@@ -260,6 +290,38 @@ public:
        });
      }
    }
+
+    { //periodic / antiperiodic temporal BCs
+      int mu = Nd-1;
+      int L   = GaugeGrid->GlobalDimensions()[mu];
+      int Lmu = L - 1;
+
+      LatticeCoordinate(coor, mu);
+
+      U = PeekIndex<LorentzIndex>(Umu, mu); //Get t-directed links
+      
+      GaugeLinkField *Upoke = &U;
+
+      if(Params.twists[mu]){ //antiperiodic
+	Utmp =  where(coor == Lmu, -U, U);
+	Upoke = &Utmp;
+      }
+    
+      Uconj = conjugate(*Upoke); //second flavor interacts with conjugate links      
+      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu);
+
+      //Get the barrel-shifted field
+      Utmp = adj(Cshift(U, mu, -1)); //is a forward shift!
+      Upoke = &Utmp;
+
+      if(Params.twists[mu]){
+	U = where(coor == 0, -Utmp, Utmp);  //boundary phase
+	Upoke = &U;
+      }
+      
+      Uconj = conjugate(*Upoke);
+      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4);
+    }
  }
      
  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
@@ -300,27 +362,47 @@ public:
  }
 
  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
-
    int Ls=Btilde.Grid()->_fdimensions[0];
    
-    GaugeLinkField tmp(mat.Grid());
-    tmp = Zero();
    {
-      autoView( tmp_v , tmp, CpuWrite);
-      autoView( Atilde_v , Atilde, CpuRead);
-      autoView( Btilde_v , Btilde, CpuRead);
-      thread_for(ss,tmp.Grid()->oSites(),{
-	  for (int s = 0; s < Ls; s++) {
-	    int sF = s + Ls * ss;
-	    auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
-	    tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
+      GridBase *GaugeGrid = mat.Grid();
+      Lattice<iScalar<vInteger> > coor(GaugeGrid); //NOTE(review): coor is only written (LatticeCoordinate below) and never read in this function - confirm it is still needed
+
+      if( Params.twists[mu] ){
+	LatticeCoordinate(coor,mu);
      }
+
+      autoView( mat_v , mat, AcceleratorWrite);
+      autoView( Btilde_v , Btilde, AcceleratorRead);
+      autoView( Atilde_v , Atilde, AcceleratorRead);
+      accelerator_for(sss,mat.Grid()->oSites(), FermionField::vector_type::Nsimd(),{	  //loop over the outer sites of the gauge grid
+  	  int sU=sss;
+  	  typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType; //decltype only: names the type of the mu-component, mat_v is not actually read
+  	  ColorMatrixType sum;
+  	  zeroit(sum);
+  	  for(int s=0;s<Ls;s++){ //sum over the Ls entries stored for this gauge site
+  	    int sF = s+Ls*sU; //index into the fermion fields: s runs fastest
+  	    for(int spn=0;spn<Ns;spn++){ //sum over spin
+	      //Flavor 0
+  	      auto bb = coalescedRead(Btilde_v[sF](0)(spn) ); //color vector
+  	      auto aa = coalescedRead(Atilde_v[sF](0)(spn) );
+  	      sum = sum + outerProduct(bb,aa);
+
+  	      //Flavor 1
+  	      bb = coalescedRead(Btilde_v[sF](1)(spn) );
+  	      aa = coalescedRead(Atilde_v[sF](1)(spn) );
+  	      sum = sum + conjugate(outerProduct(bb,aa)); //the flavor-1 outer product is complex conjugated before being accumulated
+  	    }
+  	  }	    
+  	  coalescedWrite(mat_v[sU](mu)(), sum); //store the accumulated sum in component mu of mat at this site
  	});
    }
-    PokeIndex<LorentzIndex>(mat, tmp, mu);
-    return;
  }

+
+  
+
+  
 };

 typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffReal> GparityWilsonImplR;  // Real.. whichever prec
--- a/Grid/qcd/action/fermion/WilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h
@@ -4,10 +4,11 @@

    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.h

-    Copyright (C) 2017
+    Copyright (C) 2017 - 2022

    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: David Preti <>
+    Author: Daniel Richtmann <daniel.richtmann@gmail.com>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -29,7 +30,8 @@

 #pragma once

-#include <Grid/Grid.h>
+#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
+#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>

 NAMESPACE_BEGIN(Grid);

@@ -50,18 +52,15 @@ NAMESPACE_BEGIN(Grid);
 //////////////////////////////////////////////////////////////////

 template <class Impl>
-class WilsonCloverFermion : public WilsonFermion<Impl>
+class WilsonCloverFermion : public WilsonFermion<Impl>,
+                            public WilsonCloverHelpers<Impl>
 {
 public:
-  // Types definitions
  INHERIT_IMPL_TYPES(Impl);
-  template <typename vtype>
-  using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
-  typedef iImplClover<Simd> SiteCloverType;
-  typedef Lattice<SiteCloverType> CloverFieldType;
+  INHERIT_CLOVER_TYPES(Impl);

-public:
  typedef WilsonFermion<Impl>       WilsonBase;
+  typedef WilsonCloverHelpers<Impl> Helpers;

  virtual int    ConstEE(void)     { return 0; };
  virtual void Instantiatable(void){};
@@ -72,42 +71,7 @@ public:
                      const RealD _csw_r = 0.0,
                      const RealD _csw_t = 0.0,
                      const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(),
-                      const ImplParams &impl_p = ImplParams()) : WilsonFermion<Impl>(_Umu,
-                                                                                     Fgrid,
-                                                                                     Hgrid,
-                                                                                     _mass, impl_p, clover_anisotropy),
-                                                                 CloverTerm(&Fgrid),
-                                                                 CloverTermInv(&Fgrid),
-                                                                 CloverTermEven(&Hgrid),
-                                                                 CloverTermOdd(&Hgrid),
-                                                                 CloverTermInvEven(&Hgrid),
-                                                                 CloverTermInvOdd(&Hgrid),
-                                                                 CloverTermDagEven(&Hgrid),
-                                                                 CloverTermDagOdd(&Hgrid),
-                                                                 CloverTermInvDagEven(&Hgrid),
-                                                                 CloverTermInvDagOdd(&Hgrid)
-  {
-    assert(Nd == 4); // require 4 dimensions
-
-    if (clover_anisotropy.isAnisotropic)
-    {
-      csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0;
-      diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
-    }
-    else
-    {
-      csw_r = _csw_r * 0.5;
-      diag_mass = 4.0 + _mass;
-    }
-    csw_t = _csw_t * 0.5;
-
-    if (csw_r == 0)
-      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
-    if (csw_t == 0)
-      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
-
-    ImportGauge(_Umu);
-  }
+                      const ImplParams &impl_p = ImplParams());

  virtual void M(const FermionField &in, FermionField &out);
  virtual void Mdag(const FermionField &in, FermionField &out);
@@ -124,250 +88,21 @@ public:
  void ImportGauge(const GaugeField &_Umu);

  // Derivative parts unpreconditioned pseudofermions
-  void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
-  {
-    conformable(X.Grid(), Y.Grid());
-    conformable(X.Grid(), force.Grid());
-    GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
-    GaugeField clover_force(force.Grid());
-    PropagatorField Lambda(force.Grid());
+  void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag);

-    // Guido: Here we are hitting some performance issues:
-    // need to extract the components of the DoubledGaugeField
-    // for each call
-    // Possible solution
-    // Create a vector object to store them? (cons: wasting space)
-    std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
-
-    Impl::extractLinkField(U, this->Umu);
-
-    force = Zero();
-    // Derivative of the Wilson hopping term
-    this->DhopDeriv(force, X, Y, dag);
-
-    ///////////////////////////////////////////////////////////
-    // Clover term derivative
-    ///////////////////////////////////////////////////////////
-    Impl::outerProductImpl(Lambda, X, Y);
-    //std::cout << "Lambda:" << Lambda << std::endl;
-
-    Gamma::Algebra sigma[] = {
-        Gamma::Algebra::SigmaXY,
-        Gamma::Algebra::SigmaXZ,
-        Gamma::Algebra::SigmaXT,
-        Gamma::Algebra::MinusSigmaXY,
-        Gamma::Algebra::SigmaYZ,
-        Gamma::Algebra::SigmaYT,
-        Gamma::Algebra::MinusSigmaXZ,
-        Gamma::Algebra::MinusSigmaYZ,
-        Gamma::Algebra::SigmaZT,
-        Gamma::Algebra::MinusSigmaXT,
-        Gamma::Algebra::MinusSigmaYT,
-        Gamma::Algebra::MinusSigmaZT};
-
-    /*
-      sigma_{\mu \nu}=
-      | 0         sigma[0]  sigma[1]  sigma[2] |
-      | sigma[3]    0       sigma[4]  sigma[5] |
-      | sigma[6]  sigma[7]     0      sigma[8] |
-      | sigma[9]  sigma[10] sigma[11]   0      |
-    */
-
-    int count = 0;
-    clover_force = Zero();
-    for (int mu = 0; mu < 4; mu++)
-    {
-      force_mu = Zero();
-      for (int nu = 0; nu < 4; nu++)
-      {
-        if (mu == nu)
-        continue;
-        
-        RealD factor;
-        if (nu == 4 || mu == 4)
-        {
-          factor = 2.0 * csw_t;
-        }
-        else
-        {
-          factor = 2.0 * csw_r;
-        }
-        PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
-        Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
-        force_mu -= factor*Cmunu(U, lambda, mu, nu);                   // checked
-        count++;
-      }
-
-      pokeLorentz(clover_force, U[mu] * force_mu, mu);
-    }
-    //clover_force *= csw;
-    force += clover_force;
-  }
-
-  // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
-  GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
-  {
-    conformable(lambda.Grid(), U[0].Grid());
-    GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());
-    // insertion in upper staple
-    // please check redundancy of shift operations
-
-    // C1+
-    tmp = lambda * U[nu];
-    out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
-
-    // C2+
-    tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
-    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
-
-    // C3+
-    tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
-    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
-
-    // C4+
-    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
-
-    // insertion in lower staple
-    // C1-
-    out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
-
-    // C2-
-    tmp = adj(lambda) * U[nu];
-    out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
-
-    // C3-
-    tmp = lambda * U[nu];
-    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
-
-    // C4-
-    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
-
-    return out;
-  }
-
-protected:
+public:
  // here fixing the 4 dimensions, make it more general?

  RealD csw_r;                                               // Clover coefficient - spatial
  RealD csw_t;                                               // Clover coefficient - temporal
  RealD diag_mass;                                           // Mass term
-  CloverFieldType CloverTerm, CloverTermInv;                 // Clover term
-  CloverFieldType CloverTermEven, CloverTermOdd;             // Clover term EO
-  CloverFieldType CloverTermInvEven, CloverTermInvOdd;       // Clover term Inv EO
-  CloverFieldType CloverTermDagEven, CloverTermDagOdd;       // Clover term Dag EO
-  CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
-
- public:
-  // eventually these can be compressed into 6x6 blocks instead of the 12x12
-  // using the DeGrand-Rossi basis for the gamma matrices
-  CloverFieldType fillCloverYZ(const GaugeLinkField &F)
-  {
-    CloverFieldType T(F.Grid());
-    T = Zero();
-    autoView(T_v,T,AcceleratorWrite);
-    autoView(F_v,F,AcceleratorRead);
-    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
-    {
-      T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
-      T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
-      T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
-      T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
-    });
-
-    return T;
-  }
-
-  CloverFieldType fillCloverXZ(const GaugeLinkField &F)
-  {
-    CloverFieldType T(F.Grid());
-    T = Zero();
-    
-    autoView(T_v, T,AcceleratorWrite);
-    autoView(F_v, F,AcceleratorRead);
-    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
-    {
-      T_v[i]()(0, 1) = -F_v[i]()();
-      T_v[i]()(1, 0) = F_v[i]()();
-      T_v[i]()(2, 3) = -F_v[i]()();
-      T_v[i]()(3, 2) = F_v[i]()();
-    });
-
-    return T;
-  }
-
-  CloverFieldType fillCloverXY(const GaugeLinkField &F)
-  {
-    CloverFieldType T(F.Grid());
-    T = Zero();
-
-    autoView(T_v,T,AcceleratorWrite);
-    autoView(F_v,F,AcceleratorRead);
-    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
-    {
-      T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
-      T_v[i]()(1, 1) = timesI(F_v[i]()());
-      T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
-      T_v[i]()(3, 3) = timesI(F_v[i]()());
-    });
-
-    return T;
-  }
-
-  CloverFieldType fillCloverXT(const GaugeLinkField &F)
-  {
-    CloverFieldType T(F.Grid());
-    T = Zero();
-
-    autoView( T_v , T, AcceleratorWrite);
-    autoView( F_v , F, AcceleratorRead);
-    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
-    {
-      T_v[i]()(0, 1) = timesI(F_v[i]()());
-      T_v[i]()(1, 0) = timesI(F_v[i]()());
-      T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
-      T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
-    });
-
-    return T;
-  }
-
-  CloverFieldType fillCloverYT(const GaugeLinkField &F)
-  {
-    CloverFieldType T(F.Grid());
-    T = Zero();
-    
-    autoView( T_v ,T,AcceleratorWrite);
-    autoView( F_v ,F,AcceleratorRead);
-    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
-    {
-      T_v[i]()(0, 1) = -(F_v[i]()());
-      T_v[i]()(1, 0) = (F_v[i]()());
-      T_v[i]()(2, 3) = (F_v[i]()());
-      T_v[i]()(3, 2) = -(F_v[i]()());
-    });
-
-    return T;
-  }
-
-  CloverFieldType fillCloverZT(const GaugeLinkField &F)
-  {
-    CloverFieldType T(F.Grid());
-
-    T = Zero();
-
-    autoView( T_v , T,AcceleratorWrite);
-    autoView( F_v , F,AcceleratorRead);
-    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
-    {
-      T_v[i]()(0, 0) = timesI(F_v[i]()());
-      T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
-      T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
-      T_v[i]()(3, 3) = timesI(F_v[i]()());
-    });
-
-    return T;
-  }
+  CloverField CloverTerm, CloverTermInv;                     // Clover term
+  CloverField CloverTermEven, CloverTermOdd;                 // Clover term EO
+  CloverField CloverTermInvEven, CloverTermInvOdd;           // Clover term Inv EO
+  CloverField CloverTermDagEven, CloverTermDagOdd;           // Clover term Dag EO
+  CloverField CloverTermInvDagEven, CloverTermInvDagOdd;     // Clover term Inv Dag EO
 };
+
 NAMESPACE_END(Grid);


--- a/Grid/qcd/action/fermion/WilsonCloverHelpers.h
+++ b/Grid/qcd/action/fermion/WilsonCloverHelpers.h
@@ -0,0 +1,761 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./Grid/qcd/action/fermion/WilsonCloverHelpers.h
+
+    Copyright (C) 2021 - 2022
+
+    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#pragma once
+
+// Helper routines that implement common clover functionality
+
+NAMESPACE_BEGIN(Grid);
+
+template<class Impl> class WilsonCloverHelpers {
+public:
+
+  INHERIT_IMPL_TYPES(Impl);
+  INHERIT_CLOVER_TYPES(Impl);
+
+  // Computes C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis from the link fields U (indexed by direction) and lambda; returns it by value.
+  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
+  {
+    conformable(lambda.Grid(), U[0].Grid());
+    GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());
+    // insertion in upper staple: the four terms C1+ .. C4+ are accumulated into out
+    // TODO: check redundancy of shift operations
+
+    // C1+ (this term initialises out; every later term is added to or subtracted from it)
+    tmp = lambda * U[nu];
+    out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
+
+    // C2+
+    tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
+    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
+
+    // C3+
+    tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
+    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
+
+    // C4+ (lambda multiplies the plain staple from the right)
+    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
+
+    // insertion in lower staple: the four terms C1- .. C4- are each subtracted from out
+    // C1- (shifted lambda multiplies the plain staple from the left)
+    out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
+
+    // C2-
+    tmp = adj(lambda) * U[nu];
+    out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
+
+    // C3-
+    tmp = lambda * U[nu];
+    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
+
+    // C4- (lambda multiplies the plain staple from the right)
+    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
+
+    return out;
+  }
+
+  // Builds a zeroed CloverField on F's grid and sets its (0,1), (1,0), (2,3), (3,2) entries to timesMinusI(F) at every site.
+  static CloverField fillCloverYZ(const GaugeLinkField &F)
+  {
+    CloverField clover(F.Grid());
+    clover = Zero();
+    autoView(clover_v, clover, AcceleratorWrite);
+    autoView(link_v, F, AcceleratorRead);
+    accelerator_for(ss, clover.Grid()->oSites(), CloverField::vector_type::Nsimd(),
+    {
+      coalescedWrite(clover_v[ss]()(0, 1), coalescedRead(timesMinusI(link_v[ss]()())));
+      coalescedWrite(clover_v[ss]()(1, 0), coalescedRead(timesMinusI(link_v[ss]()())));
+      coalescedWrite(clover_v[ss]()(2, 3), coalescedRead(timesMinusI(link_v[ss]()())));
+      coalescedWrite(clover_v[ss]()(3, 2), coalescedRead(timesMinusI(link_v[ss]()())));
+    });
+    return clover;
+  }
+
+  // Builds a zeroed CloverField on F's grid and sets entries (0,1), (2,3) to -F and (1,0), (3,2) to +F at every site.
+  static CloverField fillCloverXZ(const GaugeLinkField &F)
+  {
+    CloverField clover(F.Grid());
+    clover = Zero();
+    autoView(clover_v, clover, AcceleratorWrite);
+    autoView(link_v, F, AcceleratorRead);
+    accelerator_for(ss, clover.Grid()->oSites(), CloverField::vector_type::Nsimd(),
+    {
+      coalescedWrite(clover_v[ss]()(0, 1), coalescedRead(-link_v[ss]()()));
+      coalescedWrite(clover_v[ss]()(1, 0), coalescedRead(link_v[ss]()()));
+      coalescedWrite(clover_v[ss]()(2, 3), coalescedRead(-link_v[ss]()()));
+      coalescedWrite(clover_v[ss]()(3, 2), coalescedRead(link_v[ss]()()));
+    });
+
+    return clover;
+  }
+
+  // Builds a zeroed CloverField on F's grid with diagonal entries timesMinusI(F), timesI(F), timesMinusI(F), timesI(F).
+  static CloverField fillCloverXY(const GaugeLinkField &F)
+  {
+    CloverField clover(F.Grid());
+    clover = Zero();
+    autoView(clover_v, clover, AcceleratorWrite);
+    autoView(link_v, F, AcceleratorRead);
+    accelerator_for(ss, clover.Grid()->oSites(), CloverField::vector_type::Nsimd(),
+    {
+      coalescedWrite(clover_v[ss]()(0, 0), coalescedRead(timesMinusI(link_v[ss]()())));
+      coalescedWrite(clover_v[ss]()(1, 1), coalescedRead(timesI(link_v[ss]()())));
+      coalescedWrite(clover_v[ss]()(2, 2), coalescedRead(timesMinusI(link_v[ss]()())));
+      coalescedWrite(clover_v[ss]()(3, 3), coalescedRead(timesI(link_v[ss]()())));
+    });
+
+    return clover;
+  }
+
+  // Builds a zeroed CloverField on F's grid and sets entries (0,1), (1,0) to timesI(F) and (2,3), (3,2) to timesMinusI(F).
+  static CloverField fillCloverXT(const GaugeLinkField &F)
+  {
+    CloverField clover(F.Grid());
+    clover = Zero();
+    autoView(clover_v, clover, AcceleratorWrite);
+    autoView(link_v, F, AcceleratorRead);
+    accelerator_for(ss, clover.Grid()->oSites(), CloverField::vector_type::Nsimd(),
+    {
+      coalescedWrite(clover_v[ss]()(0, 1), coalescedRead(timesI(link_v[ss]()())));
+      coalescedWrite(clover_v[ss]()(1, 0), coalescedRead(timesI(link_v[ss]()())));
+      coalescedWrite(clover_v[ss]()(2, 3), coalescedRead(timesMinusI(link_v[ss]()())));
+      coalescedWrite(clover_v[ss]()(3, 2), coalescedRead(timesMinusI(link_v[ss]()())));
+    });
+
+    return clover;
+  }
+
+  // Builds a zeroed CloverField on F's grid and sets entries (0,1), (3,2) to -F and (1,0), (2,3) to +F at every site.
+  static CloverField fillCloverYT(const GaugeLinkField &F)
+  {
+    CloverField clover(F.Grid());
+    clover = Zero();
+    autoView(clover_v, clover, AcceleratorWrite);
+    autoView(link_v, F, AcceleratorRead);
+    accelerator_for(ss, clover.Grid()->oSites(), CloverField::vector_type::Nsimd(),
+    {
+      coalescedWrite(clover_v[ss]()(0, 1), coalescedRead(-(link_v[ss]()())));
+      coalescedWrite(clover_v[ss]()(1, 0), coalescedRead((link_v[ss]()())));
+      coalescedWrite(clover_v[ss]()(2, 3), coalescedRead((link_v[ss]()())));
+      coalescedWrite(clover_v[ss]()(3, 2), coalescedRead(-(link_v[ss]()())));
+    });
+
+    return clover;
+  }
+
+  // Builds a zeroed CloverField on F's grid with diagonal entries timesI(F), timesMinusI(F), timesMinusI(F), timesI(F).
+  static CloverField fillCloverZT(const GaugeLinkField &F)
+  {
+    CloverField clover(F.Grid());
+    clover = Zero();
+
+    autoView(clover_v, clover, AcceleratorWrite);
+    autoView(link_v, F, AcceleratorRead);
+    accelerator_for(ss, clover.Grid()->oSites(), CloverField::vector_type::Nsimd(),
+    {
+      coalescedWrite(clover_v[ss]()(0, 0), coalescedRead(timesI(link_v[ss]()())));
+      coalescedWrite(clover_v[ss]()(1, 1), coalescedRead(timesMinusI(link_v[ss]()())));
+      coalescedWrite(clover_v[ss]()(2, 2), coalescedRead(timesMinusI(link_v[ss]()())));
+      coalescedWrite(clover_v[ss]()(3, 3), coalescedRead(timesI(link_v[ss]()())));
+    });
+
+    return clover;
+  }
+
+  template<class _Spinor> // single site: C is read through coalescedRead, then mult() fills phi from it and chi
+  static accelerator_inline void multClover(_Spinor& phi, const SiteClover& C, const _Spinor& chi) {
+    auto clover_site = coalescedRead(C);
+    mult(&phi, &clover_site, &chi);
+  }
+
+  template<class _SpinorField> // applies multClover at every outer site of out's grid; static, since no instance state is used
+  static inline void multCloverField(_SpinorField& out, const CloverField& C, const _SpinorField& phi) {
+    const int Nsimd = SiteSpinor::Nsimd();
+    autoView(out_v, out, AcceleratorWrite);
+    autoView(phi_v, phi, AcceleratorRead);
+    autoView(C_v,   C,   AcceleratorRead);
+    typedef decltype(coalescedRead(out_v[0])) calcSpinor; // per-site type used inside the accelerator loop
+    accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
+      calcSpinor tmp;                       // result is built in a temporary ...
+      multClover(tmp,C_v[sss],phi_v(sss));
+      coalescedWrite(out_v[sss],tmp);       // ... and only then written to out
+    });
+  }
+};
+
+
+template<class Impl> class CompactWilsonCloverHelpers {
+public:
+
+  INHERIT_COMPACT_CLOVER_SIZES(Impl);
+
+  INHERIT_IMPL_TYPES(Impl);
+  INHERIT_CLOVER_TYPES(Impl);
+  INHERIT_COMPACT_CLOVER_TYPES(Impl);
+
+  #if 0 // disabled variant, hard-wired to SiteCloverTriangle
+  static accelerator_inline typename SiteCloverTriangle::vector_type triangle_elem(const SiteCloverTriangle& triangle, int block, int i, int j) {
+    assert(i != j);
+    if(i < j) {
+      return triangle()(block)(triangle_index(i, j));
+    } else { // i > j
+      return conjugate(triangle()(block)(triangle_index(i, j)));
+    }
+  }
+  #else // active variant, templated on the element type vobj of the triangle object
+  template<typename vobj>
+  static accelerator_inline vobj triangle_elem(const iImplCloverTriangle<vobj>& triangle, int block, int i, int j) {
+    assert(i != j); // only off-diagonal elements may be requested
+    if(i < j) { // the stored element is returned unchanged
+      return triangle()(block)(triangle_index(i, j));
+    } else { // i > j: triangle_index is symmetric in (i, j), so this is the conjugate of the element stored for (j, i)
+      return conjugate(triangle()(block)(triangle_index(i, j)));
+    }
+  }
+  #endif
+
+  // Linear index of the off-diagonal pair (i, j) in the row-wise packed strict upper triangle; symmetric in i and j.
+  static accelerator_inline int triangle_index(int i, int j) {
+    if(i == j) return 0; // diagonal pairs have no slot of their own; 0 is returned for them
+    const int row = (i < j) ? i : j; // smaller of the two indices
+    const int col = (i < j) ? j : i; // larger of the two indices
+    // entries packed in the rows before `row`, plus the offset of `col` inside that row
+    return Nred * (Nred - 1) / 2 - (Nred - row) * (Nred - row - 1) / 2 + col - row - 1;
+  }
+
+  static void MooeeKernel_gpu(int                        Nsite, // number of clover sites; the loop covers Nsite*Ls fermion sites
+                              int                        Ls, // consecutive fermion sites that share one clover site (sU = ss/Ls)
+                              const FermionField&        in,
+                              FermionField&              out,
+                              const CloverDiagonalField& diagonal,
+                              const CloverTriangleField& triangle) {
+    autoView(diagonal_v, diagonal, AcceleratorRead);
+    autoView(triangle_v, triangle, AcceleratorRead);
+    autoView(in_v,       in,       AcceleratorRead);
+    autoView(out_v,      out,      AcceleratorWrite);
+
+    typedef decltype(coalescedRead(out_v[0])) CalcSpinor;
+
+    const uint64_t NN = Nsite * Ls; // NOTE(review): the product is evaluated in int before widening to uint64_t
+
+    accelerator_for(ss, NN, Simd::Nsimd(), {
+      int sF = ss;    // index into the fermion fields in/out
+      int sU = ss/Ls; // index into the clover fields diagonal/triangle
+      CalcSpinor res;
+      CalcSpinor in_t = in_v(sF);
+      auto diagonal_t = diagonal_v(sU);
+      auto triangle_t = triangle_v(sU);
+      for(int block=0; block<Nhs; block++) {
+        int s_start = block*Nhs; // first spin index handled by this block
+        for(int i=0; i<Nred; i++) {
+          int si = s_start + i/Nc, ci = i%Nc; // unpack row i of the block into (spin, colour)
+          res()(si)(ci) = diagonal_t()(block)(i) * in_t()(si)(ci); // diagonal contribution
+          for(int j=0; j<Nred; j++) {
+            if (j == i) continue; // diagonal already handled above
+            int sj = s_start + j/Nc, cj = j%Nc;
+            res()(si)(ci) = res()(si)(ci) + triangle_elem(triangle_t, block, i, j) * in_t()(sj)(cj); // off-diagonal contribution
+          };
+        };
+      };
+      coalescedWrite(out_v[sF], res);
+    });
+  }
+
+  static void MooeeKernel_cpu(int                        Nsite, // number of clover sites; the loop covers Nsite*Ls fermion sites
+                              int                        Ls, // consecutive fermion sites that share one clover site (sU = ss/Ls)
+                              const FermionField&        in,
+                              FermionField&              out,
+                              const CloverDiagonalField& diagonal,
+                              const CloverTriangleField& triangle) {
+    autoView(diagonal_v, diagonal, CpuRead);
+    autoView(triangle_v, triangle, CpuRead);
+    autoView(in_v,       in,       CpuRead);
+    autoView(out_v,      out,      CpuWrite);
+
+    typedef SiteSpinor CalcSpinor;
+
+#if defined(A64FX) || defined(A64FXFIXEDSIZE)
+#define PREFETCH_CLOVER(BASE) {                                     \
+    uint64_t base;                                                  \
+    int pf_dist_L1 = 1;                                             \
+    int pf_dist_L2 = -5; /* -> penalty -> disable */                \
+                                                                    \
+    if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) {           \
+      base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0);               \
+      svprfd(svptrue_b64(), (int64_t*)(base +    0), SV_PLDL1STRM); \
+      svprfd(svptrue_b64(), (int64_t*)(base +  256), SV_PLDL1STRM); \
+      svprfd(svptrue_b64(), (int64_t*)(base +  512), SV_PLDL1STRM); \
+      svprfd(svptrue_b64(), (int64_t*)(base +  768), SV_PLDL1STRM); \
+      svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL1STRM); \
+      svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL1STRM); \
+    }                                                               \
+                                                                    \
+    if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) {           \
+      base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0);               \
+      svprfd(svptrue_b64(), (int64_t*)(base +    0), SV_PLDL2STRM); \
+      svprfd(svptrue_b64(), (int64_t*)(base +  256), SV_PLDL2STRM); \
+      svprfd(svptrue_b64(), (int64_t*)(base +  512), SV_PLDL2STRM); \
+      svprfd(svptrue_b64(), (int64_t*)(base +  768), SV_PLDL2STRM); \
+      svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL2STRM); \
+      svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL2STRM); \
+    }                                                               \
+  }
+// TODO: Implement/generalize this for other architectures
+// I played around a bit on KNL (see below) but it didn't bring any benefit
+// #elif defined(AVX512)
+// #define PREFETCH_CLOVER(BASE) {                              \
+//     uint64_t base;                                           \
+//     int pf_dist_L1 = 1;                                      \
+//     int pf_dist_L2 = +4;                                     \
+//                                                              \
+//     if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) {    \
+//       base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0);        \
+//       _mm_prefetch((const char*)(base +    0), _MM_HINT_T0); \
+//       _mm_prefetch((const char*)(base +   64), _MM_HINT_T0); \
+//       _mm_prefetch((const char*)(base +  128), _MM_HINT_T0); \
+//       _mm_prefetch((const char*)(base +  192), _MM_HINT_T0); \
+//       _mm_prefetch((const char*)(base +  256), _MM_HINT_T0); \
+//       _mm_prefetch((const char*)(base +  320), _MM_HINT_T0); \
+//     }                                                        \
+//                                                              \
+//     if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) {    \
+//       base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0);        \
+//       _mm_prefetch((const char*)(base +    0), _MM_HINT_T1); \
+//       _mm_prefetch((const char*)(base +   64), _MM_HINT_T1); \
+//       _mm_prefetch((const char*)(base +  128), _MM_HINT_T1); \
+//       _mm_prefetch((const char*)(base +  192), _MM_HINT_T1); \
+//       _mm_prefetch((const char*)(base +  256), _MM_HINT_T1); \
+//       _mm_prefetch((const char*)(base +  320), _MM_HINT_T1); \
+//     }                                                        \
+//   }
+#else
+#define PREFETCH_CLOVER(BASE)
+#endif
+
+    const uint64_t NN = Nsite * Ls; // NOTE(review): the product is evaluated in int before widening to uint64_t
+
+    thread_for(ss, NN, {
+      int sF = ss;    // index into the fermion fields in/out
+      int sU = ss/Ls; // index into the clover fields diagonal/triangle
+      CalcSpinor res;
+      CalcSpinor in_t = in_v[sF];
+      auto diag_t     = diagonal_v[sU]; // "diag" instead of "diagonal" here to make code below easier to read
+      auto triangle_t = triangle_v[sU];
+
+      // upper half: block 0 of diag_t/triangle_t acts on components (0..1)(0..2) of in_t
+      PREFETCH_CLOVER(0);
+
+      auto in_cc_0_0 = conjugate(in_t()(0)(0)); // Nils: reduces number
+      auto in_cc_0_1 = conjugate(in_t()(0)(1)); // of conjugates from
+      auto in_cc_0_2 = conjugate(in_t()(0)(2)); // 30 to 20
+      auto in_cc_1_0 = conjugate(in_t()(1)(0));
+      auto in_cc_1_1 = conjugate(in_t()(1)(1));
+
+      res()(0)(0) =               diag_t()(0)( 0) * in_t()(0)(0)
+                  +           triangle_t()(0)( 0) * in_t()(0)(1)
+                  +           triangle_t()(0)( 1) * in_t()(0)(2)
+                  +           triangle_t()(0)( 2) * in_t()(1)(0)
+                  +           triangle_t()(0)( 3) * in_t()(1)(1)
+                  +           triangle_t()(0)( 4) * in_t()(1)(2);
+
+      res()(0)(1) =           triangle_t()(0)( 0) * in_cc_0_0; // two-step pattern used for every later row: first sum triangle * conj(in) ...
+      res()(0)(1) =               diag_t()(0)( 1) * in_t()(0)(1)
+                  +           triangle_t()(0)( 5) * in_t()(0)(2)
+                  +           triangle_t()(0)( 6) * in_t()(1)(0)
+                  +           triangle_t()(0)( 7) * in_t()(1)(1)
+                  +           triangle_t()(0)( 8) * in_t()(1)(2)
+                  + conjugate(       res()(0)( 1)); // ... then conjugate that sum once, giving sum conj(triangle) * in
+
+      res()(0)(2) =           triangle_t()(0)( 1) * in_cc_0_0
+                  +           triangle_t()(0)( 5) * in_cc_0_1;
+      res()(0)(2) =               diag_t()(0)( 2) * in_t()(0)(2)
+                  +           triangle_t()(0)( 9) * in_t()(1)(0)
+                  +           triangle_t()(0)(10) * in_t()(1)(1)
+                  +           triangle_t()(0)(11) * in_t()(1)(2)
+                  + conjugate(       res()(0)( 2));
+
+      res()(1)(0) =           triangle_t()(0)( 2) * in_cc_0_0
+                  +           triangle_t()(0)( 6) * in_cc_0_1
+                  +           triangle_t()(0)( 9) * in_cc_0_2;
+      res()(1)(0) =               diag_t()(0)( 3) * in_t()(1)(0)
+                  +           triangle_t()(0)(12) * in_t()(1)(1)
+                  +           triangle_t()(0)(13) * in_t()(1)(2)
+                  + conjugate(       res()(1)( 0));
+
+      res()(1)(1) =           triangle_t()(0)( 3) * in_cc_0_0
+                  +           triangle_t()(0)( 7) * in_cc_0_1
+                  +           triangle_t()(0)(10) * in_cc_0_2
+                  +           triangle_t()(0)(12) * in_cc_1_0;
+      res()(1)(1) =               diag_t()(0)( 4) * in_t()(1)(1)
+                  +           triangle_t()(0)(14) * in_t()(1)(2)
+                  + conjugate(       res()(1)( 1));
+
+      res()(1)(2) =           triangle_t()(0)( 4) * in_cc_0_0
+                  +           triangle_t()(0)( 8) * in_cc_0_1
+                  +           triangle_t()(0)(11) * in_cc_0_2
+                  +           triangle_t()(0)(13) * in_cc_1_0
+                  +           triangle_t()(0)(14) * in_cc_1_1;
+      res()(1)(2) =               diag_t()(0)( 5) * in_t()(1)(2)
+                  + conjugate(       res()(1)( 2));
+
+      vstream(out_v[sF]()(0)(0), res()(0)(0)); // upper-half results are passed to out via vstream before the lower half is computed
+      vstream(out_v[sF]()(0)(1), res()(0)(1));
+      vstream(out_v[sF]()(0)(2), res()(0)(2));
+      vstream(out_v[sF]()(1)(0), res()(1)(0));
+      vstream(out_v[sF]()(1)(1), res()(1)(1));
+      vstream(out_v[sF]()(1)(2), res()(1)(2));
+
+      // lower half: same pattern with block 1 acting on components (2..3)(0..2) of in_t
+      PREFETCH_CLOVER(1);
+
+      auto in_cc_2_0 = conjugate(in_t()(2)(0));
+      auto in_cc_2_1 = conjugate(in_t()(2)(1));
+      auto in_cc_2_2 = conjugate(in_t()(2)(2));
+      auto in_cc_3_0 = conjugate(in_t()(3)(0));
+      auto in_cc_3_1 = conjugate(in_t()(3)(1));
+
+      res()(2)(0) =               diag_t()(1)( 0) * in_t()(2)(0)
+                  +           triangle_t()(1)( 0) * in_t()(2)(1)
+                  +           triangle_t()(1)( 1) * in_t()(2)(2)
+                  +           triangle_t()(1)( 2) * in_t()(3)(0)
+                  +           triangle_t()(1)( 3) * in_t()(3)(1)
+                  +           triangle_t()(1)( 4) * in_t()(3)(2);
+
+      res()(2)(1) =           triangle_t()(1)( 0) * in_cc_2_0;
+      res()(2)(1) =               diag_t()(1)( 1) * in_t()(2)(1)
+                  +           triangle_t()(1)( 5) * in_t()(2)(2)
+                  +           triangle_t()(1)( 6) * in_t()(3)(0)
+                  +           triangle_t()(1)( 7) * in_t()(3)(1)
+                  +           triangle_t()(1)( 8) * in_t()(3)(2)
+                  + conjugate(       res()(2)( 1));
+
+      res()(2)(2) =           triangle_t()(1)( 1) * in_cc_2_0
+                  +           triangle_t()(1)( 5) * in_cc_2_1;
+      res()(2)(2) =               diag_t()(1)( 2) * in_t()(2)(2)
+                  +           triangle_t()(1)( 9) * in_t()(3)(0)
+                  +           triangle_t()(1)(10) * in_t()(3)(1)
+                  +           triangle_t()(1)(11) * in_t()(3)(2)
+                  + conjugate(       res()(2)( 2));
+
+      res()(3)(0) =           triangle_t()(1)( 2) * in_cc_2_0
+                  +           triangle_t()(1)( 6) * in_cc_2_1
+                  +           triangle_t()(1)( 9) * in_cc_2_2;
+      res()(3)(0) =               diag_t()(1)( 3) * in_t()(3)(0)
+                  +           triangle_t()(1)(12) * in_t()(3)(1)
+                  +           triangle_t()(1)(13) * in_t()(3)(2)
+                  + conjugate(       res()(3)( 0));
+
+      res()(3)(1) =           triangle_t()(1)( 3) * in_cc_2_0
+                  +           triangle_t()(1)( 7) * in_cc_2_1
+                  +           triangle_t()(1)(10) * in_cc_2_2
+                  +           triangle_t()(1)(12) * in_cc_3_0;
+      res()(3)(1) =               diag_t()(1)( 4) * in_t()(3)(1)
+                  +           triangle_t()(1)(14) * in_t()(3)(2)
+                  + conjugate(       res()(3)( 1));
+
+      res()(3)(2) =           triangle_t()(1)( 4) * in_cc_2_0
+                  +           triangle_t()(1)( 8) * in_cc_2_1
+                  +           triangle_t()(1)(11) * in_cc_2_2
+                  +           triangle_t()(1)(13) * in_cc_3_0
+                  +           triangle_t()(1)(14) * in_cc_3_1;
+      res()(3)(2) =               diag_t()(1)( 5) * in_t()(3)(2)
+                  + conjugate(       res()(3)( 2));
+
+      vstream(out_v[sF]()(2)(0), res()(2)(0));
+      vstream(out_v[sF]()(2)(1), res()(2)(1));
+      vstream(out_v[sF]()(2)(2), res()(2)(2));
+      vstream(out_v[sF]()(3)(0), res()(3)(0));
+      vstream(out_v[sF]()(3)(1), res()(3)(1));
+      vstream(out_v[sF]()(3)(2), res()(3)(2));
+    });
+  }
+
+  // Dispatches to the GPU kernel when GRID_CUDA or GRID_HIP is defined, and to the CPU kernel otherwise.
+  static void MooeeKernel(int Nsite, int Ls,
+                          const FermionField& in, FermionField& out,
+                          const CloverDiagonalField& diagonal,
+                          const CloverTriangleField& triangle)
+  {
+#if !defined(GRID_CUDA) && !defined(GRID_HIP)
+    MooeeKernel_cpu(Nsite, Ls, in, out, diagonal, triangle);
+#else
+    MooeeKernel_gpu(Nsite, Ls, in, out, diagonal, triangle);
+#endif
+  }
+
+  static void Invert(const CloverDiagonalField& diagonal,  // Inverts the compact clover term one local site at a time on the host
+                     const CloverTriangleField& triangle,  // diagonal/triangle: compact storage of the term to invert
+                     CloverDiagonalField&       diagonalInv,  // diagonalInv/triangleInv: receive the inverse in the same layout
+                     CloverTriangleField&       triangleInv) {
+    conformable(diagonal, diagonalInv);
+    conformable(triangle, triangleInv);
+    conformable(diagonal, triangle);
+    // The outputs take over the checkerboard of the corresponding inputs
+    diagonalInv.Checkerboard() = diagonal.Checkerboard();
+    triangleInv.Checkerboard() = triangle.Checkerboard();
+
+    GridBase* grid = diagonal.Grid();
+
+    long lsites = grid->lSites();
+
+    typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
+    typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
+
+    autoView(diagonal_v,  diagonal,  CpuRead);
+    autoView(triangle_v,  triangle,  CpuRead);
+    autoView(diagonalInv_v, diagonalInv, CpuWrite);
+    autoView(triangleInv_v, triangleInv, CpuWrite);
+
+    thread_for(site, lsites, { // NOTE: Not on GPU because of Eigen & (peek/poke)LocalSite
+      Eigen::MatrixXcd clover_inv_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
+      Eigen::MatrixXcd clover_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
+
+      scalar_object_diagonal diagonal_tmp     = Zero();
+      scalar_object_diagonal diagonal_inv_tmp = Zero();
+      scalar_object_triangle triangle_tmp     = Zero();
+      scalar_object_triangle triangle_inv_tmp = Zero();
+
+      Coordinate lcoor;
+      grid->LocalIndexToLocalCoor(site, lcoor);
+
+      peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
+      peekLocalSite(triangle_tmp, triangle_v, lcoor);
+      // Unpack into a dense (Ns*Nc) x (Ns*Nc) matrix; entries skipped below keep their initial zero
+      // TODO: can we save time here by inverting the two 6x6 hermitian matrices separately?
+      for (long s_row=0;s_row<Ns;s_row++) {
+        for (long s_col=0;s_col<Ns;s_col++) {
+          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;  // for Ns = 4 only spin pairs inside {0,1} or {2,3} pass
+          int block       = s_row / Nhs;
+          int s_row_block = s_row % Nhs;
+          int s_col_block = s_col % Nhs;
+          for (long c_row=0;c_row<Nc;c_row++) {
+            for (long c_col=0;c_col<Nc;c_col++) {
+              int i = s_row_block * Nc + c_row;  // row index inside the block
+              int j = s_col_block * Nc + c_col;  // column index inside the block
+              if(i == j)
+                clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
+              else  // off-diagonal entry: fetched through triangle_elem (defined outside this view)
+                clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(triangle_elem(triangle_tmp, block, i, j)));
+            }
+          }
+        }
+      }
+
+      clover_inv_eigen = clover_eigen.inverse();  // dense inverse via Eigen
+      // Repack the inverse: only the diagonal and the i < j entries are written back
+      for (long s_row=0;s_row<Ns;s_row++) {
+        for (long s_col=0;s_col<Ns;s_col++) {
+          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;  // same selection of spin pairs as when unpacking
+          int block       = s_row / Nhs;
+          int s_row_block = s_row % Nhs;
+          int s_col_block = s_col % Nhs;
+          for (long c_row=0;c_row<Nc;c_row++) {
+            for (long c_col=0;c_col<Nc;c_col++) {
+              int i = s_row_block * Nc + c_row;
+              int j = s_col_block * Nc + c_col;
+              if(i == j)
+                diagonal_inv_tmp()(block)(i) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
+              else if(i < j)
+                triangle_inv_tmp()(block)(triangle_index(i, j)) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
+              else  // i > j entries are not stored
+                continue;
+            }
+          }
+        }
+      }
+
+      pokeLocalSite(diagonal_inv_tmp, diagonalInv_v, lcoor);
+      pokeLocalSite(triangle_inv_tmp, triangleInv_v, lcoor);
+    });
+  }
+
+  static void ConvertLayout(const CloverField&   full,  // Packs the full clover field into the compact diagonal/triangle storage
+                            CloverDiagonalField& diagonal,  // out: the i == j entries of each block
+                            CloverTriangleField& triangle) {  // out: the i < j entries of each block
+    conformable(full, diagonal);
+    conformable(full, triangle);
+
+    diagonal.Checkerboard() = full.Checkerboard();
+    triangle.Checkerboard() = full.Checkerboard();
+
+    autoView(full_v,     full,     AcceleratorRead);
+    autoView(diagonal_v, diagonal, AcceleratorWrite);
+    autoView(triangle_v, triangle, AcceleratorWrite);
+
+    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
+    accelerator_for(ss, full.Grid()->oSites(), 1, {
+      for(int s_row = 0; s_row < Ns; s_row++) {
+        for(int s_col = 0; s_col < Ns; s_col++) {
+          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;  // for Ns = 4 only spin pairs inside {0,1} or {2,3} pass
+          int block       = s_row / Nhs;
+          int s_row_block = s_row % Nhs;
+          int s_col_block = s_col % Nhs;
+          for(int c_row = 0; c_row < Nc; c_row++) {
+            for(int c_col = 0; c_col < Nc; c_col++) {
+              int i = s_row_block * Nc + c_row;  // row index inside the block
+              int j = s_col_block * Nc + c_col;  // column index inside the block
+              if(i == j)
+                diagonal_v[ss]()(block)(i) = full_v[ss]()(s_row, s_col)(c_row, c_col);
+              else if(i < j)
+                triangle_v[ss]()(block)(triangle_index(i, j)) = full_v[ss]()(s_row, s_col)(c_row, c_col);
+              else  // i > j entries of the full field are not copied
+                continue;
+            }
+          }
+        }
+      }
+    });
+  }
+
+
+  static void ConvertLayout(const CloverDiagonalField& diagonal,  // Expands the compact diagonal/triangle storage into a full clover field
+                            const CloverTriangleField& triangle,
+                            CloverField&               full) {
+    conformable(full, diagonal);
+    conformable(full, triangle);
+
+    full.Checkerboard() = diagonal.Checkerboard();
+    // Spin pairs skipped by the kernel below keep this zero
+    full = Zero();
+
+    autoView(diagonal_v, diagonal, AcceleratorRead);
+    autoView(triangle_v, triangle, AcceleratorRead);
+    autoView(full_v,     full,     AcceleratorWrite);
+
+    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
+    accelerator_for(ss, full.Grid()->oSites(), 1, {
+      for(int s_row = 0; s_row < Ns; s_row++) {
+        for(int s_col = 0; s_col < Ns; s_col++) {
+          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;  // for Ns = 4 only spin pairs inside {0,1} or {2,3} pass
+          int block       = s_row / Nhs;
+          int s_row_block = s_row % Nhs;
+          int s_col_block = s_col % Nhs;
+          for(int c_row = 0; c_row < Nc; c_row++) {
+            for(int c_col = 0; c_col < Nc; c_col++) {
+              int i = s_row_block * Nc + c_row;  // row index inside the block
+              int j = s_col_block * Nc + c_col;  // column index inside the block
+              if(i == j)
+                full_v[ss]()(s_row, s_col)(c_row, c_col) = diagonal_v[ss]()(block)(i);
+              else  // every i != j entry comes from triangle_elem (defined outside this view)
+                full_v[ss]()(s_row, s_col)(c_row, c_col) = triangle_elem(triangle_v[ss], block, i, j);
+            }
+          }
+        }
+      }
+    });
+  }
+
+  static void ModifyBoundaries(CloverDiagonalField& diagonal, CloverTriangleField& triangle, RealD csw_t, RealD cF, RealD diag_mass) {
+    // Rewrites the compact clover term on, and next to, the first and last time slices. Step 1: checks/grid
+    double t0 = usecond();
+    conformable(diagonal, triangle);
+    GridBase* grid = diagonal.Grid();
+
+    // Determine the boundary coordinates/sites
+    double t1 = usecond();
+    int t_dir = Nd - 1;  // the last direction is treated as time
+    Lattice<iScalar<vInteger>> t_coor(grid);
+    LatticeCoordinate(t_coor, t_dir);
+    int T = grid->GlobalDimensions()[t_dir];  // global extent along t_dir
+
+    // Set off-diagonal parts at boundary to zero -- OK
+    double t2 = usecond();
+    CloverTriangleField zeroTriangle(grid);
+    zeroTriangle.Checkerboard() = triangle.Checkerboard();
+    zeroTriangle = Zero();
+    triangle = where(t_coor == 0,   zeroTriangle, triangle);
+    triangle = where(t_coor == T-1, zeroTriangle, triangle);
+
+    // Set diagonal to unity (scaled correctly) -- OK
+    double t3 = usecond();
+    CloverDiagonalField tmp(grid);
+    tmp.Checkerboard() = diagonal.Checkerboard();
+    tmp                = -1.0 * csw_t + diag_mass;  // value written to every diagonal entry on t = 0 and t = T-1
+    diagonal           = where(t_coor == 0,   tmp, diagonal);
+    diagonal           = where(t_coor == T-1, tmp, diagonal);
+
+    // Correct values next to boundary
+    double t4 = usecond();
+    if(cF != 1.0) {  // cF == 1 would add zero, so the work is skipped
+      tmp = cF - 1.0;
+      tmp += diagonal;  // tmp = diagonal + (cF - 1)
+      diagonal = where(t_coor == 1,   tmp, diagonal);
+      diagonal = where(t_coor == T-2, tmp, diagonal);
+    }
+
+    // Report timings (disabled; t0..t5 are only consumed by the block below)
+    double t5 = usecond();
+#if 0
+    std::cout << GridLogMessage << "CompactWilsonCloverHelpers::ModifyBoundaries timings:"
+              << " checks = "          << (t1 - t0) / 1e6
+              << ", coordinate = "     << (t2 - t1) / 1e6
+              << ", off-diag zero = "  << (t3 - t2) / 1e6
+              << ", diagonal unity = " << (t4 - t3) / 1e6
+              << ", near-boundary = "  << (t5 - t4) / 1e6
+              << ", total = "          << (t5 - t0) / 1e6
+              << std::endl;
+#endif
+  }
+
+  template<class Field, class Mask>
+  static strong_inline void ApplyBoundaryMask(Field& f, const Mask& m) {  // In place: f <- m * f at every site
+    conformable(f, m);
+    auto grid  = f.Grid();
+    const uint32_t Nsite = grid->oSites();
+    const uint32_t Nsimd = grid->Nsimd();
+    autoView(f_v, f, AcceleratorWrite);  // f is both read and written by the kernel below
+    autoView(m_v, m, AcceleratorRead);
+    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
+    accelerator_for(ss, Nsite, Nsimd, {
+      coalescedWrite(f_v[ss], m_v(ss) * f_v(ss));
+    });
+  }
+
+  template<class MaskField>
+  static void SetupMasks(MaskField& full, MaskField& even, MaskField& odd) {  // Builds the boundary mask and its two checkerboard halves
+    assert(even.Grid()->_isCheckerBoarded && even.Checkerboard() == Even);
+    assert(odd.Grid()->_isCheckerBoarded  && odd.Checkerboard()  == Odd);
+    assert(!full.Grid()->_isCheckerBoarded);
+    // Coordinate along the last (time) direction and its global extent T
+    GridBase* grid = full.Grid();
+    int t_dir = Nd-1;
+    Lattice<iScalar<vInteger>> t_coor(grid);
+    LatticeCoordinate(t_coor, t_dir);
+    int T = grid->GlobalDimensions()[t_dir];
+    // Mask is 1 everywhere except on the slices t = 0 and t = T-1, where it is 0
+    MaskField zeroMask(grid); zeroMask = Zero();
+    full = 1.0;
+    full = where(t_coor == 0,   zeroMask, full);
+    full = where(t_coor == T-1, zeroMask, full);
+    // Split the full-grid mask into its even and odd checkerboards
+    pickCheckerboard(Even, even, full);
+    pickCheckerboard(Odd,  odd,  full);
+  }
+};
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonCloverTypes.h
+++ b/Grid/qcd/action/fermion/WilsonCloverTypes.h
@@ -0,0 +1,92 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/qcd/action/fermion/WilsonCloverTypes.h
+
+    Copyright (C) 2021 - 2022
+
+    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+template<class Impl>
+class WilsonCloverTypes {  // Type bundle for the full (uncompressed) clover term
+public:
+  INHERIT_IMPL_TYPES(Impl);
+  // Per site: an Ns x Ns matrix whose entries are Impl::Dimension x Impl::Dimension matrices
+  template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
+
+  typedef iImplClover<Simd> SiteClover;
+
+  typedef Lattice<SiteClover> CloverField;
+};
+
+template<class Impl>
+class CompactWilsonCloverTypes {  // Type bundle for the compact (diagonal + triangle) clover storage
+public:
+  INHERIT_IMPL_TYPES(Impl);
+  // The sizes below are only valid for exactly these dimensions
+  static_assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3, "Wrong dimensions");
+
+  static constexpr int Nred      = Nc * Nhs;        // 6
+  static constexpr int Nblock    = Nhs;             // 2
+  static constexpr int Ndiagonal = Nred;            // 6
+  static constexpr int Ntriangle = (Nred - 1) * Nc; // 15
+  // Note: for the asserted sizes 15 also equals Nred*(Nred-1)/2, the number of i < j entries of an Nred x Nred block
+  template<typename vtype> using iImplCloverDiagonal = iScalar<iVector<iVector<vtype, Ndiagonal>, Nblock>>;
+  template<typename vtype> using iImplCloverTriangle = iScalar<iVector<iVector<vtype, Ntriangle>, Nblock>>;
+
+  typedef iImplCloverDiagonal<Simd> SiteCloverDiagonal;
+  typedef iImplCloverTriangle<Simd> SiteCloverTriangle;
+  typedef iSinglet<Simd>            SiteMask;
+
+  typedef Lattice<SiteCloverDiagonal> CloverDiagonalField;
+  typedef Lattice<SiteCloverTriangle> CloverTriangleField;
+  typedef Lattice<SiteMask>           MaskField;
+};
+
+#define INHERIT_CLOVER_TYPES(Impl) /* re-declares the WilsonCloverTypes<Impl> typedefs in the current scope */ \
+  typedef typename WilsonCloverTypes<Impl>::SiteClover SiteClover; \
+  typedef typename WilsonCloverTypes<Impl>::CloverField CloverField;
+
+#define INHERIT_COMPACT_CLOVER_TYPES(Impl) /* re-declares the CompactWilsonCloverTypes<Impl> typedefs and alias templates */ \
+  typedef typename CompactWilsonCloverTypes<Impl>::SiteCloverDiagonal  SiteCloverDiagonal; \
+  typedef typename CompactWilsonCloverTypes<Impl>::SiteCloverTriangle  SiteCloverTriangle; \
+  typedef typename CompactWilsonCloverTypes<Impl>::SiteMask            SiteMask; \
+  typedef typename CompactWilsonCloverTypes<Impl>::CloverDiagonalField CloverDiagonalField; \
+  typedef typename CompactWilsonCloverTypes<Impl>::CloverTriangleField CloverTriangleField; \
+  typedef typename CompactWilsonCloverTypes<Impl>::MaskField           MaskField; \
+  /* ugly duplication but needed inside functionality classes */ \
+  template<typename vtype> using iImplCloverDiagonal = \
+    iScalar<iVector<iVector<vtype, CompactWilsonCloverTypes<Impl>::Ndiagonal>, CompactWilsonCloverTypes<Impl>::Nblock>>; \
+  template<typename vtype> using iImplCloverTriangle = \
+    iScalar<iVector<iVector<vtype, CompactWilsonCloverTypes<Impl>::Ntriangle>, CompactWilsonCloverTypes<Impl>::Nblock>>;
+
+#define INHERIT_COMPACT_CLOVER_SIZES(Impl) /* re-declares the size constants of CompactWilsonCloverTypes<Impl> */ \
+  static constexpr int Nred      = CompactWilsonCloverTypes<Impl>::Nred;      \
+  static constexpr int Nblock    = CompactWilsonCloverTypes<Impl>::Nblock;    \
+  static constexpr int Ndiagonal = CompactWilsonCloverTypes<Impl>::Ndiagonal; \
+  static constexpr int Ntriangle = CompactWilsonCloverTypes<Impl>::Ntriangle;
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -828,6 +828,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,

 #if (!defined(GRID_HIP))
  int tshift = (mu == Nd-1) ? 1 : 0;
+  unsigned int LLt    = GridDefaultLatt()[Tp];
  ////////////////////////////////////////////////
  // GENERAL CAYLEY CASE
  ////////////////////////////////////////////////
@@ -880,7 +881,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  }

  std::vector<RealD> G_s(Ls,1.0);
-  RealD sign = 1; // sign flip for vector/tadpole
+  RealD sign = 1.0; // sign flip for vector/tadpole
  if ( curr_type == Current::Axial ) {
    for(int s=0;s<Ls/2;s++){
      G_s[s] = -1.0;
@@ -890,7 +891,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
    auto b=this->_b;
    auto c=this->_c;
    if ( b == 1 && c == 0 ) {
-      sign = -1;    
+      sign = -1.0;    
    }
    else {
      std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl;
@@ -934,7 +935,13 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
    tmp    = Cshift(tmp,mu,-1);
    Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
    tmp = -G_s[s]*( Utmp + gmu*Utmp );
-    tmp    = where((lcoor>=tmin+tshift),tmp,zz); // Mask the time 
+    // Mask the time
+    if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
+      unsigned int t0 = 0;
+      tmp    = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
+    } else {
+      tmp    = where((lcoor>=tmin+tshift),tmp,zz);
+    }
    L_Q   += where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated

    InsertSlice(L_Q, q_out, s , 0);
--- a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
@@ -0,0 +1,363 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermionImplementation.h
+
+    Copyright (C) 2017 - 2022
+
+    Author: paboyle <paboyle@ph.ed.ac.uk>
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/Grid.h>
+#include <Grid/qcd/spin/Dirac.h>
+#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
+
+NAMESPACE_BEGIN(Grid);
+template<class Impl>
+CompactWilsonCloverFermion<Impl>::CompactWilsonCloverFermion(GaugeField& _Umu,  // Sets up the compact clover operator for the given gauge field
+                                                             GridCartesian& Fgrid,  // grid used for the full-lattice fields
+                                                             GridRedBlackCartesian& Hgrid,  // grid used for the even/odd fields
+                                                             const RealD _mass,
+                                                             const RealD _csw_r,
+                                                             const RealD _csw_t,
+                                                             const RealD _cF,
+                                                             const WilsonAnisotropyCoefficients& clover_anisotropy,
+                                                             const ImplParams& impl_p)
+  : WilsonBase(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
+  , csw_r(_csw_r)
+  , csw_t(_csw_t)
+  , cF(_cF)
+  , open_boundaries(impl_p.boundary_phases[Nd-1] == 0.0)  // a zero phase in the last direction selects open boundaries
+  , Diagonal(&Fgrid),        Triangle(&Fgrid)
+  , DiagonalEven(&Hgrid),    TriangleEven(&Hgrid)
+  , DiagonalOdd(&Hgrid),     TriangleOdd(&Hgrid)
+  , DiagonalInv(&Fgrid),     TriangleInv(&Fgrid)
+  , DiagonalInvEven(&Hgrid), TriangleInvEven(&Hgrid)
+  , DiagonalInvOdd(&Hgrid),  TriangleInvOdd(&Hgrid)
+  , Tmp(&Fgrid)
+  , BoundaryMask(&Fgrid)
+  , BoundaryMaskEven(&Hgrid), BoundaryMaskOdd(&Hgrid)
+{
+  csw_r *= 0.5;  // the stored coefficients are half the values passed in
+  csw_t *= 0.5;
+  if (clover_anisotropy.isAnisotropic)
+    csw_r /= clover_anisotropy.xi_0;  // anisotropic case: csw_r is additionally divided by xi_0
+  // Fill the clover fields first; the masks are only built for open boundaries
+  ImportGauge(_Umu);
+  if (open_boundaries)
+    CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
+}
+
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::Dhop(const FermionField& in, FermionField& out, int dag) {
+  WilsonBase::Dhop(in, out, dag);  // delegate to the base-class Dhop
+  if(open_boundaries) ApplyBoundaryMask(out);  // open boundaries only: mask the result
+}
+
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::DhopOE(const FermionField& in, FermionField& out, int dag) {
+  WilsonBase::DhopOE(in, out, dag);  // delegate to the base-class DhopOE
+  if(open_boundaries) ApplyBoundaryMask(out);  // open boundaries only: mask the result
+}
+
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::DhopEO(const FermionField& in, FermionField& out, int dag) {
+  WilsonBase::DhopEO(in, out, dag);  // delegate to the base-class DhopEO
+  if(open_boundaries) ApplyBoundaryMask(out);  // open boundaries only: mask the result
+}
+
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
+  WilsonBase::DhopDir(in, out, dir, disp);  // delegate to the base-class DhopDir
+  if(this->open_boundaries) ApplyBoundaryMask(out);  // open boundaries only: mask the result
+}
+
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
+  WilsonBase::DhopDirAll(in, out);  // delegate to the base-class DhopDirAll
+  if(this->open_boundaries) {
+    for(auto& o : out) ApplyBoundaryMask(o);  // mask every field the base class produced
+  }
+}
+
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::M(const FermionField& in, FermionField& out) {  // Combines base-class Dhop(in) with Mooee(in)
+  out.Checkerboard() = in.Checkerboard();
+  WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
+  Mooee(in, Tmp);  // clover part goes into the member scratch field Tmp
+  axpy(out, 1.0, out, Tmp);  // presumably out = 1.0*out + Tmp -- confirm axpy's argument convention
+  if(open_boundaries) ApplyBoundaryMask(out);  // mask once, on the combined result
+}
+
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::Mdag(const FermionField& in, FermionField& out) {  // As M, but with DaggerYes and MooeeDag
+  out.Checkerboard() = in.Checkerboard();
+  WilsonBase::Dhop(in, out, DaggerYes);  // call base to save applying bc
+  MooeeDag(in, Tmp);  // clover part goes into the member scratch field Tmp
+  axpy(out, 1.0, out, Tmp);  // presumably out = 1.0*out + Tmp -- confirm axpy's argument convention
+  if(open_boundaries) ApplyBoundaryMask(out);  // mask once, on the combined result
+}
+
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::Meooe(const FermionField& in, FermionField& out) {
+  WilsonBase::Meooe(in, out);  // delegate to the base-class Meooe
+  if(open_boundaries) ApplyBoundaryMask(out);  // open boundaries only: mask the result
+}
+
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::MeooeDag(const FermionField& in, FermionField& out) {
+  WilsonBase::MeooeDag(in, out);  // delegate to the base-class MeooeDag
+  if(open_boundaries) ApplyBoundaryMask(out);  // open boundaries only: mask the result
+}
+
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::Mooee(const FermionField& in, FermionField& out) {
+  // Apply the clover term, choosing the stored fields that match the input's grid and checkerboard
+  if(!in.Grid()->_isCheckerBoarded) {
+    MooeeInternal(in, out, Diagonal, Triangle);
+  } else if(in.Checkerboard() == Odd) {
+    MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
+  } else {
+    MooeeInternal(in, out, DiagonalEven, TriangleEven);
+  }
+  // Open boundaries only: mask the result
+  if(open_boundaries) ApplyBoundaryMask(out);
+}
+
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::MooeeDag(const FermionField& in, FermionField& out) {  // Same operation as Mooee
+  Mooee(in, out); // blocks are hermitian
+}
+
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::MooeeInv(const FermionField& in, FermionField& out) {
+  // Apply the inverse clover term, choosing the stored inverse fields that match the input's grid and checkerboard
+  if(!in.Grid()->_isCheckerBoarded) {
+    MooeeInternal(in, out, DiagonalInv, TriangleInv);
+  } else if(in.Checkerboard() == Odd) {
+    MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
+  } else {
+    MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven);
+  }
+  // Open boundaries only: mask the result
+  if(open_boundaries) ApplyBoundaryMask(out);
+}
+
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::MooeeInvDag(const FermionField& in, FermionField& out) {  // Same operation as MooeeInv
+  MooeeInv(in, out); // blocks are hermitian
+}
+
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
+  DhopDir(in, out, dir, disp);  // plain forward to this class's DhopDir
+}
+
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
+  DhopDirAll(in, out);  // plain forward to this class's DhopDirAll
+}
+
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {  // force = hopping-term derivative + clover-term derivative
+  assert(!open_boundaries); // TODO check for changes required for open bc
+
+  // NOTE: code copied from original clover term
+  conformable(X.Grid(), Y.Grid());
+  conformable(X.Grid(), force.Grid());
+  GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
+  GaugeField clover_force(force.Grid());
+  PropagatorField Lambda(force.Grid());
+
+  // Guido: Here we are hitting some performance issues:
+  // need to extract the components of the DoubledGaugeField
+  // for each call
+  // Possible solution
+  // Create a vector object to store them? (cons: wasting space)
+  std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
+
+  Impl::extractLinkField(U, this->Umu);
+
+  force = Zero();
+  // Derivative of the Wilson hopping term
+  this->DhopDeriv(force, X, Y, dag);
+
+  ///////////////////////////////////////////////////////////
+  // Clover term derivative
+  ///////////////////////////////////////////////////////////
+  Impl::outerProductImpl(Lambda, X, Y);
+  //std::cout << "Lambda:" << Lambda << std::endl;
+
+  Gamma::Algebra sigma[] = {
+      Gamma::Algebra::SigmaXY,
+      Gamma::Algebra::SigmaXZ,
+      Gamma::Algebra::SigmaXT,
+      Gamma::Algebra::MinusSigmaXY,
+      Gamma::Algebra::SigmaYZ,
+      Gamma::Algebra::SigmaYT,
+      Gamma::Algebra::MinusSigmaXZ,
+      Gamma::Algebra::MinusSigmaYZ,
+      Gamma::Algebra::SigmaZT,
+      Gamma::Algebra::MinusSigmaXT,
+      Gamma::Algebra::MinusSigmaYT,
+      Gamma::Algebra::MinusSigmaZT};
+
+  /*
+    sigma_{\mu \nu}=
+    | 0         sigma[0]  sigma[1]  sigma[2] |
+    | sigma[3]    0       sigma[4]  sigma[5] |
+    | sigma[6]  sigma[7]     0      sigma[8] |
+    | sigma[9]  sigma[10] sigma[11]   0      |
+  */
+
+  int count = 0;  // walks sigma[] in the row-major order of the table above (diagonal skipped)
+  clover_force = Zero();
+  for (int mu = 0; mu < 4; mu++)
+  {
+    force_mu = Zero();
+    for (int nu = 0; nu < 4; nu++)
+    {
+      if (mu == nu)
+        continue;
+
+      RealD factor;
+      if (nu == Nd - 1 || mu == Nd - 1) // time is index Nd-1; the former '== 4' test could never match mu,nu in [0,4)
+      {
+        factor = 2.0 * csw_t;  // plane containing the time direction
+      }
+      else
+      {
+        factor = 2.0 * csw_r;  // purely spatial plane
+      }
+      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
+      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
+      force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu);   // checked
+      count++;
+    }
+
+    pokeLorentz(clover_force, U[mu] * force_mu, mu);
+  }
+  //clover_force *= csw;
+  force += clover_force;
+}
+
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {  // Not implemented
+  assert(0);  // always trips; compiled out (leaving an empty body) when NDEBUG is defined
+}
+
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {  // Not implemented
+  assert(0);  // always trips; compiled out (leaving an empty body) when NDEBUG is defined
+}
+
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::MooeeInternal(const FermionField&        in,  // Shared worker behind Mooee and MooeeInv
+                    FermionField&              out,
+                    const CloverDiagonalField& diagonal,  // compact clover storage to apply (term or its inverse)
+                    const CloverTriangleField& triangle) {
+  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
+  out.Checkerboard() = in.Checkerboard();
+  conformable(in, out);
+  conformable(in, diagonal);
+  conformable(in, triangle);
+  // Site count comes from the diagonal field; the Ls argument is fixed to 1 here
+  CompactHelpers::MooeeKernel(diagonal.oSites(), 1, in, out, diagonal, triangle);
+}
+
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::ImportGauge(const GaugeField& _Umu) {  // Rebuilds every stored clover field from the gauge field
+  // NOTE: parts copied from original implementation
+
+  // Import gauge into base class
+  double t0 = usecond();
+  WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that
+
+  // Initialize temporary variables
+  double t1 = usecond();
+  conformable(_Umu.Grid(), this->GaugeGrid());
+  GridBase* grid = _Umu.Grid();
+  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
+  CloverField TmpOriginal(grid);  // clover term in the full (uncompressed) layout
+
+  // Compute the field strength terms mu>nu
+  double t2 = usecond();
+  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
+  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
+  WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
+  WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
+  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
+  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
+
+  // Compute the Clover Operator acting on Colour and Spin
+  // multiply here by the clover coefficients for the anisotropy
+  double t3 = usecond();
+  TmpOriginal  = Helpers::fillCloverYZ(Bx) * csw_r;  // purely spatial planes are weighted with csw_r
+  TmpOriginal += Helpers::fillCloverXZ(By) * csw_r;
+  TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r;
+  TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;  // planes containing T are weighted with csw_t
+  TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
+  TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
+  TmpOriginal += this->diag_mass;
+
+  // Convert the data layout of the clover term
+  double t4 = usecond();
+  CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
+
+  // Possibly modify the boundary values
+  double t5 = usecond();
+  if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
+
+  // Invert the clover term in the improved layout
+  double t6 = usecond();
+  CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv);
+
+  // Fill the remaining clover fields
+  double t7 = usecond();
+  pickCheckerboard(Even, DiagonalEven,    Diagonal);
+  pickCheckerboard(Even, TriangleEven,    Triangle);
+  pickCheckerboard(Odd,  DiagonalOdd,     Diagonal);
+  pickCheckerboard(Odd,  TriangleOdd,     Triangle);
+  pickCheckerboard(Even, DiagonalInvEven, DiagonalInv);
+  pickCheckerboard(Even, TriangleInvEven, TriangleInv);
+  pickCheckerboard(Odd,  DiagonalInvOdd,  DiagonalInv);
+  pickCheckerboard(Odd,  TriangleInvOdd,  TriangleInv);
+
+  // Report timings (disabled; t0..t8 are only consumed by the block below)
+  double t8 = usecond();
+#if 0
+  std::cout << GridLogMessage << "CompactWilsonCloverFermion::ImportGauge timings:"
+            << " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6
+            << ", allocations = "               << (t2 - t1) / 1e6
+            << ", field strength = "            << (t3 - t2) / 1e6
+            << ", fill clover = "               << (t4 - t3) / 1e6
+            << ", convert = "                   << (t5 - t4) / 1e6
+            << ", boundaries = "                << (t6 - t5) / 1e6
+            << ", inversions = "                << (t7 - t6) / 1e6
+            << ", pick cbs = "                  << (t8 - t7) / 1e6
+            << ", total = "                     << (t8 - t0) / 1e6
+            << std::endl;
+#endif
+}
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
@@ -2,12 +2,13 @@

    Grid physics library, www.github.com/paboyle/Grid

-    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
+    Source file: ./lib/qcd/action/fermion/WilsonCloverFermionImplementation.h

-    Copyright (C) 2017
+    Copyright (C) 2017 - 2022

    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+    Author: Daniel Richtmann <daniel.richtmann@gmail.com>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -33,6 +34,45 @@

 NAMESPACE_BEGIN(Grid);

+template<class Impl>
+WilsonCloverFermion<Impl>::WilsonCloverFermion(GaugeField&                         _Umu,  // Sets coefficients and diag_mass, then imports the gauge field
+                                               GridCartesian&                      Fgrid,  // grid used for the full-lattice clover fields
+                                               GridRedBlackCartesian&              Hgrid,  // grid used for the even/odd clover fields
+                                               const RealD                         _mass,
+                                               const RealD                         _csw_r,
+                                               const RealD                         _csw_t,
+                                               const WilsonAnisotropyCoefficients& clover_anisotropy,
+                                               const ImplParams&                   impl_p)
+  : WilsonFermion<Impl>(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
+  , CloverTerm(&Fgrid)
+  , CloverTermInv(&Fgrid)
+  , CloverTermEven(&Hgrid)
+  , CloverTermOdd(&Hgrid)
+  , CloverTermInvEven(&Hgrid)
+  , CloverTermInvOdd(&Hgrid)
+  , CloverTermDagEven(&Hgrid)
+  , CloverTermDagOdd(&Hgrid)
+  , CloverTermInvDagEven(&Hgrid)
+  , CloverTermInvDagOdd(&Hgrid) {
+  assert(Nd == 4); // require 4 dimensions
+  // Stored coefficients are half the supplied ones; csw_r is also divided by xi_0 when anisotropic
+  if(clover_anisotropy.isAnisotropic) {
+    csw_r     = _csw_r * 0.5 / clover_anisotropy.xi_0;
+    diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
+  } else {
+    csw_r     = _csw_r * 0.5;
+    diag_mass = 4.0 + _mass;
+  }
+  csw_t = _csw_t * 0.5;
+  // A zero coefficient is accepted; it only produces a warning
+  if(csw_r == 0)
+    std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
+  if(csw_t == 0)
+    std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
+
+  ImportGauge(_Umu);
+}
+
 // *NOT* EO
 template <class Impl>
 void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
@@ -67,10 +107,13 @@ void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
 template <class Impl>
 void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
 {
+  double t0 = usecond();
  WilsonFermion<Impl>::ImportGauge(_Umu);
+  double t1 = usecond();
  GridBase *grid = _Umu.Grid();
  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);

+  double t2 = usecond();
  // Compute the field strength terms mu>nu
  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
@@ -79,19 +122,22 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);

+  double t3 = usecond();
  // Compute the Clover Operator acting on Colour and Spin
  // multiply here by the clover coefficients for the anisotropy
-  CloverTerm  = fillCloverYZ(Bx) * csw_r;
-  CloverTerm += fillCloverXZ(By) * csw_r;
-  CloverTerm += fillCloverXY(Bz) * csw_r;
-  CloverTerm += fillCloverXT(Ex) * csw_t;
-  CloverTerm += fillCloverYT(Ey) * csw_t;
-  CloverTerm += fillCloverZT(Ez) * csw_t;
+  CloverTerm  = Helpers::fillCloverYZ(Bx) * csw_r;
+  CloverTerm += Helpers::fillCloverXZ(By) * csw_r;
+  CloverTerm += Helpers::fillCloverXY(Bz) * csw_r;
+  CloverTerm += Helpers::fillCloverXT(Ex) * csw_t;
+  CloverTerm += Helpers::fillCloverYT(Ey) * csw_t;
+  CloverTerm += Helpers::fillCloverZT(Ez) * csw_t;
  CloverTerm += diag_mass;

+  double t4 = usecond();
  int lvol = _Umu.Grid()->lSites();
  int DimRep = Impl::Dimension;

+  double t5 = usecond();
  {
    autoView(CTv,CloverTerm,CpuRead);
    autoView(CTIv,CloverTermInv,CpuWrite);
@@ -100,7 +146,7 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
      grid->LocalIndexToLocalCoor(site, lcoor);
      Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
      Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
-      typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
+      typename SiteClover::scalar_object Qx = Zero(), Qxinv = Zero();
      peekLocalSite(Qx, CTv, lcoor);
      //if (csw!=0){
      for (int j = 0; j < Ns; j++)
@@ -125,6 +171,7 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
    });
  }

+  double t6 = usecond();
  // Separate the even and odd parts
  pickCheckerboard(Even, CloverTermEven, CloverTerm);
  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
@@ -137,6 +184,20 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)

  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
+  double t7 = usecond();
+
+#if 0
+  std::cout << GridLogMessage << "WilsonCloverFermion::ImportGauge timings:"
+            << " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6
+            << ", allocations = "               << (t2 - t1) / 1e6
+            << ", field strength = "            << (t3 - t2) / 1e6
+            << ", fill clover = "               << (t4 - t3) / 1e6
+            << ", misc = "                      << (t5 - t4) / 1e6
+            << ", inversions = "                << (t6 - t5) / 1e6
+            << ", pick cbs = "                  << (t7 - t6) / 1e6
+            << ", total = "                     << (t7 - t0) / 1e6
+            << std::endl;
+#endif
 }

 template <class Impl>
@@ -167,7 +228,7 @@ template <class Impl>
 void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
 {
  out.Checkerboard() = in.Checkerboard();
-  CloverFieldType *Clover;
+  CloverField *Clover;
  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);

  if (dag)
@@ -182,12 +243,12 @@ void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionFie
      {
        Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
      }
-      out = *Clover * in;
+      Helpers::multCloverField(out, *Clover, in);
    }
    else
    {
      Clover = (inv) ? &CloverTermInv : &CloverTerm;
-      out = adj(*Clover) * in;
+      Helpers::multCloverField(out, *Clover, in); // don't bother with adj, hermitian anyway
    }
  }
  else
@@ -205,18 +266,98 @@ void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionFie
        //  std::cout << "Calling clover term Even" << std::endl;
        Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
      }
-      out = *Clover * in;
+      Helpers::multCloverField(out, *Clover, in);
      //  std::cout << GridLogMessage << "*Clover.Checkerboard() "  << (*Clover).Checkerboard() << std::endl;
    }
    else
    {
      Clover = (inv) ? &CloverTermInv : &CloverTerm;
-      out = *Clover * in;
+      Helpers::multCloverField(out, *Clover, in);
    }
  }
-
 } // MooeeInternal

+// MDeriv (unpreconditioned pseudofermions): force = Wilson hopping derivative (DhopDeriv, receives dag) + clover-term derivative built from X and Y
+template <class Impl>
+void WilsonCloverFermion<Impl>::MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
+{
+  conformable(X.Grid(), Y.Grid());
+  conformable(X.Grid(), force.Grid());
+  GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
+  GaugeField clover_force(force.Grid());
+  PropagatorField Lambda(force.Grid());
+
+  // Guido: Here we are hitting some performance issues:
+  // need to extract the components of the DoubledGaugeField
+  // for each call
+  // Possible solution
+  // Create a vector object to store them? (cons: wasting space)
+  std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
+
+  Impl::extractLinkField(U, this->Umu);
+
+  force = Zero();
+  // Derivative of the Wilson hopping term
+  this->DhopDeriv(force, X, Y, dag);
+
+  ///////////////////////////////////////////////////////////
+  // Clover term derivative
+  ///////////////////////////////////////////////////////////
+  Impl::outerProductImpl(Lambda, X, Y);
+  // sigma[] is consumed in row-major (mu,nu) order with the diagonal skipped, via 'count' below
+
+  Gamma::Algebra sigma[] = {
+      Gamma::Algebra::SigmaXY,
+      Gamma::Algebra::SigmaXZ,
+      Gamma::Algebra::SigmaXT,
+      Gamma::Algebra::MinusSigmaXY,
+      Gamma::Algebra::SigmaYZ,
+      Gamma::Algebra::SigmaYT,
+      Gamma::Algebra::MinusSigmaXZ,
+      Gamma::Algebra::MinusSigmaYZ,
+      Gamma::Algebra::SigmaZT,
+      Gamma::Algebra::MinusSigmaXT,
+      Gamma::Algebra::MinusSigmaYT,
+      Gamma::Algebra::MinusSigmaZT};
+
+  /*
+    sigma_{\mu \nu}=
+    | 0         sigma[0]  sigma[1]  sigma[2] |
+    | sigma[3]    0       sigma[4]  sigma[5] |
+    | sigma[6]  sigma[7]     0      sigma[8] |
+    | sigma[9]  sigma[10] sigma[11]   0      |
+  */
+
+  int count = 0;
+  clover_force = Zero();
+  for (int mu = 0; mu < 4; mu++)
+  {
+    force_mu = Zero();
+    for (int nu = 0; nu < 4; nu++)
+    {
+      if (mu == nu)
+        continue;
+      // Planes containing the time direction (index Tdir) take csw_t, purely spatial planes take csw_r
+      RealD factor;
+      if (nu == Tdir || mu == Tdir) // was '== 4', which is unreachable for mu,nu in [0,4)
+      {
+        factor = 2.0 * csw_t;
+      }
+      else
+      {
+        factor = 2.0 * csw_r;
+      }
+      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
+      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
+      force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu);                   // checked
+      count++;
+    }
+
+    pokeLorentz(clover_force, U[mu] * force_mu, mu);
+  }
+  // no overall csw rescaling here: csw_r / csw_t already enter through 'factor' above
+  force += clover_force;
+}

 // Derivative parts
 template <class Impl>
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h
@@ -77,23 +77,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define REGISTER

 #ifdef GRID_SIMT
-#define LOAD_CHIMU(ptype)		\
+#define LOAD_CHIMU(Ptype)		\
  {const SiteSpinor & ref (in[offset]);	\
-    Chimu_00=coalescedReadPermute<ptype>(ref()(0)(0),perm,lane);	\
-    Chimu_01=coalescedReadPermute<ptype>(ref()(0)(1),perm,lane);		\
-    Chimu_02=coalescedReadPermute<ptype>(ref()(0)(2),perm,lane);		\
-    Chimu_10=coalescedReadPermute<ptype>(ref()(1)(0),perm,lane);		\
-    Chimu_11=coalescedReadPermute<ptype>(ref()(1)(1),perm,lane);		\
-    Chimu_12=coalescedReadPermute<ptype>(ref()(1)(2),perm,lane);		\
-    Chimu_20=coalescedReadPermute<ptype>(ref()(2)(0),perm,lane);		\
-    Chimu_21=coalescedReadPermute<ptype>(ref()(2)(1),perm,lane);		\
-    Chimu_22=coalescedReadPermute<ptype>(ref()(2)(2),perm,lane);		\
-    Chimu_30=coalescedReadPermute<ptype>(ref()(3)(0),perm,lane);		\
-    Chimu_31=coalescedReadPermute<ptype>(ref()(3)(1),perm,lane);		\
-    Chimu_32=coalescedReadPermute<ptype>(ref()(3)(2),perm,lane);	}
+    Chimu_00=coalescedReadPermute<Ptype>(ref()(0)(0),perm,lane);	\
+    Chimu_01=coalescedReadPermute<Ptype>(ref()(0)(1),perm,lane);		\
+    Chimu_02=coalescedReadPermute<Ptype>(ref()(0)(2),perm,lane);		\
+    Chimu_10=coalescedReadPermute<Ptype>(ref()(1)(0),perm,lane);		\
+    Chimu_11=coalescedReadPermute<Ptype>(ref()(1)(1),perm,lane);		\
+    Chimu_12=coalescedReadPermute<Ptype>(ref()(1)(2),perm,lane);		\
+    Chimu_20=coalescedReadPermute<Ptype>(ref()(2)(0),perm,lane);		\
+    Chimu_21=coalescedReadPermute<Ptype>(ref()(2)(1),perm,lane);		\
+    Chimu_22=coalescedReadPermute<Ptype>(ref()(2)(2),perm,lane);		\
+    Chimu_30=coalescedReadPermute<Ptype>(ref()(3)(0),perm,lane);		\
+    Chimu_31=coalescedReadPermute<Ptype>(ref()(3)(1),perm,lane);		\
+    Chimu_32=coalescedReadPermute<Ptype>(ref()(3)(2),perm,lane);	}
 #define PERMUTE_DIR(dir) ;
 #else
-#define LOAD_CHIMU(ptype)		\
+#define LOAD_CHIMU(Ptype)		\
  {const SiteSpinor & ref (in[offset]);	\
    Chimu_00=ref()(0)(0);\
    Chimu_01=ref()(0)(1);\
@@ -371,10 +371,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  result_32-= UChi_12;

 #define HAND_STENCIL_LEGB(PROJ,PERM,DIR,RECON)	\
+  {int ptype;					\
   SE=st.GetEntry(ptype,DIR,ss);		\
-  offset = SE->_offset;				\
-  local  = SE->_is_local;			\
-  perm   = SE->_permute;			\
+   auto offset = SE->_offset;			\
+   auto local  = SE->_is_local;			\
+   auto perm   = SE->_permute;			\
   if ( local ) {				\
     LOAD_CHIMU(PERM);				\
     PROJ;					\
@@ -386,14 +387,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   }						\
   acceleratorSynchronise();			\
   MULT_2SPIN(DIR);				\
-  RECON;					
+   RECON;					}

 #define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)		\
-  SE=&st_p[DIR+8*ss];				\
-  ptype=st_perm[DIR];				\
-  offset = SE->_offset;				\
-  local  = SE->_is_local;			\
-  perm   = SE->_permute;			\
+  { SE=&st_p[DIR+8*ss];						\
+  auto ptype=st_perm[DIR];					\
+  auto offset = SE->_offset;					\
+  auto local  = SE->_is_local;					\
+  auto perm   = SE->_permute;					\
  if ( local ) {						\
    LOAD_CHIMU(PERM);						\
    PROJ;							\
@@ -405,24 +406,25 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  }								\
  acceleratorSynchronise();					\
  MULT_2SPIN(DIR);						\
-  RECON;					
+  RECON;					}

 #define HAND_STENCIL_LEGA(PROJ,PERM,DIR,RECON)				\
-  SE=&st_p[DIR+8*ss];							\
-  ptype=st_perm[DIR];							\
+  { SE=&st_p[DIR+8*ss];							\
+    auto ptype=st_perm[DIR];						\
    /*SE=st.GetEntry(ptype,DIR,ss);*/					\
-  offset = SE->_offset;				\
-  perm   = SE->_permute;			\
+    auto offset = SE->_offset;						\
+    auto perm   = SE->_permute;						\
    LOAD_CHIMU(PERM);							\
    PROJ;								\
    MULT_2SPIN(DIR);							\
-  RECON;					
+    RECON;					}

 #define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON)	\
+  { int ptype;						\
  SE=st.GetEntry(ptype,DIR,ss);				\
-  offset = SE->_offset;				\
-  local  = SE->_is_local;			\
-  perm   = SE->_permute;			\
+  auto offset = SE->_offset;					\
+  auto local  = SE->_is_local;					\
+  auto perm   = SE->_permute;					\
  if ( local ) {						\
    LOAD_CHIMU(PERM);						\
    PROJ;							\
@@ -437,18 +439,19 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    MULT_2SPIN(DIR);						\
    RECON;							\
  }								\
-  acceleratorSynchronise();			
+  acceleratorSynchronise();			}

 #define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON)	\
+  { int ptype;						\
  SE=st.GetEntry(ptype,DIR,ss);				\
-  offset = SE->_offset;				\
+  auto offset = SE->_offset;				\
  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {		\
    LOAD_CHI;						\
    MULT_2SPIN(DIR);					\
    RECON;						\
    nmu++;						\
  }							\
-  acceleratorSynchronise();			
+  acceleratorSynchronise();			}

 #define HAND_RESULT(ss)					\
  {							\
@@ -563,7 +566,6 @@ WilsonKernels<Impl>::HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p,

  HAND_DECLARATIONS(Simt);

-  int offset,local,perm, ptype;
  StencilEntry *SE;
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
@@ -593,9 +595,7 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site

  HAND_DECLARATIONS(Simt);

-  int offset,local,perm, ptype;
  StencilEntry *SE;
-
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
@@ -623,8 +623,6 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
  HAND_DECLARATIONS(Simt);

  StencilEntry *SE;
-  int offset,local,perm, ptype;
-  
  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
@@ -640,8 +638,8 @@ template<class Impl>  accelerator_inline void
 WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_p = st._entries_p;						
+  //  auto st_perm = st._permute_type;					
 // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@@ -652,7 +650,6 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si

  HAND_DECLARATIONS(Simt);

-  int offset,local,perm, ptype;
  StencilEntry *SE;
  ZERO_RESULT;
  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
@@ -670,8 +667,8 @@ template<class Impl> accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_p = st._entries_p;						
+  //  auto st_perm = st._permute_type;					
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
@@ -682,7 +679,6 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
  HAND_DECLARATIONS(Simt);

  StencilEntry *SE;
-  int offset,local,perm, ptype;
  ZERO_RESULT;
  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
@@ -699,8 +695,8 @@ template<class Impl>  accelerator_inline void
 WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_p = st._entries_p;						
+  //  auto st_perm = st._permute_type;					
 // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@@ -711,7 +707,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si

  HAND_DECLARATIONS(Simt);

-  int offset, ptype;
+  //  int offset, ptype;
  StencilEntry *SE;
  int nmu=0;
  ZERO_RESULT;
@@ -730,8 +726,8 @@ template<class Impl>  accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_p = st._entries_p;						
+  //  auto st_perm = st._permute_type;					
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
@@ -742,7 +738,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldVi
  HAND_DECLARATIONS(Simt);

  StencilEntry *SE;
-  int offset, ptype;
+  //  int offset, ptype;
  int nmu=0;
  ZERO_RESULT;
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
--- a/Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master
+++ b/Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master
@@ -0,0 +1,41 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/ qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master
+
+    Copyright (C) 2017 - 2022
+
+    Author: paboyle <paboyle@ph.ed.ac.uk>
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/Grid.h>
+#include <Grid/qcd/spin/Dirac.h>
+#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
+#include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h>
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h"
+template class CompactWilsonCloverFermion<IMPLEMENTATION>; 
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplD/CompactWilsonCloverFermionInstantiationWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD/CompactWilsonCloverFermionInstantiationWilsonImplD.cc
@@ -0,0 +1 @@
+../CompactWilsonCloverFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplF/CompactWilsonCloverFermionInstantiationWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplF/CompactWilsonCloverFermionInstantiationWilsonImplF.cc
@@ -0,0 +1 @@
+../CompactWilsonCloverFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh
+++ b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh
@@ -40,7 +40,7 @@ EOF

 done

-CC_LIST="WilsonCloverFermionInstantiation WilsonFermionInstantiation WilsonKernelsInstantiation WilsonTMFermionInstantiation"
+CC_LIST="WilsonCloverFermionInstantiation CompactWilsonCloverFermionInstantiation WilsonFermionInstantiation WilsonKernelsInstantiation WilsonTMFermionInstantiation"

 for impl in $WILSON_IMPL_LIST
 do
--- a/Grid/qcd/action/gauge/GaugeImplementations.h
+++ b/Grid/qcd/action/gauge/GaugeImplementations.h
@@ -69,6 +69,11 @@ public:
    return PeriodicBC::ShiftStaple(Link,mu);
  }

+  //Shift Link by 'shift' in direction mu by delegating to PeriodicBC::CshiftLink; same as Cshift for periodic BCs
+  static inline GaugeLinkField CshiftLink(const GaugeLinkField &Link, int mu, int shift){
+    return PeriodicBC::CshiftLink(Link,mu,shift);
+  }
+
  static inline bool isPeriodicGaugeField(void) { return true; }
 };

@@ -110,6 +115,11 @@ public:
      return PeriodicBC::CovShiftBackward(Link, mu, field);
  }

+  //If mu is a conjugate BC direction
+  //Out(x) = U^dag_\mu(x-mu)  | x_\mu != 0
+  //       = U^T_\mu(L-1)  | x_\mu == 0
+  //else
+  //Out(x) = U^dag_\mu(x-mu mod L)
  static inline GaugeLinkField
  CovShiftIdentityBackward(const GaugeLinkField &Link, int mu)
  {
@@ -129,6 +139,13 @@ public:
      return PeriodicBC::CovShiftIdentityForward(Link,mu);
  }

+
+  //If mu is a conjugate BC direction
+  //Out(x) = S_\mu(x+mu)  | x_\mu != L-1
+  //       = S*_\mu(x+mu)  | x_\mu == L-1
+  //else
+  //Out(x) = S_\mu(x+mu mod L)
+  //Note: While this is used for Staples it is also applicable for shifting gauge links or gauge transformation matrices
  static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu)
  {
    assert(_conjDirs.size() == Nd);
@@ -138,6 +155,27 @@ public:
      return PeriodicBC::ShiftStaple(Link,mu);
  }

+  //Boundary-aware C-shift of gauge links / gauge transformation matrices
+  //If mu is a conjugate BC direction (_conjDirs[mu] is set)
+  //shift = 1
+  //Out(x) = U_\mu(x+\hat\mu)  | x_\mu != L-1
+  //       = U*_\mu(0)  | x_\mu == L-1
+  //shift = -1
+  //Out(x) = U_\mu(x-\hat\mu)  | x_\mu != 0
+  //       = U*_\mu(L-1)  | x_\mu == 0
+  //else
+  //shift = 1
+  //Out(x) = U_\mu(x+\hat\mu mod L)
+  //shift = -1
+  //Out(x) = U_\mu(x-\hat\mu mod L)
+  static inline GaugeLinkField CshiftLink(const GaugeLinkField &Link, int mu, int shift){
+    assert(_conjDirs.size() == Nd);
+    if(_conjDirs[mu]) 
+      return ConjugateBC::CshiftLink(Link,mu,shift);
+    else     
+      return PeriodicBC::CshiftLink(Link,mu,shift);
+  }
+
  static inline void       setDirections(std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
  static inline std::vector<int> getDirections(void) { return _conjDirs; }
  static inline bool isPeriodicGaugeField(void) { return false; }
--- a/Grid/qcd/action/pseudofermion/Bounds.h
+++ b/Grid/qcd/action/pseudofermion/Bounds.h
@@ -40,13 +40,66 @@ NAMESPACE_BEGIN(Grid);
      X=X-Y;
      RealD Nd = norm2(X);
      std::cout << "************************* "<<std::endl;
-      std::cout << " noise                         = "<<Nx<<std::endl;
-      std::cout << " (MdagM^-1/2)^2  noise         = "<<Nz<<std::endl;
-      std::cout << " MdagM (MdagM^-1/2)^2  noise   = "<<Ny<<std::endl;
-      std::cout << " noise - MdagM (MdagM^-1/2)^2  noise   = "<<Nd<<std::endl;
+      std::cout << " | noise |^2                         = "<<Nx<<std::endl;
+      std::cout << " | (MdagM^-1/2)^2  noise |^2         = "<<Nz<<std::endl;
+      std::cout << " | MdagM (MdagM^-1/2)^2  noise |^2   = "<<Ny<<std::endl;
+      std::cout << " | noise - MdagM (MdagM^-1/2)^2  noise |^2  = "<<Nd<<std::endl;
+      std::cout << " | noise - MdagM (MdagM^-1/2)^2  noise|/|noise| = " << std::sqrt(Nd/Nx) << std::endl;
      std::cout << "************************* "<<std::endl;
      assert( (std::sqrt(Nd/Nx)<tol) && " InverseSqrtBoundsCheck ");
    }

+    /* For HermOp = M^dag M, check the approximation of HermOp^{-1/inv_pow} by asserting
+       |X - HermOp * [ HermOp^{-1/inv_pow} ]^{inv_pow} X| / |X| < tol
+       for noise X (aka GaussNoise); the intermediate squared norms are printed to std::cout.
+       ApproxNegPow should be the rational approximation for   x^{-1/inv_pow}
+    */
+    template<class Field> void InversePowerBoundsCheck(int inv_pow,
+						       int MaxIter,double tol,
+						       LinearOperatorBase<Field> &HermOp,
+						       Field &GaussNoise,
+						       MultiShiftFunction &ApproxNegPow) 
+    {
+      GridBase *FermionGrid = GaussNoise.Grid();
+
+      Field X(FermionGrid);
+      Field Y(FermionGrid);
+      Field Z(FermionGrid);
+
+      Field tmp1(FermionGrid), tmp2(FermionGrid);
+
+      X=GaussNoise;
+      RealD Nx = norm2(X);
+
+      ConjugateGradientMultiShift<Field> msCG(MaxIter,ApproxNegPow);
+
+      tmp1 = X;
+      
+      Field* in = &tmp1;
+      Field* out = &tmp2;
+      for(int i=0;i<inv_pow;i++){ //apply  [ HermOp^{-1/inv_pow}  ]^{inv_pow} X =   HermOp^{-1} X, ping-ponging between tmp1 and tmp2
+	msCG(HermOp, *in, *out); //backwards conventions!
+	if(i!=inv_pow-1) std::swap(in, out); //no swap after the last pass so *out holds the final result
+      }
+      Z = *out;
+
+      RealD Nz = norm2(Z);
+
+      HermOp.HermOp(Z,Y);
+      RealD Ny = norm2(Y);
+
+      X=X-Y;
+      RealD Nd = norm2(X);
+      std::cout << "************************* "<<std::endl;
+      std::cout << " | noise |^2                         = "<<Nx<<std::endl;
+      std::cout << " | (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2        = "<<Nz<<std::endl;
+      std::cout << " | MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2   = "<<Ny<<std::endl;
+      std::cout << " | noise - MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2  = "<<Nd<<std::endl;
+      std::cout << " | noise - MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |/| noise |  = "<<std::sqrt(Nd/Nx)<<std::endl;
+      std::cout << "************************* "<<std::endl;
+      assert( (std::sqrt(Nd/Nx)<tol) && " InversePowerBoundsCheck ");
+    }
+
+
 NAMESPACE_END(Grid);

--- a/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h
+++ b/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h
@@ -44,6 +44,10 @@ NAMESPACE_BEGIN(Grid);
  // Exact one flavour implementation of DWF determinant ratio //
  ///////////////////////////////////////////////////////////////

+  //Note: using mixed prec CG for the heatbath solver in this action class will not work
+  //      because the L, R operators must have their shift coefficients updated throughout the heatbath step
+  //      You will find that the heatbath solver simply won't converge.
+  //      To use mixed precision here use the ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction variant below
  template<class Impl>
  class ExactOneFlavourRatioPseudoFermionAction : public Action<typename Impl::GaugeField>
  {
@@ -57,37 +61,60 @@ NAMESPACE_BEGIN(Grid);
      bool use_heatbath_forecasting;
      AbstractEOFAFermion<Impl>& Lop; // the basic LH operator
      AbstractEOFAFermion<Impl>& Rop; // the basic RH operator
-      SchurRedBlackDiagMooeeSolve<FermionField> SolverHB;
+      SchurRedBlackDiagMooeeSolve<FermionField> SolverHBL;
+      SchurRedBlackDiagMooeeSolve<FermionField> SolverHBR;
      SchurRedBlackDiagMooeeSolve<FermionField> SolverL;
      SchurRedBlackDiagMooeeSolve<FermionField> SolverR;
      SchurRedBlackDiagMooeeSolve<FermionField> DerivativeSolverL;
      SchurRedBlackDiagMooeeSolve<FermionField> DerivativeSolverR;
      FermionField Phi; // the pseudofermion field for this trajectory

+      RealD norm2_eta; //|eta|^2 where eta is the random gaussian field used to generate the pseudofermion field
+      bool initial_action; //true for the first call to S after refresh, for which the identity S = |eta|^2 holds provided the rational approx is good
    public:

+      //Heatbath hook: set the shift coefficient of the L operator (LorR==0) or of the R operator (any other LorR) to 'to'
+      virtual void heatbathRefreshShiftCoefficients(int LorR, RealD to){
+	if(LorR == 0) Lop.RefreshShiftCoefficients(to);
+	else          Rop.RefreshShiftCoefficients(to);
+      }
+
+
+      //Use the same solver for L,R in all cases
      ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, 
 					      AbstractEOFAFermion<Impl>& _Rop,
 					      OperatorFunction<FermionField>& CG, 
 					      Params& p, 
 					      bool use_fc=false) 
-	: ExactOneFlavourRatioPseudoFermionAction(_Lop,_Rop,CG,CG,CG,CG,CG,p,use_fc) {};
+	: ExactOneFlavourRatioPseudoFermionAction(_Lop,_Rop,CG,CG,CG,CG,CG,CG,p,use_fc) {};

+      //Use the same solver for L,R in the heatbath but different solvers elsewhere
      ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, 
 					      AbstractEOFAFermion<Impl>& _Rop,
 					      OperatorFunction<FermionField>& HeatbathCG,
 					      OperatorFunction<FermionField>& ActionCGL, OperatorFunction<FermionField>& ActionCGR, 
 					      OperatorFunction<FermionField>& DerivCGL , OperatorFunction<FermionField>& DerivCGR, 
 					      Params& p, 
+					      bool use_fc=false)
+	: ExactOneFlavourRatioPseudoFermionAction(_Lop,_Rop,HeatbathCG,HeatbathCG, ActionCGL, ActionCGR, DerivCGL,DerivCGR,p,use_fc) {};
+
+      //Use different solvers for L,R in all cases
+      ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, 
+					      AbstractEOFAFermion<Impl>& _Rop,
+					      OperatorFunction<FermionField>& HeatbathCGL, OperatorFunction<FermionField>& HeatbathCGR,
+					      OperatorFunction<FermionField>& ActionCGL, OperatorFunction<FermionField>& ActionCGR, 
+					      OperatorFunction<FermionField>& DerivCGL , OperatorFunction<FermionField>& DerivCGR, 
+					      Params& p, 
 					      bool use_fc=false) : 
        Lop(_Lop), 
 	Rop(_Rop), 
-	SolverHB(HeatbathCG,false,true),
+	SolverHBL(HeatbathCGL,false,true), SolverHBR(HeatbathCGR,false,true),
 	SolverL(ActionCGL, false, true), SolverR(ActionCGR, false, true), 
 	DerivativeSolverL(DerivCGL, false, true), DerivativeSolverR(DerivCGR, false, true), 
 	Phi(_Lop.FermionGrid()), 
 	param(p), 
-        use_heatbath_forecasting(use_fc)
+	use_heatbath_forecasting(use_fc),
+	initial_action(false)
      {
        AlgRemez remez(param.lo, param.hi, param.precision);

@@ -97,6 +124,8 @@ NAMESPACE_BEGIN(Grid);
        PowerNegHalf.Init(remez, param.tolerance, true);
      };

+      const FermionField &getPhi() const{ return Phi; } //read-only access to the pseudofermion field Phi
+
      virtual std::string action_name() { return "ExactOneFlavourRatioPseudoFermionAction"; }

      virtual std::string LogParameters() {
@@ -117,6 +146,19 @@ NAMESPACE_BEGIN(Grid);
        else{ for(int s=0; s<Ls; ++s){ axpby_ssp_pminus(out, 0.0, in, 1.0, in, s, s); } }
      }

+      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
+        // Draw the noise eta with P(eta_o) = e^{- eta_o^dag eta_o} and hand it to refresh(U,eta).
+        // Matching e^{-x^2/(2 sig^2)} gives sig^2 = 0.5, so the gaussian() output
+        // is multiplied by sqrt(0.5) below.
+        // sRNG is not used by this overload.
+        RealD scale = std::sqrt(0.5);
+
+        FermionField eta    (Lop.FermionGrid());
+        gaussian(pRNG,eta); eta = eta * scale;
+
+	refresh(U,eta);
+      }
+
      // EOFA heatbath: see Eqn. (29) of arXiv:1706.05843
      // We generate a Gaussian noise vector \eta, and then compute
      //  \Phi = M_{\rm EOFA}^{-1/2} * \eta
@@ -124,12 +166,10 @@ NAMESPACE_BEGIN(Grid);
      //
      // As a check of rational require \Phi^dag M_{EOFA} \Phi == eta^dag M^-1/2^dag M M^-1/2 eta = eta^dag eta
      //
-      virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
-      {
+     void refresh(const GaugeField &U, const FermionField &eta) {
        Lop.ImportGauge(U);
        Rop.ImportGauge(U);

-        FermionField eta         (Lop.FermionGrid());
        FermionField CG_src      (Lop.FermionGrid());
        FermionField CG_soln     (Lop.FermionGrid());
        FermionField Forecast_src(Lop.FermionGrid());
@@ -140,11 +180,6 @@ NAMESPACE_BEGIN(Grid);
        if(use_heatbath_forecasting){ prev_solns.reserve(param.degree); }
        ChronoForecast<AbstractEOFAFermion<Impl>, FermionField> Forecast;

-        // Seed with Gaussian noise vector (var = 0.5)
-        RealD scale = std::sqrt(0.5);
-        gaussian(pRNG,eta);
-        eta = eta * scale;
-
        // \Phi = ( \alpha_{0} + \sum_{k=1}^{N_{p}} \alpha_{l} * \gamma_{l} ) * \eta
        RealD N(PowerNegHalf.norm);
        for(int k=0; k<param.degree; ++k){ N += PowerNegHalf.residues[k] / ( 1.0 + PowerNegHalf.poles[k] ); }
@@ -160,15 +195,16 @@ NAMESPACE_BEGIN(Grid);
        tmp[1] = Zero();
        for(int k=0; k<param.degree; ++k){
          gamma_l = 1.0 / ( 1.0 + PowerNegHalf.poles[k] );
-          Lop.RefreshShiftCoefficients(-gamma_l);
+          heatbathRefreshShiftCoefficients(0, -gamma_l);
+	  //Lop.RefreshShiftCoefficients(-gamma_l);
          if(use_heatbath_forecasting){ // Forecast CG guess using solutions from previous poles
            Lop.Mdag(CG_src, Forecast_src);
            CG_soln = Forecast(Lop, Forecast_src, prev_solns);
-            SolverHB(Lop, CG_src, CG_soln);
+            SolverHBL(Lop, CG_src, CG_soln);
            prev_solns.push_back(CG_soln);
          } else {
            CG_soln = Zero(); // Just use zero as the initial guess
-            SolverHB(Lop, CG_src, CG_soln);
+	    SolverHBL(Lop, CG_src, CG_soln);
          }
          Lop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
          tmp[1] = tmp[1] + ( PowerNegHalf.residues[k]*gamma_l*gamma_l*Lop.k ) * tmp[0];
@@ -187,15 +223,16 @@ NAMESPACE_BEGIN(Grid);
        if(use_heatbath_forecasting){ prev_solns.clear(); } // empirically, LH solns don't help for RH solves
        for(int k=0; k<param.degree; ++k){
          gamma_l = 1.0 / ( 1.0 + PowerNegHalf.poles[k] );
-          Rop.RefreshShiftCoefficients(-gamma_l*PowerNegHalf.poles[k]);
+	  heatbathRefreshShiftCoefficients(1, -gamma_l*PowerNegHalf.poles[k]);
+          //Rop.RefreshShiftCoefficients(-gamma_l*PowerNegHalf.poles[k]);
          if(use_heatbath_forecasting){
            Rop.Mdag(CG_src, Forecast_src);
            CG_soln = Forecast(Rop, Forecast_src, prev_solns);
-            SolverHB(Rop, CG_src, CG_soln);
+            SolverHBR(Rop, CG_src, CG_soln);
            prev_solns.push_back(CG_soln);
          } else {
            CG_soln = Zero();
-            SolverHB(Rop, CG_src, CG_soln);
+            SolverHBR(Rop, CG_src, CG_soln);
          }
          Rop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
          tmp[1] = tmp[1] - ( PowerNegHalf.residues[k]*gamma_l*gamma_l*Rop.k ) * tmp[0];
@@ -205,49 +242,119 @@ NAMESPACE_BEGIN(Grid);
        Phi = Phi + tmp[1];

        // Reset shift coefficients for energy and force evals
-        Lop.RefreshShiftCoefficients(0.0);
-        Rop.RefreshShiftCoefficients(-1.0);
+        //Lop.RefreshShiftCoefficients(0.0);
+        //Rop.RefreshShiftCoefficients(-1.0);
+	heatbathRefreshShiftCoefficients(0, 0.0);
+	heatbathRefreshShiftCoefficients(1, -1.0);
+
+	//Mark that the next call to S is the first after refresh
+	initial_action = true;
+

 	// Bounds check
 	RealD EtaDagEta = norm2(eta);
+	norm2_eta = EtaDagEta;
+
 	//	RealD PhiDagMPhi= norm2(eta);

      };

-      void Meofa(const GaugeField& U,const FermionField &phi, FermionField & Mphi) 
+      void Meofa(const GaugeField& U,const FermionField &in, FermionField & out) 
      {
-#if 0
        Lop.ImportGauge(U);
        Rop.ImportGauge(U);

-        FermionField spProj_Phi(Lop.FermionGrid());
-	FermionField mPhi(Lop.FermionGrid());
+        FermionField spProj_in(Lop.FermionGrid());
        std::vector<FermionField> tmp(2, Lop.FermionGrid());
-	mPhi = phi;
+	out = in;
 	
        // LH term: S = S - k <\Phi| P_{-} \Omega_{-}^{\dagger} H(mf)^{-1} \Omega_{-} P_{-} |\Phi>
-        spProj(Phi, spProj_Phi, -1, Lop.Ls);
-        Lop.Omega(spProj_Phi, tmp[0], -1, 0);
+        spProj(in, spProj_in, -1, Lop.Ls);
+        Lop.Omega(spProj_in, tmp[0], -1, 0);
        G5R5(tmp[1], tmp[0]);
        tmp[0] = Zero();
        SolverL(Lop, tmp[1], tmp[0]);
        Lop.Dtilde(tmp[0], tmp[1]); // We actually solved Cayley preconditioned system: transform back
        Lop.Omega(tmp[1], tmp[0], -1, 1);
-	mPhi = mPhi -  Lop.k * innerProduct(spProj_Phi, tmp[0]).real();
+	spProj(tmp[0], tmp[1], -1, Lop.Ls);
+
+	out = out -  Lop.k * tmp[1];

        // RH term: S = S + k <\Phi| P_{+} \Omega_{+}^{\dagger} ( H(mb)
-        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{-} P_{-} |\Phi>
-        spProj(Phi, spProj_Phi, 1, Rop.Ls);
-        Rop.Omega(spProj_Phi, tmp[0], 1, 0);
+        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} |\Phi>
+        spProj(in, spProj_in, 1, Rop.Ls);
+        Rop.Omega(spProj_in, tmp[0], 1, 0);
        G5R5(tmp[1], tmp[0]);
        tmp[0] = Zero();
        SolverR(Rop, tmp[1], tmp[0]);
        Rop.Dtilde(tmp[0], tmp[1]);
        Rop.Omega(tmp[1], tmp[0], 1, 1);
-        action += Rop.k * innerProduct(spProj_Phi, tmp[0]).real();
-#endif
+	spProj(tmp[0], tmp[1], 1, Rop.Ls);
+
+        out = out + Rop.k * tmp[1];
      }

+      //Due to the structure of EOFA, it is no more expensive to compute the inverse of Meofa
+      //To ensure correctness we can simply reuse the heatbath code but use the rational approx
+      //f(x) = 1/x   which corresponds to alpha_0=0,  alpha_1=1,  beta_1=0 => gamma_1=1
+      void MeofaInv(const GaugeField &U, const FermionField &in, FermionField &out) {
+        Lop.ImportGauge(U);
+        Rop.ImportGauge(U);
+        // Applies the heatbath construction with the single-pole choice alpha_0=0, alpha_1=1, beta_1=0 (gamma_1=1) to 'in', writing the result to 'out'
+        FermionField CG_src      (Lop.FermionGrid());
+        FermionField CG_soln     (Lop.FermionGrid());
+        std::vector<FermionField> tmp(2, Lop.FermionGrid());
+
+        // Constant term: out = ( \alpha_{0} + \alpha_{1} * \gamma_{1} ) * in
+	// = 1 * in
+        out = in;
+
+        // LH term (single pole, so there is no sum over poles here):
+        // out = out + k P_{-} \Omega_{-}^{\dagger} ( H(mf)
+        //          - \gamma_{1} \Delta_{-}(mf,mb) P_{-} )^{-1} \Omega_{-} P_{-} in
+        spProj(in, tmp[0], -1, Lop.Ls);
+        Lop.Omega(tmp[0], tmp[1], -1, 0);
+        G5R5(CG_src, tmp[1]);
+        {
+          heatbathRefreshShiftCoefficients(0, -1.); //-gamma_1 = -1.
+
+	  CG_soln = Zero(); // Just use zero as the initial guess
+	  SolverHBL(Lop, CG_src, CG_soln);
+
+          Lop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
+          tmp[1] = Lop.k * tmp[0];
+        }
+        Lop.Omega(tmp[1], tmp[0], -1, 1);
+        spProj(tmp[0], tmp[1], -1, Lop.Ls);
+        out = out + tmp[1];
+
+        // RH term (single pole, so there is no sum over poles here):
+        // out = out - k P_{+} \Omega_{+}^{\dagger} ( H(mb)
+        //          - \beta_{1}\gamma_{1} \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} in
+        spProj(in, tmp[0], 1, Rop.Ls);
+        Rop.Omega(tmp[0], tmp[1], 1, 0);
+        G5R5(CG_src, tmp[1]);
+        {
+	  heatbathRefreshShiftCoefficients(1, 0.); //-gamma_1 * beta_1 = 0
+
+	  CG_soln = Zero(); // Zero initial guess, as for the LH solve
+	  SolverHBR(Rop, CG_src, CG_soln);
+
+          Rop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
+          tmp[1] = - Rop.k * tmp[0];
+        }
+        Rop.Omega(tmp[1], tmp[0], 1, 1);
+        spProj(tmp[0], tmp[1], 1, Rop.Ls);
+        out = out + tmp[1];
+
+        // Reset shift coefficients for energy and force evals (same values the heatbath refresh restores)
+	heatbathRefreshShiftCoefficients(0, 0.0);
+	heatbathRefreshShiftCoefficients(1, -1.0);
+      };
+
+
+
+
      // EOFA action: see Eqn. (10) of arXiv:1706.05843
      virtual RealD S(const GaugeField& U)
      {
@@ -271,7 +378,7 @@ NAMESPACE_BEGIN(Grid);
        action -= Lop.k * innerProduct(spProj_Phi, tmp[0]).real();

        // RH term: S = S + k <\Phi| P_{+} \Omega_{+}^{\dagger} ( H(mb)
-        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{-} P_{-} |\Phi>
+        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} |\Phi>
        spProj(Phi, spProj_Phi, 1, Rop.Ls);
        Rop.Omega(spProj_Phi, tmp[0], 1, 0);
        G5R5(tmp[1], tmp[0]);
@@ -281,6 +388,26 @@ NAMESPACE_BEGIN(Grid);
        Rop.Omega(tmp[1], tmp[0], 1, 1);
        action += Rop.k * innerProduct(spProj_Phi, tmp[0]).real();

+	if(initial_action){
+	  //For the first call to S after refresh,  S = |eta|^2. We can use this to ensure the rational approx is good
+	  RealD diff = action - norm2_eta;
+
+	  //S_init = eta^dag M^{-1/2} M M^{-1/2} eta
+	  //S_init - eta^dag eta =  eta^dag ( M^{-1/2} M M^{-1/2} - 1 ) eta
+
+	  //If approximate solution
+	  //S_init - eta^dag eta =  eta^dag ( [M^{-1/2}+\delta M^{-1/2}] M [M^{-1/2}+\delta M^{-1/2}] - 1 ) eta
+	  //               \approx  eta^dag ( \delta M^{-1/2} M^{1/2} + M^{1/2}\delta M^{-1/2} ) eta
+	  // We divide out |eta|^2 to remove source scaling but the tolerance on this check should still be somewhat higher than the actual approx tolerance
+	  RealD test = fabs(diff)/norm2_eta; //test the quality of the rational approx
+
+	  std::cout << GridLogMessage << action_name() << " initial action " << action << " expect " << norm2_eta << "; diff " << diff << std::endl;
+	  std::cout << GridLogMessage << action_name() << "[ eta^dag ( M^{-1/2} M M^{-1/2} - 1 ) eta ]/|eta^2| = " << test << "  expect 0 (tol " << param.BoundsCheckTol << ")" << std::endl;
+
+	  assert( ( test < param.BoundsCheckTol ) && " Initial action check failed" );
+	  initial_action = false;
+	}
+
        return action;
      };

@@ -329,6 +456,40 @@ NAMESPACE_BEGIN(Grid);
      };
  };

+  //EOFA pseudofermion action that additionally holds a second pair of L/R operators built on ImplF.
+  //Its heatbath shift-coefficient refresh updates the ImplF operator and then the base-class (ImplD) one.
+  template<class ImplD, class ImplF>
+  class ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction : public ExactOneFlavourRatioPseudoFermionAction<ImplD>{
+  public:
+    INHERIT_IMPL_TYPES(ImplD);
+    typedef OneFlavourRationalParams Params;
+
+  private:
+    AbstractEOFAFermion<ImplF>& LopF; // the basic LH operator (ImplF instance)
+    AbstractEOFAFermion<ImplF>& RopF; // the basic RH operator (ImplF instance)
+
+  public:
+    virtual std::string action_name() { return "ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction"; }
+
+    //Used in the heatbath, refresh the shift coefficients of the L (LorR=0) or R (LorR=1) operator
+    //Any LorR value other than 0 selects the R operator here; the same call is then forwarded to the base class
+    virtual void heatbathRefreshShiftCoefficients(int LorR, RealD to){
+      AbstractEOFAFermion<ImplF> &op = LorR == 0 ? LopF : RopF;
+      op.RefreshShiftCoefficients(to);
+      this->ExactOneFlavourRatioPseudoFermionAction<ImplD>::heatbathRefreshShiftCoefficients(LorR,to);
+    }
+
+    //The base class is always constructed before the members, so it is listed first in the initializer list
+    ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction(AbstractEOFAFermion<ImplF>& _LopF, AbstractEOFAFermion<ImplF>& _RopF,
+							     AbstractEOFAFermion<ImplD>& _LopD, AbstractEOFAFermion<ImplD>& _RopD,
+							     OperatorFunction<FermionField>& HeatbathCGL, OperatorFunction<FermionField>& HeatbathCGR,
+							     OperatorFunction<FermionField>& ActionCGL, OperatorFunction<FermionField>& ActionCGR, 
+							     OperatorFunction<FermionField>& DerivCGL , OperatorFunction<FermionField>& DerivCGR, 
+							     Params& p, bool use_fc=false) : 
+    ExactOneFlavourRatioPseudoFermionAction<ImplD>(_LopD, _RopD, HeatbathCGL, HeatbathCGR, ActionCGL, ActionCGR, DerivCGL, DerivCGR, p, use_fc), LopF(_LopF), RopF(_RopF){}
+  };
+
+
 NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
@@ -0,0 +1,372 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
+
+    Copyright (C) 2015
+
+    Author: Christopher Kelly <ckelly@bnl.gov>
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_H
+#define QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_H
+
+NAMESPACE_BEGIN(Grid);
+
+    /////////////////////////////////////////////////////////
+    // Generic rational approximation for ratios of operators
+    /////////////////////////////////////////////////////////
+
+    /* S_f = -log( det(  [M^dag M]/[V^dag V] )^{1/inv_pow}  )
+           = chi^dag ( [M^dag M]/[V^dag V] )^{-1/inv_pow} chi
+	   = chi^dag ( [V^dag V]^{-1/2} [M^dag M] [V^dag V]^{-1/2} )^{-1/inv_pow} chi
+	   = chi^dag [V^dag V]^{1/(2*inv_pow)} [M^dag M]^{-1/inv_pow} [V^dag V]^{1/(2*inv_pow)} chi
+
+	   S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+    
+       BIG WARNING:	   
+       Here V^dag V is referred to in this code as the "numerator" operator and M^dag M is the *denominator* operator.
+       this refers to their position in the pseudofermion action, which is the *inverse* of what appears in the determinant
+       Thus for DWF the numerator operator is the Pauli-Villars operator
+
+       Here P/Q \sim R_{1/(2*inv_pow)}  ~ (V^dagV)^{1/(2*inv_pow)}  
+       Here N/D \sim R_{-1/inv_pow} ~ (M^dagM)^{-1/inv_pow}  
+    */
+      
+    template<class Impl>
+    class GeneralEvenOddRatioRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
+    public:
+
+      INHERIT_IMPL_TYPES(Impl);
+
+      typedef RationalActionParams Params;
+      Params param;
+
+      //For action evaluation
+      MultiShiftFunction ApproxPowerAction   ;  //rational approx for X^{1/inv_pow}
+      MultiShiftFunction ApproxNegPowerAction;  //rational approx for X^{-1/inv_pow}
+      MultiShiftFunction ApproxHalfPowerAction;   //rational approx for X^{1/(2*inv_pow)}
+      MultiShiftFunction ApproxNegHalfPowerAction; //rational approx for X^{-1/(2*inv_pow)}
+
+      //For the MD integration
+      MultiShiftFunction ApproxPowerMD   ;  //rational approx for X^{1/inv_pow}
+      MultiShiftFunction ApproxNegPowerMD;  //rational approx for X^{-1/inv_pow}
+      MultiShiftFunction ApproxHalfPowerMD;   //rational approx for X^{1/(2*inv_pow)}
+      MultiShiftFunction ApproxNegHalfPowerMD; //rational approx for X^{-1/(2*inv_pow)}
+
+    private:
+     
+      FermionOperator<Impl> & NumOp;// the basic operator
+      FermionOperator<Impl> & DenOp;// the basic operator
+      FermionField PhiEven; // the pseudo fermion field for this trajectory
+      FermionField PhiOdd; // the pseudo fermion field for this trajectory
+
+      //Generate the approximation to x^{1/inv_pow} (->approx)   and x^{-1/inv_pow} (-> approx_inv)  by an approx_degree degree rational approximation
+      //CG_tolerance is used to issue a warning if the approximation error is larger than the tolerance of the CG and is otherwise just stored in the MultiShiftFunction for use by the multi-shift
+      static void generateApprox(MultiShiftFunction &approx, MultiShiftFunction &approx_inv, int inv_pow, int approx_degree, double CG_tolerance, AlgRemez &remez){
+	std::cout<<GridLogMessage << "Generating degree "<< approx_degree<<" approximation for x^(1/" << inv_pow << ")"<<std::endl;
+	double remez_err = remez.generateApprox(approx_degree,1,inv_pow);
+	if(remez_err > CG_tolerance){ //warn only: the approximation is still used below
+	  std::cout<<GridLogMessage << "WARNING: Remez approximation has a larger error " << remez_err << " than the CG tolerance " << CG_tolerance << "! Try increasing the number of poles" << std::endl;
+	}
+	approx.Init(remez, CG_tolerance,false);    //fills 'approx'     (x^{1/inv_pow})
+	approx_inv.Init(remez, CG_tolerance,true); //fills 'approx_inv' (x^{-1/inv_pow})
+      }
+
+
+    protected:
+      static constexpr bool Numerator = true;
+      static constexpr bool Denominator = false;
+
+      //Allow derived classes to override the multishift CG
+      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionField &in, FermionField &out){
+	FermionOperator<Impl> &op = numerator ? NumOp : DenOp; //Numerator -> NumOp, Denominator -> DenOp
+	SchurDifferentiableOperator<Impl> linop(op);
+	ConjugateGradientMultiShift<FermionField> solver(MaxIter, approx); solver(linop,in, out);
+      }
+      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionField &in, std::vector<FermionField> &out_elems, FermionField &out){
+	FermionOperator<Impl> &op = numerator ? NumOp : DenOp; //Numerator -> NumOp, Denominator -> DenOp
+	SchurDifferentiableOperator<Impl> linop(op);
+	ConjugateGradientMultiShift<FermionField> solver(MaxIter, approx); solver(linop,in, out_elems, out);
+      }
+      //Allow derived classes to override the gauge import
+      virtual void ImportGauge(const GaugeField &U){ //called at the start of refresh, S and deriv
+	NumOp.ImportGauge(U); //numerator operator
+	DenOp.ImportGauge(U); //denominator operator
+      }
+      
+    public:
+
+      GeneralEvenOddRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
+						     FermionOperator<Impl>  &_DenOp, 
+						     const Params & p
+						     ) : 
+	param(p),  //initializers are written in member declaration order, which is the order they run in
+	NumOp(_NumOp), 
+	DenOp(_DenOp), 
+	PhiEven(_NumOp.FermionRedBlackGrid()),
+	PhiOdd (_NumOp.FermionRedBlackGrid())
+      {
+	std::cout<<GridLogMessage << action_name() << " initialize: starting" << std::endl;
+	AlgRemez remez(param.lo,param.hi,param.precision);
+	//The same AlgRemez instance is reused for every approximation generated below
+	//Generate approximations for action eval
+	generateApprox(ApproxPowerAction, ApproxNegPowerAction, param.inv_pow, param.action_degree, param.action_tolerance, remez);
+	generateApprox(ApproxHalfPowerAction, ApproxNegHalfPowerAction, 2*param.inv_pow, param.action_degree, param.action_tolerance, remez);
+
+	//Generate approximations for MD
+	if(param.md_degree != param.action_degree){ //note the CG tolerance is unrelated to the stopping condition of the Remez algorithm
+	  generateApprox(ApproxPowerMD, ApproxNegPowerMD, param.inv_pow, param.md_degree, param.md_tolerance, remez);
+	  generateApprox(ApproxHalfPowerMD, ApproxNegHalfPowerMD, 2*param.inv_pow, param.md_degree, param.md_tolerance, remez);
+	}else{
+	  std::cout<<GridLogMessage << "Using same rational approximations for MD as for action evaluation" << std::endl;
+	  ApproxPowerMD = ApproxPowerAction; 
+	  ApproxNegPowerMD = ApproxNegPowerAction;
+	  for(int i=0;i<ApproxPowerMD.tolerances.size();i++)
+	    ApproxNegPowerMD.tolerances[i] = ApproxPowerMD.tolerances[i] = param.md_tolerance; //used for multishift
+	  //The copies carry the action tolerance; each loop overwrites it and is bounded by the object it indexes
+	  ApproxHalfPowerMD = ApproxHalfPowerAction;
+	  ApproxNegHalfPowerMD = ApproxNegHalfPowerAction;
+	  for(int i=0;i<ApproxHalfPowerMD.tolerances.size();i++)
+	    ApproxNegHalfPowerMD.tolerances[i] = ApproxHalfPowerMD.tolerances[i] = param.md_tolerance;
+	}
+
+	std::cout<<GridLogMessage << action_name() << " initialize: complete" << std::endl;
+      };
+
+      virtual std::string action_name(){return "GeneralEvenOddRatioRationalPseudoFermionAction";} //prefix used in this class's log messages
+
+      virtual std::string LogParameters(){ //one line per entry of 'param', each tagged with action_name()
+	std::stringstream report;
+	report << GridLogMessage << "["<<action_name()<<"] Power              : 1/" << param.inv_pow <<  std::endl;
+	report << GridLogMessage << "["<<action_name()<<"] Low                :" << param.lo <<  std::endl;
+	report << GridLogMessage << "["<<action_name()<<"] High               :" << param.hi <<  std::endl;
+	report << GridLogMessage << "["<<action_name()<<"] Max iterations     :" << param.MaxIter <<  std::endl;
+	report << GridLogMessage << "["<<action_name()<<"] Tolerance (Action) :" << param.action_tolerance <<  std::endl;
+	report << GridLogMessage << "["<<action_name()<<"] Degree (Action)    :" << param.action_degree <<  std::endl;
+	report << GridLogMessage << "["<<action_name()<<"] Tolerance (MD)     :" << param.md_tolerance <<  std::endl;
+	report << GridLogMessage << "["<<action_name()<<"] Degree (MD)        :" << param.md_degree <<  std::endl;
+	report << GridLogMessage << "["<<action_name()<<"] Precision          :" << param.precision <<  std::endl;
+	return report.str();
+      }
+
+      //Access the fermion field
+      const FermionField &getPhiOdd() const{ return PhiOdd; } //read-only view of the odd-checkerboard field set by refresh
+      
+      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
+	std::cout<<GridLogMessage << action_name() << " refresh: starting" << std::endl;
+
+	// Target distribution is P(eta) \propto e^{- eta^dag eta}, whereas gaussian() draws
+	// x from P(x) \propto e^{- x^2 / 2 } [i.e. sigma=1]; hence eta = x * sqrt(1/2).
+	// sRNG is not used by this overload.
+	RealD inv_sqrt2 = std::sqrt(0.5);
+	FermionField eta(NumOp.FermionGrid());
+	gaussian(pRNG,eta);
+	eta = eta*inv_sqrt2;
+
+	refresh(U,eta); //hand over to the explicit-eta overload
+      }
+
+      //Allow for manual specification of random field for testing
+      void refresh(const GaugeField &U, const FermionField &eta) {
+	// Builds the pseudofermion field from a caller-supplied eta (odd checkerboard only; PhiEven is zeroed)
+	// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+	//
+	// P(phi) = e^{- phi^dag (VdagV)^1/(2*inv_pow) (MdagM)^-1/inv_pow (VdagV)^1/(2*inv_pow) phi}
+	//        = e^{- phi^dag  (VdagV)^1/(2*inv_pow) (MdagM)^-1/(2*inv_pow) (MdagM)^-1/(2*inv_pow)  (VdagV)^1/(2*inv_pow) phi}
+	//
+	// Phi =  (VdagV)^-1/(2*inv_pow) (MdagM)^{1/(2*inv_pow)} eta 
+	
+	std::cout<<GridLogMessage << action_name() << " refresh: starting" << std::endl;
+
+	FermionField etaOdd (NumOp.FermionRedBlackGrid());
+	FermionField etaEven(NumOp.FermionRedBlackGrid());
+	FermionField     tmp(NumOp.FermionRedBlackGrid());
+
+	pickCheckerboard(Even,etaEven,eta);
+	pickCheckerboard(Odd,etaOdd,eta);
+
+	ImportGauge(U);
+
+	// MdagM^1/(2*inv_pow) eta
+	std::cout<<GridLogMessage << action_name() << " refresh: doing (M^dag M)^{1/" << 2*param.inv_pow << "} eta" << std::endl;
+	multiShiftInverse(Denominator, ApproxHalfPowerAction, param.MaxIter, etaOdd, tmp);
+
+	// VdagV^-1/(2*inv_pow) MdagM^1/(2*inv_pow) eta
+	std::cout<<GridLogMessage << action_name() << " refresh: doing (V^dag V)^{-1/" << 2*param.inv_pow << "} ( (M^dag M)^{1/" << 2*param.inv_pow << "} eta)" << std::endl;
+	multiShiftInverse(Numerator, ApproxNegHalfPowerAction, param.MaxIter, tmp, PhiOdd);
+	// Only the odd checkerboard was solved for; the asserts below require ConstEE()==1 on both operators
+	assert(NumOp.ConstEE() == 1);
+	assert(DenOp.ConstEE() == 1);
+	PhiEven = Zero();
+	std::cout<<GridLogMessage << action_name() << " refresh: complete" << std::endl;
+      };
+
+      //////////////////////////////////////////////////////
+      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+      //////////////////////////////////////////////////////
+      virtual RealD S(const GaugeField &U) {
+	std::cout<<GridLogMessage << action_name() << " compute action: starting" << std::endl;
+	ImportGauge(U);
+	// Returns |Y|^2 with Y = (MdagM)^{-1/(2*inv_pow)} (VdagV)^{1/(2*inv_pow)} PhiOdd, using the *Action approximations
+	FermionField X(NumOp.FermionRedBlackGrid());
+	FermionField Y(NumOp.FermionRedBlackGrid());
+
+	// VdagV^1/(2*inv_pow) Phi
+	std::cout<<GridLogMessage << action_name() << " compute action: doing (V^dag V)^{1/" << 2*param.inv_pow << "} Phi" << std::endl;
+	multiShiftInverse(Numerator, ApproxHalfPowerAction, param.MaxIter, PhiOdd,X);
+
+	// MdagM^-1/(2*inv_pow) VdagV^1/(2*inv_pow) Phi
+	std::cout<<GridLogMessage << action_name() << " compute action: doing (M^dag M)^{-1/" << 2*param.inv_pow << "} ( (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
+	multiShiftInverse(Denominator, ApproxNegHalfPowerAction, param.MaxIter, X,Y);
+
+	// Randomly apply rational bounds checks.
+	int rcheck = rand(); // NOTE(review): rcheck is never read, but this call still advances rand()
+	auto grid = NumOp.FermionGrid();
+        auto r=rand();
+        grid->Broadcast(0,r); // presumably makes every rank use rank 0's value of r - confirm
+	// BoundsCheckFreq == 0 disables the check; otherwise it runs when r is a multiple of BoundsCheckFreq
+	if ( param.BoundsCheckFreq != 0 && (r % param.BoundsCheckFreq)==0 ) { 
+	  std::cout<<GridLogMessage << action_name() << " compute action: doing bounds check" << std::endl;
+	  FermionField gauss(NumOp.FermionRedBlackGrid());
+	  gauss = PhiOdd;
+	  SchurDifferentiableOperator<Impl> MdagM(DenOp);
+	  std::cout<<GridLogMessage << action_name() << " compute action: checking high bounds" << std::endl;
+	  HighBoundCheck(MdagM,gauss,param.hi);
+	  std::cout<<GridLogMessage << action_name() << " compute action: full approximation" << std::endl;
+	  InversePowerBoundsCheck(param.inv_pow,param.MaxIter,param.action_tolerance*100,MdagM,gauss,ApproxNegPowerAction);
+	  std::cout<<GridLogMessage << action_name() << " compute action: bounds check complete" << std::endl;
+	}
+
+	//  Phidag VdagV^1/(2*inv_pow) MdagM^-1/(2*inv_pow)  MdagM^-1/(2*inv_pow) VdagV^1/(2*inv_pow) Phi
+	RealD action = norm2(Y);
+	std::cout<<GridLogMessage << action_name() << " compute action: complete" << std::endl;
+
+	return action;
+      };
+
+      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+      //
+      // Here, M is some 5D operator and V is the Pauli-Villars field
+      // N and D make up the rational polynomial of the M term, and P and Q make up the rational polynomial of the V term
+      //
+      // Need  
+      // dS_f/dU =  chi^dag d[P/Q]  N/D   P/Q  chi 
+      //         +  chi^dag   P/Q d[N/D]  P/Q  chi 
+      //         +  chi^dag   P/Q   N/D d[P/Q] chi 
+      //
+      // P/Q is expressed as partial fraction expansion: 
+      // 
+      //           a0 + \sum_k ak/(V^dagV + bk) 
+      //  
+      // d[P/Q] is then  
+      //
+      //          \sum_k -ak [V^dagV+bk]^{-1}  [ dV^dag V + V^dag dV ] [V^dag V + bk]^{-1} 
+      //  
+      // and similar for N/D. 
+      // 
+      // Need   
+      //       MpvPhi_k   = [Vdag V + bk]^{-1} chi  
+      //       MpvPhi     = {a0 +  \sum_k ak [Vdag V + bk]^{-1} }chi   
+      //   
+      //       MfMpvPhi_k = [MdagM+bk]^{-1} MpvPhi  
+      //       MfMpvPhi   = {a0 +  \sum_k ak [Mdag M + bk]^{-1} } MpvPhi
+      // 
+      //       MpvMfMpvPhi_k = [Vdag V + bk]^{-1} MfMpvPhi   
+      //  
+
+      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
+	std::cout<<GridLogMessage << action_name() << " deriv: starting" << std::endl;
+	const int n_f  = ApproxNegPowerMD.poles.size();
+	const int n_pv = ApproxHalfPowerMD.poles.size();
+	// Per-pole solution vectors: n_pv entries for the V^dag V solves, n_f entries for the M^dag M solve
+	std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionRedBlackGrid());
+	std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionRedBlackGrid());
+	std::vector<FermionField> MfMpvPhi_k   (n_f ,NumOp.FermionRedBlackGrid());
+
+	FermionField      MpvPhi(NumOp.FermionRedBlackGrid());
+	FermionField    MfMpvPhi(NumOp.FermionRedBlackGrid());
+	FermionField MpvMfMpvPhi(NumOp.FermionRedBlackGrid());
+	FermionField           Y(NumOp.FermionRedBlackGrid());
+
+	GaugeField   tmp(NumOp.GaugeGrid());
+
+	ImportGauge(U);
+	// Three chained multishift solves using the *MD approximations; each fills its per-pole vector and the summed result
+	std::cout<<GridLogMessage << action_name() << " deriv: doing (V^dag V)^{1/" << 2*param.inv_pow << "} Phi" << std::endl;
+	multiShiftInverse(Numerator, ApproxHalfPowerMD, param.MaxIter, PhiOdd,MpvPhi_k,MpvPhi);
+
+	std::cout<<GridLogMessage << action_name() << " deriv: doing (M^dag M)^{-1/" << param.inv_pow << "} ( (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
+	multiShiftInverse(Denominator, ApproxNegPowerMD, param.MaxIter, MpvPhi,MfMpvPhi_k,MfMpvPhi);
+
+	std::cout<<GridLogMessage << action_name() << " deriv: doing (V^dag V)^{1/" << 2*param.inv_pow << "} ( (M^dag M)^{-1/" << param.inv_pow << "} (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
+	multiShiftInverse(Numerator, ApproxHalfPowerMD, param.MaxIter, MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi);
+		
+	// Operators used for the derivative terms: MdagM wraps DenOp, VdagV wraps NumOp
+	SchurDifferentiableOperator<Impl> MdagM(DenOp);
+	SchurDifferentiableOperator<Impl> VdagV(NumOp);
+
+
+	RealD ak;
+	// dSdU is overwritten: it is zeroed here and the terms below are accumulated into it
+	dSdU = Zero();
+
+	// With these building blocks  
+	//  
+	//       dS/dU = 
+	//                 \sum_k -ak MfMpvPhi_k^dag      [ dM^dag M + M^dag dM ] MfMpvPhi_k         (1)
+	//             +   \sum_k -ak MpvMfMpvPhi_k^\dag  [ dV^dag V + V^dag dV ] MpvPhi_k           (2)
+	//                        -ak MpvPhi_k^dag        [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k      (3)
+	// NOTE(review): the formula above carries -ak while the loops below accumulate +ak*tmp; confirm the sign convention
+	//(1)	
+	std::cout<<GridLogMessage << action_name() << " deriv: doing dS/dU part (1)" << std::endl;
+	for(int k=0;k<n_f;k++){
+	  ak = ApproxNegPowerMD.residues[k];
+	  MdagM.Mpc(MfMpvPhi_k[k],Y);
+	  MdagM.MpcDagDeriv(tmp , MfMpvPhi_k[k], Y );  dSdU=dSdU+ak*tmp;
+	  MdagM.MpcDeriv(tmp , Y, MfMpvPhi_k[k] );  dSdU=dSdU+ak*tmp;
+	}
+	
+	//(2)
+	//(3)
+	std::cout<<GridLogMessage << action_name() << " deriv: doing dS/dU part (2)+(3)" << std::endl;
+	for(int k=0;k<n_pv;k++){
+
+          ak = ApproxHalfPowerMD.residues[k];
+	  // First pair: Y = Mpc MpvPhi_k[k], combined with MpvMfMpvPhi_k[k]
+	  VdagV.Mpc(MpvPhi_k[k],Y);
+	  VdagV.MpcDagDeriv(tmp,MpvMfMpvPhi_k[k],Y); dSdU=dSdU+ak*tmp;
+	  VdagV.MpcDeriv   (tmp,Y,MpvMfMpvPhi_k[k]);  dSdU=dSdU+ak*tmp;     
+	  // Second pair: roles of the two vectors swapped
+	  VdagV.Mpc(MpvMfMpvPhi_k[k],Y);                // V as we take Ydag 
+	  VdagV.MpcDeriv   (tmp,Y, MpvPhi_k[k]); dSdU=dSdU+ak*tmp;
+	  VdagV.MpcDagDeriv(tmp,MpvPhi_k[k], Y); dSdU=dSdU+ak*tmp;
+
+	}
+
+	//dSdU = Ta(dSdU);
+	std::cout<<GridLogMessage << action_name() << " deriv: complete" << std::endl;
+      };
+    };
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
+++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
@@ -0,0 +1,93 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
+
+    Copyright (C) 2015
+
+    Author: Christopher Kelly <ckelly@bnl.gov>
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
+#define QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
+
+NAMESPACE_BEGIN(Grid);
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Generic rational approximation for ratios of operators utilizing the mixed precision multishift algorithm
+    // cf. GeneralEvenOddRationalRatio.h for details
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      
+    //Variant of GeneralEvenOddRatioRationalPseudoFermionAction<ImplD> whose multishift solves use
+    //ConjugateGradientMultiShiftMixedPrec together with a second (ImplF) pair of numerator/denominator operators
+    template<class ImplD, class ImplF>
+    class GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<ImplD> {
+    private:
+      typedef typename ImplD::FermionField FermionFieldD;
+      typedef typename ImplF::FermionField FermionFieldF;
+
+      FermionOperator<ImplD> & NumOpD; //ImplD operators; the same objects are handed to the base class
+      FermionOperator<ImplD> & DenOpD;
+      FermionOperator<ImplF> & NumOpF; //ImplF counterparts given to the mixed-precision solver
+      FermionOperator<ImplF> & DenOpF;
+
+      Integer ReliableUpdateFreq; //forwarded unchanged to the solver constructor
+    protected:
+      //Mixed-precision replacement for the base-class multishift solve on the numerator or denominator operator
+      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, FermionFieldD &out){
+	SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
+	SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
+	ConjugateGradientMultiShiftMixedPrec<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
+	msCG(schurOpD, in, out);
+      }
+      //As above, additionally passing out_elems to the solver (the base-class deriv sizes it to one entry per pole)
+      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector<FermionFieldD> &out_elems, FermionFieldD &out){
+	SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
+	SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
+	ConjugateGradientMultiShiftMixedPrec<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
+	msCG(schurOpD, in, out_elems, out);
+      }
+      //Import the gauge field into all four operators; the ImplF pair receives a precisionChange'd copy
+      virtual void ImportGauge(const typename ImplD::GaugeField &Ud){
+	typename ImplF::GaugeField Uf(NumOpF.GaugeGrid());
+	precisionChange(Uf, Ud);
+	
+	NumOpD.ImportGauge(Ud);
+	DenOpD.ImportGauge(Ud);
+
+	NumOpF.ImportGauge(Uf);
+	DenOpF.ImportGauge(Uf);
+      }
+      
+    public:
+      //Base class first, then members in declaration order: this is the order in which they are actually initialized
+      GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction(FermionOperator<ImplD>  &_NumOpD, FermionOperator<ImplD>  &_DenOpD, 
+							      FermionOperator<ImplF>  &_NumOpF, FermionOperator<ImplF>  &_DenOpF, 
+							      const RationalActionParams & p, Integer _ReliableUpdateFreq
+							      ) : GeneralEvenOddRatioRationalPseudoFermionAction<ImplD>(_NumOpD, _DenOpD, p),
+								  NumOpD(_NumOpD), DenOpD(_DenOpD), NumOpF(_NumOpF), DenOpF(_DenOpF), ReliableUpdateFreq(_ReliableUpdateFreq){}
+      
+      virtual std::string action_name(){return "GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction";}
+    };
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
@@ -40,249 +40,31 @@ NAMESPACE_BEGIN(Grid);
    // Here N/D \sim R_{-1/2} ~ (M^dagM)^{-1/2}  
  
    template<class Impl>
-    class OneFlavourEvenOddRatioRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
+    class OneFlavourEvenOddRatioRationalPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<Impl> {
    public:
-
-      INHERIT_IMPL_TYPES(Impl);
-
      typedef OneFlavourRationalParams Params;
-      Params param;
-
-      MultiShiftFunction PowerHalf   ;
-      MultiShiftFunction PowerNegHalf;
-      MultiShiftFunction PowerQuarter;
-      MultiShiftFunction PowerNegQuarter;
-
    private:
-     
-      FermionOperator<Impl> & NumOp;// the basic operator
-      FermionOperator<Impl> & DenOp;// the basic operator
-      FermionField PhiEven; // the pseudo fermion field for this trajectory
-      FermionField PhiOdd; // the pseudo fermion field for this trajectory
+      //Translate the one-flavour Params into RationalActionParams; one tolerance and one degree serve both the action and the MD settings
+      static RationalActionParams transcribe(const Params &in){
+	RationalActionParams gen;
+	gen.lo = in.lo;  gen.hi = in.hi;
+	gen.inv_pow = 2; //presumably the 1/2 power of the (M^dagM)^{-1/2} mentioned above the class - confirm against RationalActionParams
+	gen.precision = in.precision;
+	gen.MaxIter = in.MaxIter;
+	gen.BoundsCheckFreq = in.BoundsCheckFreq;
+	gen.md_tolerance = in.tolerance;  gen.action_tolerance = gen.md_tolerance;
+	gen.md_degree = in.degree;        gen.action_degree = gen.md_degree;
+	return gen;
+      }

    public:
-
      OneFlavourEvenOddRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
 							FermionOperator<Impl>  &_DenOp, 
-					    Params & p
+							const Params & p
 							) : 
-      NumOp(_NumOp), 
-      DenOp(_DenOp), 
-      PhiOdd (_NumOp.FermionRedBlackGrid()),
-      PhiEven(_NumOp.FermionRedBlackGrid()),
-      param(p) 
-      {
-	AlgRemez remez(param.lo,param.hi,param.precision);
-
-	// MdagM^(+- 1/2)
-	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
-	remez.generateApprox(param.degree,1,2);
-	PowerHalf.Init(remez,param.tolerance,false);
-	PowerNegHalf.Init(remez,param.tolerance,true);
-
-	// MdagM^(+- 1/4)
-	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
-	remez.generateApprox(param.degree,1,4);
-   	PowerQuarter.Init(remez,param.tolerance,false);
-	PowerNegQuarter.Init(remez,param.tolerance,true);
-      };
+	GeneralEvenOddRatioRationalPseudoFermionAction<Impl>(_NumOp, _DenOp, transcribe(p)){}

      virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}      
-
-      virtual std::string LogParameters(){
-	std::stringstream sstream;
-	sstream << GridLogMessage << "["<<action_name()<<"] Low            :" << param.lo <<  std::endl;
-	sstream << GridLogMessage << "["<<action_name()<<"] High           :" << param.hi <<  std::endl;
-	sstream << GridLogMessage << "["<<action_name()<<"] Max iterations :" << param.MaxIter <<  std::endl;
-	sstream << GridLogMessage << "["<<action_name()<<"] Tolerance      :" << param.tolerance <<  std::endl;
-	sstream << GridLogMessage << "["<<action_name()<<"] Degree         :" << param.degree <<  std::endl;
-	sstream << GridLogMessage << "["<<action_name()<<"] Precision      :" << param.precision <<  std::endl;
-	return sstream.str();
-      }
-      
-      
-      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
-
-	// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
-	//
-	// P(phi) = e^{- phi^dag (VdagV)^1/4 (MdagM)^-1/2 (VdagV)^1/4 phi}
-	//        = e^{- phi^dag  (VdagV)^1/4 (MdagM)^-1/4 (MdagM)^-1/4  (VdagV)^1/4 phi}
-	//
-	// Phi =  (VdagV)^-1/4 Mdag^{1/4} eta 
-	//
-	// P(eta) = e^{- eta^dag eta}
-	//
-	// e^{x^2/2 sig^2} => sig^2 = 0.5.
-	// 
-	// So eta should be of width sig = 1/sqrt(2).
-
-	RealD scale = std::sqrt(0.5);
-
-	FermionField eta(NumOp.FermionGrid());
-	FermionField etaOdd (NumOp.FermionRedBlackGrid());
-	FermionField etaEven(NumOp.FermionRedBlackGrid());
-	FermionField     tmp(NumOp.FermionRedBlackGrid());
-
-	gaussian(pRNG,eta);	eta=eta*scale;
-
-	pickCheckerboard(Even,etaEven,eta);
-	pickCheckerboard(Odd,etaOdd,eta);
-
-	NumOp.ImportGauge(U);
-	DenOp.ImportGauge(U);
-
-
-	// MdagM^1/4 eta
-	SchurDifferentiableOperator<Impl> MdagM(DenOp);
-	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerQuarter);
-	msCG_M(MdagM,etaOdd,tmp);
-
-	// VdagV^-1/4 MdagM^1/4 eta
-	SchurDifferentiableOperator<Impl> VdagV(NumOp);
-	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerNegQuarter);
-	msCG_V(VdagV,tmp,PhiOdd);
-
-	assert(NumOp.ConstEE() == 1);
-	assert(DenOp.ConstEE() == 1);
-	PhiEven = Zero();
-	
-      };
-
-      //////////////////////////////////////////////////////
-      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
-      //////////////////////////////////////////////////////
-      virtual RealD S(const GaugeField &U) {
-
-	NumOp.ImportGauge(U);
-	DenOp.ImportGauge(U);
-
-	FermionField X(NumOp.FermionRedBlackGrid());
-	FermionField Y(NumOp.FermionRedBlackGrid());
-
-	// VdagV^1/4 Phi
-	SchurDifferentiableOperator<Impl> VdagV(NumOp);
-	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
-	msCG_V(VdagV,PhiOdd,X);
-
-	// MdagM^-1/4 VdagV^1/4 Phi
-	SchurDifferentiableOperator<Impl> MdagM(DenOp);
-	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegQuarter);
-	msCG_M(MdagM,X,Y);
-
-	// Randomly apply rational bounds checks.
-	auto grid = NumOp.FermionGrid();
-        auto r=rand();
-        grid->Broadcast(0,r);
-        if ( (r%param.BoundsCheckFreq)==0 ) { 
-	  FermionField gauss(NumOp.FermionRedBlackGrid());
-	  gauss = PhiOdd;
-	  HighBoundCheck(MdagM,gauss,param.hi);
-	  InverseSqrtBoundsCheck(param.MaxIter,param.tolerance*100,MdagM,gauss,PowerNegHalf);
-	}
-
-	//  Phidag VdagV^1/4 MdagM^-1/4  MdagM^-1/4 VdagV^1/4 Phi
-	RealD action = norm2(Y);
-
-	return action;
-      };
-
-      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
-      //
-      // Here, M is some 5D operator and V is the Pauli-Villars field
-      // N and D makeup the rat. poly of the M term and P and & makeup the rat.poly of the denom term
-      //
-      // Need  
-      // dS_f/dU =  chi^dag d[P/Q]  N/D   P/Q  chi 
-      //         +  chi^dag   P/Q d[N/D]  P/Q  chi 
-      //         +  chi^dag   P/Q   N/D d[P/Q] chi 
-      //
-      // P/Q is expressed as partial fraction expansion: 
-      // 
-      //           a0 + \sum_k ak/(V^dagV + bk) 
-      //  
-      // d[P/Q] is then  
-      //
-      //          \sum_k -ak [V^dagV+bk]^{-1}  [ dV^dag V + V^dag dV ] [V^dag V + bk]^{-1} 
-      //  
-      // and similar for N/D. 
-      // 
-      // Need   
-      //       MpvPhi_k   = [Vdag V + bk]^{-1} chi  
-      //       MpvPhi     = {a0 +  \sum_k ak [Vdag V + bk]^{-1} }chi   
-      //   
-      //       MfMpvPhi_k = [MdagM+bk]^{-1} MpvPhi  
-      //       MfMpvPhi   = {a0 +  \sum_k ak [Mdag M + bk]^{-1} } MpvPhi
-      // 
-      //       MpvMfMpvPhi_k = [Vdag V + bk]^{-1} MfMpvchi   
-      //  
-
-      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
-
-	const int n_f  = PowerNegHalf.poles.size();
-	const int n_pv = PowerQuarter.poles.size();
-
-	std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionRedBlackGrid());
-	std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionRedBlackGrid());
-	std::vector<FermionField> MfMpvPhi_k   (n_f ,NumOp.FermionRedBlackGrid());
-
-	FermionField      MpvPhi(NumOp.FermionRedBlackGrid());
-	FermionField    MfMpvPhi(NumOp.FermionRedBlackGrid());
-	FermionField MpvMfMpvPhi(NumOp.FermionRedBlackGrid());
-	FermionField           Y(NumOp.FermionRedBlackGrid());
-
-	GaugeField   tmp(NumOp.GaugeGrid());
-
-	NumOp.ImportGauge(U);
-	DenOp.ImportGauge(U);
-
-	SchurDifferentiableOperator<Impl> VdagV(NumOp);
-	SchurDifferentiableOperator<Impl> MdagM(DenOp);
-
-	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
-	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegHalf);
-
-	msCG_V(VdagV,PhiOdd,MpvPhi_k,MpvPhi);
-	msCG_M(MdagM,MpvPhi,MfMpvPhi_k,MfMpvPhi);
-	msCG_V(VdagV,MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi);
-
-	RealD ak;
-
-	dSdU = Zero();
-
-	// With these building blocks  
-	//  
-	//       dS/dU = 
-	//                 \sum_k -ak MfMpvPhi_k^dag      [ dM^dag M + M^dag dM ] MfMpvPhi_k         (1)
-	//             +   \sum_k -ak MpvMfMpvPhi_k^\dag  [ dV^dag V + V^dag dV ] MpvPhi_k           (2)
-	//                        -ak MpvPhi_k^dag        [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k      (3)
-
-	//(1)
-	for(int k=0;k<n_f;k++){
-	  ak = PowerNegHalf.residues[k];
-	  MdagM.Mpc(MfMpvPhi_k[k],Y);
-	  MdagM.MpcDagDeriv(tmp , MfMpvPhi_k[k], Y );  dSdU=dSdU+ak*tmp;
-	  MdagM.MpcDeriv(tmp , Y, MfMpvPhi_k[k] );  dSdU=dSdU+ak*tmp;
-	}
-	
-	//(2)
-	//(3)
-	for(int k=0;k<n_pv;k++){
-
-          ak = PowerQuarter.residues[k];
-	  
-	  VdagV.Mpc(MpvPhi_k[k],Y);
-	  VdagV.MpcDagDeriv(tmp,MpvMfMpvPhi_k[k],Y); dSdU=dSdU+ak*tmp;
-	  VdagV.MpcDeriv   (tmp,Y,MpvMfMpvPhi_k[k]);  dSdU=dSdU+ak*tmp;     
-	  
-	  VdagV.Mpc(MpvMfMpvPhi_k[k],Y);                // V as we take Ydag 
-	  VdagV.MpcDeriv   (tmp,Y, MpvPhi_k[k]); dSdU=dSdU+ak*tmp;
-	  VdagV.MpcDagDeriv(tmp,MpvPhi_k[k], Y); dSdU=dSdU+ak*tmp;
-
-	}
-
-	//dSdU = Ta(dSdU);
-
-      };
    };

 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/pseudofermion/PseudoFermion.h
+++ b/Grid/qcd/action/pseudofermion/PseudoFermion.h
@@ -40,6 +40,8 @@ directory
 #include <Grid/qcd/action/pseudofermion/OneFlavourRational.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
+#include <Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h>
+#include <Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h>

--- a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
+++ b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
@@ -83,16 +83,10 @@ NAMESPACE_BEGIN(Grid);
 	return sstream.str();
      } 

+      //Read-only accessor for the pseudofermion field PhiOdd
+      const FermionField &getPhiOdd() const{ return PhiOdd; }

      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
-
-        // P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
-        //
-        // NumOp == V
-        // DenOp == M
-        //
-        // Take phi_o = Vpcdag^{-1} Mpcdag eta_o  ; eta_o = Mpcdag^{-1} Vpcdag Phi
-        //
        // P(eta_o) = e^{- eta_o^dag eta_o}
        //
        // e^{x^2/2 sig^2} => sig^2 = 0.5.
@@ -100,12 +94,22 @@ NAMESPACE_BEGIN(Grid);
        RealD scale = std::sqrt(0.5);

        FermionField eta    (NumOp.FermionGrid());
+        gaussian(pRNG,eta); eta = eta * scale;
+
+	refresh(U,eta);
+      }
+	
+      void refresh(const GaugeField &U, const FermionField &eta) {
+        // P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
+        //
+        // NumOp == V
+        // DenOp == M
+        //
+        // Take phi_o = Vpcdag^{-1} Mpcdag eta_o  ; eta_o = Mpcdag^{-1} Vpcdag Phi
        FermionField etaOdd (NumOp.FermionRedBlackGrid());
        FermionField etaEven(NumOp.FermionRedBlackGrid());
        FermionField tmp    (NumOp.FermionRedBlackGrid());

-        gaussian(pRNG,eta);
-
        pickCheckerboard(Even,etaEven,eta);
        pickCheckerboard(Odd,etaOdd,eta);

@@ -125,8 +129,8 @@ NAMESPACE_BEGIN(Grid);
        DenOp.MooeeDag(etaEven,tmp);
        NumOp.MooeeInvDag(tmp,PhiEven);

-        PhiOdd =PhiOdd*scale;
-        PhiEven=PhiEven*scale;
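+        //PhiOdd and PhiEven are no longer multiplied by scale here (previously: PhiOdd =PhiOdd*scale; PhiEven=PhiEven*scale;)
+        //refresh(U,sRNG,pRNG) now applies scale to eta itself, so an eta passed directly to refresh(U,eta) must already carry that factor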
        
      };

--- a/Grid/qcd/gparity/Gparity.h
+++ b/Grid/qcd/gparity/Gparity.h
@@ -0,0 +1,6 @@
+#ifndef GRID_GPARITY_H_
+#define GRID_GPARITY_H_
+
+#include<Grid/qcd/gparity/GparityFlavour.h>
+
+#endif
--- a/Grid/qcd/gparity/GparityFlavour.cc
+++ b/Grid/qcd/gparity/GparityFlavour.cc
@@ -0,0 +1,34 @@
+#include <Grid/Grid.h>
+
+NAMESPACE_BEGIN(Grid);
+
+const std::array<const GparityFlavour, 3> GparityFlavour::sigma_mu = {{ // the three sigma matrices in X, Y, Z order
+    GparityFlavour(GparityFlavour::Algebra::SigmaX),
+    GparityFlavour(GparityFlavour::Algebra::SigmaY),
+    GparityFlavour(GparityFlavour::Algebra::SigmaZ)
+    }};
+
+const std::array<const GparityFlavour, 6> GparityFlavour::sigma_all = {{ // identity, the three sigma matrices, then the two projectors (no negated entries)
+  GparityFlavour(GparityFlavour::Algebra::Identity),
+  GparityFlavour(GparityFlavour::Algebra::SigmaX),
+  GparityFlavour(GparityFlavour::Algebra::SigmaY),
+  GparityFlavour(GparityFlavour::Algebra::SigmaZ),
+  GparityFlavour(GparityFlavour::Algebra::ProjPlus),
+  GparityFlavour(GparityFlavour::Algebra::ProjMinus)
+}};
+
+const std::array<const char *, GparityFlavour::nSigma> GparityFlavour::name = {{ // listed in the order of the Algebra enum values 0..11
+    "SigmaX",         // 0
+    "MinusSigmaX",    // 1
+    "SigmaY",         // 2
+    "MinusSigmaY",    // 3
+    "SigmaZ",         // 4
+    "MinusSigmaZ",    // 5
+    "Identity",       // 6
+    "MinusIdentity",  // 7
+    "ProjPlus",       // 8
+    "MinusProjPlus",  // 9
+    "ProjMinus",      // 10
+    "MinusProjMinus"}}; // 11
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/gparity/GparityFlavour.h
+++ b/Grid/qcd/gparity/GparityFlavour.h
@@ -0,0 +1,475 @@
+#ifndef GRID_QCD_GPARITY_FLAVOUR_H
+#define GRID_QCD_GPARITY_FLAVOUR_H
+
+//Support for flavour-matrix operations acting on the G-parity flavour index
+
+#include <array>
+
+NAMESPACE_BEGIN(Grid);
+
+class GparityFlavour { // Tag object naming one flavour matrix; it is applied to flavour vectors/matrices by the operator* overloads in this header
+  public:
+    GRID_SERIALIZABLE_ENUM(Algebra, undef,
+                           SigmaX, 0,
+			   MinusSigmaX, 1,
+                           SigmaY, 2,
+			   MinusSigmaY, 3,
+                           SigmaZ, 4,
+			   MinusSigmaZ, 5,
+			   Identity, 6,
+			   MinusIdentity, 7,
+			   ProjPlus, 8,
+			   MinusProjPlus, 9,
+			   ProjMinus, 10,
+			   MinusProjMinus, 11
+			   );
+    static constexpr unsigned int nSigma = 12; // number of named Algebra values listed above
+    static const std::array<const char *, nSigma>                name; // one string per Algebra value
+    static const std::array<const GparityFlavour, 3>             sigma_mu; // SigmaX, SigmaY, SigmaZ
+    static const std::array<const GparityFlavour, 6>            sigma_all; // Identity, SigmaX, SigmaY, SigmaZ, ProjPlus, ProjMinus
+    Algebra                                                      g; // which matrix this object stands for
+  public:
+  accelerator GparityFlavour(Algebra initg): g(initg) {}  // not explicit, so an Algebra value converts implicitly to GparityFlavour
+};
+
+
+
+// sigma_x = ( 0 1 )   applied to a flavour vector: ret = sigma_x * rhs (ret must not alias rhs)
+//           ( 1 0 )
+template<class vtype>
+accelerator_inline void multFlavourSigmaX(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = rhs(1);
+  ret(1) = rhs(0);
+};
+template<class vtype>
+accelerator_inline void lmultFlavourSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{ // ret = sigma_x * rhs : the two rows of rhs are exchanged (ret must not alias rhs)
+  ret(0,0) = rhs(1,0);
+  ret(0,1) = rhs(1,1);
+  ret(1,0) = rhs(0,0);
+  ret(1,1) = rhs(0,1);
+};
+template<class vtype>
+accelerator_inline void rmultFlavourSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{ // ret = rhs * sigma_x : the two columns of rhs are exchanged (ret must not alias rhs)
+  ret(0,0) = rhs(0,1);
+  ret(0,1) = rhs(0,0);
+  ret(1,0) = rhs(1,1);
+  ret(1,1) = rhs(1,0);
+};
+
+
+template<class vtype>
+accelerator_inline void multFlavourMinusSigmaX(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{ // ret = -sigma_x * rhs for a flavour vector (ret must not alias rhs)
+  ret(0) = -rhs(1);
+  ret(1) = -rhs(0);
+};
+template<class vtype>
+accelerator_inline void lmultFlavourMinusSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{ // ret = -sigma_x * rhs : rows exchanged and negated (ret must not alias rhs)
+  ret(0,0) = -rhs(1,0);
+  ret(0,1) = -rhs(1,1);
+  ret(1,0) = -rhs(0,0);
+  ret(1,1) = -rhs(0,1);
+};
+template<class vtype>
+accelerator_inline void rmultFlavourMinusSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{ // ret = rhs * (-sigma_x) : columns exchanged and negated (ret must not alias rhs)
+  ret(0,0) = -rhs(0,1);
+  ret(0,1) = -rhs(0,0);
+  ret(1,0) = -rhs(1,1);
+  ret(1,1) = -rhs(1,0);
+};
+
+
+
+
+
+// sigma_y = ( 0 -i )   applied to a flavour vector: ret = sigma_y * rhs (ret must not alias rhs)
+//           ( i  0 )
+template<class vtype>
+accelerator_inline void multFlavourSigmaY(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = timesMinusI(rhs(1));
+  ret(1) = timesI(rhs(0));
+};
+template<class vtype>
+accelerator_inline void lmultFlavourSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{ // ret = sigma_y * rhs, with sigma_y = ((0,-i),(i,0)) (ret must not alias rhs)
+  ret(0,0) = timesMinusI(rhs(1,0));
+  ret(0,1) = timesMinusI(rhs(1,1));
+  ret(1,0) = timesI(rhs(0,0));
+  ret(1,1) = timesI(rhs(0,1));
+};
+template<class vtype>
+accelerator_inline void rmultFlavourSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{ // ret = rhs * sigma_y, with sigma_y = ((0,-i),(i,0)) (ret must not alias rhs)
+  ret(0,0) = timesI(rhs(0,1));
+  ret(0,1) = timesMinusI(rhs(0,0));
+  ret(1,0) = timesI(rhs(1,1));
+  ret(1,1) = timesMinusI(rhs(1,0));
+};
+
+template<class vtype>
+accelerator_inline void multFlavourMinusSigmaY(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{ // ret = -sigma_y * rhs, with -sigma_y = ((0,i),(-i,0)) (ret must not alias rhs)
+  ret(0) = timesI(rhs(1));
+  ret(1) = timesMinusI(rhs(0));
+};
+template<class vtype>
+accelerator_inline void lmultFlavourMinusSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{ // ret = -sigma_y * rhs, with -sigma_y = ((0,i),(-i,0)) (ret must not alias rhs)
+  ret(0,0) = timesI(rhs(1,0));
+  ret(0,1) = timesI(rhs(1,1));
+  ret(1,0) = timesMinusI(rhs(0,0));
+  ret(1,1) = timesMinusI(rhs(0,1));
+};
+template<class vtype>
+accelerator_inline void rmultFlavourMinusSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{ // ret = rhs * (-sigma_y), with -sigma_y = ((0,i),(-i,0)) (ret must not alias rhs)
+  ret(0,0) = timesMinusI(rhs(0,1));
+  ret(0,1) = timesI(rhs(0,0));
+  ret(1,0) = timesMinusI(rhs(1,1));
+  ret(1,1) = timesI(rhs(1,0));
+};
+
+
+
+
+
+// sigma_z = ( 1  0 )   applied to a flavour vector: ret = sigma_z * rhs
+//           ( 0 -1 )
+template<class vtype>
+accelerator_inline void multFlavourSigmaZ(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = rhs(0);
+  ret(1) = -rhs(1);
+};
+template<class vtype>
+accelerator_inline void lmultFlavourSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{ // ret = sigma_z * rhs : row 1 of rhs is negated
+  ret(0,0) = rhs(0,0);
+  ret(0,1) = rhs(0,1);
+  ret(1,0) = -rhs(1,0);
+  ret(1,1) = -rhs(1,1);
+};
+template<class vtype>
+accelerator_inline void rmultFlavourSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{ // ret = rhs * sigma_z : column 1 of rhs is negated
+  ret(0,0) = rhs(0,0);
+  ret(0,1) = -rhs(0,1);
+  ret(1,0) = rhs(1,0);
+  ret(1,1) = -rhs(1,1);
+};
+
+
+template<class vtype>
+accelerator_inline void multFlavourMinusSigmaZ(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{ // ret = -sigma_z * rhs : component 0 is negated
+  ret(0) = -rhs(0);
+  ret(1) = rhs(1);
+};
+template<class vtype>
+accelerator_inline void lmultFlavourMinusSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{ // ret = -sigma_z * rhs : row 0 of rhs is negated
+  ret(0,0) = -rhs(0,0);
+  ret(0,1) = -rhs(0,1);
+  ret(1,0) = rhs(1,0);
+  ret(1,1) = rhs(1,1);
+};
+template<class vtype>
+accelerator_inline void rmultFlavourMinusSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{ // ret = rhs * (-sigma_z) : column 0 of rhs is negated
+  ret(0,0) = -rhs(0,0);
+  ret(0,1) = rhs(0,1);
+  ret(1,0) = -rhs(1,0);
+  ret(1,1) = rhs(1,1);
+};
+
+
+
+
+
+
+template<class vtype>
+accelerator_inline void multFlavourIdentity(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{ // identity: components 0 and 1 of rhs are copied into ret
+  ret(0) = rhs(0);
+  ret(1) = rhs(1);
+};
+template<class vtype>
+accelerator_inline void lmultFlavourIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{ // identity applied from the left: the four elements of rhs are copied into ret
+  ret(0,0) = rhs(0,0);
+  ret(0,1) = rhs(0,1);
+  ret(1,0) = rhs(1,0);
+  ret(1,1) = rhs(1,1);
+};
+template<class vtype>
+accelerator_inline void rmultFlavourIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{ // identity applied from the right: the four elements of rhs are copied into ret
+  ret(0,0) = rhs(0,0);
+  ret(0,1) = rhs(0,1);
+  ret(1,0) = rhs(1,0);
+  ret(1,1) = rhs(1,1);
+};
+
+template<class vtype>
+accelerator_inline void multFlavourMinusIdentity(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{ // minus identity: ret = -rhs, component by component
+  ret(0) = -rhs(0);
+  ret(1) = -rhs(1);
+};
+template<class vtype>
+accelerator_inline void lmultFlavourMinusIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{ // minus identity applied from the left: ret = -rhs, element by element
+  ret(0,0) = -rhs(0,0);
+  ret(0,1) = -rhs(0,1);
+  ret(1,0) = -rhs(1,0);
+  ret(1,1) = -rhs(1,1);
+};
+template<class vtype>
+accelerator_inline void rmultFlavourMinusIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{ // minus identity applied from the right: ret = -rhs, element by element
+  ret(0,0) = -rhs(0,0);
+  ret(0,1) = -rhs(0,1);
+  ret(1,0) = -rhs(1,0);
+  ret(1,1) = -rhs(1,1);
+};
+
+
+
+
+
+//G-parity flavour projection P+ = 1/2(1+\sigma_2), applied to a flavour vector: ret = P+ * rhs (ret must not alias rhs)
+// P+ = 1/2 ( 1 -i )
+//          ( i  1 )
+template<class vtype>
+accelerator_inline void multFlavourProjPlus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = 0.5*rhs(0) + 0.5*timesMinusI(rhs(1));
+  ret(1) = 0.5*timesI(rhs(0)) + 0.5*rhs(1);
+};
+template<class vtype>
+accelerator_inline void lmultFlavourProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{ // ret = P+ * rhs, with P+ = 1/2 ((1,-i),(i,1)) (ret must not alias rhs)
+  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesMinusI(rhs(1,0));
+  ret(0,1) = 0.5*rhs(0,1) + 0.5*timesMinusI(rhs(1,1));
+  ret(1,0) = 0.5*timesI(rhs(0,0)) + 0.5*rhs(1,0);
+  ret(1,1) = 0.5*timesI(rhs(0,1)) + 0.5*rhs(1,1);
+};
+template<class vtype>
+accelerator_inline void rmultFlavourProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{ // ret = rhs * P+, with P+ = 1/2 ((1,-i),(i,1)) (ret must not alias rhs)
+  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesI(rhs(0,1));
+  ret(0,1) = 0.5*timesMinusI(rhs(0,0)) + 0.5*rhs(0,1);
+  ret(1,0) = 0.5*rhs(1,0) + 0.5*timesI(rhs(1,1));
+  ret(1,1) = 0.5*timesMinusI(rhs(1,0)) + 0.5*rhs(1,1);
+};
+
+
+template<class vtype>
+accelerator_inline void multFlavourMinusProjPlus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{ // ret = -P+ * rhs, with -P+ = 1/2 ((-1,i),(-i,-1)) (ret must not alias rhs)
+  ret(0) = -0.5*rhs(0) + 0.5*timesI(rhs(1));
+  ret(1) = 0.5*timesMinusI(rhs(0)) - 0.5*rhs(1);
+};
+template<class vtype>
+accelerator_inline void lmultFlavourMinusProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{ // ret = -P+ * rhs, with -P+ = 1/2 ((-1,i),(-i,-1)) (ret must not alias rhs)
+  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesI(rhs(1,0));
+  ret(0,1) = -0.5*rhs(0,1) + 0.5*timesI(rhs(1,1));
+  ret(1,0) = 0.5*timesMinusI(rhs(0,0)) - 0.5*rhs(1,0);
+  ret(1,1) = 0.5*timesMinusI(rhs(0,1)) - 0.5*rhs(1,1);
+};
+template<class vtype>
+accelerator_inline void rmultFlavourMinusProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{ // ret = rhs * (-P+), with -P+ = 1/2 ((-1,i),(-i,-1)) (ret must not alias rhs)
+  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesMinusI(rhs(0,1));
+  ret(0,1) = 0.5*timesI(rhs(0,0)) - 0.5*rhs(0,1);
+  ret(1,0) = -0.5*rhs(1,0) + 0.5*timesMinusI(rhs(1,1));
+  ret(1,1) = 0.5*timesI(rhs(1,0)) - 0.5*rhs(1,1);
+};
+
+
+
+
+
+//G-parity flavour projection P- = 1/2(1-\sigma_2), applied to a flavour vector: ret = P- * rhs (ret must not alias rhs)
+// P- = 1/2 (  1 i )
+//          ( -i 1 )
+template<class vtype>
+accelerator_inline void multFlavourProjMinus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = 0.5*rhs(0) + 0.5*timesI(rhs(1));
+  ret(1) = 0.5*timesMinusI(rhs(0)) + 0.5*rhs(1);
+};
+template<class vtype>
+accelerator_inline void lmultFlavourProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{ // ret = P- * rhs, with P- = 1/2 ((1,i),(-i,1)) (ret must not alias rhs)
+  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesI(rhs(1,0));
+  ret(0,1) = 0.5*rhs(0,1) + 0.5*timesI(rhs(1,1));
+  ret(1,0) = 0.5*timesMinusI(rhs(0,0)) + 0.5*rhs(1,0);
+  ret(1,1) = 0.5*timesMinusI(rhs(0,1)) + 0.5*rhs(1,1);
+};
+template<class vtype>
+accelerator_inline void rmultFlavourProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{ // ret = rhs * P-, with P- = 1/2 ((1,i),(-i,1)) (ret must not alias rhs)
+  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesMinusI(rhs(0,1));
+  ret(0,1) = 0.5*timesI(rhs(0,0)) + 0.5*rhs(0,1);
+  ret(1,0) = 0.5*rhs(1,0) + 0.5*timesMinusI(rhs(1,1));
+  ret(1,1) = 0.5*timesI(rhs(1,0)) + 0.5*rhs(1,1);
+};
+
+
+template<class vtype>
+accelerator_inline void multFlavourMinusProjMinus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{ // ret = -P- * rhs, with -P- = 1/2 ((-1,-i),(i,-1)) (ret must not alias rhs)
+  ret(0) = -0.5*rhs(0) + 0.5*timesMinusI(rhs(1));
+  ret(1) = 0.5*timesI(rhs(0)) - 0.5*rhs(1);
+};
+template<class vtype>
+accelerator_inline void lmultFlavourMinusProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{ // ret = -P- * rhs, with -P- = 1/2 ((-1,-i),(i,-1)) (ret must not alias rhs)
+  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesMinusI(rhs(1,0));
+  ret(0,1) = -0.5*rhs(0,1) + 0.5*timesMinusI(rhs(1,1));
+  ret(1,0) = 0.5*timesI(rhs(0,0)) - 0.5*rhs(1,0);
+  ret(1,1) = 0.5*timesI(rhs(0,1)) - 0.5*rhs(1,1);
+};
+template<class vtype>
+accelerator_inline void rmultFlavourMinusProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{ // ret = rhs * (-P-), with -P- = 1/2 ((-1,-i),(i,-1)) (ret must not alias rhs)
+  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesI(rhs(0,1));
+  ret(0,1) = 0.5*timesMinusI(rhs(0,0)) - 0.5*rhs(0,1);
+  ret(1,0) = -0.5*rhs(1,0) + 0.5*timesI(rhs(1,1));
+  ret(1,1) = 0.5*timesMinusI(rhs(1,0)) - 0.5*rhs(1,1);
+};
+
+
+
+
+
+
+
+
+
+
+template<class vtype> 
+accelerator_inline auto operator*(const GparityFlavour &G, const iVector<vtype, Ngp> &arg)
+->typename std::enable_if<matchGridTensorIndex<iVector<vtype, Ngp>, GparityFlavourTensorIndex>::value, iVector<vtype, Ngp>>::type
+{ // G * arg for a flavour vector: dispatches on G.g to the matching multFlavour* helper and returns the result by value
+  iVector<vtype, Ngp> ret;
+
+  switch (G.g) 
+  {
+  case GparityFlavour::Algebra::SigmaX:
+    multFlavourSigmaX(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaX:
+    multFlavourMinusSigmaX(ret, arg); break;
+  case GparityFlavour::Algebra::SigmaY:
+    multFlavourSigmaY(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaY:
+    multFlavourMinusSigmaY(ret, arg); break;
+  case GparityFlavour::Algebra::SigmaZ:
+    multFlavourSigmaZ(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaZ:
+    multFlavourMinusSigmaZ(ret, arg); break;
+  case GparityFlavour::Algebra::Identity:
+    multFlavourIdentity(ret, arg); break;
+  case GparityFlavour::Algebra::MinusIdentity:
+    multFlavourMinusIdentity(ret, arg); break;
+  case GparityFlavour::Algebra::ProjPlus:
+    multFlavourProjPlus(ret, arg); break;
+  case GparityFlavour::Algebra::MinusProjPlus:
+    multFlavourMinusProjPlus(ret, arg); break;
+  case GparityFlavour::Algebra::ProjMinus:
+    multFlavourProjMinus(ret, arg); break;
+  case GparityFlavour::Algebra::MinusProjMinus:
+    multFlavourMinusProjMinus(ret, arg); break;
+  default: assert(0); // G.g is none of the twelve cases above; ret is left unassigned on this path
+  }
+ 
+  return ret;
+}
+
+template<class vtype> 
+accelerator_inline auto operator*(const GparityFlavour &G, const iMatrix<vtype, Ngp> &arg)
+->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Ngp>, GparityFlavourTensorIndex>::value, iMatrix<vtype, Ngp>>::type
+{ // G * arg for a flavour matrix (G on the left): dispatches on G.g to the matching lmultFlavour* helper and returns the result by value
+  iMatrix<vtype, Ngp> ret;
+
+  switch (G.g) 
+  {
+  case GparityFlavour::Algebra::SigmaX:
+    lmultFlavourSigmaX(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaX:
+    lmultFlavourMinusSigmaX(ret, arg); break;
+  case GparityFlavour::Algebra::SigmaY:
+    lmultFlavourSigmaY(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaY:
+    lmultFlavourMinusSigmaY(ret, arg); break;
+  case GparityFlavour::Algebra::SigmaZ:
+    lmultFlavourSigmaZ(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaZ:
+    lmultFlavourMinusSigmaZ(ret, arg); break;
+  case GparityFlavour::Algebra::Identity:
+    lmultFlavourIdentity(ret, arg); break;
+  case GparityFlavour::Algebra::MinusIdentity:
+    lmultFlavourMinusIdentity(ret, arg); break;
+  case GparityFlavour::Algebra::ProjPlus:
+    lmultFlavourProjPlus(ret, arg); break;
+  case GparityFlavour::Algebra::MinusProjPlus:
+    lmultFlavourMinusProjPlus(ret, arg); break;
+  case GparityFlavour::Algebra::ProjMinus:
+    lmultFlavourProjMinus(ret, arg); break;
+  case GparityFlavour::Algebra::MinusProjMinus:
+    lmultFlavourMinusProjMinus(ret, arg); break;  
+  default: assert(0); // G.g is none of the twelve cases above; ret is left unassigned on this path
+  }
+  
+  return ret;
+}
+
+template<class vtype> 
+accelerator_inline auto operator*(const iMatrix<vtype, Ngp> &arg, const GparityFlavour &G)
+->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Ngp>, GparityFlavourTensorIndex>::value, iMatrix<vtype, Ngp>>::type
+{ // arg * G for a flavour matrix (G on the right): dispatches on G.g to the matching rmultFlavour* helper and returns the result by value
+  iMatrix<vtype, Ngp> ret;
+
+  switch (G.g) 
+  {
+  case GparityFlavour::Algebra::SigmaX:
+    rmultFlavourSigmaX(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaX:
+    rmultFlavourMinusSigmaX(ret, arg); break;
+  case GparityFlavour::Algebra::SigmaY:
+    rmultFlavourSigmaY(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaY:
+    rmultFlavourMinusSigmaY(ret, arg); break;
+  case GparityFlavour::Algebra::SigmaZ:
+    rmultFlavourSigmaZ(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaZ:
+    rmultFlavourMinusSigmaZ(ret, arg); break;
+  case GparityFlavour::Algebra::Identity:
+    rmultFlavourIdentity(ret, arg); break;
+  case GparityFlavour::Algebra::MinusIdentity:
+    rmultFlavourMinusIdentity(ret, arg); break;
+  case GparityFlavour::Algebra::ProjPlus:
+    rmultFlavourProjPlus(ret, arg); break;
+  case GparityFlavour::Algebra::MinusProjPlus:
+    rmultFlavourMinusProjPlus(ret, arg); break;
+  case GparityFlavour::Algebra::ProjMinus:
+    rmultFlavourProjMinus(ret, arg); break;
+  case GparityFlavour::Algebra::MinusProjMinus:
+    rmultFlavourMinusProjMinus(ret, arg); break;
+  default: assert(0); // G.g is none of the twelve cases above; ret is left unassigned on this path
+  }
+
+  return ret;
+}
+
+NAMESPACE_END(Grid);
+
+#endif // include guard
--- a/Grid/qcd/hmc/GenericHMCrunner.h
+++ b/Grid/qcd/hmc/GenericHMCrunner.h
@@ -129,18 +129,10 @@ public:
    Runner(S);
  }

-  //////////////////////////////////////////////////////////////////
-
-private:
-  template <class SmearingPolicy>
-  void Runner(SmearingPolicy &Smearing) {
-    auto UGrid = Resources.GetCartesian();
-    Resources.AddRNGs();
-    Field U(UGrid);
-
-    // Can move this outside?
-    typedef IntegratorType<SmearingPolicy> TheIntegrator;
-    TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing);
+  //Use the checkpointer to initialize the RNGs and the gauge field, writing the resulting gauge field into U.
+  //This is called automatically by Run but may be useful elsewhere, e.g. for integrator tuning experiments
+  void initializeGaugeFieldAndRNGs(Field &U){
+    if(!Resources.haveRNGs()) Resources.AddRNGs();

    if (Parameters.StartingType == "HotStart") {
      // Hot start
@@ -159,14 +151,40 @@ private:
      Resources.GetCheckPointer()->CheckpointRestore(Parameters.StartTrajectory, U,
 						     Resources.GetSerialRNG(),
 						     Resources.GetParallelRNG());
+    } else if (Parameters.StartingType == "CheckpointStartReseed") {
+      // Same as CheckpointStart but reseed the RNGs using the fixed integer seeding used for ColdStart and HotStart
+      // Useful for creating new evolution streams from an existing stream
+      
+      // WARNING: Unfortunately because the checkpointer doesn't presently allow us to separately restore the RNG and gauge fields we have to load
+      // an existing RNG checkpoint first; make sure one is available and named correctly
+      Resources.GetCheckPointer()->CheckpointRestore(Parameters.StartTrajectory, U,
+						     Resources.GetSerialRNG(),
+						     Resources.GetParallelRNG());
+      Resources.SeedFixedIntegers();      
    } else {
      // others
      std::cout << GridLogError << "Unrecognized StartingType\n";
      std::cout
 	<< GridLogError
-	<< "Valid [HotStart, ColdStart, TepidStart, CheckpointStart]\n";
+	<< "Valid [HotStart, ColdStart, TepidStart, CheckpointStart, CheckpointStartReseed]\n";
      exit(1);
    }
+  }
+
+
+
+  //////////////////////////////////////////////////////////////////
+
+private:
+  template <class SmearingPolicy>
+  void Runner(SmearingPolicy &Smearing) {
+    auto UGrid = Resources.GetCartesian();
+    Field U(UGrid);
+
+    initializeGaugeFieldAndRNGs(U);
+
+    typedef IntegratorType<SmearingPolicy> TheIntegrator;
+    TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing);

    Smearing.set_Field(U);

--- a/Grid/qcd/hmc/HMC.h
+++ b/Grid/qcd/hmc/HMC.h
@@ -115,21 +115,21 @@ private:

    random(sRNG, rn_test);

-    std::cout << GridLogMessage
+    std::cout << GridLogHMC
              << "--------------------------------------------------\n";
-    std::cout << GridLogMessage << "exp(-dH) = " << prob
+    std::cout << GridLogHMC << "exp(-dH) = " << prob
              << "  Random = " << rn_test << "\n";
-    std::cout << GridLogMessage
+    std::cout << GridLogHMC
              << "Acc. Probability = " << ((prob < 1.0) ? prob : 1.0) << "\n";

    if ((prob > 1.0) || (rn_test <= prob)) {  // accepted
-      std::cout << GridLogMessage << "Metropolis_test -- ACCEPTED\n";
-      std::cout << GridLogMessage
+      std::cout << GridLogHMC << "Metropolis_test -- ACCEPTED\n";
+      std::cout << GridLogHMC
                << "--------------------------------------------------\n";
      return true;
    } else {  // rejected
-      std::cout << GridLogMessage << "Metropolis_test -- REJECTED\n";
-      std::cout << GridLogMessage
+      std::cout << GridLogHMC << "Metropolis_test -- REJECTED\n";
+      std::cout << GridLogHMC
                << "--------------------------------------------------\n";
      return false;
    }
@@ -145,7 +145,7 @@ private:

    std::streamsize current_precision = std::cout.precision();
    std::cout.precision(15);
-    std::cout << GridLogMessage << "Total H before trajectory = " << H0 << "\n";
+    std::cout << GridLogHMC << "Total H before trajectory = " << H0 << "\n";
    std::cout.precision(current_precision);

    TheIntegrator.integrate(U);
@@ -165,7 +165,7 @@ private:


    std::cout.precision(15);
-    std::cout << GridLogMessage << "Total H after trajectory  = " << H1
+    std::cout << GridLogHMC << "Total H after trajectory  = " << H1
 	      << "  dH = " << H1 - H0 << "\n";
    std::cout.precision(current_precision);
    
@@ -196,9 +196,9 @@ public:
    // Actual updates (evolve a copy Ucopy then copy back eventually)
    unsigned int FinalTrajectory = Params.Trajectories + Params.NoMetropolisUntil + Params.StartTrajectory;
    for (int traj = Params.StartTrajectory; traj < FinalTrajectory; ++traj) {
-      std::cout << GridLogMessage << "-- # Trajectory = " << traj << "\n";
+      std::cout << GridLogHMC << "-- # Trajectory = " << traj << "\n";
      if (traj < Params.StartTrajectory + Params.NoMetropolisUntil) {
-      	std::cout << GridLogMessage << "-- Thermalization" << std::endl;
+      	std::cout << GridLogHMC << "-- Thermalization" << std::endl;
      }
      
      double t0=usecond();
@@ -207,10 +207,10 @@ public:
      DeltaH = evolve_hmc_step(Ucopy);
      // Metropolis-Hastings test
      bool accept = true;
-      if (traj >= Params.StartTrajectory + Params.NoMetropolisUntil) {
+      if (Params.MetropolisTest && traj >= Params.StartTrajectory + Params.NoMetropolisUntil) {
        accept = metropolis_test(DeltaH);
      } else {
-      	std::cout << GridLogMessage << "Skipping Metropolis test" << std::endl;
+      	std::cout << GridLogHMC << "Skipping Metropolis test" << std::endl;
      }

      if (accept)
@@ -219,7 +219,7 @@ public:
     
      
      double t1=usecond();
-      std::cout << GridLogMessage << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl;
+      std::cout << GridLogHMC << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl;


      for (int obs = 0; obs < Observables.size(); obs++) {
@@ -228,7 +228,7 @@ public:
      	std::cout << GridLogDebug << "Observables pointer " << Observables[obs] << std::endl;
        Observables[obs]->TrajectoryComplete(traj + 1, Ucur, sRNG, pRNG);
      }
-      std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl;
+      std::cout << GridLogHMC << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl;
    }
  }

--- a/Grid/qcd/hmc/HMCModules.h
+++ b/Grid/qcd/hmc/HMCModules.h
@@ -80,7 +80,9 @@ public:
      std::cout << GridLogError << "Seeds not initialized" << std::endl;
      exit(1);
    }
+    std::cout << GridLogMessage << "Reseeding serial RNG with seed vector " << SerialSeeds << std::endl;
    sRNG_.SeedFixedIntegers(SerialSeeds);
+    std::cout << GridLogMessage << "Reseeding parallel RNG with seed vector " << ParallelSeeds << std::endl;
    pRNG_->SeedFixedIntegers(ParallelSeeds);
  }
 };
--- a/Grid/qcd/hmc/HMCResourceManager.h
+++ b/Grid/qcd/hmc/HMCResourceManager.h
@@ -227,6 +227,9 @@ public:
  // Random number generators
  //////////////////////////////////////////////////////
  
+  //Return true if the RNG objects have been instantiated
+  bool haveRNGs() const{ return have_RNG; }
+
  void AddRNGs(std::string s = "") {
    // Couple the RNGs to the GridModule tagged by s
    // the default is the first grid registered
--- a/Grid/qcd/hmc/UsingHMC.md
+++ b/Grid/qcd/hmc/UsingHMC.md
@@ -1,61 +1,63 @@
-Using HMC in Grid version 0.5.1
+# Using HMC in Grid

-These are the instructions to use the Generalised HMC on Grid version 0.5.1.
-Disclaimer: GRID is still under active development so any information here can be changed in future releases.
+These are the instructions to use the Generalised HMC on Grid as of commit `749b802`.
+Disclaimer: Grid is still under active development so any information here can be changed in future releases.


-Command line options
-===================
-(relevant file GenericHMCrunner.h)
+## Command line options
+
+(relevant file `GenericHMCrunner.h`)
 The initial configuration can be changed at the command line using 
--StartType <your choice>
-valid choices, one among these
-HotStart, ColdStart, TepidStart, CheckpointStart
-default: HotStart
+`--StartingType STARTING_TYPE`, where `STARTING_TYPE` is one of
+`HotStart`, `ColdStart`, `TepidStart`, and `CheckpointStart`.
+Default: `--StartingType HotStart`

-example
-./My_hmc_exec  --StartType HotStart
+Example:
+```
+./My_hmc_exec  --StartingType HotStart
+```

-The CheckpointStart option uses the prefix for the configurations and rng seed files defined in your executable and the initial configuration is specified by
--StartTrajectory <integer>
-default: 0
+The `CheckpointStart` option uses the prefix for the configurations and rng seed files defined in your executable and the initial configuration is specified by
+`--StartingTrajectory STARTING_TRAJECTORY`, where `STARTING_TRAJECTORY` is an integer.
+Default: `--StartingTrajectory 0`

 The number of trajectories for a specific run are specified at command line by
--Trajectories <integer>
-default: 1
+`--Trajectories TRAJECTORIES`, where `TRAJECTORIES` is an integer.
+Default: `--Trajectories 1`

 The number of thermalization steps (i.e. steps when the Metropolis acceptance check is turned off) is specified by
--Thermalizations <integer>
-default: 10
-
+`--Thermalizations THERMALIZATIONS`, where `THERMALIZATIONS` is an integer.
+Default: `--Thermalizations 10`

 Any other parameter is defined in the source for the executable.

-HMC controls
-===========
+## HMC controls

 The lines 

+```
  std::vector<int> SerSeed({1, 2, 3, 4, 5});
  std::vector<int> ParSeed({6, 7, 8, 9, 10});
+```

 define the seeds for the serial and the parallel RNG.

 The line 

+```
  TheHMC.MDparameters.set(20, 1.0);// MDsteps, traj length
+```

 declares the number of molecular dynamics steps and the total trajectory length.


-Actions
-======
+## Actions

-Action names are defined in the file
-lib/qcd/Actions.h
+Action names are defined in the directory `Grid/qcd/action`.

-Gauge actions list:
+Gauge actions list (from `Grid/qcd/action/gauge/Gauge.h`):

+```
 WilsonGaugeActionR;
 WilsonGaugeActionF;
 WilsonGaugeActionD;
@@ -68,8 +70,9 @@ IwasakiGaugeActionD;
 SymanzikGaugeActionR;
 SymanzikGaugeActionF;
 SymanzikGaugeActionD;
+```

-
+```
 ConjugateWilsonGaugeActionR;
 ConjugateWilsonGaugeActionF;
 ConjugateWilsonGaugeActionD;
@@ -82,26 +85,23 @@ ConjugateIwasakiGaugeActionD;
 ConjugateSymanzikGaugeActionR;
 ConjugateSymanzikGaugeActionF;
 ConjugateSymanzikGaugeActionD;
+```

+Each of these actions accepts a single parameter at creation time (beta).
+Example for creating a Symanzik action with beta=4.0

+```
+  SymanzikGaugeActionR(4.0)
+```
+
+Scalar actions list (from `Grid/qcd/action/scalar/Scalar.h`):
+
+```
 ScalarActionR;
 ScalarActionF;
 ScalarActionD;
+```

-
-each of these action accept one single parameter at creation time (beta).
-Example for creating a Symanzik action with beta=4.0
-
-	SymanzikGaugeActionR(4.0)
-
-The suffixes R,F,D in the action names refer to the Real
-(the precision is defined at compile time by the --enable-precision flag in the configure),
-Float and Double, that force the precision of the action to be 32, 64 bit respectively.
-
-
-
-
-
-
-
-
+The suffixes `R`, `F`, `D` in the action names refer to `Real`
+(whose precision is defined at compile time by the `--enable-precision` flag in the configure),
+`Float` and `Double`; the latter two force the precision of the action to be 32 and 64 bit respectively.
--- a/Grid/qcd/hmc/integrators/Integrator.h
+++ b/Grid/qcd/hmc/integrators/Integrator.h
@@ -136,8 +136,14 @@ protected:
      if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
      force = FieldImplementation::projectForce(force); // Ta for gauge fields
      double end_force = usecond();
-      Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites());
-      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << std::endl;
+
+      Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x]) 
+      Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    
+
+      Real max_force_abs = std::sqrt(maxLocalNorm2(force));
+      Real max_impulse_abs = max_force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    
+
+      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << " Max force: " << max_force_abs << " Time step: " << ep << " Impulse average: " << impulse_abs << " Max impulse: " << max_impulse_abs << std::endl;
      Mom -= force * ep* HMC_MOMENTUM_DENOMINATOR;; 
      double end_full = usecond();
      double time_full  = (end_full - start_full) / 1e3;
@@ -249,15 +255,19 @@ public:
  void refresh(Field& U,  GridSerialRNG & sRNG, GridParallelRNG& pRNG) 
  {
    assert(P.Grid() == U.Grid());
-    std::cout << GridLogIntegrator << "Integrator refresh\n";
+    std::cout << GridLogIntegrator << "Integrator refresh" << std::endl;

+    std::cout << GridLogIntegrator << "Generating momentum" << std::endl;
    FieldImplementation::generate_momenta(P, sRNG, pRNG);

    // Update the smeared fields, can be implemented as observer
    // necessary to keep the fields updated even after a reject
    // of the Metropolis
+    std::cout << GridLogIntegrator << "Updating smeared fields" << std::endl;
    Smearer.set_Field(U);
    // Set the (eventual) representations gauge fields
+
+    std::cout << GridLogIntegrator << "Updating representations" << std::endl;
    Representations.update(U);

    // The Smearer is attached to a pointer of the gauge field
@@ -267,6 +277,7 @@ public:
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
        // get gauge field from the SmearingPolicy and
        // based on the boolean is_smeared in actionID
+	std::cout << GridLogIntegrator << "Refreshing integrator level " << level << " index " << actionID << std::endl;
        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
        as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG);
      }
--- a/Grid/qcd/observables/topological_charge.h
+++ b/Grid/qcd/observables/topological_charge.h
@@ -99,7 +99,7 @@ public:
 	// using wilson flow by default here
 	WilsonFlow<PeriodicGimplR> WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval);
 	WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau);
-	Real T0   = WF.energyDensityPlaquette(Usmear);
+	Real T0   = WF.energyDensityPlaquette(Pars.Smearing.maxTau, Usmear);
 	std::cout << GridLogMessage << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
 		  << "T0                : [ " << traj << " ] "<< T0 << std::endl;
      }
--- a/Grid/qcd/smearing/WilsonFlow.h
+++ b/Grid/qcd/smearing/WilsonFlow.h
@@ -7,6 +7,7 @@ Source file: ./lib/qcd/modules/plaquette.h
 Copyright (C) 2017

 Author: Guido Cossu <guido.cossu@ed.ac.uk>
+Author: Christopher Kelly <ckelly@bnl.gov>

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -33,28 +34,44 @@ NAMESPACE_BEGIN(Grid);

 template <class Gimpl>
 class WilsonFlow: public Smear<Gimpl>{
-  unsigned int Nstep;
-  unsigned int measure_interval;
-  mutable RealD epsilon, taus;
+public:
+  //Store generic measurements to take during smearing process using std::function
+  typedef std::function<void(int, RealD, const typename Gimpl::GaugeField &)> FunctionType;  //int: step,  RealD: flow time,  GaugeField : the gauge field
  
+private:
+  unsigned int Nstep;
+  RealD epsilon; //for regular smearing this is the time step, for adaptive it is the initial time step
+ 
+  std::vector< std::pair<int, FunctionType> > functions; //The int maps to the measurement frequency

  mutable WilsonGaugeAction<Gimpl> SG;

-  void evolve_step(typename Gimpl::GaugeField&) const;
-  void evolve_step_adaptive(typename Gimpl::GaugeField&, RealD);
-  RealD tau(unsigned int t)const {return epsilon*(t+1.0); }
+  //Evolve the gauge field by 1 step and update tau
+  void evolve_step(typename Gimpl::GaugeField &U, RealD &tau) const;
+  //Evolve the gauge field by 1 step and update tau and the current time step eps
+  void evolve_step_adaptive(typename Gimpl::GaugeField&U, RealD &tau, RealD &eps, RealD maxTau) const;

 public:
  INHERIT_GIMPL_TYPES(Gimpl)

+  void resetActions(){ functions.clear(); }
+
+  void addMeasurement(int meas_interval, FunctionType meas){ functions.push_back({meas_interval, meas}); }
+
+  //Set the class to perform the default measurements: 
+  //the plaquette energy density every step
+  //the plaquette topological charge every 'topq_meas_interval' steps
+  //and output to stdout
+  void setDefaultMeasurements(int topq_meas_interval = 1);
+
  explicit WilsonFlow(unsigned int Nstep, RealD epsilon, unsigned int interval = 1):
  Nstep(Nstep),
    epsilon(epsilon),
-    measure_interval(interval),
    SG(WilsonGaugeAction<Gimpl>(3.0)) {
    // WilsonGaugeAction with beta 3.0
    assert(epsilon > 0.0);
    LogMessage();
+    setDefaultMeasurements(interval);
  }

  void LogMessage() {
@@ -73,9 +90,29 @@ public:
    // undefined for WilsonFlow
  }

-  void smear_adaptive(GaugeField&, const GaugeField&, RealD maxTau);
-  RealD energyDensityPlaquette(unsigned int step, const GaugeField& U) const;
-  RealD energyDensityPlaquette(const GaugeField& U) const;
+  void smear_adaptive(GaugeField&, const GaugeField&, RealD maxTau) const;
+
+  //Compute t^2 <E(t)> for time t from the plaquette
+  static RealD energyDensityPlaquette(const RealD t, const GaugeField& U);
+
+  //Compute t^2 <E(t)> for time t from the 1x1 cloverleaf form
+  //t is the Wilson flow time
+  static RealD energyDensityCloverleaf(const RealD t, const GaugeField& U);
+  
+  //Evolve the gauge field by Nstep steps of epsilon and return the energy density computed every interval steps
+  //The smeared field is output as V
+  std::vector<RealD> flowMeasureEnergyDensityPlaquette(GaugeField &V, const GaugeField& U, int measure_interval = 1);
+
+  //Version that does not return the smeared field
+  std::vector<RealD> flowMeasureEnergyDensityPlaquette(const GaugeField& U, int measure_interval = 1);
+
+
+  //Evolve the gauge field by Nstep steps of epsilon and return the Cloverleaf energy density computed every interval steps
+  //The smeared field is output as V
+  std::vector<RealD> flowMeasureEnergyDensityCloverleaf(GaugeField &V, const GaugeField& U, int measure_interval = 1);
+
+  //Version that does not return the smeared field
+  std::vector<RealD> flowMeasureEnergyDensityCloverleaf(const GaugeField& U, int measure_interval = 1);
 };


@@ -83,7 +120,7 @@ public:
 // Implementations
 ////////////////////////////////////////////////////////////////////////////////
 template <class Gimpl>
-void WilsonFlow<Gimpl>::evolve_step(typename Gimpl::GaugeField &U) const{
+void WilsonFlow<Gimpl>::evolve_step(typename Gimpl::GaugeField &U, RealD &tau) const{
  GaugeField Z(U.Grid());
  GaugeField tmp(U.Grid());
  SG.deriv(U, Z);
@@ -99,12 +136,13 @@ void WilsonFlow<Gimpl>::evolve_step(typename Gimpl::GaugeField &U) const{
  SG.deriv(U, tmp); Z += tmp;                 // 4/3*(17/36*Z0 -8/9*Z1) +Z2
  Z *= 3.0/4.0;                               // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2
  Gimpl::update_field(Z, U, -2.0*epsilon);    // V(t+e) = exp(ep*Z)*W2
+  tau += epsilon;
 }

 template <class Gimpl>
-void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, RealD maxTau) {
-  if (maxTau - taus < epsilon){
-    epsilon = maxTau-taus;
+void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, RealD &tau, RealD &eps, RealD maxTau) const{
+  if (maxTau - tau < eps){
+    eps = maxTau-tau;
  }
  //std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
  GaugeField Z(U.Grid());
@@ -114,95 +152,151 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
  SG.deriv(U, Z);
  Zprime = -Z;
  Z *= 0.25;                                  // Z0 = 1/4 * F(U)
-  Gimpl::update_field(Z, U, -2.0*epsilon);    // U = W1 = exp(ep*Z0)*W0
+  Gimpl::update_field(Z, U, -2.0*eps);    // U = W1 = exp(ep*Z0)*W0

  Z *= -17.0/8.0;
  SG.deriv(U, tmp); Z += tmp;                 // -17/32*Z0 +Z1
  Zprime += 2.0*tmp;
  Z *= 8.0/9.0;                               // Z = -17/36*Z0 +8/9*Z1
-  Gimpl::update_field(Z, U, -2.0*epsilon);    // U_= W2 = exp(ep*Z)*W1
+  Gimpl::update_field(Z, U, -2.0*eps);    // U_= W2 = exp(ep*Z)*W1
    

  Z *= -4.0/3.0;
  SG.deriv(U, tmp); Z += tmp;                 // 4/3*(17/36*Z0 -8/9*Z1) +Z2
  Z *= 3.0/4.0;                               // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2
-  Gimpl::update_field(Z, U, -2.0*epsilon);    // V(t+e) = exp(ep*Z)*W2
+  Gimpl::update_field(Z, U, -2.0*eps);    // V(t+e) = exp(ep*Z)*W2

  // Ramos 
-  Gimpl::update_field(Zprime, Uprime, -2.0*epsilon); // V'(t+e) = exp(ep*Z')*W0
+  Gimpl::update_field(Zprime, Uprime, -2.0*eps); // V'(t+e) = exp(ep*Z')*W0
  // Compute distance as norm^2 of the difference
  GaugeField diffU = U - Uprime;
  RealD diff = norm2(diffU);
  // adjust integration step
    
-  taus += epsilon;
+  tau += eps;
  //std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
    
-  epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.);
+  eps = eps*0.95*std::pow(1e-4/diff,1./3.);
  //std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;

 }

+
 template <class Gimpl>
-RealD WilsonFlow<Gimpl>::energyDensityPlaquette(unsigned int step, const GaugeField& U) const {
-  RealD td = tau(step);
-  return 2.0 * td * td * SG.S(U)/U.Grid()->gSites();
+RealD WilsonFlow<Gimpl>::energyDensityPlaquette(const RealD t, const GaugeField& U){
+  static WilsonGaugeAction<Gimpl> SG(3.0);
+  return 2.0 * t * t * SG.S(U)/U.Grid()->gSites();
+}
+
+//Compute t^2 <E(t)> for flow time t from the 1x1 cloverleaf form of the field strength
+template <class Gimpl>
+RealD WilsonFlow<Gimpl>::energyDensityCloverleaf(const RealD t, const GaugeField& U){
+  typedef typename Gimpl::GaugeLinkField GaugeMat;
+  typedef typename Gimpl::GaugeField GaugeLorentz;
+
+  assert(Nd == 4); //the mu/nu loop bounds below are hard-coded for 4 dimensions
+  //E = 1/2 tr( F_munu F_munu )
+  //However as  F_numu = -F_munu, only need to sum the trace of the squares of the following 6 field strengths:
+  //F_01 F_02 F_03   F_12 F_13  F_23
+  GaugeMat F(U.Grid());
+  LatticeComplexD R(U.Grid());
+  R = Zero();
+  //Accumulate tr(F_munu F_munu) into R for each of the 6 planes with mu < nu
+  for(int mu=0;mu<3;mu++){
+    for(int nu=mu+1;nu<4;nu++){
+      WilsonLoops<Gimpl>::FieldStrength(F, U, mu, nu);
+      R = R + trace(F*F);
+    }
+  }
+  ComplexD out = sum(R);
+  out = t*t*out / RealD(U.Grid()->gSites()); //scale by t^2 and divide by the grid's gSites()
+  return -real(out); //minus sign necessary for +ve energy
+}
+
+template <class Gimpl>
+std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityPlaquette(GaugeField &V, const GaugeField& U, int measure_interval){
+  std::vector<RealD> out; //one entry is appended each time the measurement below fires
+  resetActions(); //drop whatever measurements are registered so only the one below runs during smear()
+  addMeasurement(measure_interval, [&out](int step, RealD t, const typename Gimpl::GaugeField &U){ 
+      std::cout << GridLogMessage << "[WilsonFlow] Computing plaquette energy density for step " << step << std::endl;
+      out.push_back( energyDensityPlaquette(t,U) );
+    });      
+  smear(V,U);
+  resetActions(); //the lambda references the local 'out'; remove it so a later smear() cannot write through a dangling reference
+  return out;
 }

 template <class Gimpl>
-RealD WilsonFlow<Gimpl>::energyDensityPlaquette(const GaugeField& U) const {
-  return 2.0 * taus * taus * SG.S(U)/U.Grid()->gSites();
+std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityPlaquette(const GaugeField& U, int measure_interval){
+  GaugeField V(U);
+  return flowMeasureEnergyDensityPlaquette(V,U, measure_interval);
 }

+template <class Gimpl>
+std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityCloverleaf(GaugeField &V, const GaugeField& U, int measure_interval){
+  std::vector<RealD> out; //one entry is appended each time the measurement below fires
+  resetActions(); //drop whatever measurements are registered so only the one below runs during smear()
+  addMeasurement(measure_interval, [&out](int step, RealD t, const typename Gimpl::GaugeField &U){ 
+      std::cout << GridLogMessage << "[WilsonFlow] Computing Cloverleaf energy density for step " << step << std::endl;
+      out.push_back( energyDensityCloverleaf(t,U) );
+    });      
+  smear(V,U);
+  resetActions(); //the lambda references the local 'out'; remove it so a later smear() cannot write through a dangling reference
+  return out;
+}
+template <class Gimpl>
+std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityCloverleaf(const GaugeField& U, int measure_interval){
+  GaugeField V(U); //scratch field passed as the output argument of the overload below; it is not returned to the caller
+  return flowMeasureEnergyDensityCloverleaf(V,U, measure_interval);
+}
+
+

 //#define WF_TIMING 
-
-
-
 template <class Gimpl>
 void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const{
  out = in;
-  for (unsigned int step = 1; step <= Nstep; step++) {
+  RealD taus = 0.;
+  for (unsigned int step = 1; step <= Nstep; step++) { //step indicates the number of smearing steps applied at the time of measurement
    auto start = std::chrono::high_resolution_clock::now();
-    evolve_step(out);
+    evolve_step(out, taus);
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
 #ifdef WF_TIMING
    std::cout << "Time to evolve " << diff.count() << " s\n";
 #endif
-    std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
-		  << step << "  " << tau(step) << "  " 
-	      << energyDensityPlaquette(step,out) << std::endl;
-    if( step % measure_interval == 0){
-      std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "
-		<< step << "  " 
-		<< WilsonLoops<PeriodicGimplR>::TopologicalCharge(out) << std::endl;
-    }
+    //Perform measurements
+    for(auto const &meas : functions)
+      if( step % meas.first == 0 ) meas.second(step,taus,out);
  }
 }

 template <class Gimpl>
-void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, RealD maxTau){
+void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, RealD maxTau) const{
  out = in;
-  taus = epsilon;
+  RealD taus = 0.;
+  RealD eps = epsilon;
  unsigned int step = 0;
  do{
    step++;
    //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
-    evolve_step_adaptive(out, maxTau);
-    std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
-		  << step << "  " << taus << "  "
-	      << energyDensityPlaquette(out) << std::endl;
-    if( step % measure_interval == 0){
-      std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "
-		<< step << "  " 
-		<< WilsonLoops<PeriodicGimplR>::TopologicalCharge(out) << std::endl;
-    }
+    evolve_step_adaptive(out, taus, eps, maxTau);
+    //Perform measurements
+    for(auto const &meas : functions)
+      if( step % meas.first == 0 ) meas.second(step,taus,out);
  } while (taus < maxTau);
-
-
-
 }

+//Append (without clearing existing ones) two stdout measurements: plaquette energy density every step, topological charge every topq_meas_interval steps
+template <class Gimpl>
+void WilsonFlow<Gimpl>::setDefaultMeasurements(int topq_meas_interval){
+  addMeasurement(1, [](int step, RealD t, const typename Gimpl::GaugeField &U){
+      std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "  << step << "  " << t << "  " << energyDensityPlaquette(t,U) << std::endl;
+    });
+  addMeasurement(topq_meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
+      std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "  << step << "  " << WilsonLoops<Gimpl>::TopologicalCharge(U) << std::endl;
+    });
+}
+
 NAMESPACE_END(Grid);

--- a/Grid/qcd/utils/CovariantCshift.h
+++ b/Grid/qcd/utils/CovariantCshift.h
@@ -88,6 +88,12 @@ namespace PeriodicBC {
    return CovShiftBackward(Link,mu,arg);
  }

+  //Boundary-aware C-shift of gauge links / gauge transformation matrices (in this namespace it simply forwards to Cshift)
+  template<class gauge> Lattice<gauge>
+  CshiftLink(const Lattice<gauge> &Link, int mu, int shift)
+  {
+    return Cshift(Link, mu, shift);
+  }

 }

@@ -158,6 +164,9 @@ namespace ConjugateBC {
    //    std::cout<<"Gparity::CovCshiftBackward mu="<<mu<<std::endl;
    return Cshift(tmp,mu,-1);// moves towards positive mu
  }
+
+  //Out(x) = U^dag_\mu(x-mu)  | x_\mu != 0
+  //       = U^T_\mu(L-1)  | x_\mu == 0
  template<class gauge> Lattice<gauge>
  CovShiftIdentityBackward(const Lattice<gauge> &Link, int mu) {
    GridBase *grid = Link.Grid();
@@ -176,6 +185,9 @@ namespace ConjugateBC {
    return Link;
  }

+  //Out(x) = S_\mu(x+\hat\mu)  | x_\mu != L-1
+  //       = S*_\mu(0)  | x_\mu == L-1
+  //Note: While this is used for Staples it is also applicable for shifting gauge links or gauge transformation matrices
  template<class gauge> Lattice<gauge>
  ShiftStaple(const Lattice<gauge> &Link, int mu)
  {
@@ -208,6 +220,35 @@ namespace ConjugateBC {
    return CovShiftBackward(Link,mu,arg);
  }

+  //Boundary-aware C-shift of gauge links / gauge transformation matrices; only shift = +1 or shift = -1 is accepted
+  //shift = 1
+  //Out(x) = U_\mu(x+\hat\mu)  | x_\mu != L-1
+  //       = U*_\mu(0)  | x_\mu == L-1
+  //shift = -1
+  //Out(x) = U_\mu(x-\hat\mu)  | x_\mu != 0
+  //       = U*_\mu(L-1)  | x_\mu == 0
+  template<class gauge> Lattice<gauge>
+  CshiftLink(const Lattice<gauge> &Link, int mu, int shift)
+  {
+    GridBase *grid = Link.Grid();
+    int Lmu = grid->GlobalDimensions()[mu] - 1; //largest coordinate value in direction mu (global extent minus one)
+
+    Lattice<iScalar<vInteger>> coor(grid);
+    LatticeCoordinate(coor, mu); //presumably fills coor with each site's coordinate along mu; it is compared against Lmu below
+
+    Lattice<gauge> tmp(grid);
+    if(shift == 1){
+      tmp = Cshift(Link, mu, 1);
+      tmp = where(coor == Lmu, conjugate(tmp), tmp); //shift first, then conjugate the sites at x_\mu == L-1
+      return tmp;
+    }else if(shift == -1){
+      tmp = Link;
+      tmp = where(coor == Lmu, conjugate(tmp), tmp); //conjugate the sites at x_\mu == L-1 first, then shift
+      return Cshift(tmp, mu, -1);
+    }else assert(0 && "Invalid shift value");
+    return tmp; //shuts up the compiler fussing about the return type
+  }
+
 }


--- a/Grid/qcd/utils/GaugeFix.h
+++ b/Grid/qcd/utils/GaugeFix.h
@@ -40,27 +40,46 @@ public:
  typedef typename Gimpl::GaugeLinkField GaugeMat;
  typedef typename Gimpl::GaugeField GaugeLorentz;

-  static void GaugeLinkToLieAlgebraField(const std::vector<GaugeMat> &U,std::vector<GaugeMat> &A) {
-    for(int mu=0;mu<Nd;mu++){
+  //A_\mu(x) = -i Ta(U_\mu(x) )   where Ta(U) = 1/2( U - U^dag ) - 1/2N tr(U - U^dag)  is the traceless antihermitian part. This is an O(A^3) approximation to the logarithm of U
+  static void GaugeLinkToLieAlgebraField(const GaugeMat &U, GaugeMat &A) {
    Complex cmi(0.0,-1.0);
-      A[mu] = Ta(U[mu]) * cmi;
+    A = Ta(U) * cmi;
  }
-  }
-  static void DmuAmu(const std::vector<GaugeMat> &A,GaugeMat &dmuAmu,int orthog) {
+  
+  //The derivative of the Lie algebra field
+  static void DmuAmu(const std::vector<GaugeMat> &U, GaugeMat &dmuAmu,int orthog) {
+    GridBase* grid = U[0].Grid();
+    GaugeMat Ax(grid);
+    GaugeMat Axm1(grid);
+    GaugeMat Utmp(grid);
+
    dmuAmu=Zero();
    for(int mu=0;mu<Nd;mu++){
      if ( mu != orthog ) {
-	dmuAmu = dmuAmu + A[mu] - Cshift(A[mu],mu,-1);
+	//Rather than define functionality to work out how the BCs apply to A_\mu we simply use the BC-aware Cshift to the gauge links and compute A_\mu(x) and A_\mu(x-1) separately
+	//Ax = A_\mu(x)
+	GaugeLinkToLieAlgebraField(U[mu], Ax);
+	
+	//Axm1 = A_\mu(x_\mu-1)
+	Utmp = Gimpl::CshiftLink(U[mu], mu, -1);
+	GaugeLinkToLieAlgebraField(Utmp, Axm1);
+	
+	//Derivative
+	dmuAmu = dmuAmu + Ax - Axm1;
      }
    }
  }  

-  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {
+  //Fix the gauge field Umu
+  //0 < alpha < 1 is related to the step size, cf https://arxiv.org/pdf/1405.5812.pdf
+  static void SteepestDescentGaugeFix(GaugeLorentz &Umu, Real alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {
    GridBase *grid = Umu.Grid();
    GaugeMat xform(grid);
    SteepestDescentGaugeFix(Umu,xform,alpha,maxiter,Omega_tol,Phi_tol,Fourier,orthog);
  }
-  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {
+
+  //Fix the gauge field Umu and also return the gauge transformation from the original gauge field, xform
+  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform, Real alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {

    GridBase *grid = Umu.Grid();

@@ -122,27 +141,24 @@ public:

      }
    }
+    assert(0 && "Gauge fixing did not converge within the specified number of iterations");
  };
-  static Real SteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform,Real & alpha, GaugeMat & dmuAmu,int orthog) {
+  static Real SteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform, Real alpha, GaugeMat & dmuAmu,int orthog) {
    GridBase *grid = U[0].Grid();

-    std::vector<GaugeMat> A(Nd,grid);
    GaugeMat g(grid);
-
-    GaugeLinkToLieAlgebraField(U,A);
-    ExpiAlphaDmuAmu(A,g,alpha,dmuAmu,orthog);
-
+    ExpiAlphaDmuAmu(U,g,alpha,dmuAmu,orthog);

    Real vol = grid->gSites();
    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;

    xform = g*xform ;
-    SU<Nc>::GaugeTransform(U,g);
+    SU<Nc>::GaugeTransform<Gimpl>(U,g);

    return trG;
  }

-  static Real FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform,Real & alpha, GaugeMat & dmuAmu,int orthog) {
+  static Real FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform, Real alpha, GaugeMat & dmuAmu,int orthog) {

    GridBase *grid = U[0].Grid();

@@ -157,11 +173,7 @@ public:

    GaugeMat g(grid);
    GaugeMat dmuAmu_p(grid);
-    std::vector<GaugeMat> A(Nd,grid);
-
-    GaugeLinkToLieAlgebraField(U,A);
-
-    DmuAmu(A,dmuAmu,orthog);
+    DmuAmu(U,dmuAmu,orthog);

    std::vector<int> mask(Nd,1);
    for(int mu=0;mu<Nd;mu++) if (mu==orthog) mask[mu]=0;
@@ -205,16 +217,16 @@ public:
    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;

    xform = g*xform ;
-    SU<Nc>::GaugeTransform(U,g);
+    SU<Nc>::GaugeTransform<Gimpl>(U,g);

    return trG;
  }

-  static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &A,GaugeMat &g,Real & alpha, GaugeMat &dmuAmu,int orthog) {
+  static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &U,GaugeMat &g, Real alpha, GaugeMat &dmuAmu,int orthog) {
    GridBase *grid = g.Grid();
    Complex cialpha(0.0,-alpha);
    GaugeMat ciadmam(grid);
-    DmuAmu(A,dmuAmu,orthog);
+    DmuAmu(U,dmuAmu,orthog);
    ciadmam = dmuAmu*cialpha;
    SU<Nc>::taExp(ciadmam,g);
  }  
--- a/Grid/qcd/utils/SUn.h
+++ b/Grid/qcd/utils/SUn.h
@@ -694,32 +694,32 @@ public:
 * Adjoint rep gauge xform
 */

-  template<typename GaugeField,typename GaugeMat>
-  static void GaugeTransform( GaugeField &Umu, GaugeMat &g){
+  template<typename Gimpl>
+  static void GaugeTransform(typename Gimpl::GaugeField &Umu, typename Gimpl::GaugeLinkField &g){
    GridBase *grid = Umu.Grid();
    conformable(grid,g.Grid());

-    GaugeMat U(grid);
-    GaugeMat ag(grid); ag = adj(g);
+    typename Gimpl::GaugeLinkField U(grid);
+    typename Gimpl::GaugeLinkField ag(grid); ag = adj(g);

    for(int mu=0;mu<Nd;mu++){
      U= PeekIndex<LorentzIndex>(Umu,mu);
-      U = g*U*Cshift(ag, mu, 1);
+      U = g*U*Gimpl::CshiftLink(ag, mu, 1); //BC-aware
      PokeIndex<LorentzIndex>(Umu,U,mu);
    }
  }
-  template<typename GaugeMat>
-  static void GaugeTransform( std::vector<GaugeMat> &U, GaugeMat &g){
+  template<typename Gimpl>
+  static void GaugeTransform( std::vector<typename Gimpl::GaugeLinkField> &U, typename Gimpl::GaugeLinkField &g){
    GridBase *grid = g.Grid();
-    GaugeMat ag(grid); ag = adj(g);
+    typename Gimpl::GaugeLinkField ag(grid); ag = adj(g);
    for(int mu=0;mu<Nd;mu++){
-      U[mu] = g*U[mu]*Cshift(ag, mu, 1);
+      U[mu] = g*U[mu]*Gimpl::CshiftLink(ag, mu, 1); //BC-aware
    }
  }
-  template<typename GaugeField,typename GaugeMat>
-  static void RandomGaugeTransform(GridParallelRNG &pRNG, GaugeField &Umu, GaugeMat &g){
+  template<typename Gimpl>
+  static void RandomGaugeTransform(GridParallelRNG &pRNG, typename Gimpl::GaugeField &Umu, typename Gimpl::GaugeLinkField &g){
    LieRandomize(pRNG,g,1.0);
-    GaugeTransform(Umu,g);
+    GaugeTransform<Gimpl>(Umu,g);
  }

  // Projects the algebra components a lattice matrix (of dimension ncol*ncol -1 )
--- a/Grid/qcd/utils/WilsonLoops.h
+++ b/Grid/qcd/utils/WilsonLoops.h
@@ -125,6 +125,56 @@ public:
    return sumplaq / vol / faces / Nc; // Nd , Nc dependent... FIXME
  }

+  //////////////////////////////////////////////////
+  // sum over all spatial planes of plaquette
+  //////////////////////////////////////////////////
+  static void siteSpatialPlaquette(ComplexField &Plaq,
+                            const std::vector<GaugeMat> &U) {
+    ComplexField sitePlaq(U[0].Grid());
+    Plaq = Zero();
+    for (int mu = 1; mu < Nd-1; mu++) {
+      for (int nu = 0; nu < mu; nu++) {
+        traceDirPlaquette(sitePlaq, U, mu, nu);
+        Plaq = Plaq + sitePlaq;
+      }
+    }
+  }
+
+  //////////////////////////////////////////////////
+  // sum over all x,y,z and over all spatial planes of plaquette; one value per slice along direction Nd-1
+  //////////////////////////////////////////////////
+  static std::vector<RealD> timesliceSumSpatialPlaquette(const GaugeLorentz &Umu) {
+    std::vector<GaugeMat> U(Nd, Umu.Grid());
+    // inefficient here: every link direction is copied out of Umu
+    for (int mu = 0; mu < Nd; mu++) {
+      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
+    }
+
+    ComplexField Plaq(Umu.Grid());
+
+    siteSpatialPlaquette(Plaq, U); // Plaq = per-site sum of the traced spatial plaquettes
+    typedef typename ComplexField::scalar_object sobj;
+    std::vector<sobj> Tq;
+    sliceSum(Plaq, Tq, Nd-1);
+
+    std::vector<RealD> out(Tq.size()); // RealD (not Real) so the element type matches the declared return type
+    for(int t=0;t<Tq.size();t++) out[t] = TensorRemove(Tq[t]).real();
+    return out;
+  }
+  
+  //////////////////////////////////////////////////
+  // average over all x,y,z and over all spatial planes of plaquette; returns one value per timeslice
+  //////////////////////////////////////////////////
+  static std::vector<RealD> timesliceAvgSpatialPlaquette(const GaugeLorentz &Umu) {
+    std::vector<RealD> sumplaq = timesliceSumSpatialPlaquette(Umu);
+    int Lt = Umu.Grid()->FullDimensions()[Nd-1]; // global extent of direction Nd-1
+    assert(sumplaq.size() == Lt); // one summed value per slice is expected
+    double vol = Umu.Grid()->gSites() / Lt; // number of sites in a single slice
+    double faces = (1.0 * (Nd - 1)* (Nd - 2)) / 2.0; // number of planes spanned by the other Nd-1 directions
+    for(int t=0;t<Lt;t++)
+      sumplaq[t] = sumplaq[t] / vol / faces / Nc; // Nd , Nc dependent... FIXME
+    return sumplaq;
+  }

  //////////////////////////////////////////////////
  // average over all x,y,z the temporal loop
@@ -363,11 +413,11 @@ public:
    GaugeMat u = PeekIndex<LorentzIndex>(Umu, mu);  // some redundant copies
    GaugeMat vu = v*u;
      //FS = 0.25*Ta(u*v + Cshift(vu, mu, -1));
-      FS = (u*v + Cshift(vu, mu, -1));
+      FS = (u*v + Gimpl::CshiftLink(vu, mu, -1));
      FS = 0.125*(FS - adj(FS));
  }

-  static Real TopologicalCharge(GaugeLorentz &U){
+  static Real TopologicalCharge(const GaugeLorentz &U){
    // 4d topological charge
    assert(Nd==4);
    // Bx = -iF(y,z), By = -iF(z,y), Bz = -iF(x,y)
@@ -390,6 +440,203 @@ public:
  }


+  //Clover-leaf Wilson loop combination for arbitrary mu-extent M and nu extent N,  mu >= nu
+  //cf  https://arxiv.org/pdf/hep-lat/9701012.pdf Eq 7  for 1x2 Wilson loop    
+  //Clockwise ordering
+  static void CloverleafMxN(GaugeMat &FS, const GaugeMat &Umu, const GaugeMat &Unu, int mu, int nu, int M, int N){  
+#define Fmu(A) Gimpl::CovShiftForward(Umu, mu, A)
+#define Bmu(A) Gimpl::CovShiftBackward(Umu, mu, A)
+#define Fnu(A) Gimpl::CovShiftForward(Unu, nu, A)
+#define Bnu(A) Gimpl::CovShiftBackward(Unu, nu, A)
+#define FmuI Gimpl::CovShiftIdentityForward(Umu, mu)
+#define BmuI Gimpl::CovShiftIdentityBackward(Umu, mu)
+#define FnuI Gimpl::CovShiftIdentityForward(Unu, nu)
+#define BnuI Gimpl::CovShiftIdentityBackward(Unu, nu)
+
+    //Upper right loop
+    GaugeMat tmp = BmuI;
+    for(int i=1;i<M;i++)
+      tmp = Bmu(tmp);
+    for(int j=0;j<N;j++)
+      tmp = Bnu(tmp);
+    for(int i=0;i<M;i++)
+      tmp = Fmu(tmp);
+    for(int j=0;j<N;j++)
+      tmp = Fnu(tmp);
+      
+    FS = tmp;
+
+    //Upper left loop
+    tmp = BnuI;
+    for(int j=1;j<N;j++)
+      tmp = Bnu(tmp);
+    for(int i=0;i<M;i++)
+      tmp = Fmu(tmp);
+    for(int j=0;j<N;j++)
+      tmp = Fnu(tmp);
+    for(int i=0;i<M;i++)
+      tmp = Bmu(tmp);
+      
+    FS = FS + tmp;
+
+    //Lower right loop
+    tmp = FnuI;
+    for(int j=1;j<N;j++)
+      tmp = Fnu(tmp);
+    for(int i=0;i<M;i++)
+      tmp = Bmu(tmp);
+    for(int j=0;j<N;j++)
+      tmp = Bnu(tmp);
+    for(int i=0;i<M;i++)
+      tmp = Fmu(tmp);
+      
+    FS = FS + tmp;
+
+    //Lower left loop
+    tmp = FmuI;
+    for(int i=1;i<M;i++)
+      tmp = Fmu(tmp);
+    for(int j=0;j<N;j++)
+      tmp = Fnu(tmp);
+    for(int i=0;i<M;i++)
+      tmp = Bmu(tmp);
+    for(int j=0;j<N;j++)
+      tmp = Bnu(tmp);
+
+    FS = FS + tmp;
+
+#undef Fmu
+#undef Bmu
+#undef Fnu
+#undef Bnu
+#undef FmuI
+#undef BmuI
+#undef FnuI
+#undef BnuI
+  }
+
+  //Field strength from MxN Wilson loop: FS is built from the anti-Hermitian part of the clover-leaf sum
+  //Note F_numu = - F_munu
+  static void FieldStrengthMxN(GaugeMat &FS, const GaugeLorentz &U, int mu, int nu, int M, int N){  
+    GaugeMat Umu = PeekIndex<LorentzIndex>(U, mu);
+    GaugeMat Unu = PeekIndex<LorentzIndex>(U, nu);
+    if(M == N){
+      GaugeMat F(Umu.Grid());
+      CloverleafMxN(F, Umu, Unu, mu, nu, M, N);
+      FS = 0.125 * ( F - adj(F) ); // single orientation: weight 0.125
+    }else{
+      //Average over both orientations
+      GaugeMat horizontal(Umu.Grid()), vertical(Umu.Grid());
+      CloverleafMxN(horizontal, Umu, Unu, mu, nu, M, N);
+      CloverleafMxN(vertical, Umu, Unu, mu, nu, N, M); // same links, extents M and N swapped
+      FS = 0.0625 * ( horizontal - adj(horizontal) + vertical - adj(vertical) ); // two orientations: half the M==N weight each
+    }
+  }
+
+  //Topological charge contribution from MxN Wilson loops
+  //cf  https://arxiv.org/pdf/hep-lat/9701012.pdf  Eq 6
+  //output is the charge by timeslice: sum over timeslices to obtain the total
+  static std::vector<Real> TimesliceTopologicalChargeMxN(const GaugeLorentz &U, int M, int N){
+    assert(Nd == 4);
+    std::vector<std::vector<GaugeMat*> > F(Nd,std::vector<GaugeMat*>(Nd,nullptr)); // owning raw pointers: only nu > mu entries are allocated, freed below
+    //Note F_numu = - F_munu
+    //hence we only need to loop over mu,nu,rho,sigma that aren't related by permuting mu,nu  or rho,sigma
+    //Use nu > mu
+    for(int mu=0;mu<Nd-1;mu++){
+      for(int nu=mu+1; nu<Nd; nu++){
+	F[mu][nu] = new GaugeMat(U.Grid());
+	FieldStrengthMxN(*F[mu][nu], U, mu, nu, M, N);
+      }
+    }
+    Real coeff = -1./(32 * M_PI*M_PI * M*M * N*N); //overall sign to match CPS and Grid conventions, possibly related to time direction = 3 vs 0
+
+    static const int combs[3][4] = { {0,1,2,3}, {0,2,1,3}, {0,3,1,2} }; // (mu,nu,rho,sigma) with mu<nu and rho<sigma, so every F used below was allocated
+    static const int signs[3] = { 1, -1, 1 }; //epsilon_{mu nu rho sigma}
+
+    ComplexField fsum(U.Grid());
+    fsum = Zero();
+    for(int c=0;c<3;c++){
+      int mu = combs[c][0], nu = combs[c][1], rho = combs[c][2], sigma = combs[c][3];
+      int eps = signs[c];
+      fsum = fsum + (8. * coeff * eps) * trace( (*F[mu][nu]) * (*F[rho][sigma]) ); // NOTE(review): 8 presumably counts the index orderings equivalent to each combs row -- confirm
+    }
+
+    for(int mu=0;mu<Nd-1;mu++) // release exactly the entries allocated above
+      for(int nu=mu+1; nu<Nd; nu++)
+	delete F[mu][nu];
+    
+    typedef typename ComplexField::scalar_object sobj;
+    std::vector<sobj> Tq;
+    sliceSum(fsum, Tq, Nd-1);
+
+    std::vector<Real> out(Tq.size());
+    for(int t=0;t<Tq.size();t++) out[t] = TensorRemove(Tq[t]).real(); // keep only the real part of each slice sum
+    return out;
+  }
+  static Real TopologicalChargeMxN(const GaugeLorentz &U, int M, int N){
+    std::vector<Real> Tq = TimesliceTopologicalChargeMxN(U,M,N);
+    Real out(0);
+    for(int t=0;t<Tq.size();t++) out += Tq[t];
+    return out;
+  }
+
+  //Generate the contributions to the 5Li topological charge from Wilson loops of the following sizes
+  //Use coefficients from hep-lat/9701012
+  //1x1 : c1=(19.-55.*c5)/9.
+  //2x2 : c2=(1-64.*c5)/9.
+  //1x2 : c3=(-64.+640.*c5)/45.
+  //1x3 : c4=1./5.-2.*c5
+  //3x3 : c5=1./20.
+  //Output array outer index contains the loops in the above order
+  //Inner index is the time coordinate
+  static std::vector<std::vector<Real> > TimesliceTopologicalCharge5LiContributions(const GaugeLorentz &U){
+    static const int exts[5][2] = { {1,1}, {2,2}, {1,2}, {1,3}, {3,3} };       
+    std::vector<std::vector<Real> > out(5);
+    for(int i=0;i<5;i++){	
+      out[i] = TimesliceTopologicalChargeMxN(U,exts[i][0],exts[i][1]);
+    }
+    return out;
+  }   
+
+  static std::vector<Real> TopologicalCharge5LiContributions(const GaugeLorentz &U){   
+    static const int exts[5][2] = { {1,1}, {2,2}, {1,2}, {1,3}, {3,3} };
+    std::vector<Real> out(5);
+    std::cout << GridLogMessage << "Computing topological charge" << std::endl;
+    for(int i=0;i<5;i++){
+      out[i] = TopologicalChargeMxN(U,exts[i][0],exts[i][1]);
+      std::cout << GridLogMessage << exts[i][0] << "x" << exts[i][1] << " Wilson loop contribution " << out[i] << std::endl;
+    }
+    return out;
+  }
+
+  //Compute the 5Li topological charge
+  static std::vector<Real> TimesliceTopologicalCharge5Li(const GaugeLorentz &U){
+    std::vector<std::vector<Real> > loops = TimesliceTopologicalCharge5LiContributions(U);
+
+    double c5=1./20.;
+    double c4=1./5.-2.*c5;
+    double c3=(-64.+640.*c5)/45.;
+    double c2=(1-64.*c5)/9.;
+    double c1=(19.-55.*c5)/9.;
+
+    int Lt = loops[0].size();
+    std::vector<Real> out(Lt,0.);
+    for(int t=0;t<Lt;t++)
+      out[t] += c1*loops[0][t] + c2*loops[1][t] + c3*loops[2][t] + c4*loops[3][t] + c5*loops[4][t];
+    return out;
+  }
+
+  static Real TopologicalCharge5Li(const GaugeLorentz &U){
+    std::vector<Real> Qt = TimesliceTopologicalCharge5Li(U);
+    Real Q = 0.;
+    for(int t=0;t<Qt.size();t++) Q += Qt[t];
+    std::cout << GridLogMessage << "5Li Topological charge: " << Q << std::endl;
+    return Q;
+  }
+
+
+
+
  //////////////////////////////////////////////////////
  // Similar to above for rectangle is required
  //////////////////////////////////////////////////////
--- a/Grid/sitmo_rng/README
+++ b/Grid/sitmo_rng/README
--- a/Grid/random/gaussian.h
+++ b/Grid/random/gaussian.h
@@ -0,0 +1,200 @@
+// -*- C++ -*-
+//===--------------------------- random -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Peter Boyle: Taken from libc++ in Clang/LLVM.
+// Reason: libstdc++ and libc++ (Clang) differ in the order in which normal_distribution returns the values from its Box-Muller type step.
+// Standardise on one implementation and call it "gaussian_distribution".
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <cmath>
+#include <type_traits>
+#include <initializer_list>
+#include <limits>
+#include <algorithm>
+#include <numeric>
+#include <vector>
+#include <string>
+#include <istream>
+#include <ostream>
+#include <random>
+
+// normal_distribution -> gaussian distribution
+namespace Grid {
+
+template<class _RealType = double>
+class  gaussian_distribution
+{
+public:
+    // types
+    typedef _RealType result_type;
+
+    class param_type
+    {
+        result_type __mean_;
+        result_type __stddev_;
+    public:
+        typedef gaussian_distribution distribution_type;
+
+        strong_inline
+        explicit param_type(result_type __mean = 0, result_type __stddev = 1)
+            : __mean_(__mean), __stddev_(__stddev) {}
+
+        strong_inline
+        result_type mean() const {return __mean_;}
+        strong_inline
+        result_type stddev() const {return __stddev_;}
+
+        friend strong_inline
+            bool operator==(const param_type& __x, const param_type& __y)
+            {return __x.__mean_ == __y.__mean_ && __x.__stddev_ == __y.__stddev_;}
+        friend strong_inline
+            bool operator!=(const param_type& __x, const param_type& __y)
+            {return !(__x == __y);}
+    };
+
+private:
+    param_type __p_;
+    result_type _V_;
+    bool _V_hot_;
+
+public:
+    // constructors and reset functions
+    strong_inline
+    explicit gaussian_distribution(result_type __mean = 0, result_type __stddev = 1)
+        : __p_(param_type(__mean, __stddev)), _V_hot_(false) {}
+    strong_inline
+    explicit gaussian_distribution(const param_type& __p)
+        : __p_(__p), _V_hot_(false) {}
+    strong_inline
+    void reset() {_V_hot_ = false;}
+
+    // generating functions
+    template<class _URNG>
+        strong_inline
+        result_type operator()(_URNG& __g)
+        {return (*this)(__g, __p_);}
+    template<class _URNG> result_type operator()(_URNG& __g, const param_type& __p);
+
+    // property functions
+    strong_inline
+    result_type mean() const {return __p_.mean();}
+    strong_inline
+    result_type stddev() const {return __p_.stddev();}
+
+    strong_inline
+    param_type param() const {return __p_;}
+    strong_inline
+    void param(const param_type& __p) {__p_ = __p;}
+
+    strong_inline
+    result_type min() const {return -std::numeric_limits<result_type>::infinity();}
+    strong_inline
+    result_type max() const {return std::numeric_limits<result_type>::infinity();}
+
+    friend strong_inline
+        bool operator==(const gaussian_distribution& __x,
+                        const gaussian_distribution& __y)
+        {return __x.__p_ == __y.__p_ && __x._V_hot_ == __y._V_hot_ &&
+                (!__x._V_hot_ || __x._V_ == __y._V_);}
+    friend strong_inline
+        bool operator!=(const gaussian_distribution& __x,
+                        const gaussian_distribution& __y)
+        {return !(__x == __y);}
+
+    template <class _CharT, class _Traits, class _RT>
+    friend
+    std::basic_ostream<_CharT, _Traits>&
+    operator<<(std::basic_ostream<_CharT, _Traits>& __os,
+               const gaussian_distribution<_RT>& __x);
+
+    template <class _CharT, class _Traits, class _RT>
+    friend
+    std::basic_istream<_CharT, _Traits>&
+    operator>>(std::basic_istream<_CharT, _Traits>& __is,
+               gaussian_distribution<_RT>& __x);
+};
+
+template <class _RealType>
+template<class _URNG>
+_RealType
+gaussian_distribution<_RealType>::operator()(_URNG& __g, const param_type& __p)
+{
+    result_type _Up;
+    if (_V_hot_)
+    {
+        _V_hot_ = false;
+        _Up = _V_;
+    }
+    else
+    {
+        std::uniform_real_distribution<result_type> _Uni(-1, 1);
+        result_type __u;
+        result_type __v;
+        result_type __s;
+        do
+        {
+            __u = _Uni(__g);
+            __v = _Uni(__g);
+            __s = __u * __u + __v * __v;
+        } while (__s > 1 || __s == 0);
+        result_type _Fp = std::sqrt(-2 * std::log(__s) / __s);
+        _V_ = __v * _Fp;
+        _V_hot_ = true;
+        _Up = __u * _Fp;
+    }
+    return _Up * __p.stddev() + __p.mean();
+}
+
+template <class _CharT, class _Traits, class _RT>
+std::basic_ostream<_CharT, _Traits>&
+operator<<(std::basic_ostream<_CharT, _Traits>& __os,
+           const gaussian_distribution<_RT>& __x)
+{
+    auto __save_flags = __os.flags();
+    __os.flags(std::ios_base::dec | std::ios_base::left | std::ios_base::fixed |
+               std::ios_base::scientific);
+    _CharT __sp = __os.widen(' ');
+    __os.fill(__sp);
+    __os << __x.mean() << __sp << __x.stddev() << __sp << __x._V_hot_;
+    if (__x._V_hot_)
+        __os << __sp << __x._V_;
+    __os.flags(__save_flags);
+    return __os;
+}
+
+template <class _CharT, class _Traits, class _RT>
+std::basic_istream<_CharT, _Traits>&
+operator>>(std::basic_istream<_CharT, _Traits>& __is,
+           gaussian_distribution<_RT>& __x)
+{
+    typedef gaussian_distribution<_RT> _Eng;
+    typedef typename _Eng::result_type result_type;
+    typedef typename _Eng::param_type param_type;
+    auto __save_flags = __is.flags();
+    __is.flags(std::ios_base::dec | std::ios_base::skipws);
+    result_type __mean;
+    result_type __stddev;
+    result_type _Vp = 0;
+    bool _V_hot = false;
+    __is >> __mean >> __stddev >> _V_hot;
+    if (_V_hot)
+        __is >> _Vp;
+    if (!__is.fail())
+    {
+        __x.param(param_type(__mean, __stddev));
+        __x._V_hot_ = _V_hot;
+        __x._V_ = _Vp;
+    }
+    __is.flags(__save_flags);
+    return __is;
+}
+}
--- a/Grid/sitmo_rng/sitmo_prng_engine.hpp
+++ b/Grid/sitmo_rng/sitmo_prng_engine.hpp
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -322,8 +322,8 @@ public:
    int simd_layout     = _grid->_simd_layout[dimension];
    int comm_dim        = _grid->_processors[dimension] >1 ;

-    int recv_from_rank;
-    int xmit_to_rank;
+    //    int recv_from_rank;
+    //    int xmit_to_rank;

    if ( ! comm_dim ) return 1;
    if ( displacement == 0 ) return 1;
--- a/Grid/tensors/Tensor_extract_merge.h
+++ b/Grid/tensors/Tensor_extract_merge.h
@@ -208,5 +208,46 @@ void merge(vobj &vec,const ExtractPointerArray<sobj> &extracted, int offset)
 }


+
+//////////////////////////////////////////////////////////////////////////////////
+//Copy a single lane of a SIMD tensor type from one object to another
+//Output object must be of the same tensor type but may be of a different precision (i.e. it can have a different root data type)
+///////////////////////////////////////////////////////////////////////////////////
+template<class vobjOut, class vobjIn>
+accelerator_inline 
+void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __restrict__ vecIn, int lane_in)
+{
+  static_assert( std::is_same<typename vobjOut::DoublePrecision, typename vobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
+
+  typedef typename vobjOut::vector_type ovector_type;  
+  typedef typename vobjIn::vector_type ivector_type;  
+  constexpr int owords=sizeof(vobjOut)/sizeof(ovector_type);
+  constexpr int iwords=sizeof(vobjIn)/sizeof(ivector_type);
+  static_assert( owords == iwords, "copyLane: Expected number of vector words in input and output objects to be equal" );
+
+  typedef typename vobjOut::scalar_type oscalar_type;  
+  typedef typename vobjIn::scalar_type iscalar_type;  
+  typedef typename ExtractTypeMap<oscalar_type>::extract_type oextract_type;
+  typedef typename ExtractTypeMap<iscalar_type>::extract_type iextract_type;
+
+  typedef oextract_type * opointer;
+  typedef iextract_type * ipointer;
+
+  constexpr int oNsimd=ovector_type::Nsimd();
+  constexpr int iNsimd=ivector_type::Nsimd();
+
+  iscalar_type itmp;
+  oscalar_type otmp;
+
+  opointer __restrict__  op = (opointer)&vecOut;
+  ipointer __restrict__  ip = (ipointer)&vecIn;
+  for(int w=0;w<owords;w++){
+    memcpy( (char*)&itmp, (char*)(ip + lane_in + iNsimd*w), sizeof(iscalar_type) );
+    otmp = itmp; //potential precision change
+    memcpy( (char*)(op + lane_out + oNsimd*w), (char*)&otmp, sizeof(oscalar_type) );
+  }
+}
+
+
 NAMESPACE_END(Grid);

--- a/Grid/tensors/Tensor_traits.h
+++ b/Grid/tensors/Tensor_traits.h
@@ -47,20 +47,20 @@ NAMESPACE_BEGIN(Grid);
  class TypePair {
  public:
    T _internal[2];
-    TypePair<T>& operator=(const Grid::Zero& o) {
+    accelerator TypePair<T>& operator=(const Grid::Zero& o) {
      _internal[0] = Zero();
      _internal[1] = Zero();
      return *this;
    }

-    TypePair<T> operator+(const TypePair<T>& o) const {
+    accelerator TypePair<T> operator+(const TypePair<T>& o) const {
      TypePair<T> r;
      r._internal[0] = _internal[0] + o._internal[0];
      r._internal[1] = _internal[1] + o._internal[1];
      return r;
    }

-    TypePair<T>& operator+=(const TypePair<T>& o) {
+    accelerator TypePair<T>& operator+=(const TypePair<T>& o) {
      _internal[0] += o._internal[0];
      _internal[1] += o._internal[1];
      return *this;
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -74,29 +74,43 @@ void acceleratorInit(void)
      //      GPU_PROP(singleToDoublePrecisionPerfRatio);
    }
  }
+
  MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
 #undef GPU_PROP_FMT    
 #undef GPU_PROP

 #ifdef GRID_DEFAULT_GPU
+  int device = 0;
  // IBM Jsrun makes cuda Device numbering screwy and not match rank
  if ( world_rank == 0 ) {
    printf("AcceleratorCudaInit: using default device \n");
-    printf("AcceleratorCudaInit: assume user either uses a) IBM jsrun, or \n");
+    printf("AcceleratorCudaInit: assume user either uses\n");
+    printf("AcceleratorCudaInit: a) IBM jsrun, or \n");
    printf("AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
    printf("AcceleratorCudaInit: Configure options --enable-setdevice=no \n");
  }
 #else
+  int device = rank;
  printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank);
  printf("AcceleratorCudaInit: Configure options --enable-setdevice=yes \n");
-  cudaSetDevice(rank);
 #endif
+
+  cudaSetDevice(device);
+  cudaStreamCreate(&copyStream);
+  const int len=64;
+  char busid[len];
+  if( rank == world_rank ) { 
+    cudaDeviceGetPCIBusId(busid, len, device);
+    printf("local rank %d device %d bus id: %s\n", rank, device, busid);
+  }
+
  if ( world_rank == 0 )  printf("AcceleratorCudaInit: ================================================\n");
 }
 #endif

 #ifdef GRID_HIP
 hipDeviceProp_t *gpu_props;
+hipStream_t copyStream;
 void acceleratorInit(void)
 {
  int nDevices = 1;
@@ -154,16 +168,25 @@ void acceleratorInit(void)
 #ifdef GRID_DEFAULT_GPU
  if ( world_rank == 0 ) {
    printf("AcceleratorHipInit: using default device \n");
-    printf("AcceleratorHipInit: assume user either uses a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
-    printf("AcceleratorHipInit: Configure options --enable-summit, --enable-select-gpu=no \n");
+    printf("AcceleratorHipInit: assume user or srun sets ROCR_VISIBLE_DEVICES and numa binding \n");
+    printf("AcceleratorHipInit: Configure options --enable-setdevice=no \n");
  }
+  int device = 0;
 #else
  if ( world_rank == 0 ) {
    printf("AcceleratorHipInit: rank %d setting device to node rank %d\n",world_rank,rank);
-    printf("AcceleratorHipInit: Configure options --enable-select-gpu=yes \n");
+    printf("AcceleratorHipInit: Configure options --enable-setdevice=yes \n");
  }
-  hipSetDevice(rank);
+  int device = rank;
 #endif
+  hipSetDevice(device);
+  hipStreamCreate(&copyStream);
+  const int len=64;
+  char busid[len];
+  if( rank == world_rank ) { 
+    hipDeviceGetPCIBusId(busid, len, device);
+    printf("local rank %d device %d bus id: %s\n", rank, device, busid);
+  }
  if ( world_rank == 0 )  printf("AcceleratorHipInit: ================================================\n");
 }
 #endif
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -95,6 +95,7 @@ void     acceleratorInit(void);
 //////////////////////////////////////////////

 #ifdef GRID_CUDA
+
 #include <cuda.h>

 #ifdef __CUDA_ARCH__
@@ -115,6 +116,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #endif
 } // CUDA specific

+inline void cuda_mem(void)
+{
+  size_t free_t,total_t,used_t;
+  cudaMemGetInfo(&free_t,&total_t);
+  used_t=total_t-free_t;
+  std::cout << " MemoryManager : GPU used "<<used_t<<" free "<<free_t<< " total "<<total_t<<std::endl;
+}
+
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  {									\
    int nt=acceleratorThreads();					\
@@ -197,7 +206,8 @@ inline void *acceleratorAllocShared(size_t bytes)
  auto err = cudaMallocManaged((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
-    printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
+    printf(" cudaMallocManaged failed for %lu %s \n",bytes,cudaGetErrorString(err)); fflush(stdout);
+    if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);
  }
  return ptr;
 };
@@ -207,20 +217,53 @@ inline void *acceleratorAllocDevice(size_t bytes)
  auto err = cudaMalloc((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
-    printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
+    printf(" cudaMalloc failed for %lu %s \n",bytes,cudaGetErrorString(err)); fflush(stdout);
+    if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);
  }
  return ptr;
 };
-inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
-inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
-inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
+inline void acceleratorFreeShared(void *ptr){
+  auto err = cudaFree(ptr);
+  if( err != cudaSuccess ) {
+    printf(" cudaFree(Shared) failed %s \n",cudaGetErrorString(err)); fflush(stdout);
+    if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);
+  }
+};
+inline void acceleratorFreeDevice(void *ptr){
+  auto err = cudaFree(ptr);
+  if( err != cudaSuccess ) {
+    printf(" cudaFree(Device) failed %s \n",cudaGetErrorString(err)); fflush(stdout);
+    if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);
+  }
+};
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  {
+  auto err = cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);
+  if( err != cudaSuccess ) {
+    printf(" cudaMemcpy(host->device) failed for %lu %s \n",bytes,cudaGetErrorString(err)); fflush(stdout);
+    if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);
+  }
+}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){
+  auto err = cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);
+  if( err != cudaSuccess ) {
+    printf(" cudaMemcpy(device->host) failed for %lu %s \n",bytes,cudaGetErrorString(err)); fflush(stdout);
+    if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);
+  }
+}
+inline void acceleratorMemSet(void *base,int value,size_t bytes) {
+  auto err = cudaMemset(base,value,bytes);
+  if( err != cudaSuccess ) {
+    printf(" cudaMemSet failed for %lu %s \n",bytes,cudaGetErrorString(err)); fflush(stdout);
+    if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);
+  }
+}
+
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
  cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
 }
 inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
+
 inline int  acceleratorIsCommunicable(void *ptr)
 {
  //  int uvm=0;
@@ -297,7 +340,7 @@ inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  {
  theGridAccelerator->memcpy(to,from,bytes);
 }
-inline void acceleratorCopySynchronise(void) {  theGridAccelerator->wait(); }
+inline void acceleratorCopySynchronise(void) {  theGridAccelerator->wait(); } // waits on theGridAccelerator only; no per-call console output
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();}
@@ -328,10 +371,11 @@ NAMESPACE_BEGIN(Grid);
 #define accelerator        __host__ __device__
 #define accelerator_inline __host__ __device__ inline

+extern hipStream_t copyStream;
 /*These routines define mapping from thread grid to loop & vector lane indexing */
 accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #ifdef GRID_SIMT
-  return hipThreadIdx_z; 
+  return hipThreadIdx_x; 
 #else
  return 0;
 #endif
@@ -345,19 +389,41 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
      { __VA_ARGS__;}							\
    };									\
    int nt=acceleratorThreads();					\
-    dim3 hip_threads(nt,1,nsimd);					\
+    dim3 hip_threads(nsimd, nt, 1);					 \
    dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
+    if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \
+      hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads,		\
+            0,0,						\
+            num1,num2,nsimd, lambda);				\
+    } else { \
      hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads,		\
            0,0,						\
            num1,num2,nsimd, lambda);				\
+    } \
+  }
+
+
+template<typename lambda>  __global__
+__launch_bounds__(64,1)
+void LambdaApply64(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
+{
+  // Following the same scheme as CUDA for now
+  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
+  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
+  uint64_t z = threadIdx.x;
+  if ( (x < numx) && (y<numy) && (z<numz) ) {
+    Lambda(x,y,z);
+  }
 }

 template<typename lambda>  __global__
+__launch_bounds__(1024,1)
 void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
 {
-  uint64_t x = hipThreadIdx_x + hipBlockDim_x*hipBlockIdx_x;
-  uint64_t y = hipThreadIdx_y + hipBlockDim_y*hipBlockIdx_y;
-  uint64_t z = hipThreadIdx_z ;//+ hipBlockDim_z*hipBlockIdx_z;
+  // Following the same scheme as CUDA for now
+  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
+  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
+  uint64_t z = threadIdx.x;
  if ( (x < numx) && (y<numy) && (z<numz) ) {
    Lambda(x,y,z);
  }
@@ -402,10 +468,16 @@ inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
-inline void acceleratorCopySynchronise(void) {  }
+//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
+//inline void acceleratorCopySynchronise(void) {  }
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);}

+inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
+{
+  hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToDevice,copyStream);
+}
+inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); };
+
 #endif

 //////////////////////////////////////////////
@@ -476,18 +548,12 @@ inline void acceleratorFreeCpu  (void *ptr){free(ptr);};
 ///////////////////////////////////////////////////
 // Synchronise across local threads for divergence resynch
 ///////////////////////////////////////////////////
-accelerator_inline void acceleratorSynchronise(void) 
+accelerator_inline void acceleratorSynchronise(void)  // Only Nvidia needs 
 {
 #ifdef GRID_SIMT
 #ifdef GRID_CUDA
  __syncwarp();
 #endif
-#ifdef GRID_SYCL
-  //cl::sycl::detail::workGroupBarrier();
-#endif
-#ifdef GRID_HIP
-  __syncthreads();
-#endif
 #endif
  return;
 }
--- a/Grid/util/Coordinate.h
+++ b/Grid/util/Coordinate.h
@@ -88,7 +88,7 @@ public:
 // Coordinate class, maxdims = 8 for now.
 ////////////////////////////////////////////////////////////////
 #define GRID_MAX_LATTICE_DIMENSION (8)
-#define GRID_MAX_SIMD              (16)
+#define GRID_MAX_SIMD              (32)

 static constexpr int MaxDims = GRID_MAX_LATTICE_DIMENSION;

--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -167,6 +167,13 @@ void GridCmdOptionInt(std::string &str,int & val)
  return;
 }

+void GridCmdOptionFloat(std::string &str,float & val)
+{
+  std::stringstream ss(str);
+  ss>>val;
+  return;
+}
+

 void GridParseLayout(char **argv,int argc,
 		     Coordinate &latt_c,
@@ -527,6 +534,7 @@ void Grid_init(int *argc,char ***argv)
 void Grid_finalize(void)
 {
 #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
+  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();
  Grid_unquiesce_nodes();
 #endif
--- a/Grid/util/Init.h
+++ b/Grid/util/Init.h
@@ -57,6 +57,7 @@ void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
 template<class VectorInt>
 void GridCmdOptionIntVector(const std::string &str,VectorInt & vec);
 void GridCmdOptionInt(std::string &str,int & val);
+void GridCmdOptionFloat(std::string &str,float & val); // extract a float from str into val (defined in Init.cc)


 void GridParseLayout(char **argv,int argc,
--- a/HMC/DWF2p1fIwasakiGparity.cc
+++ b/HMC/DWF2p1fIwasakiGparity.cc
@@ -0,0 +1,473 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./HMC/DWF2p1fIwasakiGparity.cc
+
+Copyright (C) 2015-2016
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+
+//2+1f DWF+I ensemble with G-parity BCs
+//designed to reproduce ensembles in https://arxiv.org/pdf/1908.08640.pdf
+struct RatQuoParameters: Serializable { //Serializable settings for one rational-quotient pseudofermion action (light or strange)
+  GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters,
+				  double, bnd_lo,
+				  double, bnd_hi,
+				  Integer, action_degree,
+				  double, action_tolerance,
+				  Integer, md_degree,
+				  double, md_tolerance,
+				  Integer, reliable_update_freq,
+				  Integer, bnd_check_freq);
+  RatQuoParameters() { //defaults
+    bnd_lo = 1e-2;
+    bnd_hi = 30;
+    action_degree = 10;
+    action_tolerance = 1e-10;
+    md_degree = 10;
+    md_tolerance = 1e-8;
+    bnd_check_freq = 20;
+    reliable_update_freq = 50;
+  }
+  //Copy the settings into 'into'. Only the seven fields below are written; reliable_update_freq is not copied.
+  void Export(RationalActionParams &into) const{
+    into.lo = bnd_lo;
+    into.hi = bnd_hi;
+    into.action_degree = action_degree;
+    into.action_tolerance = action_tolerance;
+    into.md_degree = md_degree;
+    into.md_tolerance = md_tolerance;
+    into.BoundsCheckFreq = bnd_check_freq;
+  }
+};
+
+
+struct EvolParameters: Serializable { //Top-level run parameters; main reads/writes them under the "Params" xml tag
+  GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters,
+                                  Integer, StartTrajectory,
+                                  Integer, Trajectories,
+				  Integer, SaveInterval,
+				  Integer, Steps,
+                                  bool, MetropolisTest,
+				  std::string, StartingType,
+				  std::vector<Integer>, GparityDirs,
+				  RatQuoParameters, rat_quo_l,
+				  RatQuoParameters, rat_quo_s);
+  //Defaults; main writes these out as the xml template when the parameter file is missing
+  EvolParameters() {
+    //For initial thermalization; afterwards user should switch Metropolis on and use StartingType=CheckpointStart
+    MetropolisTest    = false;
+    StartTrajectory   = 0;
+    Trajectories      = 50;
+    SaveInterval = 5; //copied to CheckpointerParameters::saveInterval in main
+    StartingType      = "ColdStart";
+    GparityDirs.resize(3, 1); //1 for G-parity, 0 for periodic; main requires exactly Nd-1 entries
+    Steps = 5; //copied to IntegratorParameters::MDsteps in main
+  }
+};
+
+bool fileExists(const std::string &fn){ //true when fn could be opened for reading
+  std::ifstream f(fn);
+  return f.good();
+}
+
+
+
+
+struct LanczosParameters: Serializable { //Settings consumed by computeEigenvalues (xml tag "LanczosParameters")
+  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
+				  double, alpha,
+				  double, beta,
+				  double, mu,
+				  int, ord,
+				  int, n_stop,
+				  int, n_want,
+				  int, n_use,
+				  double, tolerance);
+
+  LanczosParameters() {
+    alpha = 35; //computeEigenvalues passes alpha^2 as the 2nd Chebyshev constructor argument
+    beta = 5; //computeEigenvalues passes beta^2 as the 1st Chebyshev constructor argument
+    mu = 0; //computeEigenvalues asserts mu == 0
+    ord = 100; //the Chebyshev is constructed with ord+1
+    n_stop = 10; //3rd ImplicitlyRestartedLanczos constructor argument
+    n_want = 10; //4th ImplicitlyRestartedLanczos constructor argument; upper limit on the number of eigenvalues printed
+    n_use = 15; //5th ImplicitlyRestartedLanczos constructor argument; also the initial size of the eval/evec arrays
+    tolerance = 1e-6; //6th ImplicitlyRestartedLanczos constructor argument
+  }
+};
+
+//Run a Chebyshev-filtered implicitly restarted Lanczos on the Schur operator built from 'action' (after importing 'latt')
+//using an odd-checkerboard Gaussian source drawn from 'rng', and print the resulting eigenvalues. Settings: see LanczosParameters.
+template<typename FermionActionD, typename FermionFieldD>
+void computeEigenvalues(std::string param_file,
+			GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
+			FermionActionD &action, GridParallelRNG &rng){
+  
+  LanczosParameters params;
+  if(fileExists(param_file)){
+    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
+    Grid::XmlReader rd(param_file);
+    read(rd, "LanczosParameters", params);
+  }else if(!GlobalSharedMemory::WorldRank){ //no early exit here: every rank carries on with the default parameters
+    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
+    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
+    Grid::XmlWriter wr(param_file + ".templ");
+    write(wr, "LanczosParameters", params);
+  }
+
+  FermionFieldD gauss_o(rbGrid);
+  FermionFieldD gauss(Grid);
+  gaussian(rng, gauss);
+  pickCheckerboard(Odd, gauss_o, gauss);
+
+  action.ImportGauge(latt);
+
+  SchurDiagMooeeOperator<FermionActionD, FermionFieldD> hermop(action);
+  PlainHermOp<FermionFieldD> hermop_wrap(hermop);
+  //ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
+  assert(params.mu == 0.0);
+
+  Chebyshev<FermionFieldD> Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1);
+  FunctionHermOp<FermionFieldD> Cheb_wrap(Cheb, hermop);
+
+  std::cout << "IRL: alpha=" << params.alpha << " beta=" << params.beta << " mu=" << params.mu << " ord=" << params.ord << std::endl;
+  ImplicitlyRestartedLanczos<FermionFieldD> IRL(Cheb_wrap, hermop_wrap, params.n_stop, params.n_want, params.n_use, params.tolerance, 10000);
+
+  std::vector<RealD> eval(params.n_use);
+  std::vector<FermionFieldD> evec(params.n_use, rbGrid);
+  int Nconv;
+  IRL.calc(eval, evec, gauss_o, Nconv);
+
+  std::cout << "Eigenvalues:" << std::endl;
+  for(int i=0;i<params.n_want && i<static_cast<int>(eval.size());i++){ //eval is passed by reference and calc may shrink it to the converged set; never index past its end
+    std::cout << i << " " << eval[i] << std::endl;
+  }
+}
+
+//Check the quality of the RHMC approx: imports 'latt' into numOp and denOp, then runs InversePowerBoundsCheck on both operators
+//for the action and MD approximations held by 'rhmc', at powers -1/inv_pow and -1/(2*inv_pow), using an odd-checkerboard Gaussian source.
+template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
+void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
+	       FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
+	       int inv_pow, const std::string &quark_descr){
+
+  FermionFieldD gauss_o(rbGrid);
+  FermionFieldD gauss(Grid);
+  gaussian(rng, gauss);
+  pickCheckerboard(Odd, gauss_o, gauss);
+
+  numOp.ImportGauge(latt);
+  denOp.ImportGauge(latt);
+
+  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
+  SchurDifferentiableOperator<FermionImplPolicyD> MdagM(numOp);
+  SchurDifferentiableOperator<FermionImplPolicyD> VdagV(denOp);
+      
+  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+  InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegPowerAction); //use large tolerance to prevent exit on fail; we are trying to tune here!
+  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
+  InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegHalfPowerAction);
+  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+  InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegPowerAction);
+  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
+  InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegHalfPowerAction);
+  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
+
+  std::cout << "-------------------------------------------------------------------------------" << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+  InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegPowerMD); 
+  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
+  InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegHalfPowerMD);
+  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+  InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegPowerMD);
+  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
+  InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegHalfPowerMD);
+  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
+}
+
+
+//Entry point: 2+1f DWF + Iwasaki gauge evolution with G-parity BCs using mixed-precision RHMC quotient actions.
+//Command line (in addition to Grid's own options):
+//  --param_file <xml>   parameter file (default params.xml); if it is missing a template <xml>.templ is written and the run stops
+//  --read_check         initialise the gauge field and RNGs (initializeGaugeFieldAndRNGs), then exit
+//  --tune_rhmc_l / --tune_rhmc_s                  run checkRHMC for the light / strange action, then exit
+//  --eigenrange_l <xml> / --eigenrange_s <xml>    run computeEigenvalues for the light / strange numerator, then exit
+int main(int argc, char **argv) {
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  // here make a routine to print all the relevant information on the run
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+  std::string param_file = "params.xml";
+  bool file_load_check = false;
+  for(int i=1;i<argc;i++){
+    std::string sarg(argv[i]);
+    if(sarg == "--param_file"){
+      assert(i!=argc-1);
+      param_file = argv[i+1];
+    }else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
+      file_load_check = true;
+    }
+  }
+
+  //Read the user parameters
+  EvolParameters user_params;
+  
+  if(fileExists(param_file)){
+    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
+    Grid::XmlReader rd(param_file);
+    read(rd, "Params", user_params);
+  }else{ //every rank must stop here: if only rank 0 finalized and returned, the remaining ranks would carry on and hang
+    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
+    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
+    if(!GlobalSharedMemory::WorldRank){ //only rank 0 writes the template; the writer goes out of scope before Grid_finalize
+      Grid::XmlWriter wr(param_file + ".templ");
+      write(wr, "Params", user_params);
+    }
+    std::cout << GridLogMessage << " Done" << std::endl;
+    Grid_finalize();
+    return 0;
+  }
+
+  //Check the parameters
+  if(user_params.GparityDirs.size() != Nd-1){
+    std::cerr << "Error in input parameters: expect GparityDirs to have size = " << Nd-1 << std::endl;
+    exit(1);
+  }
+  for(int i=0;i<Nd-1;i++)
+    if(user_params.GparityDirs[i] != 0 && user_params.GparityDirs[i] != 1){
+      std::cerr << "Error in input parameters: expect GparityDirs values to be 0 (periodic) or 1 (G-parity)" << std::endl;
+      exit(1);
+    }
+
+   // Typedefs to simplify notation
+  typedef GparityDomainWallFermionD FermionActionD;
+  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
+  typedef typename FermionActionD::FermionField FermionFieldD;
+
+  typedef GparityDomainWallFermionF FermionActionF;
+  typedef typename FermionActionF::Impl_t FermionImplPolicyF;
+  typedef typename FermionActionF::FermionField FermionFieldF;
+
+  typedef GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD,FermionImplPolicyF> MixedPrecRHMC;
+  typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicyD> DoublePrecRHMC;
+
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  IntegratorParameters MD;
+  typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
+  typedef HMCWrapper::ImplPolicy GaugeImplPolicy;
+  MD.name    = std::string("MinimumNorm2");
+  MD.MDsteps = user_params.Steps;
+  MD.trajL   = 1.0;
+
+  HMCparameters HMCparams;
+  HMCparams.StartTrajectory  = user_params.StartTrajectory;
+  HMCparams.Trajectories     = user_params.Trajectories;
+  HMCparams.NoMetropolisUntil= 0;
+  HMCparams.StartingType     = user_params.StartingType;
+  HMCparams.MetropolisTest = user_params.MetropolisTest;
+  HMCparams.MD = MD;
+  HMCWrapper TheHMC(HMCparams);
+
+  // Grid from the command line arguments --grid and --mpi
+  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
+
+  CheckpointerParameters CPparams;
+  CPparams.config_prefix = "ckpoint_lat";
+  CPparams.rng_prefix    = "ckpoint_rng";
+  CPparams.saveInterval  = user_params.SaveInterval;
+  CPparams.format        = "IEEE64BIG";
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  //Note that checkpointing saves the RNG state so that this initialization is required only for the very first configuration
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  typedef PlaquetteMod<GaugeImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+  //////////////////////////////////////////////
+
+  const int Ls      = 16;
+  Real beta         = 2.13;
+  Real light_mass   = 0.01;
+  Real strange_mass = 0.032;
+  Real pv_mass      = 1.0;
+  RealD M5  = 1.8;
+
+  //Setup the Grids
+  auto GridPtrD   = TheHMC.Resources.GetCartesian();
+  auto GridRBPtrD = TheHMC.Resources.GetRBCartesian();
+  auto FGridD     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrD);
+  auto FrbGridD   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrD);
+
+  GridCartesian* GridPtrF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
+  GridRedBlackCartesian* GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF);
+  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF);
+  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF);
+
+  ConjugateIwasakiGaugeActionD GaugeAction(beta);
+
+  // temporarily need a gauge field
+  LatticeGaugeFieldD Ud(GridPtrD);
+  LatticeGaugeFieldF Uf(GridPtrF);
+ 
+  //Setup the BCs
+  FermionActionD::ImplParams Params;
+  for(int i=0;i<Nd-1;i++) Params.twists[i] = user_params.GparityDirs[i]; //G-parity directions
+  Params.twists[Nd-1] = 1; //APBC in time direction
+
+  std::vector<int> dirs4(Nd);
+  for(int i=0;i<Nd-1;i++) dirs4[i] = user_params.GparityDirs[i];
+  dirs4[Nd-1] = 0; //periodic gauge BC in time
+
+  GaugeImplPolicy::setDirections(dirs4); //gauge BC
+
+  //Run optional gauge field checksum checker and exit
+  if(file_load_check){
+    TheHMC.initializeGaugeFieldAndRNGs(Ud);
+    std::cout << GridLogMessage << " Done" << std::endl;
+    Grid_finalize();
+    return 0;
+  }
+
+
+  ////////////////////////////////////
+  // Collect actions
+  ////////////////////////////////////
+  ActionLevel<HMCWrapper::Field> Level1(1); //light quark + strange quark
+  ActionLevel<HMCWrapper::Field> Level2(8); //gauge (8 increments per step)
+
+
+  /////////////////////////////////////////////////////////////
+  // Light action
+  /////////////////////////////////////////////////////////////
+
+  FermionActionD Numerator_lD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, light_mass,M5,Params);
+  FermionActionD Denominator_lD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, pv_mass,M5,Params);
+
+  FermionActionF Numerator_lF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, light_mass,M5,Params);
+  FermionActionF Denominator_lF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, pv_mass,M5,Params);
+
+  RationalActionParams rat_act_params_l;
+  rat_act_params_l.inv_pow  = 2; // (M^dag M)^{1/2}
+  rat_act_params_l.precision= 60;
+  rat_act_params_l.MaxIter  = 10000;
+  user_params.rat_quo_l.Export(rat_act_params_l);
+  std::cout << GridLogMessage << " Light quark bounds check every " << rat_act_params_l.BoundsCheckFreq << " trajectories (avg)" << std::endl;
+ 
+  MixedPrecRHMC Quotient_l(Denominator_lD, Numerator_lD, Denominator_lF, Numerator_lF, rat_act_params_l, user_params.rat_quo_l.reliable_update_freq);
+  //DoublePrecRHMC Quotient_l(Denominator_lD, Numerator_lD, rat_act_params_l);
+  Level1.push_back(&Quotient_l);
+
+
+  ////////////////////////////////////
+  // Strange action
+  ////////////////////////////////////
+  FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD,strange_mass,M5,Params);
+  FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, pv_mass,M5,Params);
+
+  FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,strange_mass,M5,Params);
+  FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, pv_mass,M5,Params);
+
+  RationalActionParams rat_act_params_s;
+  rat_act_params_s.inv_pow  = 4; // (M^dag M)^{1/4}
+  rat_act_params_s.precision= 60;
+  rat_act_params_s.MaxIter  = 10000;
+  user_params.rat_quo_s.Export(rat_act_params_s);
+  std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_s.BoundsCheckFreq << " trajectories (avg)" << std::endl; //report the strange-quark setting, not the light one
+
+  MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq); 
+  //DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s); 
+  Level1.push_back(&Quotient_s);  
+
+
+  /////////////////////////////////////////////////////////////
+  // Gauge action
+  /////////////////////////////////////////////////////////////
+  Level2.push_back(&GaugeAction);
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+  std::cout << GridLogMessage << " Action complete "<< std::endl;
+
+
+  //Action tuning
+  bool tune_rhmc_l=false, tune_rhmc_s=false, eigenrange_l=false, eigenrange_s=false; 
+  std::string lanc_params_l, lanc_params_s;
+  for(int i=1;i<argc;i++){
+    std::string sarg(argv[i]);
+    if(sarg == "--tune_rhmc_l") tune_rhmc_l=true;
+    else if(sarg == "--tune_rhmc_s") tune_rhmc_s=true;
+    else if(sarg == "--eigenrange_l"){
+      assert(i < argc-1);
+      eigenrange_l=true;
+      lanc_params_l = argv[i+1];
+    }
+    else if(sarg == "--eigenrange_s"){
+      assert(i < argc-1);
+      eigenrange_s=true;
+      lanc_params_s = argv[i+1];
+    }
+  }
+  if(tune_rhmc_l || tune_rhmc_s || eigenrange_l || eigenrange_s){
+    TheHMC.initializeGaugeFieldAndRNGs(Ud);
+    if(eigenrange_l) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_l, FGridD, FrbGridD, Ud, Numerator_lD, TheHMC.Resources.GetParallelRNG());
+    if(eigenrange_s) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG());
+    if(tune_rhmc_l) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_l)>(FGridD, FrbGridD, Ud, Numerator_lD, Denominator_lD, Quotient_l, TheHMC.Resources.GetParallelRNG(), 2, "light");
+    if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_s)>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange");
+
+    std::cout << GridLogMessage << " Done" << std::endl;
+    Grid_finalize();
+    return 0;
+  }
+
+
+  //Run the HMC
+  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
+  TheHMC.Run();
+
+  std::cout << GridLogMessage << " Done" << std::endl;
+  Grid_finalize();
+  return 0;
+} // main
+
--- a/HMC/DWF2p1fIwasakiGparityRHMCdouble.cc
+++ b/HMC/DWF2p1fIwasakiGparityRHMCdouble.cc
@@ -0,0 +1,473 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./HMC/DWF2p1fIwasakiGparityRHMCdouble.cc
+
+Copyright (C) 2015-2016
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+
+//2+1f DWF+I ensemble with G-parity BCs
+//designed to reproduce ensembles in https://arxiv.org/pdf/1908.08640.pdf
+struct RatQuoParameters: Serializable { //Serializable settings for one rational-quotient pseudofermion action (light or strange)
+  GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters,
+				  double, bnd_lo,
+				  double, bnd_hi,
+				  Integer, action_degree,
+				  double, action_tolerance,
+				  Integer, md_degree,
+				  double, md_tolerance,
+				  Integer, reliable_update_freq,
+				  Integer, bnd_check_freq);
+  RatQuoParameters() { //defaults
+    bnd_lo = 1e-2;
+    bnd_hi = 30;
+    action_degree = 10;
+    action_tolerance = 1e-10;
+    md_degree = 10;
+    md_tolerance = 1e-8;
+    bnd_check_freq = 20;
+    reliable_update_freq = 50;
+  }
+  //Copy the settings into 'into'. Only the seven fields below are written; reliable_update_freq is not copied.
+  void Export(RationalActionParams &into) const{
+    into.lo = bnd_lo;
+    into.hi = bnd_hi;
+    into.action_degree = action_degree;
+    into.action_tolerance = action_tolerance;
+    into.md_degree = md_degree;
+    into.md_tolerance = md_tolerance;
+    into.BoundsCheckFreq = bnd_check_freq;
+  }
+};
+
+
+struct EvolParameters: Serializable { //Top-level run parameters; main reads/writes them under the "Params" xml tag
+  GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters,
+                                  Integer, StartTrajectory,
+                                  Integer, Trajectories,
+				  Integer, SaveInterval,
+				  Integer, Steps,
+                                  bool, MetropolisTest,
+				  std::string, StartingType,
+				  std::vector<Integer>, GparityDirs,
+				  RatQuoParameters, rat_quo_l,
+				  RatQuoParameters, rat_quo_s);
+  //Defaults; main writes these out as the xml template when the parameter file is missing
+  EvolParameters() {
+    //For initial thermalization; afterwards user should switch Metropolis on and use StartingType=CheckpointStart
+    MetropolisTest    = false;
+    StartTrajectory   = 0;
+    Trajectories      = 50;
+    SaveInterval = 5; //copied to CheckpointerParameters::saveInterval in main
+    StartingType      = "ColdStart";
+    GparityDirs.resize(3, 1); //1 for G-parity, 0 for periodic; main requires exactly Nd-1 entries
+    Steps = 5; //copied to IntegratorParameters::MDsteps in main
+  }
+};
+
+bool fileExists(const std::string &fn){ //true when fn could be opened for reading
+  std::ifstream f(fn);
+  return f.good();
+}
+
+
+
+
+struct LanczosParameters: Serializable { //Settings consumed by computeEigenvalues (xml tag "LanczosParameters")
+  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
+				  double, alpha,
+				  double, beta,
+				  double, mu,
+				  int, ord,
+				  int, n_stop,
+				  int, n_want,
+				  int, n_use,
+				  double, tolerance);
+
+  LanczosParameters() {
+    alpha = 35; //computeEigenvalues passes alpha^2 as the 2nd Chebyshev constructor argument
+    beta = 5; //computeEigenvalues passes beta^2 as the 1st Chebyshev constructor argument
+    mu = 0; //computeEigenvalues asserts mu == 0
+    ord = 100; //the Chebyshev is constructed with ord+1
+    n_stop = 10; //3rd ImplicitlyRestartedLanczos constructor argument
+    n_want = 10; //4th ImplicitlyRestartedLanczos constructor argument; upper limit on the number of eigenvalues printed
+    n_use = 15; //5th ImplicitlyRestartedLanczos constructor argument; also the initial size of the eval/evec arrays
+    tolerance = 1e-6; //6th ImplicitlyRestartedLanczos constructor argument
+  }
+};
+
+//Run a Chebyshev-filtered implicitly restarted Lanczos on the Schur operator built from 'action' (after importing 'latt')
+//using an odd-checkerboard Gaussian source drawn from 'rng', and print the resulting eigenvalues. Settings: see LanczosParameters.
+template<typename FermionActionD, typename FermionFieldD>
+void computeEigenvalues(std::string param_file,
+			GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
+			FermionActionD &action, GridParallelRNG &rng){
+  
+  LanczosParameters params;
+  if(fileExists(param_file)){
+    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
+    Grid::XmlReader rd(param_file);
+    read(rd, "LanczosParameters", params);
+  }else if(!GlobalSharedMemory::WorldRank){ //no early exit here: every rank carries on with the default parameters
+    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
+    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
+    Grid::XmlWriter wr(param_file + ".templ");
+    write(wr, "LanczosParameters", params);
+  }
+
+  FermionFieldD gauss_o(rbGrid);
+  FermionFieldD gauss(Grid);
+  gaussian(rng, gauss);
+  pickCheckerboard(Odd, gauss_o, gauss);
+
+  action.ImportGauge(latt);
+
+  SchurDiagMooeeOperator<FermionActionD, FermionFieldD> hermop(action);
+  PlainHermOp<FermionFieldD> hermop_wrap(hermop);
+  //ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
+  assert(params.mu == 0.0);
+
+  Chebyshev<FermionFieldD> Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1);
+  FunctionHermOp<FermionFieldD> Cheb_wrap(Cheb, hermop);
+
+  std::cout << "IRL: alpha=" << params.alpha << " beta=" << params.beta << " mu=" << params.mu << " ord=" << params.ord << std::endl;
+  ImplicitlyRestartedLanczos<FermionFieldD> IRL(Cheb_wrap, hermop_wrap, params.n_stop, params.n_want, params.n_use, params.tolerance, 10000);
+
+  std::vector<RealD> eval(params.n_use);
+  std::vector<FermionFieldD> evec(params.n_use, rbGrid);
+  int Nconv;
+  IRL.calc(eval, evec, gauss_o, Nconv);
+
+  std::cout << "Eigenvalues:" << std::endl;
+  for(int i=0;i<params.n_want && i<static_cast<int>(eval.size());i++){ //eval is passed by reference and calc may shrink it to the converged set; never index past its end
+    std::cout << i << " " << eval[i] << std::endl;
+  }
+}
+
+//Check the quality of the RHMC approx: imports 'latt' into numOp and denOp, then runs InversePowerBoundsCheck on both operators
+//for the action and MD approximations held by 'rhmc', at powers -1/inv_pow and -1/(2*inv_pow), using an odd-checkerboard Gaussian source.
+template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
+void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
+	       FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
+	       int inv_pow, const std::string &quark_descr){
+
+  FermionFieldD gauss_o(rbGrid);
+  FermionFieldD gauss(Grid);
+  gaussian(rng, gauss);
+  pickCheckerboard(Odd, gauss_o, gauss);
+
+  numOp.ImportGauge(latt);
+  denOp.ImportGauge(latt);
+
+  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
+  SchurDifferentiableOperator<FermionImplPolicyD> MdagM(numOp);
+  SchurDifferentiableOperator<FermionImplPolicyD> VdagV(denOp);
+      
+  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+  InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegPowerAction); //use large tolerance to prevent exit on fail; we are trying to tune here!
+  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
+  InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegHalfPowerAction);
+  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+  InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegPowerAction);
+  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
+  InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegHalfPowerAction);
+  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
+
+  std::cout << "-------------------------------------------------------------------------------" << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+  InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegPowerMD); 
+  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
+  InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegHalfPowerMD);
+  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+  InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegPowerMD);
+  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
+  InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegHalfPowerMD);
+  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
+}
+
+
+
+
+
+
+
+
+
+// Entry point for the G-parity domain-wall RHMC evolution.
+//
+// Command-line options handled here (Grid's own options are consumed by Grid_init):
+//   --param_file <file>     XML file holding the EvolParameters ("Params" tag); default "params.xml".
+//                           If the file does not exist rank 0 writes a template to <file>.templ and exits.
+//   --read_check            initialize the gauge field and RNGs through TheHMC, then exit
+//   --tune_rhmc_l / --tune_rhmc_s
+//                           run checkRHMC on the light / strange quotient action, then exit
+//   --eigenrange_l <file> / --eigenrange_s <file>
+//                           run computeEigenvalues on the light / strange numerator operator using the
+//                           given Lanczos parameter file, then exit
+// With none of the tuning/check options the HMC itself is run.
+int main(int argc, char **argv) {
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  // here make a routine to print all the relevant information on the run
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+  std::string param_file = "params.xml";
+  bool file_load_check = false;
+  for(int i=1;i<argc;i++){
+    std::string sarg(argv[i]);
+    if(sarg == "--param_file"){
+      assert(i!=argc-1); //the option requires a value
+      param_file = argv[i+1];
+    }else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
+      file_load_check = true;
+    }
+  }
+
+  //Read the user parameters
+  EvolParameters user_params;
+
+  if(fileExists(param_file)){
+    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
+    Grid::XmlReader rd(param_file);
+    read(rd, "Params", user_params);
+  }else if(!GlobalSharedMemory::WorldRank){
+    //NOTE(review): only rank 0 takes this exit; any other rank falls through and continues with the
+    //default parameters. Confirm this is the intended behaviour for multi-rank jobs.
+    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
+    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
+    Grid::XmlWriter wr(param_file + ".templ");
+    write(wr, "Params", user_params);
+
+    std::cout << GridLogMessage << " Done" << std::endl;
+    Grid_finalize();
+    return 0;
+  }
+
+  //Check the parameters
+  if(user_params.GparityDirs.size() != Nd-1){
+    std::cerr << "Error in input parameters: expect GparityDirs to have size = " << Nd-1 << std::endl;
+    exit(1);
+  }
+  for(int i=0;i<Nd-1;i++)
+    if(user_params.GparityDirs[i] != 0 && user_params.GparityDirs[i] != 1){
+      std::cerr << "Error in input parameters: expect GparityDirs values to be 0 (periodic) or 1 (G-parity)" << std::endl;
+      exit(1);
+    }
+
+   // Typedefs to simplify notation
+  typedef GparityDomainWallFermionD FermionActionD;
+  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
+  typedef typename FermionActionD::FermionField FermionFieldD;
+
+  typedef GparityDomainWallFermionF FermionActionF;
+  typedef typename FermionActionF::Impl_t FermionImplPolicyF;
+  typedef typename FermionActionF::FermionField FermionFieldF;
+
+  typedef GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD,FermionImplPolicyF> MixedPrecRHMC;
+  typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicyD> DoublePrecRHMC;
+
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  IntegratorParameters MD;
+  typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
+  typedef HMCWrapper::ImplPolicy GaugeImplPolicy;
+  MD.name    = std::string("MinimumNorm2");
+  MD.MDsteps = user_params.Steps;
+  MD.trajL   = 1.0;
+
+  HMCparameters HMCparams;
+  HMCparams.StartTrajectory  = user_params.StartTrajectory;
+  HMCparams.Trajectories     = user_params.Trajectories;
+  HMCparams.NoMetropolisUntil= 0;
+  HMCparams.StartingType     = user_params.StartingType;
+  HMCparams.MetropolisTest = user_params.MetropolisTest;
+  HMCparams.MD = MD;
+  HMCWrapper TheHMC(HMCparams);
+
+  // Grid from the command line arguments --grid and --mpi
+  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
+
+  CheckpointerParameters CPparams;
+  CPparams.config_prefix = "ckpoint_lat";
+  CPparams.rng_prefix    = "ckpoint_rng";
+  CPparams.saveInterval  = user_params.SaveInterval;
+  CPparams.format        = "IEEE64BIG";
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  //Note that checkpointing saves the RNG state so that this initialization is required only for the very first configuration
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  typedef PlaquetteMod<GaugeImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+  //////////////////////////////////////////////
+
+  //Hard-coded ensemble parameters
+  const int Ls      = 16;
+  Real beta         = 2.13;
+  Real light_mass   = 0.01;
+  Real strange_mass = 0.032;
+  Real pv_mass      = 1.0;
+  RealD M5  = 1.8;
+
+  //Setup the Grids
+  auto GridPtrD   = TheHMC.Resources.GetCartesian();
+  auto GridRBPtrD = TheHMC.Resources.GetRBCartesian();
+  auto FGridD     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrD);
+  auto FrbGridD   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrD);
+
+  //Single-precision grids are built by hand with the vComplexF SIMD layout
+  GridCartesian* GridPtrF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
+  GridRedBlackCartesian* GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF);
+  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF);
+  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF);
+
+  ConjugateIwasakiGaugeActionD GaugeAction(beta);
+
+  // temporarily need a gauge field
+  LatticeGaugeFieldD Ud(GridPtrD);
+  LatticeGaugeFieldF Uf(GridPtrF);
+
+  //Setup the BCs
+  FermionActionD::ImplParams Params;
+  for(int i=0;i<Nd-1;i++) Params.twists[i] = user_params.GparityDirs[i]; //G-parity directions
+  Params.twists[Nd-1] = 1; //APBC in time direction
+
+  std::vector<int> dirs4(Nd);
+  for(int i=0;i<Nd-1;i++) dirs4[i] = user_params.GparityDirs[i];
+  dirs4[Nd-1] = 0; //periodic gauge BC in time
+
+  GaugeImplPolicy::setDirections(dirs4); //gauge BC
+
+  //Run optional gauge field checksum checker and exit
+  if(file_load_check){
+    TheHMC.initializeGaugeFieldAndRNGs(Ud);
+    std::cout << GridLogMessage << " Done" << std::endl;
+    Grid_finalize();
+    return 0;
+  }
+
+
+  ////////////////////////////////////
+  // Collect actions
+  ////////////////////////////////////
+  ActionLevel<HMCWrapper::Field> Level1(1); //light quark + strange quark
+  ActionLevel<HMCWrapper::Field> Level2(8); //gauge (8 increments per step)
+
+
+  /////////////////////////////////////////////////////////////
+  // Light action
+  /////////////////////////////////////////////////////////////
+
+  FermionActionD Numerator_lD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, light_mass,M5,Params);
+  FermionActionD Denominator_lD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, pv_mass,M5,Params);
+
+  //The single-precision operators are only referenced by the commented-out MixedPrecRHMC line below
+  FermionActionF Numerator_lF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, light_mass,M5,Params);
+  FermionActionF Denominator_lF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, pv_mass,M5,Params);
+
+  RationalActionParams rat_act_params_l;
+  rat_act_params_l.inv_pow  = 2; // (M^dag M)^{1/2}
+  rat_act_params_l.precision= 60;
+  rat_act_params_l.MaxIter  = 10000;
+  user_params.rat_quo_l.Export(rat_act_params_l);
+  std::cout << GridLogMessage << " Light quark bounds check every " << rat_act_params_l.BoundsCheckFreq << " trajectories (avg)" << std::endl;
+
+  //MixedPrecRHMC Quotient_l(Denominator_lD, Numerator_lD, Denominator_lF, Numerator_lF, rat_act_params_l, user_params.rat_quo_l.reliable_update_freq);
+  DoublePrecRHMC Quotient_l(Denominator_lD, Numerator_lD, rat_act_params_l);
+  Level1.push_back(&Quotient_l);
+
+
+  ////////////////////////////////////
+  // Strange action
+  ////////////////////////////////////
+  FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD,strange_mass,M5,Params);
+  FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, pv_mass,M5,Params);
+
+  FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,strange_mass,M5,Params);
+  FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, pv_mass,M5,Params);
+
+  RationalActionParams rat_act_params_s;
+  rat_act_params_s.inv_pow  = 4; // (M^dag M)^{1/4}
+  rat_act_params_s.precision= 60;
+  rat_act_params_s.MaxIter  = 10000;
+  user_params.rat_quo_s.Export(rat_act_params_s);
+  //Report the strange-quark setting (this previously printed the light-quark value by mistake)
+  std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_s.BoundsCheckFreq << " trajectories (avg)" << std::endl;
+
+  //MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq);
+  DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s);
+  Level1.push_back(&Quotient_s);
+
+
+  /////////////////////////////////////////////////////////////
+  // Gauge action
+  /////////////////////////////////////////////////////////////
+  Level2.push_back(&GaugeAction);
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+  std::cout << GridLogMessage << " Action complete "<< std::endl;
+
+
+  //Action tuning
+  bool tune_rhmc_l=false, tune_rhmc_s=false, eigenrange_l=false, eigenrange_s=false;
+  std::string lanc_params_l, lanc_params_s;
+  for(int i=1;i<argc;i++){
+    std::string sarg(argv[i]);
+    if(sarg == "--tune_rhmc_l") tune_rhmc_l=true;
+    else if(sarg == "--tune_rhmc_s") tune_rhmc_s=true;
+    else if(sarg == "--eigenrange_l"){
+      assert(i < argc-1); //the option requires the Lanczos parameter file name
+      eigenrange_l=true;
+      lanc_params_l = argv[i+1];
+    }
+    else if(sarg == "--eigenrange_s"){
+      assert(i < argc-1); //the option requires the Lanczos parameter file name
+      eigenrange_s=true;
+      lanc_params_s = argv[i+1];
+    }
+  }
+  //Any tuning option replaces the HMC run: do the requested checks on the initial gauge field and exit
+  if(tune_rhmc_l || tune_rhmc_s || eigenrange_l || eigenrange_s){
+    TheHMC.initializeGaugeFieldAndRNGs(Ud);
+    if(eigenrange_l) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_l, FGridD, FrbGridD, Ud, Numerator_lD, TheHMC.Resources.GetParallelRNG());
+    if(eigenrange_s) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG());
+    if(tune_rhmc_l) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_l)>(FGridD, FrbGridD, Ud, Numerator_lD, Denominator_lD, Quotient_l, TheHMC.Resources.GetParallelRNG(), 2, "light");
+    if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_s)>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange");
+
+    std::cout << GridLogMessage << " Done" << std::endl;
+    Grid_finalize();
+    return 0;
+  }
+
+
+  //Run the HMC
+  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
+  TheHMC.Run();
+
+  std::cout << GridLogMessage << " Done" << std::endl;
+  Grid_finalize();
+  return 0;
+} // main
+
--- a/HMC/Mobius2p1fIDSDRGparityEOFA.cc
+++ b/HMC/Mobius2p1fIDSDRGparityEOFA.cc
@@ -0,0 +1,765 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./HMC/Mobius2p1fIDSDRGparityEOFA.cc
+
+Copyright (C) 2015-2016
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+
+//We try to reproduce with G-parity BCs the 246 MeV 1.37 GeV ensemble
+//To speed things up we will use Mobius DWF with b+c=32/12 and Ls=12 to match the Ls=32 of the original
+//These parameters match those used in the 2020 K->pipi paper
+
+//User-facing (XML serializable) settings of one rational-quotient pseudofermion action.
+//Export() copies the subset that RationalActionParams understands; reliable_update_freq is not exported by it.
+struct RatQuoParameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters,
+				  double, bnd_lo,
+				  double, bnd_hi,
+				  Integer, action_degree,
+				  double, action_tolerance,
+				  Integer, md_degree,
+				  double, md_tolerance,
+				  Integer, reliable_update_freq,
+				  Integer, bnd_check_freq);
+
+  //Default values; these are what ends up in a freshly written XML template
+  RatQuoParameters() {
+    //approximation bounds
+    bnd_lo = 1e-2;
+    bnd_hi = 30;
+
+    //approximation used for the action
+    action_degree    = 10;
+    action_tolerance = 1e-10;
+
+    //approximation used for the MD
+    md_degree    = 10;
+    md_tolerance = 1e-8;
+
+    //frequencies
+    reliable_update_freq = 50;
+    bnd_check_freq       = 20;
+  }
+
+  //Copy the bounds, degrees, tolerances and bounds-check frequency into 'into'; other fields of 'into' are left untouched
+  void Export(RationalActionParams &into) const{
+    into.lo              = bnd_lo;
+    into.hi              = bnd_hi;
+    into.BoundsCheckFreq = bnd_check_freq;
+
+    into.action_degree    = action_degree;
+    into.action_tolerance = action_tolerance;
+
+    into.md_degree    = md_degree;
+    into.md_tolerance = md_tolerance;
+  }
+};
+
+//User-facing (XML serializable) settings of an EOFA action: the embedded OneFlavourRationalParams
+//plus solver tolerances for the action and MD steps.
+struct EOFAparameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(EOFAparameters,
+				  OneFlavourRationalParams, rat_params,
+				  double, action_tolerance,
+				  double, action_mixcg_inner_tolerance,
+				  double, md_tolerance,
+				  double, md_mixcg_inner_tolerance);
+
+  //Default values; these are what ends up in a freshly written XML template
+  EOFAparameters() {
+    //rational approximation settings
+    rat_params.lo       = 0.1;
+    rat_params.hi       = 25.0;
+    rat_params.degree   = 14;
+    rat_params.precision= 50;
+    rat_params.tolerance= 1.0e-9;
+    rat_params.MaxIter  = 10000;
+
+    //solver tolerances, action
+    action_tolerance             = 1e-10;
+    action_mixcg_inner_tolerance = 1e-8;
+
+    //solver tolerances, MD
+    md_tolerance             = 1e-8;
+    md_mixcg_inner_tolerance = 1e-8;
+  }
+};
+
+//Top-level (XML serializable) run parameters: trajectory control, boundary conditions and the
+//per-action parameter blocks.
+struct EvolParameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters,
+                                  Integer, StartTrajectory,
+                                  Integer, Trajectories,
+				  Integer, SaveInterval,
+				  Integer, Steps,
+                                  bool, MetropolisTest,
+				  std::string, StartingType,
+				  std::vector<Integer>, GparityDirs,
+				  EOFAparameters, eofa_l,
+				  RatQuoParameters, rat_quo_s,
+				  RatQuoParameters, rat_quo_DSDR);
+
+  //Defaults are chosen for initial thermalization; afterwards user should switch Metropolis on and use StartingType=CheckpointStart
+  EvolParameters() {
+    StartingType    = "ColdStart";
+    MetropolisTest  = false;
+
+    StartTrajectory = 0;
+    Trajectories    = 50;
+    SaveInterval    = 5;
+    Steps           = 5;
+
+    GparityDirs.resize(3, 1); //1 for G-parity, 0 for periodic
+  }
+};
+
+//Returns true if a read stream opened on 'fn' is in a good state, i.e. the file could be opened for reading
+bool fileExists(const std::string &fn){
+  return std::ifstream(fn).good();
+}
+
+
+
+
+//XML serializable parameters consumed by computeEigenvalues: alpha/beta/ord configure the Chebyshev
+//filter, n_stop/n_want/n_use/tolerance are handed to the implicitly restarted Lanczos.
+struct LanczosParameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
+				  double, alpha,
+				  double, beta,
+				  double, mu,
+				  int, ord,
+				  int, n_stop,
+				  int, n_want,
+				  int, n_use,
+				  double, tolerance);
+
+  //Default values; these are what ends up in a freshly written XML template
+  LanczosParameters() {
+    //Chebyshev filter
+    alpha = 35;
+    beta  = 5;
+    mu    = 0;
+    ord   = 100;
+
+    //Lanczos
+    n_stop    = 10;
+    n_want    = 10;
+    n_use     = 15;
+    tolerance = 1e-6;
+  }
+};
+
+
+
+//Run a Chebyshev-filtered implicitly restarted Lanczos on the SchurDiagMooeeOperator built from 'action'
+//and print the resulting eigenvalues.
+// param_file  : XML file with a "LanczosParameters" block. If it does not exist the defaults are used and
+//               rank 0 writes a template to <param_file>.templ (the calculation still proceeds).
+// Grid, rbGrid: grids for the full and the checkerboarded Gaussian start vector
+// latt        : gauge field imported into 'action'; expected to have been initialized to something
+// action      : fermion operator whose Schur operator is analysed (its gauge field is overwritten by 'latt')
+// rng         : parallel RNG used to draw the start vector
+template<typename FermionActionD, typename FermionFieldD>
+void computeEigenvalues(std::string param_file,
+			GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
+			FermionActionD &action, GridParallelRNG &rng){
+
+  LanczosParameters params;
+  if(fileExists(param_file)){
+    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
+    Grid::XmlReader rd(param_file);
+    read(rd, "LanczosParameters", params);
+  }else if(!GlobalSharedMemory::WorldRank){
+    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
+    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
+    Grid::XmlWriter wr(param_file + ".templ");
+    write(wr, "LanczosParameters", params);
+  }
+
+  //Gaussian start vector restricted to the odd checkerboard
+  FermionFieldD gauss_o(rbGrid);
+  FermionFieldD gauss(Grid);
+  gaussian(rng, gauss);
+  pickCheckerboard(Odd, gauss_o, gauss);
+
+  action.ImportGauge(latt);
+
+  SchurDiagMooeeOperator<FermionActionD, FermionFieldD> hermop(action);
+  PlainHermOp<FermionFieldD> hermop_wrap(hermop);
+  //ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
+  assert(params.mu == 0.0); //the plain Chebyshev used below takes no mu
+
+  //alpha and beta are squared before being handed to the Chebyshev constructor
+  Chebyshev<FermionFieldD> Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1);
+  FunctionHermOp<FermionFieldD> Cheb_wrap(Cheb, hermop);
+
+  std::cout << "IRL: alpha=" << params.alpha << " beta=" << params.beta << " mu=" << params.mu << " ord=" << params.ord << std::endl;
+  ImplicitlyRestartedLanczos<FermionFieldD> IRL(Cheb_wrap, hermop_wrap, params.n_stop, params.n_want, params.n_use, params.tolerance, 10000);
+
+  std::vector<RealD> eval(params.n_use);
+  std::vector<FermionFieldD> evec(params.n_use, rbGrid);
+  int Nconv;
+  IRL.calc(eval, evec, gauss_o, Nconv);
+
+  //eval is passed to calc() by reference and may come back with a different size (presumably the number
+  //of converged values - TODO confirm), so never index past what is actually there
+  const int n_avail = (int)eval.size();
+  const int n_print = n_avail < params.n_want ? n_avail : params.n_want;
+
+  std::cout << "Eigenvalues:" << std::endl;
+  for(int i=0;i<n_print;i++){
+    std::cout << i << " " << eval[i] << std::endl;
+  }
+}
+
+
+//Diagnostic used when tuning the RHMC rational approximations of one quark species.
+//First prints the PowerMethod result (lambda_max) for the numerator and denominator Schur operators, then runs
+//InversePowerBoundsCheck with powers inv_pow and 2*inv_pow on both operators using the approximations stored in rhmc.
+//action_or_md selects which approximations are checked: action (0), MD (1) or both (2)
+//quark_descr is only used to label the output
+template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
+void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
+	       FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
+	       int inv_pow, const std::string &quark_descr, int action_or_md){
+  assert(action_or_md == 0 || action_or_md == 1 || action_or_md == 2);
+  const bool do_action = (action_or_md == 0 || action_or_md == 2);
+  const bool do_md     = (action_or_md == 1 || action_or_md == 2);
+  const int  half_pow  = 2*inv_pow;
+
+  //Gaussian test vector on the odd checkerboard
+  FermionFieldD noise_full(Grid);
+  FermionFieldD noise_odd(rbGrid);
+  gaussian(rng, noise_full);
+  pickCheckerboard(Odd, noise_odd, noise_full);
+
+  numOp.ImportGauge(latt);
+  denOp.ImportGauge(latt);
+
+  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
+  SchurDifferentiableOperator<FermionImplPolicyD> MdagM(numOp);
+  SchurDifferentiableOperator<FermionImplPolicyD> VdagV(denOp);
+
+  PowerMethod<FermionFieldD> power_method;
+
+  std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " numerator" << std::endl;
+
+  RealD lambda_max = power_method(MdagM,noise_odd);
+  std::cout << GridLogMessage << "Got lambda_max "<<lambda_max<<std::endl;
+
+  std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " denominator" << std::endl;
+  lambda_max = power_method(VdagV,noise_odd);
+  std::cout << GridLogMessage << "Got lambda_max "<<lambda_max<<std::endl;
+
+  if(do_action){
+    //use large tolerance to prevent exit on fail; we are trying to tune here!
+    std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+    InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM,noise_odd, rhmc.ApproxNegPowerAction);
+    std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+
+    std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << half_pow << std::endl;
+    InversePowerBoundsCheck(half_pow, 10000, 1e16, MdagM,noise_odd, rhmc.ApproxNegHalfPowerAction);
+    std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << half_pow << std::endl;
+
+    std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+    InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV,noise_odd, rhmc.ApproxNegPowerAction);
+    std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+
+    std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << half_pow << std::endl;
+    InversePowerBoundsCheck(half_pow, 10000, 1e16, VdagV,noise_odd, rhmc.ApproxNegHalfPowerAction);
+    std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << half_pow << std::endl;
+  }
+
+  //Separator between the action and MD sections; printed regardless of which sections run
+  std::cout << "-------------------------------------------------------------------------------" << std::endl;
+
+  if(do_md){
+    std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+    InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM,noise_odd, rhmc.ApproxNegPowerMD);
+    std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+
+    std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << half_pow << std::endl;
+    InversePowerBoundsCheck(half_pow, 10000, 1e16, MdagM,noise_odd, rhmc.ApproxNegHalfPowerMD);
+    std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << half_pow << std::endl;
+
+    std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+    InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV,noise_odd, rhmc.ApproxNegPowerMD);
+    std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+
+    std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << half_pow << std::endl;
+    InversePowerBoundsCheck(half_pow, 10000, 1e16, VdagV,noise_odd, rhmc.ApproxNegHalfPowerMD);
+    std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << half_pow << std::endl;
+  }
+}
+
+
+//Exercise the checks built into the EOFA action: draw a Gaussian field scaled by sqrt(1/2), hand it to
+//EOFA.refresh() and then evaluate EOFA.S() on the same gauge field. The value returned by S() is discarded.
+template<typename FermionImplPolicy>
+void checkEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
+	       GridCartesian* FGrid, GridParallelRNG &rng, const LatticeGaugeFieldD &latt){
+  typedef typename FermionImplPolicy::FermionField Field;
+
+  std::cout << GridLogMessage << "Starting EOFA action/bounds check" << std::endl;
+
+  Field eta(FGrid);
+  gaussian(rng,eta);
+  RealD scale = std::sqrt(0.5);
+  eta = eta * scale;
+
+  //Use the inbuilt check
+  EOFA.refresh(latt, eta);
+  EOFA.S(latt);
+
+  std::cout << GridLogMessage << "Finished EOFA upper action/bounds check" << std::endl;
+}
+
+
+//Adapter presenting the EOFA operator as a LinearOperatorBase: only HermOp is implemented (it forwards to
+//EOFA.Meofa on the stored gauge field); every other entry point asserts.
+//Holds references to the action and gauge field, so both must outlive this object.
+template<typename FermionImplPolicy>
+class EOFAlinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
+  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA;
+  LatticeGaugeFieldD &U;
+public:
+  typedef typename FermionImplPolicy::FermionField Field;
+
+  EOFAlinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &_EOFA, LatticeGaugeFieldD &_U)
+    : EOFA(_EOFA), U(_U)
+  {}
+
+  //The one supported operation
+  void HermOp(const Field &in, Field &out){
+    EOFA.Meofa(U, in, out);
+  }
+
+  //Not supported
+  void Op           (const Field &in, Field &out){ assert(0); }
+  void AdjOp        (const Field &in, Field &out){ assert(0); }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
+  void OpDiag       (const Field &in, Field &out){ assert(0); }
+  void OpDir        (const Field &in, Field &out,int dir,int disp){ assert(0); }
+  void OpDirAll     (const Field &in, std::vector<Field> &out){ assert(0); }
+};
+
+//Apply PowerMethod to the EOFA operator (through EOFAlinop) starting from a Gaussian vector and print the
+//result as the upper bound of the operator.
+template<typename FermionImplPolicy>
+void upperBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
+		    GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
+  typedef typename FermionImplPolicy::FermionField Field;
+
+  std::cout << GridLogMessage << "Starting EOFA upper bound compute" << std::endl;
+
+  EOFAlinop<FermionImplPolicy> linop(EOFA, latt);
+
+  Field eta(FGrid);
+  gaussian(rng,eta);
+
+  PowerMethod<Field> power_method;
+  auto lambda_max = power_method(linop,eta);
+
+  std::cout << GridLogMessage << "Upper bound of EOFA operator " << lambda_max << std::endl;
+}
+
+//Adapter presenting the inverse EOFA operator as a LinearOperatorBase: only HermOp is implemented (it
+//forwards to EOFA.MeofaInv on the stored gauge field); every other entry point asserts.
+//Applications of M^{-1} cost the same as M for EOFA!
+//Holds references to the action and gauge field, so both must outlive this object.
+template<typename FermionImplPolicy>
+class EOFAinvLinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
+  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA;
+  LatticeGaugeFieldD &U;
+public:
+  typedef typename FermionImplPolicy::FermionField Field;
+
+  EOFAinvLinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &_EOFA, LatticeGaugeFieldD &_U)
+    : EOFA(_EOFA), U(_U)
+  {}
+
+  //The one supported operation
+  void HermOp(const Field &in, Field &out){
+    EOFA.MeofaInv(U, in, out);
+  }
+
+  //Not supported
+  void Op           (const Field &in, Field &out){ assert(0); }
+  void AdjOp        (const Field &in, Field &out){ assert(0); }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
+  void OpDiag       (const Field &in, Field &out){ assert(0); }
+  void OpDir        (const Field &in, Field &out,int dir,int disp){ assert(0); }
+  void OpDirAll     (const Field &in, std::vector<Field> &out){ assert(0); }
+};
+
+//Apply PowerMethod to the inverse EOFA operator (through EOFAinvLinop) starting from a Gaussian vector and
+//print the reciprocal of the result as the lower bound of the operator.
+template<typename FermionImplPolicy>
+void lowerBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
+		    GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
+  typedef typename FermionImplPolicy::FermionField Field;
+
+  std::cout << GridLogMessage << "Starting EOFA lower bound compute using power method on M^{-1}. Inverse of highest eigenvalue is the lowest eigenvalue of M" << std::endl;
+
+  EOFAinvLinop<FermionImplPolicy> linop(EOFA, latt);
+
+  Field eta(FGrid);
+  gaussian(rng,eta);
+
+  PowerMethod<Field> power_method;
+  auto inv_lambda_max = power_method(linop,eta); //result for M^{-1}
+
+  std::cout << GridLogMessage << "Lower bound of EOFA operator " << 1./inv_lambda_max << std::endl;
+}
+
+
+NAMESPACE_BEGIN(Grid);
+
+  //OperatorFunction that solves with a MixedPrecisionConjugateGradient built from a fixed pair of
+  //single/double precision Schur operators supplied at construction.
+  //The operator passed to operator() is only used for a consistency check against the stored double
+  //precision operator. All operators are held by reference and must outlive this object.
+  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
+  class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
+  public:
+    typedef typename FermionOperatorD::FermionField FieldD;
+    typedef typename FermionOperatorF::FermionField FieldF;
+
+    using OperatorFunction<FieldD>::operator();
+
+    RealD   Tolerance;
+    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
+    Integer MaxInnerIterations;
+    Integer MaxOuterIterations;
+    GridBase* SinglePrecGrid4; //Grid for single-precision fields
+    GridBase* SinglePrecGrid5; //Grid for single-precision fields
+    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
+
+    FermionOperatorF &FermOpF;
+    FermionOperatorD &FermOpD;
+    SchurOperatorF &LinOpF;
+    SchurOperatorD &LinOpD;
+
+    //NOTE: these counters are zeroed by the constructor but are not updated anywhere in this class
+    Integer TotalInnerIterations; //Number of inner CG iterations
+    Integer TotalOuterIterations; //Number of restarts
+    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
+
+    //tol is used for both Tolerance and the initial InnerTolerance; OuterLoopNormMult defaults to 100.
+    //The initializers below are listed in member declaration order, which is the order they run in.
+    MixedPrecisionConjugateGradientOperatorFunction(RealD tol, 
+						    Integer maxinnerit, 
+						    Integer maxouterit, 
+						    GridBase* _sp_grid4, 
+						    GridBase* _sp_grid5, 
+						    FermionOperatorF &_FermOpF,
+						    FermionOperatorD &_FermOpD,
+						    SchurOperatorF   &_LinOpF,
+						    SchurOperatorD   &_LinOpD): 
+      Tolerance(tol), 
+      InnerTolerance(tol), 
+      MaxInnerIterations(maxinnerit), 
+      MaxOuterIterations(maxouterit), 
+      SinglePrecGrid4(_sp_grid4),
+      SinglePrecGrid5(_sp_grid5),
+      OuterLoopNormMult(100.),
+      FermOpF(_FermOpF),
+      FermOpD(_FermOpD),
+      LinOpF(_LinOpF),
+      LinOpD(_LinOpD),
+      TotalInnerIterations(0),
+      TotalOuterIterations(0),
+      TotalFinalStepIterations(0)
+    { 
+    }
+
+    //Solve for psi given src using the stored operators.
+    //LinOpU must be a SchurOperatorD wrapping the same matrix as the stored LinOpD (asserted).
+    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
+
+      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
+
+      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
+      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+
+      //Refresh the single precision operator's gauge field from the double precision one
+      //(presumably precisionChange(out,in) - TODO confirm argument order) and rebuild its checkerboards
+      precisionChange(FermOpF.Umu, FermOpD.Umu);
+
+      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
+      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
+
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Make a mixed precision conjugate gradient
+      ////////////////////////////////////////////////////////////////////////////////////
+      MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
+      MPCG.InnerTolerance = InnerTolerance;
+      std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
+      MPCG(src,psi);
+    }
+  };
+
+NAMESPACE_END(Grid);
+
+
+
+
+
+int main(int argc, char **argv) {
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  // here make a routine to print all the relevant information on the run
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+  std::string param_file = "params.xml";
+  bool file_load_check = false;
+  for(int i=1;i<argc;i++){
+    std::string sarg(argv[i]);
+    if(sarg == "--param_file"){
+      assert(i!=argc-1);
+      param_file = argv[i+1];
+    }else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
+      file_load_check = true;
+    }
+  }
+
+  //Read the user parameters
+  EvolParameters user_params;
+  
+  if(fileExists(param_file)){
+    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
+    Grid::XmlReader rd(param_file);
+    read(rd, "Params", user_params);
+  }else if(!GlobalSharedMemory::WorldRank){
+    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
+    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
+    {
+      Grid::XmlWriter wr(param_file + ".templ");
+      write(wr, "Params", user_params);
+    }
+    std::cout << GridLogMessage << " Done" << std::endl;
+    Grid_finalize();
+    return 0;
+  }
+
+  //Check the parameters
+  if(user_params.GparityDirs.size() != Nd-1){
+    std::cerr << "Error in input parameters: expect GparityDirs to have size = " << Nd-1 << std::endl;
+    exit(1);
+  }
+  for(int i=0;i<Nd-1;i++)
+    if(user_params.GparityDirs[i] != 0 && user_params.GparityDirs[i] != 1){
+      std::cerr << "Error in input parameters: expect GparityDirs values to be 0 (periodic) or 1 (G-parity)" << std::endl;
+      exit(1);
+    }
+
+
+  typedef GparityMobiusEOFAFermionD EOFAactionD;
+  typedef GparityMobiusFermionD FermionActionD;
+  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
+  typedef typename FermionActionD::FermionField FermionFieldD;
+
+  typedef GparityMobiusEOFAFermionF EOFAactionF;
+  typedef GparityMobiusFermionF FermionActionF;
+  typedef typename FermionActionF::Impl_t FermionImplPolicyF;
+  typedef typename FermionActionF::FermionField FermionFieldF;
+
+  typedef GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD,FermionImplPolicyF> MixedPrecRHMC;
+  typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicyD> DoublePrecRHMC;
+
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  IntegratorParameters MD;
+  typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
+  typedef HMCWrapper::ImplPolicy GaugeImplPolicy;
+  MD.name    = std::string("MinimumNorm2");
+  MD.MDsteps = user_params.Steps;
+  MD.trajL   = 1.0;
+
+  HMCparameters HMCparams;
+  HMCparams.StartTrajectory  = user_params.StartTrajectory;
+  HMCparams.Trajectories     = user_params.Trajectories;
+  HMCparams.NoMetropolisUntil= 0;
+  HMCparams.StartingType     = user_params.StartingType;
+  HMCparams.MetropolisTest = user_params.MetropolisTest;
+  HMCparams.MD = MD;
+  HMCWrapper TheHMC(HMCparams);
+
+  // Grid from the command line arguments --grid and --mpi
+  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
+
+  CheckpointerParameters CPparams;
+  CPparams.config_prefix = "ckpoint_lat";
+  CPparams.rng_prefix    = "ckpoint_rng";
+  CPparams.saveInterval  = user_params.SaveInterval;
+  CPparams.format        = "IEEE64BIG";
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  //Note that checkpointing saves the RNG state so that this initialization is required only for the very first configuration
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  typedef PlaquetteMod<GaugeImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+  //////////////////////////////////////////////
+
+  const int Ls      = 12;
+  Real beta         = 1.75;
+  Real light_mass   = 0.0042; //240 MeV
+  Real strange_mass = 0.045;
+  Real pv_mass      = 1.0;
+  RealD M5  = 1.8;
+  RealD mobius_scale = 32./12.; //b+c
+
+  RealD mob_bmc = 1.0;
+  RealD mob_b = (mobius_scale + mob_bmc)/2.;
+  RealD mob_c = (mobius_scale - mob_bmc)/2.;
+
+  //Setup the Grids
+  auto UGridD   = TheHMC.Resources.GetCartesian();
+  auto UrbGridD = TheHMC.Resources.GetRBCartesian();
+  auto FGridD     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);
+  auto FrbGridD   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);
+
+  GridCartesian* UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
+  GridRedBlackCartesian* UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
+  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
+  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
+
+  ConjugateIwasakiGaugeActionD GaugeAction(beta);
+
+  // temporarily need a gauge field
+  LatticeGaugeFieldD Ud(UGridD);
+  LatticeGaugeFieldF Uf(UGridF);
+ 
+  //Setup the BCs
+  FermionActionD::ImplParams Params;
+  for(int i=0;i<Nd-1;i++) Params.twists[i] = user_params.GparityDirs[i]; //G-parity directions
+  Params.twists[Nd-1] = 1; //APBC in time direction
+
+  std::vector<int> dirs4(Nd);
+  for(int i=0;i<Nd-1;i++) dirs4[i] = user_params.GparityDirs[i];
+  dirs4[Nd-1] = 0; //periodic gauge BC in time
+
+  GaugeImplPolicy::setDirections(dirs4); //gauge BC
+
+  //Run optional gauge field checksum checker and exit
+  if(file_load_check){
+    TheHMC.initializeGaugeFieldAndRNGs(Ud);
+    std::cout << GridLogMessage << " Done" << std::endl;
+    Grid_finalize();
+    return 0;
+  }
+
+
+  ////////////////////////////////////
+  // Collect actions
+  ////////////////////////////////////
+  ActionLevel<HMCWrapper::Field> Level1(1); //light quark + strange quark
+  ActionLevel<HMCWrapper::Field> Level2(1); //DSDR
+  ActionLevel<HMCWrapper::Field> Level3(8); //gauge (8 increments per step)
+
+
+  /////////////////////////////////////////////////////////////
+  // Light EOFA action
+  // have to be careful with the parameters, cf. Test_dwf_gpforce_eofa.cc
+  /////////////////////////////////////////////////////////////
+
+  EOFAactionD LopD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, light_mass, light_mass, pv_mass, 0.0, -1, M5, mob_b, mob_c, Params);
+  EOFAactionF LopF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, light_mass, light_mass, pv_mass, 0.0, -1, M5, mob_b, mob_c, Params);
+  EOFAactionD RopD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, pv_mass, light_mass, pv_mass, -1.0, 1, M5, mob_b, mob_c, Params);
+  EOFAactionF RopF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, pv_mass, light_mass, pv_mass, -1.0, 1, M5, mob_b, mob_c, Params);
+
+  typedef SchurDiagMooeeOperator<EOFAactionD,FermionFieldD> EOFAschuropD;
+  typedef SchurDiagMooeeOperator<EOFAactionF,FermionFieldF> EOFAschuropF;
+  
+  EOFAschuropD linopL_D(LopD);
+  EOFAschuropD linopR_D(RopD);
+
+  EOFAschuropF linopL_F(LopF);
+  EOFAschuropF linopR_F(RopF);
+
+  typedef MixedPrecisionConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_mxCG;
+
+  EOFA_mxCG ActionMCG_L(user_params.eofa_l.action_tolerance, 10000, 1000, UGridF, FrbGridF, LopF, LopD, linopL_F, linopL_D);
+  ActionMCG_L.InnerTolerance = user_params.eofa_l.action_mixcg_inner_tolerance;
+  
+  EOFA_mxCG ActionMCG_R(user_params.eofa_l.action_tolerance, 10000, 1000, UGridF, FrbGridF, RopF, RopD, linopR_F, linopR_D);
+  ActionMCG_R.InnerTolerance = user_params.eofa_l.action_mixcg_inner_tolerance;
+
+  EOFA_mxCG DerivMCG_L(user_params.eofa_l.md_tolerance, 10000, 1000, UGridF, FrbGridF, LopF, LopD, linopL_F, linopL_D);
+  DerivMCG_L.InnerTolerance = user_params.eofa_l.md_mixcg_inner_tolerance;
+
+  EOFA_mxCG DerivMCG_R(user_params.eofa_l.md_tolerance, 10000, 1000, UGridF, FrbGridF, RopF, RopD, linopR_F, linopR_D);
+  DerivMCG_R.InnerTolerance = user_params.eofa_l.md_mixcg_inner_tolerance;
+
+  std::cout << GridLogMessage << "Set EOFA action solver action tolerance outer=" << ActionMCG_L.Tolerance << " inner=" << ActionMCG_L.InnerTolerance << std::endl;
+  std::cout << GridLogMessage << "Set EOFA MD solver tolerance outer=" << DerivMCG_L.Tolerance << " inner=" << DerivMCG_L.InnerTolerance << std::endl;
+
+  ConjugateGradient<FermionFieldD>      ActionCG(user_params.eofa_l.action_tolerance, 10000);
+  ConjugateGradient<FermionFieldD>  DerivativeCG(user_params.eofa_l.md_tolerance, 10000);
+
+  // ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicyD> EOFA(LopD, RopD, 
+  // 								   ActionCG, ActionCG, ActionCG, 
+  // 								   DerivativeCG, DerivativeCG, 
+  // 								   user_params.eofa_l.rat_params, true);
+
+  // ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicyD> EOFA(LopD, RopD, 
+  // 								   ActionMCG_L, ActionMCG_R, 
+  // 								   ActionMCG_L, ActionMCG_R, 
+  // 								   DerivMCG_L, DerivMCG_R, 
+  // 								   user_params.eofa_l.rat_params, true);
+
+  ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction<FermionImplPolicyD, FermionImplPolicyF> EOFA(LopF, RopF,
+													LopD, RopD, 
+													ActionMCG_L, ActionMCG_R, 
+													ActionMCG_L, ActionMCG_R, 
+													DerivMCG_L, DerivMCG_R, 
+													user_params.eofa_l.rat_params, true);
+
+
+  Level1.push_back(&EOFA);
+
+
+  ////////////////////////////////////
+  // Strange action
+  ////////////////////////////////////
+  FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD,strange_mass,M5,mob_b,mob_c,Params);
+  FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD, pv_mass,M5,mob_b,mob_c,Params);
+
+  FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF,strange_mass,M5,mob_b,mob_c,Params);
+  FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF, pv_mass,M5,mob_b,mob_c,Params);
+
+  RationalActionParams rat_act_params_s;
+  rat_act_params_s.inv_pow  = 4; // (M^dag M)^{1/4}
+  rat_act_params_s.precision= 60;
+  rat_act_params_s.MaxIter  = 10000;
+  user_params.rat_quo_s.Export(rat_act_params_s);
+  std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_s.BoundsCheckFreq << " trajectories (avg)" << std::endl;
+
+  //MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq); 
+  DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s); 
+  Level1.push_back(&Quotient_s);  
+
+  ///////////////////////////////////
+  // DSDR action
+  ///////////////////////////////////
+  RealD dsdr_mass=-1.8;   
+  //Use same DSDR twists as https://arxiv.org/pdf/1208.4412.pdf
+  RealD dsdr_epsilon_f = 0.02; //numerator (in determinant)
+  RealD dsdr_epsilon_b = 0.5; 
+  GparityWilsonTMFermionD Numerator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_f, Params);
+  GparityWilsonTMFermionF Numerator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_f, Params);
+
+  GparityWilsonTMFermionD Denominator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_b, Params);
+  GparityWilsonTMFermionF Denominator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_b, Params);
+ 
+  RationalActionParams rat_act_params_DSDR;
+  rat_act_params_DSDR.inv_pow  = 2; // (M^dag M)^{1/2}
+  rat_act_params_DSDR.precision= 60;
+  rat_act_params_DSDR.MaxIter  = 10000;
+  user_params.rat_quo_DSDR.Export(rat_act_params_DSDR);
+  std::cout << GridLogMessage << "DSDR quark bounds check every " << rat_act_params_DSDR.BoundsCheckFreq << " trajectories (avg)" << std::endl;
+
+  DoublePrecRHMC Quotient_DSDR(Denominator_DSDR_D, Numerator_DSDR_D, rat_act_params_DSDR);
+  Level2.push_back(&Quotient_DSDR);
+
+  /////////////////////////////////////////////////////////////
+  // Gauge action
+  /////////////////////////////////////////////////////////////
+  Level3.push_back(&GaugeAction);
+
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+  TheHMC.TheAction.push_back(Level3);
+  std::cout << GridLogMessage << " Action complete "<< std::endl;
+
+
+  //Action tuning
+  bool 
+    tune_rhmc_s=false, eigenrange_s=false, 
+    tune_rhmc_DSDR=false, eigenrange_DSDR=false, 
+    check_eofa=false, 
+    upper_bound_eofa=false, lower_bound_eofa(false);
+
+  std::string lanc_params_s;
+  std::string lanc_params_DSDR;
+  int tune_rhmc_s_action_or_md;
+  int tune_rhmc_DSDR_action_or_md;
+
+  for(int i=1;i<argc;i++){
+    std::string sarg(argv[i]);
+    if(sarg == "--tune_rhmc_s"){
+      assert(i < argc-1);
+      tune_rhmc_s=true;
+      tune_rhmc_s_action_or_md = std::stoi(argv[i+1]);
+    }
+    else if(sarg == "--eigenrange_s"){
+      assert(i < argc-1);
+      eigenrange_s=true;
+      lanc_params_s = argv[i+1];
+    }
+    else if(sarg == "--tune_rhmc_DSDR"){
+      assert(i < argc-1);
+      tune_rhmc_DSDR=true;
+      tune_rhmc_DSDR_action_or_md = std::stoi(argv[i+1]);
+    }
+    else if(sarg == "--eigenrange_DSDR"){
+      assert(i < argc-1);
+      eigenrange_DSDR=true;
+      lanc_params_DSDR = argv[i+1];
+    }
+    else if(sarg == "--check_eofa") check_eofa = true;
+    else if(sarg == "--upper_bound_eofa") upper_bound_eofa = true;
+    else if(sarg == "--lower_bound_eofa") lower_bound_eofa = true;
+  }
+  if(tune_rhmc_s || eigenrange_s || tune_rhmc_DSDR || eigenrange_DSDR ||check_eofa || upper_bound_eofa || lower_bound_eofa) {
+    std::cout << GridLogMessage << "Running checks" << std::endl;
+    TheHMC.initializeGaugeFieldAndRNGs(Ud);
+
+    std::cout << GridLogMessage << "EOFA action solver action tolerance outer=" << ActionMCG_L.Tolerance << " inner=" << ActionMCG_L.InnerTolerance << std::endl;
+    std::cout << GridLogMessage << "EOFA MD solver tolerance outer=" << DerivMCG_L.Tolerance << " inner=" << DerivMCG_L.InnerTolerance << std::endl;
+
+
+    if(check_eofa) checkEOFA(EOFA, FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
+    if(upper_bound_eofa) upperBoundEOFA(EOFA, FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
+    if(lower_bound_eofa) lowerBoundEOFA(EOFA, FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
+    if(eigenrange_s) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG());
+    if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_s)>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange",  tune_rhmc_s_action_or_md);
+    if(eigenrange_DSDR) computeEigenvalues<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField>(lanc_params_DSDR, UGridD, UrbGridD, Ud, Numerator_DSDR_D, TheHMC.Resources.GetParallelRNG());
+    if(tune_rhmc_DSDR) checkRHMC<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField, decltype(Quotient_DSDR)>(UGridD, UrbGridD, Ud, Numerator_DSDR_D, Denominator_DSDR_D, Quotient_DSDR, TheHMC.Resources.GetParallelRNG(), 2, "DSDR", tune_rhmc_DSDR_action_or_md);
+
+
+    std::cout << GridLogMessage << " Done" << std::endl;
+    Grid_finalize();
+    return 0;
+  }
+
+
+  //Run the HMC
+  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
+  TheHMC.Run();
+
+  std::cout << GridLogMessage << " Done" << std::endl;
+  Grid_finalize();
+  return 0;
+} // main
--- a/HMC/Mobius2p1fIDSDRGparityEOFA_40ID.cc
+++ b/HMC/Mobius2p1fIDSDRGparityEOFA_40ID.cc
@@ -0,0 +1,918 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./HMC/Mobius2p1fIDSDRGparityEOFA_40ID.cc
+
+Copyright (C) 2015-2016
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+
+//Production binary for the 40ID G-parity ensemble
+//Serializable user settings for one rational-quotient term: approximation bounds, action/MD degrees and tolerances, check frequencies
+struct RatQuoParameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters,
+				  double, bnd_lo,
+				  double, bnd_hi,
+				  Integer, action_degree,
+				  double, action_tolerance,
+				  Integer, md_degree,
+				  double, md_tolerance,
+				  Integer, reliable_update_freq,
+				  Integer, bnd_check_freq);
+  RatQuoParameters() { //Default values; every serialized member is given one
+    bnd_lo = 1e-2;
+    bnd_hi = 30;
+    action_degree = 10;
+    action_tolerance = 1e-10;
+    md_degree = 10;
+    md_tolerance = 1e-8;
+    bnd_check_freq = 20;
+    reliable_update_freq = 50;
+  }
+  //Copy the settings into a RationalActionParams. reliable_update_freq is not copied; other fields of `into` are left untouched
+  void Export(RationalActionParams &into) const{
+    into.lo = bnd_lo; //bnd_lo / bnd_hi map onto lo / hi
+    into.hi = bnd_hi;
+    into.action_degree = action_degree;
+    into.action_tolerance = action_tolerance;
+    into.md_degree = md_degree;
+    into.md_tolerance = md_tolerance;
+    into.BoundsCheckFreq = bnd_check_freq;
+  }
+};
+//Serializable user settings for one EOFA term: a OneFlavourRationalParams plus separate tolerances for the action and MD solves
+struct EOFAparameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(EOFAparameters,
+				  OneFlavourRationalParams, rat_params,
+				  double, action_tolerance,
+				  double, action_mixcg_inner_tolerance,
+				  double, md_tolerance,
+				  double, md_mixcg_inner_tolerance);
+  //Default values. NOTE(review): the *_mixcg_inner_tolerance names suggest the inner tolerance of a mixed-precision CG - confirm against the caller
+  EOFAparameters() { 
+    action_mixcg_inner_tolerance = 1e-8;
+    action_tolerance = 1e-10;
+    md_tolerance = 1e-8;
+    md_mixcg_inner_tolerance = 1e-8;
+    //Defaults for the embedded rational-approximation parameters
+    rat_params.lo = 1.0;
+    rat_params.hi = 25.0;
+    rat_params.MaxIter  = 50000;
+    rat_params.tolerance= 1.0e-9;
+    rat_params.degree   = 14;
+    rat_params.precision= 50;
+  }
+};
+//Top-level serializable run parameters; main reads them from (or writes them as a template to) the xml file under the tag "Params"
+struct EvolParameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters,
+                                  Integer, StartTrajectory,
+                                  Integer, Trajectories,
+				  Integer, SaveInterval,
+				  Integer, Steps,
+				  RealD, TrajectoryLength,
+                                  bool, MetropolisTest,
+				  std::string, StartingType,
+				  std::vector<Integer>, GparityDirs,
+				  std::vector<EOFAparameters>, eofa_l,
+				  RatQuoParameters, rat_quo_s,
+				  RatQuoParameters, rat_quo_DSDR);
+  //Defaults. eofa_l is not assigned here (so it starts empty); rat_quo_s and rat_quo_DSDR keep the RatQuoParameters defaults
+  EvolParameters() {
+    //For initial thermalization; afterwards user should switch Metropolis on and use StartingType=CheckpointStart
+    MetropolisTest    = false;
+    StartTrajectory   = 0;
+    Trajectories      = 50;
+    SaveInterval = 5;
+    StartingType      = "ColdStart";
+    GparityDirs.resize(3, 1); //1 for G-parity, 0 for periodic
+    Steps = 5;
+    TrajectoryLength = 1.0;
+  }
+};
+//Report whether the named file can be opened for reading
+bool fileExists(const std::string &fn){
+  //A temporary input stream suffices; it is closed again as soon as the check has been made
+  return std::ifstream(fn).good();
+}
+
+
+
+//Serializable settings for computeEigenvalues: alpha/beta/ord feed the Chebyshev, n_stop/n_want/n_use/tolerance feed the Lanczos
+struct LanczosParameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
+				  double, alpha,
+				  double, beta,
+				  double, mu,
+				  int, ord,
+				  int, n_stop,
+				  int, n_want,
+				  int, n_use,
+				  double, tolerance);
+  //Default values. computeEigenvalues asserts mu == 0 and squares alpha and beta before use
+  LanczosParameters() {
+    alpha = 35;
+    beta = 5;
+    mu = 0;
+    ord = 100;
+    n_stop = 10;
+    n_want = 10;
+    n_use = 15;
+    tolerance = 1e-6;
+  }
+};
+
+
+//Run an implicitly restarted Lanczos on the SchurDiagMooee operator of `action` (Chebyshev of it as first IRL operator) and print the eigenvalues
+template<typename FermionActionD, typename FermionFieldD>
+void computeEigenvalues(std::string param_file,
+			GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
+			FermionActionD &action, GridParallelRNG &rng){
+  //Settings are read from param_file when it exists; otherwise the defaults are used and rank 0 writes them to <param_file>.templ
+  LanczosParameters params;
+  if(fileExists(param_file)){
+    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
+    Grid::XmlReader rd(param_file);
+    read(rd, "LanczosParameters", params);
+  }else if(!GlobalSharedMemory::WorldRank){
+    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
+    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
+    Grid::XmlWriter wr(param_file + ".templ");
+    write(wr, "LanczosParameters", params);
+  }
+  //Gaussian random start vector, restricted to the odd checkerboard
+  FermionFieldD gauss_o(rbGrid);
+  FermionFieldD gauss(Grid);
+  gaussian(rng, gauss);
+  pickCheckerboard(Odd, gauss_o, gauss);
+
+  action.ImportGauge(latt);
+
+  SchurDiagMooeeOperator<FermionActionD, FermionFieldD> hermop(action);
+  PlainHermOp<FermionFieldD> hermop_wrap(hermop);
+  //ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
+  assert(params.mu == 0.0);
+  //mu is not passed to the Chebyshev below (hence the assert above); alpha and beta enter squared
+  Chebyshev<FermionFieldD> Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1);
+  FunctionHermOp<FermionFieldD> Cheb_wrap(Cheb, hermop);
+
+  std::cout << "IRL: alpha=" << params.alpha << " beta=" << params.beta << " mu=" << params.mu << " ord=" << params.ord << std::endl;
+  ImplicitlyRestartedLanczos<FermionFieldD> IRL(Cheb_wrap, hermop_wrap, params.n_stop, params.n_want, params.n_use, params.tolerance, 50000);
+
+  std::vector<RealD> eval(params.n_use);
+  std::vector<FermionFieldD> evec(params.n_use, rbGrid);
+  int Nconv;
+  IRL.calc(eval, evec, gauss_o, Nconv);
+  //n_want and n_use are independent user inputs, so bound the print loop by the number of eigenvalues eval actually holds
+  std::cout << "Eigenvalues:" << std::endl;
+  for(int i=0;i<params.n_want && i<static_cast<int>(eval.size());i++){
+    std::cout << i << " " << eval[i] << std::endl;
+  }
+}
+
+
+//Check the quality of the RHMC approx: print power-method estimates for numOp/denOp, then run InversePowerBoundsCheck on rhmc's approximations
+//action_or_md toggles checking the action (0), MD (1) or both (2) setups; inv_pow is the n in the power -1/n, quark_descr only labels the log
+template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
+void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
+	       FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
+	       int inv_pow, const std::string &quark_descr, int action_or_md){
+  assert(action_or_md == 0 || action_or_md == 1 || action_or_md == 2);
+  //Gaussian random test vector, restricted to the odd checkerboard
+  FermionFieldD gauss_o(rbGrid);
+  FermionFieldD gauss(Grid);
+  gaussian(rng, gauss);
+  pickCheckerboard(Odd, gauss_o, gauss);
+  //Import the supplied gauge field into both operators
+  numOp.ImportGauge(latt);
+  denOp.ImportGauge(latt);
+  //MdagM wraps the numerator operator, VdagV the denominator operator
+  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
+  SchurDifferentiableOperator<FermionImplPolicyD> MdagM(numOp);
+  SchurDifferentiableOperator<FermionImplPolicyD> VdagV(denOp);
+
+  PowerMethod<FermionFieldD> power_method;
+  RealD lambda_max;
+  //lambda_max is only printed; none of the checks below read it
+  std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " numerator" << std::endl;
+
+  lambda_max = power_method(MdagM,gauss_o);
+  std::cout << GridLogMessage << "Got lambda_max "<<lambda_max<<std::endl;
+
+  std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " denominator" << std::endl;
+  lambda_max = power_method(VdagV,gauss_o);
+  std::cout << GridLogMessage << "Got lambda_max "<<lambda_max<<std::endl;
+  //Action approximations: powers -1/inv_pow and -1/(2*inv_pow), first on the numerator operator, then on the denominator operator
+  if(action_or_md == 0 || action_or_md == 2){
+    std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+    InversePowerBoundsCheck(inv_pow, 50000, 1e16, MdagM,gauss_o, rhmc.ApproxNegPowerAction); //use large tolerance to prevent exit on fail; we are trying to tune here!
+    std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+
+    std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
+    InversePowerBoundsCheck(2*inv_pow, 50000, 1e16, MdagM,gauss_o, rhmc.ApproxNegHalfPowerAction);
+    std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
+
+    std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+    InversePowerBoundsCheck(inv_pow, 50000, 1e16, VdagV,gauss_o, rhmc.ApproxNegPowerAction);
+    std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+
+    std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
+    InversePowerBoundsCheck(2*inv_pow, 50000, 1e16, VdagV,gauss_o, rhmc.ApproxNegHalfPowerAction);
+    std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
+  }
+
+  std::cout << "-------------------------------------------------------------------------------" << std::endl;
+  //MD approximations: same four checks as above, but against rhmc.ApproxNegPowerMD / rhmc.ApproxNegHalfPowerMD
+  if(action_or_md == 1 || action_or_md == 2){
+    std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+    InversePowerBoundsCheck(inv_pow, 50000, 1e16, MdagM,gauss_o, rhmc.ApproxNegPowerMD); 
+    std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+
+    std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
+    InversePowerBoundsCheck(2*inv_pow, 50000, 1e16, MdagM,gauss_o, rhmc.ApproxNegHalfPowerMD);
+    std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
+
+    std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+    InversePowerBoundsCheck(inv_pow, 50000, 1e16, VdagV,gauss_o, rhmc.ApproxNegPowerMD);
+    std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+
+    std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
+    InversePowerBoundsCheck(2*inv_pow, 50000, 1e16, VdagV,gauss_o, rhmc.ApproxNegHalfPowerMD);
+    std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
+  }
+}
+
+//Exercise the EOFA action once on gauge field latt: refresh it with a Gaussian eta scaled by sqrt(1/2), then evaluate S
+template<typename FermionImplPolicy>
+void checkEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
+	       GridCartesian* FGrid, GridParallelRNG &rng, const LatticeGaugeFieldD &latt){
+  std::cout << GridLogMessage << "Starting EOFA action/bounds check" << std::endl;
+  typename FermionImplPolicy::FermionField eta(FGrid);
+  RealD scale = std::sqrt(0.5);
+  gaussian(rng,eta); eta = eta * scale;
+
+  //Use the inbuilt check: whatever checking happens is inside refresh() and S(); the value returned by S is discarded
+  EOFA.refresh(latt, eta);
+  EOFA.S(latt);
+  std::cout << GridLogMessage << "Finished EOFA action/bounds check" << std::endl; //wording now matches the "Starting" message
+}
+
+//LinearOperatorBase wrapper whose HermOp applies EOFA.Meofa on the stored gauge field; every other entry point aborts via assert(0)
+template<typename FermionImplPolicy>
+class EOFAlinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
+  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA;
+  LatticeGaugeFieldD &U;
+public:
+  EOFAlinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA, LatticeGaugeFieldD &U): EOFA(EOFA), U(U){}
+  //Both constructor arguments are held by reference, so they must outlive this object
+  typedef typename FermionImplPolicy::FermionField Field;
+  void OpDiag (const Field &in, Field &out){ assert(0); }
+  void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); } 
+  //Not implemented: only HermOp is usable
+  void Op     (const Field &in, Field &out){ assert(0); }
+  void AdjOp  (const Field &in, Field &out){ assert(0); }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
+  void HermOp(const Field &in, Field &out){ EOFA.Meofa(U, in, out); }
+};
+//Run the power method on the EOFA operator (via EOFAlinop) for gauge field latt and print the result as the operator's upper bound
+template<typename FermionImplPolicy>
+void upperBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
+		    GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
+  typedef typename FermionImplPolicy::FermionField Field;
+  std::cout << GridLogMessage << "Starting EOFA upper bound compute" << std::endl;
+  Field start_vec(FGrid); gaussian(rng,start_vec); //Gaussian random start vector
+  EOFAlinop<FermionImplPolicy> meofa_op(EOFA, latt); //its HermOp applies EOFA.Meofa
+  PowerMethod<Field> iterate;
+  const auto top_eval = iterate(meofa_op,start_vec);
+  std::cout << GridLogMessage << "Upper bound of EOFA operator " << top_eval << std::endl;
+}
+
+//LinearOperatorBase wrapper whose HermOp applies EOFA.MeofaInv; the rest assert(0). Applications of M^{-1} cost the same as M for EOFA!
+template<typename FermionImplPolicy>
+class EOFAinvLinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
+  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA;
+  LatticeGaugeFieldD &U;
+public:
+  EOFAinvLinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA, LatticeGaugeFieldD &U): EOFA(EOFA), U(U){}
+  //Both constructor arguments are held by reference, so they must outlive this object
+  typedef typename FermionImplPolicy::FermionField Field;
+  void OpDiag (const Field &in, Field &out){ assert(0); }
+  void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); } 
+  //Not implemented: only HermOp is usable
+  void Op     (const Field &in, Field &out){ assert(0); }
+  void AdjOp  (const Field &in, Field &out){ assert(0); }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
+  void HermOp(const Field &in, Field &out){ EOFA.MeofaInv(U, in, out); }
+};
+//Run the power method on the inverse EOFA operator (via EOFAinvLinop) and print the reciprocal of the result as the lower bound
+template<typename FermionImplPolicy>
+void lowerBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
+		    GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
+  typedef typename FermionImplPolicy::FermionField Field;
+  std::cout << GridLogMessage << "Starting EOFA lower bound compute using power method on M^{-1}. Inverse of highest eigenvalue is the lowest eigenvalue of M" << std::endl;
+  Field start_vec(FGrid); gaussian(rng,start_vec); //Gaussian random start vector
+  EOFAinvLinop<FermionImplPolicy> inv_op(EOFA, latt); //its HermOp applies EOFA.MeofaInv
+  PowerMethod<Field> iterate;
+  const auto inv_top_eval = iterate(inv_op,start_vec);
+  std::cout << GridLogMessage << "Lower bound of EOFA operator " << 1./inv_top_eval << std::endl;
+}
+
+
+NAMESPACE_BEGIN(Grid);
+  //OperatorFunction that solves with MixedPrecisionConjugateGradient, using the single/double operator pairs supplied at construction
+  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
+  class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
+  public:
+    typedef typename FermionOperatorD::FermionField FieldD;
+    typedef typename FermionOperatorF::FermionField FieldF;
+
+    using OperatorFunction<FieldD>::operator();
+    //Members are initialized by the constructor in the order they are declared here
+    RealD   Tolerance;
+    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
+    Integer MaxInnerIterations;
+    Integer MaxOuterIterations;
+    GridBase* SinglePrecGrid4; //Grid for single-precision fields (stored, but not used by operator() below)
+    GridBase* SinglePrecGrid5; //Grid for single-precision fields (handed to the mixed-precision CG)
+    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
+    //All four operators are held by reference and must outlive this object
+    FermionOperatorF &FermOpF;
+    FermionOperatorD &FermOpD;
+    SchurOperatorF &LinOpF;
+    SchurOperatorD &LinOpD;
+    //Counters start at zero; nothing in this class updates them
+    Integer TotalInnerIterations = 0; //Number of inner CG iterations
+    Integer TotalOuterIterations = 0; //Number of restarts
+    Integer TotalFinalStepIterations = 0; //Number of CG iterations in final patch-up step
+    //tol sets both Tolerance and InnerTolerance; OuterLoopNormMult is fixed to 100
+    MixedPrecisionConjugateGradientOperatorFunction(RealD tol, 
+						    Integer maxinnerit, 
+						    Integer maxouterit, 
+						    GridBase* _sp_grid4, 
+						    GridBase* _sp_grid5, 
+						    FermionOperatorF &_FermOpF,
+						    FermionOperatorD &_FermOpD,
+						    SchurOperatorF   &_LinOpF,
+						    SchurOperatorD   &_LinOpD): 
+      Tolerance(tol), 
+      InnerTolerance(tol), 
+      MaxInnerIterations(maxinnerit), 
+      MaxOuterIterations(maxouterit), 
+      SinglePrecGrid4(_sp_grid4),
+      SinglePrecGrid5(_sp_grid5),
+      OuterLoopNormMult(100.),
+      FermOpF(_FermOpF),
+      FermOpD(_FermOpD),
+      LinOpF(_LinOpF),
+      LinOpD(_LinOpD)
+    { 
+    }
+    //Solve for psi given src. LinOpU is only used for the consistency assert; the solve itself uses LinOpF/LinOpD
+    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
+
+      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
+      //The caller's operator must wrap the same matrix object as LinOpD (unchecked downcast, verified by the assert)
+      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
+      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+      //Refresh the single-precision operator's gauge links from the double-precision ones, then rebuild its checkerboarded copies
+      precisionChange(FermOpF.Umu, FermOpD.Umu);
+
+      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
+      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
+
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Make a mixed precision conjugate gradient
+      ////////////////////////////////////////////////////////////////////////////////////
+      MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
+      MPCG.InnerTolerance = InnerTolerance;
+      std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
+      MPCG(src,psi);
+    }
+  };
+
+
+  //OperatorFunction that solves with ConjugateGradientReliableUpdate, using the single/double operator pairs supplied at construction
+  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
+  class MixedPrecisionReliableUpdateConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
+  public:
+    typedef typename FermionOperatorD::FermionField FieldD;
+    typedef typename FermionOperatorF::FermionField FieldF;
+
+    using OperatorFunction<FieldD>::operator();
+    //Members are initialized by the constructor in the order they are declared here
+    RealD Tolerance;
+    Integer MaxIterations;
+
+    RealD Delta; //reliable update parameter
+
+    GridBase* SinglePrecGrid4; //Grid for single-precision fields (stored, but not used by operator() below)
+    GridBase* SinglePrecGrid5; //Grid for single-precision fields (handed to the reliable-update CG)
+    //All four operators are held by reference and must outlive this object
+    FermionOperatorF &FermOpF;
+    FermionOperatorD &FermOpD;
+    SchurOperatorF &LinOpF;
+    SchurOperatorD &LinOpD;
+    //Note the argument order: tolerance, then delta, then the iteration limit
+    MixedPrecisionReliableUpdateConjugateGradientOperatorFunction(RealD tol, 
+								  RealD delta,
+								  Integer maxit, 
+								  GridBase* _sp_grid4, 
+								  GridBase* _sp_grid5, 
+								  FermionOperatorF &_FermOpF,
+								  FermionOperatorD &_FermOpD,
+								  SchurOperatorF   &_LinOpF,
+								  SchurOperatorD   &_LinOpD): 
+      Tolerance(tol), 
+      MaxIterations(maxit), 
+      Delta(delta),
+      SinglePrecGrid4(_sp_grid4),
+      SinglePrecGrid5(_sp_grid5),
+      FermOpF(_FermOpF),
+      FermOpD(_FermOpD),
+      LinOpF(_LinOpF),
+      LinOpD(_LinOpD)
+    { 
+    }
+    //Solve for psi given src. LinOpU is only used for the consistency assert; the solve itself uses LinOpF/LinOpD
+    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
+
+      std::cout << GridLogMessage << " Mixed precision reliable CG update wrapper operator() "<<std::endl;
+      //The caller's operator must wrap the same matrix object as LinOpD (unchecked downcast, verified by the assert)
+      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
+      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+      //Refresh the single-precision operator's gauge links from the double-precision ones, then rebuild its checkerboarded copies
+      precisionChange(FermOpF.Umu, FermOpD.Umu);
+
+      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
+      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
+
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Make a mixed precision conjugate gradient
+      ////////////////////////////////////////////////////////////////////////////////////
+
+      ConjugateGradientReliableUpdate<FieldD,FieldF> MPCG(Tolerance,MaxIterations,Delta,SinglePrecGrid5,LinOpF,LinOpD);
+      std::cout << GridLogMessage << "Calling mixed precision reliable update Conjugate Gradient" <<std::endl;
+      MPCG(src,psi);
+    }
+  };
+
+
+
+NAMESPACE_END(Grid);
+
+
+
+
+
+// Entry point of the G-parity 2+1 flavour Mobius + DSDR evolution binary (EOFA light quarks, RHMC strange and DSDR).
+//
+// Command line options parsed here (Grid_init sees the same argv first, e.g. for --grid / --mpi):
+//   --param_file <file>         XML file holding the EvolParameters ("Params" node); default "params.xml".
+//                               If the file does not exist a template "<file>.templ" is written and the program exits.
+//   --read_check                initialise the gauge field and RNGs from the configured start, then exit
+//   --set_seeds <ser> <par>     serial and parallel RNG seed lists, e.g. --set_seeds 1.2.3.4 5.6.7.8
+//   --tune_rhmc_s <n>           run checkRHMC on the strange quotient; <n> is forwarded as its last argument
+//   --eigenrange_s <file>       run computeEigenvalues on the strange numerator using Lanczos parameters from <file>
+//   --tune_rhmc_DSDR <n>        as --tune_rhmc_s but for the DSDR quotient
+//   --eigenrange_DSDR <file>    as --eigenrange_s but for the DSDR numerator
+//   --check_eofa <idx>          run checkEOFA on light Hasenbusch term <idx>; -1 checks all of them
+//   --upper_bound_eofa <idx>    run upperBoundEOFA on light Hasenbusch term <idx>
+//   --lower_bound_eofa <idx>    run lowerBoundEOFA on light Hasenbusch term <idx>
+// If any of the tuning/check options is given the checks are run and the program exits without running the HMC.
+//
+// Returns 0 on normal completion; calls exit(1) if GparityDirs in the parameter file is malformed.
+int main(int argc, char **argv) {
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  // here make a routine to print all the relevant information on the run
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+  std::string param_file = "params.xml";
+  bool file_load_check = false;
+
+  std::string serial_seeds = "1 2 3 4 5";
+  std::string parallel_seeds = "6 7 8 9 10";
+
+  //Join a parsed seed list into the space-separated form stored in RNGModuleParameters.
+  //An empty list is rejected: the previous open-coded loop used size()-1 and back(), both invalid on an empty vector.
+  auto seeds_to_string = [](const std::vector<int> &seeds) -> std::string {
+    assert(!seeds.empty());
+    std::stringstream ss;
+    for(size_t j=0;j<seeds.size();j++){
+      if(j != 0) ss << " ";
+      ss << seeds[j];
+    }
+    return ss.str();
+  };
+
+  int i=1;
+  while(i < argc){
+    std::string sarg(argv[i]);
+    if(sarg == "--param_file"){
+      assert(i!=argc-1);
+      param_file = argv[i+1];
+      i+=2;
+    }else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
+      file_load_check = true;
+      i++;
+    }else if(sarg == "--set_seeds"){ //set the rng seeds. Expects two vector args, e.g.  --set_seeds 1.2.3.4 5.6.7.8
+      assert(i < argc-2);
+      std::vector<int> tmp;
+      GridCmdOptionIntVector(argv[i+1],tmp);
+      serial_seeds = seeds_to_string(tmp);
+      GridCmdOptionIntVector(argv[i+2],tmp);
+      parallel_seeds = seeds_to_string(tmp);
+      i+=3;
+      std::cout << GridLogMessage << "Set serial seeds to " << serial_seeds << std::endl;
+      std::cout << GridLogMessage << "Set parallel seeds to " << parallel_seeds << std::endl;
+
+    }else{
+      //Not one of ours (or handled by the second pass below); skip it
+      i++;
+    }
+  }
+
+
+  //Read the user parameters
+  EvolParameters user_params;
+
+  if(fileExists(param_file)){
+    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
+    Grid::XmlReader rd(param_file);
+    read(rd, "Params", user_params);
+  }else{
+    //Only one rank writes the template, but every rank has to stop here: previously the other ranks
+    //fell through and carried on with default (empty eofa_l) parameters while rank 0 had already finalized.
+    if(!GlobalSharedMemory::WorldRank){
+      std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
+      std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
+      {
+	//Scoped so that the writer is destroyed before "Done" is reported
+	Grid::XmlWriter wr(param_file + ".templ");
+	write(wr, "Params", user_params);
+      }
+      std::cout << GridLogMessage << " Done" << std::endl;
+    }
+    Grid_finalize();
+    return 0;
+  }
+
+  //Check the parameters
+  if(user_params.GparityDirs.size() != Nd-1){
+    std::cerr << "Error in input parameters: expect GparityDirs to have size = " << Nd-1 << std::endl;
+    exit(1);
+  }
+  for(int i=0;i<Nd-1;i++)
+    if(user_params.GparityDirs[i] != 0 && user_params.GparityDirs[i] != 1){
+      std::cerr << "Error in input parameters: expect GparityDirs values to be 0 (periodic) or 1 (G-parity)" << std::endl;
+      exit(1);
+    }
+
+
+  typedef GparityMobiusEOFAFermionD EOFAactionD;
+  typedef GparityMobiusFermionD FermionActionD;
+  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
+  typedef typename FermionActionD::FermionField FermionFieldD;
+
+  typedef GparityMobiusEOFAFermionF EOFAactionF;
+  typedef GparityMobiusFermionF FermionActionF;
+  typedef typename FermionActionF::Impl_t FermionImplPolicyF;
+  typedef typename FermionActionF::FermionField FermionFieldF;
+
+  typedef GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD,FermionImplPolicyF> MixedPrecRHMC;
+  typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicyD> DoublePrecRHMC;
+
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  IntegratorParameters MD;
+  typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
+  MD.name    = std::string("MinimumNorm2");
+
+  // typedef ConjugateHMCRunnerD<ForceGradient> HMCWrapper;
+  // MD.name    = std::string("ForceGradient");
+
+  MD.MDsteps = user_params.Steps;
+  MD.trajL   = user_params.TrajectoryLength;
+
+  typedef HMCWrapper::ImplPolicy GaugeImplPolicy;
+
+  HMCparameters HMCparams;
+  HMCparams.StartTrajectory  = user_params.StartTrajectory;
+  HMCparams.Trajectories     = user_params.Trajectories;
+  HMCparams.NoMetropolisUntil= 0;
+  HMCparams.StartingType     = user_params.StartingType;
+  HMCparams.MetropolisTest = user_params.MetropolisTest;
+  HMCparams.MD = MD;
+  HMCWrapper TheHMC(HMCparams);
+
+  // Grid from the command line arguments --grid and --mpi
+  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
+
+  CheckpointerParameters CPparams;
+  CPparams.config_prefix = "ckpoint_lat";
+  CPparams.rng_prefix    = "ckpoint_rng";
+  CPparams.saveInterval  = user_params.SaveInterval;
+  CPparams.format        = "IEEE64BIG";
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  //Note that checkpointing saves the RNG state so that this initialization is required only for the very first configuration
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds = serial_seeds;
+  RNGpar.parallel_seeds = parallel_seeds;
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  typedef PlaquetteMod<GaugeImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+  //////////////////////////////////////////////
+  //aiming for ainv=1.723 GeV
+  //                                  me         bob
+  //Estimated  a(ml+mres) [40ID] = 0.001305    0.00131
+  //           a(mh+mres) [40ID] = 0.035910    0.03529
+  //Estimate Ls=12, b+c=2  mres~0.0011
+
+  //1/24/2022 initial mres measurement gives mres=0.001,  adjusted light quark mass to 0.0003 from 0.0001
+
+  const int Ls      = 12;
+  Real beta         = 1.848;
+  Real light_mass   = 0.0003;
+  Real strange_mass = 0.0342;
+  Real pv_mass      = 1.0;
+  RealD M5  = 1.8;
+  RealD mobius_scale = 2.; //b+c
+
+  //Solve b+c = mobius_scale, b-c = mob_bmc for the Mobius coefficients
+  RealD mob_bmc = 1.0;
+  RealD mob_b = (mobius_scale + mob_bmc)/2.;
+  RealD mob_c = (mobius_scale - mob_bmc)/2.;
+
+  std::cout << GridLogMessage
+	    << "Ensemble parameters:" << std::endl
+	    << "Ls=" << Ls << std::endl
+	    << "beta=" << beta << std::endl
+	    << "light_mass=" << light_mass << std::endl
+	    << "strange_mass=" << strange_mass << std::endl
+	    << "mobius_scale=" << mobius_scale << std::endl;
+
+  //Setup the Grids
+  auto UGridD   = TheHMC.Resources.GetCartesian();
+  auto UrbGridD = TheHMC.Resources.GetRBCartesian();
+  auto FGridD     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);
+  auto FrbGridD   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);
+
+  //Single precision grids are built separately with the vComplexF SIMD layout
+  GridCartesian* UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
+  GridRedBlackCartesian* UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
+  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
+  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
+
+  ConjugateIwasakiGaugeActionD GaugeAction(beta);
+
+  // temporarily need a gauge field
+  LatticeGaugeFieldD Ud(UGridD);
+  LatticeGaugeFieldF Uf(UGridF);
+
+  //Setup the BCs
+  FermionActionD::ImplParams Params;
+  for(int i=0;i<Nd-1;i++) Params.twists[i] = user_params.GparityDirs[i]; //G-parity directions
+  Params.twists[Nd-1] = 1; //APBC in time direction
+
+  std::vector<int> dirs4(Nd);
+  for(int i=0;i<Nd-1;i++) dirs4[i] = user_params.GparityDirs[i];
+  dirs4[Nd-1] = 0; //periodic gauge BC in time
+
+  GaugeImplPolicy::setDirections(dirs4); //gauge BC
+
+  //Run optional gauge field checksum checker and exit
+  if(file_load_check){
+    TheHMC.initializeGaugeFieldAndRNGs(Ud);
+    std::cout << GridLogMessage << " Done" << std::endl;
+    Grid_finalize();
+    return 0;
+  }
+
+
+  ////////////////////////////////////
+  // Collect actions
+  ////////////////////////////////////
+  ActionLevel<HMCWrapper::Field> Level1(1); //light quark + strange quark
+  ActionLevel<HMCWrapper::Field> Level2(4); //DSDR
+  ActionLevel<HMCWrapper::Field> Level3(2); //gauge
+
+
+  /////////////////////////////////////////////////////////////
+  // Light EOFA action
+  // have to be careful with the parameters, cf. Test_dwf_gpforce_eofa.cc
+  /////////////////////////////////////////////////////////////
+  typedef SchurDiagMooeeOperator<EOFAactionD,FermionFieldD> EOFAschuropD;
+  typedef SchurDiagMooeeOperator<EOFAactionF,FermionFieldF> EOFAschuropF;
+  typedef ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction<FermionImplPolicyD, FermionImplPolicyF> EOFAmixPrecPFaction;
+  typedef MixedPrecisionConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_mxCG;
+  typedef MixedPrecisionReliableUpdateConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_relupCG;
+
+
+  //Hasenbusch chain for the light quark: term i has numerator mass eofa_light_masses[i] and denominator mass eofa_pv_masses[i]
+  std::vector<RealD> eofa_light_masses = { light_mass ,  0.004,   0.016,   0.064,   0.256    };
+  std::vector<RealD> eofa_pv_masses =    { 0.004       , 0.016,   0.064,   0.256,   1.0      };
+  const int n_light_hsb = 5; //const so that the array below has a compile-time bound rather than being a variable-length array
+  assert(user_params.eofa_l.size() == n_light_hsb);
+
+  EOFAmixPrecPFaction* EOFA_pfactions[n_light_hsb];
+
+  //The operators, solvers and actions allocated in this loop are referenced by the HMC for the rest of the run and are never freed
+  for(int i=0;i<n_light_hsb;i++){
+    RealD iml = eofa_light_masses[i];
+    RealD ipv = eofa_pv_masses[i];
+
+    EOFAactionD* LopD = new EOFAactionD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, iml, iml, ipv, 0.0, -1, M5, mob_b, mob_c, Params);
+    EOFAactionF* LopF = new EOFAactionF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, iml, iml, ipv, 0.0, -1, M5, mob_b, mob_c, Params);
+    EOFAactionD* RopD = new EOFAactionD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, ipv, iml, ipv, -1.0, 1, M5, mob_b, mob_c, Params);
+    EOFAactionF* RopF = new EOFAactionF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, ipv, iml, ipv, -1.0, 1, M5, mob_b, mob_c, Params);
+
+    EOFAschuropD* linopL_D = new EOFAschuropD(*LopD);
+    EOFAschuropD* linopR_D = new EOFAschuropD(*RopD);
+
+    EOFAschuropF* linopL_F = new EOFAschuropF(*LopF);
+    EOFAschuropF* linopR_F = new EOFAschuropF(*RopF);
+
+#if 1
+    //Note reusing user_params.eofa_l.action(|md)_mixcg_inner_tolerance  as Delta for now
+    EOFA_relupCG* ActionMCG_L = new EOFA_relupCG(user_params.eofa_l[i].action_tolerance, user_params.eofa_l[i].action_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
+    EOFA_relupCG* ActionMCG_R = new EOFA_relupCG(user_params.eofa_l[i].action_tolerance, user_params.eofa_l[i].action_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
+
+    EOFA_relupCG* DerivMCG_L = new EOFA_relupCG(user_params.eofa_l[i].md_tolerance, user_params.eofa_l[i].md_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
+    EOFA_relupCG* DerivMCG_R = new EOFA_relupCG(user_params.eofa_l[i].md_tolerance, user_params.eofa_l[i].md_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
+
+#else
+    EOFA_mxCG* ActionMCG_L = new EOFA_mxCG(user_params.eofa_l[i].action_tolerance, 50000, 1000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
+    ActionMCG_L->InnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance;
+
+    EOFA_mxCG* ActionMCG_R = new EOFA_mxCG(user_params.eofa_l[i].action_tolerance, 50000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
+    ActionMCG_R->InnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance;
+
+    EOFA_mxCG* DerivMCG_L = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 50000, 1000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
+    DerivMCG_L->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance;
+
+    EOFA_mxCG* DerivMCG_R = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 50000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
+    DerivMCG_R->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance;
+
+    std::cout << GridLogMessage << "Set EOFA action solver action tolerance outer=" << ActionMCG_L->Tolerance << " inner=" << ActionMCG_L->InnerTolerance << std::endl;
+    std::cout << GridLogMessage << "Set EOFA MD solver tolerance outer=" << DerivMCG_L->Tolerance << " inner=" << DerivMCG_L->InnerTolerance << std::endl;
+#endif
+
+    EOFAmixPrecPFaction* EOFA = new EOFAmixPrecPFaction(*LopF, *RopF,
+							*LopD, *RopD, 
+							*ActionMCG_L, *ActionMCG_R, 
+							*ActionMCG_L, *ActionMCG_R, 
+							*DerivMCG_L, *DerivMCG_R, 
+							user_params.eofa_l[i].rat_params, true);
+    EOFA_pfactions[i] = EOFA;
+    Level1.push_back(EOFA);
+  }
+
+  ////////////////////////////////////
+  // Strange action
+  ////////////////////////////////////
+  FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD,strange_mass,M5,mob_b,mob_c,Params);
+  FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD, pv_mass,M5,mob_b,mob_c,Params);
+
+  FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF,strange_mass,M5,mob_b,mob_c,Params);
+  FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF, pv_mass,M5,mob_b,mob_c,Params);
+
+  RationalActionParams rat_act_params_s;
+  rat_act_params_s.inv_pow  = 4; // (M^dag M)^{1/4}
+  rat_act_params_s.precision= 60;
+  rat_act_params_s.MaxIter  = 50000;
+  user_params.rat_quo_s.Export(rat_act_params_s);
+  std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_s.BoundsCheckFreq << " trajectories (avg)" << std::endl;
+
+  //MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq); 
+  DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s);
+  Level1.push_back(&Quotient_s);
+
+  ///////////////////////////////////
+  // DSDR action
+  ///////////////////////////////////
+  RealD dsdr_mass=-1.8;
+  //Use same DSDR twists as https://arxiv.org/pdf/1208.4412.pdf
+  RealD dsdr_epsilon_f = 0.02; //numerator (in determinant)
+  RealD dsdr_epsilon_b = 0.5;
+  GparityWilsonTMFermionD Numerator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_f, Params);
+  GparityWilsonTMFermionF Numerator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_f, Params);
+
+  GparityWilsonTMFermionD Denominator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_b, Params);
+  GparityWilsonTMFermionF Denominator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_b, Params);
+
+  RationalActionParams rat_act_params_DSDR;
+  rat_act_params_DSDR.inv_pow  = 2; // (M^dag M)^{1/2}
+  rat_act_params_DSDR.precision= 60;
+  rat_act_params_DSDR.MaxIter  = 50000;
+  user_params.rat_quo_DSDR.Export(rat_act_params_DSDR);
+  std::cout << GridLogMessage << "DSDR quark bounds check every " << rat_act_params_DSDR.BoundsCheckFreq << " trajectories (avg)" << std::endl;
+
+  DoublePrecRHMC Quotient_DSDR(Denominator_DSDR_D, Numerator_DSDR_D, rat_act_params_DSDR);
+  Level2.push_back(&Quotient_DSDR);
+
+  /////////////////////////////////////////////////////////////
+  // Gauge action
+  /////////////////////////////////////////////////////////////
+  Level3.push_back(&GaugeAction);
+
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+  TheHMC.TheAction.push_back(Level3);
+  std::cout << GridLogMessage << " Action complete "<< std::endl;
+
+
+  //Action tuning
+  bool
+    tune_rhmc_s=false, eigenrange_s=false,
+    tune_rhmc_DSDR=false, eigenrange_DSDR=false,
+    check_eofa=false,
+    upper_bound_eofa=false, lower_bound_eofa=false;
+
+  std::string lanc_params_s;
+  std::string lanc_params_DSDR;
+  //Each selector is only read when its option flag is set; initialised anyway so no path can read an indeterminate value
+  int tune_rhmc_s_action_or_md = 0;
+  int tune_rhmc_DSDR_action_or_md = 0;
+  int eofa_which_hsb = -1; //shared by --check_eofa, --upper_bound_eofa and --lower_bound_eofa: the last one given wins
+
+  //Second pass over the arguments, done after the actions exist because n_light_hsb bounds the Hasenbusch index
+  for(int i=1;i<argc;i++){
+    std::string sarg(argv[i]);
+    if(sarg == "--tune_rhmc_s"){
+      assert(i < argc-1);
+      tune_rhmc_s=true;
+      tune_rhmc_s_action_or_md = std::stoi(argv[i+1]);
+    }
+    else if(sarg == "--eigenrange_s"){
+      assert(i < argc-1);
+      eigenrange_s=true;
+      lanc_params_s = argv[i+1];
+    }
+    else if(sarg == "--tune_rhmc_DSDR"){
+      assert(i < argc-1);
+      tune_rhmc_DSDR=true;
+      tune_rhmc_DSDR_action_or_md = std::stoi(argv[i+1]);
+    }
+    else if(sarg == "--eigenrange_DSDR"){
+      assert(i < argc-1);
+      eigenrange_DSDR=true;
+      lanc_params_DSDR = argv[i+1];
+    }
+    else if(sarg == "--check_eofa"){
+      assert(i < argc-1);
+      check_eofa = true;
+      eofa_which_hsb = std::stoi(argv[i+1]); //-1 indicates all hasenbusch
+      assert(eofa_which_hsb == -1 || (eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb) );
+    }
+    else if(sarg == "--upper_bound_eofa"){
+      assert(i < argc-1);
+      upper_bound_eofa = true;
+      eofa_which_hsb = std::stoi(argv[i+1]);
+      assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
+    }
+    else if(sarg == "--lower_bound_eofa"){
+      assert(i < argc-1);
+      lower_bound_eofa = true;
+      eofa_which_hsb = std::stoi(argv[i+1]);
+      assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
+    }
+  }
+  if(tune_rhmc_s || eigenrange_s || tune_rhmc_DSDR || eigenrange_DSDR ||check_eofa || upper_bound_eofa || lower_bound_eofa) {
+    std::cout << GridLogMessage << "Running checks" << std::endl;
+    TheHMC.initializeGaugeFieldAndRNGs(Ud);
+
+    //std::cout << GridLogMessage << "EOFA action solver action tolerance outer=" << ActionMCG_L.Tolerance << " inner=" << ActionMCG_L.InnerTolerance << std::endl;
+    //std::cout << GridLogMessage << "EOFA MD solver tolerance outer=" << DerivMCG_L.Tolerance << " inner=" << DerivMCG_L.InnerTolerance << std::endl;
+
+    if(check_eofa){
+      if(eofa_which_hsb >= 0){
+	std::cout << GridLogMessage << "Starting checking EOFA Hasenbusch " << eofa_which_hsb << std::endl;
+	checkEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
+	std::cout << GridLogMessage << "Finished checking EOFA Hasenbusch " << eofa_which_hsb << std::endl;
+      }else{
+	for(int i=0;i<n_light_hsb;i++){
+	  std::cout << GridLogMessage << "Starting checking EOFA Hasenbusch " << i << std::endl;
+	  checkEOFA(*EOFA_pfactions[i], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
+	  std::cout << GridLogMessage << "Finished checking EOFA Hasenbusch " << i << std::endl;
+	}
+      }
+    }
+    if(upper_bound_eofa) upperBoundEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
+    if(lower_bound_eofa) lowerBoundEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
+    if(eigenrange_s) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG());
+    if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_s)>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange",  tune_rhmc_s_action_or_md);
+    if(eigenrange_DSDR) computeEigenvalues<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField>(lanc_params_DSDR, UGridD, UrbGridD, Ud, Numerator_DSDR_D, TheHMC.Resources.GetParallelRNG());
+    if(tune_rhmc_DSDR) checkRHMC<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField, decltype(Quotient_DSDR)>(UGridD, UrbGridD, Ud, Numerator_DSDR_D, Denominator_DSDR_D, Quotient_DSDR, TheHMC.Resources.GetParallelRNG(), 2, "DSDR", tune_rhmc_DSDR_action_or_md);
+
+
+    std::cout << GridLogMessage << " Done" << std::endl;
+    Grid_finalize();
+    return 0;
+  }
+
+
+  //Run the HMC
+  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
+  TheHMC.Run();
+
+  std::cout << GridLogMessage << " Done" << std::endl;
+  Grid_finalize();
+  return 0;
+} // main
--- a/HMC/Mobius2p1fIDSDRGparityEOFA_48ID.cc
+++ b/HMC/Mobius2p1fIDSDRGparityEOFA_48ID.cc
@@ -0,0 +1,873 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./HMC/Mobius2p1fIDSDRGparityEOFA_48ID.cc
+
+Copyright (C) 2015-2016
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+
+//Production binary for the 48ID G-parity ensemble
+
+//User-tunable settings for one rational-quotient pseudofermion term: the approximation bounds,
+//the degree and tolerance used for the action and for the MD, and two frequency settings.
+//Serializable, so it can be read from / written to the parameter file.
+struct RatQuoParameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters,
+				  double, bnd_lo,
+				  double, bnd_hi,
+				  Integer, action_degree,
+				  double, action_tolerance,
+				  Integer, md_degree,
+				  double, md_tolerance,
+				  Integer, reliable_update_freq,
+				  Integer, bnd_check_freq);
+
+  //Default values (these are what ends up in a freshly written template); listed in declaration order
+  RatQuoParameters():
+    bnd_lo(1e-2),
+    bnd_hi(30),
+    action_degree(10),
+    action_tolerance(1e-10),
+    md_degree(10),
+    md_tolerance(1e-8),
+    reliable_update_freq(50),
+    bnd_check_freq(20)
+  {}
+
+  //Copy the settings into 'into'. Only the fields listed below are written;
+  //reliable_update_freq has no counterpart here and is not exported.
+  void Export(RationalActionParams &into) const{
+    //approximation range
+    into.lo = bnd_lo;
+    into.hi = bnd_hi;
+
+    //action (heatbath/evaluation) approximation
+    into.action_degree    = action_degree;
+    into.action_tolerance = action_tolerance;
+
+    //molecular dynamics approximation
+    into.md_degree    = md_degree;
+    into.md_tolerance = md_tolerance;
+
+    into.BoundsCheckFreq = bnd_check_freq;
+  }
+};
+
+//User-tunable settings for one EOFA pseudofermion term: the rational approximation parameters
+//plus outer/inner solver tolerances for the action and for the MD.
+struct EOFAparameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(EOFAparameters,
+				  OneFlavourRationalParams, rat_params,
+				  double, action_tolerance,
+				  double, action_mixcg_inner_tolerance,
+				  double, md_tolerance,
+				  double, md_mixcg_inner_tolerance);
+
+  //Default values; the scalar tolerances are set in declaration order, rat_params is filled in the body
+  EOFAparameters():
+    action_tolerance(1e-10),
+    action_mixcg_inner_tolerance(1e-8),
+    md_tolerance(1e-8),
+    md_mixcg_inner_tolerance(1e-8)
+  {
+    rat_params.lo        = 1.0;
+    rat_params.hi        = 25.0;
+    rat_params.MaxIter   = 10000;
+    rat_params.tolerance = 1.0e-9;
+    rat_params.degree    = 14;
+    rat_params.precision = 50;
+  }
+};
+
+//Top-level run parameters read from the parameter file: trajectory bookkeeping, integrator steps,
+//start type, G-parity directions and the per-term EOFA / rational quotient settings.
+struct EvolParameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters,
+				  Integer, StartTrajectory,
+				  Integer, Trajectories,
+				  Integer, SaveInterval,
+				  Integer, Steps,
+				  RealD, TrajectoryLength,
+				  bool, MetropolisTest,
+				  std::string, StartingType,
+				  std::vector<Integer>, GparityDirs,
+				  std::vector<EOFAparameters>, eofa_l,
+				  RatQuoParameters, rat_quo_s,
+				  RatQuoParameters, rat_quo_DSDR);
+
+  //Defaults are for initial thermalization; afterwards user should switch Metropolis on and use StartingType=CheckpointStart.
+  //eofa_l is left empty and the two RatQuoParameters keep their own defaults.
+  EvolParameters():
+    StartTrajectory(0),
+    Trajectories(50),
+    SaveInterval(5),
+    Steps(5),
+    TrajectoryLength(1.0),
+    MetropolisTest(false),
+    StartingType("ColdStart")
+  {
+    GparityDirs.resize(3, 1); //1 for G-parity, 0 for periodic
+  }
+};
+
+//Return true if 'fn' can be opened for reading, i.e. the input stream is in a good state after opening it.
+bool fileExists(const std::string &fn){
+  return std::ifstream(fn).good();
+}
+
+
+
+
+//Settings consumed by computeEigenvalues: alpha/beta/mu/ord configure the Chebyshev filter,
+//n_stop/n_want/n_use and tolerance are passed to the implicitly restarted Lanczos.
+struct LanczosParameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
+				  double, alpha,
+				  double, beta,
+				  double, mu,
+				  int, ord,
+				  int, n_stop,
+				  int, n_want,
+				  int, n_use,
+				  double, tolerance);
+
+  //Default values, in declaration order
+  LanczosParameters():
+    alpha(35),
+    beta(5),
+    mu(0),
+    ord(100),
+    n_stop(10),
+    n_want(10),
+    n_use(15),
+    tolerance(1e-6)
+  {}
+};
+
+
+
+//Compute eigenvalues of the SchurDiagMooeeOperator built from 'action' with a Chebyshev-filtered
+//implicitly restarted Lanczos, and print them to stdout.
+//
+//  param_file : XML file with a "LanczosParameters" node. If it does not exist, rank 0 writes a template
+//               to "<param_file>.templ" and the computation proceeds with the default LanczosParameters.
+//  Grid/rbGrid: full and red-black grids on which the fermion fields live
+//  latt       : gauge field imported into 'action' (expected to have been initialized by the caller)
+//  action     : fermion operator; its gauge field is overwritten via ImportGauge
+//  rng        : parallel RNG used to draw the Gaussian starting vector
+//
+//Only mu == 0 is supported (asserted). At most n_want eigenvalues are printed, and never more than
+//the number of entries present in the eigenvalue vector after the Lanczos call.
+template<typename FermionActionD, typename FermionFieldD>
+void computeEigenvalues(std::string param_file,
+			GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
+			FermionActionD &action, GridParallelRNG &rng){
+  
+  LanczosParameters params;
+  if(fileExists(param_file)){
+    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
+    Grid::XmlReader rd(param_file);
+    read(rd, "LanczosParameters", params);
+  }else if(!GlobalSharedMemory::WorldRank){
+    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
+    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
+    Grid::XmlWriter wr(param_file + ".templ");
+    write(wr, "LanczosParameters", params);
+  }
+
+  //Gaussian starting vector, restricted to the odd checkerboard
+  FermionFieldD gauss_o(rbGrid);
+  FermionFieldD gauss(Grid);
+  gaussian(rng, gauss);
+  pickCheckerboard(Odd, gauss_o, gauss);
+
+  action.ImportGauge(latt);
+
+  SchurDiagMooeeOperator<FermionActionD, FermionFieldD> hermop(action);
+  PlainHermOp<FermionFieldD> hermop_wrap(hermop);
+  //ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
+  assert(params.mu == 0.0);
+
+  //alpha and beta are squared before being handed to the Chebyshev filter
+  Chebyshev<FermionFieldD> Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1);
+  FunctionHermOp<FermionFieldD> Cheb_wrap(Cheb, hermop);
+
+  std::cout << "IRL: alpha=" << params.alpha << " beta=" << params.beta << " mu=" << params.mu << " ord=" << params.ord << std::endl;
+  ImplicitlyRestartedLanczos<FermionFieldD> IRL(Cheb_wrap, hermop_wrap, params.n_stop, params.n_want, params.n_use, params.tolerance, 10000);
+
+  std::vector<RealD> eval(params.n_use);
+  std::vector<FermionFieldD> evec(params.n_use, rbGrid);
+  int Nconv;
+  IRL.calc(eval, evec, gauss_o, Nconv);
+
+  //eval starts with n_use entries and is handed to calc by non-const reference, so it need not hold n_want
+  //entries afterwards (e.g. n_want > n_use in the parameter file). Never index past what is actually there.
+  const int n_avail = static_cast<int>(eval.size());
+  const int n_print = (params.n_want < n_avail) ? params.n_want : n_avail;
+  if(n_print < params.n_want)
+    std::cout << "Only " << n_avail << " of the " << params.n_want << " requested eigenvalues are available" << std::endl;
+
+  std::cout << "Eigenvalues:" << std::endl;
+  for(int i=0;i<n_print;i++){
+    std::cout << i << " " << eval[i] << std::endl;
+  }
+}
+
+
+//Check the quality of the RHMC approx
+//action_or_md toggles checking the action (0), MD (1) or both (2) setups
+//
+//A Gaussian vector on the odd checkerboard is used as the probe. First the power method is run on
+//the numerator and denominator operators and the resulting lambda_max printed; then, for each selected
+//setup, InversePowerBoundsCheck is run for powers -1/inv_pow and -1/(2*inv_pow) on both operators
+//using the approximations stored in 'rhmc'.
+template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
+void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
+	       FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
+	       int inv_pow, const std::string &quark_descr, int action_or_md){
+  assert(action_or_md == 0 || action_or_md == 1 || action_or_md == 2);
+
+  //Which of the two setups were requested
+  const bool check_action = (action_or_md == 0 || action_or_md == 2);
+  const bool check_md     = (action_or_md == 1 || action_or_md == 2);
+  //The "half power" approximations correspond to exponent -1/(2*inv_pow)
+  const int inv_pow2 = 2*inv_pow;
+
+  //Probe vector: Gaussian noise restricted to the odd checkerboard
+  FermionFieldD noise_odd(rbGrid);
+  FermionFieldD noise(Grid);
+  gaussian(rng, noise);
+  pickCheckerboard(Odd, noise_odd, noise);
+
+  numOp.ImportGauge(latt);
+  denOp.ImportGauge(latt);
+
+  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
+  typedef SchurDifferentiableOperator<FermionImplPolicyD> SchurOp;
+  SchurOp MdagM(numOp);
+  SchurOp VdagV(denOp);
+
+  PowerMethod<FermionFieldD> largest_eval;
+  RealD lambda_max;
+
+  std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " numerator" << std::endl;
+
+  lambda_max = largest_eval(MdagM, noise_odd);
+  std::cout << GridLogMessage << "Got lambda_max "<<lambda_max<<std::endl;
+
+  std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " denominator" << std::endl;
+  lambda_max = largest_eval(VdagV, noise_odd);
+  std::cout << GridLogMessage << "Got lambda_max "<<lambda_max<<std::endl;
+
+  if(check_action){
+    //use large tolerance to prevent exit on fail; we are trying to tune here!
+    std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+    InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM, noise_odd, rhmc.ApproxNegPowerAction);
+    std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+
+    std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow2 << std::endl;
+    InversePowerBoundsCheck(inv_pow2, 10000, 1e16, MdagM, noise_odd, rhmc.ApproxNegHalfPowerAction);
+    std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow2 << std::endl;
+
+    std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+    InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV, noise_odd, rhmc.ApproxNegPowerAction);
+    std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+
+    std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow2 << std::endl;
+    InversePowerBoundsCheck(inv_pow2, 10000, 1e16, VdagV, noise_odd, rhmc.ApproxNegHalfPowerAction);
+    std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow2 << std::endl;
+  }
+
+  std::cout << "-------------------------------------------------------------------------------" << std::endl;
+
+  if(check_md){
+    std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+    InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM, noise_odd, rhmc.ApproxNegPowerMD);
+    std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+
+    std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow2 << std::endl;
+    InversePowerBoundsCheck(inv_pow2, 10000, 1e16, MdagM, noise_odd, rhmc.ApproxNegHalfPowerMD);
+    std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow2 << std::endl;
+
+    std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+    InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV, noise_odd, rhmc.ApproxNegPowerMD);
+    std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+
+    std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow2 << std::endl;
+    InversePowerBoundsCheck(inv_pow2, 10000, 1e16, VdagV, noise_odd, rhmc.ApproxNegHalfPowerMD);
+    std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow2 << std::endl;
+  }
+}
+
+
+//Exercise an EOFA pseudofermion action once: draw a Gaussian field scaled by sqrt(1/2),
+//refresh the action with it on gauge field 'latt' and then evaluate the action.
+//Any diagnostics come from the action's own refresh/S implementation.
+template<typename FermionImplPolicy>
+void checkEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
+	       GridCartesian* FGrid, GridParallelRNG &rng, const LatticeGaugeFieldD &latt){
+  typedef typename FermionImplPolicy::FermionField FermionField;
+
+  std::cout << GridLogMessage << "Starting EOFA action/bounds check" << std::endl;
+
+  FermionField eta(FGrid);
+  const RealD scale = std::sqrt(0.5);
+  gaussian(rng,eta);
+  eta = eta * scale;
+
+  //Use the inbuilt check
+  EOFA.refresh(latt, eta);
+  EOFA.S(latt);
+
+  std::cout << GridLogMessage << "Finished EOFA upper action/bounds check" << std::endl;
+}
+
+
+//Adapter exposing EOFA.Meofa on a fixed gauge field through the LinearOperatorBase interface.
+//Only HermOp is implemented; every other entry point asserts.
+//Holds references to the action and the gauge field, so both must outlive this object.
+template<typename FermionImplPolicy>
+class EOFAlinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
+public:
+  typedef typename FermionImplPolicy::FermionField Field;
+  typedef ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> ActionType;
+
+private:
+  ActionType &action_;
+  LatticeGaugeFieldD &gauge_;
+
+public:
+  EOFAlinop(ActionType &action, LatticeGaugeFieldD &gauge): action_(action), gauge_(gauge){}
+
+  //Not supported by this adapter
+  void OpDiag (const Field &in, Field &out){ assert(0); }
+  void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); }
+  void Op     (const Field &in, Field &out){ assert(0); }
+  void AdjOp  (const Field &in, Field &out){ assert(0); }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
+
+  //out = Meofa applied to in on the stored gauge field
+  void HermOp(const Field &in, Field &out){
+    action_.Meofa(gauge_, in, out);
+  }
+};
+
+//Estimate the largest eigenvalue of the EOFA operator on gauge field 'latt' by running the power
+//method on EOFAlinop (i.e. on EOFA.Meofa) from a Gaussian starting vector, and print the result.
+template<typename FermionImplPolicy>
+void upperBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
+		    GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
+  typedef typename FermionImplPolicy::FermionField FermionField;
+
+  std::cout << GridLogMessage << "Starting EOFA upper bound compute" << std::endl;
+
+  EOFAlinop<FermionImplPolicy> meofa(EOFA, latt);
+
+  //Gaussian starting vector for the power iteration
+  FermionField start(FGrid);
+  gaussian(rng,start);
+
+  PowerMethod<FermionField> power_iteration;
+  auto largest = power_iteration(meofa,start);
+
+  std::cout << GridLogMessage << "Upper bound of EOFA operator " << largest << std::endl;
+}
+
+//Applications of M^{-1} cost the same as M for EOFA!
+//Adapter exposing EOFA.MeofaInv on a fixed gauge field through the LinearOperatorBase interface.
+//Only HermOp is implemented; every other entry point asserts.
+//Holds references to the action and the gauge field, so both must outlive this object.
+template<typename FermionImplPolicy>
+class EOFAinvLinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
+public:
+  typedef typename FermionImplPolicy::FermionField Field;
+  typedef ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> ActionType;
+
+private:
+  ActionType &action_;
+  LatticeGaugeFieldD &gauge_;
+
+public:
+  EOFAinvLinop(ActionType &action, LatticeGaugeFieldD &gauge): action_(action), gauge_(gauge){}
+
+  //Not supported by this adapter
+  void OpDiag (const Field &in, Field &out){ assert(0); }
+  void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); }
+  void Op     (const Field &in, Field &out){ assert(0); }
+  void AdjOp  (const Field &in, Field &out){ assert(0); }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
+
+  //out = MeofaInv applied to in on the stored gauge field
+  void HermOp(const Field &in, Field &out){
+    action_.MeofaInv(gauge_, in, out);
+  }
+};
+
+//Run the power method on the MeofaInv operator from a Gaussian random start vector and print the reciprocal of the result
+template<typename FermionImplPolicy>
+void lowerBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
+		    GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
+  typedef typename FermionImplPolicy::FermionField FermionField;
+  std::cout << GridLogMessage << "Starting EOFA lower bound compute using power method on M^{-1}. Inverse of highest eigenvalue is the lowest eigenvalue of M" << std::endl;
+  EOFAinvLinop<FermionImplPolicy> meofa_inv_op(EOFA, latt);
+  FermionField start_vec(FGrid); gaussian(rng,start_vec);
+  auto top_inv_eval = PowerMethod<FermionField>()(meofa_inv_op,start_vec);
+  std::cout << GridLogMessage << "Lower bound of EOFA operator " << 1./top_inv_eval << std::endl;
+}
+
+
+NAMESPACE_BEGIN(Grid);
+
+  //OperatorFunction wrapper: each call copies the double precision gauge links of FermOpD into FermOpF (and its
+  //checkerboards), then solves with MixedPrecisionConjugateGradient using the stored Schur operators LinOpF/LinOpD
+  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
+  class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
+  public:
+    typedef typename FermionOperatorD::FermionField FieldD;
+    typedef typename FermionOperatorF::FermionField FieldF;
+    using OperatorFunction<FieldD>::operator();
+
+    RealD   Tolerance;
+    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
+    Integer MaxInnerIterations;
+    Integer MaxOuterIterations;
+    GridBase* SinglePrecGrid4; //Grid for single-precision fields
+    GridBase* SinglePrecGrid5; //Grid for single-precision fields
+    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
+
+    FermionOperatorF &FermOpF;
+    FermionOperatorD &FermOpD;
+    SchurOperatorF &LinOpF;
+    SchurOperatorD &LinOpD;
+
+    //NOTE(review): the counters below are zeroed by the constructor but operator() in this wrapper never updates them
+    Integer TotalInnerIterations; //Number of inner CG iterations
+    Integer TotalOuterIterations; //Number of restarts
+    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
+
+    MixedPrecisionConjugateGradientOperatorFunction(RealD tol, 
+						    Integer maxinnerit, 
+						    Integer maxouterit, 
+						    GridBase* _sp_grid4, 
+						    GridBase* _sp_grid5, 
+						    FermionOperatorF &_FermOpF,
+						    FermionOperatorD &_FermOpD,
+						    SchurOperatorF   &_LinOpF,
+						    SchurOperatorD   &_LinOpD): 
+      Tolerance(tol), 
+      InnerTolerance(tol), 
+      MaxInnerIterations(maxinnerit), 
+      MaxOuterIterations(maxouterit), 
+      SinglePrecGrid4(_sp_grid4),
+      SinglePrecGrid5(_sp_grid5),
+      OuterLoopNormMult(100.),
+      FermOpF(_FermOpF),
+      FermOpD(_FermOpD),
+      LinOpF(_LinOpF),
+      LinOpD(_LinOpD),
+      TotalInnerIterations(0), TotalOuterIterations(0), TotalFinalStepIterations(0) //initialisers follow declaration order
+    {}
+
+    //Solve for psi given src. LinOpU is only used for the consistency check; the solve itself uses LinOpF/LinOpD
+    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
+      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
+      //Unchecked downcast: the caller must pass a SchurOperatorD; the assert verifies it wraps the same matrix as LinOpD
+      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
+      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+      //Refresh the single precision gauge links and their checkerboards from the current double precision links
+      precisionChange(FermOpF.Umu, FermOpD.Umu);
+      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
+      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Make a mixed precision conjugate gradient
+      ////////////////////////////////////////////////////////////////////////////////////
+      MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
+      MPCG.InnerTolerance = InnerTolerance; //NOTE(review): OuterLoopNormMult is not forwarded to MPCG here - confirm intended
+      std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
+      MPCG(src,psi);
+    }
+  };
+
+
+  //OperatorFunction wrapper: each call copies the double precision gauge links of FermOpD into FermOpF (and its
+  //checkerboards), then solves with ConjugateGradientReliableUpdate using the stored Schur operators LinOpF/LinOpD
+  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
+  class MixedPrecisionReliableUpdateConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
+  public:
+    typedef typename FermionOperatorD::FermionField FieldD;
+    typedef typename FermionOperatorF::FermionField FieldF;
+
+    using OperatorFunction<FieldD>::operator();
+
+    RealD Tolerance;
+    Integer MaxIterations;
+    RealD Delta; //reliable update parameter
+
+    GridBase* SinglePrecGrid4; //Grid for single-precision fields
+    GridBase* SinglePrecGrid5; //Grid for single-precision fields
+
+    FermionOperatorF &FermOpF;
+    FermionOperatorD &FermOpD;
+    SchurOperatorF &LinOpF;
+    SchurOperatorD &LinOpD;
+
+    MixedPrecisionReliableUpdateConjugateGradientOperatorFunction(RealD tol, 
+								  RealD delta,
+								  Integer maxit, 
+								  GridBase* _sp_grid4, 
+								  GridBase* _sp_grid5, 
+								  FermionOperatorF &_FermOpF,
+								  FermionOperatorD &_FermOpD,
+								  SchurOperatorF   &_LinOpF,
+								  SchurOperatorD   &_LinOpD): 
+      Tolerance(tol), 
+      MaxIterations(maxit), 
+      Delta(delta),
+      SinglePrecGrid4(_sp_grid4),
+      SinglePrecGrid5(_sp_grid5),
+      FermOpF(_FermOpF),
+      FermOpD(_FermOpD),
+      LinOpF(_LinOpF),
+      LinOpD(_LinOpD) //initialisers follow declaration order
+    {}
+
+    //Solve for psi given src. LinOpU is only used for the consistency check; the solve itself uses LinOpF/LinOpD
+    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
+      std::cout << GridLogMessage << " Mixed precision reliable CG update wrapper operator() "<<std::endl;
+
+      //Unchecked downcast: the caller must pass a SchurOperatorD; the assert verifies it wraps the same matrix as LinOpD
+      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
+      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+
+      //Refresh the single precision gauge links and their checkerboards from the current double precision links
+      precisionChange(FermOpF.Umu, FermOpD.Umu);
+      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
+      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
+
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Make a mixed precision reliable update conjugate gradient
+      ////////////////////////////////////////////////////////////////////////////////////
+      ConjugateGradientReliableUpdate<FieldD,FieldF> MPCG(Tolerance,MaxIterations,Delta,SinglePrecGrid5,LinOpF,LinOpD);
+      std::cout << GridLogMessage << "Calling mixed precision reliable update Conjugate Gradient" <<std::endl;
+      MPCG(src,psi);
+    }
+  };
+
+
+
+NAMESPACE_END(Grid);
+
+
+
+
+
+//HMC driver: reads the run parameters, assembles the light EOFA, strange RHMC, DSDR and gauge actions, then runs either a requested check/tuning mode or the HMC
+int main(int argc, char **argv) {
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  // here make a routine to print all the relevant information on the run
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+  std::string param_file = "params.xml";
+  bool file_load_check = false;
+  for(int i=1;i<argc;i++){
+    std::string sarg(argv[i]);
+    if(sarg == "--param_file"){
+      assert(i!=argc-1);
+      param_file = argv[i+1];
+    }else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
+      file_load_check = true;
+    }
+  }
+
+  //Read the user parameters
+  EvolParameters user_params;
+
+  if(fileExists(param_file)){
+    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
+    Grid::XmlReader rd(param_file);
+    read(rd, "Params", user_params);
+  }else{ //every rank must stop here; previously only rank 0 returned and the others ran on with default parameters
+    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
+    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
+    if(!GlobalSharedMemory::WorldRank){ //only the root rank writes the template file
+      Grid::XmlWriter wr(param_file + ".templ");
+      write(wr, "Params", user_params);
+    }
+    std::cout << GridLogMessage << " Done" << std::endl;
+    Grid_finalize();
+    return 0;
+  }
+
+  //Check the parameters
+  if(user_params.GparityDirs.size() != Nd-1){
+    std::cerr << "Error in input parameters: expect GparityDirs to have size = " << Nd-1 << std::endl;
+    exit(1);
+  }
+  for(int i=0;i<Nd-1;i++)
+    if(user_params.GparityDirs[i] != 0 && user_params.GparityDirs[i] != 1){
+      std::cerr << "Error in input parameters: expect GparityDirs values to be 0 (periodic) or 1 (G-parity)" << std::endl;
+      exit(1);
+    }
+
+  //Fermion action types in double (D) and single (F) precision
+  typedef GparityMobiusEOFAFermionD EOFAactionD;
+  typedef GparityMobiusFermionD FermionActionD;
+  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
+  typedef typename FermionActionD::FermionField FermionFieldD;
+
+  typedef GparityMobiusEOFAFermionF EOFAactionF;
+  typedef GparityMobiusFermionF FermionActionF;
+  typedef typename FermionActionF::Impl_t FermionImplPolicyF;
+  typedef typename FermionActionF::FermionField FermionFieldF;
+
+  typedef GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD,FermionImplPolicyF> MixedPrecRHMC;
+  typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicyD> DoublePrecRHMC;
+
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  IntegratorParameters MD;
+  typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
+  typedef HMCWrapper::ImplPolicy GaugeImplPolicy;
+  MD.name    = std::string("MinimumNorm2");
+  MD.MDsteps = user_params.Steps;
+  MD.trajL   = user_params.TrajectoryLength;
+
+  HMCparameters HMCparams;
+  HMCparams.StartTrajectory  = user_params.StartTrajectory;
+  HMCparams.Trajectories     = user_params.Trajectories;
+  HMCparams.NoMetropolisUntil= 0;
+  HMCparams.StartingType     = user_params.StartingType;
+  HMCparams.MetropolisTest = user_params.MetropolisTest;
+  HMCparams.MD = MD;
+  HMCWrapper TheHMC(HMCparams);
+
+  // Grid from the command line arguments --grid and --mpi
+  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
+
+  CheckpointerParameters CPparams;
+  CPparams.config_prefix = "ckpoint_lat";
+  CPparams.rng_prefix    = "ckpoint_rng";
+  CPparams.saveInterval  = user_params.SaveInterval;
+  CPparams.format        = "IEEE64BIG";
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  //Note that checkpointing saves the RNG state so that this initialization is required only for the very first configuration
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  typedef PlaquetteMod<GaugeImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+  //////////////////////////////////////////////
+
+  //aiming for ainv=2.068             me          Bob
+  //Estimated  a(ml+mres) [48ID] = 0.001048    0.00104 
+  //           a(mh+mres) [48ID] = 0.028847    0.02805
+  //Estimate Ls=12, b+c=2  mres~0.0003
+
+  const int Ls      = 12;
+  Real beta         = 1.946;
+  Real light_mass   = 0.00074;   //0.00104 - mres_approx;
+  Real strange_mass = 0.02775;    //0.02805 - mres_approx
+  Real pv_mass      = 1.0;
+  RealD M5  = 1.8;
+  RealD mobius_scale = 2.; //b+c
+
+  RealD mob_bmc = 1.0;
+  RealD mob_b = (mobius_scale + mob_bmc)/2.;
+  RealD mob_c = (mobius_scale - mob_bmc)/2.;
+
+  //Setup the Grids
+  auto UGridD   = TheHMC.Resources.GetCartesian();
+  auto UrbGridD = TheHMC.Resources.GetRBCartesian();
+  auto FGridD     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);
+  auto FrbGridD   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);
+
+  GridCartesian* UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
+  GridRedBlackCartesian* UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
+  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
+  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
+
+  ConjugateIwasakiGaugeActionD GaugeAction(beta);
+
+  // temporarily need a gauge field
+  LatticeGaugeFieldD Ud(UGridD);
+  LatticeGaugeFieldF Uf(UGridF);
+
+  //Setup the BCs
+  FermionActionD::ImplParams Params;
+  for(int i=0;i<Nd-1;i++) Params.twists[i] = user_params.GparityDirs[i]; //G-parity directions
+  Params.twists[Nd-1] = 1; //APBC in time direction
+
+  std::vector<int> dirs4(Nd);
+  for(int i=0;i<Nd-1;i++) dirs4[i] = user_params.GparityDirs[i];
+  dirs4[Nd-1] = 0; //periodic gauge BC in time
+
+  GaugeImplPolicy::setDirections(dirs4); //gauge BC
+
+  //Run optional gauge field checksum checker and exit
+  if(file_load_check){
+    TheHMC.initializeGaugeFieldAndRNGs(Ud);
+    std::cout << GridLogMessage << " Done" << std::endl;
+    Grid_finalize();
+    return 0;
+  }
+
+
+  ////////////////////////////////////
+  // Collect actions
+  ////////////////////////////////////
+  ActionLevel<HMCWrapper::Field> Level1(1); //light quark + strange quark
+  ActionLevel<HMCWrapper::Field> Level2(4); //DSDR
+  ActionLevel<HMCWrapper::Field> Level3(2); //gauge
+
+
+  /////////////////////////////////////////////////////////////
+  // Light EOFA action
+  // have to be careful with the parameters, cf. Test_dwf_gpforce_eofa.cc
+  /////////////////////////////////////////////////////////////
+  typedef SchurDiagMooeeOperator<EOFAactionD,FermionFieldD> EOFAschuropD;
+  typedef SchurDiagMooeeOperator<EOFAactionF,FermionFieldF> EOFAschuropF;
+  typedef ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction<FermionImplPolicyD, FermionImplPolicyF> EOFAmixPrecPFaction;
+  typedef MixedPrecisionConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_mxCG;
+  typedef MixedPrecisionReliableUpdateConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_relupCG;
+
+  std::vector<RealD> eofa_light_masses = { light_mass ,  0.004,   0.016,   0.064,   0.256    };
+  std::vector<RealD> eofa_pv_masses =    { 0.004       , 0.016,   0.064,   0.256,   1.0      };
+  const int n_light_hsb = 5; //const so that the array bound below is a constant expression rather than a VLA
+  assert(user_params.eofa_l.size() == static_cast<size_t>(n_light_hsb));
+
+  EOFAmixPrecPFaction* EOFA_pfactions[n_light_hsb];
+
+  for(int i=0;i<n_light_hsb;i++){
+    RealD iml = eofa_light_masses[i];
+    RealD ipv = eofa_pv_masses[i];
+
+    EOFAactionD* LopD = new EOFAactionD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, iml, iml, ipv, 0.0, -1, M5, mob_b, mob_c, Params);
+    EOFAactionF* LopF = new EOFAactionF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, iml, iml, ipv, 0.0, -1, M5, mob_b, mob_c, Params);
+    EOFAactionD* RopD = new EOFAactionD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, ipv, iml, ipv, -1.0, 1, M5, mob_b, mob_c, Params);
+    EOFAactionF* RopF = new EOFAactionF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, ipv, iml, ipv, -1.0, 1, M5, mob_b, mob_c, Params);
+
+    EOFAschuropD* linopL_D = new EOFAschuropD(*LopD);
+    EOFAschuropD* linopR_D = new EOFAschuropD(*RopD);
+
+    EOFAschuropF* linopL_F = new EOFAschuropF(*LopF);
+    EOFAschuropF* linopR_F = new EOFAschuropF(*RopF);
+
+#if 1
+    //Note reusing user_params.eofa_l.action(|md)_mixcg_inner_tolerance  as Delta for now
+    EOFA_relupCG* ActionMCG_L = new EOFA_relupCG(user_params.eofa_l[i].action_tolerance, user_params.eofa_l[i].action_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
+    EOFA_relupCG* ActionMCG_R = new EOFA_relupCG(user_params.eofa_l[i].action_tolerance, user_params.eofa_l[i].action_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
+
+    EOFA_relupCG* DerivMCG_L = new EOFA_relupCG(user_params.eofa_l[i].md_tolerance, user_params.eofa_l[i].md_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
+    EOFA_relupCG* DerivMCG_R = new EOFA_relupCG(user_params.eofa_l[i].md_tolerance, user_params.eofa_l[i].md_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
+
+#else
+
+    EOFA_mxCG* ActionMCG_L = new EOFA_mxCG(user_params.eofa_l[i].action_tolerance, 10000, 1000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
+    ActionMCG_L->InnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance;
+
+    EOFA_mxCG* ActionMCG_R = new EOFA_mxCG(user_params.eofa_l[i].action_tolerance, 10000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
+    ActionMCG_R->InnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance;
+
+    EOFA_mxCG* DerivMCG_L = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 10000, 1000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
+    DerivMCG_L->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance;
+
+    EOFA_mxCG* DerivMCG_R = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 10000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
+    DerivMCG_R->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance;
+
+    std::cout << GridLogMessage << "Set EOFA action solver action tolerance outer=" << ActionMCG_L->Tolerance << " inner=" << ActionMCG_L->InnerTolerance << std::endl;
+    std::cout << GridLogMessage << "Set EOFA MD solver tolerance outer=" << DerivMCG_L->Tolerance << " inner=" << DerivMCG_L->InnerTolerance << std::endl;
+#endif
+
+    //The operators and solvers allocated above are deliberately never freed: the action keeps using them until exit
+    EOFAmixPrecPFaction* EOFA = new EOFAmixPrecPFaction(*LopF, *RopF,
+							*LopD, *RopD, 
+							*ActionMCG_L, *ActionMCG_R, 
+							*ActionMCG_L, *ActionMCG_R, 
+							*DerivMCG_L, *DerivMCG_R, 
+							user_params.eofa_l[i].rat_params, true);
+    EOFA_pfactions[i] = EOFA;
+    Level1.push_back(EOFA);
+  }
+
+  ////////////////////////////////////
+  // Strange action
+  ////////////////////////////////////
+  FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD,strange_mass,M5,mob_b,mob_c,Params);
+  FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD, pv_mass,M5,mob_b,mob_c,Params);
+
+  FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF,strange_mass,M5,mob_b,mob_c,Params);
+  FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF, pv_mass,M5,mob_b,mob_c,Params);
+
+  RationalActionParams rat_act_params_s;
+  rat_act_params_s.inv_pow  = 4; // (M^dag M)^{1/4}
+  rat_act_params_s.precision= 60;
+  rat_act_params_s.MaxIter  = 10000;
+  user_params.rat_quo_s.Export(rat_act_params_s);
+  std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_s.BoundsCheckFreq << " trajectories (avg)" << std::endl;
+
+  //MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq); 
+  DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s); 
+  Level1.push_back(&Quotient_s);  
+
+  ///////////////////////////////////
+  // DSDR action
+  ///////////////////////////////////
+  RealD dsdr_mass=-1.8;   
+  //Use same DSDR twists as https://arxiv.org/pdf/1208.4412.pdf
+  RealD dsdr_epsilon_f = 0.02; //numerator (in determinant)
+  RealD dsdr_epsilon_b = 0.5; 
+  GparityWilsonTMFermionD Numerator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_f, Params);
+  GparityWilsonTMFermionF Numerator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_f, Params);
+
+  GparityWilsonTMFermionD Denominator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_b, Params);
+  GparityWilsonTMFermionF Denominator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_b, Params);
+
+  RationalActionParams rat_act_params_DSDR;
+  rat_act_params_DSDR.inv_pow  = 2; // (M^dag M)^{1/2}
+  rat_act_params_DSDR.precision= 60;
+  rat_act_params_DSDR.MaxIter  = 10000;
+  user_params.rat_quo_DSDR.Export(rat_act_params_DSDR);
+  std::cout << GridLogMessage << "DSDR quark bounds check every " << rat_act_params_DSDR.BoundsCheckFreq << " trajectories (avg)" << std::endl;
+
+  DoublePrecRHMC Quotient_DSDR(Denominator_DSDR_D, Numerator_DSDR_D, rat_act_params_DSDR);
+  Level2.push_back(&Quotient_DSDR);
+
+  /////////////////////////////////////////////////////////////
+  // Gauge action
+  /////////////////////////////////////////////////////////////
+  Level3.push_back(&GaugeAction);
+
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+  TheHMC.TheAction.push_back(Level3);
+  std::cout << GridLogMessage << " Action complete "<< std::endl;
+
+
+  //Action tuning
+  bool 
+    tune_rhmc_s=false, eigenrange_s=false, 
+    tune_rhmc_DSDR=false, eigenrange_DSDR=false, 
+    check_eofa=false, 
+    upper_bound_eofa=false, lower_bound_eofa=false;
+
+  std::string lanc_params_s;
+  std::string lanc_params_DSDR;
+  int tune_rhmc_s_action_or_md = 0;
+  int tune_rhmc_DSDR_action_or_md = 0;
+  int eofa_which_hsb = 0; //shared by --check_eofa, --upper_bound_eofa and --lower_bound_eofa: the last one parsed wins
+
+  for(int i=1;i<argc;i++){
+    std::string sarg(argv[i]);
+    if(sarg == "--tune_rhmc_s"){
+      assert(i < argc-1);
+      tune_rhmc_s=true;
+      tune_rhmc_s_action_or_md = std::stoi(argv[i+1]);
+    }
+    else if(sarg == "--eigenrange_s"){
+      assert(i < argc-1);
+      eigenrange_s=true;
+      lanc_params_s = argv[i+1];
+    }
+    else if(sarg == "--tune_rhmc_DSDR"){
+      assert(i < argc-1);
+      tune_rhmc_DSDR=true;
+      tune_rhmc_DSDR_action_or_md = std::stoi(argv[i+1]);
+    }
+    else if(sarg == "--eigenrange_DSDR"){
+      assert(i < argc-1);
+      eigenrange_DSDR=true;
+      lanc_params_DSDR = argv[i+1];
+    }
+    else if(sarg == "--check_eofa"){
+      assert(i < argc-1);
+      check_eofa = true;
+      eofa_which_hsb = std::stoi(argv[i+1]); //-1 indicates all hasenbusch
+      assert(eofa_which_hsb == -1 || (eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb) );
+    }
+    else if(sarg == "--upper_bound_eofa"){
+      assert(i < argc-1);
+      upper_bound_eofa = true;
+      eofa_which_hsb = std::stoi(argv[i+1]);
+      assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
+    }
+    else if(sarg == "--lower_bound_eofa"){
+      assert(i < argc-1);
+      lower_bound_eofa = true;      
+      eofa_which_hsb = std::stoi(argv[i+1]);
+      assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
+    }
+  }
+  if(tune_rhmc_s || eigenrange_s || tune_rhmc_DSDR || eigenrange_DSDR ||check_eofa || upper_bound_eofa || lower_bound_eofa) {
+    std::cout << GridLogMessage << "Running checks" << std::endl;
+    TheHMC.initializeGaugeFieldAndRNGs(Ud);
+
+    //std::cout << GridLogMessage << "EOFA action solver action tolerance outer=" << ActionMCG_L.Tolerance << " inner=" << ActionMCG_L.InnerTolerance << std::endl;
+    //std::cout << GridLogMessage << "EOFA MD solver tolerance outer=" << DerivMCG_L.Tolerance << " inner=" << DerivMCG_L.InnerTolerance << std::endl;
+
+
+    if(check_eofa){
+      if(eofa_which_hsb >= 0){
+        std::cout << GridLogMessage << "Starting checking EOFA Hasenbusch " << eofa_which_hsb << std::endl;
+        checkEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
+        std::cout << GridLogMessage << "Finished checking EOFA Hasenbusch " << eofa_which_hsb << std::endl;
+      }else{
+        for(int i=0;i<n_light_hsb;i++){
+          std::cout << GridLogMessage << "Starting checking EOFA Hasenbusch " << i << std::endl;
+          checkEOFA(*EOFA_pfactions[i], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
+          std::cout << GridLogMessage << "Finished checking EOFA Hasenbusch " << i << std::endl;
+        }
+      }
+    }	  
+    if(upper_bound_eofa){ assert(eofa_which_hsb >= 0); upperBoundEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud); } //guards against a later --check_eofa -1
+    if(lower_bound_eofa){ assert(eofa_which_hsb >= 0); lowerBoundEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud); } //guards against a later --check_eofa -1
+    if(eigenrange_s) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG());
+    if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_s)>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange",  tune_rhmc_s_action_or_md);
+    if(eigenrange_DSDR) computeEigenvalues<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField>(lanc_params_DSDR, UGridD, UrbGridD, Ud, Numerator_DSDR_D, TheHMC.Resources.GetParallelRNG());
+    if(tune_rhmc_DSDR) checkRHMC<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField, decltype(Quotient_DSDR)>(UGridD, UrbGridD, Ud, Numerator_DSDR_D, Denominator_DSDR_D, Quotient_DSDR, TheHMC.Resources.GetParallelRNG(), 2, "DSDR", tune_rhmc_DSDR_action_or_md);
+
+
+    std::cout << GridLogMessage << " Done" << std::endl;
+    Grid_finalize();
+    return 0;
+  }
+
+
+  //Run the HMC
+  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
+  TheHMC.Run();
+
+  std::cout << GridLogMessage << " Done" << std::endl;
+  Grid_finalize();
+  return 0;
+} // main
--- a/benchmarks/Benchmark_IO.cc
+++ b/benchmarks/Benchmark_IO.cc
@@ -137,7 +137,7 @@ int main (int argc, char ** argv)

  Eigen::MatrixXd mean(nVol, 4), stdDev(nVol, 4), rob(nVol, 4);
  Eigen::VectorXd avMean(4), avStdDev(4), avRob(4);
-  double          n = BENCH_IO_NPASS;
+  //  double          n = BENCH_IO_NPASS;

  stats(mean, stdDev, perf);
  stats(avMean, avStdDev, avPerf);
@@ -164,7 +164,7 @@ int main (int argc, char ** argv)
                mean(volInd(l), gWrite), stdDev(volInd(l), gWrite));
  }
  MSG << std::endl;
-  MSG << "Robustness of individual results, in \%. (rob = 100\% - std dev / mean)" << std::endl;
+  MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)" << std::endl;
  MSG << std::endl;
  grid_printf("%4s %12s %12s %12s %12s\n",
              "L", "std read", "std write", "Grid read", "Grid write");
@@ -185,7 +185,7 @@ int main (int argc, char ** argv)
              avMean(sRead), avStdDev(sRead), avMean(sWrite), avStdDev(sWrite),
              avMean(gRead), avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
  MSG << std::endl;
-  MSG << "Robustness of volume-averaged results, in \%. (rob = 100\% - std dev / mean)" << std::endl;
+  MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)" << std::endl;
  MSG << std::endl;
  grid_printf("%12s %12s %12s %12s\n",
              "std read", "std write", "Grid read", "Grid write");
--- a/benchmarks/Benchmark_ITT.cc
+++ b/benchmarks/Benchmark_ITT.cc
@@ -142,7 +142,7 @@ public:
 	  //	  bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	}

-	int ncomm;
+	//	int ncomm;
 	double dbytes;

        for(int dir=0;dir<8;dir++) {
@@ -290,7 +290,7 @@ public:
      LatticeSU4 z(&Grid); z=Zero();
      LatticeSU4 x(&Grid); x=Zero();
      LatticeSU4 y(&Grid); y=Zero();
-      double a=2.0;
+      //      double a=2.0;

      uint64_t Nloop=NLOOP;

--- a/benchmarks/Benchmark_comms_host_device.cc
+++ b/benchmarks/Benchmark_comms_host_device.cc
@@ -72,7 +72,7 @@ int main (int argc, char ** argv)

  std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
  std::vector<double> t_time(Nloop);
-  time_statistics timestat;
+  //  time_statistics timestat;

  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange from host memory "<<std::endl;
--- a/benchmarks/Benchmark_dwf_fp32.cc
+++ b/benchmarks/Benchmark_dwf_fp32.cc
@@ -126,19 +126,10 @@ int main (int argc, char ** argv)
  // Naive wilson implementation
  ////////////////////////////////////
  // replicate across fifth dimension
-  LatticeGaugeFieldF Umu5d(FGrid);
-  std::vector<LatticeColourMatrixF> U(4,FGrid);
-  {
-    autoView( Umu5d_v, Umu5d, CpuWrite);
-    autoView( Umu_v  , Umu  , CpuRead);
-    for(int ss=0;ss<Umu.Grid()->oSites();ss++){
-      for(int s=0;s<Ls;s++){
-	Umu5d_v[Ls*ss+s] = Umu_v[ss];
-      }
-    }
-  }
+  //  LatticeGaugeFieldF Umu5d(FGrid);
+  std::vector<LatticeColourMatrixF> U(4,UGrid);
  for(int mu=0;mu<Nd;mu++){
-    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
+    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
  }
  std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;

@@ -147,10 +138,28 @@ int main (int argc, char ** argv)
    ref = Zero();
    for(int mu=0;mu<Nd;mu++){

-      tmp = U[mu]*Cshift(src,mu+1,1);
+      tmp = Cshift(src,mu+1,1);
+      {
+	autoView( tmp_v  , tmp  , CpuWrite);
+	autoView( U_v  , U[mu]  , CpuRead);
+	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
+	  for(int s=0;s<Ls;s++){
+	    tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
+	  }
+	}
+      }
      ref=ref + tmp - Gamma(Gmu[mu])*tmp;

-      tmp =adj(U[mu])*src;
+      {
+	autoView( tmp_v  , tmp  , CpuWrite);
+	autoView( U_v  , U[mu]  , CpuRead);
+	autoView( src_v, src    , CpuRead);
+	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
+	  for(int s=0;s<Ls;s++){
+	    tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
+	  }
+	}
+      }
      tmp =Cshift(tmp,mu+1,-1);
      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
    }
@@ -182,7 +191,7 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;

  DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-  int ncall =3000;
+  int ncall =300;

  if (1) {
    FGrid->Barrier();
@@ -242,16 +251,30 @@ int main (int argc, char ** argv)
    for(int mu=0;mu<Nd;mu++){

      //    ref =  src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
-      tmp = U[mu]*Cshift(src,mu+1,1);
+      tmp = Cshift(src,mu+1,1);
      {
 	autoView( ref_v, ref, CpuWrite);
 	autoView( tmp_v, tmp, CpuRead);
-	for(int i=0;i<ref_v.size();i++){
-	  ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
+	autoView( U_v  , U[mu]  , CpuRead);
+	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
+	  for(int s=0;s<Ls;s++){
+	    int i=s+Ls*ss;
+	    ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); ;
+	  }
 	}
      }
      
-      tmp =adj(U[mu])*src;
+      {
+	autoView( tmp_v  , tmp  , CpuWrite);
+	autoView( U_v  , U[mu]  , CpuRead);
+	autoView( src_v, src    , CpuRead);
+	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
+	  for(int s=0;s<Ls;s++){
+	    tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
+	  }
+	}
+      }
+      //      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu+1,-1);
      {
 	autoView( ref_v, ref, CpuWrite);
--- a/benchmarks/Benchmark_memory_bandwidth.cc
+++ b/benchmarks/Benchmark_memory_bandwidth.cc
@@ -184,8 +184,10 @@ int main (int argc, char ** argv)
      
      double bytes=1.0*vol*Nvec*sizeof(Real);
      double flops=vol*Nvec*2;// mul,add
-      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"<<(stop-start)/1000./1000.<< "\t\t " <<std::endl;
-
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"
+	       <<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"
+	       <<(stop-start)/1000./1000.<< "\t\t " <<std::endl;
+      assert(nn==nn);
  }    

  Grid_finalize();
--- a/examples/Example_Laplacian_solver.cc
+++ b/examples/Example_Laplacian_solver.cc
@@ -4,7 +4,7 @@ using namespace Grid;
 template<class Field>
 void SimpleConjugateGradient(LinearOperatorBase<Field> &HPDop,const Field &b, Field &x)
 {
-    RealD cp, c, alpha, d, beta, ssq, qq;
+    RealD cp, c, alpha, d, beta, ssq;
    RealD Tolerance=1.0e-10;
    int MaxIterations=10000;
    
--- a/examples/Example_wall_wall_3pt.cc
+++ b/examples/Example_wall_wall_3pt.cc
@@ -0,0 +1,539 @@
+/*
+ * Warning: This code illustrative only: not well tested, and not meant for production use
+ * without regression / tests being applied
+ */
+
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+typedef SpinColourMatrix Propagator;
+typedef SpinColourVector Fermion;
+typedef PeriodicGimplR   GimplR;
+
+template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
+{ // Gauge-covariant Laplacian over directions 0..Nd-2, built from the Gimpl covariant shifts
+public:
+  INHERIT_GIMPL_TYPES(Gimpl);
+  // grid: grid of the gauge field given to the constructor; U: a copy of that gauge field
+  GridBase *grid;
+  GaugeField U;
+  
+  CovariantLaplacianCshift(GaugeField &_U)    :
+    grid(_U.Grid()),
+    U(_U) {  };
+
+  virtual GridBase *Grid(void) { return grid; };
+  // out = sum_{mu<Nd-1} [ 2*in - CovShiftForward(Umu,mu,in) - CovShiftBackward(Umu,mu,in) ]
+  virtual void  M    (const Field &in, Field &out)
+  {
+    out=Zero();
+    for(int mu=0;mu<Nd-1;mu++) { // direction Nd-1 is skipped -- presumably time, TODO confirm
+      GaugeLinkField Umu = PeekIndex<LorentzIndex>(U, mu); // NB: Inefficient -- the link is re-extracted on every call
+      out = out - Gimpl::CovShiftForward(Umu,mu,in);    
+      out = out - Gimpl::CovShiftBackward(Umu,mu,in);    
+      out = out + 2.0*in;
+    }
+  };
+  virtual void  Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian
+  virtual  void Mdiag    (const Field &in, Field &out)                  {assert(0);}; // Unimplemented need only for multigrid
+  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid
+  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)     {assert(0);}; // Unimplemented need only for multigrid
+};
+
+void MakePhase(Coordinate mom,LatticeComplex &phase)
+{
+  // Fill phase(x) = exp( i * sum_mu (2 pi / L_mu) * mom[mu] * x_mu ) on the grid of `phase`
+  GridBase *lattice = phase.Grid();
+  auto dims = lattice->GlobalDimensions();
+  ComplexD imag_unit(0.0,1.0);
+  phase=Zero();
+  LatticeComplex x_mu(lattice);
+  for(int mu=0;mu<Nd;mu++){
+    RealD TwoPiL =  M_PI * 2.0/ dims[mu];
+    LatticeCoordinate(x_mu,mu);
+    phase = phase + (TwoPiL * mom[mu]) * x_mu;
+  }
+  phase = exp(phase*imag_unit);
+}
+void LinkSmear(int nstep, RealD rho,LatticeGaugeField &Uin,LatticeGaugeField &Usmr)
+{ // Apply nstep successive Smear_Stout(rho) smearings to Uin; the result is returned in Usmr
+  Smear_Stout<GimplR> Stout(rho);
+  LatticeGaugeField Utmp(Uin.Grid());
+  Usmr = Uin; // nstep<=0 now yields the unsmeared links instead of leaving Usmr unassigned
+  for(int i=0;i<nstep;i++){
+    Utmp = Usmr;            // input of this step is the output of the previous one
+    Stout.smear(Usmr,Utmp);
+  }
+}
+void PointSource(Coordinate &coor,LatticePropagator &source)
+{
+  // Delta-function source: unit spin-colour matrix poked at site `coor`, zero elsewhere
+  SpinColourMatrix unit;
+  unit=1.0;
+  source=Zero(); pokeSite(unit,source,coor);
+}
+void GFWallSource(int tslice,LatticePropagator &source)
+{
+  // Uniform wall: unit spin-colour matrix on every site whose Tdir coordinate equals tslice, zero elsewhere
+  GridBase *lattice = source.Grid();
+  LatticeComplex mask(lattice);      mask = ComplexD(1.0,0.0);
+  LatticeComplex zero(lattice);      zero=Zero();
+  LatticeInteger tcoor(lattice);     LatticeCoordinate(tcoor,Tdir);
+  mask = where(tcoor==Integer(tslice), mask, zero);
+  source = 1.0;
+  source = source * mask;
+}
+
+void Z2WallSource(GridParallelRNG &RNG,int tslice,LatticePropagator &source)
+{
+  GridBase *grid = source.Grid();
+  LatticeComplex noise(grid);
+  LatticeComplex zz(grid); zz=Zero();
+  LatticeInteger t(grid);
+  // Noise wall on timeslice tslice: one random complex number per site times the unit spin-colour matrix
+  RealD nrm=1.0/sqrt(2);
+  bernoulli(RNG, noise); // 0,1 50:50
+  // Map {0,1} -> {-1,+1}/sqrt(2); assumes bernoulli fills real and imaginary parts independently -- TODO confirm
+  noise = (2.*noise - Complex(1,1))*nrm;
+  // Keep the noise only where the Tdir coordinate equals tslice
+  LatticeCoordinate(t,Tdir);
+  noise = where(t==Integer(tslice), noise, zz);
+  // source = identity * noise(x); its norm2 is printed as a sanity check
+  source = 1.0;
+  source = source*noise;
+  std::cout << " Z2 wall " << norm2(source) << std::endl;
+}
+void GaugeFix(LatticeGaugeField &U,LatticeGaugeField &Ufix)
+{
+  Real alpha=0.05;
+  // Gauge-fix a copy of U into Ufix; the average plaquette is printed before and after
+  Real plaq=WilsonLoops<GimplR>::avgPlaquette(U);
+
+  std::cout << " Initial plaquette "<<plaq << std::endl;
+  // xform is passed to the fixer but not used afterwards in this function
+  LatticeColourMatrix   xform(U.Grid()); 
+  Ufix = U;
+  int orthog=Nd-1; // direction handed to the fixer as `orthog` (presumably Coulomb-type fixing -- TODO confirm)
+  FourierAcceleratedGaugeFixer<GimplR>::SteepestDescentGaugeFix(Ufix,xform,alpha,100000,1.0e-14, 1.0e-14,true,orthog);
+  
+  plaq=WilsonLoops<GimplR>::avgPlaquette(Ufix);
+
+  std::cout << " Final plaquette "<<plaq << std::endl;
+}
+template<class Field>
+void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
+{
+  // Iterated smearing: smeared = (1 - step*Laplacian)^nsteps unsmeared, i.e. chi = (1-p^2/2N)^N kronecker
+  Integer nsteps = 40;
+  Real sigma = 2.0;
+  Real step = (sigma*sigma) / Real(4*nsteps);
+
+  CovariantLaplacianCshift <GimplR,Field> Lap(U);
+  Field lap_psi(U.Grid());
+  smeared=unsmeared;
+  int iter = 0;
+  while ( iter < nsteps ) {
+    Lap.M(smeared,lap_psi);
+    smeared = smeared - step*lap_psi;
+    std::cout << " smear iter " << iter<<" " <<norm2(smeared)<<std::endl;
+    ++iter;
+  }
+}
+void GaussianSource(Coordinate &site,LatticeGaugeField &U,LatticePropagator &source)
+{
+  // Point source at `site`, then Gaussian-smeared into `source` via a scratch copy
+  PointSource(site,source);
+  std::cout << " GaussianSource Kronecker "<< norm2(source)<<std::endl;
+  LatticePropagator point(source.Grid()); point = source;
+  GaussianSmear(U,point,source);
+  std::cout << " GaussianSource Smeared "<< norm2(source)<<std::endl;
+}
+void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,LatticePropagator &source)
+{
+  // Z2 noise wall on timeslice tslice, followed by Gaussian smearing of a copy back into `source`
+  Z2WallSource(RNG,tslice,source);
+  LatticePropagator unsmeared = source; GaussianSmear(U,unsmeared,source);
+}
+void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
+{
+  // source(x) = spectator(x) * phase(mom,x) on timeslice tslice, zero on every other timeslice
+  assert(mom.size()==Nd);
+  assert(mom[Tdir] == 0); // no momentum component along Tdir is allowed
+
+  GridBase * lattice = spectator.Grid();
+  LatticeInteger tcoor(lattice);
+  LatticeCoordinate(tcoor,Tdir);
+
+  source = Zero();
+  source = where(tcoor==Integer(tslice),spectator,source);
+
+  LatticeComplex momentum_phase(lattice);
+  MakePhase(mom,momentum_phase);
+  source = source *momentum_phase;
+}
+template<class Action>
+void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
+{
+  GridBase *UGrid = D.GaugeGrid();
+  GridBase *FGrid = D.FermionGrid();
+  // Solve D for each spin-colour column of `source`; the 4d solutions are assembled into `propagator`
+  LatticeFermion src4  (UGrid); src4 = Zero();
+  LatticeFermion src5  (FGrid); 
+  LatticeFermion result5(FGrid);
+  LatticeFermion result4(UGrid);
+  // Red-black Schur-preconditioned CG: tolerance 1e-12, at most 100000 iterations, zero initial guess
+  ConjugateGradient<LatticeFermion> CG(1.0e-12,100000);
+  SchurRedBlackDiagTwoSolve<LatticeFermion> schur(CG);
+  ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
+  for(int s=0;s<Ns;s++){ // spin index: bounded by Ns (spin components), not Nd (space-time dimensions)
+    for(int c=0;c<Nc;c++){
+      PropToFerm<Action>(src4,source,s,c);
+
+      D.ImportPhysicalFermionSource(src4,src5);
+
+      result5=Zero();
+      schur(D,src5,result5,ZG);
+      std::cout<<GridLogMessage
+	       <<"spin "<<s<<" color "<<c
+	       <<" norm2(src5d) "   <<norm2(src5)
+               <<" norm2(result5d) "<<norm2(result5)<<std::endl;
+
+      D.ExportPhysicalFermionSolution(result5,result4);
+
+      FermToProp<Action>(propagator,result4,s,c);
+    }
+  }
+}
+
+class MesonFile: Serializable { // Serializable record holding correlator data
+public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector<std::vector<Complex> >, data); // the functions in this file push one vector per channel, indexed by t
+};
+
+void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
+{
+  const int nchannel=4;
+  Gamma::Algebra Gammas[nchannel][2] = {
+    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::Gamma5},
+    {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::GammaTGamma5},
+    {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::Gamma5},
+    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::GammaTGamma5}
+  };
+  // Each row above is a {source gamma, sink gamma} pair; NOTE: the `phase` argument is not used in this function
+  Gamma G5(Gamma::Algebra::Gamma5);
+
+  LatticeComplex meson_CF(q1.Grid());
+  MesonFile MF;
+
+  for(int ch=0;ch<nchannel;ch++){
+
+    Gamma Gsrc(Gammas[ch][0]);
+    Gamma Gsnk(Gammas[ch][1]);
+    // Site-local contraction; q1 enters as G5*adj(q1)*G5
+    meson_CF = trace(G5*adj(q1)*G5*Gsnk*q2*adj(Gsrc));
+
+    std::vector<TComplex> meson_T;
+    sliceSum(meson_CF,meson_T, Tdir);
+    // meson_T has one entry per slice along Tdir after the slice sum
+    int nt=meson_T.size();
+
+    std::vector<Complex> corr(nt);
+    for(int t=0;t<nt;t++){
+      corr[t] = TensorRemove(meson_T[t]); // Yes this is ugly, not figured a work around
+      std::cout << " channel "<<ch<<" t "<<t<<" " <<corr[t]<<std::endl;
+    }
+    MF.data.push_back(corr);
+  }
+  // Write all channels to `file` as XML under the tag "MesonFile"
+  {
+    XmlWriter WR(file);
+    write(WR,"MesonFile",MF);
+  }
+}
+
+void Meson3pt(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
+{
+  // Contract q1 (as G5*adj(q1)*G5) with q2 for a Gamma5 source and GammaX/Y/Z/T sinks; `phase` is not used
+  const int nchannel=4;
+  Gamma::Algebra Gammas[nchannel][2] = {
+    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::GammaX},
+    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::GammaY},
+    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::GammaZ},
+    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::GammaT}
+  };
+  Gamma G5(Gamma::Algebra::Gamma5);
+  LatticeComplex contraction(q1.Grid());
+  MesonFile result;
+
+  for(int ch=0;ch<nchannel;ch++){
+    Gamma Gsrc(Gammas[ch][0]);
+    Gamma Gsnk(Gammas[ch][1]);
+
+    contraction = trace(G5*adj(q1)*G5*Gsnk*q2*adj(Gsrc));
+
+    // Sum the site-local contraction over each slice along Tdir
+    std::vector<TComplex> per_t;
+    sliceSum(contraction,per_t, Tdir);
+
+    int nt=per_t.size();
+    std::vector<Complex> corr;
+    corr.reserve(nt);
+    for(int t=0;t<nt;t++){
+      corr.push_back(TensorRemove(per_t[t]));
+      std::cout << " channel "<<ch<<" t "<<t<<" " <<corr[t]<<std::endl;
+    }
+    result.data.push_back(corr);
+  }
+
+  // Serialise all channels to `file` as XML under the tag "MesonFile"
+  {
+    XmlWriter WR(file);
+    write(WR,"MesonFile",result);
+  }
+}
+
+
+void WallSinkMesonTrace(std::string file,std::vector<Propagator> &q1,std::vector<Propagator> &q2)
+{
+  const int nchannel=4;
+  Gamma::Algebra Gammas[nchannel][2] = {
+    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::Gamma5},
+    {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::GammaTGamma5},
+    {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::Gamma5},
+    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::GammaTGamma5}
+  };
+  // q1,q2 hold one propagator matrix per entry t; each channel's correlator is written to `file` as XML
+  Gamma G5(Gamma::Algebra::Gamma5);
+  int nt=q1.size();
+  std::vector<Complex> meson_CF(nt);
+  MesonFile MF;
+  assert(q2.size()==q1.size()); // q2[t] is read for every t<nt below, so the two inputs must match in length
+  for(int ch=0;ch<nchannel;ch++){
+
+    Gamma Gsrc(Gammas[ch][0]);
+    Gamma Gsnk(Gammas[ch][1]);
+
+    std::vector<Complex> corr(nt);
+    for(int t=0;t<nt;t++){
+      meson_CF[t] = trace(G5*adj(q1[t])*G5*Gsnk*q2[t]*adj(Gsrc));
+      corr[t] = TensorRemove(meson_CF[t]); // Yes this is ugly, not figured a work around
+      std::cout << " channel "<<ch<<" t "<<t<<" " <<corr[t]<<std::endl;
+    }
+    MF.data.push_back(corr);
+  }
+
+  {
+    XmlWriter WR(file);
+    write(WR,"MesonFile",MF);
+  }
+}
+int make_idx(int p, int m,int nmom)
+{ // Flat propagator index: mass 0 occupies slots 0..nmom-1 (one per momentum), mass m>0 sits at nmom+m-1
+  const bool lightest = (m==0);
+  if (!lightest) assert(p==0); // only mass 0 carries a momentum index
+  return lightest ? p : nmom + m - 1;
+}
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+  // Example driver: Mobius propagators from Z2 and uniform wall sources, then two-point and sequential three-point contractions
+  // Double precision grids
+  auto latt = GridDefaultLatt();
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), 
+								   GridDefaultSimd(Nd,vComplex::Nsimd()),
+								   GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+
+
+  LatticeGaugeField Umu(UGrid);
+  LatticeGaugeField Utmp(UGrid);
+  LatticeGaugeField Usmr(UGrid);
+  std::string config;
+  if( argc > 1 && argv[1][0] != '-' )
+  {
+    std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl;
+    FieldMetaData header;
+    NerscIO::readConfiguration(Umu, header, argv[1]);
+    config=argv[1];
+  }
+  else
+  {
+    std::cout<<GridLogMessage <<"Using cold configuration"<<std::endl;
+    SU<Nc>::ColdConfiguration(Umu);
+    config="ColdConfig";
+  }
+  //  GaugeFix(Umu,Utmp);
+  //  Umu=Utmp;
+
+  int nsmr=3;
+  RealD rho=0.1;
+  LinkSmear(nsmr,rho,Umu,Usmr);
+
+  // Per-flavour parameters; entry m of every vector below describes the same quark
+  std::vector<int>   smeared_link({ 0,0,1} ); 
+  std::vector<RealD> masses({ 0.004,0.02477,0.447} ); // u/d, s, c ??
+  std::vector<RealD> M5s   ({ 1.8,1.8,1.0} ); 
+  std::vector<RealD> bs   ({ 1.0,1.0,1.5} );  // DDM
+  std::vector<RealD> cs   ({ 0.0,0.0,0.5} );  // DDM
+  std::vector<int>   Ls_s ({ 16,16,12} );
+  std::vector<GridCartesian *> FGrids;
+  std::vector<GridRedBlackCartesian *> FrbGrids;
+
+  std::vector<Coordinate> momenta;
+  momenta.push_back(Coordinate({0,0,0,0}));
+  momenta.push_back(Coordinate({1,0,0,0}));
+  momenta.push_back(Coordinate({2,0,0,0}));
+
+  int nmass = masses.size();
+  int nmom  = momenta.size();
+
+  std::vector<MobiusFermionR *> FermActs;
+  
+  std::cout<<GridLogMessage <<"======================"<<std::endl;
+  std::cout<<GridLogMessage <<"MobiusFermion action as Scaled Shamir kernel"<<std::endl;
+  std::cout<<GridLogMessage <<"======================"<<std::endl;
+
+  std::vector<Complex> boundary = {1,1,1,-1};
+  typedef MobiusFermionR FermionAction;
+  FermionAction::ImplParams Params(boundary);
+
+  for(int m=0;m<nmass;m++) {
+
+    RealD mass = masses[m];
+    RealD M5   = M5s[m];
+    RealD b    = bs[m];
+    RealD c    = cs[m];
+    int   Ls   = Ls_s[m];
+
+    if ( smeared_link[m] ) Utmp = Usmr;
+    else                   Utmp = Umu;
+    
+    FGrids.push_back(SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid));
+    FrbGrids.push_back(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid));
+
+    FermActs.push_back(new MobiusFermionR(Utmp,*FGrids[m],*FrbGrids[m],*UGrid,*UrbGrid,mass,M5,b,c,Params));
+  }
+
+  LatticePropagator z2wall_source(UGrid);
+  LatticePropagator gfwall_source(UGrid);
+  LatticePropagator phased_prop(UGrid);
+
+  int tslice = 0;
+  int tseq=(tslice+16)%latt[Nd-1];
+  //////////////////////////////////////////////////////////////////////
+  // RNG seeded for Z2 wall
+  //////////////////////////////////////////////////////////////////////
+  // You can manage seeds however you like.
+  // Recommend SeedUniqueString.
+  //////////////////////////////////////////////////////////////////////
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString("Study2-Source_Z2_p_0_0_0_t_0-880");
+  Z2WallSource  (RNG4,tslice,z2wall_source);
+  GFWallSource  (tslice,gfwall_source);
+
+  std::vector<LatticeComplex> phase(nmom,UGrid);
+  for(int m=0;m<nmom;m++){
+    MakePhase(momenta[m],phase[m]);
+  }
+  // Propagator storage is indexed by make_idx(p,m,nmom): mass 0 at every momentum first, then heavier masses at p=0
+  std::vector<LatticePropagator> Z2Props   (nmom+nmass-1,UGrid);
+  std::vector<LatticePropagator> GFProps   (nmom+nmass-1,UGrid);
+  for(int p=0;p<nmom;p++) {
+    int m=0;
+    int idx = make_idx(p,m,nmom);
+    phased_prop = z2wall_source * phase[p];
+    Solve(*FermActs[m],phased_prop  ,Z2Props[idx]);
+
+    phased_prop = gfwall_source * phase[p];
+    Solve(*FermActs[m],phased_prop  ,GFProps[idx]);
+  }
+  for(int m=1;m<nmass;m++) {
+    int p=0;
+    int idx = make_idx(p,m,nmom);
+    phased_prop = z2wall_source;
+    Solve(*FermActs[m],phased_prop  ,Z2Props[idx]);
+
+    phased_prop = gfwall_source;
+    Solve(*FermActs[m],phased_prop  ,GFProps[idx]);
+  }
+
+  std::vector<std::vector<Propagator> > wsnk_z2Props(nmom+nmass-1);
+  std::vector<std::vector<Propagator> > wsnk_gfProps(nmom+nmass-1);
+
+  // Non-zero kaon and point and D two point
+  // WW stick momentum on m1 (lighter)
+  //     zero momentum on m2
+  for(int m1=0;m1<nmass;m1++) {
+  for(int m2=m1;m2<nmass;m2++) {
+    int pmax = (m1==0)? nmom:1;
+    for(int p=0;p<pmax;p++){
+
+      std::stringstream ssg,ssz;
+      std::stringstream wssg,wssz;
+
+      int idx1 = make_idx(p,m1,nmom);
+      int idx2 = make_idx(0,m2,nmom);
+
+      /// Point sinks
+      ssg<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_p_gf_meson.xml";
+      ssz<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_p_z2_meson.xml";
+      MesonTrace(ssz.str(),Z2Props[idx1],Z2Props[idx2],phase[p]); // Q1 is conjugated
+      MesonTrace(ssg.str(),GFProps[idx1],GFProps[idx2],phase[p]); 
+      
+      /// Wall sinks
+      wssg<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_w_gf_meson.xml";
+      wssz<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_w_z2_meson.xml";
+      // NOTE(review): props below are indexed by raw m1/m2, not idx1/idx2 as for point sinks; for m1==m2 the second sliceSum overwrites the first -- confirm intent
+      phased_prop = GFProps[m2] * phase[p];
+      sliceSum(phased_prop,wsnk_gfProps[m1],Tdir);
+      sliceSum(GFProps[m1],wsnk_gfProps[m2],Tdir);
+      WallSinkMesonTrace(wssg.str(),wsnk_gfProps[m1],wsnk_gfProps[m2]);
+      // Z2 slice sums must land in wsnk_z2Props: that is what the Z2 trace below reads
+      phased_prop = Z2Props[m2] * phase[p];
+      sliceSum(phased_prop,wsnk_z2Props[m1],Tdir);
+      sliceSum(Z2Props[m1],wsnk_z2Props[m2],Tdir);
+      WallSinkMesonTrace(wssz.str(),wsnk_z2Props[m1],wsnk_z2Props[m2]);
+    }
+  }}
+
+
+  /////////////////////////////////////
+  // Sequential solves
+  /////////////////////////////////////
+  LatticePropagator  seq_wsnk_z2src(UGrid);
+  LatticePropagator  seq_wsnk_gfsrc(UGrid);
+  LatticePropagator  seq_psnk_z2src(UGrid);
+  LatticePropagator  seq_psnk_gfsrc(UGrid);
+  LatticePropagator source(UGrid);
+  for(int m=0;m<nmass-1;m++){
+    int spect_idx = make_idx(0,m,nmom);
+    int charm=nmass-1;
+
+    SequentialSource(tseq,momenta[0],GFProps[spect_idx],source);
+    Solve(*FermActs[charm],source,seq_psnk_gfsrc);
+    
+    SequentialSource(tseq,momenta[0],Z2Props[spect_idx],source);
+    Solve(*FermActs[charm],source,seq_psnk_z2src);
+
+    // Todo need wall sequential solve
+    for(int p=0;p<nmom;p++){
+      int active_idx = make_idx(p,0,nmom);
+      std::stringstream seq_3pt_p_z2;
+      std::stringstream seq_3pt_p_gf;
+      std::stringstream seq_3pt_w_z2;
+      std::stringstream seq_3pt_w_gf;
+      seq_3pt_p_z2  <<config<<"_3pt_p"<<p<< "_m" << m << "_p_z2_meson.xml";
+      seq_3pt_p_gf  <<config<<"_3pt_p"<<p<< "_m" << m << "_p_gf_meson.xml";
+      seq_3pt_w_z2  <<config<<"_3pt_p"<<p<< "_m" << m << "_w_z2_meson.xml";
+      seq_3pt_w_gf  <<config<<"_3pt_p"<<p<< "_m" << m << "_w_gf_meson.xml";
+      Meson3pt(seq_3pt_p_gf.str(),GFProps[active_idx],seq_psnk_gfsrc,phase[p]);
+      Meson3pt(seq_3pt_p_z2.str(),Z2Props[active_idx],seq_psnk_z2src,phase[p]);
+    }    
+  }
+  
+  Grid_finalize();
+}
+
+
+
--- a/examples/Example_wall_wall_spectrum.cc
+++ b/examples/Example_wall_wall_spectrum.cc
@@ -9,6 +9,7 @@ using namespace std;
 using namespace Grid;
 typedef SpinColourMatrix Propagator;
 typedef SpinColourVector Fermion;
+typedef PeriodicGimplR   GimplR;

 template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
 {
@@ -55,6 +56,16 @@ void MakePhase(Coordinate mom,LatticeComplex &phase)
  }
  phase = exp(phase*ci);
 }
+void LinkSmear(int nstep, RealD rho,LatticeGaugeField &Uin,LatticeGaugeField &Usmr)
+{ // Apply nstep successive Smear_Stout(rho) smearings to Uin; the result is returned in Usmr
+  Smear_Stout<GimplR> Stout(rho);
+  LatticeGaugeField Utmp(Uin.Grid());
+  Usmr = Uin; // nstep<=0 now yields the unsmeared links instead of leaving Usmr unassigned
+  for(int i=0;i<nstep;i++){
+    Utmp = Usmr;            // input of this step is the output of the previous one
+    Stout.smear(Usmr,Utmp);
+  }
+}
 void PointSource(Coordinate &coor,LatticePropagator &source)
 {
  //  Coordinate coor({0,0,0,0});
@@ -97,23 +108,23 @@ void GaugeFix(LatticeGaugeField &U,LatticeGaugeField &Ufix)
 {
  Real alpha=0.05;

-  Real plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(U);
+  Real plaq=WilsonLoops<GimplR>::avgPlaquette(U);

  std::cout << " Initial plaquette "<<plaq << std::endl;

  LatticeColourMatrix   xform(U.Grid()); 
  Ufix = U;
  int orthog=Nd-1;
-  FourierAcceleratedGaugeFixer<PeriodicGimplR>::SteepestDescentGaugeFix(Ufix,xform,alpha,10000,1.0e-12, 1.0e-12,true,orthog);
+  FourierAcceleratedGaugeFixer<GimplR>::SteepestDescentGaugeFix(Ufix,xform,alpha,100000,1.0e-14, 1.0e-14,true,orthog);
  
-  plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Ufix);
+  plaq=WilsonLoops<GimplR>::avgPlaquette(Ufix);

  std::cout << " Final plaquette "<<plaq << std::endl;
 }
 template<class Field>
 void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
 {
-  typedef CovariantLaplacianCshift <PeriodicGimplR,Field> Laplacian_t;
+  typedef CovariantLaplacianCshift <GimplR,Field> Laplacian_t;
  Laplacian_t Laplacian(U);

  Integer Iterations = 40;
@@ -167,19 +178,21 @@ void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
  GridBase *UGrid = D.GaugeGrid();
  GridBase *FGrid = D.FermionGrid();

-  LatticeFermion src4  (UGrid); 
+  LatticeFermion src4  (UGrid); src4 = Zero();
  LatticeFermion src5  (FGrid); 
  LatticeFermion result5(FGrid);
  LatticeFermion result4(UGrid);
  
-  ConjugateGradient<LatticeFermion> CG(1.0e-8,100000);
-  SchurRedBlackDiagMooeeSolve<LatticeFermion> schur(CG);
+  ConjugateGradient<LatticeFermion> CG(1.0e-12,100000);
+  SchurRedBlackDiagTwoSolve<LatticeFermion> schur(CG);
  ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
+  std::cout<<GridLogMessage<< " source4 "<<norm2(source)<<std::endl;
  for(int s=0;s<Nd;s++){
    for(int c=0;c<Nc;c++){
      PropToFerm<Action>(src4,source,s,c);
-
+      std::cout<<GridLogMessage<< s<<c<<" src4 "<<norm2(src4)<<std::endl;
      D.ImportPhysicalFermionSource(src4,src5);
+      std::cout<<GridLogMessage<< s<<c<<" src5 "<<norm2(src5)<<std::endl;

      result5=Zero();
      schur(D,src5,result5,ZG);
@@ -287,15 +300,10 @@ int main (int argc, char ** argv)
 								   GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);

-  //////////////////////////////////////////////////////////////////////
-  // You can manage seeds however you like.
-  // Recommend SeedUniqueString.
-  //////////////////////////////////////////////////////////////////////
-  std::vector<int> seeds4({1,2,3,4}); 
-  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);

  LatticeGaugeField Umu(UGrid);
-  LatticeGaugeField Ufixed(UGrid);
+  LatticeGaugeField Utmp(UGrid);
+  LatticeGaugeField Usmr(UGrid);
  std::string config;
  if( argc > 1 && argv[1][0] != '-' )
  {
@@ -308,13 +316,20 @@ int main (int argc, char ** argv)
  {
    std::cout<<GridLogMessage <<"Using hot configuration"<<std::endl;
    SU<Nc>::ColdConfiguration(Umu);
-    //    SU<Nc>::HotConfiguration(RNG4,Umu);
-    config="HotConfig";
+    config="ColdConfig";
  }
-  GaugeFix(Umu,Ufixed);
-  Umu=Ufixed;
+  //  GaugeFix(Umu,Utmp);
+  //  Umu=Utmp;

+  int nsmr=3;
+  RealD rho=0.1;
+  RealD plaq_gf =WilsonLoops<GimplR>::avgPlaquette(Umu);
+  LinkSmear(nsmr,rho,Umu,Usmr);
+  RealD plaq_smr=WilsonLoops<GimplR>::avgPlaquette(Usmr);
+  std::cout << GridLogMessage << " GF Plaquette " <<plaq_gf<<std::endl;
+  std::cout << GridLogMessage << " SM Plaquette " <<plaq_smr<<std::endl;

+  std::vector<int>   smeared_link({ 0,0,1} ); 
  std::vector<RealD> masses({ 0.004,0.02477,0.447} ); // u/d, s, c ??
  std::vector<RealD> M5s   ({ 1.8,1.8,1.0} ); 
  std::vector<RealD> bs   ({ 1.0,1.0,1.5} );  // DDM
@@ -330,6 +345,9 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage <<"======================"<<std::endl;
  std::cout<<GridLogMessage <<"MobiusFermion action as Scaled Shamir kernel"<<std::endl;
  std::cout<<GridLogMessage <<"======================"<<std::endl;
+  std::vector<Complex> boundary = {1,1,1,-1};
+  typedef MobiusFermionR FermionAction;
+  FermionAction::ImplParams Params(boundary);

  for(int m=0;m<masses.size();m++) {

@@ -339,31 +357,41 @@ int main (int argc, char ** argv)
    RealD c    = cs[m];
    int   Ls   = Ls_s[m];

+    if ( smeared_link[m] ) Utmp = Usmr;
+    else                   Utmp = Umu;
+    
    FGrids.push_back(SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid));
    FrbGrids.push_back(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid));

-    FermActs.push_back(new MobiusFermionR(Umu,*FGrids[m],*FrbGrids[m],*UGrid,*UrbGrid,mass,M5,b,c));
+    FermActs.push_back(new MobiusFermionR(Utmp,*FGrids[m],*FrbGrids[m],*UGrid,*UrbGrid,mass,M5,b,c,Params));
  }

-  LatticePropagator point_source(UGrid);
  LatticePropagator z2wall_source(UGrid);
  LatticePropagator gfwall_source(UGrid);

-  Coordinate Origin({0,0,0,0});
-  PointSource   (Origin,point_source);
-  Z2WallSource  (RNG4,0,z2wall_source);
-  GFWallSource  (0,gfwall_source);
+  int tslice = 0;
+  //////////////////////////////////////////////////////////////////////
+  // RNG seeded for Z2 wall
+  //////////////////////////////////////////////////////////////////////
+  // You can manage seeds however you like.
+  // Recommend SeedUniqueString.
+  //////////////////////////////////////////////////////////////////////
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString("Study2-Source_Z2_p_0_0_0_t_0-880");
+  Z2WallSource  (RNG4,tslice,z2wall_source);
+  GFWallSource  (tslice,gfwall_source);

-  std::vector<LatticePropagator> PointProps(nmass,UGrid);
-  std::vector<LatticePropagator> GaussProps(nmass,UGrid);
  std::vector<LatticePropagator> Z2Props   (nmass,UGrid);
  std::vector<LatticePropagator> GFProps   (nmass,UGrid);

  for(int m=0;m<nmass;m++) {

+    std::cout << GridLogMessage << " Mass " <<m << " z2wall source "<<norm2(z2wall_source)<<std::endl;
    Solve(*FermActs[m],z2wall_source    ,Z2Props[m]);
+    std::cout << GridLogMessage << " Mass " <<m << " gfwall source "<<norm2(gfwall_source)<<std::endl;
    Solve(*FermActs[m],gfwall_source    ,GFProps[m]);

+    std::cout << GridLogMessage << " Mass " <<m << " z2wall source "<<norm2(z2wall_source)<< " " << norm2(gfwall_source)<<std::endl;
+  
  }

  LatticeComplex phase(UGrid);
@@ -383,14 +411,15 @@ int main (int argc, char ** argv)
    std::stringstream wssg,wssz;

    /// Point sinks
-    ssg<<config<< "_m" << m1 << "_m"<< m2 << "p_gf_meson.xml";
-    ssz<<config<< "_m" << m1 << "_m"<< m2 << "p_z2_meson.xml";
+    ssg<<config<< "_m" << m1 << "_m"<< m2 << "_p_gf_meson.xml";
+    ssz<<config<< "_m" << m1 << "_m"<< m2 << "_p_z2_meson.xml";

    MesonTrace(ssz.str(),Z2Props[m1],Z2Props[m2],phase);
+    MesonTrace(ssg.str(),GFProps[m1],GFProps[m2],phase);

    /// Wall sinks
-    wssg<<config<< "_m" << m1 << "_m"<< m2 << "w_gf_meson.xml";
-    wssz<<config<< "_m" << m1 << "_m"<< m2 << "w_z2_meson.xml";
+    wssg<<config<< "_m" << m1 << "_m"<< m2 << "_w_gf_meson.xml";
+    wssz<<config<< "_m" << m1 << "_m"<< m2 << "_w_z2_meson.xml";

    WallSinkMesonTrace(wssg.str(),wsnk_gfProps[m1],wsnk_gfProps[m2]);
    WallSinkMesonTrace(wssz.str(),wsnk_z2Props[m1],wsnk_z2Props[m2]);
--- a/scripts/hmc.sh
+++ b/scripts/hmc.sh
@@ -1,19 +1,27 @@
 #!/bin/bash

 LOG=$1
-SWEEPS=`grep dH $LOG | wc -l`
-SWEEPS=`expr $SWEEPS - 80`
+SWEEPS=`grep dH.= $LOG | wc -l`
+SWEEPS=`expr $SWEEPS - 100`
 echo
 echo $SWEEPS thermalised sweeps
 echo
-plaq=`grep Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$10} END { print S/NR} ' `
-plaqe=`grep Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$10 ; SS=SS+$10*$10 } END { print sqrt( (SS/NR - S*S/NR/NR)/NR) } ' `
+plaq=`grep Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$12} END { print S/NR} ' `
+plaqe=`grep Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$12 ; SS=SS+$12*$12 } END { print sqrt( (SS/NR - S*S/NR/NR)/NR) } ' `
 echo "Plaquette: $plaq (${plaqe})"
 echo

-dHv=`grep dH $LOG | tail -n $SWEEPS | awk '{ S=S+$10 ; SS=SS+$10*$10 } END { print sqrt(SS/NR) } ' `
-edH=`grep dH $LOG | tail -n $SWEEPS | awk '{ S=S+exp(-$10)} END { print S/NR} '`
-echo "<e-dH>: $edH"
+grep  Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$12/20; if(NR%20==0){ print NR/20, " ", S; S=0;} } '  > plaq.binned
+
+plaq=`cat plaq.binned  | awk '{ S=S+$2} END { print S/NR} ' `
+plaqe=`cat plaq.binned | awk '{ S=S+$2 ; SS=SS+$2*$2 } END { print sqrt( (SS/NR - S*S/NR/NR)/NR) } ' `
+echo "Binned Plaquette: $plaq (${plaqe})"
+echo
+
+dHv=`grep dH.= $LOG | tail -n $SWEEPS | awk '{ S=S+$16 ; SS=SS+$16*$16 } END { print sqrt(SS/NR) } ' `
+edH=`grep dH.= $LOG | tail -n $SWEEPS | awk '{ S=S+exp(-$16)} END { print S/NR} '`
+dedH=`grep dH.= $LOG | tail -n $SWEEPS | awk '{ S=S+exp(-$16); SS=SS+exp(-$16)*exp(-$16)} END { print sqrt( (SS/NR - S*S/NR/NR)/NR) } '`
+echo "<e-dH>: $edH (${dedH})"
 echo "<rms dH>: $dHv"

 TRAJ=`grep Acc $LOG | wc -l`
@@ -22,12 +30,13 @@ PACC=`expr  100 \* ${ACC} / ${TRAJ} `
 echo
 echo "Acceptance $PACC %  $ACC / $TRAJ "

-grep Plaq $LOG | awk '{ print $10 }' | uniq > plaq.dat
-grep dH $LOG | awk '{ print $10 }' > dH.dat
-echo set yrange [-0.2:1.0] > plot.gnu
+grep Plaq $LOG | awk '{ print $12 }' | uniq > plaq.dat
+grep dH.= $LOG | awk '{ print $16 }' > dH.dat
+echo set yrange [0.58:0.60] > plot.gnu
 echo set terminal 'pdf' >> plot.gnu
+echo "f(x) =0.588" >> plot.gnu
 echo "set output 'plaq.${LOG}.pdf'" >> plot.gnu
-echo "plot 'plaq.dat' w l, 'dH.dat' w l " >> plot.gnu
+echo "plot 'plaq.dat' w l, f(x) " >> plot.gnu
 echo
 gnuplot plot.gnu >& gnu.errs
 open plaq.${LOG}.pdf
--- a/systems/Crusher/config-command
+++ b/systems/Crusher/config-command
@@ -0,0 +1,12 @@
+../../configure --enable-comms=mpi-auto \
+--enable-unified=no \
+--enable-shm=nvlink \
+--enable-accelerator=hip \
+--enable-gen-simd-width=64 \
+--enable-simd=GPU \
+--disable-fermion-reps \
+--disable-gparity \
+CXX=hipcc MPICXX=mpicxx \
+CXXFLAGS="-fPIC -I/opt/rocm-4.5.0/include/ -std=c++14 -I${MPICH_DIR}/include " \
+ LDFLAGS=" -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa " \
+HIPFLAGS="--amdgpu-target=gfx90a"
--- a/systems/Crusher/dwf.slurm
+++ b/systems/Crusher/dwf.slurm
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Begin SLURM directives
+#SBATCH -A LGT104
+#SBATCH -t 01:00:00
+##SBATCH -U openmpThu
+##SBATCH -p ecp
+#SBATCH -J DWF
+#SBATCH -o DWF.%J
+#SBATCH -e DWF.%J
+#SBATCH -N 1
+#SBATCH -n 1
+#SBATCH --exclusive  
+
+DIR=.
+module list
+#export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
+export MPICH_GPU_SUPPORT_ENABLED=1
+export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
+#export MPICH_SMP_SINGLE_COPY_MODE=NONE
+#export MPICH_SMP_SINGLE_COPY_MODE=CMA
+export OMP_NUM_THREADS=1
+
+AT=8
+echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
+
+PARAMS=" --accelerator-threads ${AT} --grid 24.24.24.24 --shm-mpi 0 --mpi 1.1.1.1"
+
+srun --gpus-per-task 1 -n1 ./benchmarks/Benchmark_dwf_fp32 $PARAMS
+
+
--- a/systems/Crusher/dwf4.slurm
+++ b/systems/Crusher/dwf4.slurm
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Begin SLURM directives
+#SBATCH -A LGT104
+#SBATCH -t 01:00:00
+##SBATCH -U openmpThu
+#SBATCH -J DWF
+#SBATCH -o DWF.%J
+#SBATCH -e DWF.%J
+#SBATCH -N 1
+#SBATCH -n 4
+#SBATCH --exclusive
+
+DIR=.
+module list
+export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
+export MPICH_GPU_SUPPORT_ENABLED=1
+#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
+export MPICH_SMP_SINGLE_COPY_MODE=NONE
+#export MPICH_SMP_SINGLE_COPY_MODE=CMA
+export OMP_NUM_THREADS=4
+
+echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
+PARAMS=" --accelerator-threads 8 --grid 32.32.64.64 --mpi 1.1.2.2 --comms-overlap --shm 2048 --shm-mpi 0"
+
+srun --gpus-per-task 1 -n4 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS
+
+
--- a/systems/Crusher/dwf8.slurm
+++ b/systems/Crusher/dwf8.slurm
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Begin SLURM directives
+#SBATCH -A LGT104
+#SBATCH -t 01:00:00
+##SBATCH -U openmpThu
+#SBATCH -J DWF
+#SBATCH -o DWF.%J
+#SBATCH -e DWF.%J
+#SBATCH -N 1
+#SBATCH -n 8
+#SBATCH --exclusive
+
+DIR=.
+module list
+export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
+export MPICH_GPU_SUPPORT_ENABLED=1
+export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
+#export MPICH_SMP_SINGLE_COPY_MODE=NONE
+#export MPICH_SMP_SINGLE_COPY_MODE=CMA
+export OMP_NUM_THREADS=1
+
+echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
+PARAMS=" --accelerator-threads 8 --grid 32.64.64.64 --mpi 1.2.2.2 --comms-overlap --shm 2048 --shm-mpi 0"
+
+srun --gpus-per-task 1 -n8 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS
+
+
--- a/Show More
+++ b/Show More
				`@@ -0,0 +1 @@`
				`../CompactWilsonCloverFermionInstantiation.cc.master`