Merge branch 'feature/ddhmc' of https://github.com/paboyle/Grid into feature/ddhmc

Several updates
Correct mass
2025-06-14 13:57:07 +01:00 · 2022-02-14 17:33:17 +01:00 · 2022-02-14 17:29:41 +01:00 · 2021-11-17 21:40:04 +00:00 · 2021-10-07 20:06:55 +01:00 · 2021-10-07 20:06:17 +01:00
121 changed files with 9991 additions and 1163 deletions
--- a/Grid/DisableWarnings.h
+++ b/Grid/DisableWarnings.h
@ -34,6 +34,9 @@ directory
 #if defined __GNUC__ && __GNUC__>=6
 #pragma GCC diagnostic ignored "-Wignored-attributes"
 #endif
 #if defined __GNUC__ && __GNUC__>=6
 #pragma GCC diagnostic ignored "-Wpsabi"
 #endif
 //disables and intel compiler specific warning (in json.hpp)
--- a/Grid/GridQCDcore.h
+++ b/Grid/GridQCDcore.h
@ -36,6 +36,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridCore.h>
 #include <Grid/qcd/QCD.h>
 #include <Grid/qcd/spin/Spin.h>
 #include <Grid/qcd/gparity/Gparity.h>
 #include <Grid/qcd/utils/Utils.h>
 #include <Grid/qcd/representations/Representations.h>
 NAMESPACE_CHECK(GridQCDCore);
--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@ -54,6 +54,7 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
 #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@ -223,9 +223,14 @@ class SchurOperatorBase :  public LinearOperatorBase<Field> {
    Mpc(in,tmp);
    MpcDag(tmp,out);
  }
  virtual  void MpcMpcDag(const Field &in, Field &out) {
    Field tmp(in.Grid());
    tmp.Checkerboard() = in.Checkerboard();
    MpcDag(in,tmp);
    Mpc(tmp,out);
  }
  virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-    out.Checkerboard() = in.Checkerboard();
+    HermOp(in,out);
    MpcDagMpc(in,out);
    ComplexD dot= innerProduct(in,out); 
    n1=real(dot);
    n2=norm2(out);
@ -276,6 +281,16 @@ template<class Matrix,class Field>
      axpy(out,-1.0,tmp,out);
    }
 };
 // Mpc MpcDag system presented as the HermOp
 template<class Matrix,class Field>
 class SchurDiagMooeeDagOperator :  public SchurDiagMooeeOperator<Matrix,Field> {
 public:
  virtual void HermOp(const Field &in, Field &out){
    out.Checkerboard() = in.Checkerboard();
    this->MpcMpcDag(in,out);
  }
  SchurDiagMooeeDagOperator (Matrix &Mat): SchurDiagMooeeOperator<Matrix,Field>(Mat){};
 };
 template<class Matrix,class Field>
  class SchurDiagOneOperator :  public SchurOperatorBase<Field> {
 protected:
--- a/Grid/algorithms/approx/Chebyshev.h
+++ b/Grid/algorithms/approx/Chebyshev.h
@ -292,6 +292,7 @@ public:
 template<class Field>
 class ChebyshevLanczos : public Chebyshev<Field> {
 private:
  std::vector<RealD> Coeffs;
  int order;
  RealD alpha;
--- a/Grid/algorithms/iterative/ConjugateGradient.h
+++ b/Grid/algorithms/iterative/ConjugateGradient.h
@ -102,7 +102,7 @@ public:
    // Check if guess is really REALLY good :)
    if (cp <= rsq) {
      TrueResidual = std::sqrt(a/ssq);
-      std::cout << GridLogMessage << "ConjugateGradient guess is converged already " << std::endl;
+      std::cout << GridLogMessage << "ConjugateGradient guess is converged already "<<TrueResidual<< " tol "<< Tolerance<< std::endl;
      IterationsToComplete = 0;	
      return;
    }
--- a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
@ -48,19 +48,29 @@ NAMESPACE_BEGIN(Grid);
    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
    RealD TrueResidual;
    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
    LinearFunction<FieldF> *guesser;
-    
+
-    MixedPrecisionConjugateGradient(RealD tol, 
+    MixedPrecisionConjugateGradient(RealD Tol,
 				    Integer maxinnerit, 
 				    Integer maxouterit, 
 				    GridBase* _sp_grid, 
 				    LinearOperatorBase<FieldF> &_Linop_f, 
 				    LinearOperatorBase<FieldD> &_Linop_d) :
      MixedPrecisionConjugateGradient(Tol, Tol, maxinnerit, maxouterit, _sp_grid, _Linop_f, _Linop_d) {};
    MixedPrecisionConjugateGradient(RealD Tol,
 				    RealD InnerTol,
 				    Integer maxinnerit, 
 				    Integer maxouterit, 
 				    GridBase* _sp_grid, 
 				    LinearOperatorBase<FieldF> &_Linop_f, 
 				    LinearOperatorBase<FieldD> &_Linop_d) :
      Linop_f(_Linop_f), Linop_d(_Linop_d),
-      Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
+      Tolerance(Tol), InnerTolerance(InnerTol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
-      OuterLoopNormMult(100.), guesser(NULL){ };
+      OuterLoopNormMult(100.), guesser(NULL){ assert(InnerTol < 1.0e-1);};
    void useGuesser(LinearFunction<FieldF> &g){
      guesser = &g;
@ -79,6 +89,11 @@ NAMESPACE_BEGIN(Grid);
    RealD stop = src_norm * Tolerance*Tolerance;
    GridBase* DoublePrecGrid = src_d_in.Grid();
    //Generate precision change workspaces
    precisionChangeWorkspace wk_dp_from_sp(DoublePrecGrid, SinglePrecGrid);
    precisionChangeWorkspace wk_sp_from_dp(SinglePrecGrid, DoublePrecGrid);
    FieldD tmp_d(DoublePrecGrid);
    tmp_d.Checkerboard() = cb;
@ -119,7 +134,7 @@ NAMESPACE_BEGIN(Grid);
      while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
      PrecChangeTimer.Start();
-      precisionChange(src_f, src_d);
+      precisionChange(src_f, src_d, wk_sp_from_dp);
      PrecChangeTimer.Stop();
      sol_f = Zero();
@ -137,7 +152,7 @@ NAMESPACE_BEGIN(Grid);
      //Convert sol back to double and add to double prec solution
      PrecChangeTimer.Start();
-      precisionChange(tmp_d, sol_f);
+      precisionChange(tmp_d, sol_f, wk_dp_from_sp);
      PrecChangeTimer.Stop();
      axpy(sol_d, 1.0, tmp_d, sol_d);
@ -149,6 +164,7 @@ NAMESPACE_BEGIN(Grid);
    ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
    CG_d(Linop_d, src_d_in, sol_d);
    TotalFinalStepIterations = CG_d.IterationsToComplete;
    TrueResidual = CG_d.TrueResidual;
    TotalTimer.Stop();
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
@ -52,7 +52,7 @@ public:
  MultiShiftFunction shifts;
  std::vector<RealD> TrueResidualShift;
-  ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : 
+  ConjugateGradientMultiShift(Integer maxit, const MultiShiftFunction &_shifts) : 
    MaxIterations(maxit),
    shifts(_shifts)
  { 
@ -182,6 +182,9 @@ public:
    for(int s=0;s<nshift;s++) {
      axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
    }
    std::cout << GridLogIterative << "ConjugateGradientMultiShift: initial rn (|src|^2) =" << rn << " qq (|MdagM src|^2) =" << qq << " d ( dot(src, [MdagM + m_0]src) ) =" << d << " c=" << c << std::endl;
  ///////////////////////////////////////
  // Timers
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
@ -0,0 +1,411 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Christopher Kelly <ckelly@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
 #define GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
 NAMESPACE_BEGIN(Grid);
 //CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision. 
 //The residual is stored in single precision, but the search directions and solution are stored in double precision. 
 //Every update_freq iterations the residual is corrected in double precision. 
 //For safety the a final regular CG is applied to clean up if necessary
 //Linop to add shift to input linop, used in cleanup CG
 namespace ConjugateGradientMultiShiftMixedPrecSupport{
 template<typename Field>
 class ShiftedLinop: public LinearOperatorBase<Field>{
 public:
  LinearOperatorBase<Field> &linop_base;
  RealD shift;
  ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift): linop_base(_linop_base), shift(_shift){}
  void OpDiag (const Field &in, Field &out){ assert(0); }
  void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
  void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); }
  void Op     (const Field &in, Field &out){ assert(0); }
  void AdjOp  (const Field &in, Field &out){ assert(0); }
  void HermOp(const Field &in, Field &out){
    linop_base.HermOp(in, out);
    axpy(out, shift, in, out);
  }    
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    HermOp(in,out);
    ComplexD dot = innerProduct(in,out);
    n1=real(dot);
    n2=norm2(out);
  }
 };
 };
 template<class FieldD, class FieldF,
 	 typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
 	 typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
 class ConjugateGradientMultiShiftMixedPrec : public OperatorMultiFunction<FieldD>,
 					     public OperatorFunction<FieldD>
 {
 public:                                                
  using OperatorFunction<FieldD>::operator();
  RealD   Tolerance;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
  int verbose;
  MultiShiftFunction shifts;
  std::vector<RealD> TrueResidualShift;
  int ReliableUpdateFreq; //number of iterations between reliable updates
  GridBase* SinglePrecGrid; //Grid for single-precision fields
  LinearOperatorBase<FieldF> &Linop_f; //single precision
  ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
 				       GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
 				       int _ReliableUpdateFreq
 				       ) : 
    MaxIterations(maxit),  shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq)
  { 
    verbose=1;
    IterationsToCompleteShift.resize(_shifts.order);
    TrueResidualShift.resize(_shifts.order);
  }
  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
  {
    GridBase *grid = src.Grid();
    int nshift = shifts.order;
    std::vector<FieldD> results(nshift,grid);
    (*this)(Linop,src,results,psi);
  }
  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
  {
    int nshift = shifts.order;
    (*this)(Linop,src,results);
    psi = shifts.norm*src;
    for(int i=0;i<nshift;i++){
      psi = psi + shifts.residues[i]*results[i];
    }
    return;
  }
  void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
  { 
    GridBase *DoublePrecGrid = src_d.Grid();
    precisionChangeWorkspace wk_f_from_d(SinglePrecGrid, DoublePrecGrid);
    precisionChangeWorkspace wk_d_from_f(DoublePrecGrid, SinglePrecGrid);
    ////////////////////////////////////////////////////////////////////////
    // Convenience references to the info stored in "MultiShiftFunction"
    ////////////////////////////////////////////////////////////////////////
    int nshift = shifts.order;
    std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
    std::vector<RealD> &mresidual(shifts.tolerances);
    std::vector<RealD> alpha(nshift,1.0);
    //Double precision search directions
    FieldD p_d(DoublePrecGrid);
    std::vector<FieldD> ps_d(nshift, DoublePrecGrid);// Search directions (double precision)
    FieldD tmp_d(DoublePrecGrid);
    FieldD r_d(DoublePrecGrid);
    FieldD mmp_d(DoublePrecGrid);
    assert(psi_d.size()==nshift);
    assert(mass.size()==nshift);
    assert(mresidual.size()==nshift);
    // dynamic sized arrays on stack; 2d is a pain with vector
    RealD  bs[nshift];
    RealD  rsq[nshift];
    RealD  z[nshift][2];
    int     converged[nshift];
    const int       primary =0;
    //Primary shift fields CG iteration
    RealD a,b,c,d;
    RealD cp,bp,qq; //prev
    // Matrix mult fields
    FieldF r_f(SinglePrecGrid);
    FieldF p_f(SinglePrecGrid);
    FieldF tmp_f(SinglePrecGrid);
    FieldF mmp_f(SinglePrecGrid);
    FieldF src_f(SinglePrecGrid);
    precisionChange(src_f, src_d, wk_f_from_d);
    // Check lightest mass
    for(int s=0;s<nshift;s++){
      assert( mass[s]>= mass[primary] );
      converged[s]=0;
    }
    // Wire guess to zero
    // Residuals "r" are src
    // First search direction "p" is also src
    cp = norm2(src_d);
    // Handle trivial case of zero src.
    if( cp == 0. ){
      for(int s=0;s<nshift;s++){
 	psi_d[s] = Zero();
 	IterationsToCompleteShift[s] = 1;
 	TrueResidualShift[s] = 0.;
      }
      return;
    }
    for(int s=0;s<nshift;s++){
      rsq[s] = cp * mresidual[s] * mresidual[s];
      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
      ps_d[s] = src_d;
    }
    // r and p for primary
    r_f=src_f; //residual maintained in single
    p_f=src_f;
    p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
    //MdagM+m[0]
    Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
    axpy(mmp_f,mass[0],p_f,mmp_f);
    RealD rn = norm2(p_f);
    d += rn*mass[0];
    b = -cp /d;
    // Set up the various shift variables
    int       iz=0;
    z[0][1-iz] = 1.0;
    z[0][iz]   = 1.0;
    bs[0]      = b;
    for(int s=1;s<nshift;s++){
      z[s][1-iz] = 1.0;
      z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0]));
      bs[s]      = b*z[s][iz]; 
    }
    // r += b[0] A.p[0]
    // c= norm(r)
    c=axpy_norm(r_f,b,mmp_f,r_f);
    for(int s=0;s<nshift;s++) {
      axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
    }
    ///////////////////////////////////////
    // Timers
    ///////////////////////////////////////
    GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
    SolverTimer.Start();
    // Iteration loop
    int k;
    for (k=1;k<=MaxIterations;k++){    
      a = c /cp;
      //Update double precision search direction by residual
      PrecChangeTimer.Start();
      precisionChange(r_d, r_f, wk_d_from_f);
      PrecChangeTimer.Stop();
      AXPYTimer.Start();
      axpy(p_d,a,p_d,r_d); 
      for(int s=0;s<nshift;s++){
 	if ( ! converged[s] ) { 
 	  if (s==0){
 	    axpy(ps_d[s],a,ps_d[s],r_d);
 	  } else{
 	    RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
 	    axpby(ps_d[s],z[s][iz],as,r_d,ps_d[s]);
 	  }
 	}
      }
      AXPYTimer.Stop();
      PrecChangeTimer.Start();
      precisionChange(p_f, p_d, wk_f_from_d); //get back single prec search direction for linop
      PrecChangeTimer.Stop();
      cp=c;
      MatrixTimer.Start();  
      Linop_f.HermOp(p_f,mmp_f); 
      d=real(innerProduct(p_f,mmp_f));    
      MatrixTimer.Stop();  
      AXPYTimer.Start();
      axpy(mmp_f,mass[0],p_f,mmp_f);
      AXPYTimer.Stop();
      RealD rn = norm2(p_f);
      d += rn*mass[0];
      bp=b;
      b=-cp/d;
      // Toggle the recurrence history
      bs[0] = b;
      iz = 1-iz;
      ShiftTimer.Start();
      for(int s=1;s<nshift;s++){
 	if((!converged[s])){
 	  RealD z0 = z[s][1-iz];
 	  RealD z1 = z[s][iz];
 	  z[s][iz] = z0*z1*bp
 	    / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b)); 
 	  bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
 	}
      }
      ShiftTimer.Stop();
      //Update double precision solutions
      AXPYTimer.Start();
      for(int s=0;s<nshift;s++){
 	int ss = s;
 	if( (!converged[s]) ) { 
 	  axpy(psi_d[ss],-bs[s]*alpha[s],ps_d[s],psi_d[ss]);
 	}
      }
      //Perform reliable update if necessary; otherwise update residual from single-prec mmp
      RealD c_f = axpy_norm(r_f,b,mmp_f,r_f);
      AXPYTimer.Stop();
      c = c_f;
      if(k % ReliableUpdateFreq == 0){
 	//Replace r with true residual
 	MatrixTimer.Start();  
 	Linop_d.HermOp(psi_d[0],mmp_d); 
 	MatrixTimer.Stop();  
 	AXPYTimer.Start();
 	axpy(mmp_d,mass[0],psi_d[0],mmp_d);
 	RealD c_d = axpy_norm(r_d, -1.0, mmp_d, src_d);
 	AXPYTimer.Stop();
 	std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_f <<" with |r|^2 = "<<c_d<<std::endl;
 	PrecChangeTimer.Start();
 	precisionChange(r_f, r_d, wk_f_from_d);
 	PrecChangeTimer.Stop();
 	c = c_d;
      }
      // Convergence checks
      int all_converged = 1;
      for(int s=0;s<nshift;s++){
 	if ( (!converged[s]) ){
 	  IterationsToCompleteShift[s] = k;
 	  RealD css  = c * z[s][iz]* z[s][iz];
 	  if(css<rsq[s]){
 	    if ( ! converged[s] )
 	      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
 	    converged[s]=1;
 	  } else {
 	    all_converged=0;
 	  }
 	}
      }
      if ( all_converged ){
 	SolverTimer.Stop();
 	std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
 	std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
 	// Check answers 
 	for(int s=0; s < nshift; s++) { 
 	  Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
 	  axpy(tmp_d,mass[s],psi_d[s],mmp_d);
 	  axpy(r_d,-alpha[s],src_d,tmp_d);
 	  RealD rn = norm2(r_d);
 	  RealD cn = norm2(src_d);
 	  TrueResidualShift[s] = std::sqrt(rn/cn);
 	  std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
 	  //If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
 	  if(rn >= rsq[s]){
 	    CleanupTimer.Start();
 	    std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: performing cleanup step for shift " << s << std::endl;
 	    //Setup linear operators for final cleanup
 	    ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
 	    ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
 	    MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d); 
 	    cg(src_d, psi_d[s]);
 	    TrueResidualShift[s] = cg.TrueResidual;
 	    CleanupTimer.Stop();
 	  }
 	}
 	std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrec: Time Breakdown for body"<<std::endl;
 	std::cout << GridLogMessage << "\tSolver    " << SolverTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tMatrix    " << MatrixTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
 	IterationsToComplete = k;	
 	return;
      }
    }
    // ugly hack
    std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
    //  assert(0);
  }
 };
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/algorithms/iterative/SchurRedBlack.h
+++ b/Grid/algorithms/iterative/SchurRedBlack.h
@ -40,7 +40,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
   *        (-MoeMee^{-1}   1 )   
   * L^{dag} = ( 1       Mee^{-dag} Moe^{dag} )
   *           ( 0       1                    )
-   * L^{-d}  = ( 1      -Mee^{-dag} Moe^{dag} )
+   * L^{-dag}= ( 1      -Mee^{-dag} Moe^{dag} )
   *           ( 0       1                    )
   *
   * U^-1 = (1   -Mee^{-1} Meo)
@ -82,7 +82,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
   * c) M_oo^-dag Doo^{dag} Doo Moo^-1 phi_0 = M_oo^-dag (D_oo)^dag L^{-1}  eta_o
   *                              eta_o'     = M_oo^-dag (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
   *                              psi_o = M_oo^-1 phi_o
-   * TODO: Deflation 
+   *
   *
   */
 namespace Grid {
@ -97,6 +98,7 @@ namespace Grid {
  protected:
    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
    OperatorFunction<Field> & _HermitianRBSolver;
    int CBfactorise;
    bool subGuess;
    bool useSolnAsInitGuess; // if true user-supplied solution vector is used as initial guess for solver
@ -219,13 +221,20 @@ namespace Grid {
 	/////////////////////////////////////////////////
 	// Check unprec residual if possible
 	/////////////////////////////////////////////////
-	if ( ! subGuess ) {
+	if ( ! subGuess ) {	  
-	  _Matrix.M(out[b],resid); 
+
 	  if ( this->adjoint() ) _Matrix.Mdag(out[b],resid); 
 	  else                   _Matrix.M(out[b],resid); 
 	  resid = resid-in[b];
 	  RealD ns = norm2(in[b]);
 	  RealD nr = norm2(resid);
-	  std::cout<<GridLogMessage<< "SchurRedBlackBase solver true unprec resid["<<b<<"] "<<std::sqrt(nr/ns) << std::endl;
+	  std::cout<<GridLogMessage<< "SchurRedBlackBase adjoint "<< this->adjoint() << std::endl;
 	  if ( this->adjoint() ) 
 	    std::cout<<GridLogMessage<< "SchurRedBlackBase adjoint solver true unprec resid["<<b<<"] "<<std::sqrt(nr/ns) << std::endl;
 	  else                   
 	    std::cout<<GridLogMessage<< "SchurRedBlackBase solver true unprec resid["<<b<<"] "<<std::sqrt(nr/ns) << std::endl;
 	} else {
 	  std::cout<<GridLogMessage<< "SchurRedBlackBase Guess subtracted after solve["<<b<<"] " << std::endl;
 	}
@ -279,12 +288,21 @@ namespace Grid {
      // Verify the unprec residual
      if ( ! subGuess ) {
-        _Matrix.M(out,resid); 
+
 	std::cout<<GridLogMessage<< "SchurRedBlackBase adjoint "<< this->adjoint() << std::endl;
 	if ( this->adjoint() ) _Matrix.Mdag(out,resid); 
 	else                   _Matrix.M(out,resid); 
        resid = resid-in;
        RealD ns = norm2(in);
        RealD nr = norm2(resid);
-        std::cout<<GridLogMessage << "SchurRedBlackBase solver true unprec resid "<< std::sqrt(nr/ns) << std::endl;
+	  if ( this->adjoint() ) 
 	    std::cout<<GridLogMessage<< "SchurRedBlackBase adjoint solver true unprec resid "<<std::sqrt(nr/ns) << std::endl;
 	  else                   
 	    std::cout<<GridLogMessage<< "SchurRedBlackBase solver true unprec resid "<<std::sqrt(nr/ns) << std::endl;
      } else {
        std::cout << GridLogMessage << "SchurRedBlackBase Guess subtracted after solve." << std::endl;
      }
@ -293,6 +311,7 @@ namespace Grid {
    /////////////////////////////////////////////////////////////
    // Override in derived. 
    /////////////////////////////////////////////////////////////
    virtual bool adjoint(void) { return false; }
    virtual void RedBlackSource  (Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)                =0;
    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)          =0;
    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)                           =0;
@ -646,6 +665,127 @@ namespace Grid {
        this->_HermitianRBSolver(_OpEO, src_o, sol_o); 
      }
  };
  /*
   * Red black Schur decomposition
   *
   *  M = (Mee Meo) =  (1             0 )   (Mee   0               )  (1 Mee^{-1} Meo)
   *      (Moe Moo)    (Moe Mee^-1    1 )   (0   Moo-Moe Mee^-1 Meo)  (0   1         )
   *                =         L                     D                     U
   *
   * L^-1 = (1              0 )
   *        (-MoeMee^{-1}   1 )   
   * L^{dag} = ( 1       Mee^{-dag} Moe^{dag} )
   *           ( 0       1                    )
   *
   * U^-1 = (1   -Mee^{-1} Meo)
   *        (0    1           )
   * U^{dag} = ( 1                 0)
   *           (Meo^dag Mee^{-dag} 1)
   * U^{-dag} = (  1                 0)
   *            (-Meo^dag Mee^{-dag} 1)
   *
   *
   ***********************
   *     M^dag psi = eta
   ***********************
   *
   * Really for Mobius: (Wilson - easier to just use gamma 5 hermiticity)
   *
   *    Mdag psi     =         Udag  Ddag  Ldag psi = eta
   *
   * U^{-dag} = (  1                 0)
   *            (-Meo^dag Mee^{-dag} 1)
   *
   *
   * i)                D^dag phi =  (U^{-dag}  eta)
   *                        eta'_e = eta_e
   *                        eta'_o = (eta_o - Meo^dag Mee^{-dag} eta_e)
   * 
   *      phi_o = D_oo^-dag eta'_o = D_oo^-dag (eta_o - Meo^dag Mee^{-dag} eta_e)
   *
   *      phi_e = D_ee^-dag eta'_e = D_ee^-dag eta_e
   * 
   * Solve: 
   *
   *      D_oo D_oo^dag phi_o = D_oo (eta_o - Meo^dag Mee^{-dag} eta_e)
   *
   * ii) 
   *      phi = L^dag psi => psi = L^-dag phi. 
   *
   * L^{-dag} = ( 1      -Mee^{-dag} Moe^{dag} )
   *            ( 0       1                    )
   *
   *   => sol_e = M_ee^-dag * ( src_e - Moe^dag phi_o )...
   *   => sol_o = phi_o
   */
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Site diagonal has Mooee on it, but solve the Adjoint system
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  template<class Field> class SchurRedBlackDiagMooeeDagSolve : public SchurRedBlackBase<Field> {
  public:
    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
    virtual bool adjoint(void) { return true; }
    SchurRedBlackDiagMooeeDagSolve(OperatorFunction<Field> &HermitianRBSolver,
 				   const bool initSubGuess = false,
 				   const bool _solnAsInitGuess = false)  
      : SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};
    //////////////////////////////////////////////////////
    // Override RedBlack specialisation
    //////////////////////////////////////////////////////
    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
    {
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();
      Field   tmp(grid);
      Field  Mtmp(grid);
      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd ,src_o,src);
      /////////////////////////////////////////////////////
      // src_o = (source_o - Moe^dag MeeInvDag source_e)
      /////////////////////////////////////////////////////
      _Matrix.MooeeInvDag(src_e,tmp);  assert(  tmp.Checkerboard() ==Even);
      _Matrix.MeooeDag   (tmp,Mtmp);   assert( Mtmp.Checkerboard() ==Odd);     
      tmp=src_o-Mtmp;                  assert(  tmp.Checkerboard() ==Odd);     
      // get the right Mpc
      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
      _HermOpEO.Mpc(tmp,src_o);     assert(src_o.Checkerboard() ==Odd);
    }
    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)
    {
      SchurDiagMooeeDagOperator<Matrix,Field> _HermOpEO(_Matrix);
      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
    };
    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)
    {
      SchurDiagMooeeDagOperator<Matrix,Field> _HermOpEO(_Matrix);
      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
    }
    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
    {
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();
      Field  sol_e(grid);
      Field  tmp(grid);
      ///////////////////////////////////////////////////
      // sol_e = M_ee^-dag * ( src_e - Moe^dag phi_o )...
      // sol_o = phi_o
      ///////////////////////////////////////////////////
      _Matrix.MeooeDag(sol_o,tmp);      assert(tmp.Checkerboard()==Even);
      tmp = src_e-tmp;                  assert(tmp.Checkerboard()==Even);
      _Matrix.MooeeInvDag(tmp,sol_e);   assert(sol_e.Checkerboard()==Even);
      setCheckerboard(sol,sol_e); assert(  sol_e.Checkerboard() ==Even);
      setCheckerboard(sol,sol_o); assert(  sol_o.Checkerboard() ==Odd );
    }
  };
 }
 #endif
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@ -46,3 +46,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_unary.h>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
 #include <Grid/lattice/Lattice_crc.h>
--- a/Grid/lattice/Lattice_crc.h
+++ b/Grid/lattice/Lattice_crc.h
@ -0,0 +1,55 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_crc.h
    Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1)
 {
  auto ff = localNorm2(f);
  if ( mu==-1 ) mu = f.Grid()->Nd()-1;
  typedef typename vobj::tensor_reduced normtype;
  typedef typename normtype::scalar_object scalar;
  std::vector<scalar> sff;
  sliceSum(ff,sff,mu);
  for(int t=0;t<sff.size();t++){
    std::cout << s<<" "<<t<<" "<<sff[t]<<std::endl;
  }
 }
 template<class vobj> uint32_t crc(Lattice<vobj> & buf)
 {
  autoView( buf_v , buf, CpuRead);
  return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
 }
 #define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
 NAMESPACE_END(Grid);
--- a/Grid/lattice/Lattice_rng.h
+++ b/Grid/lattice/Lattice_rng.h
@ -32,8 +32,9 @@
 #include <random>
 #ifdef RNG_SITMO
-#include <Grid/sitmo_rng/sitmo_prng_engine.hpp>
+#include <Grid/random/sitmo_prng_engine.hpp>
 #endif 
 #include <Grid/random/gaussian.h>
 #if defined(RNG_SITMO)
 #define RNG_FAST_DISCARD
@ -142,7 +143,7 @@ public:
  std::vector<RngEngine>                             _generators;
  std::vector<std::uniform_real_distribution<RealD> > _uniform;
-  std::vector<std::normal_distribution<RealD> >       _gaussian;
+  std::vector<Grid::gaussian_distribution<RealD> >    _gaussian;
  std::vector<std::discrete_distribution<int32_t> >   _bernoulli;
  std::vector<std::uniform_int_distribution<uint32_t> > _uid;
@ -243,7 +244,7 @@ public:
  GridSerialRNG() : GridRNGbase() {
    _generators.resize(1);
    _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
-    _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
+    _gaussian.resize(1,gaussian_distribution<RealD>(0.0,1.0) );
    _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
    _uid.resize(1,std::uniform_int_distribution<uint32_t>() );
  }
@ -357,7 +358,7 @@ public:
    _generators.resize(_vol);
    _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
-    _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
+    _gaussian.resize(_vol,gaussian_distribution<RealD>(0.0,1.0) );
    _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
    _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
  }
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@ -785,7 +785,7 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
 template<class vobj>
-void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
+void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
 {
  typedef typename vobj::scalar_object sobj;
@ -1010,54 +1010,96 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
  });
 }
-//Convert a Lattice from one precision to another
+//The workspace for a precision change operation allowing for the reuse of the mapping to save time on subsequent calls
-template<class VobjOut, class VobjIn>
+class precisionChangeWorkspace{
-void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
+  std::pair<Integer,Integer>* fmap_device; //device pointer
-{
+public:
-  assert(out.Grid()->Nd() == in.Grid()->Nd());
+  precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid){
-  for(int d=0;d<out.Grid()->Nd();d++){
+    //Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
-    assert(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
+    assert(out_grid->Nd() == in_grid->Nd());
-  }
+    for(int d=0;d<out_grid->Nd();d++){
-  out.Checkerboard() = in.Checkerboard();
+      assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
  GridBase *in_grid=in.Grid();
  GridBase *out_grid = out.Grid();
  typedef typename VobjOut::scalar_object SobjOut;
  typedef typename VobjIn::scalar_object SobjIn;
  int ndim = out.Grid()->Nd();
  int out_nsimd = out_grid->Nsimd();
  std::vector<Coordinate > out_icoor(out_nsimd);
  for(int lane=0; lane < out_nsimd; lane++){
    out_icoor[lane].resize(ndim);
    out_grid->iCoorFromIindex(out_icoor[lane], lane);
  }
  std::vector<SobjOut> in_slex_conv(in_grid->lSites());
  unvectorizeToLexOrdArray(in_slex_conv, in);
  autoView( out_v , out, CpuWrite);
  thread_for(out_oidx,out_grid->oSites(),{
    Coordinate out_ocoor(ndim);
    out_grid->oCoorFromOindex(out_ocoor, out_oidx);
    ExtractPointerArray<SobjOut> ptrs(out_nsimd);      
    Coordinate lcoor(out_grid->Nd());
    for(int lane=0; lane < out_nsimd; lane++){
      for(int mu=0;mu<ndim;mu++)
 	lcoor[mu] = out_ocoor[mu] + out_grid->_rdimensions[mu]*out_icoor[lane][mu];
      int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions);
      ptrs[lane] = &in_slex_conv[llex];
    }
-    merge(out_v[out_oidx], ptrs, 0);
+    int Nsimd_out = out_grid->Nsimd();
-  });
+
    std::vector<Coordinate> out_icorrs(out_grid->Nsimd()); //reuse these
    for(int lane=0; lane < out_grid->Nsimd(); lane++)
      out_grid->iCoorFromIindex(out_icorrs[lane], lane);
    std::vector<std::pair<Integer,Integer> > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd
    thread_for(out_oidx,out_grid->oSites(),{
 	Coordinate out_ocorr; 
 	out_grid->oCoorFromOindex(out_ocorr, out_oidx);
 	Coordinate lcorr; //the local coordinate (common to both in and out as full coordinate)
 	for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
 	  out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr);
 	  //int in_oidx = in_grid->oIndex(lcorr), in_lane = in_grid->iIndex(lcorr);
 	  //Note oIndex and OcorrFromOindex (and same for iIndex) are not inverse for checkerboarded lattice, the former coordinates being defined on the full lattice and the latter on the reduced lattice
 	  //Until this is fixed we need to circumvent the problem locally. Here I will use the coordinates defined on the reduced lattice for simplicity
 	  int in_oidx = 0, in_lane = 0;
 	  for(int d=0;d<in_grid->_ndimension;d++){
 	    in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] );
 	    in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] );
 	  }
 	  fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair<Integer,Integer>( in_oidx, in_lane );
 	}
      });
    //Copy the map to the device (if we had a way to tell if an accelerator is in use we could avoid this copy for CPU-only machines)
    size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair<Integer,Integer>);
    fmap_device = (std::pair<Integer,Integer>*)acceleratorAllocDevice(fmap_bytes);
    acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes); 
  }
  //Prevent moving or copying
  precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete;
  precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete;
  precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete;
  precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete;
  std::pair<Integer,Integer> const* getMap() const{ return fmap_device; }
  ~precisionChangeWorkspace(){
    acceleratorFreeDevice(fmap_device);
  }
 };
 //Convert a lattice of one precision to another. The input workspace contains the mapping data.
 template<class VobjOut, class VobjIn>
 void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
  static_assert( std::is_same<typename VobjOut::DoublePrecision, typename VobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
  out.Checkerboard() = in.Checkerboard();
  constexpr int Nsimd_out = VobjOut::Nsimd();
  std::pair<Integer,Integer> const* fmap_device = workspace.getMap();
  //Do the copy/precision change
  autoView( out_v , out, AcceleratorWrite);
  autoView( in_v , in, AcceleratorRead);
  accelerator_for(out_oidx, out.Grid()->oSites(), 1,{
      std::pair<Integer,Integer> const* fmap_osite = fmap_device + out_oidx*Nsimd_out;
      for(int out_lane=0; out_lane < Nsimd_out; out_lane++){      
 	int in_oidx = fmap_osite[out_lane].first;
 	int in_lane = fmap_osite[out_lane].second;
 	copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane);
      }
    });
 }
 //Convert a Lattice from one precision to another
 //Generate the workspace in place; if multiple calls with the same mapping are performed, consider pregenerating the workspace and reusing
 template<class VobjOut, class VobjIn>
 void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
  precisionChangeWorkspace workspace(out.Grid(), in.Grid());
  precisionChange(out, in, workspace);
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Communicate between grids
 ////////////////////////////////////////////////////////////////////////////////
--- a/Grid/log/Log.cc
+++ b/Grid/log/Log.cc
@ -69,6 +69,7 @@ GridLogger GridLogDebug  (1, "Debug", GridLogColours, "PURPLE");
 GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
 GridLogger GridLogIterative  (1, "Iterative", GridLogColours, "BLUE");
 GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
 GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE");
 void GridLogConfigure(std::vector<std::string> &logstreams) {
  GridLogError.Active(0);
@ -79,6 +80,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
  GridLogPerformance.Active(0);
  GridLogIntegrator.Active(1);
  GridLogColours.Active(0);
  GridLogHMC.Active(1);
  for (int i = 0; i < logstreams.size(); i++) {
    if (logstreams[i] == std::string("Error"))       GridLogError.Active(1);
@ -87,7 +89,8 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
    if (logstreams[i] == std::string("Iterative"))   GridLogIterative.Active(1);
    if (logstreams[i] == std::string("Debug"))       GridLogDebug.Active(1);
    if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
-    if (logstreams[i] == std::string("Integrator"))  GridLogIntegrator.Active(1);
+    if (logstreams[i] == std::string("NoIntegrator"))  GridLogIntegrator.Active(0);
    if (logstreams[i] == std::string("NoHMC"))         GridLogHMC.Active(0);
    if (logstreams[i] == std::string("Colours"))     GridLogColours.Active(1);
  }
 }
--- a/Grid/log/Log.h
+++ b/Grid/log/Log.h
@ -182,6 +182,7 @@ extern GridLogger GridLogDebug  ;
 extern GridLogger GridLogPerformance;
 extern GridLogger GridLogIterative  ;
 extern GridLogger GridLogIntegrator  ;
 extern GridLogger GridLogHMC;
 extern Colours    GridLogColours;
 std::string demangle(const char* name) ;
--- a/Grid/parallelIO/NerscIO.h
+++ b/Grid/parallelIO/NerscIO.h
@ -39,9 +39,11 @@ using namespace Grid;
 ////////////////////////////////////////////////////////////////////////////////
 class NerscIO : public BinaryIO { 
 public:
  typedef Lattice<vLorentzColourMatrixD> GaugeField;
  // Enable/disable exiting if the plaquette in the header does not match the value computed (default true)
  static bool & exitOnReadPlaquetteMismatch(){ static bool v=true; return v; }
  static inline void truncate(std::string file){
    std::ofstream fout(file,std::ios::out);
  }
@ -198,7 +200,7 @@ public:
      std::cerr << " nersc_csum  " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
      exit(0);
    }
-    assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
+    if(exitOnReadPlaquetteMismatch()) assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
    assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
    assert(nersc_csum == header.checksum );
--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@ -63,6 +63,7 @@ static constexpr int Ngp=2; // gparity index range
 #define ColourIndex  (2)
 #define SpinIndex    (1)
 #define LorentzIndex (0)
 #define GparityFlavourIndex (0)
 // Also should make these a named enum type
 static constexpr int DaggerNo=0;
@ -87,6 +88,8 @@ template<typename T> struct isCoarsened {
 template <typename T> using IfCoarsened    = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
 template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;
 const int GparityFlavourTensorIndex = 3; //TensorLevel counts from the bottom!
 // ChrisK very keen to add extra space for Gparity doubling.
 //
 // Also add domain wall index, in a way where Wilson operator 
@ -101,6 +104,7 @@ template<typename vtype> using iSpinMatrix                = iScalar<iMatrix<iSca
 template<typename vtype> using iColourMatrix              = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
 template<typename vtype> using iSpinColourMatrix          = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
 template<typename vtype> using iLorentzColourMatrix       = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
 template<typename vtype> using iLorentzVector             = iVector<iScalar<iScalar<vtype> >, Nd > ;
 template<typename vtype> using iDoubleStoredColourMatrix  = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
 template<typename vtype> using iSpinVector                = iScalar<iVector<iScalar<vtype>, Ns> >;
 template<typename vtype> using iColourVector              = iScalar<iScalar<iVector<vtype, Nc> > >;
@ -110,8 +114,10 @@ template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVec
    template<typename vtype> using iSpinColourSpinColourMatrix  = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;
 template<typename vtype> using iGparityFlavourVector                = iVector<iScalar<iScalar<vtype> >, Ngp>;
 template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
 template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
 template<typename vtype> using iGparityFlavourMatrix = iMatrix<iScalar<iScalar<vtype> >, Ngp>;
 // Spin matrix
 typedef iSpinMatrix<Complex  >          SpinMatrix;
@ -158,7 +164,16 @@ typedef iSpinColourSpinColourMatrix<vComplex >    vSpinColourSpinColourMatrix;
 typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
 typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;
-// LorentzColour
+// LorentzVector
 typedef iLorentzVector<Complex  > LorentzVector;
 typedef iLorentzVector<ComplexF > LorentzVectorF;
 typedef iLorentzVector<ComplexD > LorentzVectorD;
 typedef iLorentzVector<vComplex > vLorentzVector;
 typedef iLorentzVector<vComplexF> vLorentzVectorF;
 typedef iLorentzVector<vComplexD> vLorentzVectorD;
 // LorentzColourMatrix
 typedef iLorentzColourMatrix<Complex  > LorentzColourMatrix;
 typedef iLorentzColourMatrix<ComplexF > LorentzColourMatrixF;
 typedef iLorentzColourMatrix<ComplexD > LorentzColourMatrixD;
@ -176,6 +191,16 @@ typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
 typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
 typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;
 //G-parity flavour matrix
 typedef iGparityFlavourMatrix<Complex> GparityFlavourMatrix;
 typedef iGparityFlavourMatrix<ComplexF> GparityFlavourMatrixF;
 typedef iGparityFlavourMatrix<ComplexD> GparityFlavourMatrixD;
 typedef iGparityFlavourMatrix<vComplex> vGparityFlavourMatrix;
 typedef iGparityFlavourMatrix<vComplexF> vGparityFlavourMatrixF;
 typedef iGparityFlavourMatrix<vComplexD> vGparityFlavourMatrixD;
 // Spin vector
 typedef iSpinVector<Complex >           SpinVector;
 typedef iSpinVector<ComplexF>           SpinVectorF;
@ -220,6 +245,16 @@ typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
 typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
 typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
 typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
 //G-parity flavour vector
 typedef iGparityFlavourVector<Complex >         GparityFlavourVector;
 typedef iGparityFlavourVector<ComplexF>         GparityFlavourVectorF;
 typedef iGparityFlavourVector<ComplexD>         GparityFlavourVectorD;
 typedef iGparityFlavourVector<vComplex >         vGparityFlavourVector;
 typedef iGparityFlavourVector<vComplexF>         vGparityFlavourVectorF;
 typedef iGparityFlavourVector<vComplexD>         vGparityFlavourVectorD;
 // singlets
 typedef iSinglet<Complex >         TComplex;     // FIXME This is painful. Tensor singlet complex type.
@ -263,6 +298,10 @@ typedef Lattice<vLorentzColourMatrix>  LatticeLorentzColourMatrix;
 typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
 typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD;
 typedef Lattice<vLorentzVector>  LatticeLorentzVector;
 typedef Lattice<vLorentzVectorF> LatticeLorentzVectorF;
 typedef Lattice<vLorentzVectorD> LatticeLorentzVectorD;
 // DoubleStored gauge field
 typedef Lattice<vDoubleStoredColourMatrix>  LatticeDoubleStoredColourMatrix;
 typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF;
--- a/Grid/qcd/action/Action.h
+++ b/Grid/qcd/action/Action.h
@ -30,8 +30,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#ifndef GRID_QCD_ACTION_H
+#pragma once
 #define GRID_QCD_ACTION_H
 ////////////////////////////////////////////
 // Abstract base interface
@ -51,4 +50,4 @@ NAMESPACE_CHECK(Fermion);
 #include <Grid/qcd/action/pseudofermion/PseudoFermion.h>
 NAMESPACE_CHECK(PseudoFermion);
-#endif
+
--- a/Grid/qcd/action/ActionBase.h
+++ b/Grid/qcd/action/ActionBase.h
@ -40,6 +40,29 @@ class Action
 public:
  bool is_smeared = false;
  RealD deriv_norm_sum;
  RealD deriv_max_sum;
  int   deriv_num;
  RealD deriv_us;
  RealD S_us;
  RealD refresh_us;
  void  reset_timer(void)        {
    deriv_us = S_us = refresh_us = 0.0;
    deriv_num=0;
    deriv_norm_sum = deriv_max_sum=0.0;
  }
  void  deriv_log(RealD nrm, RealD max) { deriv_max_sum+=max; deriv_norm_sum+=nrm; deriv_num++;}
  RealD deriv_max_average(void)         { return deriv_max_sum/deriv_num; };
  RealD deriv_norm_average(void)        { return deriv_norm_sum/deriv_num; };
  RealD deriv_timer(void)        { return deriv_us; };
  RealD S_timer(void)            { return deriv_us; };
  RealD refresh_timer(void)      { return deriv_us; };
  void deriv_timer_start(void)   { deriv_us-=usecond(); }
  void deriv_timer_stop(void)    { deriv_us+=usecond(); }
  void refresh_timer_start(void) { refresh_us-=usecond(); }
  void refresh_timer_stop(void)  { refresh_us+=usecond(); }
  void S_timer_start(void)       { S_us-=usecond(); }
  void S_timer_stop(void)        { S_us+=usecond(); }
  // Heatbath?
  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
  virtual RealD S(const GaugeField& U) = 0;                             // evaluate the action
--- a/Grid/qcd/action/ActionCore.h
+++ b/Grid/qcd/action/ActionCore.h
@ -58,6 +58,8 @@ NAMESPACE_CHECK(Scalar);
 ////////////////////////////////////////////
 // Utility functions
 ////////////////////////////////////////////
 #include <Grid/qcd/action/domains/Domains.h>
 #include <Grid/qcd/utils/Metric.h>
 NAMESPACE_CHECK(Metric);
 #include <Grid/qcd/utils/CovariantLaplacian.h>
--- a/Grid/qcd/action/ActionParams.h
+++ b/Grid/qcd/action/ActionParams.h
@ -36,28 +36,34 @@ NAMESPACE_BEGIN(Grid);
 // These can move into a params header and be given MacroMagic serialisation
 struct GparityWilsonImplParams {
-  Coordinate twists;
+  Coordinate twists; //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
-  GparityWilsonImplParams() : twists(Nd, 0) {};
+                     //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
  bool locally_periodic;
  GparityWilsonImplParams() : twists(Nd, 0), locally_periodic(false) {};
 };
 struct WilsonImplParams {
  bool overlapCommsCompute;
  bool locally_periodic;
  AcceleratorVector<Real,Nd> twist_n_2pi_L;
  AcceleratorVector<Complex,Nd> boundary_phases;
  WilsonImplParams()  {
    boundary_phases.resize(Nd, 1.0);
      twist_n_2pi_L.resize(Nd, 0.0);
      locally_periodic = false;
  };
  WilsonImplParams(const AcceleratorVector<Complex,Nd> phi) : boundary_phases(phi), overlapCommsCompute(false) {
    twist_n_2pi_L.resize(Nd, 0.0);
    locally_periodic = false;
  }
 };
 struct StaggeredImplParams {
-  StaggeredImplParams()  {};
+  bool locally_periodic;
  StaggeredImplParams() : locally_periodic(false) {};
 };
-  struct OneFlavourRationalParams : Serializable {
+struct OneFlavourRationalParams : Serializable {
    GRID_SERIALIZABLE_CLASS_MEMBERS(OneFlavourRationalParams, 
 				    RealD, lo, 
 				    RealD, hi, 
@ -85,6 +91,50 @@ struct StaggeredImplParams {
        precision(_precision),
        BoundsCheckFreq(_BoundsCheckFreq){};
  };
  /*Action parameters for the generalized rational action
    The approximation is for (M^dag M)^{1/inv_pow}
    where inv_pow is the denominator of the fractional power.
    Default inv_pow=2 for square root, making this equivalent to 
    the OneFlavourRational action
  */
    struct RationalActionParams : Serializable {
    GRID_SERIALIZABLE_CLASS_MEMBERS(RationalActionParams, 
 				    int, inv_pow, 
 				    RealD, lo, //low eigenvalue bound of rational approx
 				    RealD, hi, //high eigenvalue bound of rational approx
 				    int,   MaxIter,  //maximum iterations in msCG
 				    RealD, action_tolerance,  //msCG tolerance in action evaluation
 				    int,   action_degree, //rational approx tolerance in action evaluation
 				    RealD, md_tolerance,  //msCG tolerance in MD integration
 				    int,   md_degree, //rational approx tolerance in MD integration
 				    int,   precision, //precision of floating point arithmetic
 				    int,   BoundsCheckFreq); //frequency the approximation is tested (with Metropolis degree/tolerance); 0 disables the check
  // constructor 
  RationalActionParams(int _inv_pow = 2,
 		       RealD _lo      = 0.0, 
 		       RealD _hi      = 1.0, 
 		       int _maxit     = 1000,
 		       RealD _action_tolerance      = 1.0e-8, 
 		       int _action_degree    = 10,
 		       RealD _md_tolerance      = 1.0e-8, 
 		       int _md_degree    = 10,
 		       int _precision = 64,
 		       int _BoundsCheckFreq=20)
    : inv_pow(_inv_pow), 
      lo(_lo),
      hi(_hi),
      MaxIter(_maxit),
      action_tolerance(_action_tolerance),
      action_degree(_action_degree),
      md_tolerance(_md_tolerance),
      md_degree(_md_degree),
      precision(_precision),
      BoundsCheckFreq(_BoundsCheckFreq){};
  };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/domains/DDHMCFilter.h
+++ b/Grid/qcd/action/domains/DDHMCFilter.h
@ -0,0 +1,52 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/hmc/DDHMC.h
 Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Christopher Kelly
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
 NAMESPACE_BEGIN(Grid);
 ////////////////////////////////////////////////////
 // DDHMC filter with sub-block size B[mu]
 ////////////////////////////////////////////////////
 template<typename MomentaField>
 struct DDHMCFilter: public MomentumFilterBase<MomentaField>
 {
  Coordinate Block;
  int Width;
  DDHMCFilter(const Coordinate &_Block): Block(_Block) {}
  void applyFilter(MomentaField &P) const override
  {
    DomainDecomposition Domains(Block);
    Domains.ProjectDDHMC(P);
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/domains/DirichletFilter.h
+++ b/Grid/qcd/action/domains/DirichletFilter.h
@ -0,0 +1,98 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/momentum/DirichletFilter.h
 Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
 ////////////////////////////////////////////////////
 // Dirichlet filter with sub-block size B[mu]
 ////////////////////////////////////////////////////
 #pragma once 
 #include <Grid/qcd/action/domains/DomainDecomposition.h>
 NAMESPACE_BEGIN(Grid);
 template<typename MomentaField>
 struct DirichletFilter: public MomentumFilterBase<MomentaField>
 {
  Coordinate Block;
  DirichletFilter(const Coordinate &_Block): Block(_Block) {}
  // Edge detect using domain projectors
  void applyFilter (MomentaField &U) const override
  {
    DomainDecomposition Domains(Block);
    GridBase *grid = U.Grid();
    LatticeInteger  coor(grid);
    LatticeInteger  face(grid);
    LatticeInteger  one(grid);   one = 1;
    LatticeInteger  zero(grid); zero = 0;
    LatticeInteger  omega(grid);
    LatticeInteger  omegabar(grid);
    LatticeInteger  tmp(grid);
    omega=one;    Domains.ProjectDomain(omega,0);
    omegabar=one; Domains.ProjectDomain(omegabar,1);
    LatticeInteger nface(grid); nface=Zero();
    MomentaField projected(grid); projected=Zero();
    typedef decltype(PeekIndex<LorentzIndex>(U,0)) MomentaLinkField;
    MomentaLinkField  Umu(grid);
    MomentaLinkField   zz(grid); zz=Zero();
    int dims = grid->Nd();
    Coordinate Global=grid->GlobalDimensions();
    assert(dims==Nd);
    for(int mu=0;mu<Nd;mu++){
      if ( Block[mu]!=0 ) {
 	Umu = PeekIndex<LorentzIndex>(U,mu);
 	// Upper face 
 	tmp = Cshift(omegabar,mu,1);
 	tmp = tmp + omega;
 	face = where(tmp == Integer(2),one,zero );
 	tmp = Cshift(omega,mu,1);
 	tmp = tmp + omegabar;
 	face = where(tmp == Integer(2),one,face );
 	Umu = where(face,zz,Umu);
 	PokeIndex<LorentzIndex>(U, Umu, mu);
      }
    }
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/domains/DomainDecomposition.h
+++ b/Grid/qcd/action/domains/DomainDecomposition.h
@ -0,0 +1,187 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/domains/DomainDecomposition.h
 Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
 ////////////////////////////////////////////////////
 // Dirichlet filter with sub-block size B[mu]
 ////////////////////////////////////////////////////
 #pragma once 
 NAMESPACE_BEGIN(Grid);
 struct DomainDecomposition
 {
  Coordinate Block;
  static constexpr RealD factor = 0.6;
  DomainDecomposition(const Coordinate &_Block): Block(_Block){ assert(Block.size()==Nd);};
  template<class Field>
  void ProjectDomain(Field &f,Integer domain)
  {
    GridBase *grid = f.Grid();
    int dims = grid->Nd();
    int isDWF= (dims==Nd+1);
    assert((dims==Nd)||(dims==Nd+1));
    Field   zz(grid);  zz = Zero();
    LatticeInteger coor(grid);
    LatticeInteger domaincoor(grid);
    LatticeInteger mask(grid); mask = Integer(1);
    LatticeInteger zi(grid);     zi = Integer(0);
    for(int d=0;d<Nd;d++){
      Integer B= Block[d];
      if ( B ) {
 	LatticeCoordinate(coor,d+isDWF);
 	domaincoor = mod(coor,B);
 	mask = where(domaincoor==Integer(0),zi,mask);
 	mask = where(domaincoor==Integer(B-1),zi,mask);
      }
    }
    if ( !domain )
      f = where(mask==Integer(1),f,zz);
    else 
      f = where(mask==Integer(0),f,zz);
  };
  template<class GaugeField>
  void ProjectDDHMC(GaugeField &U)
  {
    GridBase *grid = U.Grid();
    Coordinate Global=grid->GlobalDimensions();
    GaugeField zzz(grid); zzz = Zero();
    LatticeInteger coor(grid); 
    GaugeField Uorg(grid); Uorg = U;
    auto zzz_mu = PeekIndex<LorentzIndex>(zzz,0);
    ////////////////////////////////////////////////////
    // Zero BDY layers
    ////////////////////////////////////////////////////
    for(int mu=0;mu<Nd;mu++) {
      Integer B1 = Block[mu];
      if ( B1 && (B1 <= Global[mu]) ) {
 	LatticeCoordinate(coor,mu);
 	////////////////////////////////
 	// OmegaBar - zero all links contained in slice B-1,0 and
 	// mu links connecting to Omega
 	////////////////////////////////
 	U    = where(mod(coor,B1)==Integer(B1-1),zzz,U);
 	U    = where(mod(coor,B1)==Integer(0)   ,zzz,U); 
 	auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
 	U_mu = where(mod(coor,B1)==Integer(B1-2),zzz_mu,U_mu); 
 	PokeIndex<LorentzIndex>(U, U_mu, mu);
      }
    }
    ////////////////////////////////////////////
    // Omega interior slow the evolution
    // Tricky as we need to take the smallest of values imposed by each cut
    // Do them in order or largest to smallest and smallest writes last
    ////////////////////////////////////////////
    RealD f= factor;
 #if 0    
    for(int mu=0;mu<Nd;mu++) {
      Integer B1 = Block[mu];
      if ( B1 && (B1 <= Global[mu]) ) {
 	auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
 	auto Uorg_mu= PeekIndex<LorentzIndex>(Uorg,mu);
 	// In the plane
 	U = where(mod(coor,B1)==Integer(B1-5),Uorg*f,U); 
 	U = where(mod(coor,B1)==Integer(4)   ,Uorg*f,U); 
 	// Perp links
       	U_mu = where(mod(coor,B1)==Integer(B1-6),Uorg_mu*f,U_mu);
 	U_mu = where(mod(coor,B1)==Integer(4)   ,Uorg_mu*f,U_mu);
 	PokeIndex<LorentzIndex>(U, U_mu, mu);
      }
    }
 #endif
    for(int mu=0;mu<Nd;mu++) {
      Integer B1 = Block[mu];
      if ( B1 && (B1 <= Global[mu]) ) {
 	auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
 	auto Uorg_mu= PeekIndex<LorentzIndex>(Uorg,mu);
 	// In the plane
 	U = where(mod(coor,B1)==Integer(B1-4),Uorg*f*f,U); 
 	U = where(mod(coor,B1)==Integer(3)   ,Uorg*f*f,U); 
 	// Perp links
       	U_mu = where(mod(coor,B1)==Integer(B1-5),Uorg_mu*f*f,U_mu);
 	U_mu = where(mod(coor,B1)==Integer(3)   ,Uorg_mu*f*f,U_mu);
 	PokeIndex<LorentzIndex>(U, U_mu, mu);
      }
    }
    for(int mu=0;mu<Nd;mu++) {
      Integer B1 = Block[mu];
      if ( B1 && (B1 <= Global[mu]) ) {
 	auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
 	auto Uorg_mu= PeekIndex<LorentzIndex>(Uorg,mu);
 	// In the plane
 	U = where(mod(coor,B1)==Integer(B1-3),Uorg*f*f*f,U); 
 	U = where(mod(coor,B1)==Integer(2)   ,Uorg*f*f*f,U); 
 	// Perp links
       	U_mu = where(mod(coor,B1)==Integer(B1-4),Uorg_mu*f*f*f,U_mu);
 	U_mu = where(mod(coor,B1)==Integer(2)   ,Uorg_mu*f*f*f,U_mu);
 	PokeIndex<LorentzIndex>(U, U_mu, mu);
      }
    }
    for(int mu=0;mu<Nd;mu++) {
      Integer B1 = Block[mu];
      if ( B1 && (B1 <= Global[mu]) ) {
 	auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
 	auto Uorg_mu= PeekIndex<LorentzIndex>(Uorg,mu);
 	// In the plane
 	U = where(mod(coor,B1)==Integer(B1-2),zzz,U); 
 	U = where(mod(coor,B1)==Integer(1)   ,zzz,U); 
 	// Perp links
 	U_mu = where(mod(coor,B1)==Integer(B1-3),Uorg_mu*f*f*f*f,U_mu);
 	U_mu = where(mod(coor,B1)==Integer(1)   ,Uorg_mu*f*f*f*f,U_mu);
 	PokeIndex<LorentzIndex>(U, U_mu, mu);
      }
    }
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/domains/Domains.h
+++ b/Grid/qcd/action/domains/Domains.h
@ -0,0 +1,39 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/momentum/Domains.h
 Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
 ////////////////////////////////////////////////////
 // Dirichlet filter with sub-block size B[mu]
 ////////////////////////////////////////////////////
 #pragma once 
 #include <Grid/qcd/action/domains/DomainDecomposition.h>
 #include <Grid/qcd/action/domains/MomentumFilter.h>
 #include <Grid/qcd/action/domains/DirichletFilter.h>
 #include <Grid/qcd/action/domains/DDHMCFilter.h>
--- a/Grid/qcd/hmc/integrators/MomentumFilter.h
+++ b/Grid/qcd/hmc/integrators/MomentumFilter.h
@ -28,8 +28,7 @@ directory
 *************************************************************************************/
 /*  END LEGAL */
 //--------------------------------------------------------------------
-#ifndef MOMENTUM_FILTER
+#pragma once 
 #define MOMENTUM_FILTER
 NAMESPACE_BEGIN(Grid);
@ -37,7 +36,7 @@ NAMESPACE_BEGIN(Grid);
 template<typename MomentaField>
 struct MomentumFilterBase{
-  virtual void applyFilter(MomentaField &P) const;
+  virtual void applyFilter(MomentaField &P) const = 0;
 };
 //Do nothing
@ -90,5 +89,3 @@ struct MomentumFilterApplyPhase: public MomentumFilterBase<MomentaField>{
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@ -60,6 +60,8 @@ public:
  ///////////////////////////////////////////////////////////////
  virtual void Dminus(const FermionField &psi, FermionField &chi);
  virtual void DminusDag(const FermionField &psi, FermionField &chi);
  virtual void ImportFourDimPseudoFermion(const FermionField &input,FermionField &imported);
  virtual void ExportFourDimPseudoFermion(const FermionField &solution,FermionField &exported);
  virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
  virtual void ExportPhysicalFermionSource(const FermionField &solution5d, FermionField &exported4d);
  virtual void ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d);
--- a/Grid/qcd/action/fermion/DirichletFermionOperator.h
+++ b/Grid/qcd/action/fermion/DirichletFermionOperator.h
@ -0,0 +1,185 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/DirichletFermionOperator.h
    Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 ////////////////////////////////////////////////////////////////
 // Wrap a fermion operator in Dirichlet BC's at node boundary
 ////////////////////////////////////////////////////////////////
 template<class Impl>
 class DirichletFermionOperator : public FermionOperator<Impl>
 {
 public:
  INHERIT_IMPL_TYPES(Impl);
  // Data members
  int CommsMode;
  Coordinate Block;
  DirichletFilter<GaugeField> Filter;
  FermionOperator<Impl> & FermOp;
  // Constructor / bespoke
  DirichletFermionOperator(FermionOperator<Impl> & _FermOp, Coordinate &_Block)
    : FermOp(_FermOp), Block(_Block), Filter(Block)
  {
    // Save what the comms mode should be under normal BCs
    CommsMode = WilsonKernelsStatic::Comms;
    assert((CommsMode == WilsonKernelsStatic::CommsAndCompute)
         ||(CommsMode == WilsonKernelsStatic::CommsThenCompute));
    // Check the block size divides local lattice
    GridBase *grid = FermOp.GaugeGrid();
    int blocks_per_rank = 1;
    Coordinate LocalDims = grid->LocalDimensions();
    Coordinate GlobalDims= grid->GlobalDimensions();
    assert(Block.size()==LocalDims.size());
    for(int d=0;d<LocalDims.size();d++){
      if (Block[d]&&(Block[d]<=GlobalDims[d])){
 	int r = LocalDims[d] % Block[d];
 	assert(r == 0);
 	blocks_per_rank *= (LocalDims[d] / Block[d]);
      }
    }
    // Even blocks per node required // could be relaxed but inefficient use of hardware as idle nodes in boundary operator R
    assert( blocks_per_rank != 0);
    // Possible checks that SIMD lanes are used with full occupancy???
  };
  virtual ~DirichletFermionOperator(void) = default;
  void DirichletOn(void)   {
    assert(WilsonKernelsStatic::Comms!= WilsonKernelsStatic::CommsDirichlet);
    //    WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsDirichlet;
  }
  void DirichletOff(void)  {
    //    assert(WilsonKernelsStatic::Comms== WilsonKernelsStatic::CommsDirichlet);
    //    WilsonKernelsStatic::Comms = CommsMode;
  }
  // Implement the full interface
  virtual FermionField &tmp(void) { return FermOp.tmp(); };
  virtual GridBase *FermionGrid(void)         { return FermOp.FermionGrid(); }
  virtual GridBase *FermionRedBlackGrid(void) { return FermOp.FermionRedBlackGrid(); }
  virtual GridBase *GaugeGrid(void)           { return FermOp.GaugeGrid(); }
  virtual GridBase *GaugeRedBlackGrid(void)   { return FermOp.GaugeRedBlackGrid(); }
  // override multiply
  virtual void  M    (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.M(in,out);    DirichletOff();  };
  virtual void  Mdag (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.Mdag(in,out); DirichletOff();  };
  // half checkerboard operaions
  virtual void   Meooe       (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.Meooe(in,out);    DirichletOff(); };  
  virtual void   MeooeDag    (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.MeooeDag(in,out); DirichletOff(); };
  virtual void   Mooee       (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.Mooee(in,out);    DirichletOff(); };
  virtual void   MooeeDag    (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.MooeeDag(in,out); DirichletOff(); };
  virtual void   MooeeInv    (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.MooeeInv(in,out); DirichletOff(); };
  virtual void   MooeeInvDag (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.MooeeInvDag(in,out); DirichletOff(); };
  // non-hermitian hopping term; half cb or both
  virtual void Dhop  (const FermionField &in, FermionField &out,int dag) { DirichletOn(); FermOp.Dhop(in,out,dag);    DirichletOff(); };
  virtual void DhopOE(const FermionField &in, FermionField &out,int dag) { DirichletOn(); FermOp.DhopOE(in,out,dag);  DirichletOff(); };
  virtual void DhopEO(const FermionField &in, FermionField &out,int dag) { DirichletOn(); FermOp.DhopEO(in,out,dag);  DirichletOff(); };
  virtual void DhopDir(const FermionField &in, FermionField &out,int dir,int disp) { DirichletOn(); FermOp.DhopDir(in,out,dir,disp);  DirichletOff(); };
  // force terms; five routines; default to Dhop on diagonal
  virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.MDeriv(mat,U,V,dag);};
  virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.MoeDeriv(mat,U,V,dag);};
  virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.MeoDeriv(mat,U,V,dag);};
  virtual void MooDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.MooDeriv(mat,U,V,dag);};
  virtual void MeeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.MeeDeriv(mat,U,V,dag);};
  virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.DhopDeriv(mat,U,V,dag);};
  virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.DhopDerivEO(mat,U,V,dag);};
  virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.DhopDerivOE(mat,U,V,dag);};
  virtual void  Mdiag  (const FermionField &in, FermionField &out) { Mooee(in,out);};
  virtual void  Mdir   (const FermionField &in, FermionField &out,int dir,int disp){FermOp.Mdir(in,out,dir,disp);};
  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out)    {FermOp.MdirAll(in,out);};
  ///////////////////////////////////////////////
  // Updates gauge field during HMC
  ///////////////////////////////////////////////
  DoubledGaugeField &GetDoubledGaugeField(void){ return FermOp.GetDoubledGaugeField(); };
  DoubledGaugeField &GetDoubledGaugeFieldE(void){ return FermOp.GetDoubledGaugeFieldE(); };
  DoubledGaugeField &GetDoubledGaugeFieldO(void){ return FermOp.GetDoubledGaugeFieldO(); };
  virtual void ImportGauge(const GaugeField & _U)
  {
    GaugeField U = _U;
    // Filter gauge field to apply Dirichlet
    Filter.applyFilter(U);
    FermOp.ImportGauge(U);
  }
  ///////////////////////////////////////////////
  // Physical field import/export
  ///////////////////////////////////////////////
  virtual void Dminus(const FermionField &psi, FermionField &chi)    { FermOp.Dminus(psi,chi); }
  virtual void DminusDag(const FermionField &psi, FermionField &chi) { FermOp.DminusDag(psi,chi); }
  virtual void ImportFourDimPseudoFermion(const FermionField &input,FermionField &imported)   { FermOp.ImportFourDimPseudoFermion(input,imported);}
  virtual void ExportFourDimPseudoFermion(const FermionField &solution,FermionField &exported){ FermOp.ExportFourDimPseudoFermion(solution,exported);}
  virtual void ImportPhysicalFermionSource(const FermionField &input,FermionField &imported)  { FermOp.ImportPhysicalFermionSource(input,imported);}
  virtual void ImportUnphysicalFermion(const FermionField &input,FermionField &imported)      { FermOp.ImportUnphysicalFermion(input,imported);}
  virtual void ExportPhysicalFermionSolution(const FermionField &solution,FermionField &exported) {FermOp.ExportPhysicalFermionSolution(solution,exported);}
  virtual void ExportPhysicalFermionSource(const FermionField &solution,FermionField &exported)   {FermOp.ExportPhysicalFermionSource(solution,exported);}
  //////////////////////////////////////////////////////////////////////
  // Should never be used
  //////////////////////////////////////////////////////////////////////
  virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);};
  virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary,std::vector<double> twist) {assert(0);}
  virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) { assert(0);}
  virtual void ContractConservedCurrent(PropagatorField &q_in_1,
 					PropagatorField &q_in_2,
 					PropagatorField &q_out,
 					PropagatorField &phys_src,
 					Current curr_type,
 					unsigned int mu)
  {assert(0);};
  virtual void SeqConservedCurrent(PropagatorField &q_in, 
 				   PropagatorField &q_out,
 				   PropagatorField &phys_src,
 				   Current curr_type,
 				   unsigned int mu,
 				   unsigned int tmin, 
 				   unsigned int tmax,
 				   ComplexField &lattice_cmplx)
  {assert(0);};
      // Only reimplemented in Wilson5D 
      // Default to just a zero correlation function
  virtual void ContractJ5q(FermionField &q_in   ,ComplexField &J5q) { J5q=Zero(); };
  virtual void ContractJ5q(PropagatorField &q_in,ComplexField &J5q) { J5q=Zero(); };
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/Fermion.h
+++ b/Grid/qcd/action/fermion/Fermion.h
@ -101,6 +101,12 @@ NAMESPACE_CHECK(WilsonTM5);
 #include <Grid/qcd/action/fermion/PauliVillarsInverters.h>
 #include <Grid/qcd/action/fermion/Reconstruct5Dprop.h>
 #include <Grid/qcd/action/fermion/MADWF.h>
 ////////////////////////////////////////////////////////////////////
 // DDHMC related 
 ////////////////////////////////////////////////////////////////////
 #include <Grid/qcd/action/fermion/DirichletFermionOperator.h>
 #include <Grid/qcd/action/fermion/SchurFactoredFermionOperator.h>
 NAMESPACE_CHECK(DWFutils);
 ////////////////////////////////////////////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/FermionCore.h
+++ b/Grid/qcd/action/fermion/FermionCore.h
@ -25,8 +25,7 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#ifndef  GRID_QCD_FERMION_CORE_H
+#pragma once
 #define  GRID_QCD_FERMION_CORE_H
 #include <Grid/GridCore.h>
 #include <Grid/GridQCDcore.h>
@ -45,4 +44,3 @@ NAMESPACE_CHECK(FermionOperator);
 #include <Grid/qcd/action/fermion/StaggeredKernels.h>        //used by all wilson type fermions
 NAMESPACE_CHECK(Kernels);
 #endif
--- a/Grid/qcd/action/fermion/FermionOperator.h
+++ b/Grid/qcd/action/fermion/FermionOperator.h
@ -140,6 +140,9 @@ public:
  // Updates gauge field during HMC
  ///////////////////////////////////////////////
  virtual void ImportGauge(const GaugeField & _U)=0;
  virtual DoubledGaugeField &GetDoubledGaugeField(void)  =0;
  virtual DoubledGaugeField &GetDoubledGaugeFieldE(void)  =0;
  virtual DoubledGaugeField &GetDoubledGaugeFieldO(void)  =0;
  //////////////////////////////////////////////////////////////////////
  // Conserved currents, either contract at sink or insert sequentially.
@ -171,6 +174,16 @@ public:
      ///////////////////////////////////////////////
      virtual void Dminus(const FermionField &psi, FermionField &chi)    { chi=psi; }
      virtual void DminusDag(const FermionField &psi, FermionField &chi) { chi=psi; }
      virtual void ImportFourDimPseudoFermion(const FermionField &input,FermionField &imported)
      {
 	imported = input;
      };
      virtual void ExportFourDimPseudoFermion(const FermionField &solution,FermionField &exported)
      {
 	exported=solution;
      };
      virtual void ImportPhysicalFermionSource(const FermionField &input,FermionField &imported)
      {
 	imported = input;
--- a/Grid/qcd/action/fermion/GparityWilsonImpl.h
+++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h
@ -30,6 +30,18 @@ directory
 NAMESPACE_BEGIN(Grid);
 /*
  Policy implementation for G-parity boundary conditions
  Rather than treating the gauge field as a flavored field, the Grid implementation of G-parity treats the gauge field as a regular
  field with complex conjugate boundary conditions. In order to ensure the second flavor interacts with the conjugate links and the first
  with the regular links we overload the functionality of doubleStore, whose purpose is to store the gauge field and the barrel-shifted gauge field
  to avoid communicating links when applying the Dirac operator, such that the double-stored field contains also a flavor index which maps to
  either the link or the conjugate link. This flavored field is then used by multLink to apply the correct link to a spinor.
  Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
  mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
 */
 template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
 class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
 public:
@ -113,7 +125,7 @@ public:
    || ((distance== 1)&&(icoor[direction]==1))
    || ((distance==-1)&&(icoor[direction]==0));
-    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world
+    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu] && mmu < Nd-1; //only if we are going around the world in a spatial direction
    //Apply the links
    int f_upper = permute_lane ? 1 : 0;
@ -139,10 +151,10 @@ public:
    assert((distance == 1) || (distance == -1));  // nearest neighbour stencil hard code
    assert((sl == 1) || (sl == 2));
-    if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
+    //If this site is an global boundary site, perform the G-parity flavor twist
-
+    if ( mmu < Nd-1 && SE->_around_the_world && St.parameters.twists[mmu] ) {
      if ( sl == 2 ) {
-       
+	//Only do the twist for lanes on the edge of the physical node
 	ExtractBuffer<sobj> vals(Nsimd);
 	extract(chi,vals);
@ -197,6 +209,19 @@ public:
    reg = memory;
  }
  //Poke 'poke_f0' onto flavor 0 and 'poke_f1' onto flavor 1 in direction mu of the doubled gauge field Uds
  inline void pokeGparityDoubledGaugeField(DoubledGaugeField &Uds, const GaugeLinkField &poke_f0, const GaugeLinkField &poke_f1, const int mu){
    autoView(poke_f0_v, poke_f0, CpuRead);
    autoView(poke_f1_v, poke_f1, CpuRead);
    autoView(Uds_v, Uds, CpuWrite);
    thread_foreach(ss,poke_f0_v,{
 	Uds_v[ss](0)(mu) = poke_f0_v[ss]();
 	Uds_v[ss](1)(mu) = poke_f1_v[ss]();
      });
  }
  inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
  {
    conformable(Uds.Grid(),GaugeGrid);
@ -207,14 +232,19 @@ public:
    GaugeLinkField Uconj(GaugeGrid);
    Lattice<iScalar<vInteger> > coor(GaugeGrid);
-        
+
-    for(int mu=0;mu<Nd;mu++){
+    //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
-          
+    //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs        
-      LatticeCoordinate(coor,mu);
+    for(int mu=0;mu<Nd-1;mu++){
      if( Params.twists[mu] ){
 	LatticeCoordinate(coor,mu);
      }
      U     = PeekIndex<LorentzIndex>(Umu,mu);
      Uconj = conjugate(U);
      // Implement the isospin rotation sign on the boundary between f=1 and f=0
      // This phase could come from a simple bc 1,1,-1,1 ..
      int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
      if ( Params.twists[mu] ) { 
@ -229,7 +259,7 @@ public:
 	thread_foreach(ss,U_v,{
 	    Uds_v[ss](0)(mu) = U_v[ss]();
 	    Uds_v[ss](1)(mu) = Uconj_v[ss]();
-	  });
+	});
      }
      U     = adj(Cshift(U    ,mu,-1));      // correct except for spanning the boundary
@ -260,6 +290,38 @@ public:
        });
      }
    }
    { //periodic / antiperiodic temporal BCs
      int mu = Nd-1;
      int L   = GaugeGrid->GlobalDimensions()[mu];
      int Lmu = L - 1;
      LatticeCoordinate(coor, mu);
      U = PeekIndex<LorentzIndex>(Umu, mu); //Get t-directed links
      GaugeLinkField *Upoke = &U;
      if(Params.twists[mu]){ //antiperiodic
 	Utmp =  where(coor == Lmu, -U, U);
 	Upoke = &Utmp;
      }
      Uconj = conjugate(*Upoke); //second flavor interacts with conjugate links      
      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu);
      //Get the barrel-shifted field
      Utmp = adj(Cshift(U, mu, -1)); //is a forward shift!
      Upoke = &Utmp;
      if(Params.twists[mu]){
 	U = where(coor == 0, -Utmp, Utmp);  //boundary phase
 	Upoke = &U;
      }
      Uconj = conjugate(*Upoke);
      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4);
    }
  }
  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
@ -298,28 +360,48 @@ public:
  inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
    assert(0);
  }
-  
+ 
  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
-
+    int Ls=Btilde.Grid()->_fdimensions[0];
-    int Ls = Btilde.Grid()->_fdimensions[0];
+    
    GaugeLinkField tmp(mat.Grid());
    tmp = Zero();
    {
-      autoView( tmp_v , tmp, CpuWrite);
+      GridBase *GaugeGrid = mat.Grid();
-      autoView( Atilde_v , Atilde, CpuRead);
+      Lattice<iScalar<vInteger> > coor(GaugeGrid);
-      autoView( Btilde_v , Btilde, CpuRead);
+
-      thread_for(ss,tmp.Grid()->oSites(),{
+      if( Params.twists[mu] ){
-	  for (int s = 0; s < Ls; s++) {
+	LatticeCoordinate(coor,mu);
-	    int sF = s + Ls * ss;
+      }
-	    auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
+
-	    tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
+      autoView( mat_v , mat, AcceleratorWrite);
-	  }
+      autoView( Btilde_v , Btilde, AcceleratorRead);
-	});
+      autoView( Atilde_v , Atilde, AcceleratorRead);
      accelerator_for(sss,mat.Grid()->oSites(), FermionField::vector_type::Nsimd(),{	  
  	  int sU=sss;
  	  typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
  	  ColorMatrixType sum;
  	  zeroit(sum);
  	  for(int s=0;s<Ls;s++){
  	    int sF = s+Ls*sU;
  	    for(int spn=0;spn<Ns;spn++){ //sum over spin
 	      //Flavor 0
  	      auto bb = coalescedRead(Btilde_v[sF](0)(spn) ); //color vector
  	      auto aa = coalescedRead(Atilde_v[sF](0)(spn) );
  	      sum = sum + outerProduct(bb,aa);
  	      //Flavor 1
  	      bb = coalescedRead(Btilde_v[sF](1)(spn) );
  	      aa = coalescedRead(Atilde_v[sF](1)(spn) );
  	      sum = sum + conjugate(outerProduct(bb,aa));
  	    }
  	  }	    
  	  coalescedWrite(mat_v[sU](mu)(), sum);
  	});
    }
    PokeIndex<LorentzIndex>(mat, tmp, mu);
    return;
  }
 };
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
@ -141,8 +141,11 @@ public:
  void ImportGauge(const GaugeField &_Uthin, const GaugeField &_Ufat);
  void ImportGaugeSimple(const GaugeField &_UUU    ,const GaugeField &_U);
  void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
-  DoubledGaugeField &GetU(void)   { return Umu ; } ;
+  virtual DoubledGaugeField &GetDoubledGaugeField(void)  override { return Umu; };
-  DoubledGaugeField &GetUUU(void) { return UUUmu; };
+  virtual DoubledGaugeField &GetDoubledGaugeFieldE(void) override { return UmuEven; };
  virtual DoubledGaugeField &GetDoubledGaugeFieldO(void) override { return UmuOdd; };
  virtual DoubledGaugeField &GetU(void)   { return Umu ; } ;
  virtual DoubledGaugeField &GetUUU(void) { return UUUmu; };
  void CopyGaugeCheckerboards(void);
  ///////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@ -160,17 +160,20 @@ public:
 			       RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0,
 			     const ImplParams &p= ImplParams());
-    // DoubleStore gauge field in operator
+  // DoubleStore gauge field in operator
-    void ImportGauge      (const GaugeField &_Uthin ) { assert(0); }
+  void ImportGauge      (const GaugeField &_Uthin ) { assert(0); }
  void ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat);
-    void ImportGaugeSimple(const GaugeField &_UUU,const GaugeField &_U);
+  void ImportGaugeSimple(const GaugeField &_UUU,const GaugeField &_U);
-    void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
+  void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
-    // Give a reference; can be used to do an assignment or copy back out after import
+  // Give a reference; can be used to do an assignment or copy back out after import
-    // if Carleton wants to cache them and not use the ImportSimple
+  // if Carleton wants to cache them and not use the ImportSimple
-    DoubledGaugeField &GetU(void)   { return Umu ; } ;
+  virtual DoubledGaugeField &GetDoubledGaugeField(void)  override { return Umu; };
-    DoubledGaugeField &GetUUU(void) { return UUUmu; };
+  virtual DoubledGaugeField &GetDoubledGaugeFieldE(void) override { return UmuEven; };
-    void CopyGaugeCheckerboards(void);
+  virtual DoubledGaugeField &GetDoubledGaugeFieldO(void) override { return UmuOdd; };
-    
+  DoubledGaugeField &GetU(void)   { return Umu ; } ;
  DoubledGaugeField &GetUUU(void) { return UUUmu; };
  void CopyGaugeCheckerboards(void);
  ///////////////////////////////////////////////////////////////
  // Data members require to support the functionality
  ///////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
@ -135,6 +135,9 @@ public:
  // DoubleStore impl dependent
  void ImportGauge      (const GaugeField &_U );
  DoubledGaugeField &GetDoubledGaugeField(void){ return Umu; };
  DoubledGaugeField &GetDoubledGaugeFieldE(void){ return UmuEven; };
  DoubledGaugeField &GetDoubledGaugeFieldO(void){ return UmuOdd; };
  DoubledGaugeField &GetU(void)   { return Umu ; } ;
  void CopyGaugeCheckerboards(void);
--- a/Grid/qcd/action/fermion/SchurFactoredFermionOperator.h
+++ b/Grid/qcd/action/fermion/SchurFactoredFermionOperator.h
@ -0,0 +1,534 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/SchurFactoredFermionOperator.h
    Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 #include <Grid/qcd/utils/MixedPrecisionOperatorFunction.h>
 #include <Grid/qcd/action/domains/Domains.h>
 NAMESPACE_BEGIN(Grid);
  ////////////////////////////////////////////////////////
  // Some explanation of class structure for domain decomposition:
  //
  // Need a dirichlet operator for two flavour determinant - acts on both Omega and OmegaBar.
  //
  // Possible gain if the global sums and CG are run independently?? Could measure this.
  //
  // Types of operations
  //
  // 1) assemble local det dOmega det dOmegaBar pseudofermion
  //
  // - DirichletFermionOperator - can either do a global solve, or independent/per cell coefficients.
  //
  // 2) assemble dOmegaInverse and dOmegaBarInverse in R
  //
  // - DirichletFermionOperator - can also be used to 
  //                                       - need two or more cells per node. Options
  //                                       - a) solve one cell at a time, no new code, CopyRegion and reduced /split Grids
  //                                       - b) solve multiple cells in parallel. predicated dslash implementation
  //
  //                                       - b) has more parallelism, experience with block solver suggest might not be aalgorithmically inefficient
  //                                         a) has more cache friendly and easier code.
  //                                         b) is easy to implement in a "trial" or inefficient code with projection.
  //
  // 3)  Additional functionality for domain operations
  //
  // - SchurFactoredFermionOperator  - Need a DDHMC utility - whether used in two flavour or one flavour 
  //
  // - dBoundary - needs non-dirichlet operator
  // - Contains one Dirichlet Op, and one non-Dirichlet op. Implements dBoundary etc...
  // - The Dirichlet ops can be passed to dOmega(Bar) solvers etc...
  //
  ////////////////////////////////////////////////////////
 template<class ImplD,class ImplF>
 class SchurFactoredFermionOperator : public ImplD
 {
  INHERIT_IMPL_TYPES(ImplD);
  typedef typename ImplF::FermionField FermionFieldF;
  typedef typename ImplD::FermionField FermionFieldD;
  typedef SchurDiagMooeeOperator<FermionOperator<ImplD>,FermionFieldD> LinearOperatorD;
  typedef SchurDiagMooeeOperator<FermionOperator<ImplF>,FermionFieldF> LinearOperatorF;
  typedef SchurDiagMooeeDagOperator<FermionOperator<ImplD>,FermionFieldD> LinearOperatorDagD;
  typedef SchurDiagMooeeDagOperator<FermionOperator<ImplF>,FermionFieldF> LinearOperatorDagF;
  typedef MixedPrecisionConjugateGradientOperatorFunction<FermionOperator<ImplD>,
 							  FermionOperator<ImplF>,
 							  LinearOperatorD,
 							  LinearOperatorF> MxPCG;
  typedef MixedPrecisionConjugateGradientOperatorFunction<FermionOperator<ImplD>,
 							  FermionOperator<ImplF>,
 							  LinearOperatorDagD,
 							  LinearOperatorDagF> MxDagPCG;
 public:
  GridBase *FermionGrid(void) { return PeriodicFermOpD.FermionGrid(); };
  GridBase *GaugeGrid(void)   { return PeriodicFermOpD.GaugeGrid(); };
  FermionOperator<ImplD> & DirichletFermOpD;
  FermionOperator<ImplF> & DirichletFermOpF;
  FermionOperator<ImplD> & PeriodicFermOpD; 
  FermionOperator<ImplF> & PeriodicFermOpF; 
  LinearOperatorD DirichletLinOpD;
  LinearOperatorF DirichletLinOpF;
  LinearOperatorD PeriodicLinOpD;
  LinearOperatorF PeriodicLinOpF;
  LinearOperatorDagD DirichletLinOpDagD;
  LinearOperatorDagF DirichletLinOpDagF;
  LinearOperatorDagD PeriodicLinOpDagD;
  LinearOperatorDagF PeriodicLinOpDagF;
  // Can tinker with these in the pseudofermion for force vs. action solves
  Integer maxinnerit;
  Integer maxouterit;
  RealD tol;
  RealD tolinner;
  Coordinate Block;
  DomainDecomposition Domains;
  SchurFactoredFermionOperator(FermionOperator<ImplD>  & _PeriodicFermOpD,
 			       FermionOperator<ImplF>  & _PeriodicFermOpF,
 			       FermionOperator<ImplD>  & _DirichletFermOpD,
 			       FermionOperator<ImplF>  & _DirichletFermOpF,
 			       Coordinate &_Block)
    : Block(_Block), Domains(Block),
      PeriodicFermOpD(_PeriodicFermOpD),
      PeriodicFermOpF(_PeriodicFermOpF),
      DirichletFermOpD(_DirichletFermOpD),
      DirichletFermOpF(_DirichletFermOpF),
      DirichletLinOpD(DirichletFermOpD),
      DirichletLinOpF(DirichletFermOpF),
      PeriodicLinOpD(PeriodicFermOpD),
      PeriodicLinOpF(PeriodicFermOpF),
      DirichletLinOpDagD(DirichletFermOpD),
      DirichletLinOpDagF(DirichletFermOpF),
      PeriodicLinOpDagD(PeriodicFermOpD),
      PeriodicLinOpDagF(PeriodicFermOpF)
  {
    tol=1.0e-10;
    tolinner=1.0e-6;
    maxinnerit=1000;
    maxouterit=10;
    assert(PeriodicFermOpD.FermionGrid() == DirichletFermOpD.FermionGrid());
    assert(PeriodicFermOpF.FermionGrid() == DirichletFermOpF.FermionGrid());
  };
  enum Domain { Omega=0, OmegaBar=1 };
  void ImportGauge(const GaugeField &Umu)
  {
    // Single precision will update in the mixed prec CG
    PeriodicFermOpD.ImportGauge(Umu);
    GaugeField dUmu(Umu.Grid());
    dUmu=Umu;
    //    DirchletBCs(dUmu);
    DirichletFilter<GaugeField> Filter(Block);
    Filter.applyFilter(dUmu);
    DirichletFermOpD.ImportGauge(dUmu);
  }
 /*
  void ProjectBoundaryBothDomains (FermionField &f,int sgn)
  {
    assert((sgn==1)||(sgn==-1));
    Real rsgn = sgn;
    Gamma::Algebra Gmu [] = {
      Gamma::Algebra::GammaX,
      Gamma::Algebra::GammaY,
      Gamma::Algebra::GammaZ,
      Gamma::Algebra::GammaT
    };
    GridBase *grid = f.Grid();
    LatticeInteger  coor(grid);
    LatticeInteger  face(grid);
    LatticeInteger  one(grid); one = 1;
    LatticeInteger  zero(grid); zero = 0;
    LatticeInteger nface(grid); nface=Zero();
    FermionField projected(grid); projected=Zero();
    FermionField sp_proj  (grid);
    int dims = grid->Nd();
    int isDWF= (dims==Nd+1);
    assert((dims==Nd)||(dims==Nd+1));
    Coordinate Global=grid->GlobalDimensions();
    for(int mu=0;mu<Nd;mu++){
      if ( Block[mu] <= Global[mu+isDWF] ) {
 	// need to worry about DWF 5th dim first
 	LatticeCoordinate(coor,mu+isDWF); 
 	face = where(mod(coor,Block[mu]) == Integer(0),one,zero );
 	nface = nface + face;
 	Gamma G(Gmu[mu]);
 	// Lower face receives (1-gamma)/2 in normal forward hopping term
 	sp_proj  = 0.5*(f-G*f*rsgn);
 	projected= where(face,sp_proj,projected);
 	//projected= where(face,f,projected);
 	face = where(mod(coor,Block[mu]) == Integer(Block[mu]-1) ,one,zero );
 	nface = nface + face;
 	// Upper face receives (1+gamma)/2 in normal backward hopping term
 	sp_proj = 0.5*(f+G*f*rsgn);
 	projected= where(face,sp_proj,projected);
 	//projected= where(face,f,projected);
      }
    }
    // Initial Zero() where nface==0.
    // Keep the spin projected faces where nface==1
    // Full spinor where nface>=2
    projected = where(nface>Integer(1),f,projected);
    f=projected;
  }
 */
  void ProjectBoundaryBothDomains (FermionField &f,int sgn)
  {
    assert((sgn==1)||(sgn==-1));
    Real rsgn = sgn;
    Gamma::Algebra Gmu [] = {
      Gamma::Algebra::GammaX,
      Gamma::Algebra::GammaY,
      Gamma::Algebra::GammaZ,
      Gamma::Algebra::GammaT
    };
    GridBase *grid = f.Grid();
    LatticeInteger  coor(grid);
    LatticeInteger  face(grid);
    LatticeInteger  one(grid);   one = 1;
    LatticeInteger  zero(grid); zero = 0;
    LatticeInteger  omega(grid);
    LatticeInteger  omegabar(grid);
    LatticeInteger  tmp(grid);
    omega=one;    Domains.ProjectDomain(omega,0);
    omegabar=one; Domains.ProjectDomain(omegabar,1);
    LatticeInteger nface(grid); nface=Zero();
    FermionField projected(grid); projected=Zero();
    FermionField sp_proj  (grid);
    int dims = grid->Nd();
    int isDWF= (dims==Nd+1);
    assert((dims==Nd)||(dims==Nd+1));
    Coordinate Global=grid->GlobalDimensions();
    for(int mmu=0;mmu<Nd;mmu++){
      Gamma G(Gmu[mmu]);
      // need to worry about DWF 5th dim first
      int mu = mmu+isDWF;
      if ( Block[mmu] && (Block[mmu] <= Global[mu]) ) {
 	// Lower face receives (1-gamma)/2 in normal forward hopping term
 	tmp = Cshift(omegabar,mu,-1);
 	tmp = tmp + omega;
 	face = where(tmp == Integer(2),one,zero );
 	tmp = Cshift(omega,mu,-1);
 	tmp = tmp + omegabar;
 	face = where(tmp == Integer(2),one,face );
 	nface = nface + face;
 	sp_proj  = 0.5*(f-G*f*rsgn);
 	projected= where(face,sp_proj,projected);
 	// Upper face receives (1+gamma)/2 in normal backward hopping term
 	tmp = Cshift(omegabar,mu,1);
 	tmp = tmp + omega;
 	face = where(tmp == Integer(2),one,zero );
 	tmp = Cshift(omega,mu,1);
 	tmp = tmp + omegabar;
 	face = where(tmp == Integer(2),one,face );
 	nface = nface + face;
 	sp_proj = 0.5*(f+G*f*rsgn);
 	projected= where(face,sp_proj,projected);
      }
    }
    // Initial Zero() where nface==0.
    // Keep the spin projected faces where nface==1
    // Full spinor where nface>=2
    projected = where(nface>Integer(1),f,projected);
    f=projected;
  }
  void ProjectDomain(FermionField &f,int domain)
  {
 /*
    GridBase *grid = f.Grid();
    int dims = grid->Nd();
    int isDWF= (dims==Nd+1);
    assert((dims==Nd)||(dims==Nd+1));
    FermionField zz(grid); zz=Zero();
    LatticeInteger coor(grid);
    LatticeInteger domaincb(grid); domaincb=Zero();
    for(int d=0;d<Nd;d++){
      LatticeCoordinate(coor,d+isDWF);
      domaincb = domaincb + div(coor,Block[d]);
    }
    f = where(mod(domaincb,2)==Integer(domain),f,zz);
 */
    Domains.ProjectDomain(f,domain);
  };
  void ProjectOmegaBar   (FermionField &f) {ProjectDomain(f,OmegaBar);}
  void ProjectOmega      (FermionField &f) {ProjectDomain(f,Omega);}
  // See my notes(!).
  // Notation: Following Luscher, we introduce projectors $\hPdb$ with both spinor and space structure
  // projecting all spinor elements in $\Omega$ connected by $\Ddb$ to $\bar{\Omega}$,
  void ProjectBoundaryBar(FermionField &f)
  {
    ProjectBoundaryBothDomains(f,1);
    ProjectOmega(f);
  }
  // and $\hPd$ projecting all spinor elements in $\bar{\Omega}$ connected by $\Dd$ to $\Omega$.
  void ProjectBoundary   (FermionField &f)
  {
    ProjectBoundaryBothDomains(f,1);
    ProjectOmegaBar(f);
    //    DumpSliceNorm("ProjectBoundary",f,f.Grid()->Nd()-1);
  };
  void dBoundary    (FermionField &in,FermionField &out)
  {
    FermionField tmp(in);
    ProjectOmegaBar(tmp);
    PeriodicFermOpD.M(tmp,out);
    ProjectOmega(out);
  };
  void dBoundaryDag (FermionField &in,FermionField &out)
  {
    FermionField tmp(in);
    ProjectOmega(tmp);
    PeriodicFermOpD.Mdag(tmp,out);
    ProjectOmegaBar(out);
  };
  void dBoundaryBar (FermionField &in,FermionField &out)
  {
    FermionField tmp(in);
    ProjectOmega(tmp);
    PeriodicFermOpD.M(tmp,out);
    ProjectOmegaBar(out);
  };
  void dBoundaryBarDag (FermionField &in,FermionField &out)
  {
    FermionField tmp(in);
    ProjectOmegaBar(tmp);
    PeriodicFermOpD.Mdag(tmp,out);
    ProjectOmega(out);
  };
  void dOmega       (FermionField &in,FermionField &out)
  {
    FermionField tmp(in);
    ProjectOmega(tmp);
    DirichletFermOpD.M(tmp,out);
    ProjectOmega(out);
  };
  void dOmegaBar    (FermionField &in,FermionField &out)
  {
    FermionField tmp(in);
    ProjectOmegaBar(tmp);
    DirichletFermOpD.M(tmp,out);
    ProjectOmegaBar(out);
  };
  void dOmegaDag       (FermionField &in,FermionField &out)
  {
    FermionField tmp(in);
    ProjectOmega(tmp);
    DirichletFermOpD.Mdag(tmp,out);
    ProjectOmega(out);
  };
  void dOmegaBarDag    (FermionField &in,FermionField &out)
  {
    FermionField tmp(in);
    ProjectOmegaBar(tmp);
    DirichletFermOpD.Mdag(tmp,out);
    ProjectOmegaBar(out);
  };
  void dOmegaInv   (FermionField &in,FermionField &out)
  {
    FermionField tmp(in);
    ProjectOmega(tmp);
    dOmegaInvAndOmegaBarInv(tmp,out); // Inefficient warning
    ProjectOmega(out);
  };
  void dOmegaBarInv(FermionField &in,FermionField &out)
  {    
    FermionField tmp(in);
    ProjectOmegaBar(tmp);
    dOmegaInvAndOmegaBarInv(tmp,out);
    ProjectOmegaBar(out);
  };
  void dOmegaDagInv   (FermionField &in,FermionField &out)
  {
    FermionField tmp(in);
    ProjectOmega(tmp);
    dOmegaDagInvAndOmegaBarDagInv(tmp,out);
    ProjectOmega(out);
  };
  void dOmegaBarDagInv(FermionField &in,FermionField &out)
  {    
    FermionField tmp(in);
    ProjectOmegaBar(tmp);
    dOmegaDagInvAndOmegaBarDagInv(tmp,out);
    ProjectOmegaBar(out);
  };
  void dOmegaInvAndOmegaBarInv(FermionField &in,FermionField &out)
  {
    MxPCG OmegaSolver(tol,
 		      tolinner,
 		      maxinnerit,
 		      maxouterit,
 		      DirichletFermOpF.FermionRedBlackGrid(),
 		      DirichletFermOpF,
 		      DirichletFermOpD,
 		      DirichletLinOpF,
 		      DirichletLinOpD);
    SchurRedBlackDiagMooeeSolve<FermionField> PrecSolve(OmegaSolver);
    PrecSolve(DirichletFermOpD,in,out);
  };
  void dOmegaDagInvAndOmegaBarDagInv(FermionField &in,FermionField &out)
  {
    MxDagPCG OmegaDagSolver(tol,
 			    tolinner,
 			    maxinnerit,
 			    maxouterit,
 			    DirichletFermOpF.FermionRedBlackGrid(),
 			    DirichletFermOpF,
 			    DirichletFermOpD,
 			    DirichletLinOpDagF,
 			    DirichletLinOpDagD);
    SchurRedBlackDiagMooeeDagSolve<FermionField> PrecSolve(OmegaDagSolver);
    PrecSolve(DirichletFermOpD,in,out);
  };
  // Rdag = Pdbar - DdbarDag DomegabarDagInv  DdDag DomegaDagInv Pdbar 
  void RDag(FermionField &in,FermionField &out)
  {
    FermionField tmp1(PeriodicFermOpD.FermionGrid());
    FermionField tmp2(PeriodicFermOpD.FermionGrid());
    out = in;
    ProjectBoundaryBar(out);
    dOmegaDagInv(out,tmp1);   
    dBoundaryDag(tmp1,tmp2);   
    dOmegaBarDagInv(tmp2,tmp1);
    dBoundaryBarDag(tmp1,tmp2); 
    out = out - tmp2;
  };
  // R = Pdbar - Pdbar DomegaInv Dd DomegabarInv Ddbar
  void R(FermionField &in,FermionField &out)
  {
    FermionField tmp1(PeriodicFermOpD.FermionGrid());
    FermionField tmp2(PeriodicFermOpD.FermionGrid());
    out = in;
    ProjectBoundaryBar(out);
    dBoundaryBar(out,tmp1); 
    dOmegaBarInv(tmp1,tmp2);
    dBoundary(tmp2,tmp1);   
    dOmegaInv(tmp1,tmp2);   
    out = in - tmp2 ;       
    ProjectBoundaryBar(out);
    //    DumpSliceNorm("R",out,out.Grid()->Nd()-1);
  };
  // R = Pdbar - Pdbar Dinv Ddbar 
  void RInv(FermionField &in,FermionField &out)
  {
    FermionField tmp1(PeriodicFermOpD.FermionGrid());
    dBoundaryBar(in,out);
    Dinverse(out,tmp1);  
    out =in -tmp1; 
    ProjectBoundaryBar(out);
  };
  // R = Pdbar - DdbarDag DinvDag Pdbar 
  void RDagInv(FermionField &in,FermionField &out)
  {
    FermionField tmp(PeriodicFermOpD.FermionGrid());
    FermionField Pin(PeriodicFermOpD.FermionGrid());
    Pin = in; ProjectBoundaryBar(Pin);
    DinverseDag(Pin,out);  
    dBoundaryBarDag(out,tmp);
    out =Pin -tmp; 
  };
  // Non-dirichlet inverter using red-black preconditioning
  void Dinverse(FermionField &in,FermionField &out)
  {
    MxPCG DSolver(tol,
 		  tolinner,
 		  maxinnerit,
 		  maxouterit,
 		  PeriodicFermOpF.FermionRedBlackGrid(),
 		  PeriodicFermOpF,
 		  PeriodicFermOpD,
 		  PeriodicLinOpF,
 		  PeriodicLinOpD);
    SchurRedBlackDiagMooeeSolve<FermionField> Solve(DSolver);
    Solve(PeriodicFermOpD,in,out);
  }
  void DinverseDag(FermionField &in,FermionField &out)
  {
    MxDagPCG DdagSolver(tol,
 			tolinner,
 			maxinnerit,
 			maxouterit,
 			PeriodicFermOpF.FermionRedBlackGrid(),
 			PeriodicFermOpF,
 			PeriodicFermOpD,
 			PeriodicLinOpDagF,
 			PeriodicLinOpDagD);
    SchurRedBlackDiagMooeeDagSolve<FermionField> Solve(DdagSolver);
    Solve(PeriodicFermOpD,in,out);
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@ -303,9 +303,11 @@ public:
 		int npoints,
 		int checkerboard,
 		const std::vector<int> &directions,
-		const std::vector<int> &distances,Parameters p)  
+		const std::vector<int> &distances,
-    : CartesianStencil<vobj,cobj,Parameters> (grid,npoints,checkerboard,directions,distances,p) 
+		bool locally_periodic,
-  { 
+		Parameters p)  
    : CartesianStencil<vobj,cobj,Parameters> (grid,npoints,checkerboard,directions,distances,locally_periodic,p)
  {
    ZeroCountersi();
    surface_list.resize(0);
    this->same_node.resize(npoints);
--- a/Grid/qcd/action/fermion/WilsonFermion.h
+++ b/Grid/qcd/action/fermion/WilsonFermion.h
@ -146,8 +146,11 @@ public:
  void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
                    const FermionField &in, FermionField &out, int dag);
  void DhopInternalDirichletComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
 				  const FermionField &in, FermionField &out, int dag);
  void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-                    const FermionField &in, FermionField &out, int dag);
+				   const FermionField &in, FermionField &out, int dag);
  // Constructor
  WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
@ -157,7 +160,10 @@ public:
  // DoubleStore impl dependent
  void ImportGauge(const GaugeField &_Umu);
-
+  DoubledGaugeField &GetDoubledGaugeField(void){ return Umu; };
  DoubledGaugeField &GetDoubledGaugeFieldE(void){ return UmuEven; };
  DoubledGaugeField &GetDoubledGaugeFieldO(void){ return UmuOdd; };
  ///////////////////////////////////////////////////////////////
  // Data members require to support the functionality
  ///////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@ -165,7 +165,14 @@ public:
 			       const FermionField &in, 
 			       FermionField &out,
 			       int dag);
-    
+
  void DhopInternalDirichletComms(StencilImpl & st,
 				  LebesgueOrder &lo,
 				  DoubledGaugeField &U,
 				  const FermionField &in, 
 				  FermionField &out,
 				  int dag);
  // Constructors
  WilsonFermion5D(GaugeField &_Umu,
 		  GridCartesian         &FiveDimGrid,
@ -174,19 +181,11 @@ public:
 		  GridRedBlackCartesian &FourDimRedBlackGrid,
 		  double _M5,const ImplParams &p= ImplParams());
  // Constructors
  /*
    WilsonFermion5D(int simd, 
    GaugeField &_Umu,
    GridCartesian         &FiveDimGrid,
    GridRedBlackCartesian &FiveDimRedBlackGrid,
    GridCartesian         &FourDimGrid,
    double _M5,const ImplParams &p= ImplParams());
  */
  // DoubleStore
  void ImportGauge(const GaugeField &_Umu);
-    
+  DoubledGaugeField &GetDoubledGaugeField(void){ return Umu; };
  DoubledGaugeField &GetDoubledGaugeFieldE(void){ return UmuEven; };
  DoubledGaugeField &GetDoubledGaugeFieldO(void){ return UmuOdd; };
  ///////////////////////////////////////////////////////////////
  // Data members require to support the functionality
  ///////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/WilsonKernels.h
+++ b/Grid/qcd/action/fermion/WilsonKernels.h
@ -39,7 +39,7 @@ NAMESPACE_BEGIN(Grid);
 class WilsonKernelsStatic { 
 public:
  enum { OptGeneric, OptHandUnroll, OptInlineAsm };
-  enum { CommsAndCompute, CommsThenCompute };
+  enum { CommsAndCompute, CommsThenCompute, CommsDirichlet };
  static int Opt;  
  static int Comms;
 };
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@ -112,7 +112,6 @@ void CayleyFermion5D<Impl>::ImportUnphysicalFermion(const FermionField &input4d,
  axpby_ssp_pminus(tmp, 0., tmp, 1., tmp, Ls-1, Ls-1);
  imported5d=tmp;
 }
 template<class Impl>  
 void CayleyFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
 {
@ -127,6 +126,37 @@ void CayleyFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &inpu
  axpby_ssp_pminus(tmp, 0., tmp, 1., tmp, Ls-1, Ls-1);
  Dminus(tmp,imported5d);
 }
 ////////////////////////////////////////////////////
 // Added for fourD pseudofermion det estimation
 ////////////////////////////////////////////////////
 template<class Impl>  
 void CayleyFermion5D<Impl>::ImportFourDimPseudoFermion(const FermionField &input4d,FermionField &imported5d)
 {
  int Ls = this->Ls;
  FermionField tmp(this->FermionGrid());
  conformable(imported5d.Grid(),this->FermionGrid());
  conformable(input4d.Grid()   ,this->GaugeGrid());
  tmp = Zero();
  InsertSlice(input4d, tmp, 0   , 0);
  InsertSlice(input4d, tmp, Ls-1, 0);
  axpby_ssp_pminus(tmp, 0., tmp, 1., tmp, 0, 0);
  axpby_ssp_pplus (tmp, 0., tmp, 1., tmp, Ls-1, Ls-1);
  imported5d=tmp;
 }
 template<class Impl>  
 void CayleyFermion5D<Impl>::ExportFourDimPseudoFermion(const FermionField &solution5d,FermionField &exported4d)
 {
  int Ls = this->Ls;
  FermionField tmp(this->FermionGrid());
  tmp = solution5d;
  conformable(solution5d.Grid(),this->FermionGrid());
  conformable(exported4d.Grid(),this->GaugeGrid());
  axpby_ssp_pminus(tmp, 0., solution5d, 1., solution5d, 0, 0);
  axpby_ssp_pplus (tmp, 1., tmp       , 1., solution5d, 0, Ls-1);
  ExtractSlice(exported4d, tmp, 0, 0);
 }
 // Dminus
 template<class Impl>  
 void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
 {
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@ -51,9 +51,9 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
  _FourDimGrid        (&FourDimGrid),
  _FourDimRedBlackGrid(&FourDimRedBlackGrid),
-  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements,p),
+  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements,p.locally_periodic,p),
-  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements,p), // source is Even
+  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements,p.locally_periodic,p), // source is Even
-  StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements,p), // source is Odd
+  StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements,p.locally_periodic,p), // source is Odd
  M5(_M5),
  Umu(_FourDimGrid),
  UmuEven(_FourDimRedBlackGrid),
@ -361,10 +361,21 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
                                         const FermionField &in, FermionField &out,int dag)
 {
  DhopTotalTime-=usecond();
-  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
+
  assert(  (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
 	 ||(WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
         ||(WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsDirichlet) );
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) {
    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
-  else 
+  }
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute ) {
    DhopInternalSerialComms(st,lo,U,in,out,dag);
  }
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsDirichlet ) {
    DhopInternalDirichletComms(st,lo,U,in,out,dag);
  }
  DhopTotalTime+=usecond();
 }
@ -431,6 +442,30 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
  DhopComputeTime2+=usecond();
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalDirichletComms(StencilImpl & st, LebesgueOrder &lo,
 						       DoubledGaugeField & U,
 						       const FermionField &in, FermionField &out,int dag)
 {
  Compressor compressor(dag);
  int LLs = in.Grid()->_rdimensions[0];
  int len =  U.Grid()->oSites();
  /////////////////////////////
  // do the compute interior
  /////////////////////////////
  int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
  DhopComputeTime-=usecond();
  if (dag == DaggerYes) {
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
  } else {
    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
  }
  accelerator_barrier();
  DhopComputeTime+=usecond();
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@ -47,9 +47,9 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
    Kernels(p),
    _grid(&Fgrid),
    _cbgrid(&Hgrid),
-    Stencil(&Fgrid, npoint, Even, directions, displacements,p),
+    Stencil(&Fgrid, npoint, Even, directions, displacements,p.locally_periodic,p),
-    StencilEven(&Hgrid, npoint, Even, directions,displacements,p),  // source is Even
+    StencilEven(&Hgrid, npoint, Even, directions,displacements,p.locally_periodic,p),  // source is Even
-    StencilOdd(&Hgrid, npoint, Odd, directions,displacements,p),  // source is Odd
+    StencilOdd(&Hgrid, npoint, Odd, directions,displacements,p.locally_periodic,p),  // source is Odd
    mass(_mass),
    Lebesgue(_grid),
    LebesgueEvenOdd(_cbgrid),
@ -488,12 +488,21 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
                                       FermionField &out, int dag)
 {
  DhopTotalTime-=usecond();
-#ifdef GRID_OMP
+
-  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
+  assert(  (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
 	 ||(WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
         ||(WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsDirichlet) );
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) {
    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
-  else
+  }
-#endif
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute ) {
    DhopInternalSerial(st,lo,U,in,out,dag);
  }
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsDirichlet ) {
    DhopInternalDirichletComms(st,lo,U,in,out,dag);
  }
  DhopTotalTime+=usecond();
 }
@ -562,6 +571,29 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
  DhopComputeTime2+=usecond();
 };
 template <class Impl>
 void WilsonFermion<Impl>::DhopInternalDirichletComms(StencilImpl &st, LebesgueOrder &lo,
 						     DoubledGaugeField &U,
 						     const FermionField &in,
 						     FermionField &out, int dag)
 {
  assert((dag == DaggerNo) || (dag == DaggerYes));
  Compressor compressor(dag);
  int len =  U.Grid()->oSites();
  /////////////////////////////
  // do the compute interior
  /////////////////////////////
  int Opt = WilsonKernelsStatic::Opt;
  DhopComputeTime-=usecond();
  if (dag == DaggerYes) {
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
  } else {
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
  }
  DhopComputeTime+=usecond();
 };
 template <class Impl>
 void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
--- a/Grid/qcd/action/gauge/GaugeImplTypes.h
+++ b/Grid/qcd/action/gauge/GaugeImplTypes.h
@ -61,7 +61,7 @@ NAMESPACE_BEGIN(Grid);
  typedef typename Impl::Field Field;
 // hardcodes the exponential approximation in the template
-template <class S, int Nrepresentation = Nc, int Nexp = 12 > class GaugeImplTypes {
+template <class S, int Nrepresentation = Nc, int Nexp = 20 > class GaugeImplTypes {
 public:
  typedef S Simd;
  typedef typename Simd::scalar_type scalar_type;
--- a/Grid/qcd/action/pseudofermion/Bounds.h
+++ b/Grid/qcd/action/pseudofermion/Bounds.h
@ -40,13 +40,66 @@ NAMESPACE_BEGIN(Grid);
      X=X-Y;
      RealD Nd = norm2(X);
      std::cout << "************************* "<<std::endl;
-      std::cout << " noise                         = "<<Nx<<std::endl;
+      std::cout << " | noise |^2                         = "<<Nx<<std::endl;
-      std::cout << " (MdagM^-1/2)^2  noise         = "<<Nz<<std::endl;
+      std::cout << " | (MdagM^-1/2)^2  noise |^2         = "<<Nz<<std::endl;
-      std::cout << " MdagM (MdagM^-1/2)^2  noise   = "<<Ny<<std::endl;
+      std::cout << " | MdagM (MdagM^-1/2)^2  noise |^2   = "<<Ny<<std::endl;
-      std::cout << " noise - MdagM (MdagM^-1/2)^2  noise   = "<<Nd<<std::endl;
+      std::cout << " | noise - MdagM (MdagM^-1/2)^2  noise |^2  = "<<Nd<<std::endl;
      std::cout << " | noise - MdagM (MdagM^-1/2)^2  noise|/|noise| = " << std::sqrt(Nd/Nx) << std::endl;
      std::cout << "************************* "<<std::endl;
      assert( (std::sqrt(Nd/Nx)<tol) && " InverseSqrtBoundsCheck ");
    }
    /* For a HermOp = M^dag M, check the approximation of  HermOp^{-1/inv_pow}
       by computing   |X -    HermOp * [ Hermop^{-1/inv_pow} ]^{inv_pow} X|  < tol  
       for noise X (aka GaussNoise).
       ApproxNegPow should be the rational approximation for   X^{-1/inv_pow}
    */
    template<class Field> void InversePowerBoundsCheck(int inv_pow,
 						       int MaxIter,double tol,
 						       LinearOperatorBase<Field> &HermOp,
 						       Field &GaussNoise,
 						       MultiShiftFunction &ApproxNegPow) 
    {
      GridBase *FermionGrid = GaussNoise.Grid();
      Field X(FermionGrid);
      Field Y(FermionGrid);
      Field Z(FermionGrid);
      Field tmp1(FermionGrid), tmp2(FermionGrid);
      X=GaussNoise;
      RealD Nx = norm2(X);
      ConjugateGradientMultiShift<Field> msCG(MaxIter,ApproxNegPow);
      tmp1 = X;
      Field* in = &tmp1;
      Field* out = &tmp2;
      for(int i=0;i<inv_pow;i++){ //apply  [ Hermop^{-1/inv_pow}  ]^{inv_pow} X =   HermOp^{-1} X
 	msCG(HermOp, *in, *out); //backwards conventions!
 	if(i!=inv_pow-1) std::swap(in, out);
      }
      Z = *out;
      RealD Nz = norm2(Z);
      HermOp.HermOp(Z,Y);
      RealD Ny = norm2(Y);
      X=X-Y;
      RealD Nd = norm2(X);
      std::cout << "************************* "<<std::endl;
      std::cout << " | noise |^2                         = "<<Nx<<std::endl;
      std::cout << " | (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2        = "<<Nz<<std::endl;
      std::cout << " | MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2   = "<<Ny<<std::endl;
      std::cout << " | noise - MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2  = "<<Nd<<std::endl;
      std::cout << " | noise - MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |/| noise |  = "<<std::sqrt(Nd/Nx)<<std::endl;
      std::cout << "************************* "<<std::endl;
      assert( (std::sqrt(Nd/Nx)<tol) && " InversePowerBoundsCheck ");
    }
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion.h
+++ b/Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion.h
@ -0,0 +1,163 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/DomainDecomposedTwoFlavourBoundaryBoson.h
    Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////
 // Two flavour ratio
 ///////////////////////////////////////
 template<class ImplD,class ImplF>
 class DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion : public Action<typename ImplD::GaugeField> {
 public:
  INHERIT_IMPL_TYPES(ImplD);
 private:
  SchurFactoredFermionOperator<ImplD,ImplF> & NumOp;// the basic operator
  RealD InnerStoppingCondition;
  RealD ActionStoppingCondition;
  RealD DerivativeStoppingCondition;
  FermionField Phi; // the pseudo fermion field for this trajectory
 public:
  DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion(SchurFactoredFermionOperator<ImplD,ImplF>  &_NumOp,RealD _DerivativeTol, RealD _ActionTol, RealD _InnerTol=1.0e-6)
    : NumOp(_NumOp), 
      DerivativeStoppingCondition(_DerivativeTol),
      ActionStoppingCondition(_ActionTol),
      InnerStoppingCondition(_InnerTol),
      Phi(_NumOp.FermionGrid()) {};
  virtual std::string action_name(){return "DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion";}
  virtual std::string LogParameters(){
    std::stringstream sstream;
    return sstream.str();
  }  
  virtual void refresh(const GaugeField &U, GridSerialRNG& sRNG, GridParallelRNG& pRNG)
  {
    // P(phi) = e^{- phi^dag P^dag P phi}
    //
    // NumOp == P
    //
    // Take phi = P^{-1} eta  ; eta = P Phi
    //
    // P(eta) = e^{- eta^dag eta}
    //
    // e^{x^2/2 sig^2} => sig^2 = 0.5.
    // 
    // So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707....
    //
    RealD scale = std::sqrt(0.5);
    NumOp.tolinner=InnerStoppingCondition;
    NumOp.tol=ActionStoppingCondition;
    NumOp.ImportGauge(U);
    FermionField eta(NumOp.FermionGrid());
    gaussian(pRNG,eta);    eta=eta*scale;
    NumOp.ProjectBoundaryBar(eta);
    //DumpSliceNorm("eta",eta);
    NumOp.RInv(eta,Phi);
    //DumpSliceNorm("Phi",Phi);
  };
  //////////////////////////////////////////////////////
  // S = phi^dag Pdag P phi
  //////////////////////////////////////////////////////
  virtual RealD S(const GaugeField &U) {
    NumOp.tolinner=InnerStoppingCondition;
    NumOp.tol=ActionStoppingCondition;
    NumOp.ImportGauge(U);
    FermionField Y(NumOp.FermionGrid());
    NumOp.R(Phi,Y);
    RealD action = norm2(Y);
    return action;
  };
  virtual void deriv(const GaugeField &U,GaugeField & dSdU)
  {
    NumOp.tolinner=InnerStoppingCondition;
    NumOp.tol=DerivativeStoppingCondition;
    NumOp.ImportGauge(U);
    GridBase *fgrid = NumOp.FermionGrid();
    GridBase *ugrid = NumOp.GaugeGrid();
    FermionField  X(fgrid);
    FermionField  Y(fgrid);
    FermionField  tmp(fgrid);
    GaugeField   force(ugrid);	
    FermionField DobiDdbPhi(fgrid);      // Vector A in my notes
    FermionField DoiDdDobiDdbPhi(fgrid); // Vector B in my notes
    FermionField DoidP_Phi(fgrid);    // Vector E in my notes
    FermionField DobidDddDoidP_Phi(fgrid);    // Vector F in my notes
    FermionField P_Phi(fgrid);
    // P term
    NumOp.dBoundaryBar(Phi,tmp);
    NumOp.dOmegaBarInv(tmp,DobiDdbPhi);        // Vector A
    NumOp.dBoundary(DobiDdbPhi,tmp);
    NumOp.dOmegaInv(tmp,DoiDdDobiDdbPhi);      // Vector B
    P_Phi  = Phi - DoiDdDobiDdbPhi;
    NumOp.ProjectBoundaryBar(P_Phi);
    // P^dag P term
    NumOp.dOmegaDagInv(P_Phi,DoidP_Phi); // Vector E
    NumOp.dBoundaryDag(DoidP_Phi,tmp);
    NumOp.dOmegaBarDagInv(tmp,DobidDddDoidP_Phi);   // Vector F
    NumOp.dBoundaryBarDag(DobidDddDoidP_Phi,tmp);
    X = DobiDdbPhi;
    Y = DobidDddDoidP_Phi;
    NumOp.DirichletFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=force;
    NumOp.DirichletFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
    X = DoiDdDobiDdbPhi;
    Y = DoidP_Phi;
    NumOp.DirichletFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=dSdU+force;
    NumOp.DirichletFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
    dSdU *= -1.0;
  };
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourPseudoFermion.h
+++ b/Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourPseudoFermion.h
@ -0,0 +1,158 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/DomainDecomposedTwoFlavourBoundary.h
    Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////
 // Two flavour ratio
 ///////////////////////////////////////
 template<class ImplD,class ImplF>
 class DomainDecomposedBoundaryTwoFlavourPseudoFermion : public Action<typename ImplD::GaugeField> {
 public:
  INHERIT_IMPL_TYPES(ImplD);
 private:
  SchurFactoredFermionOperator<ImplD,ImplF> & DenOp;// the basic operator
  RealD ActionStoppingCondition;
  RealD DerivativeStoppingCondition;
  RealD InnerStoppingCondition;
  FermionField Phi; // the pseudo fermion field for this trajectory
  RealD refresh_action;
 public:
  DomainDecomposedBoundaryTwoFlavourPseudoFermion(SchurFactoredFermionOperator<ImplD,ImplF>  &_DenOp,RealD _DerivativeTol, RealD _ActionTol, RealD _InnerTol = 1.0e-6 )
    : DenOp(_DenOp),
      DerivativeStoppingCondition(_DerivativeTol),
      ActionStoppingCondition(_ActionTol),
      InnerStoppingCondition(_InnerTol),
      Phi(_DenOp.FermionGrid()) {};
  virtual std::string action_name(){return "DomainDecomposedBoundaryTwoFlavourPseudoFermion";}
  virtual std::string LogParameters(){
    std::stringstream sstream;
    return sstream.str();
  }  
  virtual void refresh(const GaugeField &U, GridSerialRNG& sRNG, GridParallelRNG& pRNG)
  {
    // P(phi) = e^{- phi^dag Rdag^-1 R^-1 phi}
    //
    // DenOp == R
    //
    // Take phi = R eta  ; eta = R^-1 Phi
    //
    // P(eta) = e^{- eta^dag eta}
    //
    // e^{x^2/2 sig^2} => sig^2 = 0.5.
    // 
    // So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707....
    //
    RealD scale = std::sqrt(0.5);
    DenOp.tolinner=InnerStoppingCondition;
    DenOp.tol     =ActionStoppingCondition;
    DenOp.ImportGauge(U);
    FermionField eta(DenOp.FermionGrid());
    gaussian(pRNG,eta);    eta=eta*scale;
    DenOp.ProjectBoundaryBar(eta);
    DenOp.R(eta,Phi);
    //DumpSliceNorm("Phi",Phi);
    refresh_action = norm2(eta);
  };
  //////////////////////////////////////////////////////
  // S = phi^dag Rdag^-1 R^-1 phi
  //////////////////////////////////////////////////////
  virtual RealD S(const GaugeField &U) {
    DenOp.tolinner=InnerStoppingCondition;
    DenOp.tol=ActionStoppingCondition;
    DenOp.ImportGauge(U);
    FermionField X(DenOp.FermionGrid());
    DenOp.RInv(Phi,X);
    RealD action = norm2(X);
    return action;
  };
  virtual void deriv(const GaugeField &U,GaugeField & dSdU)
  {
    DenOp.tolinner=InnerStoppingCondition;
    DenOp.tol=DerivativeStoppingCondition;
    DenOp.ImportGauge(U);
    GridBase *fgrid = DenOp.FermionGrid();
    GridBase *ugrid = DenOp.GaugeGrid();
    FermionField  X(fgrid);
    FermionField  Y(fgrid);
    FermionField  tmp(fgrid);
    GaugeField   force(ugrid);	
    FermionField DiDdb_Phi(fgrid);      // Vector C in my notes
    FermionField DidRinv_Phi(fgrid);    // Vector D in my notes
    FermionField Rinv_Phi(fgrid);
 //   FermionField RinvDagRinv_Phi(fgrid);
 //   FermionField DdbdDidRinv_Phi(fgrid);
    // R^-1 term
    DenOp.dBoundaryBar(Phi,tmp);
    DenOp.Dinverse(tmp,DiDdb_Phi);            // Vector C
    Rinv_Phi = Phi - DiDdb_Phi;
    DenOp.ProjectBoundaryBar(Rinv_Phi); 
    // R^-dagger R^-1 term
    DenOp.DinverseDag(Rinv_Phi,DidRinv_Phi); // Vector D
 /*
    DenOp.dBoundaryBarDag(DidRinv_Phi,DdbdDidRinv_Phi);
    RinvDagRinv_Phi = Rinv_Phi - DdbdDidRinv_Phi;
    DenOp.ProjectBoundaryBar(RinvDagRinv_Phi);
 */
    X = DiDdb_Phi;
    Y = DidRinv_Phi;
    DenOp.PeriodicFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=force;
    DenOp.PeriodicFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
    DumpSliceNorm("force",dSdU);
    dSdU *= -1.0;
  };
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion.h
+++ b/Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion.h
@ -0,0 +1,237 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/DomainDecomposedTwoFlavourBoundary.h
    Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////
 // Two flavour ratio
 ///////////////////////////////////////
 template<class ImplD,class ImplF>
 class DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion : public Action<typename ImplD::GaugeField> {
 public:
  INHERIT_IMPL_TYPES(ImplD);
 private:
  SchurFactoredFermionOperator<ImplD,ImplF> & NumOp;// the basic operator
  SchurFactoredFermionOperator<ImplD,ImplF> & DenOp;// the basic operator
  RealD InnerStoppingCondition;
  RealD ActionStoppingCondition;
  RealD DerivativeStoppingCondition;
  FermionField Phi; // the pseudo fermion field for this trajectory
 public:
  DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion(SchurFactoredFermionOperator<ImplD,ImplF>  &_NumOp, 
 						       SchurFactoredFermionOperator<ImplD,ImplF>  &_DenOp,
 						       RealD _DerivativeTol, RealD _ActionTol, RealD _InnerTol=1.0e-6)
    : NumOp(_NumOp), DenOp(_DenOp),
      Phi(_NumOp.PeriodicFermOpD.FermionGrid()),
      InnerStoppingCondition(_InnerTol),
      DerivativeStoppingCondition(_DerivativeTol),
      ActionStoppingCondition(_ActionTol)
  {};
  virtual std::string action_name(){return "DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion";}
  virtual std::string LogParameters(){
    std::stringstream sstream;
    return sstream.str();
  }  
  virtual void refresh(const GaugeField &U, GridSerialRNG& sRNG, GridParallelRNG& pRNG)
  {
    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);
    FermionField eta(NumOp.PeriodicFermOpD.FermionGrid());
    FermionField tmp(NumOp.PeriodicFermOpD.FermionGrid());
    // P(phi) = e^{- phi^dag P^dag Rdag^-1 R^-1 P phi}
    //
    // NumOp == P
    // DenOp == R
    //
    // Take phi = P^{-1} R eta  ; eta = R^-1 P Phi
    //
    // P(eta) = e^{- eta^dag eta}
    //
    // e^{x^2/2 sig^2} => sig^2 = 0.5.
    // 
    // So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707....
    //
    RealD scale = std::sqrt(0.5);
    gaussian(pRNG,eta);    eta=eta*scale;
    NumOp.ProjectBoundaryBar(eta);
    NumOp.tolinner=InnerStoppingCondition;
    DenOp.tolinner=InnerStoppingCondition;
    DenOp.tol = ActionStoppingCondition;
    NumOp.tol = ActionStoppingCondition;
    DenOp.R(eta,tmp);
    NumOp.RInv(tmp,Phi);
    DumpSliceNorm("Phi",Phi);
  };
  //////////////////////////////////////////////////////
  // S = phi^dag Pdag Rdag^-1 R^-1 P phi
  //////////////////////////////////////////////////////
  virtual RealD S(const GaugeField &U) {
    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);
    FermionField X(NumOp.PeriodicFermOpD.FermionGrid());
    FermionField Y(NumOp.PeriodicFermOpD.FermionGrid());
    NumOp.tolinner=InnerStoppingCondition;
    DenOp.tolinner=InnerStoppingCondition;
    DenOp.tol = ActionStoppingCondition;
    NumOp.tol = ActionStoppingCondition;
    NumOp.R(Phi,Y);
    DenOp.RInv(Y,X);
    RealD action = norm2(X);
    //    std::cout << " DD boundary action is " <<action<<std::endl;
    return action;
  };
  virtual void deriv(const GaugeField &U,GaugeField & dSdU)
  {
    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);
    GridBase *fgrid = NumOp.PeriodicFermOpD.FermionGrid();
    GridBase *ugrid = NumOp.PeriodicFermOpD.GaugeGrid();
    FermionField  X(fgrid);
    FermionField  Y(fgrid);
    FermionField  tmp(fgrid);
    GaugeField   force(ugrid);	
    FermionField DobiDdbPhi(fgrid);      // Vector A in my notes
    FermionField DoiDdDobiDdbPhi(fgrid); // Vector B in my notes
    FermionField DiDdbP_Phi(fgrid);      // Vector C in my notes
    FermionField DidRinvP_Phi(fgrid);    // Vector D in my notes
    FermionField DdbdDidRinvP_Phi(fgrid);
    FermionField DoidRinvDagRinvP_Phi(fgrid);    // Vector E in my notes
    FermionField DobidDddDoidRinvDagRinvP_Phi(fgrid);    // Vector F in my notes
    FermionField P_Phi(fgrid);
    FermionField RinvP_Phi(fgrid);
    FermionField RinvDagRinvP_Phi(fgrid);
    FermionField PdagRinvDagRinvP_Phi(fgrid);
    //    RealD action = S(U);
    NumOp.tolinner=InnerStoppingCondition;
    DenOp.tolinner=InnerStoppingCondition;
    DenOp.tol = DerivativeStoppingCondition;
    NumOp.tol = DerivativeStoppingCondition;
    // P term
    NumOp.dBoundaryBar(Phi,tmp);
    NumOp.dOmegaBarInv(tmp,DobiDdbPhi);        // Vector A
    NumOp.dBoundary(DobiDdbPhi,tmp);
    NumOp.dOmegaInv(tmp,DoiDdDobiDdbPhi);      // Vector B
    P_Phi  = Phi - DoiDdDobiDdbPhi;
    NumOp.ProjectBoundaryBar(P_Phi);
    // R^-1 P term
    DenOp.dBoundaryBar(P_Phi,tmp);
    DenOp.Dinverse(tmp,DiDdbP_Phi);            // Vector C
    RinvP_Phi = P_Phi - DiDdbP_Phi;
    DenOp.ProjectBoundaryBar(RinvP_Phi); // Correct to here
    // R^-dagger R^-1 P term
    DenOp.DinverseDag(RinvP_Phi,DidRinvP_Phi); // Vector D
    DenOp.dBoundaryBarDag(DidRinvP_Phi,DdbdDidRinvP_Phi);
    RinvDagRinvP_Phi = RinvP_Phi - DdbdDidRinvP_Phi;
    DenOp.ProjectBoundaryBar(RinvDagRinvP_Phi);
    // P^dag R^-dagger R^-1 P term
    NumOp.dOmegaDagInv(RinvDagRinvP_Phi,DoidRinvDagRinvP_Phi); // Vector E
    NumOp.dBoundaryDag(DoidRinvDagRinvP_Phi,tmp);
    NumOp.dOmegaBarDagInv(tmp,DobidDddDoidRinvDagRinvP_Phi);   // Vector F
    NumOp.dBoundaryBarDag(DobidDddDoidRinvDagRinvP_Phi,tmp);
    PdagRinvDagRinvP_Phi = RinvDagRinvP_Phi- tmp;
    NumOp.ProjectBoundaryBar(PdagRinvDagRinvP_Phi);
    /*
    std::cout << "S eval  "<< action << std::endl;
    std::cout << "S - IP1 "<< innerProduct(Phi,PdagRinvDagRinvP_Phi) << std::endl;
    std::cout << "S - IP2 "<< norm2(RinvP_Phi) << std::endl;
    NumOp.R(Phi,tmp);
    tmp = tmp - P_Phi;
    std::cout << "diff1 "<<norm2(tmp) <<std::endl;
    DenOp.RInv(P_Phi,tmp);
    tmp = tmp - RinvP_Phi;
    std::cout << "diff2 "<<norm2(tmp) <<std::endl;
    DenOp.RDagInv(RinvP_Phi,tmp);
    tmp  = tmp - RinvDagRinvP_Phi;
    std::cout << "diff3 "<<norm2(tmp) <<std::endl;
    DenOp.RDag(RinvDagRinvP_Phi,tmp);
    tmp  = tmp - PdagRinvDagRinvP_Phi;
    std::cout << "diff4 "<<norm2(tmp) <<std::endl;
    */
    dSdU=Zero();
    X = DobiDdbPhi;
    Y = DobidDddDoidRinvDagRinvP_Phi;
    NumOp.DirichletFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=dSdU+force;
    NumOp.DirichletFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
    X = DoiDdDobiDdbPhi;
    Y = DoidRinvDagRinvP_Phi;
    NumOp.DirichletFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=dSdU+force;
    NumOp.DirichletFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
    X = DiDdbP_Phi;
    Y = DidRinvP_Phi;
    DenOp.PeriodicFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=dSdU+force;
    DenOp.PeriodicFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
    dSdU *= -1.0;
  };
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
@ -0,0 +1,372 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
    Copyright (C) 2015
    Author: Christopher Kelly <ckelly@bnl.gov>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_H
 #define QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_H
 NAMESPACE_BEGIN(Grid);
    /////////////////////////////////////////////////////////
    // Generic rational approximation for ratios of operators
    /////////////////////////////////////////////////////////
    /* S_f = -log( det(  [M^dag M]/[V^dag V] )^{1/inv_pow}  )
           = chi^dag ( [M^dag M]/[V^dag V] )^{-1/inv_pow} chi\
 	   = chi^dag ( [V^dag V]^{-1/2} [M^dag M] [V^dag V]^{-1/2} )^{-1/inv_pow} chi\
 	   = chi^dag [V^dag V]^{1/(2*inv_pow)} [M^dag M]^{-1/inv_pow} [V^dag V]^{1/(2*inv_pow)} chi\
 	   S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
       BIG WARNING:	   
       Here V^dag V is referred to in this code as the "numerator" operator and M^dag M is the *denominator* operator.
       this refers to their position in the pseudofermion action, which is the *inverse* of what appears in the determinant
       Thus for DWF the numerator operator is the Pauli-Villars operator
       Here P/Q \sim R_{1/(2*inv_pow)}  ~ (V^dagV)^{1/(2*inv_pow)}  
       Here N/D \sim R_{-1/inv_pow} ~ (M^dagM)^{-1/inv_pow}  
    */
    template<class Impl>
    class GeneralEvenOddRatioRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
    public:
      INHERIT_IMPL_TYPES(Impl);
      typedef RationalActionParams Params;
      Params param;
      //For action evaluation
      MultiShiftFunction ApproxPowerAction   ;  //rational approx for X^{1/inv_pow}
      MultiShiftFunction ApproxNegPowerAction;  //rational approx for X^{-1/inv_pow}
      MultiShiftFunction ApproxHalfPowerAction;   //rational approx for X^{1/(2*inv_pow)}
      MultiShiftFunction ApproxNegHalfPowerAction; //rational approx for X^{-1/(2*inv_pow)}
      //For the MD integration
      MultiShiftFunction ApproxPowerMD   ;  //rational approx for X^{1/inv_pow}
      MultiShiftFunction ApproxNegPowerMD;  //rational approx for X^{-1/inv_pow}
      MultiShiftFunction ApproxHalfPowerMD;   //rational approx for X^{1/(2*inv_pow)}
      MultiShiftFunction ApproxNegHalfPowerMD; //rational approx for X^{-1/(2*inv_pow)}
    private:
      FermionOperator<Impl> & NumOp;// the basic operator
      FermionOperator<Impl> & DenOp;// the basic operator
      FermionField PhiEven; // the pseudo fermion field for this trajectory
      FermionField PhiOdd; // the pseudo fermion field for this trajectory
      //Generate the approximation to x^{1/inv_pow} (->approx)   and x^{-1/inv_pow} (-> approx_inv)  by an approx_degree degree rational approximation
      //CG_tolerance is used to issue a warning if the approximation error is larger than the tolerance of the CG and is otherwise just stored in the MultiShiftFunction for use by the multi-shift
      static void generateApprox(MultiShiftFunction &approx, MultiShiftFunction &approx_inv, int inv_pow, int approx_degree, double CG_tolerance, AlgRemez &remez){
 	std::cout<<GridLogMessage << "Generating degree "<< approx_degree<<" approximation for x^(1/" << inv_pow << ")"<<std::endl;
 	double error = remez.generateApprox(approx_degree,1,inv_pow);	
 	if(error > CG_tolerance)
 	  std::cout<<GridLogMessage << "WARNING: Remez approximation has a larger error " << error << " than the CG tolerance " << CG_tolerance << "! Try increasing the number of poles" << std::endl;
 	approx.Init(remez, CG_tolerance,false);
 	approx_inv.Init(remez, CG_tolerance,true);
      }
    protected:
      static constexpr bool Numerator = true;
      static constexpr bool Denominator = false;
      //Allow derived classes to override the multishift CG
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionField &in, FermionField &out){
 	SchurDifferentiableOperator<Impl> schurOp(numerator ? NumOp : DenOp);
 	ConjugateGradientMultiShift<FermionField> msCG(MaxIter, approx);
 	msCG(schurOp,in, out);
      }
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionField &in, std::vector<FermionField> &out_elems, FermionField &out){
 	SchurDifferentiableOperator<Impl> schurOp(numerator ? NumOp : DenOp);
 	ConjugateGradientMultiShift<FermionField> msCG(MaxIter, approx);
 	msCG(schurOp,in, out_elems, out);
      }
      //Allow derived classes to override the gauge import
      virtual void ImportGauge(const GaugeField &U){
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
      }
    public:
      GeneralEvenOddRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
 						     FermionOperator<Impl>  &_DenOp, 
 						     const Params & p
 						     ) : 
 	NumOp(_NumOp), 
 	DenOp(_DenOp), 
 	PhiOdd (_NumOp.FermionRedBlackGrid()),
 	PhiEven(_NumOp.FermionRedBlackGrid()),
 	param(p) 
      {
 	std::cout<<GridLogMessage << action_name() << " initialize: starting" << std::endl;
 	AlgRemez remez(param.lo,param.hi,param.precision);
 	//Generate approximations for action eval
 	generateApprox(ApproxPowerAction, ApproxNegPowerAction, param.inv_pow, param.action_degree, param.action_tolerance, remez);
 	generateApprox(ApproxHalfPowerAction, ApproxNegHalfPowerAction, 2*param.inv_pow, param.action_degree, param.action_tolerance, remez);
 	//Generate approximations for MD
 	if(param.md_degree != param.action_degree){ //note the CG tolerance is unrelated to the stopping condition of the Remez algorithm
 	  generateApprox(ApproxPowerMD, ApproxNegPowerMD, param.inv_pow, param.md_degree, param.md_tolerance, remez);
 	  generateApprox(ApproxHalfPowerMD, ApproxNegHalfPowerMD, 2*param.inv_pow, param.md_degree, param.md_tolerance, remez);
 	}else{
 	  std::cout<<GridLogMessage << "Using same rational approximations for MD as for action evaluation" << std::endl;
 	  ApproxPowerMD = ApproxPowerAction; 
 	  ApproxNegPowerMD = ApproxNegPowerAction;
 	  for(int i=0;i<ApproxPowerMD.tolerances.size();i++)
 	    ApproxNegPowerMD.tolerances[i] = ApproxPowerMD.tolerances[i] = param.md_tolerance; //used for multishift
 	  ApproxHalfPowerMD = ApproxHalfPowerAction;
 	  ApproxNegHalfPowerMD = ApproxNegHalfPowerAction;
 	  for(int i=0;i<ApproxPowerMD.tolerances.size();i++)
 	    ApproxNegHalfPowerMD.tolerances[i] = ApproxHalfPowerMD.tolerances[i] = param.md_tolerance;
 	}
 	std::cout<<GridLogMessage << action_name() << " initialize: complete" << std::endl;
      };
      virtual std::string action_name(){return "GeneralEvenOddRatioRationalPseudoFermionAction";}
      virtual std::string LogParameters(){
 	std::stringstream sstream;
 	sstream << GridLogMessage << "["<<action_name()<<"] Power              : 1/" << param.inv_pow <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Low                :" << param.lo <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] High               :" << param.hi <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Max iterations     :" << param.MaxIter <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Tolerance (Action) :" << param.action_tolerance <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Degree (Action)    :" << param.action_degree <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Tolerance (MD)     :" << param.md_tolerance <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Degree (MD)        :" << param.md_degree <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Precision          :" << param.precision <<  std::endl;
 	return sstream.str();
      }
      //Access the fermion field
      const FermionField &getPhiOdd() const{ return PhiOdd; }
      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
 	std::cout<<GridLogMessage << action_name() << " refresh: starting" << std::endl;
 	FermionField eta(NumOp.FermionGrid());	
 	// P(eta) \propto e^{- eta^dag eta}
 	//	
 	// The gaussian function draws from  P(x) \propto e^{- x^2 / 2 }    [i.e. sigma=1]
 	// Thus eta = x/sqrt{2} = x * sqrt(1/2)
 	RealD scale = std::sqrt(0.5);
 	gaussian(pRNG,eta);	eta=eta*scale;
 	refresh(U,eta);
      }
      //Allow for manual specification of random field for testing
      void refresh(const GaugeField &U, const FermionField &eta) {
 	// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
 	//
 	// P(phi) = e^{- phi^dag (VdagV)^1/(2*inv_pow) (MdagM)^-1/inv_pow (VdagV)^1/(2*inv_pow) phi}
 	//        = e^{- phi^dag  (VdagV)^1/(2*inv_pow) (MdagM)^-1/(2*inv_pow) (MdagM)^-1/(2*inv_pow)  (VdagV)^1/(2*inv_pow) phi}
 	//
 	// Phi =  (VdagV)^-1/(2*inv_pow) Mdag^{1/(2*inv_pow)} eta 
 	std::cout<<GridLogMessage << action_name() << " refresh: starting" << std::endl;
 	FermionField etaOdd (NumOp.FermionRedBlackGrid());
 	FermionField etaEven(NumOp.FermionRedBlackGrid());
 	FermionField     tmp(NumOp.FermionRedBlackGrid());
 	pickCheckerboard(Even,etaEven,eta);
 	pickCheckerboard(Odd,etaOdd,eta);
 	ImportGauge(U);
 	// MdagM^1/(2*inv_pow) eta
 	std::cout<<GridLogMessage << action_name() << " refresh: doing (M^dag M)^{1/" << 2*param.inv_pow << "} eta" << std::endl;
 	multiShiftInverse(Denominator, ApproxHalfPowerAction, param.MaxIter, etaOdd, tmp);
 	// VdagV^-1/(2*inv_pow) MdagM^1/(2*inv_pow) eta
 	std::cout<<GridLogMessage << action_name() << " refresh: doing (V^dag V)^{-1/" << 2*param.inv_pow << "} ( (M^dag M)^{1/" << 2*param.inv_pow << "} eta)" << std::endl;
 	multiShiftInverse(Numerator, ApproxNegHalfPowerAction, param.MaxIter, tmp, PhiOdd);
 	assert(NumOp.ConstEE() == 1);
 	assert(DenOp.ConstEE() == 1);
 	PhiEven = Zero();
 	std::cout<<GridLogMessage << action_name() << " refresh: starting" << std::endl;
      };
      //////////////////////////////////////////////////////
      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
      //////////////////////////////////////////////////////
      virtual RealD S(const GaugeField &U) {
 	std::cout<<GridLogMessage << action_name() << " compute action: starting" << std::endl;
 	ImportGauge(U);
 	FermionField X(NumOp.FermionRedBlackGrid());
 	FermionField Y(NumOp.FermionRedBlackGrid());
 	// VdagV^1/(2*inv_pow) Phi
 	std::cout<<GridLogMessage << action_name() << " compute action: doing (V^dag V)^{1/" << 2*param.inv_pow << "} Phi" << std::endl;
 	multiShiftInverse(Numerator, ApproxHalfPowerAction, param.MaxIter, PhiOdd,X);
 	// MdagM^-1/(2*inv_pow) VdagV^1/(2*inv_pow) Phi
 	std::cout<<GridLogMessage << action_name() << " compute action: doing (M^dag M)^{-1/" << 2*param.inv_pow << "} ( (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
 	multiShiftInverse(Denominator, ApproxNegHalfPowerAction, param.MaxIter, X,Y);
 	// Randomly apply rational bounds checks.
 	int rcheck = rand();
 	auto grid = NumOp.FermionGrid();
        auto r=rand();
        grid->Broadcast(0,r);
 	if ( param.BoundsCheckFreq != 0 && (r % param.BoundsCheckFreq)==0 ) { 
 	  std::cout<<GridLogMessage << action_name() << " compute action: doing bounds check" << std::endl;
 	  FermionField gauss(NumOp.FermionRedBlackGrid());
 	  gauss = PhiOdd;
 	  SchurDifferentiableOperator<Impl> MdagM(DenOp);
 	  std::cout<<GridLogMessage << action_name() << " compute action: checking high bounds" << std::endl;
 	  HighBoundCheck(MdagM,gauss,param.hi);
 	  std::cout<<GridLogMessage << action_name() << " compute action: full approximation" << std::endl;
 	  InversePowerBoundsCheck(param.inv_pow,param.MaxIter,param.action_tolerance*100,MdagM,gauss,ApproxNegPowerAction);
 	  std::cout<<GridLogMessage << action_name() << " compute action: bounds check complete" << std::endl;
 	}
 	//  Phidag VdagV^1/(2*inv_pow) MdagM^-1/(2*inv_pow)  MdagM^-1/(2*inv_pow) VdagV^1/(2*inv_pow) Phi
 	RealD action = norm2(Y);
 	std::cout<<GridLogMessage << action_name() << " compute action: complete" << std::endl;
 	return action;
      };
      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
      //
      // Here, M is some 5D operator and V is the Pauli-Villars field
      // N and D makeup the rat. poly of the M term and P and & makeup the rat.poly of the denom term
      //
      // Need  
      // dS_f/dU =  chi^dag d[P/Q]  N/D   P/Q  chi 
      //         +  chi^dag   P/Q d[N/D]  P/Q  chi 
      //         +  chi^dag   P/Q   N/D d[P/Q] chi 
      //
      // P/Q is expressed as partial fraction expansion: 
      // 
      //           a0 + \sum_k ak/(V^dagV + bk) 
      //  
      // d[P/Q] is then  
      //
      //          \sum_k -ak [V^dagV+bk]^{-1}  [ dV^dag V + V^dag dV ] [V^dag V + bk]^{-1} 
      //  
      // and similar for N/D. 
      // 
      // Need   
      //       MpvPhi_k   = [Vdag V + bk]^{-1} chi  
      //       MpvPhi     = {a0 +  \sum_k ak [Vdag V + bk]^{-1} }chi   
      //   
      //       MfMpvPhi_k = [MdagM+bk]^{-1} MpvPhi  
      //       MfMpvPhi   = {a0 +  \sum_k ak [Mdag M + bk]^{-1} } MpvPhi
      // 
      //       MpvMfMpvPhi_k = [Vdag V + bk]^{-1} MfMpvchi   
      //  
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
 	std::cout<<GridLogMessage << action_name() << " deriv: starting" << std::endl;
 	const int n_f  = ApproxNegPowerMD.poles.size();
 	const int n_pv = ApproxHalfPowerMD.poles.size();
 	std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionRedBlackGrid());
 	std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionRedBlackGrid());
 	std::vector<FermionField> MfMpvPhi_k   (n_f ,NumOp.FermionRedBlackGrid());
 	FermionField      MpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField    MfMpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField MpvMfMpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField           Y(NumOp.FermionRedBlackGrid());
 	GaugeField   tmp(NumOp.GaugeGrid());
 	ImportGauge(U);
 	std::cout<<GridLogMessage << action_name() << " deriv: doing (V^dag V)^{1/" << 2*param.inv_pow << "} Phi" << std::endl;
 	multiShiftInverse(Numerator, ApproxHalfPowerMD, param.MaxIter, PhiOdd,MpvPhi_k,MpvPhi);
 	std::cout<<GridLogMessage << action_name() << " deriv: doing (M^dag M)^{-1/" << param.inv_pow << "} ( (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
 	multiShiftInverse(Denominator, ApproxNegPowerMD, param.MaxIter, MpvPhi,MfMpvPhi_k,MfMpvPhi);
 	std::cout<<GridLogMessage << action_name() << " deriv: doing (V^dag V)^{1/" << 2*param.inv_pow << "} ( (M^dag M)^{-1/" << param.inv_pow << "} (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
 	multiShiftInverse(Numerator, ApproxHalfPowerMD, param.MaxIter, MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi);
 	SchurDifferentiableOperator<Impl> MdagM(DenOp);
 	SchurDifferentiableOperator<Impl> VdagV(NumOp);
 	RealD ak;
 	dSdU = Zero();
 	// With these building blocks  
 	//  
 	//       dS/dU = 
 	//                 \sum_k -ak MfMpvPhi_k^dag      [ dM^dag M + M^dag dM ] MfMpvPhi_k         (1)
 	//             +   \sum_k -ak MpvMfMpvPhi_k^\dag  [ dV^dag V + V^dag dV ] MpvPhi_k           (2)
 	//                        -ak MpvPhi_k^dag        [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k      (3)
 	//(1)	
 	std::cout<<GridLogMessage << action_name() << " deriv: doing dS/dU part (1)" << std::endl;
 	for(int k=0;k<n_f;k++){
 	  ak = ApproxNegPowerMD.residues[k];
 	  MdagM.Mpc(MfMpvPhi_k[k],Y);
 	  MdagM.MpcDagDeriv(tmp , MfMpvPhi_k[k], Y );  dSdU=dSdU+ak*tmp;
 	  MdagM.MpcDeriv(tmp , Y, MfMpvPhi_k[k] );  dSdU=dSdU+ak*tmp;
 	}
 	//(2)
 	//(3)
 	std::cout<<GridLogMessage << action_name() << " deriv: doing dS/dU part (2)+(3)" << std::endl;
 	for(int k=0;k<n_pv;k++){
          ak = ApproxHalfPowerMD.residues[k];
 	  VdagV.Mpc(MpvPhi_k[k],Y);
 	  VdagV.MpcDagDeriv(tmp,MpvMfMpvPhi_k[k],Y); dSdU=dSdU+ak*tmp;
 	  VdagV.MpcDeriv   (tmp,Y,MpvMfMpvPhi_k[k]);  dSdU=dSdU+ak*tmp;     
 	  VdagV.Mpc(MpvMfMpvPhi_k[k],Y);                // V as we take Ydag 
 	  VdagV.MpcDeriv   (tmp,Y, MpvPhi_k[k]); dSdU=dSdU+ak*tmp;
 	  VdagV.MpcDagDeriv(tmp,MpvPhi_k[k], Y); dSdU=dSdU+ak*tmp;
 	}
 	//dSdU = Ta(dSdU);
 	std::cout<<GridLogMessage << action_name() << " deriv: complete" << std::endl;
      };
    };
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
+++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
@ -0,0 +1,93 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
    Copyright (C) 2015
    Author: Christopher Kelly <ckelly@bnl.gov>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
 #define QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
 NAMESPACE_BEGIN(Grid);
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Generic rational approximation for ratios of operators utilizing the mixed precision multishift algorithm
    // cf. GeneralEvenOddRational.h for details
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
    template<class ImplD, class ImplF>
    class GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<ImplD> {
    private:
      typedef typename ImplD::FermionField FermionFieldD;
      typedef typename ImplF::FermionField FermionFieldF;
      FermionOperator<ImplD> & NumOpD;
      FermionOperator<ImplD> & DenOpD;
      FermionOperator<ImplF> & NumOpF;
      FermionOperator<ImplF> & DenOpF;
      Integer ReliableUpdateFreq;
    protected:
      //Allow derived classes to override the multishift CG
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, FermionFieldD &out){
 	SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
 	SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
 	ConjugateGradientMultiShiftMixedPrec<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
 	msCG(schurOpD, in, out);
      }
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector<FermionFieldD> &out_elems, FermionFieldD &out){
 	SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
 	SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
 	ConjugateGradientMultiShiftMixedPrec<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
 	msCG(schurOpD, in, out_elems, out);
      }
      //Allow derived classes to override the gauge import
      virtual void ImportGauge(const typename ImplD::GaugeField &Ud){
 	typename ImplF::GaugeField Uf(NumOpF.GaugeGrid());
 	precisionChange(Uf, Ud);
 	NumOpD.ImportGauge(Ud);
 	DenOpD.ImportGauge(Ud);
 	NumOpF.ImportGauge(Uf);
 	DenOpF.ImportGauge(Uf);
      }
    public:
      GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction(FermionOperator<ImplD>  &_NumOpD, FermionOperator<ImplD>  &_DenOpD, 
 							      FermionOperator<ImplF>  &_NumOpF, FermionOperator<ImplF>  &_DenOpF, 
 							      const RationalActionParams & p, Integer _ReliableUpdateFreq
 							      ) : GeneralEvenOddRatioRationalPseudoFermionAction<ImplD>(_NumOpD, _DenOpD, p),
 								  ReliableUpdateFreq(_ReliableUpdateFreq), NumOpD(_NumOpD), DenOpD(_DenOpD), NumOpF(_NumOpF), DenOpF(_DenOpF){}
      virtual std::string action_name(){return "GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction";}
    };
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
@ -40,249 +40,31 @@ NAMESPACE_BEGIN(Grid);
    // Here N/D \sim R_{-1/2} ~ (M^dagM)^{-1/2}  
    template<class Impl>
-    class OneFlavourEvenOddRatioRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
+    class OneFlavourEvenOddRatioRationalPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<Impl> {
    public:
      INHERIT_IMPL_TYPES(Impl);
      typedef OneFlavourRationalParams Params;
      Params param;
      MultiShiftFunction PowerHalf   ;
      MultiShiftFunction PowerNegHalf;
      MultiShiftFunction PowerQuarter;
      MultiShiftFunction PowerNegQuarter;
    private:
-     
+      static RationalActionParams transcribe(const Params &in){
-      FermionOperator<Impl> & NumOp;// the basic operator
+	RationalActionParams out;
-      FermionOperator<Impl> & DenOp;// the basic operator
+	out.inv_pow = 2;
-      FermionField PhiEven; // the pseudo fermion field for this trajectory
+	out.lo = in.lo;
-      FermionField PhiOdd; // the pseudo fermion field for this trajectory
+	out.hi = in.hi;
 	out.MaxIter = in.MaxIter;
 	out.action_tolerance = out.md_tolerance = in.tolerance;
 	out.action_degree = out.md_degree = in.degree;
 	out.precision = in.precision;
 	out.BoundsCheckFreq = in.BoundsCheckFreq;
 	return out;
      }
    public:
      OneFlavourEvenOddRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
-					    FermionOperator<Impl>  &_DenOp, 
+							FermionOperator<Impl>  &_DenOp, 
-					    Params & p
+							const Params & p
-					    ) : 
+							) : 
-      NumOp(_NumOp), 
+	GeneralEvenOddRatioRationalPseudoFermionAction<Impl>(_NumOp, _DenOp, transcribe(p)){}
      DenOp(_DenOp), 
      PhiOdd (_NumOp.FermionRedBlackGrid()),
      PhiEven(_NumOp.FermionRedBlackGrid()),
      param(p) 
      {
 	AlgRemez remez(param.lo,param.hi,param.precision);
-	// MdagM^(+- 1/2)
+      virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}      
 	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
 	remez.generateApprox(param.degree,1,2);
 	PowerHalf.Init(remez,param.tolerance,false);
 	PowerNegHalf.Init(remez,param.tolerance,true);
 	// MdagM^(+- 1/4)
 	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
 	remez.generateApprox(param.degree,1,4);
   	PowerQuarter.Init(remez,param.tolerance,false);
 	PowerNegQuarter.Init(remez,param.tolerance,true);
      };
      virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}
      virtual std::string LogParameters(){
 	std::stringstream sstream;
 	sstream << GridLogMessage << "["<<action_name()<<"] Low            :" << param.lo <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] High           :" << param.hi <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Max iterations :" << param.MaxIter <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Tolerance      :" << param.tolerance <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Degree         :" << param.degree <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Precision      :" << param.precision <<  std::endl;
 	return sstream.str();
      }
      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
 	// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
 	//
 	// P(phi) = e^{- phi^dag (VdagV)^1/4 (MdagM)^-1/2 (VdagV)^1/4 phi}
 	//        = e^{- phi^dag  (VdagV)^1/4 (MdagM)^-1/4 (MdagM)^-1/4  (VdagV)^1/4 phi}
 	//
 	// Phi =  (VdagV)^-1/4 Mdag^{1/4} eta 
 	//
 	// P(eta) = e^{- eta^dag eta}
 	//
 	// e^{x^2/2 sig^2} => sig^2 = 0.5.
 	// 
 	// So eta should be of width sig = 1/sqrt(2).
 	RealD scale = std::sqrt(0.5);
 	FermionField eta(NumOp.FermionGrid());
 	FermionField etaOdd (NumOp.FermionRedBlackGrid());
 	FermionField etaEven(NumOp.FermionRedBlackGrid());
 	FermionField     tmp(NumOp.FermionRedBlackGrid());
 	gaussian(pRNG,eta);	eta=eta*scale;
 	pickCheckerboard(Even,etaEven,eta);
 	pickCheckerboard(Odd,etaOdd,eta);
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
 	// MdagM^1/4 eta
 	SchurDifferentiableOperator<Impl> MdagM(DenOp);
 	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerQuarter);
 	msCG_M(MdagM,etaOdd,tmp);
 	// VdagV^-1/4 MdagM^1/4 eta
 	SchurDifferentiableOperator<Impl> VdagV(NumOp);
 	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerNegQuarter);
 	msCG_V(VdagV,tmp,PhiOdd);
 	assert(NumOp.ConstEE() == 1);
 	assert(DenOp.ConstEE() == 1);
 	PhiEven = Zero();
      };
      //////////////////////////////////////////////////////
      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
      //////////////////////////////////////////////////////
      virtual RealD S(const GaugeField &U) {
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
 	FermionField X(NumOp.FermionRedBlackGrid());
 	FermionField Y(NumOp.FermionRedBlackGrid());
 	// VdagV^1/4 Phi
 	SchurDifferentiableOperator<Impl> VdagV(NumOp);
 	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
 	msCG_V(VdagV,PhiOdd,X);
 	// MdagM^-1/4 VdagV^1/4 Phi
 	SchurDifferentiableOperator<Impl> MdagM(DenOp);
 	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegQuarter);
 	msCG_M(MdagM,X,Y);
 	// Randomly apply rational bounds checks.
 	auto grid = NumOp.FermionGrid();
        auto r=rand();
        grid->Broadcast(0,r);
        if ( (r%param.BoundsCheckFreq)==0 ) { 
 	  FermionField gauss(NumOp.FermionRedBlackGrid());
 	  gauss = PhiOdd;
 	  HighBoundCheck(MdagM,gauss,param.hi);
 	  InverseSqrtBoundsCheck(param.MaxIter,param.tolerance*100,MdagM,gauss,PowerNegHalf);
 	}
 	//  Phidag VdagV^1/4 MdagM^-1/4  MdagM^-1/4 VdagV^1/4 Phi
 	RealD action = norm2(Y);
 	return action;
      };
      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
      //
      // Here, M is some 5D operator and V is the Pauli-Villars field
      // N and D makeup the rat. poly of the M term and P and & makeup the rat.poly of the denom term
      //
      // Need  
      // dS_f/dU =  chi^dag d[P/Q]  N/D   P/Q  chi 
      //         +  chi^dag   P/Q d[N/D]  P/Q  chi 
      //         +  chi^dag   P/Q   N/D d[P/Q] chi 
      //
      // P/Q is expressed as partial fraction expansion: 
      // 
      //           a0 + \sum_k ak/(V^dagV + bk) 
      //  
      // d[P/Q] is then  
      //
      //          \sum_k -ak [V^dagV+bk]^{-1}  [ dV^dag V + V^dag dV ] [V^dag V + bk]^{-1} 
      //  
      // and similar for N/D. 
      // 
      // Need   
      //       MpvPhi_k   = [Vdag V + bk]^{-1} chi  
      //       MpvPhi     = {a0 +  \sum_k ak [Vdag V + bk]^{-1} }chi   
      //   
      //       MfMpvPhi_k = [MdagM+bk]^{-1} MpvPhi  
      //       MfMpvPhi   = {a0 +  \sum_k ak [Mdag M + bk]^{-1} } MpvPhi
      // 
      //       MpvMfMpvPhi_k = [Vdag V + bk]^{-1} MfMpvchi   
      //  
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
 	const int n_f  = PowerNegHalf.poles.size();
 	const int n_pv = PowerQuarter.poles.size();
 	std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionRedBlackGrid());
 	std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionRedBlackGrid());
 	std::vector<FermionField> MfMpvPhi_k   (n_f ,NumOp.FermionRedBlackGrid());
 	FermionField      MpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField    MfMpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField MpvMfMpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField           Y(NumOp.FermionRedBlackGrid());
 	GaugeField   tmp(NumOp.GaugeGrid());
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
 	SchurDifferentiableOperator<Impl> VdagV(NumOp);
 	SchurDifferentiableOperator<Impl> MdagM(DenOp);
 	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
 	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegHalf);
 	msCG_V(VdagV,PhiOdd,MpvPhi_k,MpvPhi);
 	msCG_M(MdagM,MpvPhi,MfMpvPhi_k,MfMpvPhi);
 	msCG_V(VdagV,MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi);
 	RealD ak;
 	dSdU = Zero();
 	// With these building blocks  
 	//  
 	//       dS/dU = 
 	//                 \sum_k -ak MfMpvPhi_k^dag      [ dM^dag M + M^dag dM ] MfMpvPhi_k         (1)
 	//             +   \sum_k -ak MpvMfMpvPhi_k^\dag  [ dV^dag V + V^dag dV ] MpvPhi_k           (2)
 	//                        -ak MpvPhi_k^dag        [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k      (3)
 	//(1)
 	for(int k=0;k<n_f;k++){
 	  ak = PowerNegHalf.residues[k];
 	  MdagM.Mpc(MfMpvPhi_k[k],Y);
 	  MdagM.MpcDagDeriv(tmp , MfMpvPhi_k[k], Y );  dSdU=dSdU+ak*tmp;
 	  MdagM.MpcDeriv(tmp , Y, MfMpvPhi_k[k] );  dSdU=dSdU+ak*tmp;
 	}
 	//(2)
 	//(3)
 	for(int k=0;k<n_pv;k++){
          ak = PowerQuarter.residues[k];
 	  VdagV.Mpc(MpvPhi_k[k],Y);
 	  VdagV.MpcDagDeriv(tmp,MpvMfMpvPhi_k[k],Y); dSdU=dSdU+ak*tmp;
 	  VdagV.MpcDeriv   (tmp,Y,MpvMfMpvPhi_k[k]);  dSdU=dSdU+ak*tmp;     
 	  VdagV.Mpc(MpvMfMpvPhi_k[k],Y);                // V as we take Ydag 
 	  VdagV.MpcDeriv   (tmp,Y, MpvPhi_k[k]); dSdU=dSdU+ak*tmp;
 	  VdagV.MpcDagDeriv(tmp,MpvPhi_k[k], Y); dSdU=dSdU+ak*tmp;
 	}
 	//dSdU = Ta(dSdU);
      };
    };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/pseudofermion/PseudoFermion.h
+++ b/Grid/qcd/action/pseudofermion/PseudoFermion.h
@ -26,8 +26,7 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef QCD_PSEUDOFERMION_AGGREGATE_H
+#pragma once
 #define QCD_PSEUDOFERMION_AGGREGATE_H
 // Rational functions
 #include <Grid/qcd/action/pseudofermion/Bounds.h>
@ -40,7 +39,14 @@ directory
 #include <Grid/qcd/action/pseudofermion/OneFlavourRational.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
 #include <Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h>
 #include <Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourPseudoFermion.h>
 #include <Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion.h>
 #include <Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion.h>
 #endif
--- a/Grid/qcd/action/pseudofermion/TwoFlavour.h
+++ b/Grid/qcd/action/pseudofermion/TwoFlavour.h
@ -98,6 +98,7 @@ public:
    FermOp.ImportGauge(U);
    FermOp.Mdag(eta, Phi);
    std::cout << GridLogMessage << "Pseudofermion action refresh " << norm2(eta) << std::endl;
  };
  //////////////////////////////////////////////////////
--- a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
+++ b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
@ -50,6 +50,8 @@ NAMESPACE_BEGIN(Grid);
      FermionField PhiOdd;   // the pseudo fermion field for this trajectory
      FermionField PhiEven;  // the pseudo fermion field for this trajectory
      virtual void refreshRestrict(FermionField &eta) {};
    public:
      TwoFlavourEvenOddRatioPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
                                                FermionOperator<Impl>  &_DenOp, 
@ -60,7 +62,8 @@ NAMESPACE_BEGIN(Grid);
      TwoFlavourEvenOddRatioPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
                                                FermionOperator<Impl>  &_DenOp, 
                                                OperatorFunction<FermionField> & DS,
-                                                OperatorFunction<FermionField> & AS, OperatorFunction<FermionField> & HS) :
+                                                OperatorFunction<FermionField> & AS,
 						OperatorFunction<FermionField> & HS) :
      NumOp(_NumOp), 
      DenOp(_DenOp), 
      DerivativeSolver(DS), 
@ -83,16 +86,7 @@ NAMESPACE_BEGIN(Grid);
 	return sstream.str();
      } 
      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
        // P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
        //
        // NumOp == V
        // DenOp == M
        //
        // Take phi_o = Vpcdag^{-1} Mpcdag eta_o  ; eta_o = Mpcdag^{-1} Vpcdag Phi
        //
        // P(eta_o) = e^{- eta_o^dag eta_o}
        //
        // e^{x^2/2 sig^2} => sig^2 = 0.5.
@ -100,12 +94,23 @@ NAMESPACE_BEGIN(Grid);
        RealD scale = std::sqrt(0.5);
        FermionField eta    (NumOp.FermionGrid());
        gaussian(pRNG,eta); eta = eta * scale;
 	refreshRestrict(eta); // Used by DDHMC
 	refresh(U,eta);
      }
      void refresh(const GaugeField &U, const FermionField &eta) {
        // P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
        //
        // NumOp == V
        // DenOp == M
        //
        // Take phi_o = Vpcdag^{-1} Mpcdag eta_o  ; eta_o = Mpcdag^{-1} Vpcdag Phi
        FermionField etaOdd (NumOp.FermionRedBlackGrid());
        FermionField etaEven(NumOp.FermionRedBlackGrid());
        FermionField tmp    (NumOp.FermionRedBlackGrid());
        gaussian(pRNG,eta);
        pickCheckerboard(Even,etaEven,eta);
        pickCheckerboard(Odd,etaOdd,eta);
@ -125,8 +130,9 @@ NAMESPACE_BEGIN(Grid);
        DenOp.MooeeDag(etaEven,tmp);
        NumOp.MooeeInvDag(tmp,PhiEven);
-        PhiOdd =PhiOdd*scale;
+        //PhiOdd =PhiOdd*scale;
-        PhiEven=PhiEven*scale;
+        //PhiEven=PhiEven*scale;
 	std::cout << GridLogMessage<<" TwoFlavourEvenOddRatio Expect action to be "<<norm2(etaOdd) + norm2(etaEven)<<std::endl;
      };
@ -161,6 +167,8 @@ NAMESPACE_BEGIN(Grid);
        DenOp.MooeeInvDag(X,Y);
        action = action + norm2(Y);
 	std::cout << GridLogMessage<<" TwoFlavourEvenOddRatio action is "<<action<<std::endl;
        return action;
      };
@ -173,7 +181,7 @@ NAMESPACE_BEGIN(Grid);
        NumOp.ImportGauge(U);
        DenOp.ImportGauge(U);
-
+	
        SchurDifferentiableOperator<Impl> Mpc(DenOp);
        SchurDifferentiableOperator<Impl> Vpc(NumOp);
@ -208,7 +216,7 @@ NAMESPACE_BEGIN(Grid);
        assert(DenOp.ConstEE() == 1);
        dSdU = -dSdU;
-        
+
      };
    };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/pseudofermion/TwoFlavourRatio.h
+++ b/Grid/qcd/action/pseudofermion/TwoFlavourRatio.h
@ -99,7 +99,7 @@ public:
    NumOp.M(tmp,Phi);               // Vdag^-1 Mdag eta
    Phi=Phi*scale;
-	
+    std::cout << GridLogMessage<<" TwoFlavourRatio Expect action to be "<<norm2(eta)*scale*scale<<std::endl;
  };
  //////////////////////////////////////////////////////
@ -121,6 +121,7 @@ public:
    DenOp.M(X,Y);                  // Y=  Mdag^-1 Vdag phi
    RealD action = norm2(Y);
    std::cout << GridLogMessage<<" TwoFlavourRatio action is "<<action<<std::endl;
    return action;
  };
--- a/Grid/qcd/action/pseudofermion/TwoFlavourRatio4DPseudoFermion.h
+++ b/Grid/qcd/action/pseudofermion/TwoFlavourRatio4DPseudoFermion.h
@ -0,0 +1,197 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/TwoFlavourRatio.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////
 // Two flavour ratio
 ///////////////////////////////////////
 template<class Impl>
 class TwoFlavourRatio4DPseudoFermionAction : public Action<typename Impl::GaugeField> {
 public:
  INHERIT_IMPL_TYPES(Impl);
 private:
  FermionOperator<Impl> & NumOp;// the basic operator
  FermionOperator<Impl> & DenOp;// the basic operator
  OperatorFunction<FermionField> &DerivativeSolver;
  OperatorFunction<FermionField> &ActionSolver;
  FermionField phi4; // the pseudo fermion field for this trajectory
 public:
  TwoFlavourRatio4DPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
 				       FermionOperator<Impl>  &_DenOp, 
 				       OperatorFunction<FermionField> & DS,
 				       OperatorFunction<FermionField> & AS
 				       ) : NumOp(_NumOp),
 					   DenOp(_DenOp),
 					   DerivativeSolver(DS),
 					   ActionSolver(AS),
 					   phi4(_NumOp.GaugeGrid())
  {};
  virtual std::string action_name(){return "TwoFlavourRatio4DPseudoFermionAction";}
  virtual std::string LogParameters(){
    std::stringstream sstream;
    sstream << GridLogMessage << "["<<action_name()<<"] has no parameters" << std::endl;
    return sstream.str();
  }  
  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
    // P(phi) = e^{- phi^dag (V^dag M^-dag)_11  (M^-1 V)_11 phi}
    //
    // NumOp == V
    // DenOp == M
    //
    // Take phi = (V^{-1} M)_11 eta  ; eta = (M^{-1} V)_11 Phi
    //
    // P(eta) = e^{- eta^dag eta}
    //
    // e^{x^2/2 sig^2} => sig^2 = 0.5.
    // 
    // So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707....
    //
    RealD scale = std::sqrt(0.5);
    FermionField eta4(NumOp.GaugeGrid());
    FermionField eta5(NumOp.FermionGrid());
    FermionField tmp(NumOp.FermionGrid());
    FermionField phi5(NumOp.FermionGrid());
    gaussian(pRNG,eta4);
    NumOp.ImportFourDimPseudoFermion(eta4,eta5);
    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);
    MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagMOp(NumOp);
    DenOp.M(eta5,phi5);               // M eta
    NumOp.Mdag(phi5,tmp);            // Vdag M eta
    phi5 = Zero();
    ActionSolver(MdagMOp,tmp,phi5);  // (VdagV)^-1 M eta = V^-1 Vdag^-1 Vdag M eta = V^-1 M eta
    phi5=phi5*scale;
    // Project to 4d
    NumOp.ExportFourDimPseudoFermion(phi5,phi4);
  };
  //////////////////////////////////////////////////////
  // S = phi^dag (V^dag M^-dag)_11  (M^-1 V)_11 phi
  //////////////////////////////////////////////////////
  virtual RealD S(const GaugeField &U) {
    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);
    FermionField Y4(NumOp.GaugeGrid());
    FermionField X(NumOp.FermionGrid());
    FermionField Y(NumOp.FermionGrid());
    FermionField phi5(NumOp.FermionGrid());
    MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagMOp(DenOp);
    NumOp.ImportFourDimPseudoFermion(phi4,phi5);
    NumOp.M(phi5,Y);              // Y= V phi
    DenOp.Mdag(Y,X);              // X= Mdag V phi
    Y=Zero();
    ActionSolver(MdagMOp,X,Y);    // Y= (MdagM)^-1 Mdag Vdag phi = M^-1 V phi
    NumOp.ExportFourDimPseudoFermion(Y,Y4);
    RealD action = norm2(Y4);
    return action;
  };
  //////////////////////////////////////////////////////
  // dS/du = 2 Re phi^dag (V^dag M^-dag)_11  (M^-1 d V)_11  phi
  //       - 2 Re phi^dag (dV^dag M^-dag)_11  (M^-1 dM M^-1 V)_11  phi
  //////////////////////////////////////////////////////
  virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);
    MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagMOp(DenOp);
    FermionField  X(NumOp.FermionGrid());
    FermionField  Y(NumOp.FermionGrid());
    FermionField       phi(NumOp.FermionGrid());
    FermionField      Vphi(NumOp.FermionGrid());
    FermionField  MinvVphi(NumOp.FermionGrid());
    FermionField      tmp4(NumOp.GaugeGrid());
    FermionField  MdagInvMinvVphi(NumOp.FermionGrid());
    GaugeField   force(NumOp.GaugeGrid());	
    //Y=V phi
    //X = (Mdag V phi
    //Y = (Mdag M)^-1 Mdag V phi = M^-1 V Phi
    NumOp.ImportFourDimPseudoFermion(phi4,phi);
    NumOp.M(phi,Vphi);               //  V phi
    DenOp.Mdag(Vphi,X);              // X=  Mdag V phi
    Y=Zero();
    DerivativeSolver(MdagMOp,X,MinvVphi);// M^-1 V phi
    // Projects onto the physical space and back
    NumOp.ExportFourDimPseudoFermion(MinvVphi,tmp4);
    NumOp.ImportFourDimPseudoFermion(tmp4,Y);
    X=Zero();
    DerivativeSolver(MdagMOp,Y,X);// X = (MdagM)^-1 proj M^-1 V phi
    DenOp.M(X,MdagInvMinvVphi);
    // phi^dag (Vdag Mdag^-1) (M^-1 dV)  phi
    NumOp.MDeriv(force ,MdagInvMinvVphi , phi, DaggerNo );  dSdU=force;
    // phi^dag (dVdag Mdag^-1) (M^-1 V)  phi
    NumOp.MDeriv(force , phi, MdagInvMinvVphi ,DaggerYes  );  dSdU=dSdU+force;
    //    - 2 Re phi^dag (dV^dag M^-dag)_11  (M^-1 dM M^-1 V)_11  phi
    DenOp.MDeriv(force,MdagInvMinvVphi,MinvVphi,DaggerNo);   dSdU=dSdU-force;
    DenOp.MDeriv(force,MinvVphi,MdagInvMinvVphi,DaggerYes);  dSdU=dSdU-force;
    dSdU *= -1.0; 
    //dSdU = - Ta(dSdU);
  };
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/pseudofermion/TwoFlavourRatioEO4DPseudoFermion.h
+++ b/Grid/qcd/action/pseudofermion/TwoFlavourRatioEO4DPseudoFermion.h
@ -0,0 +1,203 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/TwoFlavourRatio.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////
 // Two flavour ratio
 ///////////////////////////////////////
 template<class Impl>
 class TwoFlavourRatioEO4DPseudoFermionAction : public Action<typename Impl::GaugeField> {
 public:
  INHERIT_IMPL_TYPES(Impl);
 private:
  typedef FermionOperator<Impl> FermOp;
  FermionOperator<Impl> & NumOp;// the basic operator
  FermionOperator<Impl> & DenOp;// the basic operator
  OperatorFunction<FermionField> &DerivativeSolver;
  OperatorFunction<FermionField> &DerivativeDagSolver;
  OperatorFunction<FermionField> &ActionSolver;
  OperatorFunction<FermionField> &HeatbathSolver;
  FermionField phi4; // the pseudo fermion field for this trajectory
 public:
  TwoFlavourRatioEO4DPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
 					 FermionOperator<Impl>  &_DenOp, 
 					 OperatorFunction<FermionField> & DS,
 					 OperatorFunction<FermionField> & AS ) : 
    TwoFlavourRatioEO4DPseudoFermionAction(_NumOp,_DenOp, DS,DS,AS,AS) {};
  TwoFlavourRatioEO4DPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
 					 FermionOperator<Impl>  &_DenOp, 
 					 OperatorFunction<FermionField> & DS,
 					 OperatorFunction<FermionField> & DDS,
 					 OperatorFunction<FermionField> & AS,
 					 OperatorFunction<FermionField> & HS
 				       ) : NumOp(_NumOp),
 					   DenOp(_DenOp),
 					   DerivativeSolver(DS),
 					   DerivativeDagSolver(DDS),
 					   ActionSolver(AS),
 					   HeatbathSolver(HS),
 					   phi4(_NumOp.GaugeGrid())
  {};
  virtual std::string action_name(){return "TwoFlavourRatioEO4DPseudoFermionAction";}
  virtual std::string LogParameters(){
    std::stringstream sstream;
    sstream << GridLogMessage << "["<<action_name()<<"] has no parameters" << std::endl;
    return sstream.str();
  }  
  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
    // P(phi) = e^{- phi^dag (V^dag M^-dag)_11  (M^-1 V)_11 phi}
    //
    // NumOp == V
    // DenOp == M
    //
    // Take phi = (V^{-1} M)_11 eta  ; eta = (M^{-1} V)_11 Phi
    //
    // P(eta) = e^{- eta^dag eta}
    //
    // e^{x^2/2 sig^2} => sig^2 = 0.5.
    // 
    // So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707....
    //
    RealD scale = std::sqrt(0.5);
    FermionField eta4(NumOp.GaugeGrid());
    FermionField eta5(NumOp.FermionGrid());
    FermionField tmp(NumOp.FermionGrid());
    FermionField phi5(NumOp.FermionGrid());
    gaussian(pRNG,eta4);
    NumOp.ImportFourDimPseudoFermion(eta4,eta5);
    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);
    SchurRedBlackDiagMooeeSolve<FermionField> PrecSolve(HeatbathSolver);
    DenOp.M(eta5,tmp);               // M eta
    PrecSolve(NumOp,tmp,phi5);  // phi = V^-1 M eta
    phi5=phi5*scale;
    std::cout << GridLogMessage << "4d pf refresh "<< norm2(phi5)<<"\n";
    // Project to 4d
    NumOp.ExportFourDimPseudoFermion(phi5,phi4);
  };
  //////////////////////////////////////////////////////
  // S = phi^dag (V^dag M^-dag)_11  (M^-1 V)_11 phi
  //////////////////////////////////////////////////////
  virtual RealD S(const GaugeField &U) {
    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);
    FermionField Y4(NumOp.GaugeGrid());
    FermionField X(NumOp.FermionGrid());
    FermionField Y(NumOp.FermionGrid());
    FermionField phi5(NumOp.FermionGrid());
    MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagMOp(DenOp);
    SchurRedBlackDiagMooeeSolve<FermionField> PrecSolve(ActionSolver);
    NumOp.ImportFourDimPseudoFermion(phi4,phi5);
    NumOp.M(phi5,X);              // X= V phi
    PrecSolve(DenOp,X,Y);    // Y= (MdagM)^-1 Mdag Vdag phi = M^-1 V phi
    NumOp.ExportFourDimPseudoFermion(Y,Y4);
    RealD action = norm2(Y4);
    return action;
  };
  //////////////////////////////////////////////////////
  // dS/du = 2 Re phi^dag (V^dag M^-dag)_11  (M^-1 d V)_11  phi
  //       - 2 Re phi^dag (dV^dag M^-dag)_11  (M^-1 dM M^-1 V)_11  phi
  //////////////////////////////////////////////////////
  virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);
    FermionField  X(NumOp.FermionGrid());
    FermionField  Y(NumOp.FermionGrid());
    FermionField       phi(NumOp.FermionGrid());
    FermionField      Vphi(NumOp.FermionGrid());
    FermionField  MinvVphi(NumOp.FermionGrid());
    FermionField      tmp4(NumOp.GaugeGrid());
    FermionField  MdagInvMinvVphi(NumOp.FermionGrid());
    GaugeField   force(NumOp.GaugeGrid());	
    //Y=V phi
    //X = (Mdag V phi
    //Y = (Mdag M)^-1 Mdag V phi = M^-1 V Phi
    NumOp.ImportFourDimPseudoFermion(phi4,phi);
    NumOp.M(phi,Vphi);               //  V phi
    SchurRedBlackDiagMooeeSolve<FermionField> PrecSolve(DerivativeSolver);
    PrecSolve(DenOp,Vphi,MinvVphi);// M^-1 V phi
    std::cout << GridLogMessage << "4d deriv solve "<< norm2(MinvVphi)<<"\n";
    // Projects onto the physical space and back
    NumOp.ExportFourDimPseudoFermion(MinvVphi,tmp4);
    NumOp.ImportFourDimPseudoFermion(tmp4,Y);
    SchurRedBlackDiagMooeeDagSolve<FermionField> PrecDagSolve(DerivativeDagSolver);
    // X = proj M^-dag V phi
    // Need an adjoint solve
    PrecDagSolve(DenOp,Y,MdagInvMinvVphi);
    std::cout << GridLogMessage << "4d deriv solve dag "<< norm2(MdagInvMinvVphi)<<"\n";
    // phi^dag (Vdag Mdag^-1) (M^-1 dV)  phi
    NumOp.MDeriv(force ,MdagInvMinvVphi , phi, DaggerNo );  dSdU=force;
    // phi^dag (dVdag Mdag^-1) (M^-1 V)  phi
    NumOp.MDeriv(force , phi, MdagInvMinvVphi ,DaggerYes  );  dSdU=dSdU+force;
    //    - 2 Re phi^dag (dV^dag M^-dag)_11  (M^-1 dM M^-1 V)_11  phi
    DenOp.MDeriv(force,MdagInvMinvVphi,MinvVphi,DaggerNo);   dSdU=dSdU-force;
    DenOp.MDeriv(force,MinvVphi,MdagInvMinvVphi,DaggerYes);  dSdU=dSdU-force;
    dSdU *= -1.0; 
    //dSdU = - Ta(dSdU);
  };
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/gparity/Gparity.h
+++ b/Grid/qcd/gparity/Gparity.h
@ -0,0 +1,6 @@
 #ifndef GRID_GPARITY_H_
 #define GRID_GPARITY_H_
 #include<Grid/qcd/gparity/GparityFlavour.h>
 #endif
--- a/Grid/qcd/gparity/GparityFlavour.cc
+++ b/Grid/qcd/gparity/GparityFlavour.cc
@ -0,0 +1,34 @@
 #include <Grid/Grid.h>
 NAMESPACE_BEGIN(Grid);
 const std::array<const GparityFlavour, 3> GparityFlavour::sigma_mu = {{
    GparityFlavour(GparityFlavour::Algebra::SigmaX),
    GparityFlavour(GparityFlavour::Algebra::SigmaY),
    GparityFlavour(GparityFlavour::Algebra::SigmaZ)
    }};
 const std::array<const GparityFlavour, 6> GparityFlavour::sigma_all = {{
  GparityFlavour(GparityFlavour::Algebra::Identity),
  GparityFlavour(GparityFlavour::Algebra::SigmaX),
  GparityFlavour(GparityFlavour::Algebra::SigmaY),
  GparityFlavour(GparityFlavour::Algebra::SigmaZ),
  GparityFlavour(GparityFlavour::Algebra::ProjPlus),
  GparityFlavour(GparityFlavour::Algebra::ProjMinus)
 }};
 const std::array<const char *, GparityFlavour::nSigma> GparityFlavour::name = {{
    "SigmaX",
    "MinusSigmaX",
    "SigmaY",
    "MinusSigmaY",
    "SigmaZ",
    "MinusSigmaZ",
    "Identity",
    "MinusIdentity",
    "ProjPlus",
    "MinusProjPlus",
    "ProjMinus",
    "MinusProjMinus"}};
 NAMESPACE_END(Grid);
--- a/Grid/qcd/gparity/GparityFlavour.h
+++ b/Grid/qcd/gparity/GparityFlavour.h
@ -0,0 +1,475 @@
 #ifndef GRID_QCD_GPARITY_FLAVOUR_H
 #define GRID_QCD_GPARITY_FLAVOUR_H
 //Support for flavour-matrix operations acting on the G-parity flavour index
 #include <array>
 NAMESPACE_BEGIN(Grid);
 class GparityFlavour {
  public:
    GRID_SERIALIZABLE_ENUM(Algebra, undef,
                           SigmaX, 0,
 			   MinusSigmaX, 1,
                           SigmaY, 2,
 			   MinusSigmaY, 3,
                           SigmaZ, 4,
 			   MinusSigmaZ, 5,
 			   Identity, 6,
 			   MinusIdentity, 7,
 			   ProjPlus, 8,
 			   MinusProjPlus, 9,
 			   ProjMinus, 10,
 			   MinusProjMinus, 11
 			   );
    static constexpr unsigned int nSigma = 12;
    static const std::array<const char *, nSigma>                name;
    static const std::array<const GparityFlavour, 3>             sigma_mu;
    static const std::array<const GparityFlavour, 6>            sigma_all;
    Algebra                                                      g;
  public:
  accelerator GparityFlavour(Algebra initg): g(initg) {}  
 };
 // 0 1  x   vector
 // 1 0
 template<class vtype>
 accelerator_inline void multFlavourSigmaX(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = rhs(1);
  ret(1) = rhs(0);
 };
 template<class vtype>
 accelerator_inline void lmultFlavourSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = rhs(1,0);
  ret(0,1) = rhs(1,1);
  ret(1,0) = rhs(0,0);
  ret(1,1) = rhs(0,1);
 };
 template<class vtype>
 accelerator_inline void rmultFlavourSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = rhs(0,1);
  ret(0,1) = rhs(0,0);
  ret(1,0) = rhs(1,1);
  ret(1,1) = rhs(1,0);
 };
 template<class vtype>
 accelerator_inline void multFlavourMinusSigmaX(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = -rhs(1);
  ret(1) = -rhs(0);
 };
 template<class vtype>
 accelerator_inline void lmultFlavourMinusSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -rhs(1,0);
  ret(0,1) = -rhs(1,1);
  ret(1,0) = -rhs(0,0);
  ret(1,1) = -rhs(0,1);
 };
 template<class vtype>
 accelerator_inline void rmultFlavourMinusSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -rhs(0,1);
  ret(0,1) = -rhs(0,0);
  ret(1,0) = -rhs(1,1);
  ret(1,1) = -rhs(1,0);
 };
 // 0 -i  x   vector
 // i 0
 template<class vtype>
 accelerator_inline void multFlavourSigmaY(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = timesMinusI(rhs(1));
  ret(1) = timesI(rhs(0));
 };
 template<class vtype>
 accelerator_inline void lmultFlavourSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = timesMinusI(rhs(1,0));
  ret(0,1) = timesMinusI(rhs(1,1));
  ret(1,0) = timesI(rhs(0,0));
  ret(1,1) = timesI(rhs(0,1));
 };
 template<class vtype>
 accelerator_inline void rmultFlavourSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = timesI(rhs(0,1));
  ret(0,1) = timesMinusI(rhs(0,0));
  ret(1,0) = timesI(rhs(1,1));
  ret(1,1) = timesMinusI(rhs(1,0));
 };
 template<class vtype>
 accelerator_inline void multFlavourMinusSigmaY(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = timesI(rhs(1));
  ret(1) = timesMinusI(rhs(0));
 };
 template<class vtype>
 accelerator_inline void lmultFlavourMinusSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = timesI(rhs(1,0));
  ret(0,1) = timesI(rhs(1,1));
  ret(1,0) = timesMinusI(rhs(0,0));
  ret(1,1) = timesMinusI(rhs(0,1));
 };
 template<class vtype>
 accelerator_inline void rmultFlavourMinusSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = timesMinusI(rhs(0,1));
  ret(0,1) = timesI(rhs(0,0));
  ret(1,0) = timesMinusI(rhs(1,1));
  ret(1,1) = timesI(rhs(1,0));
 };
 // 1 0  x   vector
 // 0 -1
 template<class vtype>
 accelerator_inline void multFlavourSigmaZ(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = rhs(0);
  ret(1) = -rhs(1);
 };
 template<class vtype>
 accelerator_inline void lmultFlavourSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = rhs(0,0);
  ret(0,1) = rhs(0,1);
  ret(1,0) = -rhs(1,0);
  ret(1,1) = -rhs(1,1);
 };
 template<class vtype>
 accelerator_inline void rmultFlavourSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = rhs(0,0);
  ret(0,1) = -rhs(0,1);
  ret(1,0) = rhs(1,0);
  ret(1,1) = -rhs(1,1);
 };
 template<class vtype>
 accelerator_inline void multFlavourMinusSigmaZ(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = -rhs(0);
  ret(1) = rhs(1);
 };
 template<class vtype>
 accelerator_inline void lmultFlavourMinusSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -rhs(0,0);
  ret(0,1) = -rhs(0,1);
  ret(1,0) = rhs(1,0);
  ret(1,1) = rhs(1,1);
 };
 template<class vtype>
 accelerator_inline void rmultFlavourMinusSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -rhs(0,0);
  ret(0,1) = rhs(0,1);
  ret(1,0) = -rhs(1,0);
  ret(1,1) = rhs(1,1);
 };
 template<class vtype>
 accelerator_inline void multFlavourIdentity(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = rhs(0);
  ret(1) = rhs(1);
 };
 template<class vtype>
 accelerator_inline void lmultFlavourIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = rhs(0,0);
  ret(0,1) = rhs(0,1);
  ret(1,0) = rhs(1,0);
  ret(1,1) = rhs(1,1);
 };
 template<class vtype>
 accelerator_inline void rmultFlavourIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = rhs(0,0);
  ret(0,1) = rhs(0,1);
  ret(1,0) = rhs(1,0);
  ret(1,1) = rhs(1,1);
 };
 template<class vtype>
 accelerator_inline void multFlavourMinusIdentity(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = -rhs(0);
  ret(1) = -rhs(1);
 };
 template<class vtype>
 accelerator_inline void lmultFlavourMinusIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -rhs(0,0);
  ret(0,1) = -rhs(0,1);
  ret(1,0) = -rhs(1,0);
  ret(1,1) = -rhs(1,1);
 };
 template<class vtype>
 accelerator_inline void rmultFlavourMinusIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -rhs(0,0);
  ret(0,1) = -rhs(0,1);
  ret(1,0) = -rhs(1,0);
  ret(1,1) = -rhs(1,1);
 };
 //G-parity flavour projection 1/2(1+\sigma_2)
 //1 -i
 //i  1
 template<class vtype>
 accelerator_inline void multFlavourProjPlus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = 0.5*rhs(0) + 0.5*timesMinusI(rhs(1));
  ret(1) = 0.5*timesI(rhs(0)) + 0.5*rhs(1);
 };
 template<class vtype>
 accelerator_inline void lmultFlavourProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesMinusI(rhs(1,0));
  ret(0,1) = 0.5*rhs(0,1) + 0.5*timesMinusI(rhs(1,1));
  ret(1,0) = 0.5*timesI(rhs(0,0)) + 0.5*rhs(1,0);
  ret(1,1) = 0.5*timesI(rhs(0,1)) + 0.5*rhs(1,1);
 };
 template<class vtype>
 accelerator_inline void rmultFlavourProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesI(rhs(0,1));
  ret(0,1) = 0.5*timesMinusI(rhs(0,0)) + 0.5*rhs(0,1);
  ret(1,0) = 0.5*rhs(1,0) + 0.5*timesI(rhs(1,1));
  ret(1,1) = 0.5*timesMinusI(rhs(1,0)) + 0.5*rhs(1,1);
 };
 template<class vtype>
 accelerator_inline void multFlavourMinusProjPlus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = -0.5*rhs(0) + 0.5*timesI(rhs(1));
  ret(1) = 0.5*timesMinusI(rhs(0)) - 0.5*rhs(1);
 };
 template<class vtype>
 accelerator_inline void lmultFlavourMinusProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesI(rhs(1,0));
  ret(0,1) = -0.5*rhs(0,1) + 0.5*timesI(rhs(1,1));
  ret(1,0) = 0.5*timesMinusI(rhs(0,0)) - 0.5*rhs(1,0);
  ret(1,1) = 0.5*timesMinusI(rhs(0,1)) - 0.5*rhs(1,1);
 };
 template<class vtype>
 accelerator_inline void rmultFlavourMinusProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesMinusI(rhs(0,1));
  ret(0,1) = 0.5*timesI(rhs(0,0)) - 0.5*rhs(0,1);
  ret(1,0) = -0.5*rhs(1,0) + 0.5*timesMinusI(rhs(1,1));
  ret(1,1) = 0.5*timesI(rhs(1,0)) - 0.5*rhs(1,1);
 };
 //G-parity flavour projection 1/2(1-\sigma_2)
 //1 i
 //-i  1
 template<class vtype>
 accelerator_inline void multFlavourProjMinus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = 0.5*rhs(0) + 0.5*timesI(rhs(1));
  ret(1) = 0.5*timesMinusI(rhs(0)) + 0.5*rhs(1);
 };
 template<class vtype>
 accelerator_inline void lmultFlavourProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesI(rhs(1,0));
  ret(0,1) = 0.5*rhs(0,1) + 0.5*timesI(rhs(1,1));
  ret(1,0) = 0.5*timesMinusI(rhs(0,0)) + 0.5*rhs(1,0);
  ret(1,1) = 0.5*timesMinusI(rhs(0,1)) + 0.5*rhs(1,1);
 };
 template<class vtype>
 accelerator_inline void rmultFlavourProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesMinusI(rhs(0,1));
  ret(0,1) = 0.5*timesI(rhs(0,0)) + 0.5*rhs(0,1);
  ret(1,0) = 0.5*rhs(1,0) + 0.5*timesMinusI(rhs(1,1));
  ret(1,1) = 0.5*timesI(rhs(1,0)) + 0.5*rhs(1,1);
 };
 template<class vtype>
 accelerator_inline void multFlavourMinusProjMinus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
 {
  ret(0) = -0.5*rhs(0) + 0.5*timesMinusI(rhs(1));
  ret(1) = 0.5*timesI(rhs(0)) - 0.5*rhs(1);
 };
 template<class vtype>
 accelerator_inline void lmultFlavourMinusProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesMinusI(rhs(1,0));
  ret(0,1) = -0.5*rhs(0,1) + 0.5*timesMinusI(rhs(1,1));
  ret(1,0) = 0.5*timesI(rhs(0,0)) - 0.5*rhs(1,0);
  ret(1,1) = 0.5*timesI(rhs(0,1)) - 0.5*rhs(1,1);
 };
 template<class vtype>
 accelerator_inline void rmultFlavourMinusProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
 {
  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesI(rhs(0,1));
  ret(0,1) = 0.5*timesMinusI(rhs(0,0)) - 0.5*rhs(0,1);
  ret(1,0) = -0.5*rhs(1,0) + 0.5*timesI(rhs(1,1));
  ret(1,1) = 0.5*timesMinusI(rhs(1,0)) - 0.5*rhs(1,1);
 };
 template<class vtype> 
 accelerator_inline auto operator*(const GparityFlavour &G, const iVector<vtype, Ngp> &arg)
 ->typename std::enable_if<matchGridTensorIndex<iVector<vtype, Ngp>, GparityFlavourTensorIndex>::value, iVector<vtype, Ngp>>::type
 {
  iVector<vtype, Ngp> ret;
  switch (G.g) 
  {
  case GparityFlavour::Algebra::SigmaX:
    multFlavourSigmaX(ret, arg); break;
  case GparityFlavour::Algebra::MinusSigmaX:
    multFlavourMinusSigmaX(ret, arg); break;
  case GparityFlavour::Algebra::SigmaY:
    multFlavourSigmaY(ret, arg); break;
  case GparityFlavour::Algebra::MinusSigmaY:
    multFlavourMinusSigmaY(ret, arg); break;
  case GparityFlavour::Algebra::SigmaZ:
    multFlavourSigmaZ(ret, arg); break;
  case GparityFlavour::Algebra::MinusSigmaZ:
    multFlavourMinusSigmaZ(ret, arg); break;
  case GparityFlavour::Algebra::Identity:
    multFlavourIdentity(ret, arg); break;
  case GparityFlavour::Algebra::MinusIdentity:
    multFlavourMinusIdentity(ret, arg); break;
  case GparityFlavour::Algebra::ProjPlus:
    multFlavourProjPlus(ret, arg); break;
  case GparityFlavour::Algebra::MinusProjPlus:
    multFlavourMinusProjPlus(ret, arg); break;
  case GparityFlavour::Algebra::ProjMinus:
    multFlavourProjMinus(ret, arg); break;
  case GparityFlavour::Algebra::MinusProjMinus:
    multFlavourMinusProjMinus(ret, arg); break;
  default: assert(0);
  }
  return ret;
 }
 template<class vtype> 
 accelerator_inline auto operator*(const GparityFlavour &G, const iMatrix<vtype, Ngp> &arg)
 ->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Ngp>, GparityFlavourTensorIndex>::value, iMatrix<vtype, Ngp>>::type
 {
  iMatrix<vtype, Ngp> ret;
  switch (G.g) 
  {
  case GparityFlavour::Algebra::SigmaX:
    lmultFlavourSigmaX(ret, arg); break;
  case GparityFlavour::Algebra::MinusSigmaX:
    lmultFlavourMinusSigmaX(ret, arg); break;
  case GparityFlavour::Algebra::SigmaY:
    lmultFlavourSigmaY(ret, arg); break;
  case GparityFlavour::Algebra::MinusSigmaY:
    lmultFlavourMinusSigmaY(ret, arg); break;
  case GparityFlavour::Algebra::SigmaZ:
    lmultFlavourSigmaZ(ret, arg); break;
  case GparityFlavour::Algebra::MinusSigmaZ:
    lmultFlavourMinusSigmaZ(ret, arg); break;
  case GparityFlavour::Algebra::Identity:
    lmultFlavourIdentity(ret, arg); break;
  case GparityFlavour::Algebra::MinusIdentity:
    lmultFlavourMinusIdentity(ret, arg); break;
  case GparityFlavour::Algebra::ProjPlus:
    lmultFlavourProjPlus(ret, arg); break;
  case GparityFlavour::Algebra::MinusProjPlus:
    lmultFlavourMinusProjPlus(ret, arg); break;
  case GparityFlavour::Algebra::ProjMinus:
    lmultFlavourProjMinus(ret, arg); break;
  case GparityFlavour::Algebra::MinusProjMinus:
    lmultFlavourMinusProjMinus(ret, arg); break;  
  default: assert(0);
  }
  return ret;
 }
 template<class vtype> 
 accelerator_inline auto operator*(const iMatrix<vtype, Ngp> &arg, const GparityFlavour &G)
 ->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Ngp>, GparityFlavourTensorIndex>::value, iMatrix<vtype, Ngp>>::type
 {
  iMatrix<vtype, Ngp> ret;
  switch (G.g) 
  {
  case GparityFlavour::Algebra::SigmaX:
    rmultFlavourSigmaX(ret, arg); break;
  case GparityFlavour::Algebra::MinusSigmaX:
    rmultFlavourMinusSigmaX(ret, arg); break;
  case GparityFlavour::Algebra::SigmaY:
    rmultFlavourSigmaY(ret, arg); break;
  case GparityFlavour::Algebra::MinusSigmaY:
    rmultFlavourMinusSigmaY(ret, arg); break;
  case GparityFlavour::Algebra::SigmaZ:
    rmultFlavourSigmaZ(ret, arg); break;
  case GparityFlavour::Algebra::MinusSigmaZ:
    rmultFlavourMinusSigmaZ(ret, arg); break;
  case GparityFlavour::Algebra::Identity:
    rmultFlavourIdentity(ret, arg); break;
  case GparityFlavour::Algebra::MinusIdentity:
    rmultFlavourMinusIdentity(ret, arg); break;
  case GparityFlavour::Algebra::ProjPlus:
    rmultFlavourProjPlus(ret, arg); break;
  case GparityFlavour::Algebra::MinusProjPlus:
    rmultFlavourMinusProjPlus(ret, arg); break;
  case GparityFlavour::Algebra::ProjMinus:
    rmultFlavourProjMinus(ret, arg); break;
  case GparityFlavour::Algebra::MinusProjMinus:
    rmultFlavourMinusProjMinus(ret, arg); break;
  default: assert(0);
  }
  return ret;
 }
 NAMESPACE_END(Grid);
 #endif // include guard
--- a/Grid/qcd/hmc/GenericHMCrunner.h
+++ b/Grid/qcd/hmc/GenericHMCrunner.h
@ -129,18 +129,10 @@ public:
    Runner(S);
  }
-  //////////////////////////////////////////////////////////////////
+  //Use the checkpointer to initialize the RNGs and the gauge field, writing the resulting gauge field into U.
-
+  //This is called automatically by Run but may be useful elsewhere, e.g. for integrator tuning experiments
-private:
+  void initializeGaugeFieldAndRNGs(Field &U){
-  template <class SmearingPolicy>
+    if(!Resources.haveRNGs()) Resources.AddRNGs();
  void Runner(SmearingPolicy &Smearing) {
    auto UGrid = Resources.GetCartesian();
    Resources.AddRNGs();
    Field U(UGrid);
    // Can move this outside?
    typedef IntegratorType<SmearingPolicy> TheIntegrator;
    TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing);
    if (Parameters.StartingType == "HotStart") {
      // Hot start
@ -167,6 +159,25 @@ private:
 	<< "Valid [HotStart, ColdStart, TepidStart, CheckpointStart]\n";
      exit(1);
    }
  }
  //////////////////////////////////////////////////////////////////
 private:
  template <class SmearingPolicy>
  void Runner(SmearingPolicy &Smearing) {
    auto UGrid = Resources.GetCartesian();
    Field U(UGrid);
    initializeGaugeFieldAndRNGs(U);
    typedef IntegratorType<SmearingPolicy> TheIntegrator;
    TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing);
    // Sets the momentum filter
    MDynamics.setMomentumFilter(*(Resources.GetMomentumFilter()));
    Smearing.set_Field(U);
--- a/Grid/qcd/hmc/HMC.h
+++ b/Grid/qcd/hmc/HMC.h
@ -34,6 +34,7 @@ directory
 			    * @brief Classes for Hybrid Monte Carlo update
 			    *
 			    * @author Guido Cossu
 			    * @author Peter Boyle
 			    */
 			   //--------------------------------------------------------------------
 #pragma once
@ -115,22 +116,17 @@ private:
    random(sRNG, rn_test);
-    std::cout << GridLogMessage
+    std::cout << GridLogHMC << "--------------------------------------------------\n";
-              << "--------------------------------------------------\n";
+    std::cout << GridLogHMC << "exp(-dH) = " << prob << "  Random = " << rn_test << "\n";
-    std::cout << GridLogMessage << "exp(-dH) = " << prob
+    std::cout << GridLogHMC << "Acc. Probability = " << ((prob < 1.0) ? prob : 1.0) << "\n";
              << "  Random = " << rn_test << "\n";
    std::cout << GridLogMessage
              << "Acc. Probability = " << ((prob < 1.0) ? prob : 1.0) << "\n";
    if ((prob > 1.0) || (rn_test <= prob)) {  // accepted
-      std::cout << GridLogMessage << "Metropolis_test -- ACCEPTED\n";
+      std::cout << GridLogHMC << "Metropolis_test -- ACCEPTED\n";
-      std::cout << GridLogMessage
+      std::cout << GridLogHMC << "--------------------------------------------------\n";
                << "--------------------------------------------------\n";
      return true;
    } else {  // rejected
-      std::cout << GridLogMessage << "Metropolis_test -- REJECTED\n";
+      std::cout << GridLogHMC << "Metropolis_test -- REJECTED\n";
-      std::cout << GridLogMessage
+      std::cout << GridLogHMC << "--------------------------------------------------\n";
                << "--------------------------------------------------\n";
      return false;
    }
  }
@ -139,19 +135,68 @@ private:
  // Evolution
  /////////////////////////////////////////////////////////
  RealD evolve_hmc_step(Field &U) {
    TheIntegrator.refresh(U, sRNG, pRNG);  // set U and initialize P and phi's
-    RealD H0 = TheIntegrator.S(U);  // initial state action
+    GridBase *Grid = U.Grid();
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    // Mainly for DDHMC perform a random translation of U modulo volume
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    std::cout << GridLogMessage << "Random shifting gauge field by [";
    for(int d=0;d<Grid->Nd();d++) {
      int L = Grid->GlobalDimensions()[d];
      RealD rn_uniform;  random(sRNG, rn_uniform);
      int shift = (int) (rn_uniform*L);
      std::cout << shift;
      if(d<Grid->Nd()-1) std::cout <<",";
      else               std::cout <<"]\n";
      U = Cshift(U,d,shift);
    }
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    TheIntegrator.reset_timer();
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    // set U and initialize P and phi's
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    std::cout << GridLogMessage << "Refresh momenta and pseudofermions";
    TheIntegrator.refresh(U, sRNG, pRNG);  
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    // initial state action
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    std::cout << GridLogMessage << "Compute initial action";
    RealD H0 = TheIntegrator.S(U);  
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    std::streamsize current_precision = std::cout.precision();
    std::cout.precision(15);
-    std::cout << GridLogMessage << "Total H before trajectory = " << H0 << "\n";
+    std::cout << GridLogHMC << "Total H before trajectory = " << H0 << "\n";
    std::cout.precision(current_precision);
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    std::cout << GridLogMessage << " Molecular Dynamics evolution ";
    TheIntegrator.integrate(U);
    std::cout << GridLogMessage << "--------------------------------------------------\n";
-    RealD H1 = TheIntegrator.S(U);  // updated state action
+    //////////////////////////////////////////////////////////////////////////////////////////////////////
    // updated state action
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    std::cout << GridLogMessage << "Compute final action";
    RealD H1 = TheIntegrator.S(U);  
    std::cout << GridLogMessage << "--------------------------------------------------\n";
    ///////////////////////////////////////////////////////////
    if(0){
      std::cout << "------------------------- Reversibility test" << std::endl;
@ -163,17 +208,16 @@ private:
    }
    ///////////////////////////////////////////////////////////
    std::cout.precision(15);
-    std::cout << GridLogMessage << "Total H after trajectory  = " << H1
+
-	      << "  dH = " << H1 - H0 << "\n";
+    std::cout << GridLogHMC << "--------------------------------------------------\n";
    std::cout << GridLogHMC << "Total H after trajectory  = " << H1 << "  dH = " << H1 - H0 << "\n";
    std::cout << GridLogHMC << "--------------------------------------------------\n";
    std::cout.precision(current_precision);
    return (H1 - H0);
  }
 public:
  /////////////////////////////////////////
@ -195,10 +239,13 @@ public:
    // Actual updates (evolve a copy Ucopy then copy back eventually)
    unsigned int FinalTrajectory = Params.Trajectories + Params.NoMetropolisUntil + Params.StartTrajectory;
    for (int traj = Params.StartTrajectory; traj < FinalTrajectory; ++traj) {
-      std::cout << GridLogMessage << "-- # Trajectory = " << traj << "\n";
+
      std::cout << GridLogHMC << "-- # Trajectory = " << traj << "\n";
      if (traj < Params.StartTrajectory + Params.NoMetropolisUntil) {
-      	std::cout << GridLogMessage << "-- Thermalization" << std::endl;
+      	std::cout << GridLogHMC << "-- Thermalization" << std::endl;
      }
      double t0=usecond();
@ -207,20 +254,19 @@ public:
      DeltaH = evolve_hmc_step(Ucopy);
      // Metropolis-Hastings test
      bool accept = true;
-      if (traj >= Params.StartTrajectory + Params.NoMetropolisUntil) {
+      if (Params.MetropolisTest && traj >= Params.StartTrajectory + Params.NoMetropolisUntil) {
        accept = metropolis_test(DeltaH);
      } else {
-      	std::cout << GridLogMessage << "Skipping Metropolis test" << std::endl;
+      	std::cout << GridLogHMC << "Skipping Metropolis test" << std::endl;
      }
      if (accept)
        Ucur = Ucopy; 
      double t1=usecond();
-      std::cout << GridLogMessage << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl;
+      std::cout << GridLogHMC << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl;
      TheIntegrator.print_timer();
      for (int obs = 0; obs < Observables.size(); obs++) {
      	std::cout << GridLogDebug << "Observables # " << obs << std::endl;
@ -228,7 +274,7 @@ public:
      	std::cout << GridLogDebug << "Observables pointer " << Observables[obs] << std::endl;
        Observables[obs]->TrajectoryComplete(traj + 1, Ucur, sRNG, pRNG);
      }
-      std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl;
+      std::cout << GridLogHMC << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl;
    }
  }
--- a/Grid/qcd/hmc/HMCResourceManager.h
+++ b/Grid/qcd/hmc/HMCResourceManager.h
@ -72,6 +72,8 @@ class HMCResourceManager {
  typedef HMCModuleBase< BaseHmcCheckpointer<ImplementationPolicy> > CheckpointerBaseModule;
  typedef HMCModuleBase< HmcObservable<typename ImplementationPolicy::Field> > ObservableBaseModule;
  typedef ActionModuleBase< Action<typename ImplementationPolicy::Field>, GridModule > ActionBaseModule;
  typedef typename ImplementationPolicy::Field MomentaField;
  typedef typename ImplementationPolicy::Field Field;  
  // Named storage for grid pairs (std + red-black)
  std::unordered_map<std::string, GridModule> Grids;
@ -80,6 +82,9 @@ class HMCResourceManager {
  // SmearingModule<ImplementationPolicy> Smearing;
  std::unique_ptr<CheckpointerBaseModule> CP;
  // Momentum filter
  std::unique_ptr<MomentumFilterBase<typename ImplementationPolicy::Field> > Filter;
  // A vector of HmcObservable modules
  std::vector<std::unique_ptr<ObservableBaseModule> > ObservablesList;
@ -90,6 +95,7 @@ class HMCResourceManager {
  bool have_RNG;
  bool have_CheckPointer;
  bool have_Filter;
  // NOTE: operator << is not overloaded for std::vector<string> 
  // so this function is necessary
@ -101,7 +107,7 @@ class HMCResourceManager {
 public:
-  HMCResourceManager() : have_RNG(false), have_CheckPointer(false) {}
+  HMCResourceManager() : have_RNG(false), have_CheckPointer(false), have_Filter(false) {}
  template <class ReaderClass, class vector_type = vComplex >
  void initialize(ReaderClass &Read){
@ -129,6 +135,7 @@ public:
    RNGModuleParameters RNGpar(Read);
    SetRNGSeeds(RNGpar);
    // Observables
    auto &ObsFactory = HMC_ObservablesModuleFactory<observable_string, typename ImplementationPolicy::Field, ReaderClass>::getInstance(); 
    Read.push(observable_string);// here must check if existing...
@ -208,6 +215,16 @@ public:
    AddGrid(s, Mod);
  }
  void SetMomentumFilter( MomentumFilterBase<typename ImplementationPolicy::Field> * MomFilter) {
    assert(have_Filter==false);
    Filter = std::unique_ptr<MomentumFilterBase<typename ImplementationPolicy::Field> >(MomFilter);
    have_Filter = true;
  }
  MomentumFilterBase<typename ImplementationPolicy::Field> *GetMomentumFilter(void) {
    if ( !have_Filter)
      SetMomentumFilter(new MomentumFilterNone<typename ImplementationPolicy::Field>());
    return Filter.get();
  }
  GridCartesian* GetCartesian(std::string s = "") {
    if (s.empty()) s = Grids.begin()->first;
@ -226,6 +243,9 @@ public:
  //////////////////////////////////////////////////////
  // Random number generators
  //////////////////////////////////////////////////////
  //Return true if the RNG objects have been instantiated
  bool haveRNGs() const{ return have_RNG; }
  void AddRNGs(std::string s = "") {
    // Couple the RNGs to the GridModule tagged by s
--- a/Grid/qcd/hmc/integrators/Integrator.h
+++ b/Grid/qcd/hmc/integrators/Integrator.h
@ -33,7 +33,6 @@ directory
 #define INTEGRATOR_INCLUDED
 #include <memory>
 #include "MomentumFilter.h"
 NAMESPACE_BEGIN(Grid);
@ -67,6 +66,7 @@ public:
 template <class FieldImplementation, class SmearingPolicy, class RepresentationPolicy>
 class Integrator {
 protected:
  typedef typename FieldImplementation::Field MomentaField;  //for readability
  typedef typename FieldImplementation::Field Field;
@ -119,6 +119,7 @@ protected:
    }
  } update_P_hireps{};
  void update_P(MomentaField& Mom, Field& U, int level, double ep) {
    // input U actually not used in the fundamental case
    // Fundamental updates, include smearing
@ -130,25 +131,45 @@ protected:
      Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
      double start_force = usecond();
      as[level].actions.at(a)->deriv_timer_start();
      as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta
      as[level].actions.at(a)->deriv_timer_stop();
      std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
      auto name = as[level].actions.at(a)->action_name();
      if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
      DumpSliceNorm("force before Ta",force,Nd-1);
      force = FieldImplementation::projectForce(force); // Ta for gauge fields
      double end_force = usecond();
-      Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites());
+
-      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << std::endl;
+      DumpSliceNorm("force before filter",force,Nd-1);
      MomFilter->applyFilter(force);
      Real force_abs   = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x]) 
      Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    
      Real force_max   = std::sqrt(maxLocalNorm2(force));
      Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR;    
      as[level].actions.at(a)->deriv_log(force_abs,force_max);
      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Force average: " << force_abs <<" "<<name<<std::endl;
      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Force max    : " << force_max <<" "<<name<<std::endl;
      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Fdt average  : " << impulse_abs <<" "<<name<<std::endl;
      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Fdt max      : " << impulse_max <<" "<<name<<std::endl;
      Mom -= force * ep* HMC_MOMENTUM_DENOMINATOR;; 
      double end_full = usecond();
      double time_full  = (end_full - start_full) / 1e3;
      double time_force = (end_force - start_force) / 1e3;
      std::cout << GridLogMessage << "["<<level<<"]["<<a<<"] P update elapsed time: " << time_full << " ms (force: " << time_force << " ms)"  << std::endl;
      DumpSliceNorm("force after filter",force,Nd-1);
    }
    // Force from the other representations
    as[level].apply(update_P_hireps, Representations, Mom, U, ep);
    MomFilter->applyFilter(Mom);
  }
  void update_U(Field& U, double ep) 
@ -162,8 +183,12 @@ protected:
  void update_U(MomentaField& Mom, Field& U, double ep) 
  {
    MomentaField MomFiltered(Mom.Grid());
    MomFiltered = Mom;
    MomFilter->applyFilter(MomFiltered);
    // exponential of Mom*U in the gauge fields case
-    FieldImplementation::update_field(Mom, U, ep);
+    FieldImplementation::update_field(MomFiltered, U, ep);
    // Update the smeared fields, can be implemented as observer
    Smearer.set_Field(U);
@ -206,6 +231,66 @@ public:
  const MomentaField & getMomentum() const{ return P; }
  void reset_timer(void)
  {
    for (int level = 0; level < as.size(); ++level) {
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
        as[level].actions.at(actionID)->reset_timer();
      }
    }
  }
  void print_timer(void)
  {
    std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::" << std::endl;
    std::cout << GridLogMessage << " Refresh cumulative timings "<<std::endl;
    std::cout << GridLogMessage << "--------------------------- "<<std::endl;
    for (int level = 0; level < as.size(); ++level) {
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
 	std::cout << GridLogMessage 
 		  << as[level].actions.at(actionID)->action_name()
 		  <<"["<<level<<"]["<< actionID<<"] "
 		  << as[level].actions.at(actionID)->refresh_us*1.0e-6<<" s"<< std::endl;
      }
    }
    std::cout << GridLogMessage << "--------------------------- "<<std::endl;
    std::cout << GridLogMessage << " Action cumulative timings "<<std::endl;
    std::cout << GridLogMessage << "--------------------------- "<<std::endl;
    for (int level = 0; level < as.size(); ++level) {
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
 	std::cout << GridLogMessage 
 		  << as[level].actions.at(actionID)->action_name()
 		  <<"["<<level<<"]["<< actionID<<"] "
 		  << as[level].actions.at(actionID)->S_us*1.0e-6<<" s"<< std::endl;
      }
    }
    std::cout << GridLogMessage << "--------------------------- "<<std::endl;
    std::cout << GridLogMessage << " Force cumulative timings "<<std::endl;
    std::cout << GridLogMessage << "------------------------- "<<std::endl;
    for (int level = 0; level < as.size(); ++level) {
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
 	std::cout << GridLogMessage 
 		  << as[level].actions.at(actionID)->action_name()
 		  <<"["<<level<<"]["<< actionID<<"] "
 		  << as[level].actions.at(actionID)->deriv_us*1.0e-6<<" s"<< std::endl;
      }
    }
    std::cout << GridLogMessage << "--------------------------- "<<std::endl;
    std::cout << GridLogMessage << " Force average size "<<std::endl;
    std::cout << GridLogMessage << "------------------------- "<<std::endl;
    for (int level = 0; level < as.size(); ++level) {
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
 	std::cout << GridLogMessage 
 		  << as[level].actions.at(actionID)->action_name()
 		  <<"["<<level<<"]["<< actionID<<"] : "
 		  <<" force max " << as[level].actions.at(actionID)->deriv_max_average()
 		  <<" norm "      << as[level].actions.at(actionID)->deriv_norm_average()
 		  <<" calls "     << as[level].actions.at(actionID)->deriv_num
 		  << std::endl;
      }
    }
    std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl;
  }
  void print_parameters()
  {
    std::cout << GridLogMessage << "[Integrator] Name : "<< integrator_name() << std::endl;
@ -224,7 +309,6 @@ public:
      }
    }
    std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl;
  }
  void reverse_momenta()
@ -267,15 +351,19 @@ public:
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
        // get gauge field from the SmearingPolicy and
        // based on the boolean is_smeared in actionID
 	auto name = as[level].actions.at(actionID)->action_name();
        std::cout << GridLogMessage << "refresh [" << level << "][" << actionID << "] "<<name << std::endl;
        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
 	as[level].actions.at(actionID)->refresh_timer_start();
        as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG);
 	as[level].actions.at(actionID)->refresh_timer_stop();
      }
      // Refresh the higher representation actions
      as[level].apply(refresh_hireps, Representations, sRNG, pRNG);
    }
    MomFilter->applyFilter(P);
  }
  // to be used by the actionlevel class to iterate
@ -310,7 +398,9 @@ public:
        // based on the boolean is_smeared in actionID
        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
 	        as[level].actions.at(actionID)->S_timer_start();
        Hterm = as[level].actions.at(actionID)->S(Us);
   	        as[level].actions.at(actionID)->S_timer_stop();
        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
        H += Hterm;
      }
--- a/Grid/qcd/utils/CovariantCshift.h
+++ b/Grid/qcd/utils/CovariantCshift.h
@ -182,7 +182,7 @@ namespace ConjugateBC {
    GridBase *grid = Link.Grid();
    int Lmu = grid->GlobalDimensions()[mu] - 1;
-    Lattice<iScalar<vInteger>> coor(grid);
+    Lattice<iScalar<vInteger> > coor(grid);
    LatticeCoordinate(coor, mu);
    Lattice<gauge> tmp(grid);
--- a/Grid/qcd/utils/MixedPrecisionOperatorFunction.h
+++ b/Grid/qcd/utils/MixedPrecisionOperatorFunction.h
@ -0,0 +1,111 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: 
 Copyright (C) 2015-2016
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid); 
 template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
 class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
 public:
    typedef typename FermionOperatorD::FermionField FieldD;
    typedef typename FermionOperatorF::FermionField FieldF;
    using OperatorFunction<FieldD>::operator();
    RealD   Tolerance;
    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
    Integer MaxOuterIterations;
    GridBase* SinglePrecGrid;
    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
    FermionOperatorF &FermOpF;
    FermionOperatorD &FermOpD;;
    SchurOperatorF &LinOpF;
    SchurOperatorD &LinOpD;
    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
     MixedPrecisionConjugateGradientOperatorFunction(RealD tol, RealD tolInner,
 						    Integer maxinnerit, 
 						    Integer maxouterit,
 						    GridBase *_SinglePrecGrid,
                                                    FermionOperatorF &_FermOpF,
                                                    FermionOperatorD &_FermOpD,
 						    SchurOperatorF   &_LinOpF,
 						    SchurOperatorD   &_LinOpD) : 
      LinOpF(_LinOpF),
      LinOpD(_LinOpD),
      FermOpF(_FermOpF),
      FermOpD(_FermOpD),
      Tolerance(tol), 
      InnerTolerance(tolInner), 
      MaxInnerIterations(maxinnerit), 
      MaxOuterIterations(maxouterit), 
      SinglePrecGrid(_SinglePrecGrid),
      OuterLoopNormMult(100.) 
  { assert(tolInner<0.01);    };
    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi)
    {
      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
      // Assumption made in code to extract gauge field
      // We could avoid storing LinopD reference alltogether ?
      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
      ////////////////////////////////////////////////////////////////////////////////////
      // Moving this to a Clone method of fermion operator would allow to duplicate the 
      // physics parameters and decrease gauge field copies
      ////////////////////////////////////////////////////////////////////////////////////
      auto &Umu_d = FermOpD.GetDoubledGaugeField();
      auto &Umu_f = FermOpF.GetDoubledGaugeField();
      auto &Umu_fe= FermOpF.GetDoubledGaugeFieldE();
      auto &Umu_fo= FermOpF.GetDoubledGaugeFieldO();
      precisionChange(Umu_f,Umu_d);
      pickCheckerboard(Even,Umu_fe,Umu_f);
      pickCheckerboard(Odd ,Umu_fo,Umu_f);
      //////////////////////////////////////////////////////////////////////////////////////////
      // Make a mixed precision conjugate gradient
      //////////////////////////////////////////////////////////////////////////////////////////
      // Could assume red black solver here and remove the SinglePrecGrid parameter???
      MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance, InnerTolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid,LinOpF,LinOpD);
      std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient src "<<norm2(src) <<std::endl;
      psi=Zero();
      MPCG(src,psi);
    }
  };
 NAMESPACE_END(Grid); 
--- a/Grid/sitmo_rng/README
+++ b/Grid/sitmo_rng/README
--- a/Grid/random/gaussian.h
+++ b/Grid/random/gaussian.h
@ -0,0 +1,200 @@
 // -*- C++ -*-
 //===--------------------------- random -----------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 // Peter Boyle: Taken from libc++ in Clang/LLVM.
 // Reason is that libstdc++ and clang differ in their return order in the normal_distribution / box mueller type step.
 // standardise on one and call it "gaussian_distribution".
 #pragma once
 #include <cstddef>
 #include <cstdint>
 #include <cmath>
 #include <type_traits>
 #include <initializer_list>
 #include <limits>
 #include <algorithm>
 #include <numeric>
 #include <vector>
 #include <string>
 #include <istream>
 #include <ostream>
 #include <random>
 // normal_distribution -> gaussian distribution
 namespace Grid {
 template<class _RealType = double>
 class  gaussian_distribution
 {
 public:
    // types
    typedef _RealType result_type;
    class param_type
    {
        result_type __mean_;
        result_type __stddev_;
    public:
        typedef gaussian_distribution distribution_type;
        strong_inline
        explicit param_type(result_type __mean = 0, result_type __stddev = 1)
            : __mean_(__mean), __stddev_(__stddev) {}
        strong_inline
        result_type mean() const {return __mean_;}
        strong_inline
        result_type stddev() const {return __stddev_;}
        friend strong_inline
            bool operator==(const param_type& __x, const param_type& __y)
            {return __x.__mean_ == __y.__mean_ && __x.__stddev_ == __y.__stddev_;}
        friend strong_inline
            bool operator!=(const param_type& __x, const param_type& __y)
            {return !(__x == __y);}
    };
 private:
    param_type __p_;
    result_type _V_;
    bool _V_hot_;
 public:
    // constructors and reset functions
    strong_inline
    explicit gaussian_distribution(result_type __mean = 0, result_type __stddev = 1)
        : __p_(param_type(__mean, __stddev)), _V_hot_(false) {}
    strong_inline
    explicit gaussian_distribution(const param_type& __p)
        : __p_(__p), _V_hot_(false) {}
    strong_inline
    void reset() {_V_hot_ = false;}
    // generating functions
    template<class _URNG>
        strong_inline
        result_type operator()(_URNG& __g)
        {return (*this)(__g, __p_);}
    template<class _URNG> result_type operator()(_URNG& __g, const param_type& __p);
    // property functions
    strong_inline
    result_type mean() const {return __p_.mean();}
    strong_inline
    result_type stddev() const {return __p_.stddev();}
    strong_inline
    param_type param() const {return __p_;}
    strong_inline
    void param(const param_type& __p) {__p_ = __p;}
    strong_inline
    result_type min() const {return -std::numeric_limits<result_type>::infinity();}
    strong_inline
    result_type max() const {return std::numeric_limits<result_type>::infinity();}
    friend strong_inline
        bool operator==(const gaussian_distribution& __x,
                        const gaussian_distribution& __y)
        {return __x.__p_ == __y.__p_ && __x._V_hot_ == __y._V_hot_ &&
                (!__x._V_hot_ || __x._V_ == __y._V_);}
    friend strong_inline
        bool operator!=(const gaussian_distribution& __x,
                        const gaussian_distribution& __y)
        {return !(__x == __y);}
    template <class _CharT, class _Traits, class _RT>
    friend
    std::basic_ostream<_CharT, _Traits>&
    operator<<(std::basic_ostream<_CharT, _Traits>& __os,
               const gaussian_distribution<_RT>& __x);
    template <class _CharT, class _Traits, class _RT>
    friend
    std::basic_istream<_CharT, _Traits>&
    operator>>(std::basic_istream<_CharT, _Traits>& __is,
               gaussian_distribution<_RT>& __x);
 };
 template <class _RealType>
 template<class _URNG>
 _RealType
 gaussian_distribution<_RealType>::operator()(_URNG& __g, const param_type& __p)
 {
    result_type _Up;
    if (_V_hot_)
    {
        _V_hot_ = false;
        _Up = _V_;
    }
    else
    {
        std::uniform_real_distribution<result_type> _Uni(-1, 1);
        result_type __u;
        result_type __v;
        result_type __s;
        do
        {
            __u = _Uni(__g);
            __v = _Uni(__g);
            __s = __u * __u + __v * __v;
        } while (__s > 1 || __s == 0);
        result_type _Fp = std::sqrt(-2 * std::log(__s) / __s);
        _V_ = __v * _Fp;
        _V_hot_ = true;
        _Up = __u * _Fp;
    }
    return _Up * __p.stddev() + __p.mean();
 }
 template <class _CharT, class _Traits, class _RT>
 std::basic_ostream<_CharT, _Traits>&
 operator<<(std::basic_ostream<_CharT, _Traits>& __os,
           const gaussian_distribution<_RT>& __x)
 {
    auto __save_flags = __os.flags();
    __os.flags(std::ios_base::dec | std::ios_base::left | std::ios_base::fixed |
               std::ios_base::scientific);
    _CharT __sp = __os.widen(' ');
    __os.fill(__sp);
    __os << __x.mean() << __sp << __x.stddev() << __sp << __x._V_hot_;
    if (__x._V_hot_)
        __os << __sp << __x._V_;
    __os.flags(__save_flags);
    return __os;
 }
 template <class _CharT, class _Traits, class _RT>
 std::basic_istream<_CharT, _Traits>&
 operator>>(std::basic_istream<_CharT, _Traits>& __is,
           gaussian_distribution<_RT>& __x)
 {
    typedef gaussian_distribution<_RT> _Eng;
    typedef typename _Eng::result_type result_type;
    typedef typename _Eng::param_type param_type;
    auto __save_flags = __is.flags();
    __is.flags(std::ios_base::dec | std::ios_base::skipws);
    result_type __mean;
    result_type __stddev;
    result_type _Vp = 0;
    bool _V_hot = false;
    __is >> __mean >> __stddev >> _V_hot;
    if (_V_hot)
        __is >> _Vp;
    if (!__is.fail())
    {
        __x.param(param_type(__mean, __stddev));
        __x._V_hot_ = _V_hot;
        __x._V_ = _Vp;
    }
    __is.flags(__save_flags);
    return __is;
 }
 }
--- a/Grid/sitmo_rng/sitmo_prng_engine.hpp
+++ b/Grid/sitmo_rng/sitmo_prng_engine.hpp
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@ -263,7 +263,8 @@ public:
  int face_table_computed;
  std::vector<commVector<std::pair<int,int> > > face_table ;
  Vector<int> surface_list;
-
+  bool locally_periodic;
  stencilVector<StencilEntry>  _entries; // Resident in managed memory
  commVector<StencilEntry>     _entries_device; // Resident in managed memory
  std::vector<Packet> Packets;
@ -320,7 +321,7 @@ public:
    int ld              = _grid->_ldimensions[dimension];
    int rd              = _grid->_rdimensions[dimension];
    int simd_layout     = _grid->_simd_layout[dimension];
-    int comm_dim        = _grid->_processors[dimension] >1 ;
+    int comm_dim        = _grid->_processors[dimension] >1 && (!locally_periodic);
    int recv_from_rank;
    int xmit_to_rank;
@ -328,6 +329,7 @@ public:
    if ( ! comm_dim ) return 1;
    if ( displacement == 0 ) return 1;
    return 0;
  }
  //////////////////////////////////////////
@ -473,7 +475,7 @@ public:
    // the permute type
    int simd_layout     = _grid->_simd_layout[dimension];
-    int comm_dim        = _grid->_processors[dimension] >1 ;
+    int comm_dim        = _grid->_processors[dimension] >1 && (!locally_periodic);
    int splice_dim      = _grid->_simd_layout[dimension]>1 && (comm_dim);
    int is_same_node = 1;
@ -657,6 +659,20 @@ public:
 		   const std::vector<int> &directions,
 		   const std::vector<int> &distances,
 		   Parameters p)
    : CartesianStencil(grid,
 		       npoints,
 		       checkerboard,
 		       directions,
 		       distances,
 		       false,
 		       p){};
  CartesianStencil(GridBase *grid,
 		   int npoints,
 		   int checkerboard,
 		   const std::vector<int> &directions,
 		   const std::vector<int> &distances,
 		   bool _locally_periodic,
 		   Parameters p)
    : shm_bytes_thr(npoints),
      comm_bytes_thr(npoints),
      comm_enter_thr(npoints),
@ -665,6 +681,7 @@ public:
  {
    face_table_computed=0;
    _grid    = grid;
    this->locally_periodic=_locally_periodic;
    this->parameters=p;
    /////////////////////////////////////
    // Initialise the base
@ -690,6 +707,8 @@ public:
      int point = i;
      int dimension    = directions[i];
      assert(dimension>=0 && dimension<_grid->Nd());
      int displacement = distances[i];
      int shift = displacement;
@ -703,7 +722,7 @@ public:
      // the permute type
      //////////////////////////
      int simd_layout     = _grid->_simd_layout[dimension];
-      int comm_dim        = _grid->_processors[dimension] >1 ;
+      int comm_dim        = _grid->_processors[dimension] >1 && (!locally_periodic);
      int splice_dim      = _grid->_simd_layout[dimension]>1 && (comm_dim);
      int rotate_dim      = _grid->_simd_layout[dimension]>2;
@ -817,7 +836,7 @@ public:
    int pd              = _grid->_processors[dimension];
    int simd_layout     = _grid->_simd_layout[dimension];
    int comm_dim        = _grid->_processors[dimension] >1 ;
-
+    assert(locally_periodic==false);
    assert(comm_dim==1);
    int shift = (shiftpm + fd) %fd;
    assert(shift>=0);
@ -997,6 +1016,7 @@ public:
    int pd              = _grid->_processors[dimension];
    int simd_layout     = _grid->_simd_layout[dimension];
    int comm_dim        = _grid->_processors[dimension] >1 ;
    assert(locally_periodic==false);
    assert(simd_layout==1);
    assert(comm_dim==1);
    assert(shift>=0);
@ -1089,6 +1109,7 @@ public:
    int pd              = _grid->_processors[dimension];
    int simd_layout     = _grid->_simd_layout[dimension];
    int comm_dim        = _grid->_processors[dimension] >1 ;
    assert(locally_periodic==false);
    assert(comm_dim==1);
    // This will not work with a rotate dim
    assert(simd_layout==maxl);
--- a/Grid/tensors/Tensor_exp.h
+++ b/Grid/tensors/Tensor_exp.h
@ -52,12 +52,17 @@ template<class vtype, int N> accelerator_inline iVector<vtype, N> Exponentiate(c
  return ret;
 }
 // Specialisation: Cayley-Hamilton exponential for SU(3)
-#ifndef GRID_CUDA
+#if 0
 template<class vtype, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0>::type * =nullptr> 
-accelerator_inline iMatrix<vtype,3> Exponentiate(const iMatrix<vtype,3> &arg, RealD alpha  , Integer Nexp = DEFAULT_MAT_EXP )
+accelerator_inline iMatrix<vtype,3> Exponentiated(const iMatrix<vtype,3> &arg, RealD alpha  , Integer Nexp = DEFAULT_MAT_EXP )
 {
  return ExponentiateCayleyHamilton(arg,alpha);
 }
 #endif
 template<class vtype, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0>::type * =nullptr> 
 accelerator_inline iMatrix<vtype,3> ExponentiateCayleyHamilton(const iMatrix<vtype,3> &arg, RealD alpha )
 {
  // for SU(3) 2x faster than the std implementation using Nexp=12
  // notice that it actually computes
@ -115,8 +120,6 @@ accelerator_inline iMatrix<vtype,3> Exponentiate(const iMatrix<vtype,3> &arg, Re
  return (f0 * unit + timesMinusI(f1) * arg*alpha - f2 * iQ2);
 }
 #endif
 // General exponential
 template<class vtype,int N, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0 >::type * =nullptr> 
@ -129,8 +132,8 @@ accelerator_inline iMatrix<vtype,N> Exponentiate(const iMatrix<vtype,N> &arg, Re
  typedef iMatrix<vtype,N> mat;
  mat unit(1.0);
  mat temp(unit);
-  for(int i=Nexp; i>=1;--i){
+  for(int n=Nexp; n>=1;--n){
-    temp *= alpha/RealD(i);
+    temp *= alpha/RealD(n);
    temp = unit + temp*arg;
  }
  return temp;
--- a/Grid/tensors/Tensor_extract_merge.h
+++ b/Grid/tensors/Tensor_extract_merge.h
@ -208,5 +208,46 @@ void merge(vobj &vec,const ExtractPointerArray<sobj> &extracted, int offset)
 }
 //////////////////////////////////////////////////////////////////////////////////
 //Copy a single lane of a SIMD tensor type from one object to another
 //Output object must be of the same tensor type but may be of a different precision (i.e. it can have a different root data type)
 ///////////////////////////////////////////////////////////////////////////////////
 template<class vobjOut, class vobjIn>
 accelerator_inline 
 void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __restrict__ vecIn, int lane_in)
 {
  static_assert( std::is_same<typename vobjOut::DoublePrecision, typename vobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
  typedef typename vobjOut::vector_type ovector_type;  
  typedef typename vobjIn::vector_type ivector_type;  
  constexpr int owords=sizeof(vobjOut)/sizeof(ovector_type);
  constexpr int iwords=sizeof(vobjIn)/sizeof(ivector_type);
  static_assert( owords == iwords, "copyLane: Expected number of vector words in input and output objects to be equal" );
  typedef typename vobjOut::scalar_type oscalar_type;  
  typedef typename vobjIn::scalar_type iscalar_type;  
  typedef typename ExtractTypeMap<oscalar_type>::extract_type oextract_type;
  typedef typename ExtractTypeMap<iscalar_type>::extract_type iextract_type;
  typedef oextract_type * opointer;
  typedef iextract_type * ipointer;
  constexpr int oNsimd=ovector_type::Nsimd();
  constexpr int iNsimd=ivector_type::Nsimd();
  iscalar_type itmp;
  oscalar_type otmp;
  opointer __restrict__  op = (opointer)&vecOut;
  ipointer __restrict__  ip = (ipointer)&vecIn;
  for(int w=0;w<owords;w++){
    memcpy( (char*)&itmp, (char*)(ip + lane_in + iNsimd*w), sizeof(iscalar_type) );
    otmp = itmp; //potential precision change
    memcpy( (char*)(op + lane_out + oNsimd*w), (char*)&otmp, sizeof(oscalar_type) );
  }
 }
 NAMESPACE_END(Grid);
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@ -133,11 +133,7 @@ inline void cuda_mem(void)
    };									\
    dim3 cu_threads(nsimd,acceleratorThreads(),1);			\
    dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\
    std::cout << "========================== CUDA KERNEL CALL\n";	\
    cuda_mem();								\
    LambdaApply<<<cu_blocks,cu_threads>>>(num1,num2,nsimd,lambda);	\
    cuda_mem();								\
    std::cout << "========================== CUDA KERNEL DONE\n";	\
  }
 #define accelerator_for6dNB(iter1, num1,				\
@ -209,7 +205,7 @@ inline void *acceleratorAllocShared(size_t bytes)
  auto err = cudaMallocManaged((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
-    printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
+    printf(" cudaMallocManaged failed for %lu %s \n",bytes,cudaGetErrorString(err));
  }
  return ptr;
 };
@ -219,7 +215,7 @@ inline void *acceleratorAllocDevice(size_t bytes)
  auto err = cudaMalloc((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
-    printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
+    printf(" cudaMalloc failed for %lu %s \n",bytes,cudaGetErrorString(err));
  }
  return ptr;
 };
--- a/HMC/DWF2p1fIwasakiGparity.cc
+++ b/HMC/DWF2p1fIwasakiGparity.cc
@ -0,0 +1,473 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./HMC/DWF2p1fIwasakiGparity.cc
 Copyright (C) 2015-2016
 Author: Christopher Kelly <ckelly@bnl.gov>
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 //2+1f DWF+I ensemble with G-parity BCs
 //designed to reproduce ensembles in https://arxiv.org/pdf/1908.08640.pdf
 struct RatQuoParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters,
 				  double, bnd_lo,
 				  double, bnd_hi,
 				  Integer, action_degree,
 				  double, action_tolerance,
 				  Integer, md_degree,
 				  double, md_tolerance,
 				  Integer, reliable_update_freq,
 				  Integer, bnd_check_freq);
  RatQuoParameters() { 
    bnd_lo = 1e-2;
    bnd_hi = 30;
    action_degree = 10;
    action_tolerance = 1e-10;
    md_degree = 10;
    md_tolerance = 1e-8;
    bnd_check_freq = 20;
    reliable_update_freq = 50;
  }
  void Export(RationalActionParams &into) const{
    into.lo = bnd_lo;
    into.hi = bnd_hi;
    into.action_degree = action_degree;
    into.action_tolerance = action_tolerance;
    into.md_degree = md_degree;
    into.md_tolerance = md_tolerance;
    into.BoundsCheckFreq = bnd_check_freq;
  }
 };
 struct EvolParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters,
                                  Integer, StartTrajectory,
                                  Integer, Trajectories,
 				  Integer, SaveInterval,
 				  Integer, Steps,
                                  bool, MetropolisTest,
 				  std::string, StartingType,
 				  std::vector<Integer>, GparityDirs,
 				  RatQuoParameters, rat_quo_l,
 				  RatQuoParameters, rat_quo_s);
  EvolParameters() {
    //For initial thermalization; afterwards user should switch Metropolis on and use StartingType=CheckpointStart
    MetropolisTest    = false;
    StartTrajectory   = 0;
    Trajectories      = 50;
    SaveInterval = 5;
    StartingType      = "ColdStart";
    GparityDirs.resize(3, 1); //1 for G-parity, 0 for periodic
    Steps = 5;
  }
 };
 bool fileExists(const std::string &fn){
  std::ifstream f(fn);
  return f.good();
 }
 struct LanczosParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
 				  double, alpha,
 				  double, beta,
 				  double, mu,
 				  int, ord,
 				  int, n_stop,
 				  int, n_want,
 				  int, n_use,
 				  double, tolerance);
  LanczosParameters() {
    alpha = 35;
    beta = 5;
    mu = 0;
    ord = 100;
    n_stop = 10;
    n_want = 10;
    n_use = 15;
    tolerance = 1e-6;
  }
 };
 template<typename FermionActionD, typename FermionFieldD>
 void computeEigenvalues(std::string param_file,
 			GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
 			FermionActionD &action, GridParallelRNG &rng){
  LanczosParameters params;
  if(fileExists(param_file)){
    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
    Grid::XmlReader rd(param_file);
    read(rd, "LanczosParameters", params);
  }else if(!GlobalSharedMemory::WorldRank){
    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
    Grid::XmlWriter wr(param_file + ".templ");
    write(wr, "LanczosParameters", params);
  }
  FermionFieldD gauss_o(rbGrid);
  FermionFieldD gauss(Grid);
  gaussian(rng, gauss);
  pickCheckerboard(Odd, gauss_o, gauss);
  action.ImportGauge(latt);
  SchurDiagMooeeOperator<FermionActionD, FermionFieldD> hermop(action);
  PlainHermOp<FermionFieldD> hermop_wrap(hermop);
  //ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
  assert(params.mu == 0.0);
  Chebyshev<FermionFieldD> Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1);
  FunctionHermOp<FermionFieldD> Cheb_wrap(Cheb, hermop);
  std::cout << "IRL: alpha=" << params.alpha << " beta=" << params.beta << " mu=" << params.mu << " ord=" << params.ord << std::endl;
  ImplicitlyRestartedLanczos<FermionFieldD> IRL(Cheb_wrap, hermop_wrap, params.n_stop, params.n_want, params.n_use, params.tolerance, 10000);
  std::vector<RealD> eval(params.n_use);
  std::vector<FermionFieldD> evec(params.n_use, rbGrid);
  int Nconv;
  IRL.calc(eval, evec, gauss_o, Nconv);
  std::cout << "Eigenvalues:" << std::endl;
  for(int i=0;i<params.n_want;i++){
    std::cout << i << " " << eval[i] << std::endl;
  }
 }
 //Check the quality of the RHMC approx
 template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
 void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
 	       FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
 	       int inv_pow, const std::string &quark_descr){
  FermionFieldD gauss_o(rbGrid);
  FermionFieldD gauss(Grid);
  gaussian(rng, gauss);
  pickCheckerboard(Odd, gauss_o, gauss);
  numOp.ImportGauge(latt);
  denOp.ImportGauge(latt);
  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
  SchurDifferentiableOperator<FermionImplPolicyD> MdagM(numOp);
  SchurDifferentiableOperator<FermionImplPolicyD> VdagV(denOp);
  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
  InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegPowerAction); //use large tolerance to prevent exit on fail; we are trying to tune here!
  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
  InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegHalfPowerAction);
  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
  InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegPowerAction);
  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
  InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegHalfPowerAction);
  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
  std::cout << "-------------------------------------------------------------------------------" << std::endl;
  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
  InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegPowerMD); 
  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
  InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegHalfPowerMD);
  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
  InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegPowerMD);
  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
  InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegHalfPowerMD);
  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
 }
 int main(int argc, char **argv) {
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
  std::string param_file = "params.xml";
  bool file_load_check = false;
  for(int i=1;i<argc;i++){
    std::string sarg(argv[i]);
    if(sarg == "--param_file"){
      assert(i!=argc-1);
      param_file = argv[i+1];
    }else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
      file_load_check = true;
    }
  }
  //Read the user parameters
  EvolParameters user_params;
  if(fileExists(param_file)){
    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
    Grid::XmlReader rd(param_file);
    read(rd, "Params", user_params);
  }else if(!GlobalSharedMemory::WorldRank){
    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
    Grid::XmlWriter wr(param_file + ".templ");
    write(wr, "Params", user_params);
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  //Check the parameters
  if(user_params.GparityDirs.size() != Nd-1){
    std::cerr << "Error in input parameters: expect GparityDirs to have size = " << Nd-1 << std::endl;
    exit(1);
  }
  for(int i=0;i<Nd-1;i++)
    if(user_params.GparityDirs[i] != 0 && user_params.GparityDirs[i] != 1){
      std::cerr << "Error in input parameters: expect GparityDirs values to be 0 (periodic) or 1 (G-parity)" << std::endl;
      exit(1);
    }
   // Typedefs to simplify notation
  typedef GparityDomainWallFermionD FermionActionD;
  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
  typedef typename FermionActionD::FermionField FermionFieldD;
  typedef GparityDomainWallFermionF FermionActionF;
  typedef typename FermionActionF::Impl_t FermionImplPolicyF;
  typedef typename FermionActionF::FermionField FermionFieldF;
  typedef GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD,FermionImplPolicyF> MixedPrecRHMC;
  typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicyD> DoublePrecRHMC;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
  typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
  typedef HMCWrapper::ImplPolicy GaugeImplPolicy;
  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps = user_params.Steps;
  MD.trajL   = 1.0;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = user_params.StartTrajectory;
  HMCparams.Trajectories     = user_params.Trajectories;
  HMCparams.NoMetropolisUntil= 0;
  HMCparams.StartingType     = user_params.StartingType;
  HMCparams.MetropolisTest = user_params.MetropolisTest;
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_lat";
  CPparams.rng_prefix    = "ckpoint_rng";
  CPparams.saveInterval  = user_params.SaveInterval;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  //Note that checkpointing saves the RNG state so that this initialization is required only for the very first configuration
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  typedef PlaquetteMod<GaugeImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  const int Ls      = 16;
  Real beta         = 2.13;
  Real light_mass   = 0.01;
  Real strange_mass = 0.032;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  //Setup the Grids
  auto GridPtrD   = TheHMC.Resources.GetCartesian();
  auto GridRBPtrD = TheHMC.Resources.GetRBCartesian();
  auto FGridD     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrD);
  auto FrbGridD   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrD);
  GridCartesian* GridPtrF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian* GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF);
  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF);
  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF);
  ConjugateIwasakiGaugeActionD GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeFieldD Ud(GridPtrD);
  LatticeGaugeFieldF Uf(GridPtrF);
  //Setup the BCs
  FermionActionD::ImplParams Params;
  for(int i=0;i<Nd-1;i++) Params.twists[i] = user_params.GparityDirs[i]; //G-parity directions
  Params.twists[Nd-1] = 1; //APBC in time direction
  std::vector<int> dirs4(Nd);
  for(int i=0;i<Nd-1;i++) dirs4[i] = user_params.GparityDirs[i];
  dirs4[Nd-1] = 0; //periodic gauge BC in time
  GaugeImplPolicy::setDirections(dirs4); //gauge BC
  //Run optional gauge field checksum checker and exit
  if(file_load_check){
    TheHMC.initializeGaugeFieldAndRNGs(Ud);
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1); //light quark + strange quark
  ActionLevel<HMCWrapper::Field> Level2(8); //gauge (8 increments per step)
  /////////////////////////////////////////////////////////////
  // Light action
  /////////////////////////////////////////////////////////////
  FermionActionD Numerator_lD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, light_mass,M5,Params);
  FermionActionD Denominator_lD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, pv_mass,M5,Params);
  FermionActionF Numerator_lF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, light_mass,M5,Params);
  FermionActionF Denominator_lF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, pv_mass,M5,Params);
  RationalActionParams rat_act_params_l;
  rat_act_params_l.inv_pow  = 2; // (M^dag M)^{1/2}
  rat_act_params_l.precision= 60;
  rat_act_params_l.MaxIter  = 10000;
  user_params.rat_quo_l.Export(rat_act_params_l);
  std::cout << GridLogMessage << " Light quark bounds check every " << rat_act_params_l.BoundsCheckFreq << " trajectories (avg)" << std::endl;
  MixedPrecRHMC Quotient_l(Denominator_lD, Numerator_lD, Denominator_lF, Numerator_lF, rat_act_params_l, user_params.rat_quo_l.reliable_update_freq);
  //DoublePrecRHMC Quotient_l(Denominator_lD, Numerator_lD, rat_act_params_l);
  Level1.push_back(&Quotient_l);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD,strange_mass,M5,Params);
  FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, pv_mass,M5,Params);
  FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,strange_mass,M5,Params);
  FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, pv_mass,M5,Params);
  RationalActionParams rat_act_params_s;
  rat_act_params_s.inv_pow  = 4; // (M^dag M)^{1/4}
  rat_act_params_s.precision= 60;
  rat_act_params_s.MaxIter  = 10000;
  user_params.rat_quo_s.Export(rat_act_params_s);
  std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_l.BoundsCheckFreq << " trajectories (avg)" << std::endl;
  MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq); 
  //DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s); 
  Level1.push_back(&Quotient_s);  
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level2.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  //Action tuning
  bool tune_rhmc_l=false, tune_rhmc_s=false, eigenrange_l=false, eigenrange_s=false; 
  std::string lanc_params_l, lanc_params_s;
  for(int i=1;i<argc;i++){
    std::string sarg(argv[i]);
    if(sarg == "--tune_rhmc_l") tune_rhmc_l=true;
    else if(sarg == "--tune_rhmc_s") tune_rhmc_s=true;
    else if(sarg == "--eigenrange_l"){
      assert(i < argc-1);
      eigenrange_l=true;
      lanc_params_l = argv[i+1];
    }
    else if(sarg == "--eigenrange_s"){
      assert(i < argc-1);
      eigenrange_s=true;
      lanc_params_s = argv[i+1];
    }
  }
  if(tune_rhmc_l || tune_rhmc_s || eigenrange_l || eigenrange_s){
    TheHMC.initializeGaugeFieldAndRNGs(Ud);
    if(eigenrange_l) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_l, FGridD, FrbGridD, Ud, Numerator_lD, TheHMC.Resources.GetParallelRNG());
    if(eigenrange_s) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG());
    if(tune_rhmc_l) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_l)>(FGridD, FrbGridD, Ud, Numerator_lD, Denominator_lD, Quotient_l, TheHMC.Resources.GetParallelRNG(), 2, "light");
    if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_s)>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange");
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  //Run the HMC
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.Run();
  std::cout << GridLogMessage << " Done" << std::endl;
  Grid_finalize();
  return 0;
 } // main
--- a/HMC/DWF2p1fIwasakiGparityRHMCdouble.cc
+++ b/HMC/DWF2p1fIwasakiGparityRHMCdouble.cc
@ -0,0 +1,473 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./HMC/DWF2p1fIwasakiGparity.cc
 Copyright (C) 2015-2016
 Author: Christopher Kelly <ckelly@bnl.gov>
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 //2+1f DWF+I ensemble with G-parity BCs
 //designed to reproduce ensembles in https://arxiv.org/pdf/1908.08640.pdf
 struct RatQuoParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters,
 				  double, bnd_lo,
 				  double, bnd_hi,
 				  Integer, action_degree,
 				  double, action_tolerance,
 				  Integer, md_degree,
 				  double, md_tolerance,
 				  Integer, reliable_update_freq,
 				  Integer, bnd_check_freq);
  RatQuoParameters() { 
    bnd_lo = 1e-2;
    bnd_hi = 30;
    action_degree = 10;
    action_tolerance = 1e-10;
    md_degree = 10;
    md_tolerance = 1e-8;
    bnd_check_freq = 20;
    reliable_update_freq = 50;
  }
  void Export(RationalActionParams &into) const{
    into.lo = bnd_lo;
    into.hi = bnd_hi;
    into.action_degree = action_degree;
    into.action_tolerance = action_tolerance;
    into.md_degree = md_degree;
    into.md_tolerance = md_tolerance;
    into.BoundsCheckFreq = bnd_check_freq;
  }
 };
 struct EvolParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters,
                                  Integer, StartTrajectory,
                                  Integer, Trajectories,
 				  Integer, SaveInterval,
 				  Integer, Steps,
                                  bool, MetropolisTest,
 				  std::string, StartingType,
 				  std::vector<Integer>, GparityDirs,
 				  RatQuoParameters, rat_quo_l,
 				  RatQuoParameters, rat_quo_s);
  EvolParameters() {
    //For initial thermalization; afterwards user should switch Metropolis on and use StartingType=CheckpointStart
    MetropolisTest    = false;
    StartTrajectory   = 0;
    Trajectories      = 50;
    SaveInterval = 5;
    StartingType      = "ColdStart";
    GparityDirs.resize(3, 1); //1 for G-parity, 0 for periodic
    Steps = 5;
  }
 };
 bool fileExists(const std::string &fn){
  std::ifstream f(fn);
  return f.good();
 }
 struct LanczosParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
 				  double, alpha,
 				  double, beta,
 				  double, mu,
 				  int, ord,
 				  int, n_stop,
 				  int, n_want,
 				  int, n_use,
 				  double, tolerance);
  LanczosParameters() {
    alpha = 35;
    beta = 5;
    mu = 0;
    ord = 100;
    n_stop = 10;
    n_want = 10;
    n_use = 15;
    tolerance = 1e-6;
  }
 };
 template<typename FermionActionD, typename FermionFieldD>
 void computeEigenvalues(std::string param_file,
 			GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
 			FermionActionD &action, GridParallelRNG &rng){
  LanczosParameters params;
  if(fileExists(param_file)){
    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
    Grid::XmlReader rd(param_file);
    read(rd, "LanczosParameters", params);
  }else if(!GlobalSharedMemory::WorldRank){
    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
    Grid::XmlWriter wr(param_file + ".templ");
    write(wr, "LanczosParameters", params);
  }
  FermionFieldD gauss_o(rbGrid);
  FermionFieldD gauss(Grid);
  gaussian(rng, gauss);
  pickCheckerboard(Odd, gauss_o, gauss);
  action.ImportGauge(latt);
  SchurDiagMooeeOperator<FermionActionD, FermionFieldD> hermop(action);
  PlainHermOp<FermionFieldD> hermop_wrap(hermop);
  //ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
  assert(params.mu == 0.0);
  Chebyshev<FermionFieldD> Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1);
  FunctionHermOp<FermionFieldD> Cheb_wrap(Cheb, hermop);
  std::cout << "IRL: alpha=" << params.alpha << " beta=" << params.beta << " mu=" << params.mu << " ord=" << params.ord << std::endl;
  ImplicitlyRestartedLanczos<FermionFieldD> IRL(Cheb_wrap, hermop_wrap, params.n_stop, params.n_want, params.n_use, params.tolerance, 10000);
  std::vector<RealD> eval(params.n_use);
  std::vector<FermionFieldD> evec(params.n_use, rbGrid);
  int Nconv;
  IRL.calc(eval, evec, gauss_o, Nconv);
  std::cout << "Eigenvalues:" << std::endl;
  for(int i=0;i<params.n_want;i++){
    std::cout << i << " " << eval[i] << std::endl;
  }
 }
 //Check the quality of the RHMC approx
 template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
 void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
 	       FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
 	       int inv_pow, const std::string &quark_descr){
  FermionFieldD gauss_o(rbGrid);
  FermionFieldD gauss(Grid);
  gaussian(rng, gauss);
  pickCheckerboard(Odd, gauss_o, gauss);
  numOp.ImportGauge(latt);
  denOp.ImportGauge(latt);
  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
  SchurDifferentiableOperator<FermionImplPolicyD> MdagM(numOp);
  SchurDifferentiableOperator<FermionImplPolicyD> VdagV(denOp);
  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
  InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegPowerAction); //use large tolerance to prevent exit on fail; we are trying to tune here!
  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
  InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegHalfPowerAction);
  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
  InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegPowerAction);
  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
  InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegHalfPowerAction);
  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
  std::cout << "-------------------------------------------------------------------------------" << std::endl;
  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
  InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegPowerMD); 
  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
  InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegHalfPowerMD);
  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
  InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegPowerMD);
  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
  InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegHalfPowerMD);
  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
 }
 int main(int argc, char **argv) {
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
  std::string param_file = "params.xml";
  bool file_load_check = false;
  for(int i=1;i<argc;i++){
    std::string sarg(argv[i]);
    if(sarg == "--param_file"){
      assert(i!=argc-1);
      param_file = argv[i+1];
    }else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
      file_load_check = true;
    }
  }
  //Read the user parameters
  EvolParameters user_params;
  if(fileExists(param_file)){
    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
    Grid::XmlReader rd(param_file);
    read(rd, "Params", user_params);
  }else if(!GlobalSharedMemory::WorldRank){
    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
    Grid::XmlWriter wr(param_file + ".templ");
    write(wr, "Params", user_params);
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  //Check the parameters
  if(user_params.GparityDirs.size() != Nd-1){
    std::cerr << "Error in input parameters: expect GparityDirs to have size = " << Nd-1 << std::endl;
    exit(1);
  }
  for(int i=0;i<Nd-1;i++)
    if(user_params.GparityDirs[i] != 0 && user_params.GparityDirs[i] != 1){
      std::cerr << "Error in input parameters: expect GparityDirs values to be 0 (periodic) or 1 (G-parity)" << std::endl;
      exit(1);
    }
   // Typedefs to simplify notation
  typedef GparityDomainWallFermionD FermionActionD;
  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
  typedef typename FermionActionD::FermionField FermionFieldD;
  typedef GparityDomainWallFermionF FermionActionF;
  typedef typename FermionActionF::Impl_t FermionImplPolicyF;
  typedef typename FermionActionF::FermionField FermionFieldF;
  typedef GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD,FermionImplPolicyF> MixedPrecRHMC;
  typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicyD> DoublePrecRHMC;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
  typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
  typedef HMCWrapper::ImplPolicy GaugeImplPolicy;
  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps = user_params.Steps;
  MD.trajL   = 1.0;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = user_params.StartTrajectory;
  HMCparams.Trajectories     = user_params.Trajectories;
  HMCparams.NoMetropolisUntil= 0;
  HMCparams.StartingType     = user_params.StartingType;
  HMCparams.MetropolisTest = user_params.MetropolisTest;
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_lat";
  CPparams.rng_prefix    = "ckpoint_rng";
  CPparams.saveInterval  = user_params.SaveInterval;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  //Note that checkpointing saves the RNG state so that this initialization is required only for the very first configuration
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  typedef PlaquetteMod<GaugeImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  const int Ls      = 16;
  Real beta         = 2.13;
  Real light_mass   = 0.01;
  Real strange_mass = 0.032;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  //Setup the Grids
  auto GridPtrD   = TheHMC.Resources.GetCartesian();
  auto GridRBPtrD = TheHMC.Resources.GetRBCartesian();
  auto FGridD     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrD);
  auto FrbGridD   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrD);
  GridCartesian* GridPtrF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian* GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF);
  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF);
  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF);
  ConjugateIwasakiGaugeActionD GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeFieldD Ud(GridPtrD);
  LatticeGaugeFieldF Uf(GridPtrF);
  //Setup the BCs
  FermionActionD::ImplParams Params;
  for(int i=0;i<Nd-1;i++) Params.twists[i] = user_params.GparityDirs[i]; //G-parity directions
  Params.twists[Nd-1] = 1; //APBC in time direction
  std::vector<int> dirs4(Nd);
  for(int i=0;i<Nd-1;i++) dirs4[i] = user_params.GparityDirs[i];
  dirs4[Nd-1] = 0; //periodic gauge BC in time
  GaugeImplPolicy::setDirections(dirs4); //gauge BC
  //Run optional gauge field checksum checker and exit
  if(file_load_check){
    TheHMC.initializeGaugeFieldAndRNGs(Ud);
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1); //light quark + strange quark
  ActionLevel<HMCWrapper::Field> Level2(8); //gauge (8 increments per step)
  /////////////////////////////////////////////////////////////
  // Light action
  /////////////////////////////////////////////////////////////
  FermionActionD Numerator_lD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, light_mass,M5,Params);
  FermionActionD Denominator_lD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, pv_mass,M5,Params);
  FermionActionF Numerator_lF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, light_mass,M5,Params);
  FermionActionF Denominator_lF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, pv_mass,M5,Params);
  RationalActionParams rat_act_params_l;
  rat_act_params_l.inv_pow  = 2; // (M^dag M)^{1/2}
  rat_act_params_l.precision= 60;
  rat_act_params_l.MaxIter  = 10000;
  user_params.rat_quo_l.Export(rat_act_params_l);
  std::cout << GridLogMessage << " Light quark bounds check every " << rat_act_params_l.BoundsCheckFreq << " trajectories (avg)" << std::endl;
  //MixedPrecRHMC Quotient_l(Denominator_lD, Numerator_lD, Denominator_lF, Numerator_lF, rat_act_params_l, user_params.rat_quo_l.reliable_update_freq);
  DoublePrecRHMC Quotient_l(Denominator_lD, Numerator_lD, rat_act_params_l);
  Level1.push_back(&Quotient_l);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD,strange_mass,M5,Params);
  FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, pv_mass,M5,Params);
  FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,strange_mass,M5,Params);
  FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, pv_mass,M5,Params);
  RationalActionParams rat_act_params_s;
  rat_act_params_s.inv_pow  = 4; // (M^dag M)^{1/4}
  rat_act_params_s.precision= 60;
  rat_act_params_s.MaxIter  = 10000;
  user_params.rat_quo_s.Export(rat_act_params_s);
  std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_l.BoundsCheckFreq << " trajectories (avg)" << std::endl;
  //MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq); 
  DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s); 
  Level1.push_back(&Quotient_s);  
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level2.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  //Action tuning
  bool tune_rhmc_l=false, tune_rhmc_s=false, eigenrange_l=false, eigenrange_s=false; 
  std::string lanc_params_l, lanc_params_s;
  for(int i=1;i<argc;i++){
    std::string sarg(argv[i]);
    if(sarg == "--tune_rhmc_l") tune_rhmc_l=true;
    else if(sarg == "--tune_rhmc_s") tune_rhmc_s=true;
    else if(sarg == "--eigenrange_l"){
      assert(i < argc-1);
      eigenrange_l=true;
      lanc_params_l = argv[i+1];
    }
    else if(sarg == "--eigenrange_s"){
      assert(i < argc-1);
      eigenrange_s=true;
      lanc_params_s = argv[i+1];
    }
  }
  if(tune_rhmc_l || tune_rhmc_s || eigenrange_l || eigenrange_s){
    TheHMC.initializeGaugeFieldAndRNGs(Ud);
    if(eigenrange_l) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_l, FGridD, FrbGridD, Ud, Numerator_lD, TheHMC.Resources.GetParallelRNG());
    if(eigenrange_s) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG());
    if(tune_rhmc_l) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_l)>(FGridD, FrbGridD, Ud, Numerator_lD, Denominator_lD, Quotient_l, TheHMC.Resources.GetParallelRNG(), 2, "light");
    if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_s)>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange");
    std::cout << GridLogMessage << " Done" << std::endl;
    Grid_finalize();
    return 0;
  }
  //Run the HMC
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.Run();
  std::cout << GridLogMessage << " Done" << std::endl;
  Grid_finalize();
  return 0;
 } // main
--- a/HMC/Mobius2f.cc
+++ b/HMC/Mobius2f.cc
@ -0,0 +1,170 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./tests/Test_hmc_EODWFRatio.cc
 Copyright (C) 2015-2016
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 int main(int argc, char **argv) {
  using namespace Grid;
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
   // Typedefs to simplify notation
  typedef WilsonImplR FermionImplPolicy;
  typedef MobiusFermionR FermionAction;
  typedef typename FermionAction::FermionField FermionField;
  typedef Grid::XmlReader       Serialiser;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
  //  MD.name    = std::string("Leap Frog");
  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
  //  MD.name    = std::string("Force Gradient");
  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps = 12;
  MD.trajL   = 1.0;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = 17;
  HMCparams.Trajectories     = 200;
  HMCparams.NoMetropolisUntil=  0;
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
  //  HMCparams.StartingType     =std::string("ColdStart");
  HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_2fDWF_lat";
  CPparams.rng_prefix    = "ckpoint_2fDWF_rng";
  CPparams.saveInterval  = 1;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  // Construct observables
  // here there is too much indirection
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  const int Ls      = 16;
  Real beta         = 2.13;
  Real light_mass   = 0.01;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  RealD b   = 1.0;
  RealD c   = 0.0;
  std::vector<Real> hasenbusch({ 0.1, 0.4, 0.7 });
  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
  IwasakiGaugeActionR GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeField U(GridPtr);
  // These lines are unecessary if BC are all periodic
  std::vector<Complex> boundary = {1,1,1,-1};
  FermionAction::ImplParams Params(boundary);
  double StoppingCondition = 1e-10;
  double MaxCGIterations = 30000;
  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1);
  ActionLevel<HMCWrapper::Field> Level2(8);
  ////////////////////////////////////
  // up down action
  ////////////////////////////////////
  std::vector<Real> light_den;
  std::vector<Real> light_num;
  int n_hasenbusch = hasenbusch.size();
  light_den.push_back(light_mass);
  for(int h=0;h<n_hasenbusch;h++){
    light_den.push_back(hasenbusch[h]);
    light_num.push_back(hasenbusch[h]);
  }
  light_num.push_back(pv_mass);
  std::vector<FermionAction *> Numerators;
  std::vector<FermionAction *> Denominators;
  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
  for(int h=0;h<n_hasenbusch+1;h++){
    std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl;
    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
    Quotients.push_back   (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
  }
  for(int h=0;h<n_hasenbusch+1;h++){
    Level1.push_back(Quotients[h]);
  }
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level2.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  /////////////////////////////////////////////////////////////
  // HMC parameters are serialisable
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.Run();  // no smearing
  Grid_finalize();
 } // main
--- a/HMC/Mobius2f_DDHMC_mixed.cc
+++ b/HMC/Mobius2f_DDHMC_mixed.cc
@ -0,0 +1,386 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 nnSource file: 
 Copyright (C) 2015-2016
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 NAMESPACE_BEGIN(Grid);
 template<class Impl>
 class DomainLocalTwoFlavourEvenOddRatioPseudoFermionAction
  : public TwoFlavourEvenOddRatioPseudoFermionAction<Impl>
 {
 public:
  INHERIT_IMPL_TYPES(Impl);
  Coordinate Block;
  DomainDecomposition Domains;
  DomainLocalTwoFlavourEvenOddRatioPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
 						       FermionOperator<Impl>  &_DenOp, 
 						       OperatorFunction<FermionField> & DS,
 						       OperatorFunction<FermionField> & AS,
 						       OperatorFunction<FermionField> & HS,
 						       Coordinate &_Block ) :
    Block(_Block),
    Domains(_Block),
    TwoFlavourEvenOddRatioPseudoFermionAction<Impl>(_NumOp,_DenOp,DS,AS,HS)
    {};
  virtual void refreshRestrict(FermionField &eta)
  {
    Domains.ProjectDomain(eta,0);
    DumpSliceNorm("refresh Restrict eta",eta);
  };
 };
 #define MIXED_PRECISION
 NAMESPACE_END(Grid);
 int main(int argc, char **argv)
 {
  using namespace Grid;
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
   // Typedefs to simplify notation
  typedef WilsonImplR FimplD;
  typedef WilsonImplF FimplF;
  typedef FermionOperator<FimplF> FermionOperatorF;
  typedef FermionOperator<FimplD> FermionOperatorD;
  typedef MobiusFermionR FermionActionD;
  typedef MobiusFermionF FermionActionF;
  typedef DirichletFermionOperator<WilsonImplR> DirichletFermionD;
  typedef DirichletFermionOperator<WilsonImplF> DirichletFermionF;
  typedef MobiusEOFAFermionR FermionEOFAAction;
  typedef typename FermionActionD::FermionField FermionFieldD;
  typedef typename FermionActionF::FermionField FermionFieldF;
  typedef SchurDiagMooeeOperator<FermionOperator<FimplF>,FermionFieldF> LinearOperatorF;
  typedef SchurDiagMooeeOperator<FermionOperator<FimplD>,FermionFieldD> LinearOperatorD;
  typedef SchurDiagMooeeDagOperator<FermionOperator<FimplF>,FermionFieldF> LinearOperatorDagF;
  typedef SchurDiagMooeeDagOperator<FermionOperator<FimplD>,FermionFieldD> LinearOperatorDagD;
  typedef Grid::XmlReader       Serialiser;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper; 
  //  MD.name    = std::string("Leap Frog");
  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper; 
  //  MD.name    = std::string("Force Gradient");
  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; 
  /*
  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps = 4; // dH = 0.08
  //  MD.MDsteps = 3; // dH = 0.8
  MD.trajL   = 1.0;
  */
  HMCparameters HMCparams;
  {
    XmlReader  HMCrd("HMCparameters.xml");
    read(HMCrd,"HMCparameters",HMCparams);
    std::cout << GridLogMessage<< HMCparams <<std::endl;
  }
  HMCWrapper TheHMC(HMCparams);
  /*
  HMCparams.StartTrajectory  = 66;
  HMCparams.Trajectories     = 200;
  HMCparams.NoMetropolisUntil=  0;
  //  "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
  // HMCparams.StartingType     =std::string("ColdStart");
  HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  */
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_DDHMC_lat";
  CPparams.rng_prefix    = "ckpoint_DDHMC_rng";
  CPparams.saveInterval  = 1;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  // Momentum Dirichlet
  Coordinate Block({0,0,0,24});
  TheHMC.Resources.SetMomentumFilter(new DDHMCFilter<WilsonImplR::Field>(Block));
  // Construct observables
  // here there is too much indirection 
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  const int Ls      = 16;
  Real beta         = 2.13;
  //  Real light_mass   = 0.04;
  Real light_mass   = 0.01;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  RealD b   = 1.0; 
  RealD c   = 0.0;
  std::vector<Real> hasenbusch({ 0.1, 0.4, 0.7 });
  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
  Coordinate latt  = GridDefaultLatt();
  Coordinate mpi   = GridDefaultMpi();
  Coordinate simdF = GridDefaultSimd(Nd,vComplexF::Nsimd());
  Coordinate simdD = GridDefaultSimd(Nd,vComplexD::Nsimd());
  auto GridPtrF   = SpaceTimeGrid::makeFourDimGrid(latt,simdF,mpi);
  auto GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF);
  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF);
  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF);
  IwasakiGaugeActionR GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeField U(GridPtr);
  LatticeGaugeFieldF UF(GridPtrF);
  // These lines are unecessary if BC are all periodic
  std::vector<Complex> boundary = {1,1,1,-1};
  FermionActionD::ImplParams Params(boundary);
  FermionActionD::ImplParams DirichletParams(boundary);
  DirichletParams.locally_periodic=true;
  double ActionStoppingCondition     = 1e-10;
  double DerivativeStoppingCondition = 1e-10;
  //  double BoundaryDerivativeStoppingCondition = 1e-10; decent acceptance
  double BoundaryDerivativeStoppingCondition = 1e-7;   // decent acceptance
  //  double BoundaryDerivativeStoppingCondition = 1e-6;  // bit bigger not huge
  //  double BoundaryDerivativeStoppingCondition = 1e-5; // Large dH poor acceptance
  double MaxCGIterations = 30000;
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1);
  ActionLevel<HMCWrapper::Field> Level2(3);
  ActionLevel<HMCWrapper::Field> Level3(8);
  ConjugateGradient<FermionFieldD>      ActionCG(ActionStoppingCondition,MaxCGIterations);
  ConjugateGradient<FermionFieldD>  DerivativeCG(DerivativeStoppingCondition,MaxCGIterations);
  ////////////////////////////////////
  // up down action
  ////////////////////////////////////
  std::vector<Real> light_den;
  std::vector<Real> light_num;
  int n_hasenbusch = hasenbusch.size();
  light_den.push_back(light_mass);
  for(int h=0;h<n_hasenbusch;h++){
    light_den.push_back(hasenbusch[h]);
    light_num.push_back(hasenbusch[h]);
  }
  light_num.push_back(pv_mass);
  //////////////////////////////////////////////////////////////
  // Forced to replicate the MxPCG and DenominatorsF etc.. because
  // there is no convenient way to "Clone" physics params from double op
  // into single op for any operator pair.
  // Same issue prevents using MxPCG in the Heatbath step
  //////////////////////////////////////////////////////////////
  /////////////////////////////////////////////////
  // These are consumed/owned by the Dirichlet wrappers
  /////////////////////////////////////////////////
  std::vector<FermionActionD *> DNumeratorsD;
  std::vector<FermionActionF *> DNumeratorsF;
  std::vector<FermionActionD *> DDenominatorsD;
  std::vector<FermionActionF *> DDenominatorsF;
  /////////////////////////////////////////////////
  // Dirichlet wrappers
  /////////////////////////////////////////////////
  std::vector<DirichletFermionD *> DirichletNumeratorsD;
  std::vector<DirichletFermionF *> DirichletNumeratorsF;
  std::vector<DirichletFermionD *> DirichletDenominatorsD;
  std::vector<DirichletFermionF *> DirichletDenominatorsF;
  std::vector<DomainLocalTwoFlavourEvenOddRatioPseudoFermionAction<FimplD> *> Quotients;
  typedef MixedPrecisionConjugateGradientOperatorFunction<FermionOperatorD,
 							  FermionOperatorF,
 							  LinearOperatorD,
 							  LinearOperatorF> MxPCG;
  std::vector<MxPCG *> ActionMPCG;
  std::vector<MxPCG *> MPCG;
  std::vector<LinearOperatorD *> LinOpD;
  std::vector<LinearOperatorF *> LinOpF; 
  int MX_inner = 1000;
  RealD MX_tol = 1.0e-5;
  for(int h=0;h<n_hasenbusch+1;h++){
    std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl;
    DNumeratorsD.push_back (new FermionActionD(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, DirichletParams));
    DNumeratorsF.push_back (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, DirichletParams));
    DDenominatorsD.push_back(new FermionActionD(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, DirichletParams));
    DDenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, DirichletParams));
    DirichletNumeratorsD.push_back  (new  DirichletFermionD(*DNumeratorsD[h],Block));
    DirichletNumeratorsF.push_back  (new  DirichletFermionF(*DNumeratorsF[h],Block));
    DirichletDenominatorsD.push_back(new  DirichletFermionD(*DDenominatorsD[h],Block));
    DirichletDenominatorsF.push_back(new  DirichletFermionF(*DDenominatorsF[h],Block));
    // Dirichlet Schur even odd MpsDagMpc operators on local domains
    LinOpD.push_back(new LinearOperatorD(*DirichletDenominatorsD[h]));
    LinOpF.push_back(new LinearOperatorF(*DirichletDenominatorsF[h]));
    // Derivative
    MPCG.push_back(new MxPCG(DerivativeStoppingCondition,MX_tol,
 			     MX_inner,
 			     MaxCGIterations,
 			     FrbGridF,
 			     *DirichletDenominatorsF[h],*DirichletDenominatorsD[h],
 			     *LinOpF[h], *LinOpD[h]) );
    // Action
    ActionMPCG.push_back(new MxPCG(ActionStoppingCondition,MX_tol,
 				   MX_inner,
 				   MaxCGIterations,
 				   FrbGridF,
 				   *DirichletDenominatorsF[h],*DirichletDenominatorsD[h],
 				   *LinOpF[h], *LinOpD[h]) );
    ////////////////////////////////////////////////////////////////////////////
    // Standard CG for 2f force
    ////////////////////////////////////////////////////////////////////////////
    Quotients.push_back   (new
 			   DomainLocalTwoFlavourEvenOddRatioPseudoFermionAction<FimplD>
 			   (*DirichletNumeratorsD[h],
 			    *DirichletDenominatorsD[h],
 			    *MPCG[h],
 			    *ActionMPCG[h],
 			    ActionCG,Block));
    Level2.push_back(Quotients[h]);
  }
  /////////////////////////////////////////////////////////////
  // Boundary action
  /////////////////////////////////////////////////////////////
  int l_idx = 0;
  int pv_idx = n_hasenbusch;
  RealD h_mass = 0.012;
  std::cout << GridLogMessage<<" Boundary action masses " <<light_num[l_idx]<<" / "<<light_den[pv_idx]<<std::endl;
  // OmegaBar cross domain boundary and is used in Boundary operator, so no locally_periodic hack in the boundary det
  // Dirichlet is applied in gauge link only. OmegaBar solve is too expensive. Monitor cost.
  FermionActionD    PeriNumeratorD  (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[pv_idx],M5,b,c, Params);
  FermionActionF    PeriNumeratorF  (UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[pv_idx],M5,b,c, Params);
  FermionActionD    DirichletNumeratorDD(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[pv_idx],M5,b,c, Params);
  FermionActionF    DirichletNumeratorFF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[pv_idx],M5,b,c, Params);
  DirichletFermionD DirichletNumeratorD  (DirichletNumeratorDD,Block);
  DirichletFermionF DirichletNumeratorF  (DirichletNumeratorFF,Block);
  FermionActionD    PeriDenominatorD(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[l_idx] ,M5,b,c, Params);
  FermionActionF    PeriDenominatorF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[l_idx] ,M5,b,c, Params);
  FermionActionD    DirichletDenominatorDD(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[l_idx] ,M5,b,c, Params);
  FermionActionF    DirichletDenominatorFF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[l_idx] ,M5,b,c, Params);
  DirichletFermionD DirichletDenominatorD(DirichletDenominatorDD,Block);
  DirichletFermionF DirichletDenominatorF(DirichletDenominatorFF,Block);
  FermionActionD    PeriHasenD  (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,h_mass ,M5,b,c, Params);
  FermionActionF    PeriHasenF  (UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,h_mass,M5,b,c, Params);
  FermionActionD    DHasenD(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,h_mass,M5,b,c, Params);
  FermionActionF    DHasenF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,h_mass,M5,b,c, Params);
  DirichletFermionD DirichletHasenD(DHasenD,Block);
  DirichletFermionF DirichletHasenF(DHasenF,Block);
  SchurFactoredFermionOperator<FimplD,FimplF> BoundaryNumerator(PeriNumeratorD,PeriNumeratorF,
 								DirichletNumeratorD,DirichletNumeratorF,
 								Block);
  SchurFactoredFermionOperator<FimplD,FimplF> BoundaryDenominator(PeriDenominatorD,PeriDenominatorF,
 								  DirichletDenominatorD,DirichletDenominatorF,
 								  Block);
  SchurFactoredFermionOperator<FimplD,FimplF> BoundaryHasen(PeriHasenD,PeriHasenF,
 							    DirichletHasenD,DirichletHasenF,
 							    Block);
 #if 1
  std::cout << GridLogMessage << " Boundary NO ratio "<< std::endl;
  MX_tol = 1.0e-5;
  Level1.push_back(new
 		   DomainDecomposedBoundaryTwoFlavourPseudoFermion<FimplD,FimplF>
 		   (BoundaryDenominator,
 		    BoundaryDerivativeStoppingCondition,ActionStoppingCondition,MX_tol));
  Level1.push_back(new
 		   DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion<FimplD,FimplF>
 		   (BoundaryNumerator,
 		    BoundaryDerivativeStoppingCondition,ActionStoppingCondition,MX_tol));
 #else
  Level1.push_back(new
 		   DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion<FimplD,FimplF>
 		   (BoundaryNumerator,
 		    BoundaryDenominator,
 		    BoundaryDerivativeStoppingCondition,ActionStoppingCondition));
 #endif
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level3.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  TheHMC.TheAction.push_back(Level3);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  /////////////////////////////////////////////////////////////
  // HMC parameters are serialisable
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.Run();  // no smearing
  Grid_finalize();
 } // main
--- a/HMC/Mobius2p1fEOFA.cc
+++ b/HMC/Mobius2p1fEOFA.cc
@ -33,137 +33,8 @@ directory
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 #define MIXED_PRECISION
 #endif
 #include <Grid/qcd/utils/MixedPrecisionOperatorFunction.h>
 NAMESPACE_BEGIN(Grid);
  /*
   * Need a plan for gauge field update for mixed precision in HMC                      (2x speed up)
   *    -- Store the single prec action operator.
   *    -- Clone the gauge field from the operator function argument.
   *    -- Build the mixed precision operator dynamically from the passed operator and single prec clone.
   */
  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
  class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
  public:
    typedef typename FermionOperatorD::FermionField FieldD;
    typedef typename FermionOperatorF::FermionField FieldF;
    using OperatorFunction<FieldD>::operator();
    RealD   Tolerance;
    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
    Integer MaxOuterIterations;
    GridBase* SinglePrecGrid4; //Grid for single-precision fields
    GridBase* SinglePrecGrid5; //Grid for single-precision fields
    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
    FermionOperatorF &FermOpF;
    FermionOperatorD &FermOpD;;
    SchurOperatorF &LinOpF;
    SchurOperatorD &LinOpD;
    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
    MixedPrecisionConjugateGradientOperatorFunction(RealD tol, 
 						    Integer maxinnerit, 
 						    Integer maxouterit, 
 						    GridBase* _sp_grid4, 
 						    GridBase* _sp_grid5, 
 						    FermionOperatorF &_FermOpF,
 						    FermionOperatorD &_FermOpD,
 						    SchurOperatorF   &_LinOpF,
 						    SchurOperatorD   &_LinOpD): 
      LinOpF(_LinOpF),
      LinOpD(_LinOpD),
      FermOpF(_FermOpF),
      FermOpD(_FermOpD),
      Tolerance(tol), 
      InnerTolerance(tol), 
      MaxInnerIterations(maxinnerit), 
      MaxOuterIterations(maxouterit), 
      SinglePrecGrid4(_sp_grid4),
      SinglePrecGrid5(_sp_grid5),
      OuterLoopNormMult(100.) 
    { 
      /* Debugging instances of objects; references are stored
      std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpF " <<std::hex<< &LinOpF<<std::dec <<std::endl;
      std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpD " <<std::hex<< &LinOpD<<std::dec <<std::endl;
      std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpF " <<std::hex<< &FermOpF<<std::dec <<std::endl;
      std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpD " <<std::hex<< &FermOpD<<std::dec <<std::endl;
      */
    };
    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
      //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpU " <<std::hex<< &(SchurOpU->_Mat)<<std::dec <<std::endl;
      //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpD " <<std::hex<< &(LinOpD._Mat) <<std::dec <<std::endl;
      // Assumption made in code to extract gauge field
      // We could avoid storing LinopD reference alltogether ?
      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
      ////////////////////////////////////////////////////////////////////////////////////
      // Must snarf a single precision copy of the gauge field in Linop_d argument
      ////////////////////////////////////////////////////////////////////////////////////
      typedef typename FermionOperatorF::GaugeField GaugeFieldF;
      typedef typename FermionOperatorF::GaugeLinkField GaugeLinkFieldF;
      typedef typename FermionOperatorD::GaugeField GaugeFieldD;
      typedef typename FermionOperatorD::GaugeLinkField GaugeLinkFieldD;
      GridBase * GridPtrF = SinglePrecGrid4;
      GridBase * GridPtrD = FermOpD.Umu.Grid();
      GaugeFieldF     U_f  (GridPtrF);
      GaugeLinkFieldF Umu_f(GridPtrF);
      //      std::cout << " Dim gauge field "<<GridPtrF->Nd()<<std::endl; // 4d
      //      std::cout << " Dim gauge field "<<GridPtrD->Nd()<<std::endl; // 4d
      ////////////////////////////////////////////////////////////////////////////////////
      // Moving this to a Clone method of fermion operator would allow to duplicate the 
      // physics parameters and decrease gauge field copies
      ////////////////////////////////////////////////////////////////////////////////////
      GaugeLinkFieldD Umu_d(GridPtrD);
      for(int mu=0;mu<Nd*2;mu++){ 
 	Umu_d = PeekIndex<LorentzIndex>(FermOpD.Umu, mu);
 	precisionChange(Umu_f,Umu_d);
 	PokeIndex<LorentzIndex>(FermOpF.Umu, Umu_f, mu);
      }
      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
      ////////////////////////////////////////////////////////////////////////////////////
      // Could test to make sure that LinOpF and LinOpD agree to single prec?
      ////////////////////////////////////////////////////////////////////////////////////
      /*
      GridBase *Fgrid = psi._grid;
      FieldD tmp2(Fgrid);
      FieldD tmp1(Fgrid);
      LinOpU.Op(src,tmp1);
      LinOpD.Op(src,tmp2);
      std::cout << " Double gauge field "<< norm2(FermOpD.Umu)<<std::endl;
      std::cout << " Single gauge field "<< norm2(FermOpF.Umu)<<std::endl;
      std::cout << " Test of operators "<<norm2(tmp1)<<std::endl;
      std::cout << " Test of operators "<<norm2(tmp2)<<std::endl;
      tmp1=tmp1-tmp2;
      std::cout << " Test of operators diff "<<norm2(tmp1)<<std::endl;
      */
      ////////////////////////////////////////////////////////////////////////////////////
      // Make a mixed precision conjugate gradient
      ////////////////////////////////////////////////////////////////////////////////////
      MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
      std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
      MPCG(src,psi);
    }
  };
 NAMESPACE_END(Grid);
 int main(int argc, char **argv) {
  using namespace Grid;
@ -190,18 +61,18 @@ int main(int argc, char **argv) {
  //  MD.name    = std::string("Leap Frog");
  typedef GenericHMCRunner<ForceGradient> HMCWrapper; 
  MD.name    = std::string("Force Gradient");
-  //  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; 
+  //typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; 
-  //  MD.name    = std::string("MinimumNorm2");
+  //MD.name    = std::string("MinimumNorm2");
-  MD.MDsteps = 6;
+  MD.MDsteps = 15;
  MD.trajL   = 1.0;
  HMCparameters HMCparams;
-  HMCparams.StartTrajectory  = 590;
+  HMCparams.StartTrajectory  = 0;
  HMCparams.Trajectories     = 1000;
-  HMCparams.NoMetropolisUntil=  0;
+  HMCparams.NoMetropolisUntil=  10;
  //  "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
-  //  HMCparams.StartingType     =std::string("ColdStart");
+  HMCparams.StartingType     =std::string("ColdStart");
-  HMCparams.StartingType     =std::string("CheckpointStart");
+  //HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
@ -209,9 +80,9 @@ int main(int argc, char **argv) {
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
-  CPparams.config_prefix = "ckpoint_EODWF_lat";
+  CPparams.config_prefix = "ckpoint_EOFA_lat";
-  CPparams.rng_prefix    = "ckpoint_EODWF_rng";
+  CPparams.rng_prefix    = "ckpoint_EOFA_rng";
-  CPparams.saveInterval  = 10;
+  CPparams.saveInterval  = 1;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
@ -226,16 +97,16 @@ int main(int argc, char **argv) {
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
-  const int Ls      = 16;
+  const int Ls      = 24;
  Real beta         = 2.13;
-  Real light_mass   = 0.01;
+  Real light_mass   = 0.005;
-  Real strange_mass = 0.04;
+  Real strange_mass = 0.0362;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
-  RealD b   = 1.0; 
+  RealD b   = 1.5; 
-  RealD c   = 0.0;
+  RealD c   = 0.5;
-  std::vector<Real> hasenbusch({ 0.1, 0.3, 0.6 });
+  std::vector<Real> hasenbusch({ 0.02, 0.2, 0.6 });
  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
@ -263,7 +134,7 @@ int main(int argc, char **argv) {
  FermionActionF::ImplParams ParamsF(boundary);
  double ActionStoppingCondition     = 1e-10;
-  double DerivativeStoppingCondition = 1e-6;
+  double DerivativeStoppingCondition = 1e-8;
  double MaxCGIterations = 30000;
  ////////////////////////////////////
@ -302,40 +173,37 @@ int main(int argc, char **argv) {
  ConjugateGradient<FermionField>  DerivativeCG(DerivativeStoppingCondition,MaxCGIterations);
 #ifdef MIXED_PRECISION
  const int MX_inner = 1000;
  const RealD MX_tol = 1.0e-6;
  // Mixed precision EOFA
  LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L);
  LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R);
  LinearOperatorEOFAF Strange_LinOp_LF(Strange_Op_LF);
  LinearOperatorEOFAF Strange_LinOp_RF(Strange_Op_RF);
-  MxPCG_EOFA ActionCGL(ActionStoppingCondition,
+  MxPCG_EOFA ActionCGL(ActionStoppingCondition,MX_tol,
 		       MX_inner,
 		       MaxCGIterations,
 		       GridPtrF,
 		       FrbGridF,
 		       Strange_Op_LF,Strange_Op_L,
 		       Strange_LinOp_LF,Strange_LinOp_L);
-  MxPCG_EOFA DerivativeCGL(DerivativeStoppingCondition,
+  MxPCG_EOFA DerivativeCGL(DerivativeStoppingCondition,MX_tol,
 			   MX_inner,
 			   MaxCGIterations,
 			   GridPtrF,
 			   FrbGridF,
 			   Strange_Op_LF,Strange_Op_L,
 			   Strange_LinOp_LF,Strange_LinOp_L);
-  MxPCG_EOFA ActionCGR(ActionStoppingCondition,
+  MxPCG_EOFA ActionCGR(ActionStoppingCondition,MX_tol,
 		       MX_inner,
 		       MaxCGIterations,
 		       GridPtrF,
 		       FrbGridF,
 		       Strange_Op_RF,Strange_Op_R,
 		       Strange_LinOp_RF,Strange_LinOp_R);
-  MxPCG_EOFA DerivativeCGR(DerivativeStoppingCondition,
+  MxPCG_EOFA DerivativeCGR(DerivativeStoppingCondition,MX_tol,
 			   MX_inner,
 			   MaxCGIterations,
 			   GridPtrF,
 			   FrbGridF,
 			   Strange_Op_RF,Strange_Op_R,
 			   Strange_LinOp_RF,Strange_LinOp_R);
@ -401,18 +269,16 @@ int main(int argc, char **argv) {
    LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
    LinOpF.push_back(new LinearOperatorF(*DenominatorsF[h]));
-    MPCG.push_back(new MxPCG(DerivativeStoppingCondition,
+    MPCG.push_back(new MxPCG(DerivativeStoppingCondition,MX_tol,
 			     MX_inner,
 			     MaxCGIterations,
 			     GridPtrF,
 			     FrbGridF,
 			     *DenominatorsF[h],*Denominators[h],
 			     *LinOpF[h], *LinOpD[h]) );
-    ActionMPCG.push_back(new MxPCG(ActionStoppingCondition,
+    ActionMPCG.push_back(new MxPCG(ActionStoppingCondition,MX_tol,
 				   MX_inner,
 				   MaxCGIterations,
 				   GridPtrF,
 				   FrbGridF,
 				   *DenominatorsF[h],*Denominators[h],
 				   *LinOpF[h], *LinOpD[h]) );
--- a/HMC/Mobius2p1fEOFA_4dPseudoFermion.cc
+++ b/HMC/Mobius2p1fEOFA_4dPseudoFermion.cc
@ -0,0 +1,338 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: 
 Copyright (C) 2015-2016
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Guido Cossu
 Author: David Murphy
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/qcd/action/pseudofermion/TwoFlavourRatioEO4DPseudoFermion.h>
 #include <Grid/qcd/action/pseudofermion/TwoFlavourRatio4DPseudoFermion.h>
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 #define MIXED_PRECISION
 #endif
 #include <Grid/qcd/utils/MixedPrecisionOperatorFunction.h>
 int main(int argc, char **argv) {
  using namespace Grid;
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
   // Typedefs to simplify notation
  typedef WilsonImplR FermionImplPolicy;
  typedef MobiusFermionR FermionAction;
  typedef MobiusFermionF FermionActionF;
  typedef MobiusEOFAFermionR FermionEOFAAction;
  typedef MobiusEOFAFermionF FermionEOFAActionF;
  typedef typename FermionAction::FermionField FermionField;
  typedef typename FermionActionF::FermionField FermionFieldF;
  typedef Grid::XmlReader       Serialiser;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper; 
  //  MD.name    = std::string("Leap Frog");
  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper; 
  //  MD.name    = std::string("Force Gradient");
  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; 
  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps = 12;
  MD.trajL   = 1.0;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = 211;
  HMCparams.Trajectories     = 1000;
  HMCparams.NoMetropolisUntil=  0;
  //  "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
  //  HMCparams.StartingType     =std::string("ColdStart");
  HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_EOFA4D_lat";
  CPparams.rng_prefix    = "ckpoint_EOFA4D_rng";
  CPparams.saveInterval  = 1;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  // Construct observables
  // here there is too much indirection 
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  const int Ls      = 16;
  Real beta         = 2.13;
  Real light_mass   = 0.01;
  Real strange_mass = 0.04;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  RealD b   = 1.0; 
  RealD c   = 0.0;
  std::vector<Real> hasenbusch({ 0.1, 0.3, 0.6 });
  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
  Coordinate latt  = GridDefaultLatt();
  Coordinate mpi   = GridDefaultMpi();
  Coordinate simdF = GridDefaultSimd(Nd,vComplexF::Nsimd());
  Coordinate simdD = GridDefaultSimd(Nd,vComplexD::Nsimd());
  auto GridPtrF   = SpaceTimeGrid::makeFourDimGrid(latt,simdF,mpi);
  auto GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF);
  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF);
  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF);
  IwasakiGaugeActionR GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeField U(GridPtr);
  LatticeGaugeFieldF UF(GridPtrF);
  // These lines are unecessary if BC are all periodic
  std::vector<Complex> boundary = {1,1,1,-1};
  FermionAction::ImplParams Params(boundary);
  FermionActionF::ImplParams ParamsF(boundary);
  double ActionStoppingCondition     = 1e-10;
  double DerivativeStoppingCondition = 1e-8;
  double MaxCGIterations = 30000;
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1);
  ActionLevel<HMCWrapper::Field> Level2(8);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  typedef SchurDiagMooeeOperator<FermionActionF,FermionFieldF> LinearOperatorF;
  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
  typedef SchurDiagMooeeDagOperator<FermionActionF,FermionFieldF> LinearOperatorDagF;
  typedef SchurDiagMooeeDagOperator<FermionAction ,FermionField > LinearOperatorDagD;
  typedef SchurDiagMooeeOperator<FermionEOFAActionF,FermionFieldF> LinearOperatorEOFAF;
  typedef SchurDiagMooeeOperator<FermionEOFAAction ,FermionField > LinearOperatorEOFAD;
  typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusFermionD,MobiusFermionF,LinearOperatorD,LinearOperatorF> MxPCG;
  typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusFermionD,MobiusFermionF,LinearOperatorDagD,LinearOperatorDagF> MxDagPCG;
  typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusEOFAFermionD,MobiusEOFAFermionF,LinearOperatorEOFAD,LinearOperatorEOFAF> MxPCG_EOFA;
  // DJM: setup for EOFA ratio (Mobius)
  OneFlavourRationalParams OFRp;
  OFRp.lo       = 0.1;
  OFRp.hi       = 25.0;
  OFRp.MaxIter  = 10000;
  OFRp.tolerance= 1.0e-9;
  OFRp.degree   = 14;
  OFRp.precision= 50;
  MobiusEOFAFermionR Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
  MobiusEOFAFermionF Strange_Op_LF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
  MobiusEOFAFermionR Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
  MobiusEOFAFermionF Strange_Op_RF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
  ConjugateGradient<FermionField>      ActionCG(ActionStoppingCondition,MaxCGIterations);
  ConjugateGradient<FermionField>  DerivativeCG(DerivativeStoppingCondition,MaxCGIterations);
 #ifdef MIXED_PRECISION
  const int MX_inner = 1000;
  const RealD MX_tol = 1.0e-4;
  // Mixed precision EOFA
  LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L);
  LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R);
  LinearOperatorEOFAF Strange_LinOp_LF(Strange_Op_LF);
  LinearOperatorEOFAF Strange_LinOp_RF(Strange_Op_RF);
  MxPCG_EOFA ActionCGL(ActionStoppingCondition,MX_tol,
 		       MX_inner,
 		       MaxCGIterations,
 		       FrbGridF,
 		       Strange_Op_LF,Strange_Op_L,
 		       Strange_LinOp_LF,Strange_LinOp_L);
  MxPCG_EOFA DerivativeCGL(DerivativeStoppingCondition,MX_tol,
 			   MX_inner,
 			   MaxCGIterations,
 			   FrbGridF,
 			   Strange_Op_LF,Strange_Op_L,
 			   Strange_LinOp_LF,Strange_LinOp_L);
  MxPCG_EOFA ActionCGR(ActionStoppingCondition,MX_tol,
 		       MX_inner,
 		       MaxCGIterations,
 		       FrbGridF,
 		       Strange_Op_RF,Strange_Op_R,
 		       Strange_LinOp_RF,Strange_LinOp_R);
  MxPCG_EOFA DerivativeCGR(DerivativeStoppingCondition,MX_tol,
 			   MX_inner,
 			   MaxCGIterations,
 			   FrbGridF,
 			   Strange_Op_RF,Strange_Op_R,
 			   Strange_LinOp_RF,Strange_LinOp_R);
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
    EOFA(Strange_Op_L, Strange_Op_R, 
 	 ActionCG, 
 	 ActionCGL, ActionCGR,
 	 DerivativeCGL, DerivativeCGR,
 	 OFRp, true);
 #else
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
    EOFA(Strange_Op_L, Strange_Op_R, 
 	 ActionCG,
 	 ActionCG, ActionCG,
 	 DerivativeCG, DerivativeCG, 
 	 OFRp, true);
 #endif
  Level1.push_back(&EOFA);
  ////////////////////////////////////
  // up down action
  ////////////////////////////////////
  std::vector<Real> light_den;
  std::vector<Real> light_num;
  int n_hasenbusch = hasenbusch.size();
  light_den.push_back(light_mass);
  for(int h=0;h<n_hasenbusch;h++){
    light_den.push_back(hasenbusch[h]);
    light_num.push_back(hasenbusch[h]);
  }
  light_num.push_back(pv_mass);
  //////////////////////////////////////////////////////////////
  // Forced to replicate the MxPCG and DenominatorsF etc.. because
  // there is no convenient way to "Clone" physics params from double op
  // into single op for any operator pair.
  // Same issue prevents using MxPCG in the Heatbath step
  //////////////////////////////////////////////////////////////
  std::vector<FermionAction *> Numerators;
  std::vector<FermionAction *> Denominators;
  std::vector<TwoFlavourRatioEO4DPseudoFermionAction<FermionImplPolicy> *> Quotients;
  std::vector<MxPCG *> ActionMPCG;
  std::vector<MxPCG *> MPCG;
  std::vector<MxDagPCG *> MPCGdag;
  std::vector<FermionActionF *> DenominatorsF;
  std::vector<LinearOperatorD *> LinOpD;
  std::vector<LinearOperatorF *> LinOpF; 
  std::vector<LinearOperatorDagD *> LinOpDagD;
  std::vector<LinearOperatorDagF *> LinOpDagF; 
  for(int h=0;h<n_hasenbusch+1;h++){
    std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl;
    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
 #ifdef MIXED_PRECISION
    ////////////////////////////////////////////////////////////////////////////
    // Mixed precision CG for 2f force
    ////////////////////////////////////////////////////////////////////////////
    DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsF));
    LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
    LinOpF.push_back(new LinearOperatorF(*DenominatorsF[h]));
    LinOpDagD.push_back(new LinearOperatorDagD(*Denominators[h]));
    LinOpDagF.push_back(new LinearOperatorDagF(*DenominatorsF[h]));
    MPCG.push_back(new MxPCG(DerivativeStoppingCondition,MX_tol,
 			     MX_inner,
 			     MaxCGIterations,
 			     FrbGridF,
 			     *DenominatorsF[h],*Denominators[h],
 			     *LinOpF[h], *LinOpD[h]) );
    MPCGdag.push_back(new MxDagPCG(DerivativeStoppingCondition,MX_tol,
 				   MX_inner,
 				   MaxCGIterations,
 				   FrbGridF,
 				   *DenominatorsF[h],*Denominators[h],
 				   *LinOpDagF[h], *LinOpDagD[h]) );
    ActionMPCG.push_back(new MxPCG(ActionStoppingCondition,MX_tol,
 				   MX_inner,
 				   MaxCGIterations,
 				   FrbGridF,
 				   *DenominatorsF[h],*Denominators[h],
 				   *LinOpF[h], *LinOpD[h]) );
    // Heatbath not mixed yet. As inverts numerators not so important as raised mass.
    Quotients.push_back (new TwoFlavourRatioEO4DPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],*MPCG[h],*MPCGdag[h],*ActionMPCG[h],ActionCG));
 #else
    ////////////////////////////////////////////////////////////////////////////
    // Standard CG for 2f force
    ////////////////////////////////////////////////////////////////////////////
    Quotients.push_back   (new TwoFlavourRatioEO4DPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],DerivativeCG,ActionCG));
 #endif
  }
  for(int h=0;h<n_hasenbusch+1;h++){
    Level1.push_back(Quotients[h]);
  }
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level2.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  /////////////////////////////////////////////////////////////
  // HMC parameters are serialisable
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.Run();  // no smearing
  Grid_finalize();
 } // main
--- a/HMC/Mobius2p1fEOFA_C1M.cc
+++ b/HMC/Mobius2p1fEOFA_C1M.cc
@ -0,0 +1,312 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: 
 Copyright (C) 2015-2016
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Guido Cossu
 Author: David Murphy
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 #define MIXED_PRECISION
 #endif
 #include <Grid/qcd/utils/MixedPrecisionOperatorFunction.h>
 int main(int argc, char **argv) {
  using namespace Grid;
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
   // Typedefs to simplify notation
  typedef WilsonImplR FermionImplPolicy;
  typedef MobiusFermionR FermionAction;
  typedef MobiusFermionF FermionActionF;
  typedef MobiusEOFAFermionR FermionEOFAAction;
  typedef MobiusEOFAFermionF FermionEOFAActionF;
  typedef typename FermionAction::FermionField FermionField;
  typedef typename FermionActionF::FermionField FermionFieldF;
  typedef Grid::XmlReader       Serialiser;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper; 
  //  MD.name    = std::string("Leap Frog");
  typedef GenericHMCRunner<ForceGradient> HMCWrapper; 
  HMCparameters HMCparams;
  {
    XmlReader  HMCrd("HMCparameters.xml");
    read(HMCrd,"HMCparameters",HMCparams);
    std::cout << GridLogMessage<< HMCparams <<std::endl;
  }
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_C1M_lat";
  CPparams.rng_prefix    = "ckpoint_C1M_rng";
  CPparams.saveInterval  = 1;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  // Construct observables
  // here there is too much indirection 
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  const int Ls      = 24;
  Real beta         = 2.13;
  Real light_mass   = 0.005;
  Real strange_mass = 0.0362;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  RealD b   = 1.5; 
  RealD c   = 0.5;
  std::vector<Real> hasenbusch({ 0.02, 0.2, 0.6 });
  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
  Coordinate latt  = GridDefaultLatt();
  Coordinate mpi   = GridDefaultMpi();
  Coordinate simdF = GridDefaultSimd(Nd,vComplexF::Nsimd());
  Coordinate simdD = GridDefaultSimd(Nd,vComplexD::Nsimd());
  auto GridPtrF   = SpaceTimeGrid::makeFourDimGrid(latt,simdF,mpi);
  auto GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF);
  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF);
  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF);
  IwasakiGaugeActionR GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeField U(GridPtr);
  LatticeGaugeFieldF UF(GridPtrF);
  // These lines are unecessary if BC are all periodic
  std::vector<Complex> boundary = {1,1,1,-1};
  FermionAction::ImplParams Params(boundary);
  FermionActionF::ImplParams ParamsF(boundary);
  double ActionStoppingCondition     = 1e-10;
  double DerivativeStoppingCondition = 1e-8;
  double MaxCGIterations = 30000;
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1);
  ActionLevel<HMCWrapper::Field> Level2(8);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  typedef SchurDiagMooeeOperator<FermionActionF,FermionFieldF> LinearOperatorF;
  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
  typedef SchurDiagMooeeOperator<FermionEOFAActionF,FermionFieldF> LinearOperatorEOFAF;
  typedef SchurDiagMooeeOperator<FermionEOFAAction ,FermionField > LinearOperatorEOFAD;
  typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusFermionD,MobiusFermionF,LinearOperatorD,LinearOperatorF> MxPCG;
  typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusEOFAFermionD,MobiusEOFAFermionF,LinearOperatorEOFAD,LinearOperatorEOFAF> MxPCG_EOFA;
  // DJM: setup for EOFA ratio (Mobius)
  OneFlavourRationalParams OFRp;
  OFRp.lo       = 0.1;
  OFRp.hi       = 25.0;
  OFRp.MaxIter  = 10000;
  OFRp.tolerance= 1.0e-9;
  OFRp.degree   = 14;
  OFRp.precision= 50;
  MobiusEOFAFermionR Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
  MobiusEOFAFermionF Strange_Op_LF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
  MobiusEOFAFermionR Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
  MobiusEOFAFermionF Strange_Op_RF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
  ConjugateGradient<FermionField>      ActionCG(ActionStoppingCondition,MaxCGIterations);
  ConjugateGradient<FermionField>  DerivativeCG(DerivativeStoppingCondition,MaxCGIterations);
 #ifdef MIXED_PRECISION
  const int MX_inner = 1000;
  const RealD MX_tol = 1.0e-6;
  // Mixed precision EOFA
  LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L);
  LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R);
  LinearOperatorEOFAF Strange_LinOp_LF(Strange_Op_LF);
  LinearOperatorEOFAF Strange_LinOp_RF(Strange_Op_RF);
  MxPCG_EOFA ActionCGL(ActionStoppingCondition,MX_tol,
 		       MX_inner,
 		       MaxCGIterations,
 		       FrbGridF,
 		       Strange_Op_LF,Strange_Op_L,
 		       Strange_LinOp_LF,Strange_LinOp_L);
  MxPCG_EOFA DerivativeCGL(DerivativeStoppingCondition,MX_tol,
 			   MX_inner,
 			   MaxCGIterations,
 			   FrbGridF,
 			   Strange_Op_LF,Strange_Op_L,
 			   Strange_LinOp_LF,Strange_LinOp_L);
  MxPCG_EOFA ActionCGR(ActionStoppingCondition,MX_tol,
 		       MX_inner,
 		       MaxCGIterations,
 		       FrbGridF,
 		       Strange_Op_RF,Strange_Op_R,
 		       Strange_LinOp_RF,Strange_LinOp_R);
  MxPCG_EOFA DerivativeCGR(DerivativeStoppingCondition,MX_tol,
 			   MX_inner,
 			   MaxCGIterations,
 			   FrbGridF,
 			   Strange_Op_RF,Strange_Op_R,
 			   Strange_LinOp_RF,Strange_LinOp_R);
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
    EOFA(Strange_Op_L, Strange_Op_R, 
 	 ActionCG, 
 	 ActionCGL, ActionCGR,
 	 DerivativeCGL, DerivativeCGR,
 	 OFRp, true);
 #else
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
    EOFA(Strange_Op_L, Strange_Op_R, 
 	 ActionCG,
 	 ActionCG, ActionCG,
 	 DerivativeCG, DerivativeCG, 
 	 OFRp, true);
 #endif
  Level1.push_back(&EOFA);
  ////////////////////////////////////
  // up down action
  ////////////////////////////////////
  std::vector<Real> light_den;
  std::vector<Real> light_num;
  int n_hasenbusch = hasenbusch.size();
  light_den.push_back(light_mass);
  for(int h=0;h<n_hasenbusch;h++){
    light_den.push_back(hasenbusch[h]);
    light_num.push_back(hasenbusch[h]);
  }
  light_num.push_back(pv_mass);
  //////////////////////////////////////////////////////////////
  // Forced to replicate the MxPCG and DenominatorsF etc.. because
  // there is no convenient way to "Clone" physics params from double op
  // into single op for any operator pair.
  // Same issue prevents using MxPCG in the Heatbath step
  //////////////////////////////////////////////////////////////
  std::vector<FermionAction *> Numerators;
  std::vector<FermionAction *> Denominators;
  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
  std::vector<MxPCG *> ActionMPCG;
  std::vector<MxPCG *> MPCG;
  std::vector<FermionActionF *> DenominatorsF;
  std::vector<LinearOperatorD *> LinOpD;
  std::vector<LinearOperatorF *> LinOpF; 
  for(int h=0;h<n_hasenbusch+1;h++){
    std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl;
    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
 #ifdef MIXED_PRECISION
    ////////////////////////////////////////////////////////////////////////////
    // Mixed precision CG for 2f force
    ////////////////////////////////////////////////////////////////////////////
    DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsF));
    LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
    LinOpF.push_back(new LinearOperatorF(*DenominatorsF[h]));
    MPCG.push_back(new MxPCG(DerivativeStoppingCondition,MX_tol,
 			     MX_inner,
 			     MaxCGIterations,
 			     FrbGridF,
 			     *DenominatorsF[h],*Denominators[h],
 			     *LinOpF[h], *LinOpD[h]) );
    ActionMPCG.push_back(new MxPCG(ActionStoppingCondition,MX_tol,
 				   MX_inner,
 				   MaxCGIterations,
 				   FrbGridF,
 				   *DenominatorsF[h],*Denominators[h],
 				   *LinOpF[h], *LinOpD[h]) );
    // Heatbath not mixed yet. As inverts numerators not so important as raised mass.
    Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],ActionCG));
 #else
    ////////////////////////////////////////////////////////////////////////////
    // Standard CG for 2f force
    ////////////////////////////////////////////////////////////////////////////
    Quotients.push_back   (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],DerivativeCG,ActionCG));
 #endif
  }
  for(int h=0;h<n_hasenbusch+1;h++){
    Level1.push_back(Quotients[h]);
  }
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level2.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  /////////////////////////////////////////////////////////////
  // HMC parameters are serialisable
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.Run();  // no smearing
  Grid_finalize();
 } // main
--- a/HMC/Mobius2p1fEOFA_F1.cc
+++ b/HMC/Mobius2p1fEOFA_F1.cc
@ -34,8 +34,6 @@ directory
 #define MIXED_PRECISION
 #endif
 NAMESPACE_BEGIN(Grid);
  /*
   * Need a plan for gauge field update for mixed precision in HMC                      (2x speed up)
   *    -- Store the single prec action operator.
@ -43,111 +41,7 @@ NAMESPACE_BEGIN(Grid);
   *    -- Build the mixed precision operator dynamically from the passed operator and single prec clone.
   */
-  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
+#include <Grid/qcd/utils/MixedPrecisionOperatorFunction.h>
  class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
  public:
    typedef typename FermionOperatorD::FermionField FieldD;
    typedef typename FermionOperatorF::FermionField FieldF;
    using OperatorFunction<FieldD>::operator();
    RealD   Tolerance;
    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
    Integer MaxOuterIterations;
    GridBase* SinglePrecGrid4; //Grid for single-precision fields
    GridBase* SinglePrecGrid5; //Grid for single-precision fields
    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
    FermionOperatorF &FermOpF;
    FermionOperatorD &FermOpD;;
    SchurOperatorF &LinOpF;
    SchurOperatorD &LinOpD;
    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
    MixedPrecisionConjugateGradientOperatorFunction(RealD tol, 
 						    Integer maxinnerit, 
 						    Integer maxouterit, 
 						    GridBase* _sp_grid4, 
 						    GridBase* _sp_grid5, 
 						    FermionOperatorF &_FermOpF,
 						    FermionOperatorD &_FermOpD,
 						    SchurOperatorF   &_LinOpF,
 						    SchurOperatorD   &_LinOpD): 
      LinOpF(_LinOpF),
      LinOpD(_LinOpD),
      FermOpF(_FermOpF),
      FermOpD(_FermOpD),
      Tolerance(tol), 
      InnerTolerance(tol), 
      MaxInnerIterations(maxinnerit), 
      MaxOuterIterations(maxouterit), 
      SinglePrecGrid4(_sp_grid4),
      SinglePrecGrid5(_sp_grid5),
      OuterLoopNormMult(100.) 
    { 
      /* Debugging instances of objects; references are stored
      std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpF " <<std::hex<< &LinOpF<<std::dec <<std::endl;
      std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpD " <<std::hex<< &LinOpD<<std::dec <<std::endl;
      std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpF " <<std::hex<< &FermOpF<<std::dec <<std::endl;
      std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpD " <<std::hex<< &FermOpD<<std::dec <<std::endl;
      */
    };
    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
      //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpU " <<std::hex<< &(SchurOpU->_Mat)<<std::dec <<std::endl;
      //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpD " <<std::hex<< &(LinOpD._Mat) <<std::dec <<std::endl;
      // Assumption made in code to extract gauge field
      // We could avoid storing LinopD reference alltogether ?
      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
      ////////////////////////////////////////////////////////////////////////////////////
      // Must snarf a single precision copy of the gauge field in Linop_d argument
      ////////////////////////////////////////////////////////////////////////////////////
      typedef typename FermionOperatorF::GaugeField GaugeFieldF;
      typedef typename FermionOperatorF::GaugeLinkField GaugeLinkFieldF;
      typedef typename FermionOperatorD::GaugeField GaugeFieldD;
      typedef typename FermionOperatorD::GaugeLinkField GaugeLinkFieldD;
      GridBase * GridPtrF = SinglePrecGrid4;
      GridBase * GridPtrD = FermOpD.Umu.Grid();
      GaugeFieldF     U_f  (GridPtrF);
      GaugeLinkFieldF Umu_f(GridPtrF);
      //      std::cout << " Dim gauge field "<<GridPtrF->Nd()<<std::endl; // 4d
      //      std::cout << " Dim gauge field "<<GridPtrD->Nd()<<std::endl; // 4d
      ////////////////////////////////////////////////////////////////////////////////////
      // Moving this to a Clone method of fermion operator would allow to duplicate the 
      // physics parameters and decrease gauge field copies
      ////////////////////////////////////////////////////////////////////////////////////
      GaugeLinkFieldD Umu_d(GridPtrD);
      for(int mu=0;mu<Nd*2;mu++){ 
 	Umu_d = PeekIndex<LorentzIndex>(FermOpD.Umu, mu);
 	precisionChange(Umu_f,Umu_d);
 	PokeIndex<LorentzIndex>(FermOpF.Umu, Umu_f, mu);
      }
      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
      ////////////////////////////////////////////////////////////////////////////////////
      // Make a mixed precision conjugate gradient
      ////////////////////////////////////////////////////////////////////////////////////
      MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
      std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
      MPCG(src,psi);
    }
  };
 NAMESPACE_END(Grid);
 int main(int argc, char **argv) {
  using namespace Grid;
@ -290,6 +184,7 @@ int main(int argc, char **argv) {
  ConjugateGradient<FermionField>  DerivativeCG(DerivativeStoppingCondition,MaxCGIterations);
 #ifdef MIXED_PRECISION
  const int MX_inner = 5000;
  const RealD MX_tol = 1.0e-6;
  // Mixed precision EOFA
  LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L);
@ -297,34 +192,30 @@ int main(int argc, char **argv) {
  LinearOperatorEOFAF Strange_LinOp_LF(Strange_Op_LF);
  LinearOperatorEOFAF Strange_LinOp_RF(Strange_Op_RF);
-  MxPCG_EOFA ActionCGL(ActionStoppingCondition,
+  MxPCG_EOFA ActionCGL(ActionStoppingCondition, MX_tol,
 		       MX_inner,
 		       MaxCGIterations,
 		       GridPtrF,
 		       FrbGridF,
 		       Strange_Op_LF,Strange_Op_L,
 		       Strange_LinOp_LF,Strange_LinOp_L);
-  MxPCG_EOFA DerivativeCGL(DerivativeStoppingCondition,
+  MxPCG_EOFA DerivativeCGL(DerivativeStoppingCondition, MX_tol,
 			   MX_inner,
 			   MaxCGIterations,
 			   GridPtrF,
 			   FrbGridF,
 			   Strange_Op_LF,Strange_Op_L,
 			   Strange_LinOp_LF,Strange_LinOp_L);
-  MxPCG_EOFA ActionCGR(ActionStoppingCondition,
+  MxPCG_EOFA ActionCGR(ActionStoppingCondition, MX_tol,
 		       MX_inner,
 		       MaxCGIterations,
 		       GridPtrF,
 		       FrbGridF,
 		       Strange_Op_RF,Strange_Op_R,
 		       Strange_LinOp_RF,Strange_LinOp_R);
-  MxPCG_EOFA DerivativeCGR(DerivativeStoppingCondition,
+  MxPCG_EOFA DerivativeCGR(DerivativeStoppingCondition, MX_tol,
 			   MX_inner,
 			   MaxCGIterations,
 			   GridPtrF,
 			   FrbGridF,
 			   Strange_Op_RF,Strange_Op_R,
 			   Strange_LinOp_RF,Strange_LinOp_R);
@ -394,18 +285,16 @@ int main(int argc, char **argv) {
    double conv  = DerivativeStoppingCondition;
    if (h<3) conv= DerivativeStoppingConditionLoose; // Relax on first two hasenbusch factors
-    MPCG.push_back(new MxPCG(conv,
+    MPCG.push_back(new MxPCG(conv,MX_tol,
 			     MX_inner,
 			     MaxCGIterations,
 			     GridPtrF,
 			     FrbGridF,
 			     *DenominatorsF[h],*Denominators[h],
 			     *LinOpF[h], *LinOpD[h]) );
-    ActionMPCG.push_back(new MxPCG(ActionStoppingCondition,
+    ActionMPCG.push_back(new MxPCG(ActionStoppingCondition,MX_tol,
 				   MX_inner,
 				   MaxCGIterations,
 				   GridPtrF,
 				   FrbGridF,
 				   *DenominatorsF[h],*Denominators[h],
 				   *LinOpF[h], *LinOpD[h]) );
--- a/HMC/Mobius2p1fEOFA_M1M.cc
+++ b/HMC/Mobius2p1fEOFA_M1M.cc
@ -0,0 +1,318 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: 
 Copyright (C) 2015-2016
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Guido Cossu
 Author: David Murphy
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 #define MIXED_PRECISION
 #endif
 #include <Grid/qcd/utils/MixedPrecisionOperatorFunction.h>
 int main(int argc, char **argv) {
  using namespace Grid;
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
   // Typedefs to simplify notation
  typedef WilsonImplR FermionImplPolicy;
  typedef MobiusFermionR FermionAction;
  typedef MobiusFermionF FermionActionF;
  typedef MobiusEOFAFermionR FermionEOFAAction;
  typedef MobiusEOFAFermionF FermionEOFAActionF;
  typedef typename FermionAction::FermionField FermionField;
  typedef typename FermionActionF::FermionField FermionFieldF;
  typedef Grid::XmlReader       Serialiser;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  //  IntegratorParameters MD;
  typedef GenericHMCRunner<ForceGradient> HMCWrapper; 
  //  MD.name    = std::string("Force Gradient");
  //typedef GenericHMCRunner<LeapFrog> HMCWrapper; 
  //MD.name    = std::string("Leap Frog");
  //typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; 
  //MD.name    = std::string("MinimumNorm2");
  //  MD.MDsteps = 15;
  //  MD.trajL   = 1.0;
  HMCparameters HMCparams;
  {
    XmlReader  HMCrd("HMCparameters.xml");
    read(HMCrd,"HMCparameters",HMCparams);
    std::cout << GridLogMessage<< HMCparams <<std::endl;
  }
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_M1M_lat";
  CPparams.rng_prefix    = "ckpoint_M1M_rng";
  CPparams.saveInterval  = 1;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  // Construct observables
  // here there is too much indirection 
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  const int Ls      = 12;
  Real beta         = 2.25;
  Real light_mass   = 0.004;
  Real strange_mass = 0.02661;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  RealD b   = 1.5; 
  RealD c   = 0.5;
  std::vector<Real> hasenbusch({ 0.02, 0.2, 0.6 });
  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
  Coordinate latt  = GridDefaultLatt();
  Coordinate mpi   = GridDefaultMpi();
  Coordinate simdF = GridDefaultSimd(Nd,vComplexF::Nsimd());
  Coordinate simdD = GridDefaultSimd(Nd,vComplexD::Nsimd());
  auto GridPtrF   = SpaceTimeGrid::makeFourDimGrid(latt,simdF,mpi);
  auto GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF);
  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF);
  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF);
  IwasakiGaugeActionR GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeField U(GridPtr);
  LatticeGaugeFieldF UF(GridPtrF);
  // These lines are unecessary if BC are all periodic
  std::vector<Complex> boundary = {1,1,1,-1};
  FermionAction::ImplParams Params(boundary);
  FermionActionF::ImplParams ParamsF(boundary);
  double ActionStoppingCondition     = 1e-10;
  double DerivativeStoppingCondition = 1e-8;
  double MaxCGIterations = 30000;
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1);
  ActionLevel<HMCWrapper::Field> Level2(8);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  typedef SchurDiagMooeeOperator<FermionActionF,FermionFieldF> LinearOperatorF;
  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
  typedef SchurDiagMooeeOperator<FermionEOFAActionF,FermionFieldF> LinearOperatorEOFAF;
  typedef SchurDiagMooeeOperator<FermionEOFAAction ,FermionField > LinearOperatorEOFAD;
  typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusFermionD,MobiusFermionF,LinearOperatorD,LinearOperatorF> MxPCG;
  typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusEOFAFermionD,MobiusEOFAFermionF,LinearOperatorEOFAD,LinearOperatorEOFAF> MxPCG_EOFA;
  // DJM: setup for EOFA ratio (Mobius)
  OneFlavourRationalParams OFRp;
  OFRp.lo       = 0.1;
  OFRp.hi       = 25.0;
  OFRp.MaxIter  = 10000;
  OFRp.tolerance= 1.0e-9;
  OFRp.degree   = 14;
  OFRp.precision= 50;
  MobiusEOFAFermionR Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
  MobiusEOFAFermionF Strange_Op_LF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
  MobiusEOFAFermionR Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
  MobiusEOFAFermionF Strange_Op_RF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
  ConjugateGradient<FermionField>      ActionCG(ActionStoppingCondition,MaxCGIterations);
  ConjugateGradient<FermionField>  DerivativeCG(DerivativeStoppingCondition,MaxCGIterations);
 #ifdef MIXED_PRECISION
  const int MX_inner = 1000;
  const RealD MX_tol = 1.0e-6;
  // Mixed precision EOFA
  LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L);
  LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R);
  LinearOperatorEOFAF Strange_LinOp_LF(Strange_Op_LF);
  LinearOperatorEOFAF Strange_LinOp_RF(Strange_Op_RF);
  MxPCG_EOFA ActionCGL(ActionStoppingCondition,MX_tol,
 		       MX_inner,
 		       MaxCGIterations,
 		       FrbGridF,
 		       Strange_Op_LF,Strange_Op_L,
 		       Strange_LinOp_LF,Strange_LinOp_L);
  MxPCG_EOFA DerivativeCGL(DerivativeStoppingCondition,MX_tol,
 			   MX_inner,
 			   MaxCGIterations,
 			   FrbGridF,
 			   Strange_Op_LF,Strange_Op_L,
 			   Strange_LinOp_LF,Strange_LinOp_L);
  MxPCG_EOFA ActionCGR(ActionStoppingCondition,MX_tol,
 		       MX_inner,
 		       MaxCGIterations,
 		       FrbGridF,
 		       Strange_Op_RF,Strange_Op_R,
 		       Strange_LinOp_RF,Strange_LinOp_R);
  MxPCG_EOFA DerivativeCGR(DerivativeStoppingCondition,MX_tol,
 			   MX_inner,
 			   MaxCGIterations,
 			   FrbGridF,
 			   Strange_Op_RF,Strange_Op_R,
 			   Strange_LinOp_RF,Strange_LinOp_R);
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
    EOFA(Strange_Op_L, Strange_Op_R, 
 	 ActionCG, 
 	 ActionCGL, ActionCGR,
 	 DerivativeCGL, DerivativeCGR,
 	 OFRp, true);
 #else
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
    EOFA(Strange_Op_L, Strange_Op_R, 
 	 ActionCG,
 	 ActionCG, ActionCG,
 	 DerivativeCG, DerivativeCG, 
 	 OFRp, true);
 #endif
  Level1.push_back(&EOFA);
  ////////////////////////////////////
  // up down action
  ////////////////////////////////////
  std::vector<Real> light_den;
  std::vector<Real> light_num;
  int n_hasenbusch = hasenbusch.size();
  light_den.push_back(light_mass);
  for(int h=0;h<n_hasenbusch;h++){
    light_den.push_back(hasenbusch[h]);
    light_num.push_back(hasenbusch[h]);
  }
  light_num.push_back(pv_mass);
  //////////////////////////////////////////////////////////////
  // Forced to replicate the MxPCG and DenominatorsF etc.. because
  // there is no convenient way to "Clone" physics params from double op
  // into single op for any operator pair.
  // Same issue prevents using MxPCG in the Heatbath step
  //////////////////////////////////////////////////////////////
  std::vector<FermionAction *> Numerators;
  std::vector<FermionAction *> Denominators;
  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
  std::vector<MxPCG *> ActionMPCG;
  std::vector<MxPCG *> MPCG;
  std::vector<FermionActionF *> DenominatorsF;
  std::vector<LinearOperatorD *> LinOpD;
  std::vector<LinearOperatorF *> LinOpF; 
  for(int h=0;h<n_hasenbusch+1;h++){
    std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl;
    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
 #ifdef MIXED_PRECISION
    ////////////////////////////////////////////////////////////////////////////
    // Mixed precision CG for 2f force
    ////////////////////////////////////////////////////////////////////////////
    DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsF));
    LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
    LinOpF.push_back(new LinearOperatorF(*DenominatorsF[h]));
    MPCG.push_back(new MxPCG(DerivativeStoppingCondition,MX_tol,
 			     MX_inner,
 			     MaxCGIterations,
 			     FrbGridF,
 			     *DenominatorsF[h],*Denominators[h],
 			     *LinOpF[h], *LinOpD[h]) );
    ActionMPCG.push_back(new MxPCG(ActionStoppingCondition,MX_tol,
 				   MX_inner,
 				   MaxCGIterations,
 				   FrbGridF,
 				   *DenominatorsF[h],*Denominators[h],
 				   *LinOpF[h], *LinOpD[h]) );
    // Heatbath not mixed yet. As inverts numerators not so important as raised mass.
    Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],ActionCG));
 #else
    ////////////////////////////////////////////////////////////////////////////
    // Standard CG for 2f force
    ////////////////////////////////////////////////////////////////////////////
    Quotients.push_back   (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],DerivativeCG,ActionCG));
 #endif
  }
  for(int h=0;h<n_hasenbusch+1;h++){
    Level1.push_back(Quotients[h]);
  }
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level2.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  /////////////////////////////////////////////////////////////
  // HMC parameters are serialisable
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.Run();  // no smearing
  Grid_finalize();
 } // main
--- a/HMC/Mobius2p1fRHMC.cc
+++ b/HMC/Mobius2p1fRHMC.cc
@ -52,16 +52,16 @@ int main(int argc, char **argv) {
  //  MD.name    = std::string("Force Gradient");
  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
  MD.name    = std::string("MinimumNorm2");
-  MD.MDsteps = 20;
+  MD.MDsteps = 12;
  MD.trajL   = 1.0;
  HMCparameters HMCparams;
-  HMCparams.StartTrajectory  = 0;
+  HMCparams.StartTrajectory  = 139;
  HMCparams.Trajectories     = 200;
  HMCparams.NoMetropolisUntil=  0;
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
-  HMCparams.StartingType     =std::string("ColdStart");
+  //  HMCparams.StartingType     =std::string("ColdStart");
-  //  HMCparams.StartingType     =std::string("CheckpointStart");
+  HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
@ -71,7 +71,7 @@ int main(int argc, char **argv) {
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_EODWF_lat";
  CPparams.rng_prefix    = "ckpoint_EODWF_rng";
-  CPparams.saveInterval  = 10;
+  CPparams.saveInterval  = 1;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
@ -130,7 +130,7 @@ int main(int argc, char **argv) {
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1);
-  ActionLevel<HMCWrapper::Field> Level2(4);
+  ActionLevel<HMCWrapper::Field> Level2(8);
  ////////////////////////////////////
  // Strange action
--- a/HMC/Mobius2p1fRHMC_4dPseudoFermion.cc
+++ b/HMC/Mobius2p1fRHMC_4dPseudoFermion.cc
@ -0,0 +1,197 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./tests/Test_hmc_EODWFRatio.cc
 Copyright (C) 2015-2016
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 int main(int argc, char **argv) {
  using namespace Grid;
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
   // Typedefs to simplify notation
  typedef WilsonImplR FermionImplPolicy;
  typedef MobiusFermionR FermionAction;
  typedef typename FermionAction::FermionField FermionField;
  typedef Grid::XmlReader       Serialiser;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
  //  MD.name    = std::string("Leap Frog");
  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
  //  MD.name    = std::string("Force Gradient");
  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps = 10;
  MD.trajL   = 1.0;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = 137;
  HMCparams.Trajectories     = 200;
  HMCparams.NoMetropolisUntil=  0;
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
  //  HMCparams.StartingType     =std::string("ColdStart");
  HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_EODWF_lat";
  CPparams.rng_prefix    = "ckpoint_EODWF_rng";
  CPparams.saveInterval  = 1;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  // Construct observables
  // here there is too much indirection
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  const int Ls      = 16;
  Real beta         = 2.13;
  Real light_mass   = 0.01;
  Real strange_mass = 0.04;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  RealD b   = 1.0;
  RealD c   = 0.0;
  // FIXME:
  // Same in MC and MD
  // Need to mix precision too
  OneFlavourRationalParams OFRp;
  OFRp.lo       = 4.0e-3;
  OFRp.hi       = 30.0;
  OFRp.MaxIter  = 10000;
  OFRp.tolerance= 1.0e-10;
  OFRp.degree   = 16;
  OFRp.precision= 50;
  std::vector<Real> hasenbusch({ 0.1 });
  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
  IwasakiGaugeActionR GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeField U(GridPtr);
  // These lines are unecessary if BC are all periodic
  std::vector<Complex> boundary = {1,1,1,-1};
  FermionAction::ImplParams Params(boundary);
  double StoppingCondition = 1e-10;
  double MaxCGIterations = 30000;
  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1);
  ActionLevel<HMCWrapper::Field> Level2(8);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  //  FermionAction StrangeOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_mass,M5,b,c, Params);
  //  DomainWallEOFAFermionR Strange_Op_L(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, shift_L, pm, M5);
  //  DomainWallEOFAFermionR Strange_Op_R(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, shift_R, pm, M5);
  //  ExactOneFlavourRatioPseudoFermionAction EOFA(Strange_Op_L,Strange_Op_R,CG,ofp, false);
  FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params);
  FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass,  M5,b,c, Params);
  OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermion(StrangePauliVillarsOp,StrangeOp,OFRp);
  Level1.push_back(&StrangePseudoFermion);
  ////////////////////////////////////
  // up down action
  ////////////////////////////////////
  std::vector<Real> light_den;
  std::vector<Real> light_num;
  int n_hasenbusch = hasenbusch.size();
  light_den.push_back(light_mass);
  for(int h=0;h<n_hasenbusch;h++){
    light_den.push_back(hasenbusch[h]);
    light_num.push_back(hasenbusch[h]);
  }
  light_num.push_back(pv_mass);
  std::vector<FermionAction *> Numerators;
  std::vector<FermionAction *> Denominators;
  std::vector<TwoFlavourRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
  for(int h=0;h<n_hasenbusch+1;h++){
    std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl;
    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
    Quotients.push_back   (new TwoFlavourRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
  }
  for(int h=0;h<n_hasenbusch+1;h++){
    Level1.push_back(Quotients[h]);
  }
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level2.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  /////////////////////////////////////////////////////////////
  // HMC parameters are serialisable
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.Run();  // no smearing
  Grid_finalize();
 } // main
--- a/HMC/Mobius2p1fRHMC_4dPseudoFermionSchurSolver.cc
+++ b/HMC/Mobius2p1fRHMC_4dPseudoFermionSchurSolver.cc
@ -0,0 +1,198 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./tests/Test_hmc_EODWFRatio.cc
 Copyright (C) 2015-2016
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/qcd/action/pseudofermion/TwoFlavourRatioEO4DPseudoFermion.h>
 #include <Grid/qcd/action/pseudofermion/TwoFlavourRatio4DPseudoFermion.h>
 int main(int argc, char **argv) {
  using namespace Grid;
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
   // Typedefs to simplify notation
  typedef WilsonImplR FermionImplPolicy;
  typedef MobiusFermionR FermionAction;
  typedef typename FermionAction::FermionField FermionField;
  typedef Grid::XmlReader       Serialiser;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
  //  MD.name    = std::string("Leap Frog");
  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
  //  MD.name    = std::string("Force Gradient");
  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps = 12;
  MD.trajL   = 1.0;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = 211;
  HMCparams.Trajectories     = 200;
  HMCparams.NoMetropolisUntil=  0;
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
  //  HMCparams.StartingType     =std::string("ColdStart");
  HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_4dDWF_lat";
  CPparams.rng_prefix    = "ckpoint_4dDWF_rng";
  CPparams.saveInterval  = 1;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  // Construct observables
  // here there is too much indirection
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  const int Ls      = 16;
  Real beta         = 2.13;
  Real light_mass   = 0.01;
  Real strange_mass = 0.04;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  RealD b   = 1.0;
  RealD c   = 0.0;
  // FIXME:
  // Same in MC and MD
  // Need to mix precision too
  OneFlavourRationalParams OFRp;
  OFRp.lo       = 4.0e-3;
  OFRp.hi       = 30.0;
  OFRp.MaxIter  = 10000;
  OFRp.tolerance= 1.0e-10;
  OFRp.degree   = 16;
  OFRp.precision= 50;
  std::vector<Real> hasenbusch({ 0.1 });
  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
  IwasakiGaugeActionR GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeField U(GridPtr);
  // These lines are unecessary if BC are all periodic
  std::vector<Complex> boundary = {1,1,1,-1};
  FermionAction::ImplParams Params(boundary);
  double StoppingCondition = 1e-10;
  double MaxCGIterations = 30000;
  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1);
  ActionLevel<HMCWrapper::Field> Level2(8);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  //  FermionAction StrangeOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_mass,M5,b,c, Params);
  //  DomainWallEOFAFermionR Strange_Op_L(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, shift_L, pm, M5);
  //  DomainWallEOFAFermionR Strange_Op_R(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, shift_R, pm, M5);
  //  ExactOneFlavourRatioPseudoFermionAction EOFA(Strange_Op_L,Strange_Op_R,CG,ofp, false);
  FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params);
  FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass,  M5,b,c, Params);
  OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermion(StrangePauliVillarsOp,StrangeOp,OFRp);
  Level1.push_back(&StrangePseudoFermion);
  ////////////////////////////////////
  // up down action
  ////////////////////////////////////
  std::vector<Real> light_den;
  std::vector<Real> light_num;
  int n_hasenbusch = hasenbusch.size();
  light_den.push_back(light_mass);
  for(int h=0;h<n_hasenbusch;h++){
    light_den.push_back(hasenbusch[h]);
    light_num.push_back(hasenbusch[h]);
  }
  light_num.push_back(pv_mass);
  std::vector<FermionAction *> Numerators;
  std::vector<FermionAction *> Denominators;
  std::vector<TwoFlavourRatioEO4DPseudoFermionAction<FermionImplPolicy> *> Quotients;
  for(int h=0;h<n_hasenbusch+1;h++){
    std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl;
    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
    Quotients.push_back   (new TwoFlavourRatioEO4DPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
  }
  for(int h=0;h<n_hasenbusch+1;h++){
    Level1.push_back(Quotients[h]);
  }
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level2.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  /////////////////////////////////////////////////////////////
  // HMC parameters are serialisable
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.Run();  // no smearing
  Grid_finalize();
 } // main
--- a/10
+++ b/10
@ -1,5 +1,11 @@
-- comms threads issue??
+--
-- Part done: Staggered kernel performance on GPU
+-- Comms threads issue??
 -- Part done: Staggered kernel performance on GPU ; eliminate replicas
 -- Antonin - Nd, Nc generic hide and make Gimpl
 -- DWF 5d RB case / Shamir
 -- 4D pseudofermion options
 -- DDHMC
 --
 =========================================================
 General
--- a/benchmarks/Benchmark_dwf_fp32.cc
+++ b/benchmarks/Benchmark_dwf_fp32.cc
@ -32,18 +32,14 @@
 using namespace std;
 using namespace Grid;
-template<class d>
+typedef DirichletFermionOperator<WilsonImplF> DirichletFermionF;
 struct scal {
  d internal;
 };
-  Gamma::Algebra Gmu [] = {
+Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT
-  };
+};
 int main (int argc, char ** argv)
 {
@ -61,20 +57,17 @@ int main (int argc, char ** argv)
  GridLogLayout();
  Coordinate latt = GridDefaultLatt();
  Coordinate mpi  = GridDefaultMpi();
  Coordinate simd = GridDefaultSimd(Nd,vComplexF::Nsimd());
  long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
-
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt,simd,mpi);
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
@ -292,7 +285,6 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
  // S-direction is INNERMOST and takes no part in the parity.
  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionF::DhopEO                "<<std::endl;
  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
@ -359,6 +351,56 @@ int main (int argc, char ** argv)
  assert(norm2(src_e)<1.0e-4);
  assert(norm2(src_o)<1.0e-4);
  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
  std::cout << GridLogMessage<< "* Benchmarking DirichletFermionF::DhopEO                "<<std::endl;
  std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
 #ifdef GRID_OMP
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
 #endif
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
  // Dirichlet benchmark
  Coordinate local(Nd);
  for(int d=0;d<Nd;d++){
    local[d] = latt[d]/mpi[d];
  }
  std::vector<Complex> boundary = {1,1,1,-1};
  DomainWallFermionF::ImplParams DirichletParams(boundary);
  DirichletParams.locally_periodic=true;
  DomainWallFermionF DwDirichlet(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,DirichletParams);
  DirichletFermionF Dirichlet(DwDirichlet,local);
  Dirichlet.ImportGauge(Umu);
  {
    FGrid->Barrier();
    Dirichlet.DhopEO(src_o,r_e,DaggerNo);
    DwDirichlet.ZeroCounters();
    double t0=usecond();
    for(int i=0;i<ncall;i++){
      Dirichlet.DhopEO(src_o,r_e,DaggerNo);
    }
    double t1=usecond();
    FGrid->Barrier();
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=(single_site_flops*volume*ncall)/2.0;
    std::cout<<GridLogMessage << "DirichletDeo flop "<< flops<<" usec " <<(t1-t0)<<std::endl;
    std::cout<<GridLogMessage << "DirichletDeo mflop/s =   "<< flops/(t1-t0)<<std::endl;
    std::cout<<GridLogMessage << "DirichletDeo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
    std::cout<<GridLogMessage << "DirichletDeo mflop/s per node   "<< flops/(t1-t0)/NN<<std::endl;
    DwDirichlet.Report();
  }
  Grid_finalize();
  exit(0);
 }
--- a/examples/Example_wall_wall_spectrum.cc
+++ b/examples/Example_wall_wall_spectrum.cc
@ -9,6 +9,7 @@ using namespace std;
 using namespace Grid;
 typedef SpinColourMatrix Propagator;
 typedef SpinColourVector Fermion;
 typedef PeriodicGimplR   GimplR;
 template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
 {
@ -55,6 +56,15 @@ void MakePhase(Coordinate mom,LatticeComplex &phase)
  }
  phase = exp(phase*ci);
 }
 void LinkSmear(int nstep, RealD rho,LatticeGaugeField &Uin,LatticeGaugeField &Usmr)
 {
  Smear_Stout<GimplR> Stout(rho);
  LatticeGaugeField Utmp(Uin.Grid());
  Utmp = Uin;
  for(int i=0;i<nstep;i++){
    Stout.smear(Usmr,Utmp);
  }
 }
 void PointSource(Coordinate &coor,LatticePropagator &source)
 {
  //  Coordinate coor({0,0,0,0});
@ -97,23 +107,23 @@ void GaugeFix(LatticeGaugeField &U,LatticeGaugeField &Ufix)
 {
  Real alpha=0.05;
-  Real plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(U);
+  Real plaq=WilsonLoops<GimplR>::avgPlaquette(U);
  std::cout << " Initial plaquette "<<plaq << std::endl;
  LatticeColourMatrix   xform(U.Grid()); 
  Ufix = U;
  int orthog=Nd-1;
-  FourierAcceleratedGaugeFixer<PeriodicGimplR>::SteepestDescentGaugeFix(Ufix,xform,alpha,10000,1.0e-12, 1.0e-12,true,orthog);
+  FourierAcceleratedGaugeFixer<GimplR>::SteepestDescentGaugeFix(Ufix,xform,alpha,10000,1.0e-12, 1.0e-12,true,orthog);
-  plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Ufix);
+  plaq=WilsonLoops<GimplR>::avgPlaquette(Ufix);
  std::cout << " Final plaquette "<<plaq << std::endl;
 }
 template<class Field>
 void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
 {
-  typedef CovariantLaplacianCshift <PeriodicGimplR,Field> Laplacian_t;
+  typedef CovariantLaplacianCshift <GimplR,Field> Laplacian_t;
  Laplacian_t Laplacian(U);
  Integer Iterations = 40;
@ -287,15 +297,10 @@ int main (int argc, char ** argv)
 								   GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  //////////////////////////////////////////////////////////////////////
  // You can manage seeds however you like.
  // Recommend SeedUniqueString.
  //////////////////////////////////////////////////////////////////////
  std::vector<int> seeds4({1,2,3,4}); 
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
  LatticeGaugeField Umu(UGrid);
-  LatticeGaugeField Ufixed(UGrid);
+  LatticeGaugeField Utmp(UGrid);
  LatticeGaugeField Usmr(UGrid);
  std::string config;
  if( argc > 1 && argv[1][0] != '-' )
  {
@ -308,13 +313,17 @@ int main (int argc, char ** argv)
  {
    std::cout<<GridLogMessage <<"Using hot configuration"<<std::endl;
    SU<Nc>::ColdConfiguration(Umu);
-    //    SU<Nc>::HotConfiguration(RNG4,Umu);
+    config="ColdConfig";
    config="HotConfig";
  }
-  GaugeFix(Umu,Ufixed);
+  GaugeFix(Umu,Utmp);
-  Umu=Ufixed;
+  Umu=Utmp;
  int nsmr=3;
  RealD rho=0.1;
  LinkSmear(nsmr,rho,Umu,Usmr);
  std::vector<int>   smeared_link({ 0,0,1} ); 
  std::vector<RealD> masses({ 0.004,0.02477,0.447} ); // u/d, s, c ??
  std::vector<RealD> M5s   ({ 1.8,1.8,1.0} ); 
  std::vector<RealD> bs   ({ 1.0,1.0,1.5} );  // DDM
@ -339,23 +348,29 @@ int main (int argc, char ** argv)
    RealD c    = cs[m];
    int   Ls   = Ls_s[m];
    if ( smeared_link[m] ) Utmp = Usmr;
    else                   Utmp = Umu;
    FGrids.push_back(SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid));
    FrbGrids.push_back(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid));
-    FermActs.push_back(new MobiusFermionR(Umu,*FGrids[m],*FrbGrids[m],*UGrid,*UrbGrid,mass,M5,b,c));
+    FermActs.push_back(new MobiusFermionR(Utmp,*FGrids[m],*FrbGrids[m],*UGrid,*UrbGrid,mass,M5,b,c));
  }
  LatticePropagator point_source(UGrid);
  LatticePropagator z2wall_source(UGrid);
  LatticePropagator gfwall_source(UGrid);
-  Coordinate Origin({0,0,0,0});
+  int tslice = 0;
-  PointSource   (Origin,point_source);
+  //////////////////////////////////////////////////////////////////////
-  Z2WallSource  (RNG4,0,z2wall_source);
+  // RNG seeded for Z2 wall
-  GFWallSource  (0,gfwall_source);
+  //////////////////////////////////////////////////////////////////////
-  
+  // You can manage seeds however you like.
-  std::vector<LatticePropagator> PointProps(nmass,UGrid);
+  // Recommend SeedUniqueString.
-  std::vector<LatticePropagator> GaussProps(nmass,UGrid);
+  //////////////////////////////////////////////////////////////////////
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString("Study2-Source_Z2_p_0_0_0_t_0-880");
  Z2WallSource  (RNG4,tslice,z2wall_source);
  GFWallSource  (tslice,gfwall_source);
  std::vector<LatticePropagator> Z2Props   (nmass,UGrid);
  std::vector<LatticePropagator> GFProps   (nmass,UGrid);
--- a/scripts/hmc.sh
+++ b/scripts/hmc.sh
@ -1,19 +1,27 @@
 #!/bin/bash
 LOG=$1
-SWEEPS=`grep dH $LOG | wc -l`
+SWEEPS=`grep dH.= $LOG | wc -l`
-SWEEPS=`expr $SWEEPS - 80`
+SWEEPS=`expr $SWEEPS - 100`
 echo
 echo $SWEEPS thermalised sweeps
 echo
-plaq=`grep Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$10} END { print S/NR} ' `
+plaq=`grep Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$12} END { print S/NR} ' `
-plaqe=`grep Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$10 ; SS=SS+$10*$10 } END { print sqrt( (SS/NR - S*S/NR/NR)/NR) } ' `
+plaqe=`grep Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$12 ; SS=SS+$12*$12 } END { print sqrt( (SS/NR - S*S/NR/NR)/NR) } ' `
 echo "Plaquette: $plaq (${plaqe})"
 echo
-dHv=`grep dH $LOG | tail -n $SWEEPS | awk '{ S=S+$10 ; SS=SS+$10*$10 } END { print sqrt(SS/NR) } ' `
+grep  Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$12/20; if(NR%20==0){ print NR/20, " ", S; S=0;} } '  > plaq.binned
-edH=`grep dH $LOG | tail -n $SWEEPS | awk '{ S=S+exp(-$10)} END { print S/NR} '`
+
-echo "<e-dH>: $edH"
+plaq=`cat plaq.binned  | awk '{ S=S+$2} END { print S/NR} ' `
 plaqe=`cat plaq.binned | awk '{ S=S+$2 ; SS=SS+$2*$2 } END { print sqrt( (SS/NR - S*S/NR/NR)/NR) } ' `
 echo "Binned Plaquette: $plaq (${plaqe})"
 echo
 dHv=`grep dH.= $LOG | tail -n $SWEEPS | awk '{ S=S+$16 ; SS=SS+$16*$16 } END { print sqrt(SS/NR) } ' `
 edH=`grep dH.= $LOG | tail -n $SWEEPS | awk '{ S=S+exp(-$16)} END { print S/NR} '`
 dedH=`grep dH.= $LOG | tail -n $SWEEPS | awk '{ S=S+exp(-$16); SS=SS+exp(-$16)*exp(-$16)} END { print sqrt( (SS/NR - S*S/NR/NR)/NR) } '`
 echo "<e-dH>: $edH (${dedH})"
 echo "<rms dH>: $dHv"
 TRAJ=`grep Acc $LOG | wc -l`
@ -22,12 +30,13 @@ PACC=`expr  100 \* ${ACC} / ${TRAJ} `
 echo
 echo "Acceptance $PACC %  $ACC / $TRAJ "
-grep Plaq $LOG | awk '{ print $10 }' | uniq > plaq.dat
+grep Plaq $LOG | awk '{ print $12 }' | uniq > plaq.dat
-grep dH $LOG | awk '{ print $10 }' > dH.dat
+grep dH.= $LOG | awk '{ print $16 }' > dH.dat
-echo set yrange [-0.2:1.0] > plot.gnu
+echo set yrange [0.58:0.60] > plot.gnu
 echo set terminal 'pdf' >> plot.gnu
 echo "f(x) =0.588" >> plot.gnu
 echo "set output 'plaq.${LOG}.pdf'" >> plot.gnu
-echo "plot 'plaq.dat' w l, 'dH.dat' w l " >> plot.gnu
+echo "plot 'plaq.dat' w l, f(x) " >> plot.gnu
 echo
 gnuplot plot.gnu >& gnu.errs
 open plaq.${LOG}.pdf
--- a/systems/Tursa/config-command
+++ b/systems/Tursa/config-command
@ -1,11 +1,14 @@
 PREFIX=/home/tc002/tc002/shared/env/prefix/
 ../../configure \
    --enable-comms=mpi \
    --enable-simd=GPU \
    --enable-shm=nvlink \
    --enable-gen-simd-width=64 \
    --enable-accelerator=cuda \
-    --with-lime=/mnt/lustre/tursafs1/home/tc002/tc002/dc-boyl1/spack/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/c-lime-2-3-9-e6wxqrid6rqmd45z7n32dxkvkykpvyez \
+    --with-hdf5=$PREFIX \
-    --disable-accelerator-cshift \
+    --with-lime=$PREFIX \
    --with-fftw=$PREFIX \
    --enable-accelerator-cshift \
    --disable-unified \
    CXX=nvcc \
    LDFLAGS="-cudart shared " \
--- a/systems/Tursa/dwf.4node.perf
+++ b/systems/Tursa/dwf.4node.perf
@ -1,245 +0,0 @@
 tu-c0r0n00 - 0 device=0 binding=--interleave=0,1
 tu-c0r0n00 - 1 device=1 binding=--interleave=2,3
 tu-c0r0n09 - 1 device=1 binding=--interleave=2,3
 tu-c0r0n00 - 2 device=2 binding=--interleave=4,5
 tu-c0r0n06 - 0 device=0 binding=--interleave=0,1
 tu-c0r0n06 - 1 device=1 binding=--interleave=2,3
 tu-c0r0n09 - 0 device=0 binding=--interleave=0,1
 tu-c0r0n09 - 2 device=2 binding=--interleave=4,5
 tu-c0r0n03 - 1 device=1 binding=--interleave=2,3
 tu-c0r0n06 - 2 device=2 binding=--interleave=4,5
 tu-c0r0n09 - 3 device=3 binding=--interleave=6,7
 tu-c0r0n00 - 3 device=3 binding=--interleave=6,7
 tu-c0r0n03 - 0 device=0 binding=--interleave=0,1
 tu-c0r0n03 - 2 device=2 binding=--interleave=4,5
 tu-c0r0n06 - 3 device=3 binding=--interleave=6,7
 tu-c0r0n03 - 3 device=3 binding=--interleave=6,7
 OPENMPI detected
 AcceleratorCudaInit: using default device 
 AcceleratorCudaInit: assume user either uses a) IBM jsrun, or 
 AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
 AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no 
 AcceleratorCudaInit: ================================================
 OPENMPI detected
 AcceleratorCudaInit[0]: ========================
 AcceleratorCudaInit[0]: Device Number    : 0
 AcceleratorCudaInit[0]: ========================
 AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
 AcceleratorCudaInit[0]:   totalGlobalMem: 42505273344 
 AcceleratorCudaInit[0]:   managedMemory: 1 
 AcceleratorCudaInit[0]:   isMultiGpuBoard: 0 
 AcceleratorCudaInit[0]:   warpSize: 32 
 AcceleratorCudaInit[0]:   pciBusID: 3 
 AcceleratorCudaInit[0]:   pciDeviceID: 0 
 AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
 AcceleratorCudaInit: using default device 
 AcceleratorCudaInit: assume user either uses a) IBM jsrun, or 
 AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
 AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no 
 AcceleratorCudaInit: ================================================
 OPENMPI detected
 AcceleratorCudaInit[0]: ========================
 AcceleratorCudaInit[0]: Device Number    : 0
 AcceleratorCudaInit[0]: ========================
 AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
 AcceleratorCudaInit[0]:   totalGlobalMem: 42505273344 
 AcceleratorCudaInit[0]:   managedMemory: 1 
 AcceleratorCudaInit[0]:   isMultiGpuBoard: 0 
 AcceleratorCudaInit[0]:   warpSize: 32 
 AcceleratorCudaInit[0]:   pciBusID: 3 
 AcceleratorCudaInit[0]:   pciDeviceID: 0 
 AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
 AcceleratorCudaInit: using default device 
 AcceleratorCudaInit: assume user either uses a) IBM jsrun, or 
 AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
 AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no 
 AcceleratorCudaInit: ================================================
 OPENMPI detected
 AcceleratorCudaInit: using default device 
 AcceleratorCudaInit: assume user either uses a) IBM jsrun, or 
 AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
 AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no 
 AcceleratorCudaInit: ================================================
 OPENMPI detected
 AcceleratorCudaInit: using default device 
 AcceleratorCudaInit: assume user either uses a) IBM jsrun, or 
 AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
 AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no 
 AcceleratorCudaInit: ================================================
 OPENMPI detected
 AcceleratorCudaInit: using default device 
 AcceleratorCudaInit: assume user either uses a) IBM jsrun, or 
 AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
 AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no 
 AcceleratorCudaInit: ================================================
 OPENMPI detected
 AcceleratorCudaInit: using default device 
 AcceleratorCudaInit: assume user either uses a) IBM jsrun, or 
 AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
 AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no 
 AcceleratorCudaInit: ================================================
 OPENMPI detected
 AcceleratorCudaInit: using default device 
 AcceleratorCudaInit: assume user either uses a) IBM jsrun, or 
 AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
 AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no 
 AcceleratorCudaInit: ================================================
 SharedMemoryMpi:  World communicator of size 16
 SharedMemoryMpi:  Node  communicator of size 4
 0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x7fcd80000000 for comms buffers 
 Setting up IPC
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|_ |  |  |  |  |  |  |  |  |  |  |  | _|__
 __|_                                    _|__
 __|_   GGGG    RRRR    III    DDDD      _|__
 __|_  G        R   R    I     D   D     _|__
 __|_  G        R   R    I     D    D    _|__
 __|_  G  GG    RRRR     I     D    D    _|__
 __|_  G   G    R  R     I     D   D     _|__
 __|_   GGGG    R   R   III    DDDD      _|__
 __|_                                    _|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
  |  |  |  |  |  |  |  |  |  |  |  |  |  |  
 Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 Current Grid git commit hash=9d2238148c56e3fbadfa95dcabf2b83d4bde14cd: (HEAD -> develop) uncommited changes
 Grid : Message : ================================================ 
 Grid : Message : MPI is initialised and logging filters activated 
 Grid : Message : ================================================ 
 Grid : Message : Requested 2147483648 byte stencil comms buffers 
 Grid : Message : MemoryManager Cache 34004218675 bytes 
 Grid : Message : MemoryManager::Init() setting up
 Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8
 Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
 Grid : Message : MemoryManager::Init() Using cudaMalloc
 Grid : Message : 1.198523 s : Grid Layout
 Grid : Message : 1.198530 s : 	Global lattice size  : 64 64 64 64 
 Grid : Message : 1.198534 s : 	OpenMP threads       : 4
 Grid : Message : 1.198535 s : 	MPI tasks            : 2 2 2 2 
 Grid : Message : 1.397615 s : Making s innermost grids
 Grid : Message : 1.441828 s : Initialising 4d RNG
 Grid : Message : 1.547973 s : Intialising parallel RNG with unique string 'The 4D RNG'
 Grid : Message : 1.547998 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
 Grid : Message : 1.954777 s : Initialising 5d RNG
 Grid : Message : 3.633825 s : Intialising parallel RNG with unique string 'The 5D RNG'
 Grid : Message : 3.633869 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
 Grid : Message : 12.162710 s : Initialised RNGs
 Grid : Message : 15.882520 s : Drawing gauge field
 Grid : Message : 15.816362 s : Random gauge initialised 
 Grid : Message : 17.279671 s : Setting up Cshift based reference 
 Grid : Message : 26.331426 s : *****************************************************************
 Grid : Message : 26.331452 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
 Grid : Message : 26.331454 s : *****************************************************************
 Grid : Message : 26.331456 s : *****************************************************************
 Grid : Message : 26.331458 s : * Benchmarking DomainWallFermionR::Dhop                  
 Grid : Message : 26.331459 s : * Vectorising space-time by 8
 Grid : Message : 26.331463 s : * VComplexF size is 64 B
 Grid : Message : 26.331465 s : * SINGLE precision 
 Grid : Message : 26.331467 s : * Using Overlapped Comms/Compute
 Grid : Message : 26.331468 s : * Using GENERIC Nc WilsonKernels
 Grid : Message : 26.331469 s : *****************************************************************
 Grid : Message : 28.413717 s : Called warmup
 Grid : Message : 56.418423 s : Called Dw 3000 times in 2.80047e+07 us
 Grid : Message : 56.418476 s : mflop/s =   3.79581e+07
 Grid : Message : 56.418479 s : mflop/s per rank =  2.37238e+06
 Grid : Message : 56.418481 s : mflop/s per node =  9.48953e+06
 Grid : Message : 56.418483 s : RF  GiB/s (base 2) =   77130
 Grid : Message : 56.418485 s : mem GiB/s (base 2) =   48206.3
 Grid : Message : 56.422076 s : norm diff   1.03481e-13
 Grid : Message : 56.456894 s : #### Dhop calls report 
 Grid : Message : 56.456899 s : WilsonFermion5D Number of DhopEO Calls   : 6002
 Grid : Message : 56.456903 s : WilsonFermion5D TotalTime   /Calls        : 4710.93 us
 Grid : Message : 56.456905 s : WilsonFermion5D CommTime    /Calls        : 3196.15 us
 Grid : Message : 56.456908 s : WilsonFermion5D FaceTime    /Calls        : 494.392 us
 Grid : Message : 56.456910 s : WilsonFermion5D ComputeTime1/Calls        : 44.4107 us
 Grid : Message : 56.456912 s : WilsonFermion5D ComputeTime2/Calls        : 1037.75 us
 Grid : Message : 56.456921 s : Average mflops/s per call                : 3.55691e+09
 Grid : Message : 56.456925 s : Average mflops/s per call per rank       : 2.22307e+08
 Grid : Message : 56.456928 s : Average mflops/s per call per node       : 8.89228e+08
 Grid : Message : 56.456930 s : Average mflops/s per call (full)         : 3.82915e+07
 Grid : Message : 56.456933 s : Average mflops/s per call per rank (full): 2.39322e+06
 Grid : Message : 56.456952 s : Average mflops/s per call per node (full): 9.57287e+06
 Grid : Message : 56.456954 s : WilsonFermion5D Stencil
 Grid : Message : 56.457016 s :  Stencil calls 3001
 Grid : Message : 56.457022 s :  Stencil halogtime 0
 Grid : Message : 56.457024 s :  Stencil gathertime 55.9154
 Grid : Message : 56.457026 s :  Stencil gathermtime 20.1073
 Grid : Message : 56.457028 s :  Stencil mergetime 18.5585
 Grid : Message : 56.457030 s :  Stencil decompresstime 0.0639787
 Grid : Message : 56.457032 s :  Stencil comms_bytes 4.02653e+08
 Grid : Message : 56.457034 s :  Stencil commtime 6379.93
 Grid : Message : 56.457036 s :  Stencil 63.1124 GB/s per rank
 Grid : Message : 56.457038 s :  Stencil 252.45 GB/s per node
 Grid : Message : 56.457040 s : WilsonFermion5D StencilEven
 Grid : Message : 56.457048 s : WilsonFermion5D StencilOdd
 Grid : Message : 56.457062 s : WilsonFermion5D Stencil     Reporti()
 Grid : Message : 56.457065 s : WilsonFermion5D StencilEven Reporti()
 Grid : Message : 56.457066 s : WilsonFermion5D StencilOdd  Reporti()
 Grid : Message : 79.259261 s : Compare to naive wilson implementation Dag to verify correctness
 Grid : Message : 79.259287 s : Called DwDag
 Grid : Message : 79.259288 s : norm dag result 12.0421
 Grid : Message : 79.271740 s : norm dag ref    12.0421
 Grid : Message : 79.287759 s : norm dag diff   7.63236e-14
 Grid : Message : 79.328100 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
 Grid : Message : 79.955951 s : src_e0.499997
 Grid : Message : 80.633620 s : src_o0.500003
 Grid : Message : 80.164163 s : *********************************************************
 Grid : Message : 80.164168 s : * Benchmarking DomainWallFermionF::DhopEO                
 Grid : Message : 80.164170 s : * Vectorising space-time by 8
 Grid : Message : 80.164172 s : * SINGLE precision 
 Grid : Message : 80.164174 s : * Using Overlapped Comms/Compute
 Grid : Message : 80.164177 s : * Using GENERIC Nc WilsonKernels
 Grid : Message : 80.164178 s : *********************************************************
 Grid : Message : 93.797635 s : Deo mflop/s =   3.93231e+07
 Grid : Message : 93.797670 s : Deo mflop/s per rank   2.45769e+06
 Grid : Message : 93.797672 s : Deo mflop/s per node   9.83077e+06
 Grid : Message : 93.797674 s : #### Dhop calls report 
 Grid : Message : 93.797675 s : WilsonFermion5D Number of DhopEO Calls   : 3001
 Grid : Message : 93.797677 s : WilsonFermion5D TotalTime   /Calls        : 4542.83 us
 Grid : Message : 93.797679 s : WilsonFermion5D CommTime    /Calls        : 2978.97 us
 Grid : Message : 93.797681 s : WilsonFermion5D FaceTime    /Calls        : 602.287 us
 Grid : Message : 93.797683 s : WilsonFermion5D ComputeTime1/Calls        : 67.1416 us
 Grid : Message : 93.797685 s : WilsonFermion5D ComputeTime2/Calls        : 1004.07 us
 Grid : Message : 93.797713 s : Average mflops/s per call                : 3.30731e+09
 Grid : Message : 93.797717 s : Average mflops/s per call per rank       : 2.06707e+08
 Grid : Message : 93.797719 s : Average mflops/s per call per node       : 8.26827e+08
 Grid : Message : 93.797721 s : Average mflops/s per call (full)         : 3.97084e+07
 Grid : Message : 93.797727 s : Average mflops/s per call per rank (full): 2.48178e+06
 Grid : Message : 93.797732 s : Average mflops/s per call per node (full): 9.92711e+06
 Grid : Message : 93.797735 s : WilsonFermion5D Stencil
 Grid : Message : 93.797746 s : WilsonFermion5D StencilEven
 Grid : Message : 93.797758 s : WilsonFermion5D StencilOdd
 Grid : Message : 93.797769 s :  Stencil calls 3001
 Grid : Message : 93.797773 s :  Stencil halogtime 0
 Grid : Message : 93.797776 s :  Stencil gathertime 56.7458
 Grid : Message : 93.797780 s :  Stencil gathermtime 22.6504
 Grid : Message : 93.797782 s :  Stencil mergetime 21.1913
 Grid : Message : 93.797786 s :  Stencil decompresstime 0.0556481
 Grid : Message : 93.797788 s :  Stencil comms_bytes 2.01327e+08
 Grid : Message : 93.797791 s :  Stencil commtime 2989.33
 Grid : Message : 93.797795 s :  Stencil 67.3484 GB/s per rank
 Grid : Message : 93.797798 s :  Stencil 269.394 GB/s per node
 Grid : Message : 93.797801 s : WilsonFermion5D Stencil     Reporti()
 Grid : Message : 93.797803 s : WilsonFermion5D StencilEven Reporti()
 Grid : Message : 93.797805 s : WilsonFermion5D StencilOdd  Reporti()
 Grid : Message : 93.873429 s : r_e6.02111
 Grid : Message : 93.879931 s : r_o6.02102
 Grid : Message : 93.885912 s : res12.0421
 Grid : Message : 94.876555 s : norm diff   0
 Grid : Message : 95.485643 s : norm diff even  0
 Grid : Message : 95.581236 s : norm diff odd   0
--- a/systems/Tursa/dwf16.slurm
+++ b/systems/Tursa/dwf16.slurm
@ -29,5 +29,14 @@ mpirun -np $SLURM_NTASKS -x LD_LIBRARY_PATH  --bind-to none ./mpiwrapper.sh \
 	--mpi 2.2.2.8 \
       	--accelerator-threads 8 \
 	--grid 64.64.64.256 \
-	--shm 2048 > dwf.16node.perf
+	--shm 2048 
 mpirun -np $SLURM_NTASKS -x LD_LIBRARY_PATH  --bind-to none ./mpiwrapper.sh \
       	./benchmarks/Benchmark_dwf_fp32 \
 	$OPT \
 	--mpi 2.2.2.8 \
       	--accelerator-threads 8 \
 	--grid 48.48.48.192 \
 	--shm 2048
--- a/systems/Tursa/dwf4.slurm
+++ b/systems/Tursa/dwf4.slurm
@ -34,5 +34,16 @@ mpirun -np $SLURM_NTASKS -x LD_LIBRARY_PATH  --bind-to none \
 	--shm 2048 > dwf.4node.perf
 mpirun -np $SLURM_NTASKS -x LD_LIBRARY_PATH  --bind-to none \
 	./mpiwrapper.sh \
 	./benchmarks/Benchmark_dwf_fp32 \
 	$OPT \
 	--mpi 2.2.2.2 \
 	--accelerator-threads 8 \
 	--grid 64.64.64.64 \
 	--shm 2048 > comms.4node.perf
--- a/systems/Tursa/sourceme.sh
+++ b/systems/Tursa/sourceme.sh
@ -1,2 +1,5 @@
 spack load c-lime
 module load cuda/11.4.1 openmpi/4.1.1 ucx/1.10.1
 export PREFIX=/home/tc002/tc002/shared/env/prefix/
 export LD_LIBRARY_PATH=$PREFIX/lib/:$LD_LIBRARY_PATH
 unset SBATCH_EXPORT
--- a/tests/core/Test_ddhmc_matrices.cc
+++ b/tests/core/Test_ddhmc_matrices.cc
@ -0,0 +1,511 @@
   /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_ddhmc_matrices.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  const int Ls=8;
  const int Nt=32;
  auto latt = GridDefaultLatt();
  latt[3] = Nt;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  GridCartesian         * UGridF   = SpaceTimeGrid::makeFourDimGrid(latt, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
  GridCartesian         * FGridF   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
  GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
  GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
  LatticeFermion src   (FGrid); gaussian(RNG5,src);
  LatticeFermion phi   (FGrid); gaussian(RNG5,phi);
  LatticeFermion chi   (FGrid); gaussian(RNG5,chi);
  LatticeFermion result(FGrid); result=Zero();
  LatticeFermion    ref(FGrid);    ref=Zero();
  LatticeFermion    tmp(FGrid);    tmp=Zero();
  LatticeFermion    tmp1(FGrid);
  LatticeFermion    err(FGrid);    tmp=Zero();
  LatticeFermion    zz(FGrid);     zz =Zero();
  LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
  LatticeGaugeFieldF UmuF(UGridF);
  precisionChange(UmuF,Umu);
  RealD mass=0.1;
  RealD M5  =1.8;
  DomainWallFermionR DdwfPeri(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  DomainWallFermionF DdwfPeriF(UmuF,*FGridF,*FrbGridF,*UGridF,*UrbGridF,mass,M5);
  typedef DomainWallFermionR::Impl_t FimplD;
  typedef DomainWallFermionF::Impl_t FimplF;
  typedef DirichletFermionOperator<FimplD> FermOp;
  typedef DirichletFermionOperator<FimplF> FermOpF;
  Coordinate Block({0,0,0,Nt/2});
  DomainWallFermionR DdwfPeriTmp(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  DomainWallFermionF DdwfPeriTmpF(UmuF,*FGridF,*FrbGridF,*UGridF,*UrbGridF,mass,M5);
  FermOp  Ddwf(DdwfPeriTmp,Block); 
  FermOpF DdwfF(DdwfPeriTmpF,Block); 
  Ddwf.ImportGauge(Umu);
  DdwfF.ImportGauge(UmuF);
  LatticeFermion src_e (FrbGrid);
  LatticeFermion src_o (FrbGrid);
  LatticeFermion r_e   (FrbGrid);
  LatticeFermion r_o   (FrbGrid);
  LatticeFermion r_eo  (FGrid);
  LatticeFermion r_eeoo(FGrid);
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Testing that Meo + Moe + Moo + Mee = Munprec "<<std::endl;
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  pickCheckerboard(Even,src_e,src);
  pickCheckerboard(Odd,src_o,src);
  Ddwf.Meooe(src_e,r_o);  std::cout<<GridLogMessage<<"Applied Meo"<<std::endl;
  Ddwf.Meooe(src_o,r_e);  std::cout<<GridLogMessage<<"Applied Moe"<<std::endl;
  setCheckerboard(r_eo,r_o);
  setCheckerboard(r_eo,r_e);
  Ddwf.Mooee(src_e,r_e);  std::cout<<GridLogMessage<<"Applied Mee"<<std::endl;
  Ddwf.Mooee(src_o,r_o);  std::cout<<GridLogMessage<<"Applied Moo"<<std::endl;
  setCheckerboard(r_eeoo,r_e);
  setCheckerboard(r_eeoo,r_o);
  r_eo=r_eo+r_eeoo;
  Ddwf.M(src,ref);  
  //  std::cout<<GridLogMessage << r_eo<<std::endl;
  //  std::cout<<GridLogMessage << ref <<std::endl;
  err= ref - r_eo;
  std::cout<<GridLogMessage << "EO norm diff   "<< norm2(err)<< " "<<norm2(ref)<< " " << norm2(r_eo) <<std::endl;
  LatticeComplex cerr(FGrid);
  cerr = localInnerProduct(err,err);
  //  std::cout<<GridLogMessage << cerr<<std::endl;
  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Test Ddagger is the dagger of D by requiring                "<<std::endl;
  std::cout<<GridLogMessage<<"=  < phi | Deo | chi > * = < chi | Deo^dag| phi>  "<<std::endl;
  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
  LatticeFermion chi_e   (FrbGrid);
  LatticeFermion chi_o   (FrbGrid);
  LatticeFermion dchi_e  (FrbGrid);
  LatticeFermion dchi_o  (FrbGrid);
  LatticeFermion phi_e   (FrbGrid);
  LatticeFermion phi_o   (FrbGrid);
  LatticeFermion dphi_e  (FrbGrid);
  LatticeFermion dphi_o  (FrbGrid);
  pickCheckerboard(Even,chi_e,chi);
  pickCheckerboard(Odd ,chi_o,chi);
  pickCheckerboard(Even,phi_e,phi);
  pickCheckerboard(Odd ,phi_o,phi);
  Ddwf.Meooe(chi_e,dchi_o);
  Ddwf.Meooe(chi_o,dchi_e);
  Ddwf.MeooeDag(phi_e,dphi_o);
  Ddwf.MeooeDag(phi_o,dphi_e);
  ComplexD pDce = innerProduct(phi_e,dchi_e);
  ComplexD pDco = innerProduct(phi_o,dchi_o);
  ComplexD cDpe = innerProduct(chi_e,dphi_e);
  ComplexD cDpo = innerProduct(chi_o,dphi_o);
  std::cout<<GridLogMessage <<"e "<<pDce<<" "<<cDpe <<std::endl;
  std::cout<<GridLogMessage <<"o "<<pDco<<" "<<cDpo <<std::endl;
  std::cout<<GridLogMessage <<"pDce - conj(cDpo) "<< pDce-conj(cDpo) <<std::endl;
  std::cout<<GridLogMessage <<"pDco - conj(cDpe) "<< pDco-conj(cDpe) <<std::endl;
  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Test MeeInv Mee = 1                                         "<<std::endl;
  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
  pickCheckerboard(Even,chi_e,chi);
  pickCheckerboard(Odd ,chi_o,chi);
  Ddwf.Mooee(chi_e,src_e);
  Ddwf.MooeeInv(src_e,phi_e);
  Ddwf.Mooee(chi_o,src_o);
  Ddwf.MooeeInv(src_o,phi_o);
  setCheckerboard(phi,phi_e);
  setCheckerboard(phi,phi_o);
  err = phi-chi;
  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<< std::endl;
  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Test MeeInvDag MeeDag = 1                                   "<<std::endl;
  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
  pickCheckerboard(Even,chi_e,chi);
  pickCheckerboard(Odd ,chi_o,chi);
  Ddwf.MooeeDag(chi_e,src_e);
  Ddwf.MooeeInvDag(src_e,phi_e);
  Ddwf.MooeeDag(chi_o,src_o);
  Ddwf.MooeeInvDag(src_o,phi_o);
  setCheckerboard(phi,phi_e);
  setCheckerboard(phi,phi_o);
  err = phi-chi;
  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<< std::endl;
  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Test MpcDagMpc is Hermitian              "<<std::endl;
  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
  gaussian(RNG5,phi);
  gaussian(RNG5,chi);
  pickCheckerboard(Even,chi_e,chi);
  pickCheckerboard(Odd ,chi_o,chi);
  pickCheckerboard(Even,phi_e,phi);
  pickCheckerboard(Odd ,phi_o,phi);
  RealD t1,t2;
  SchurDiagMooeeOperator<FermOp,LatticeFermion> HermOpEO(Ddwf);
  HermOpEO.MpcDagMpc(chi_e,dchi_e);
  HermOpEO.MpcDagMpc(chi_o,dchi_o);
  HermOpEO.MpcDagMpc(phi_e,dphi_e);
  HermOpEO.MpcDagMpc(phi_o,dphi_o);
  pDce = innerProduct(phi_e,dchi_e);
  pDco = innerProduct(phi_o,dchi_o);
  cDpe = innerProduct(chi_e,dphi_e);
  cDpo = innerProduct(chi_o,dphi_o);
  std::cout<<GridLogMessage <<"e "<<pDce<<" "<<cDpe <<std::endl;
  std::cout<<GridLogMessage <<"o "<<pDco<<" "<<cDpo <<std::endl;
  std::cout<<GridLogMessage <<"pDce - conj(cDpo) "<< pDco-conj(cDpo) <<std::endl;
  std::cout<<GridLogMessage <<"pDco - conj(cDpe) "<< pDce-conj(cDpe) <<std::endl;
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Testing one direction at a time "<<std::endl;
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  tmp = Zero();
  for(int mu=0;mu<Nd;mu++){
    std::vector<TComplex> slice_ref;
    std::vector<TComplex> slice_result;
    // 5D - Ls is innermost This now annoys me.
    DdwfPeri.Mdir(src,ref   ,mu+1,-1);
    Ddwf.Mdir(src,result,mu+1,-1);
    tmp = tmp + result;
    auto lip = localInnerProduct(result,result);
    ref = ref - result;
    auto dip = localInnerProduct(ref,ref);
    sliceSum(lip,slice_result,mu+1);
    sliceSum(dip,slice_ref,mu+1);
    for(int t=0;t<latt[mu];t++){
      std::cout << "mu="<<mu<<" result["<<t<<"] "<<slice_result[t]<<" delta "<<slice_ref[t]<<std::endl;
      //      if( (t%Block[mu]) !=0) assert(norm2(slice_ref[t]) < 1.0e-10);
      //      else assert(norm2(slice_result[t]) == 0.0);
    }
    // Opposite dir
    DdwfPeri.Mdir(src,ref   ,mu+1,1);
    Ddwf.Mdir(src,result,mu+1,1);
    tmp = tmp + result;
    lip = localInnerProduct(result,result);
    ref = ref - result;
    dip = localInnerProduct(ref,ref);
    sliceSum(lip,slice_result,mu+1);
    sliceSum(dip,slice_ref,mu+1);
    for(int t=0;t<latt[mu];t++){
      std::cout << "mu="<<mu<<" result["<<t<<"] "<<slice_result[t]<<" delta "<<slice_ref[t]<<std::endl;
      //if( (t%Block[mu]) != Block[mu]-1) assert(norm2(slice_ref[t]) < 1.0e-10);
      //else assert(norm2(slice_result[t]) == 0.0);
    }
  }
  pickCheckerboard(Even,src_e,src);
  pickCheckerboard(Odd,src_o,src);
  Ddwf.Meooe(src_e,r_o);  std::cout<<GridLogMessage<<"Applied Meo"<<std::endl;
  Ddwf.Meooe(src_o,r_e);  std::cout<<GridLogMessage<<"Applied Moe"<<std::endl;
  setCheckerboard(r_eo,r_o);
  setCheckerboard(r_eo,r_e);
  ref = r_eo - tmp;
  std::cout << " Difference between Moffdiag and sum over directions is "<<norm2(ref)<<std::endl;
  assert(norm2(ref)<1.0e-10);
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Testing that POmega+POmegaBar = 1 "<<std::endl;
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  SchurFactoredFermionOperator<FimplD,FimplF> Schur(DdwfPeri,DdwfPeriF,
 						    Ddwf,DdwfF,
 						    Block);
  result = src;
  Schur.ProjectOmega(result);
  DumpSliceNorm("Omega",result,Nd);
  tmp = src;
  Schur.ProjectOmegaBar(tmp);
  DumpSliceNorm("OmegaBar",tmp,Nd);
  std::cout << " norm2(src) "<<norm2(src)<< " "<< norm2(result)<<" "<<norm2(tmp)<<std::endl;
  result = result + tmp - src;
  std::cout << " diff = "<<norm2(result)<<std::endl;
  assert(norm2(result)<=1.0e-8);
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Testing that dBoundary+dBoundaryBar+dOmega+dOmegaBar = Munprec "<<std::endl;
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  Schur.dBoundary    (src,tmp); result=tmp;        std::cout << "dBoundary    "<<norm2(tmp)<<std::endl;
  DumpSliceNorm("dBoundary",tmp,Nd);
  Schur.dBoundaryBar (src,tmp); result=result+tmp; std::cout << "dBoundaryBar "<<norm2(tmp)<<std::endl;
  DumpSliceNorm("dBoundaryBar",tmp,Nd);
  Schur.dOmega       (src,tmp); result=result+tmp; std::cout << "dOmega       "<<norm2(tmp)<<std::endl;
  DumpSliceNorm("dOmega",tmp,Nd);
  Schur.dOmegaBar    (src,tmp); result=result+tmp; std::cout << "dOmegaBar    "<<norm2(tmp)<<std::endl;
  DumpSliceNorm("dOmegaBar",tmp,Nd);
  DdwfPeri.M(src,ref);  
  err= ref - result;
  std::cout<<GridLogMessage << " norm diff   "<< norm2(err)<< " "<<norm2(ref)<< " " << norm2(result) <<std::endl;
  assert(norm2(err)<=1.0e-8);
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Testing that (dBoundary+dBoundaryBar+dOmega+dOmegaBar)dag = Mdag "<<std::endl;
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  Schur.dBoundaryDag    (src,tmp); result=tmp;        std::cout << "dBoundaryDag    "<<norm2(tmp)<<std::endl;
  Schur.dBoundaryBarDag (src,tmp); result=result+tmp; std::cout << "dBoundaryBarDag "<<norm2(tmp)<<std::endl;
  Schur.dOmegaDag       (src,tmp); result=result+tmp; std::cout << "dOmegaDag       "<<norm2(tmp)<<std::endl;
  Schur.dOmegaBarDag    (src,tmp); result=result+tmp; std::cout << "dOmegaBarDag    "<<norm2(tmp)<<std::endl;
  DdwfPeri.Mdag(src,ref);  
  err= ref - result;
  std::cout<<GridLogMessage << " norm diff   "<< norm2(err)<< " "<<norm2(ref)<< " " << norm2(result) <<std::endl;
  assert(norm2(err)<=1.0e-8);
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Testing that <chi|dBoundary|phi> =  <phi|dBoundaryDag|chi>^* "<<std::endl;
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  Schur.dBoundary(phi,tmp);       std::cout << "<chi|dBoundary|phi>"<<innerProduct(chi,tmp)<<std::endl;
  Schur.dBoundaryDag(chi,tmp);    std::cout << "<phi|dBoundaryDag|chi>"<<innerProduct(phi,tmp)<<std::endl;
  Schur.dBoundaryBar(phi,tmp);    std::cout << "<chi|dBoundaryBar|phi>"<<innerProduct(chi,tmp)<<std::endl;
  Schur.dBoundaryBarDag(chi,tmp); std::cout << "<phi|dBoundaryBarDag|chi>"<<innerProduct(phi,tmp)<<std::endl;
  Schur.dOmega(phi,tmp);       std::cout << "<chi|dOmega|phi>"<<innerProduct(chi,tmp)<<std::endl;
  Schur.dOmegaDag(chi,tmp);    std::cout << "<phi|dOmegaDag|chi>"<<innerProduct(phi,tmp)<<std::endl;
  Schur.dOmegaBar(phi,tmp);    std::cout << "<chi|dOmegaBar|phi>"<<innerProduct(chi,tmp)<<std::endl;
  Schur.dOmegaBarDag(chi,tmp); std::cout << "<phi|dOmegaBarDag|chi>"<<innerProduct(phi,tmp)<<std::endl;
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Testing that dBoundary ProjectBoundary = dBoundary "<<std::endl;
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  tmp = src;
  Schur.ProjectBoundary(tmp);
  Schur.dBoundary(tmp,result);
  Schur.dBoundary(src,tmp);
  result=result - tmp;
  std::cout << " diff = "<<norm2(result)<< " result "<<norm2(tmp)<<" "<<norm2(src)<<std::endl;
  assert(norm2(result)<=1.0e-8);
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Testing that dBoundaryBar ProjectBoundaryBar = dBoundaryBar "<<std::endl;
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  tmp = src;
  Schur.ProjectBoundaryBar(tmp);
  Schur.dBoundaryBar(tmp,result);
  Schur.dBoundaryBar(src,tmp);
  result=result - tmp;
  std::cout << " diff = "<<norm2(result)<< " result "<<norm2(tmp)<<std::endl;
  assert(norm2(result)<=1.0e-8);
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Testing that dOmega dOmegaInv = 1 "<<std::endl;
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  tmp = src;
  Schur.ProjectOmega(tmp);
  Schur.dOmega(tmp,tmp1);
  Schur.dOmegaInv(tmp1,result);
  tmp=tmp-result;
  std::cout << " diff = "<<norm2(tmp)<< " result "<<norm2(result)<<std::endl;
  assert(norm2(tmp)<=1.0e-8);
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Testing that dOmegaBar dOmegaBarInv = 1 "<<std::endl;
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  tmp = src;
  Schur.ProjectOmegaBar(tmp);
  Schur.dOmegaBar(tmp,tmp1);
  Schur.dOmegaBarInv(tmp1,result);
  tmp=tmp-result;
  std::cout << " diff = "<<norm2(tmp)<< " result "<<norm2(result)<<std::endl;
  assert(norm2(tmp)<=1.0e-8);
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Testing that dOmegaDag dOmegaDagInv = 1 "<<std::endl;
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  tmp = src;
  Schur.ProjectOmega(tmp);
  Schur.dOmegaDag(tmp,tmp1);
  Schur.dOmegaDagInv(tmp1,result);
  tmp=tmp-result;
  std::cout << " diff = "<<norm2(tmp)<< " result "<<norm2(result)<<std::endl;
  assert(norm2(tmp)<=1.0e-8);
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Testing that dOmegaBarDag dOmegaBarDagInv = 1 "<<std::endl;
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  tmp = src;
  Schur.ProjectOmegaBar(tmp);
  Schur.dOmegaBarDag(tmp,tmp1);
  Schur.dOmegaBarDagInv(tmp1,result);
  tmp=tmp-result;
  std::cout << " diff = "<<norm2(tmp)<< " result "<<norm2(result)<<std::endl;
  assert(norm2(tmp)<=1.0e-8);
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Testing that R RInv = PboundaryBar "<<std::endl;
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  LatticeFermion Rphi   (FGrid);
  LatticeFermion Rdagchi(FGrid);
  tmp = phi;
  Schur.R(tmp,Rphi);
  Schur.RInv(Rphi,result);
  tmp = phi;
  Schur.ProjectBoundaryBar(tmp);
  //  std::cout << "Project Boundary Bar" << tmp<< std::endl;
  tmp=tmp-result;
  std::cout << " diff = "<<norm2(tmp)<< " result "<<norm2(result)<<std::endl;
  assert(norm2(tmp)<1.0e-8);
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Testing that Rdag RInvdag = PboundaryBar "<<std::endl;
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  tmp = chi;
  Schur.RDag(tmp,Rdagchi);
  Schur.RDagInv(Rdagchi,result);
  tmp = chi;
  Schur.ProjectBoundaryBar(tmp);
  tmp=tmp-result;
  std::cout << " diff = "<<norm2(tmp)<< " result "<<norm2(result)<<std::endl;
  assert(norm2(tmp)<1.0e-8);
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Testing that <chi|R|phi> = <phi|Rdag|chi>* "<<std::endl;
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  std::cout << "<chi|R|phi>"<<innerProduct(chi,Rphi)<<std::endl;
  std::cout << "<phi|Rdag|chi>"<<innerProduct(phi,Rdagchi)<<std::endl;
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  std::cout<<GridLogMessage<<"= Testing the sliced evolution of spin structured noise   "<<std::endl;
  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT
  };
  int hits=2;
  int isDWF=1;
  std::cout << " latt " << latt <<" Nd "<<FGrid->Nd()<<" dims "<<FGrid->GlobalDimensions()<<std::endl;
  LatticeInteger coor(FGrid);
  for(int mu=0;mu<Nd;mu++){
    Gamma G(Gmu[mu]);
    int plane = latt[mu]/2;
    for(int hit=0;hit<hits;hit++){
      std::cout<<GridLogMessage<<"mu="<<mu<<" hit "<<hit<<std::endl;
      LatticeCoordinate(coor,mu+isDWF);
      gaussian(RNG5,src);
      tmp = src - G*src;
      src = src + G*src;
      src= where(coor==Integer(plane),src,zz);
      src= where(coor==Integer(0),tmp,src);
      Schur.Dinverse(src,tmp);  
      DumpSliceNorm("1+/-gamma_mu",tmp,mu+isDWF);
    }
  }
  Grid_finalize();
 }
--- a/tests/core/Test_fft.cc
+++ b/tests/core/Test_fft.cc
@ -299,12 +299,12 @@ int main (int argc, char ** argv)
    SpinColourVectorD ferm; gaussian(sRNG,ferm);
    pokeSite(ferm,src,point);
-    const int Ls=32;
+    const int Ls=64;
    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,&GRID);
    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,&GRID);
-    RealD mass=0.01;
+    RealD mass=1.0;
-    RealD M5  =0.8;
+    RealD M5  =0.99;
    DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5);
    // Momentum space prop
@ -353,6 +353,12 @@ int main (int argc, char ** argv)
    std::cout << " Taking difference" <<std::endl;
    std::cout << "Ddwf result4 "<<norm2(result4)<<std::endl;
    std::cout << "Ddwf ref     "<<norm2(ref)<<std::endl;
    auto twopoint = localInnerProduct(result4,result4);
    std::vector<TComplex> pion_prop;
    sliceSum(twopoint,pion_prop,Nd-1);
    for(int t=0;t<pion_prop.size();t++){
      std::cout << "Pion_prop["<<t<<"]="<<pion_prop[t]<<std::endl;
    }
    diff = ref - result4;
    std::cout << "result - ref     "<<norm2(diff)<<std::endl;
@ -383,7 +389,7 @@ int main (int argc, char ** argv)
    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,&GRID);
    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,&GRID);
-    RealD mass=0.01;
+    RealD mass=1.0;
    RealD M5  =0.8;
    OverlapWilsonCayleyTanhFermionD Dov(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5,1.0);
--- a/tests/core/Test_gparity.cc
+++ b/tests/core/Test_gparity.cc
@ -55,13 +55,17 @@ static_assert(same_vComplex == 1, "Dirac Operators must have same underlying SIM
 int main (int argc, char ** argv)
 {
  int nu = 0;
-
+  int tbc_aprd = 0; //use antiperiodic BCs in the time direction?
  Grid_init(&argc,&argv);
  for(int i=1;i<argc;i++){
    if(std::string(argv[i]) == "--Gparity-dir"){
      std::stringstream ss; ss << argv[i+1]; ss >> nu;
      std::cout << GridLogMessage << "Set Gparity direction to " << nu << std::endl;
    }else if(std::string(argv[i]) == "--Tbc-APRD"){
      tbc_aprd = 1;
      std::cout << GridLogMessage << "Using antiperiodic BCs in the time direction" << std::endl;
    }
  }
@ -155,13 +159,18 @@ int main (int argc, char ** argv)
  //Coordinate grid for reference
  LatticeInteger    xcoor_1f5(FGrid_1f);
-  LatticeCoordinate(xcoor_1f5,1+nu);
+  LatticeCoordinate(xcoor_1f5,1+nu); //note '1+nu'! This is because for 5D fields the s-direction is direction 0
  Replicate(src,src_1f);
  src_1f   = where( xcoor_1f5 >= Integer(L), 2.0*src_1f,src_1f );
  RealD mass=0.0;
  RealD M5=1.8;
-  StandardDiracOp Ddwf(Umu_1f,*FGrid_1f,*FrbGrid_1f,*UGrid_1f,*UrbGrid_1f,mass,M5 DOP_PARAMS);
+
  //Standard Dirac op
  AcceleratorVector<Complex,4> bc_std(Nd, 1.0);
  if(tbc_aprd) bc_std[Nd-1] = -1.; //antiperiodic time BC
  StandardDiracOp::ImplParams std_params(bc_std);
  StandardDiracOp Ddwf(Umu_1f,*FGrid_1f,*FrbGrid_1f,*UGrid_1f,*UrbGrid_1f,mass,M5 DOP_PARAMS, std_params);
  StandardFermionField    src_o_1f(FrbGrid_1f);
  StandardFermionField result_o_1f(FrbGrid_1f);
@ -172,9 +181,11 @@ int main (int argc, char ** argv)
  ConjugateGradient<StandardFermionField> CG(1.0e-8,10000);
  CG(HermOpEO,src_o_1f,result_o_1f);
-  //  const int nu = 3;
+  //Gparity Dirac op
  std::vector<int> twists(Nd,0);
  twists[nu] = 1;
  if(tbc_aprd) twists[Nd-1] = 1;
  GparityDiracOp::ImplParams params;
  params.twists = twists;
  GparityDiracOp GPDdwf(Umu_2f,*FGrid_2f,*FrbGrid_2f,*UGrid_2f,*UrbGrid_2f,mass,M5 DOP_PARAMS,params);
@ -271,8 +282,11 @@ int main (int argc, char ** argv)
  std::cout << "2f cb "<<result_o_2f.Checkerboard()<<std::endl;
  std::cout << "1f cb "<<result_o_1f.Checkerboard()<<std::endl;
-  std::cout << " result norms " <<norm2(result_o_2f)<<" " <<norm2(result_o_1f)<<std::endl;
+  //Compare norms
  std::cout << " result norms 2f: " <<norm2(result_o_2f)<<" 1f: " <<norm2(result_o_1f)<<std::endl;
  //Take the 2f solution and convert into the corresponding 1f solution (odd cb only)
  StandardFermionField    res0o  (FrbGrid_2f); 
  StandardFermionField    res1o  (FrbGrid_2f); 
  StandardFermionField    res0  (FGrid_2f); 
@ -281,14 +295,15 @@ int main (int argc, char ** argv)
  res0=Zero();
  res1=Zero();
-  res0o = PeekIndex<0>(result_o_2f,0);
+  res0o = PeekIndex<0>(result_o_2f,0); //flavor 0, odd cb
-  res1o = PeekIndex<0>(result_o_2f,1);
+  res1o = PeekIndex<0>(result_o_2f,1); //flavor 1, odd cb
  std::cout << "res cb "<<res0o.Checkerboard()<<std::endl;
  std::cout << "res cb "<<res1o.Checkerboard()<<std::endl;
-  setCheckerboard(res0,res0o);
+  //poke odd onto non-cb field
-  setCheckerboard(res1,res1o);
+  setCheckerboard(res0,res0o); 
  setCheckerboard(res1,res1o); 
  StandardFermionField replica (FGrid_1f);
  StandardFermionField replica0(FGrid_1f);
@ -296,12 +311,13 @@ int main (int argc, char ** argv)
  Replicate(res0,replica0);
  Replicate(res1,replica1);
  //2nd half of doubled lattice has f=1
  replica = where( xcoor_1f5 >= Integer(L), replica1,replica0 );
  replica0 = Zero();
  setCheckerboard(replica0,result_o_1f);
-  std::cout << "Norm2 solutions is " <<norm2(replica)<<" "<< norm2(replica0)<<std::endl;
+  std::cout << "Norm2 solutions 1f reconstructed from 2f: " <<norm2(replica)<<" Actual 1f: "<< norm2(replica0)<<std::endl;
  replica = replica - replica0;
--- a/tests/core/Test_gparity_flavour.cc
+++ b/tests/core/Test_gparity_flavour.cc
@ -0,0 +1,177 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: ./tests/Test_gparity_flavour.cc
 Copyright (C) 2015-2017
 Author: Christopher Kelly <ckelly@bnl.gov>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 static constexpr double                      tolerance = 1.0e-6;
 static std::array<GparityFlavourMatrix, GparityFlavour::nSigma> testAlgebra;
 void print(const GparityFlavourMatrix &g)
 {
  for(int i = 0; i < Ngp; i++)
  {
    std::cout << GridLogMessage << "(";
    for(int j=0;j<Ngp;j++){
      if ( abs( g(i,j)()() ) == 0 ) {
        std::cout<< " 0";
      } else if ( abs(g(i,j)()() - Complex(0,1)) == 0){
        std::cout<< " i";
      } else if ( abs(g(i,j)()() + Complex(0,1)) == 0){
        std::cout<< "-i";
      } else if ( abs(g(i,j)()() - Complex(1,0)) == 0){
        std::cout<< " 1";
      } else if ( abs(g(i,j)()() + Complex(1,0)) == 0){
        std::cout<< "-1";
      }
      std::cout<<((j == Ngp-1) ? ")" : "," );
    }
    std::cout << std::endl;
  }
  std::cout << GridLogMessage << std::endl;
 }
 void createTestAlgebra(void)
 {
  std::array<GparityFlavourMatrix, 3> testg;
  const Complex             I(0., 1.), mI(0., -1.);
  // 0 1
  // 1 0
  testg[0] = Zero();
  testg[0](0, 1)()() = 1.;
  testg[0](1, 0)()() = 1.;
  std::cout << GridLogMessage << "test SigmaX= " << std::endl;
  print(testg[0]);
  // 0 -i
  // i  0
  testg[1] = Zero();
  testg[1](0, 1)()() = mI;
  testg[1](1, 0)()() = I;
  std::cout << GridLogMessage << "test SigmaY= " << std::endl;
  print(testg[1]);
  // 1  0
  // 0 -1
  testg[2] = Zero();
  testg[2](0, 0)()() = 1.0;
  testg[2](1, 1)()() = -1.0;
  std::cout << GridLogMessage << "test SigmaZ= " << std::endl;
  print(testg[2]);
 #define DEFINE_TEST_G(g, exp)\
 testAlgebra[GparityFlavour::Algebra::g]        = exp; \
 testAlgebra[GparityFlavour::Algebra::Minus##g] = -exp;
  DEFINE_TEST_G(SigmaX      , testg[0]);
  DEFINE_TEST_G(SigmaY      , testg[1]);
  DEFINE_TEST_G(SigmaZ      , testg[2]);
  DEFINE_TEST_G(Identity    , 1.);
  GparityFlavourMatrix pplus;
  pplus = 1.0;
  pplus = pplus + testg[1];
  pplus = pplus * 0.5;
  DEFINE_TEST_G(ProjPlus    , pplus);
  GparityFlavourMatrix pminus;
  pminus = 1.0;
  pminus = pminus - testg[1];
  pminus = pminus * 0.5;
  DEFINE_TEST_G(ProjMinus    , pminus);
 #undef DEFINE_TEST_G
 }
 template <typename Expr>
 void test(const Expr &a, const Expr &b)
 {
  if (norm2(a - b) < tolerance)
  {
    std::cout << "[OK] ";
  }
  else
  {
    std::cout << "[fail]" << std::endl;
    std::cout << GridLogError << "a= " << a << std::endl;
    std::cout << GridLogError << "is different (tolerance= " << tolerance << ") from " << std::endl;
    std::cout << GridLogError << "b= " << b << std::endl;
    exit(EXIT_FAILURE);
  }
 }
 void checkSigma(const GparityFlavour::Algebra a, GridSerialRNG &rng)
 {
  GparityFlavourVector v;
  GparityFlavourMatrix m, &testg = testAlgebra[a];
  GparityFlavour      g(a);
  random(rng, v);
  random(rng, m);
  std::cout << GridLogMessage << "Checking " << GparityFlavour::name[a] << ": ";
  std::cout << "vecmul ";
  test(g*v, testg*v);
  std::cout << "matlmul ";
  test(g*m, testg*m);
  std::cout << "matrmul ";
  test(m*g, m*testg);
  std::cout << std::endl;
 }
 int main(int argc, char *argv[])
 {
  Grid_init(&argc,&argv);
  Coordinate latt_size   = GridDefaultLatt();
  Coordinate simd_layout = GridDefaultSimd(4,vComplex::Nsimd());
  Coordinate mpi_layout  = GridDefaultMpi();
  GridCartesian Grid(latt_size,simd_layout,mpi_layout);
  GridSerialRNG sRNG;
  sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
  std::cout << GridLogMessage << "======== Test algebra" << std::endl;
  createTestAlgebra();
  std::cout << GridLogMessage << "======== Multiplication operators check" << std::endl;
  for (int i = 0; i < GparityFlavour::nSigma; ++i)
  {
    checkSigma(i, sRNG);
  }
  std::cout << GridLogMessage << std::endl;
  Grid_finalize();
  return EXIT_SUCCESS;
 }
--- a/tests/core/Test_precision_change.cc
+++ b/tests/core/Test_precision_change.cc
@ -0,0 +1,114 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/core/Test_precision_change.cc
    Copyright (C) 2015
 Author: Christopher Kelly <ckelly@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 int main (int argc, char ** argv){
  Grid_init(&argc, &argv);
  int Ls = 16;
  std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() << " and Ls=" << Ls << std::endl;
  GridCartesian* UGrid_d = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi());
  GridCartesian* FGrid_d = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid_d);
  GridRedBlackCartesian* FrbGrid_d = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid_d);
  GridCartesian* UGrid_f = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
  GridCartesian* FGrid_f = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid_f);
  GridRedBlackCartesian* FrbGrid_f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid_f);
  std::vector<int> seeds4({1, 2, 3, 4});
  std::vector<int> seeds5({5, 6, 7, 8});
  GridParallelRNG RNG5(FGrid_d);
  RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG RNG4(UGrid_d);
  RNG4.SeedFixedIntegers(seeds4);
  //Gauge fields
  LatticeGaugeFieldD Umu_d(UGrid_d);
  LatticeGaugeFieldF Umu_f(UGrid_f);
  LatticeGaugeFieldD Umu_d_r(UGrid_d);
  LatticeGaugeFieldD Utmp_d(UGrid_d);
  for(int i=0;i<5;i++){
    random(RNG4, Umu_d);
    precisionChange(Umu_f, Umu_d);
    std::cout << GridLogMessage << "Norm of double-prec and single-prec gauge fields (should be ~equal): " << norm2(Umu_d) << " " << norm2(Umu_f) << std::endl;
    precisionChange(Umu_d_r, Umu_f);
    RealD normdiff = axpy_norm(Utmp_d, -1.0, Umu_d_r, Umu_d);
    std::cout << GridLogMessage << "Norm of difference of back-converted double-prec gauge fields (should be ~0) = " << normdiff << std::endl;
  }
  //Fermion fields
  LatticeFermionD psi_d(FGrid_d);
  LatticeFermionF psi_f(FGrid_f);
  LatticeFermionD psi_d_r(FGrid_d);
  LatticeFermionD psi_tmp_d(FGrid_d);
  for(int i=0;i<5;i++){
    random(RNG5, psi_d);
    precisionChange(psi_f, psi_d);
    std::cout << GridLogMessage << "Norm of double-prec and single-prec fermion fields (should be ~equal): " << norm2(psi_d) << " " << norm2(psi_f) << std::endl;
    precisionChange(psi_d_r, psi_f);
    RealD normdiff = axpy_norm(psi_tmp_d, -1.0, psi_d_r, psi_d);
    std::cout << GridLogMessage << "Norm of difference of back-converted double-prec fermion fields (should be ~0)= " << normdiff << std::endl;
  }
  //Checkerboarded fermion fields
  LatticeFermionD psi_cb_d(FrbGrid_d);
  LatticeFermionF psi_cb_f(FrbGrid_f);
  LatticeFermionD psi_cb_d_r(FrbGrid_d);
  LatticeFermionD psi_cb_tmp_d(FrbGrid_d);
  for(int i=0;i<5;i++){
    random(RNG5, psi_d);
    pickCheckerboard(Odd, psi_cb_d, psi_d);
    precisionChange(psi_cb_f, psi_cb_d);
    std::cout << GridLogMessage << "Norm of odd-cb double-prec and single-prec fermion fields (should be ~equal): " << norm2(psi_cb_d) << " " << norm2(psi_cb_f) << std::endl;
    precisionChange(psi_cb_d_r, psi_cb_f);
    RealD normdiff = axpy_norm(psi_cb_tmp_d, -1.0, psi_cb_d_r, psi_cb_d);
    std::cout << GridLogMessage << "Norm of difference of back-converted odd-cb double-prec fermion fields (should be ~0)= " << normdiff << std::endl;
    pickCheckerboard(Even, psi_cb_d, psi_d);
    precisionChange(psi_cb_f, psi_cb_d);
    std::cout << GridLogMessage << "Norm of even-cb double-prec and single-prec fermion fields (should be ~equal): " << norm2(psi_cb_d) << " " << norm2(psi_cb_f) << std::endl;
    precisionChange(psi_cb_d_r, psi_cb_f);
    normdiff = axpy_norm(psi_cb_tmp_d, -1.0, psi_cb_d_r, psi_cb_d);
    std::cout << GridLogMessage << "Norm of difference of back-converted even-cb double-prec fermion fields (should be ~0)= " << normdiff << std::endl;
  }
  Grid_finalize();
 }
--- a/Show More
+++ b/Show More