From 249b6e61ececb954c269574b5c9c99385e62443e Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Thu, 17 Dec 2020 14:09:00 -0500
Subject: [PATCH 01/41] For G-parity BCs the Nd-1 direction is now assumed to
 be the time direction and setting a twist in this direction will apply
 antiperiodic BCs Added option to run Test_gparity with antiperiodic time BCs

---
 Grid/qcd/action/fermion/GparityWilsonImpl.h | 74 +++++++++++++++++++--
 tests/core/Test_gparity.cc                  | 34 +++++++---
 2 files changed, 92 insertions(+), 16 deletions(-)
diff --git a/Grid/qcd/action/fermion/GparityWilsonImpl.h b/Grid/qcd/action/fermion/GparityWilsonImpl.h
index 9dca403b..dd5801a9 100644
--- a/Grid/qcd/action/fermion/GparityWilsonImpl.h
+++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h
@@ -30,6 +30,19 @@ directory
 
 NAMESPACE_BEGIN(Grid);
 
+
+/*
+  Policy implementation for G-parity boundary conditions
+
+  Rather than treating the gauge field as a flavored field, the Grid implementation of G-parity treats the gauge field as a regular
+  field with complex conjugate boundary conditions. In order to ensure the second flavor interacts with the conjugate links and the first
+  with the regular links we overload the functionality of doubleStore, whose purpose is to store the gauge field and the barrel-shifted gauge field
+  to avoid communicating links when applying the Dirac operator, such that the double-stored field contains also a flavor index which maps to
+  either the link or the conjugate link. This flavored field is then used by multLink to apply the correct link to a spinor.
+
+  Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
+  mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
+ */
 template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
 class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
 public:
@@ -113,7 +126,7 @@ public:
     || ((distance== 1)&&(icoor[direction]==1))
     || ((distance==-1)&&(icoor[direction]==0));
 
-    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world
+    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu] && mmu < Nd-1; //only if we are going around the world in a spatial direction
 
     //Apply the links
     int f_upper = permute_lane ? 1 : 0;
@@ -139,10 +152,10 @@ public:
     assert((distance == 1) || (distance == -1));  // nearest neighbour stencil hard code
     assert((sl == 1) || (sl == 2));
 
-    if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
-
+    //If this site is an global boundary site, perform the G-parity flavor twist
+    if ( mmu < Nd-1 && SE->_around_the_world && St.parameters.twists[mmu] ) {
       if ( sl == 2 ) {
-       
+	//Only do the twist for lanes on the edge of the physical node
 	ExtractBuffer<sobj> vals(Nsimd);
 
 	extract(chi,vals);
@@ -197,6 +210,19 @@ public:
     reg = memory;
   }
 
+
+  //Poke 'poke_f0' onto flavor 0 and 'poke_f1' onto flavor 1 in direction mu of the doubled gauge field Uds
+  inline void pokeGparityDoubledGaugeField(DoubledGaugeField &Uds, const GaugeLinkField &poke_f0, const GaugeLinkField &poke_f1, const int mu){
+    autoView(poke_f0_v, poke_f0, CpuRead);
+    autoView(poke_f1_v, poke_f1, CpuRead);
+    autoView(Uds_v, Uds, CpuWrite);
+    thread_foreach(ss,poke_f0_v,{
+	Uds_v[ss](0)(mu) = poke_f0_v[ss]();
+	Uds_v[ss](1)(mu) = poke_f1_v[ss]();
+      });
+  }
+    
+
   inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
   {
     conformable(Uds.Grid(),GaugeGrid);
@@ -207,14 +233,16 @@ public:
     GaugeLinkField Uconj(GaugeGrid);
    
     Lattice<iScalar<vInteger> > coor(GaugeGrid);
-        
-    for(int mu=0;mu<Nd;mu++){
-          
+
+    //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
+    //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs        
+    for(int mu=0;mu<Nd-1;mu++){
       LatticeCoordinate(coor,mu);
           
       U     = PeekIndex<LorentzIndex>(Umu,mu);
       Uconj = conjugate(U);
      
+      // Implement the isospin rotation sign on the boundary between f=1 and f=0
       // This phase could come from a simple bc 1,1,-1,1 ..
       int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
       if ( Params.twists[mu] ) { 
@@ -260,6 +288,38 @@ public:
         });
       }
     }
+
+    { //periodic / antiperiodic temporal BCs
+      int mu = Nd-1;
+      int L   = GaugeGrid->GlobalDimensions()[mu];
+      int Lmu = L - 1;
+
+      LatticeCoordinate(coor, mu);
+
+      U = PeekIndex<LorentzIndex>(Umu, mu); //Get t-directed links
+      
+      GaugeLinkField *Upoke = &U;
+
+      if(Params.twists[mu]){ //antiperiodic
+	Utmp =  where(coor == Lmu, -U, U);
+	Upoke = &Utmp;
+      }
+    
+      Uconj = conjugate(*Upoke); //second flavor interacts with conjugate links      
+      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu);
+
+      //Get the barrel-shifted field
+      Utmp = adj(Cshift(U, mu, -1)); //is a forward shift!
+      Upoke = &Utmp;
+
+      if(Params.twists[mu]){
+	U = where(coor == 0, -Utmp, Utmp);  //boundary phase
+	Upoke = &U;
+      }
+      
+      Uconj = conjugate(*Upoke);
+      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4);
+    }
   }
       
   inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
diff --git a/tests/core/Test_gparity.cc b/tests/core/Test_gparity.cc
index b2068901..026989f1 100644
--- a/tests/core/Test_gparity.cc
+++ b/tests/core/Test_gparity.cc
@@ -55,13 +55,17 @@ static_assert(same_vComplex == 1, "Dirac Operators must have same underlying SIM
 int main (int argc, char ** argv)
 {
   int nu = 0;
-
+  int tbc_aprd = 0; //use antiperiodic BCs in the time direction?
+  
   Grid_init(&argc,&argv);
 
   for(int i=1;i<argc;i++){
     if(std::string(argv[i]) == "--Gparity-dir"){
       std::stringstream ss; ss << argv[i+1]; ss >> nu;
       std::cout << GridLogMessage << "Set Gparity direction to " << nu << std::endl;
+    }else if(std::string(argv[i]) == "--Tbc-APRD"){
+      tbc_aprd = 1;
+      std::cout << GridLogMessage << "Using antiperiodic BCs in the time direction" << std::endl;
     }
   }
 
@@ -161,7 +165,12 @@ int main (int argc, char ** argv)
 
   RealD mass=0.0;
   RealD M5=1.8;
-  StandardDiracOp Ddwf(Umu_1f,*FGrid_1f,*FrbGrid_1f,*UGrid_1f,*UrbGrid_1f,mass,M5 DOP_PARAMS);
+
+  //Standard Dirac op
+  AcceleratorVector<Complex,4> bc_std(Nd, 1.0);
+  if(tbc_aprd) bc_std[Nd-1] = -1.; //antiperiodic time BC
+  StandardDiracOp::ImplParams std_params(bc_std);
+  StandardDiracOp Ddwf(Umu_1f,*FGrid_1f,*FrbGrid_1f,*UGrid_1f,*UrbGrid_1f,mass,M5 DOP_PARAMS, std_params);
 
   StandardFermionField    src_o_1f(FrbGrid_1f);
   StandardFermionField result_o_1f(FrbGrid_1f);
@@ -172,9 +181,11 @@ int main (int argc, char ** argv)
   ConjugateGradient<StandardFermionField> CG(1.0e-8,10000);
   CG(HermOpEO,src_o_1f,result_o_1f);
   
-  //  const int nu = 3;
+  //Gparity Dirac op
   std::vector<int> twists(Nd,0);
   twists[nu] = 1;
+  if(tbc_aprd) twists[Nd-1] = 1;
+
   GparityDiracOp::ImplParams params;
   params.twists = twists;
   GparityDiracOp GPDdwf(Umu_2f,*FGrid_2f,*FrbGrid_2f,*UGrid_2f,*UrbGrid_2f,mass,M5 DOP_PARAMS,params);
@@ -271,8 +282,11 @@ int main (int argc, char ** argv)
   std::cout << "2f cb "<<result_o_2f.Checkerboard()<<std::endl;
   std::cout << "1f cb "<<result_o_1f.Checkerboard()<<std::endl;
 
-  std::cout << " result norms " <<norm2(result_o_2f)<<" " <<norm2(result_o_1f)<<std::endl;
+  //Compare norms
+  std::cout << " result norms 2f: " <<norm2(result_o_2f)<<" 1f: " <<norm2(result_o_1f)<<std::endl;
 
+
+  //Take the 2f solution and convert into the corresponding 1f solution (odd cb only)
   StandardFermionField    res0o  (FrbGrid_2f); 
   StandardFermionField    res1o  (FrbGrid_2f); 
   StandardFermionField    res0  (FGrid_2f); 
@@ -281,14 +295,15 @@ int main (int argc, char ** argv)
   res0=Zero();
   res1=Zero();
 
-  res0o = PeekIndex<0>(result_o_2f,0);
-  res1o = PeekIndex<0>(result_o_2f,1);
+  res0o = PeekIndex<0>(result_o_2f,0); //flavor 0, odd cb
+  res1o = PeekIndex<0>(result_o_2f,1); //flavor 1, odd cb
 
   std::cout << "res cb "<<res0o.Checkerboard()<<std::endl;
   std::cout << "res cb "<<res1o.Checkerboard()<<std::endl;
 
-  setCheckerboard(res0,res0o);
-  setCheckerboard(res1,res1o);
+  //poke odd onto non-cb field
+  setCheckerboard(res0,res0o); 
+  setCheckerboard(res1,res1o); 
 
   StandardFermionField replica (FGrid_1f);
   StandardFermionField replica0(FGrid_1f);
@@ -296,12 +311,13 @@ int main (int argc, char ** argv)
   Replicate(res0,replica0);
   Replicate(res1,replica1);
 
+  //2nd half of doubled lattice has f=1
   replica = where( xcoor_1f5 >= Integer(L), replica1,replica0 );
 
   replica0 = Zero();
   setCheckerboard(replica0,result_o_1f);
 
-  std::cout << "Norm2 solutions is " <<norm2(replica)<<" "<< norm2(replica0)<<std::endl;
+  std::cout << "Norm2 solutions 1f reconstructed from 2f: " <<norm2(replica)<<" Actual 1f: "<< norm2(replica0)<<std::endl;
 
   replica = replica - replica0;
   

From a0ca3626904d6d98992a0dda3b914f5d7505a205 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Thu, 17 Dec 2020 16:21:58 -0500
Subject: [PATCH 02/41] Added an RHMC pseudofermion action,
 GeneralEvenOddRatioRationalPseudoFermionAction, that works for an arbitrary
 fractional power, not just a square root Added a test evolution for the
 above, Test_rhmc_EOWilsonRatioPowQuarter, demonstrating conservation of
 Hamiltonian Fixed HMC ignoring the MetropolisTest parameter of HMCparameters

---
 Grid/qcd/action/ActionParams.h                |  41 ++-
 Grid/qcd/action/pseudofermion/Bounds.h        |  51 +++
 .../GeneralEvenOddRationalRatio.h             | 304 ++++++++++++++++++
 Grid/qcd/action/pseudofermion/PseudoFermion.h |   1 +
 Grid/qcd/hmc/HMC.h                            |   2 +-
 .../hmc/Test_rhmc_EOWilsonRatioPowQuarter.cc  | 139 ++++++++
 6 files changed, 536 insertions(+), 2 deletions(-)
 create mode 100644 Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
 create mode 100644 tests/hmc/Test_rhmc_EOWilsonRatioPowQuarter.cc

diff --git a/Grid/qcd/action/ActionParams.h b/Grid/qcd/action/ActionParams.h
index 0e6a11c6..6374256e 100644
--- a/Grid/qcd/action/ActionParams.h
+++ b/Grid/qcd/action/ActionParams.h
@@ -36,7 +36,8 @@ NAMESPACE_BEGIN(Grid);
 
 // These can move into a params header and be given MacroMagic serialisation
 struct GparityWilsonImplParams {
-  Coordinate twists;
+  Coordinate twists; //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
+                     //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
   GparityWilsonImplParams() : twists(Nd, 0) {};
 };
   
@@ -85,6 +86,44 @@ struct StaggeredImplParams {
         precision(_precision),
         BoundsCheckFreq(_BoundsCheckFreq){};
   };
+
+
+  /*Action parameters for the generalized rational action
+    The approximation is for (M^dag M)^{1/inv_pow}
+    where inv_pow is the denominator of the fractional power.
+    Default inv_pow=2 for square root, making this equivalent to 
+    the OneFlavourRational action
+  */
+    struct RationalActionParams : Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(RationalActionParams, 
+				    int, inv_pow, 
+				    RealD, lo, 
+				    RealD, hi, 
+				    int,   MaxIter, 
+				    RealD, tolerance, 
+				    int,   degree, 
+				    int,   precision,
+				    int,   BoundsCheckFreq);
+  // constructor 
+  RationalActionParams(int _inv_pow = 2,
+		       RealD _lo      = 0.0, 
+		       RealD _hi      = 1.0, 
+		       int _maxit     = 1000,
+		       RealD tol      = 1.0e-8, 
+		       int _degree    = 10,
+		       int _precision = 64,
+		       int _BoundsCheckFreq=20)
+    : inv_pow(_inv_pow), 
+      lo(_lo),
+      hi(_hi),
+      MaxIter(_maxit),
+      tolerance(tol),
+      degree(_degree),
+      precision(_precision),
+      BoundsCheckFreq(_BoundsCheckFreq){};
+  };
+
+
   
 NAMESPACE_END(Grid);
 
diff --git a/Grid/qcd/action/pseudofermion/Bounds.h b/Grid/qcd/action/pseudofermion/Bounds.h
index 535e1a49..c428d5ee 100644
--- a/Grid/qcd/action/pseudofermion/Bounds.h
+++ b/Grid/qcd/action/pseudofermion/Bounds.h
@@ -48,5 +48,56 @@ NAMESPACE_BEGIN(Grid);
       assert( (std::sqrt(Nd/Nx)<tol) && " InverseSqrtBoundsCheck ");
     }
 
+    /* For a HermOp = M^dag M, check the approximation of  HermOp^{-1/inv_pow}
+       by computing   |X -    HermOp * [ Hermop^{-1/inv_pow} ]^{inv_pow} X|  < tol  
+       for noise X (aka GaussNoise).
+       ApproxNegPow should be the rational approximation for   X^{-1/inv_pow}
+    */
+    template<class Field> void InversePowerBoundsCheck(int inv_pow,
+						       int MaxIter,double tol,
+						       LinearOperatorBase<Field> &HermOp,
+						       Field &GaussNoise,
+						       MultiShiftFunction &ApproxNegPow) 
+    {
+      GridBase *FermionGrid = GaussNoise.Grid();
+
+      Field X(FermionGrid);
+      Field Y(FermionGrid);
+      Field Z(FermionGrid);
+
+      Field tmp1(FermionGrid), tmp2(FermionGrid);
+
+      X=GaussNoise;
+      RealD Nx = norm2(X);
+
+      ConjugateGradientMultiShift<Field> msCG(MaxIter,ApproxNegPow);
+
+      tmp1 = X;
+      
+      Field* in = &tmp1;
+      Field* out = &tmp2;
+      for(int i=0;i<inv_pow;i++){ //apply  [ Hermop^{-1/inv_pow}  ]^{inv_pow} X =   HermOp^{-1} X
+	msCG(HermOp, *in, *out); //backwards conventions!
+	if(i!=inv_pow-1) std::swap(in, out);
+      }
+      Z = *out;
+
+      RealD Nz = norm2(Z);
+
+      HermOp.HermOp(Z,Y);
+      RealD Ny = norm2(Y);
+
+      X=X-Y;
+      RealD Nd = norm2(X);
+      std::cout << "************************* "<<std::endl;
+      std::cout << " noise                         = "<<Nx<<std::endl;
+      std::cout << " (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise         = "<<Nz<<std::endl;
+      std::cout << " MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise   = "<<Ny<<std::endl;
+      std::cout << " noise - MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise   = "<<Nd<<std::endl;
+      std::cout << "************************* "<<std::endl;
+      assert( (std::sqrt(Nd/Nx)<tol) && " InversePowerBoundsCheck ");
+    }
+
+
 NAMESPACE_END(Grid);
 
diff --git a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
new file mode 100644
index 00000000..ecc140b4
--- /dev/null
+++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
@@ -0,0 +1,304 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
+
+    Copyright (C) 2015
+
+    Author: Christopher Kelly <ckelly@bnl.gov>
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_H
+#define QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_H
+
+NAMESPACE_BEGIN(Grid);
+
+    /////////////////////////////////////////////////////////
+    // Generic rational approximation for ratios of operators
+    /////////////////////////////////////////////////////////
+
+    /* S_f = -log( det(  [M^dag M]/[V^dag V] )^{1/inv_pow}  )
+           = chi^dag ( [M^dag M]/[V^dag V] )^{-1/inv_pow} chi\
+	   = chi^dag ( [V^dag V]^{-1/2} [M^dag M] [V^dag V]^{-1/2} )^{-1/inv_pow} chi\
+	   = chi^dag [V^dag V]^{1/(2*inv_pow)} [M^dag M]^{-1/inv_pow} [V^dag V]^{-1/(2*inv_pow)} chi\
+
+	   S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+    
+       BIG WARNING:	   
+       Here V^dag V is referred to in this code as the "numerator" operator and M^dag M is the *denominator* operator.
+       this refers to their position in the pseudofermion action, which is the *inverse* of what appears in the determinant
+       Thus for DWF the numerator operator is the Pauli-Villars operator
+
+       Here P/Q \sim R_{1/(2*inv_pow)}  ~ (V^dagV)^{1/(2*inv_pow)}  
+       Here N/D \sim R_{-1/inv_pow} ~ (M^dagM)^{-1/inv_pow}  
+    */
+      
+    template<class Impl>
+    class GeneralEvenOddRatioRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
+    public:
+
+      INHERIT_IMPL_TYPES(Impl);
+
+      typedef RationalActionParams Params;
+      Params param;
+
+      MultiShiftFunction ApproxPower   ;  //rational approx for X^{1/inv_pow}
+      MultiShiftFunction ApproxNegPower;  //rational approx for X^{-1/inv_pow}
+      MultiShiftFunction ApproxHalfPower;   //rational approx for X^{1/(2*inv_pow)}
+      MultiShiftFunction ApproxNegHalfPower; //rational approx for X^{-1/(2*inv_pow)}
+
+    private:
+     
+      FermionOperator<Impl> & NumOp;// the basic operator
+      FermionOperator<Impl> & DenOp;// the basic operator
+      FermionField PhiEven; // the pseudo fermion field for this trajectory
+      FermionField PhiOdd; // the pseudo fermion field for this trajectory
+
+    public:
+
+      GeneralEvenOddRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
+						     FermionOperator<Impl>  &_DenOp, 
+						     Params & p
+						     ) : 
+	NumOp(_NumOp), 
+	DenOp(_DenOp), 
+	PhiOdd (_NumOp.FermionRedBlackGrid()),
+	PhiEven(_NumOp.FermionRedBlackGrid()),
+	param(p) 
+      {
+	AlgRemez remez(param.lo,param.hi,param.precision);
+
+	int inv_pow = param.inv_pow;
+	int _2_inv_pow = 2*inv_pow;
+
+	// MdagM^(+- 1/inv_pow)
+	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/" << inv_pow << ")"<<std::endl;
+	remez.generateApprox(param.degree,1,inv_pow);
+	ApproxPower.Init(remez,param.tolerance,false);
+	ApproxNegPower.Init(remez,param.tolerance,true);
+
+	// VdagV^(+- 1/(2*inv_pow))
+	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/" << _2_inv_pow << ")"<<std::endl;
+	remez.generateApprox(param.degree,1,_2_inv_pow);
+   	ApproxHalfPower.Init(remez,param.tolerance,false);
+	ApproxNegHalfPower.Init(remez,param.tolerance,true);
+      };
+
+      virtual std::string action_name(){return "GeneralEvenOddRatioRationalPseudoFermionAction";}
+
+      virtual std::string LogParameters(){
+	std::stringstream sstream;
+	sstream << GridLogMessage << "["<<action_name()<<"] Power          : 1/" << param.inv_pow <<  std::endl;
+	sstream << GridLogMessage << "["<<action_name()<<"] Low            :" << param.lo <<  std::endl;
+	sstream << GridLogMessage << "["<<action_name()<<"] High           :" << param.hi <<  std::endl;
+	sstream << GridLogMessage << "["<<action_name()<<"] Max iterations :" << param.MaxIter <<  std::endl;
+	sstream << GridLogMessage << "["<<action_name()<<"] Tolerance      :" << param.tolerance <<  std::endl;
+	sstream << GridLogMessage << "["<<action_name()<<"] Degree         :" << param.degree <<  std::endl;
+	sstream << GridLogMessage << "["<<action_name()<<"] Precision      :" << param.precision <<  std::endl;
+	return sstream.str();
+      }
+      
+      
+      virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
+
+	// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+	//
+	// P(phi) = e^{- phi^dag (VdagV)^1/(2*inv_pow) (MdagM)^-1/inv_pow (VdagV)^1/(2*inv_pow) phi}
+	//        = e^{- phi^dag  (VdagV)^1/(2*inv_pow) (MdagM)^-1/(2*inv_pow) (MdagM)^-1/(2*inv_pow)  (VdagV)^1/(2*inv_pow) phi}
+	//
+	// Phi =  (VdagV)^-1/(2*inv_pow) Mdag^{1/(2*inv_pow)} eta 
+	//
+	// P(eta) = e^{- eta^dag eta}
+	//	
+	// General gaussian random draws from   e^{x^2/(2 sig^2)} => sig^2 = 0.5.
+	// 
+	// So eta should be of width sig = 1/sqrt(2).
+
+	RealD scale = std::sqrt(0.5);
+
+	FermionField eta(NumOp.FermionGrid());
+	FermionField etaOdd (NumOp.FermionRedBlackGrid());
+	FermionField etaEven(NumOp.FermionRedBlackGrid());
+	FermionField     tmp(NumOp.FermionRedBlackGrid());
+
+	gaussian(pRNG,eta);	eta=eta*scale;
+
+	pickCheckerboard(Even,etaEven,eta);
+	pickCheckerboard(Odd,etaOdd,eta);
+
+	NumOp.ImportGauge(U);
+	DenOp.ImportGauge(U);
+
+
+	// MdagM^1/(2*inv_pow) eta
+	SchurDifferentiableOperator<Impl> MdagM(DenOp);
+	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,ApproxHalfPower);
+	msCG_M(MdagM,etaOdd,tmp);
+
+	// VdagV^-1/(2*inv_pow) MdagM^1/(2*inv_pow) eta
+	SchurDifferentiableOperator<Impl> VdagV(NumOp);
+	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,ApproxNegHalfPower);
+	msCG_V(VdagV,tmp,PhiOdd);
+
+	assert(NumOp.ConstEE() == 1);
+	assert(DenOp.ConstEE() == 1);
+	PhiEven = Zero();
+	
+      };
+
+      //////////////////////////////////////////////////////
+      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+      //////////////////////////////////////////////////////
+      virtual RealD S(const GaugeField &U) {
+
+	NumOp.ImportGauge(U);
+	DenOp.ImportGauge(U);
+
+	FermionField X(NumOp.FermionRedBlackGrid());
+	FermionField Y(NumOp.FermionRedBlackGrid());
+
+	// VdagV^1/(2*inv_pow) Phi
+	SchurDifferentiableOperator<Impl> VdagV(NumOp);
+	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,ApproxHalfPower);
+	msCG_V(VdagV,PhiOdd,X);
+
+	// MdagM^-1/(2*inv_pow) VdagV^1/(2*inv_pow) Phi
+	SchurDifferentiableOperator<Impl> MdagM(DenOp);
+	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,ApproxNegHalfPower);
+	msCG_M(MdagM,X,Y);
+
+	// Randomly apply rational bounds checks.
+	if ( (rand()%param.BoundsCheckFreq)==0 ) { 
+	  FermionField gauss(NumOp.FermionRedBlackGrid());
+	  gauss = PhiOdd;
+	  HighBoundCheck(MdagM,gauss,param.hi);
+
+	  InversePowerBoundsCheck(param.inv_pow,param.MaxIter,param.tolerance*100,MdagM,gauss,ApproxNegPower);
+	}
+
+	//  Phidag VdagV^1/(2*inv_pow) MdagM^-1/(2*inv_pow)  MdagM^-1/(2*inv_pow) VdagV^1/(2*inv_pow) Phi
+	RealD action = norm2(Y);
+
+	return action;
+      };
+
+      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+      //
+      // Here, M is some 5D operator and V is the Pauli-Villars field
+      // N and D makeup the rat. poly of the M term and P and & makeup the rat.poly of the denom term
+      //
+      // Need  
+      // dS_f/dU =  chi^dag d[P/Q]  N/D   P/Q  chi 
+      //         +  chi^dag   P/Q d[N/D]  P/Q  chi 
+      //         +  chi^dag   P/Q   N/D d[P/Q] chi 
+      //
+      // P/Q is expressed as partial fraction expansion: 
+      // 
+      //           a0 + \sum_k ak/(V^dagV + bk) 
+      //  
+      // d[P/Q] is then  
+      //
+      //          \sum_k -ak [V^dagV+bk]^{-1}  [ dV^dag V + V^dag dV ] [V^dag V + bk]^{-1} 
+      //  
+      // and similar for N/D. 
+      // 
+      // Need   
+      //       MpvPhi_k   = [Vdag V + bk]^{-1} chi  
+      //       MpvPhi     = {a0 +  \sum_k ak [Vdag V + bk]^{-1} }chi   
+      //   
+      //       MfMpvPhi_k = [MdagM+bk]^{-1} MpvPhi  
+      //       MfMpvPhi   = {a0 +  \sum_k ak [Mdag M + bk]^{-1} } MpvPhi
+      // 
+      //       MpvMfMpvPhi_k = [Vdag V + bk]^{-1} MfMpvchi   
+      //  
+
+      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
+
+	const int n_f  = ApproxNegPower.poles.size();
+	const int n_pv = ApproxHalfPower.poles.size();
+
+	std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionRedBlackGrid());
+	std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionRedBlackGrid());
+	std::vector<FermionField> MfMpvPhi_k   (n_f ,NumOp.FermionRedBlackGrid());
+
+	FermionField      MpvPhi(NumOp.FermionRedBlackGrid());
+	FermionField    MfMpvPhi(NumOp.FermionRedBlackGrid());
+	FermionField MpvMfMpvPhi(NumOp.FermionRedBlackGrid());
+	FermionField           Y(NumOp.FermionRedBlackGrid());
+
+	GaugeField   tmp(NumOp.GaugeGrid());
+
+	NumOp.ImportGauge(U);
+	DenOp.ImportGauge(U);
+
+	SchurDifferentiableOperator<Impl> VdagV(NumOp);
+	SchurDifferentiableOperator<Impl> MdagM(DenOp);
+
+	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,ApproxHalfPower);
+	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,ApproxNegPower);
+
+	msCG_V(VdagV,PhiOdd,MpvPhi_k,MpvPhi);
+	msCG_M(MdagM,MpvPhi,MfMpvPhi_k,MfMpvPhi);
+	msCG_V(VdagV,MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi);
+
+	RealD ak;
+
+	dSdU = Zero();
+
+	// With these building blocks  
+	//  
+	//       dS/dU = 
+	//                 \sum_k -ak MfMpvPhi_k^dag      [ dM^dag M + M^dag dM ] MfMpvPhi_k         (1)
+	//             +   \sum_k -ak MpvMfMpvPhi_k^\dag  [ dV^dag V + V^dag dV ] MpvPhi_k           (2)
+	//                        -ak MpvPhi_k^dag        [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k      (3)
+
+	//(1)
+	for(int k=0;k<n_f;k++){
+	  ak = ApproxNegPower.residues[k];
+	  MdagM.Mpc(MfMpvPhi_k[k],Y);
+	  MdagM.MpcDagDeriv(tmp , MfMpvPhi_k[k], Y );  dSdU=dSdU+ak*tmp;
+	  MdagM.MpcDeriv(tmp , Y, MfMpvPhi_k[k] );  dSdU=dSdU+ak*tmp;
+	}
+	
+	//(2)
+	//(3)
+	for(int k=0;k<n_pv;k++){
+
+          ak = ApproxHalfPower.residues[k];
+	  
+	  VdagV.Mpc(MpvPhi_k[k],Y);
+	  VdagV.MpcDagDeriv(tmp,MpvMfMpvPhi_k[k],Y); dSdU=dSdU+ak*tmp;
+	  VdagV.MpcDeriv   (tmp,Y,MpvMfMpvPhi_k[k]);  dSdU=dSdU+ak*tmp;     
+	  
+	  VdagV.Mpc(MpvMfMpvPhi_k[k],Y);                // V as we take Ydag 
+	  VdagV.MpcDeriv   (tmp,Y, MpvPhi_k[k]); dSdU=dSdU+ak*tmp;
+	  VdagV.MpcDagDeriv(tmp,MpvPhi_k[k], Y); dSdU=dSdU+ak*tmp;
+
+	}
+
+	//dSdU = Ta(dSdU);
+
+      };
+    };
+
+NAMESPACE_END(Grid);
+
+#endif
diff --git a/Grid/qcd/action/pseudofermion/PseudoFermion.h b/Grid/qcd/action/pseudofermion/PseudoFermion.h
index 6674bdd6..2abcb216 100644
--- a/Grid/qcd/action/pseudofermion/PseudoFermion.h
+++ b/Grid/qcd/action/pseudofermion/PseudoFermion.h
@@ -41,6 +41,7 @@ directory
 #include <Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
+#include <Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h>
 
 #endif
diff --git a/Grid/qcd/hmc/HMC.h b/Grid/qcd/hmc/HMC.h
index f168b69a..9bf5261b 100644
--- a/Grid/qcd/hmc/HMC.h
+++ b/Grid/qcd/hmc/HMC.h
@@ -207,7 +207,7 @@ public:
       DeltaH = evolve_hmc_step(Ucopy);
       // Metropolis-Hastings test
       bool accept = true;
-      if (traj >= Params.StartTrajectory + Params.NoMetropolisUntil) {
+      if (Params.MetropolisTest && traj >= Params.StartTrajectory + Params.NoMetropolisUntil) {
         accept = metropolis_test(DeltaH);
       } else {
       	std::cout << GridLogMessage << "Skipping Metropolis test" << std::endl;
diff --git a/tests/hmc/Test_rhmc_EOWilsonRatioPowQuarter.cc b/tests/hmc/Test_rhmc_EOWilsonRatioPowQuarter.cc
new file mode 100644
index 00000000..3618e422
--- /dev/null
+++ b/tests/hmc/Test_rhmc_EOWilsonRatioPowQuarter.cc
@@ -0,0 +1,139 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_rhmc_EOWilsonRatio.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+//This test is for the Wilson action with the determinant det( M^dag M)^1/4
+//testing the generic RHMC 
+
+int main(int argc, char **argv) {
+  using namespace Grid;
+   ;
+
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  // here make a routine to print all the relevant information on the run
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+   // Typedefs to simplify notation
+  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;  // Uses the default minimum norm
+  typedef WilsonImplR FermionImplPolicy;
+  typedef WilsonFermionR FermionAction;
+  typedef typename FermionAction::FermionField FermionField;
+
+
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  HMCWrapper TheHMC;
+
+  // Grid from the command line
+  TheHMC.Resources.AddFourDimGrid("gauge");
+
+  // Checkpointer definition
+  CheckpointerParameters CPparams;  
+  CPparams.config_prefix = "ckpoint_lat";
+  CPparams.rng_prefix = "ckpoint_rng";
+  CPparams.saveInterval = 5;
+  CPparams.format = "IEEE64BIG";
+  
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  // Construct observables
+  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+  //////////////////////////////////////////////
+
+  /////////////////////////////////////////////////////////////
+  // Collect actions, here use more encapsulation
+  // need wrappers of the fermionic classes 
+  // that have a complex construction
+  // standard
+  RealD beta = 5.6 ;
+  WilsonGaugeActionR Waction(beta);
+    
+  auto GridPtr = TheHMC.Resources.GetCartesian();
+  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
+
+  // temporarily need a gauge field
+  LatticeGaugeField U(GridPtr);
+
+  Real mass = -0.77;
+  Real pv   = 0.0;
+
+  // Can we define an overloaded operator that does not need U and initialises
+  // it with zeroes?
+  FermionAction DenOp(U, *GridPtr, *GridRBPtr, mass);
+  FermionAction NumOp(U, *GridPtr, *GridRBPtr, pv);
+
+
+  // 1/2+1/2 flavour
+  // RationalActionParams(int _inv_pow = 2,
+  // 		       RealD _lo      = 0.0, 
+  // 		       RealD _hi      = 1.0, 
+  // 		       int _maxit     = 1000,
+  // 		       RealD tol      = 1.0e-8, 
+  // 		       int _degree    = 10,
+  // 		       int _precision = 64,
+  // 		       int _BoundsCheckFreq=20)
+
+
+  int inv_pow = 4;
+  RationalActionParams Params(inv_pow,1.0e-2,64.0,1000,1.0e-6,14,64,1);
+
+  GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> RHMC(NumOp,DenOp,Params);
+
+    // Collect actions
+  ActionLevel<HMCWrapper::Field> Level1(1);
+  Level1.push_back(&RHMC);
+
+  ActionLevel<HMCWrapper::Field> Level2(4);
+  Level2.push_back(&Waction);
+
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+  /////////////////////////////////////////////////////////////
+
+  // HMC parameters are serialisable 
+  TheHMC.Parameters.MD.MDsteps = 20;
+  TheHMC.Parameters.MD.trajL   = 1.0;
+
+  TheHMC.ReadCommandLine(argc, argv); // these can be parameters from file
+  TheHMC.Run();
+
+  Grid_finalize();
+
+} // main
+
+
+
+
+

From ba5dc670a53d3cb0cf140d905bc5b8e97512880c Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Tue, 22 Dec 2020 10:10:07 -0500
Subject: [PATCH 03/41] Reimplemented GparityWilsonImpl::InsertForce5D to run
 efficiently on GPUs Swapped order of templated tensor code and c-number
 specializations in Tensor_outer.h to fix compile issue with type deduction on
 Summit

---
 Grid/qcd/action/fermion/GparityWilsonImpl.h | 50 +++++++++++++--------
 Grid/tensors/Tensor_outer.h                 | 21 ++++-----
 2 files changed, 42 insertions(+), 29 deletions(-)

diff --git a/Grid/qcd/action/fermion/GparityWilsonImpl.h b/Grid/qcd/action/fermion/GparityWilsonImpl.h
index dd5801a9..f8ae7a9f 100644
--- a/Grid/qcd/action/fermion/GparityWilsonImpl.h
+++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h
@@ -30,7 +30,6 @@ directory
 
 NAMESPACE_BEGIN(Grid);
 
-
 /*
   Policy implementation for G-parity boundary conditions
 
@@ -358,28 +357,41 @@ public:
   inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
     assert(0);
   }
-  
+ 
   inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
-
-    int Ls = Btilde.Grid()->_fdimensions[0];
-        
-    GaugeLinkField tmp(mat.Grid());
-    tmp = Zero();
+    int Ls=Btilde.Grid()->_fdimensions[0];
+    
     {
-      autoView( tmp_v , tmp, CpuWrite);
-      autoView( Atilde_v , Atilde, CpuRead);
-      autoView( Btilde_v , Btilde, CpuRead);
-      thread_for(ss,tmp.Grid()->oSites(),{
-	  for (int s = 0; s < Ls; s++) {
-	    int sF = s + Ls * ss;
-	    auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
-	    tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
-	  }
-	});
+      autoView( mat_v , mat, AcceleratorWrite);
+      autoView( Btilde_v , Btilde, AcceleratorRead);
+      autoView( Atilde_v , Atilde, AcceleratorRead);
+      accelerator_for(sss,mat.Grid()->oSites(), FermionField::vector_type::Nsimd(),{	  
+  	  int sU=sss;
+  	  typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
+  	  ColorMatrixType sum;
+  	  zeroit(sum);
+  	  for(int s=0;s<Ls;s++){
+  	    int sF = s+Ls*sU;
+  	    for(int spn=0;spn<Ns;spn++){ //sum over spin
+  	      //Flavor 0
+  	      auto bb = coalescedRead(Btilde_v[sF](0)(spn) ); //color vector
+  	      auto aa = coalescedRead(Atilde_v[sF](0)(spn) );
+  	      sum = sum + outerProduct(bb,aa);
+
+  	      //Flavor 1
+  	      bb = coalescedRead(Btilde_v[sF](1)(spn) );
+  	      aa = coalescedRead(Atilde_v[sF](1)(spn) );
+  	      sum = sum + conjugate(outerProduct(bb,aa));
+  	    }
+  	  }	    
+  	  coalescedWrite(mat_v[sU](mu)(), sum);
+  	});
     }
-    PokeIndex<LorentzIndex>(mat, tmp, mu);
-    return;
   }
+
+
+  
+
   
 };
 
diff --git a/Grid/tensors/Tensor_outer.h b/Grid/tensors/Tensor_outer.h
index 4902c22f..0fad84b1 100644
--- a/Grid/tensors/Tensor_outer.h
+++ b/Grid/tensors/Tensor_outer.h
@@ -35,6 +35,17 @@ NAMESPACE_BEGIN(Grid);
 //              Vector x Vector -> Matrix
 ///////////////////////////////////////////////////////////////////////////////////////
 
+template<class CC,IfComplex<CC> = 0>
+accelerator_inline CC outerProduct(const CC &l, const CC& r)
+{
+  return l*conj(r);
+}
+template<class RR,IfReal<RR> = 0>
+accelerator_inline RR outerProduct(const RR &l, const RR& r)
+{
+  return l*r;
+}
+
 template<class l,class r,int N> accelerator_inline
 auto outerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iMatrix<decltype(outerProduct(lhs._internal[0],rhs._internal[0])),N>
 {
@@ -57,16 +68,6 @@ auto outerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<declt
   return ret;
 }
 
-template<class CC,IfComplex<CC> = 0>
-accelerator_inline CC outerProduct(const CC &l, const CC& r)
-{
-  return l*conj(r);
-}
-template<class RR,IfReal<RR> = 0>
-accelerator_inline RR outerProduct(const RR &l, const RR& r)
-{
-  return l*r;
-}
 
 NAMESPACE_END(Grid);
 

From 220ad5e3ee8bf39e8bd403570914d600ff3e4c81 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Tue, 22 Dec 2020 11:08:22 -0500
Subject: [PATCH 04/41] Added more verbose log output to
 GeneralEvenOddRatioRationalPseudoFermionAction In
 GeneralEvenOddRatioRationalPseudoFermionAction, setting the bounds check
 frequency to 0 now disables the check

---
 .../GeneralEvenOddRationalRatio.h             | 27 ++++++++++++++-----
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
index ecc140b4..8561bd36 100644
--- a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
@@ -84,6 +84,7 @@ NAMESPACE_BEGIN(Grid);
 	PhiEven(_NumOp.FermionRedBlackGrid()),
 	param(p) 
       {
+	std::cout<<GridLogMessage << action_name() << " initialize: starting" << std::endl;
 	AlgRemez remez(param.lo,param.hi,param.precision);
 
 	int inv_pow = param.inv_pow;
@@ -100,6 +101,7 @@ NAMESPACE_BEGIN(Grid);
 	remez.generateApprox(param.degree,1,_2_inv_pow);
    	ApproxHalfPower.Init(remez,param.tolerance,false);
 	ApproxNegHalfPower.Init(remez,param.tolerance,true);
+	std::cout<<GridLogMessage << action_name() << " initialize: complete" << std::endl;
       };
 
       virtual std::string action_name(){return "GeneralEvenOddRatioRationalPseudoFermionAction";}
@@ -132,6 +134,7 @@ NAMESPACE_BEGIN(Grid);
 	// 
 	// So eta should be of width sig = 1/sqrt(2).
 
+	std::cout<<GridLogMessage << action_name() << " refresh: starting" << std::endl;
 	RealD scale = std::sqrt(0.5);
 
 	FermionField eta(NumOp.FermionGrid());
@@ -149,11 +152,13 @@ NAMESPACE_BEGIN(Grid);
 
 
 	// MdagM^1/(2*inv_pow) eta
+	std::cout<<GridLogMessage << action_name() << " refresh: doing (M^dag M)^{1/" << 2*param.inv_pow << "} eta" << std::endl;
 	SchurDifferentiableOperator<Impl> MdagM(DenOp);
 	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,ApproxHalfPower);
 	msCG_M(MdagM,etaOdd,tmp);
 
 	// VdagV^-1/(2*inv_pow) MdagM^1/(2*inv_pow) eta
+	std::cout<<GridLogMessage << action_name() << " refresh: doing (V^dag V)^{-1/" << 2*param.inv_pow << "} ( (M^dag M)^{1/" << 2*param.inv_pow << "} eta)" << std::endl;
 	SchurDifferentiableOperator<Impl> VdagV(NumOp);
 	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,ApproxNegHalfPower);
 	msCG_V(VdagV,tmp,PhiOdd);
@@ -161,14 +166,14 @@ NAMESPACE_BEGIN(Grid);
 	assert(NumOp.ConstEE() == 1);
 	assert(DenOp.ConstEE() == 1);
 	PhiEven = Zero();
-	
+	std::cout<<GridLogMessage << action_name() << " refresh: starting" << std::endl;
       };
 
       //////////////////////////////////////////////////////
       // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
       //////////////////////////////////////////////////////
       virtual RealD S(const GaugeField &U) {
-
+	std::cout<<GridLogMessage << action_name() << " compute action: starting" << std::endl;
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
 
@@ -176,26 +181,29 @@ NAMESPACE_BEGIN(Grid);
 	FermionField Y(NumOp.FermionRedBlackGrid());
 
 	// VdagV^1/(2*inv_pow) Phi
+	std::cout<<GridLogMessage << action_name() << " compute action: doing (V^dag V)^{1/" << 2*param.inv_pow << "} Phi" << std::endl;
 	SchurDifferentiableOperator<Impl> VdagV(NumOp);
 	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,ApproxHalfPower);
 	msCG_V(VdagV,PhiOdd,X);
 
 	// MdagM^-1/(2*inv_pow) VdagV^1/(2*inv_pow) Phi
+	std::cout<<GridLogMessage << action_name() << " compute action: doing (M^dag M)^{-1/" << 2*param.inv_pow << "} ( (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
 	SchurDifferentiableOperator<Impl> MdagM(DenOp);
 	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,ApproxNegHalfPower);
 	msCG_M(MdagM,X,Y);
 
 	// Randomly apply rational bounds checks.
-	if ( (rand()%param.BoundsCheckFreq)==0 ) { 
+	if ( param.BoundsCheckFreq != 0 && (rand()%param.BoundsCheckFreq)==0 ) { 
+	  std::cout<<GridLogMessage << action_name() << " compute action: doing bounds check" << std::endl;
 	  FermionField gauss(NumOp.FermionRedBlackGrid());
 	  gauss = PhiOdd;
 	  HighBoundCheck(MdagM,gauss,param.hi);
-
 	  InversePowerBoundsCheck(param.inv_pow,param.MaxIter,param.tolerance*100,MdagM,gauss,ApproxNegPower);
 	}
 
 	//  Phidag VdagV^1/(2*inv_pow) MdagM^-1/(2*inv_pow)  MdagM^-1/(2*inv_pow) VdagV^1/(2*inv_pow) Phi
 	RealD action = norm2(Y);
+	std::cout<<GridLogMessage << action_name() << " compute action: complete" << std::endl;
 
 	return action;
       };
@@ -231,7 +239,7 @@ NAMESPACE_BEGIN(Grid);
       //  
 
       virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
-
+	std::cout<<GridLogMessage << action_name() << " deriv: starting" << std::endl;
 	const int n_f  = ApproxNegPower.poles.size();
 	const int n_pv = ApproxHalfPower.poles.size();
 
@@ -255,8 +263,11 @@ NAMESPACE_BEGIN(Grid);
 	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,ApproxHalfPower);
 	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,ApproxNegPower);
 
+	std::cout<<GridLogMessage << action_name() << " deriv: doing (V^dag V)^{1/" << 2*param.inv_pow << "} Phi" << std::endl;
 	msCG_V(VdagV,PhiOdd,MpvPhi_k,MpvPhi);
+	std::cout<<GridLogMessage << action_name() << " deriv: doing (M^dag M)^{-1/" << param.inv_pow << "} ( (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
 	msCG_M(MdagM,MpvPhi,MfMpvPhi_k,MfMpvPhi);
+	std::cout<<GridLogMessage << action_name() << " deriv: doing (V^dag V)^{1/" << 2*param.inv_pow << "} ( (M^dag M)^{-1/" << param.inv_pow << "} (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
 	msCG_V(VdagV,MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi);
 
 	RealD ak;
@@ -270,7 +281,8 @@ NAMESPACE_BEGIN(Grid);
 	//             +   \sum_k -ak MpvMfMpvPhi_k^\dag  [ dV^dag V + V^dag dV ] MpvPhi_k           (2)
 	//                        -ak MpvPhi_k^dag        [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k      (3)
 
-	//(1)
+	//(1)	
+	std::cout<<GridLogMessage << action_name() << " deriv: doing dS/dU part (1)" << std::endl;
 	for(int k=0;k<n_f;k++){
 	  ak = ApproxNegPower.residues[k];
 	  MdagM.Mpc(MfMpvPhi_k[k],Y);
@@ -280,6 +292,7 @@ NAMESPACE_BEGIN(Grid);
 	
 	//(2)
 	//(3)
+	std::cout<<GridLogMessage << action_name() << " deriv: doing dS/dU part (2)+(3)" << std::endl;
 	for(int k=0;k<n_pv;k++){
 
           ak = ApproxHalfPower.residues[k];
@@ -295,7 +308,7 @@ NAMESPACE_BEGIN(Grid);
 	}
 
 	//dSdU = Ta(dSdU);
-
+	std::cout<<GridLogMessage << action_name() << " deriv: complete" << std::endl;
       };
     };
 

From 75c6c6b1732c36104421c2c9bdde79227e7967e4 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Wed, 23 Dec 2020 11:19:26 -0500
Subject: [PATCH 05/41] General RHMC pseudofermion action now allows for
 different rational approximations to be used in the MD and action evaluation

---
 Grid/qcd/action/ActionParams.h                |  28 +++--
 .../GeneralEvenOddRationalRatio.h             | 101 ++++++++++++------
 2 files changed, 88 insertions(+), 41 deletions(-)

diff --git a/Grid/qcd/action/ActionParams.h b/Grid/qcd/action/ActionParams.h
index 6374256e..12d2b91b 100644
--- a/Grid/qcd/action/ActionParams.h
+++ b/Grid/qcd/action/ActionParams.h
@@ -97,28 +97,34 @@ struct StaggeredImplParams {
     struct RationalActionParams : Serializable {
     GRID_SERIALIZABLE_CLASS_MEMBERS(RationalActionParams, 
 				    int, inv_pow, 
-				    RealD, lo, 
-				    RealD, hi, 
-				    int,   MaxIter, 
-				    RealD, tolerance, 
-				    int,   degree, 
-				    int,   precision,
-				    int,   BoundsCheckFreq);
+				    RealD, lo, //low eigenvalue bound of rational approx
+				    RealD, hi, //high eigenvalue bound of rational approx
+				    int,   MaxIter,  //maximum iterations in msCG
+				    RealD, action_tolerance,  //msCG tolerance in action evaluation
+				    int,   action_degree, //rational approx tolerance in action evaluation
+				    RealD, md_tolerance,  //msCG tolerance in MD integration
+				    int,   md_degree, //rational approx tolerance in MD integration
+				    int,   precision, //precision of floating point arithmetic
+				    int,   BoundsCheckFreq); //frequency the approximation is tested (with Metropolis degree/tolerance); 0 disables the check
   // constructor 
   RationalActionParams(int _inv_pow = 2,
 		       RealD _lo      = 0.0, 
 		       RealD _hi      = 1.0, 
 		       int _maxit     = 1000,
-		       RealD tol      = 1.0e-8, 
-		       int _degree    = 10,
+		       RealD _action_tolerance      = 1.0e-8, 
+		       int _action_degree    = 10,
+		       RealD _md_tolerance      = 1.0e-8, 
+		       int _md_degree    = 10,
 		       int _precision = 64,
 		       int _BoundsCheckFreq=20)
     : inv_pow(_inv_pow), 
       lo(_lo),
       hi(_hi),
       MaxIter(_maxit),
-      tolerance(tol),
-      degree(_degree),
+      action_tolerance(_action_tolerance),
+      action_degree(_action_degree),
+      md_tolerance(_md_tolerance),
+      md_degree(_md_degree),
       precision(_precision),
       BoundsCheckFreq(_BoundsCheckFreq){};
   };
diff --git a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
index 8561bd36..af64fba7 100644
--- a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
@@ -60,10 +60,17 @@ NAMESPACE_BEGIN(Grid);
       typedef RationalActionParams Params;
       Params param;
 
-      MultiShiftFunction ApproxPower   ;  //rational approx for X^{1/inv_pow}
-      MultiShiftFunction ApproxNegPower;  //rational approx for X^{-1/inv_pow}
-      MultiShiftFunction ApproxHalfPower;   //rational approx for X^{1/(2*inv_pow)}
-      MultiShiftFunction ApproxNegHalfPower; //rational approx for X^{-1/(2*inv_pow)}
+      //For action evaluation
+      MultiShiftFunction ApproxPowerAction   ;  //rational approx for X^{1/inv_pow}
+      MultiShiftFunction ApproxNegPowerAction;  //rational approx for X^{-1/inv_pow}
+      MultiShiftFunction ApproxHalfPowerAction;   //rational approx for X^{1/(2*inv_pow)}
+      MultiShiftFunction ApproxNegHalfPowerAction; //rational approx for X^{-1/(2*inv_pow)}
+
+      //For the MD integration
+      MultiShiftFunction ApproxPowerMD   ;  //rational approx for X^{1/inv_pow}
+      MultiShiftFunction ApproxNegPowerMD;  //rational approx for X^{-1/inv_pow}
+      MultiShiftFunction ApproxHalfPowerMD;   //rational approx for X^{1/(2*inv_pow)}
+      MultiShiftFunction ApproxNegHalfPowerMD; //rational approx for X^{-1/(2*inv_pow)}
 
     private:
      
@@ -90,17 +97,49 @@ NAMESPACE_BEGIN(Grid);
 	int inv_pow = param.inv_pow;
 	int _2_inv_pow = 2*inv_pow;
 
+	//Generate approximations for action eval
+
 	// MdagM^(+- 1/inv_pow)
-	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/" << inv_pow << ")"<<std::endl;
-	remez.generateApprox(param.degree,1,inv_pow);
-	ApproxPower.Init(remez,param.tolerance,false);
-	ApproxNegPower.Init(remez,param.tolerance,true);
+	std::cout<<GridLogMessage << "Generating degree "<<param.action_degree<<" and tolerance " << param.action_tolerance << " for x^(1/" << inv_pow << ")"<<std::endl;
+	remez.generateApprox(param.action_degree,1,inv_pow);
+	ApproxPowerAction.Init(remez,param.action_tolerance,false);
+	ApproxNegPowerAction.Init(remez,param.action_tolerance,true);
 
 	// VdagV^(+- 1/(2*inv_pow))
-	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/" << _2_inv_pow << ")"<<std::endl;
-	remez.generateApprox(param.degree,1,_2_inv_pow);
-   	ApproxHalfPower.Init(remez,param.tolerance,false);
-	ApproxNegHalfPower.Init(remez,param.tolerance,true);
+	std::cout<<GridLogMessage << "Generating degree "<<param.action_degree<<" and tolerance " << param.action_tolerance <<" for x^(1/" << _2_inv_pow << ")"<<std::endl;
+	remez.generateApprox(param.action_degree,1,_2_inv_pow);
+   	ApproxHalfPowerAction.Init(remez,param.action_tolerance,false);
+	ApproxNegHalfPowerAction.Init(remez,param.action_tolerance,true);
+
+	//Generate approximations for MD
+	if(param.md_degree != param.action_degree || 
+	   param.md_tolerance < param.action_tolerance  //no point in finding less precise polynomial if the degree is the same
+	   ){
+	  // MdagM^(+- 1/inv_pow)
+	  std::cout<<GridLogMessage << "Generating degree "<<param.md_degree<<" and tolerance " << param.md_tolerance <<" for x^(1/" << inv_pow << ")"<<std::endl;
+	  remez.generateApprox(param.md_degree,1,inv_pow);
+	  ApproxPowerMD.Init(remez,param.md_tolerance,false);
+	  ApproxNegPowerMD.Init(remez,param.md_tolerance,true);
+
+	  // VdagV^(+- 1/(2*inv_pow))
+	  std::cout<<GridLogMessage << "Generating degree "<<param.md_degree<<" and tolerance " << param.md_tolerance <<" for x^(1/" << _2_inv_pow << ")"<<std::endl;
+	  remez.generateApprox(param.md_degree,1,_2_inv_pow);
+	  ApproxHalfPowerMD.Init(remez,param.md_tolerance,false);
+	  ApproxNegHalfPowerMD.Init(remez,param.md_tolerance,true);
+	}else{
+	  std::cout<<GridLogMessage << "Using same rational approximations for MD as for action evaluation" << std::endl;
+	  ApproxPowerMD = ApproxPowerAction; 
+	  ApproxNegPowerMD = ApproxNegPowerAction;
+	  for(int i=0;i<ApproxPowerMD.tolerances.size();i++)
+	    ApproxNegPowerMD.tolerances[i] = ApproxPowerMD.tolerances[i] = param.md_tolerance; //used for multishift
+
+	  ApproxHalfPowerMD = ApproxHalfPowerAction;
+	  ApproxNegHalfPowerMD = ApproxNegHalfPowerAction;
+
+	  for(int i=0;i<ApproxPowerMD.tolerances.size();i++)
+	    ApproxNegHalfPowerMD.tolerances[i] = ApproxHalfPowerMD.tolerances[i] = param.md_tolerance;
+	}
+
 	std::cout<<GridLogMessage << action_name() << " initialize: complete" << std::endl;
       };
 
@@ -108,13 +147,15 @@ NAMESPACE_BEGIN(Grid);
 
       virtual std::string LogParameters(){
 	std::stringstream sstream;
-	sstream << GridLogMessage << "["<<action_name()<<"] Power          : 1/" << param.inv_pow <<  std::endl;
-	sstream << GridLogMessage << "["<<action_name()<<"] Low            :" << param.lo <<  std::endl;
-	sstream << GridLogMessage << "["<<action_name()<<"] High           :" << param.hi <<  std::endl;
-	sstream << GridLogMessage << "["<<action_name()<<"] Max iterations :" << param.MaxIter <<  std::endl;
-	sstream << GridLogMessage << "["<<action_name()<<"] Tolerance      :" << param.tolerance <<  std::endl;
-	sstream << GridLogMessage << "["<<action_name()<<"] Degree         :" << param.degree <<  std::endl;
-	sstream << GridLogMessage << "["<<action_name()<<"] Precision      :" << param.precision <<  std::endl;
+	sstream << GridLogMessage << "["<<action_name()<<"] Power              : 1/" << param.inv_pow <<  std::endl;
+	sstream << GridLogMessage << "["<<action_name()<<"] Low                :" << param.lo <<  std::endl;
+	sstream << GridLogMessage << "["<<action_name()<<"] High               :" << param.hi <<  std::endl;
+	sstream << GridLogMessage << "["<<action_name()<<"] Max iterations     :" << param.MaxIter <<  std::endl;
+	sstream << GridLogMessage << "["<<action_name()<<"] Tolerance (Action) :" << param.action_tolerance <<  std::endl;
+	sstream << GridLogMessage << "["<<action_name()<<"] Degree (Action)    :" << param.action_degree <<  std::endl;
+	sstream << GridLogMessage << "["<<action_name()<<"] Tolerance (MD)     :" << param.md_tolerance <<  std::endl;
+	sstream << GridLogMessage << "["<<action_name()<<"] Degree (MD)        :" << param.md_degree <<  std::endl;
+	sstream << GridLogMessage << "["<<action_name()<<"] Precision          :" << param.precision <<  std::endl;
 	return sstream.str();
       }
       
@@ -154,13 +195,13 @@ NAMESPACE_BEGIN(Grid);
 	// MdagM^1/(2*inv_pow) eta
 	std::cout<<GridLogMessage << action_name() << " refresh: doing (M^dag M)^{1/" << 2*param.inv_pow << "} eta" << std::endl;
 	SchurDifferentiableOperator<Impl> MdagM(DenOp);
-	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,ApproxHalfPower);
+	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,ApproxHalfPowerAction);
 	msCG_M(MdagM,etaOdd,tmp);
 
 	// VdagV^-1/(2*inv_pow) MdagM^1/(2*inv_pow) eta
 	std::cout<<GridLogMessage << action_name() << " refresh: doing (V^dag V)^{-1/" << 2*param.inv_pow << "} ( (M^dag M)^{1/" << 2*param.inv_pow << "} eta)" << std::endl;
 	SchurDifferentiableOperator<Impl> VdagV(NumOp);
-	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,ApproxNegHalfPower);
+	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,ApproxNegHalfPowerAction);
 	msCG_V(VdagV,tmp,PhiOdd);
 
 	assert(NumOp.ConstEE() == 1);
@@ -183,13 +224,13 @@ NAMESPACE_BEGIN(Grid);
 	// VdagV^1/(2*inv_pow) Phi
 	std::cout<<GridLogMessage << action_name() << " compute action: doing (V^dag V)^{1/" << 2*param.inv_pow << "} Phi" << std::endl;
 	SchurDifferentiableOperator<Impl> VdagV(NumOp);
-	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,ApproxHalfPower);
+	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,ApproxHalfPowerAction);
 	msCG_V(VdagV,PhiOdd,X);
 
 	// MdagM^-1/(2*inv_pow) VdagV^1/(2*inv_pow) Phi
 	std::cout<<GridLogMessage << action_name() << " compute action: doing (M^dag M)^{-1/" << 2*param.inv_pow << "} ( (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
 	SchurDifferentiableOperator<Impl> MdagM(DenOp);
-	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,ApproxNegHalfPower);
+	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,ApproxNegHalfPowerAction);
 	msCG_M(MdagM,X,Y);
 
 	// Randomly apply rational bounds checks.
@@ -198,7 +239,7 @@ NAMESPACE_BEGIN(Grid);
 	  FermionField gauss(NumOp.FermionRedBlackGrid());
 	  gauss = PhiOdd;
 	  HighBoundCheck(MdagM,gauss,param.hi);
-	  InversePowerBoundsCheck(param.inv_pow,param.MaxIter,param.tolerance*100,MdagM,gauss,ApproxNegPower);
+	  InversePowerBoundsCheck(param.inv_pow,param.MaxIter,param.action_tolerance*100,MdagM,gauss,ApproxNegPowerAction);
 	}
 
 	//  Phidag VdagV^1/(2*inv_pow) MdagM^-1/(2*inv_pow)  MdagM^-1/(2*inv_pow) VdagV^1/(2*inv_pow) Phi
@@ -240,8 +281,8 @@ NAMESPACE_BEGIN(Grid);
 
       virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
 	std::cout<<GridLogMessage << action_name() << " deriv: starting" << std::endl;
-	const int n_f  = ApproxNegPower.poles.size();
-	const int n_pv = ApproxHalfPower.poles.size();
+	const int n_f  = ApproxNegPowerMD.poles.size();
+	const int n_pv = ApproxHalfPowerMD.poles.size();
 
 	std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionRedBlackGrid());
 	std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionRedBlackGrid());
@@ -260,8 +301,8 @@ NAMESPACE_BEGIN(Grid);
 	SchurDifferentiableOperator<Impl> VdagV(NumOp);
 	SchurDifferentiableOperator<Impl> MdagM(DenOp);
 
-	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,ApproxHalfPower);
-	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,ApproxNegPower);
+	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,ApproxHalfPowerMD);
+	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,ApproxNegPowerMD);
 
 	std::cout<<GridLogMessage << action_name() << " deriv: doing (V^dag V)^{1/" << 2*param.inv_pow << "} Phi" << std::endl;
 	msCG_V(VdagV,PhiOdd,MpvPhi_k,MpvPhi);
@@ -284,7 +325,7 @@ NAMESPACE_BEGIN(Grid);
 	//(1)	
 	std::cout<<GridLogMessage << action_name() << " deriv: doing dS/dU part (1)" << std::endl;
 	for(int k=0;k<n_f;k++){
-	  ak = ApproxNegPower.residues[k];
+	  ak = ApproxNegPowerMD.residues[k];
 	  MdagM.Mpc(MfMpvPhi_k[k],Y);
 	  MdagM.MpcDagDeriv(tmp , MfMpvPhi_k[k], Y );  dSdU=dSdU+ak*tmp;
 	  MdagM.MpcDeriv(tmp , Y, MfMpvPhi_k[k] );  dSdU=dSdU+ak*tmp;
@@ -295,7 +336,7 @@ NAMESPACE_BEGIN(Grid);
 	std::cout<<GridLogMessage << action_name() << " deriv: doing dS/dU part (2)+(3)" << std::endl;
 	for(int k=0;k<n_pv;k++){
 
-          ak = ApproxHalfPower.residues[k];
+          ak = ApproxHalfPowerMD.residues[k];
 	  
 	  VdagV.Mpc(MpvPhi_k[k],Y);
 	  VdagV.MpcDagDeriv(tmp,MpvMfMpvPhi_k[k],Y); dSdU=dSdU+ak*tmp;

From 813d4cd9004ca1dcbfe426fcc7218fb23c8f347f Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Wed, 23 Dec 2020 16:01:42 -0500
Subject: [PATCH 06/41] Added test program that ensures the generic
 checkerboarded RHMC (with parameters set appropriately) gives the same answer
 as the existing 1f code

---
 ...t_rhmc_EOWilsonRatio_genericVsOneFlavor.cc | 122 ++++++++++++++++++
 1 file changed, 122 insertions(+)
 create mode 100644 tests/hmc/Test_rhmc_EOWilsonRatio_genericVsOneFlavor.cc

diff --git a/tests/hmc/Test_rhmc_EOWilsonRatio_genericVsOneFlavor.cc b/tests/hmc/Test_rhmc_EOWilsonRatio_genericVsOneFlavor.cc
new file mode 100644
index 00000000..6ee0e7c5
--- /dev/null
+++ b/tests/hmc/Test_rhmc_EOWilsonRatio_genericVsOneFlavor.cc
@@ -0,0 +1,122 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_rhmc_EOWilsonRatio_genericVsOneFlavor.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Christopher Kelly <ckelly@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+//This test ensures that the OneFlavourEvenOddRatioRationalPseudoFermionAction and GeneralEvenOddRatioRationalPseudoFermionAction action (with parameters set appropriately0
+//give the same results
+
+int main(int argc, char **argv) {
+  using namespace Grid;
+
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;  // Uses the default minimum norm
+  typedef WilsonImplR FermionImplPolicy;
+  typedef WilsonFermionR FermionAction;
+  typedef typename FermionAction::FermionField FermionField;
+
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  HMCWrapper TheHMC;
+  TheHMC.Resources.AddFourDimGrid("gauge");
+
+
+  // // Checkpointer definition
+  // CheckpointerParameters CPparams;  
+  // CPparams.config_prefix = "ckpoint_lat";
+  // CPparams.rng_prefix = "ckpoint_rng";
+  // CPparams.saveInterval = 5;
+  // CPparams.format = "IEEE64BIG";
+  
+  // TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  auto GridPtr = TheHMC.Resources.GetCartesian();
+  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
+
+  // temporarily need a gauge field
+  LatticeGaugeField U(GridPtr);
+
+  Real mass = -0.77;
+  Real pv   = 0.0;
+
+  FermionAction DenOp(U, *GridPtr, *GridRBPtr, mass);
+  FermionAction NumOp(U, *GridPtr, *GridRBPtr, pv);
+
+  TheHMC.Resources.AddRNGs();
+  PeriodicGimplR::HotConfiguration(TheHMC.Resources.GetParallelRNG(), U);
+
+  std::string seed_string = "the_seed";
+
+  //1f action
+  OneFlavourRationalParams OneFParams(1.0e-2,64.0,1000,1.0e-6,6); 
+
+  OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> OneF(NumOp,DenOp,OneFParams);
+  TheHMC.Resources.GetParallelRNG().SeedUniqueString(seed_string);
+  OneF.refresh(U, TheHMC.Resources.GetParallelRNG());    
+  RealD OneFS = OneF.S(U);
+  LatticeGaugeField OneFderiv(U);
+  OneF.deriv(U,OneFderiv);    
+  
+  //general action
+  RationalActionParams GenParams;
+  GenParams.inv_pow = 2;
+  GenParams.lo = OneFParams.lo;
+  GenParams.hi = OneFParams.hi;
+  GenParams.MaxIter = OneFParams.MaxIter;
+  GenParams.action_tolerance = GenParams.md_tolerance = OneFParams.tolerance;
+  GenParams.action_degree = GenParams.md_degree = OneFParams.degree;
+  GenParams.precision = OneFParams.precision;
+  GenParams.BoundsCheckFreq = OneFParams.BoundsCheckFreq;
+
+  GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> Gen(NumOp,DenOp,GenParams);
+  TheHMC.Resources.GetParallelRNG().SeedUniqueString(seed_string);
+  Gen.refresh(U, TheHMC.Resources.GetParallelRNG());    
+  RealD GenS = Gen.S(U);
+  LatticeGaugeField Genderiv(U);
+  Gen.deriv(U,Genderiv);   
+
+
+  //Compare
+  std::cout << "Action : " << OneFS << " " << GenS << " reldiff " << (OneFS - GenS)/OneFS << std::endl;
+  
+  LatticeGaugeField diff(U);
+  axpy(diff, -1.0, Genderiv, OneFderiv);
+  std::cout << "Norm of difference in deriv " << sqrt(norm2(diff)) << std::endl;
+
+  Grid_finalize();
+  return 0;
+}
+

From d185f2eaa7c55811db517caa8c783cfee04364e5 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Wed, 23 Dec 2020 16:26:10 -0500
Subject: [PATCH 07/41] OneFlavourEvenOddRatioRationalPseudoFermionAction now
 derives from GeneralEvenOddRatioRationalPseudoFermionAction, simply performs
 transcription of parameters

---
 .../GeneralEvenOddRationalRatio.h             |   2 +-
 .../OneFlavourEvenOddRationalRatio.h          | 251 ++----------------
 Grid/qcd/action/pseudofermion/PseudoFermion.h |   2 +-
 3 files changed, 20 insertions(+), 235 deletions(-)

diff --git a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
index af64fba7..a92b6f7d 100644
--- a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
@@ -83,7 +83,7 @@ NAMESPACE_BEGIN(Grid);
 
       GeneralEvenOddRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
 						     FermionOperator<Impl>  &_DenOp, 
-						     Params & p
+						     const Params & p
 						     ) : 
 	NumOp(_NumOp), 
 	DenOp(_DenOp), 
diff --git a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
index e5f0b602..1b36ae0f 100644
--- a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
@@ -40,246 +40,31 @@ NAMESPACE_BEGIN(Grid);
     // Here N/D \sim R_{-1/2} ~ (M^dagM)^{-1/2}  
   
     template<class Impl>
-    class OneFlavourEvenOddRatioRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
+    class OneFlavourEvenOddRatioRationalPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<Impl> {
     public:
-
-      INHERIT_IMPL_TYPES(Impl);
-
       typedef OneFlavourRationalParams Params;
-      Params param;
-
-      MultiShiftFunction PowerHalf   ;
-      MultiShiftFunction PowerNegHalf;
-      MultiShiftFunction PowerQuarter;
-      MultiShiftFunction PowerNegQuarter;
-
     private:
-     
-      FermionOperator<Impl> & NumOp;// the basic operator
-      FermionOperator<Impl> & DenOp;// the basic operator
-      FermionField PhiEven; // the pseudo fermion field for this trajectory
-      FermionField PhiOdd; // the pseudo fermion field for this trajectory
+      static RationalActionParams transcribe(const Params &in){
+	RationalActionParams out;
+	out.inv_pow = 2;
+	out.lo = in.lo;
+	out.hi = in.hi;
+	out.MaxIter = in.MaxIter;
+	out.action_tolerance = out.md_tolerance = in.tolerance;
+	out.action_degree = out.md_degree = in.degree;
+	out.precision = in.precision;
+	out.BoundsCheckFreq = in.BoundsCheckFreq;
+	return out;
+      }
 
     public:
-
       OneFlavourEvenOddRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
-					    FermionOperator<Impl>  &_DenOp, 
-					    Params & p
-					    ) : 
-      NumOp(_NumOp), 
-      DenOp(_DenOp), 
-      PhiOdd (_NumOp.FermionRedBlackGrid()),
-      PhiEven(_NumOp.FermionRedBlackGrid()),
-      param(p) 
-      {
-	AlgRemez remez(param.lo,param.hi,param.precision);
+							FermionOperator<Impl>  &_DenOp, 
+							const Params & p
+							) : 
+	GeneralEvenOddRatioRationalPseudoFermionAction<Impl>(_NumOp, _DenOp, transcribe(p)){}
 
-	// MdagM^(+- 1/2)
-	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
-	remez.generateApprox(param.degree,1,2);
-	PowerHalf.Init(remez,param.tolerance,false);
-	PowerNegHalf.Init(remez,param.tolerance,true);
-
-	// MdagM^(+- 1/4)
-	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
-	remez.generateApprox(param.degree,1,4);
-   	PowerQuarter.Init(remez,param.tolerance,false);
-	PowerNegQuarter.Init(remez,param.tolerance,true);
-      };
-
-      virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}
-
-      virtual std::string LogParameters(){
-	std::stringstream sstream;
-	sstream << GridLogMessage << "["<<action_name()<<"] Low            :" << param.lo <<  std::endl;
-	sstream << GridLogMessage << "["<<action_name()<<"] High           :" << param.hi <<  std::endl;
-	sstream << GridLogMessage << "["<<action_name()<<"] Max iterations :" << param.MaxIter <<  std::endl;
-	sstream << GridLogMessage << "["<<action_name()<<"] Tolerance      :" << param.tolerance <<  std::endl;
-	sstream << GridLogMessage << "["<<action_name()<<"] Degree         :" << param.degree <<  std::endl;
-	sstream << GridLogMessage << "["<<action_name()<<"] Precision      :" << param.precision <<  std::endl;
-	return sstream.str();
-      }
-      
-      
-      virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
-
-	// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
-	//
-	// P(phi) = e^{- phi^dag (VdagV)^1/4 (MdagM)^-1/2 (VdagV)^1/4 phi}
-	//        = e^{- phi^dag  (VdagV)^1/4 (MdagM)^-1/4 (MdagM)^-1/4  (VdagV)^1/4 phi}
-	//
-	// Phi =  (VdagV)^-1/4 Mdag^{1/4} eta 
-	//
-	// P(eta) = e^{- eta^dag eta}
-	//
-	// e^{x^2/2 sig^2} => sig^2 = 0.5.
-	// 
-	// So eta should be of width sig = 1/sqrt(2).
-
-	RealD scale = std::sqrt(0.5);
-
-	FermionField eta(NumOp.FermionGrid());
-	FermionField etaOdd (NumOp.FermionRedBlackGrid());
-	FermionField etaEven(NumOp.FermionRedBlackGrid());
-	FermionField     tmp(NumOp.FermionRedBlackGrid());
-
-	gaussian(pRNG,eta);	eta=eta*scale;
-
-	pickCheckerboard(Even,etaEven,eta);
-	pickCheckerboard(Odd,etaOdd,eta);
-
-	NumOp.ImportGauge(U);
-	DenOp.ImportGauge(U);
-
-
-	// MdagM^1/4 eta
-	SchurDifferentiableOperator<Impl> MdagM(DenOp);
-	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerQuarter);
-	msCG_M(MdagM,etaOdd,tmp);
-
-	// VdagV^-1/4 MdagM^1/4 eta
-	SchurDifferentiableOperator<Impl> VdagV(NumOp);
-	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerNegQuarter);
-	msCG_V(VdagV,tmp,PhiOdd);
-
-	assert(NumOp.ConstEE() == 1);
-	assert(DenOp.ConstEE() == 1);
-	PhiEven = Zero();
-	
-      };
-
-      //////////////////////////////////////////////////////
-      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
-      //////////////////////////////////////////////////////
-      virtual RealD S(const GaugeField &U) {
-
-	NumOp.ImportGauge(U);
-	DenOp.ImportGauge(U);
-
-	FermionField X(NumOp.FermionRedBlackGrid());
-	FermionField Y(NumOp.FermionRedBlackGrid());
-
-	// VdagV^1/4 Phi
-	SchurDifferentiableOperator<Impl> VdagV(NumOp);
-	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
-	msCG_V(VdagV,PhiOdd,X);
-
-	// MdagM^-1/4 VdagV^1/4 Phi
-	SchurDifferentiableOperator<Impl> MdagM(DenOp);
-	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegQuarter);
-	msCG_M(MdagM,X,Y);
-
-	// Randomly apply rational bounds checks.
-	if ( (rand()%param.BoundsCheckFreq)==0 ) { 
-	  FermionField gauss(NumOp.FermionRedBlackGrid());
-	  gauss = PhiOdd;
-	  HighBoundCheck(MdagM,gauss,param.hi);
-	  InverseSqrtBoundsCheck(param.MaxIter,param.tolerance*100,MdagM,gauss,PowerNegHalf);
-	}
-
-	//  Phidag VdagV^1/4 MdagM^-1/4  MdagM^-1/4 VdagV^1/4 Phi
-	RealD action = norm2(Y);
-
-	return action;
-      };
-
-      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
-      //
-      // Here, M is some 5D operator and V is the Pauli-Villars field
-      // N and D makeup the rat. poly of the M term and P and & makeup the rat.poly of the denom term
-      //
-      // Need  
-      // dS_f/dU =  chi^dag d[P/Q]  N/D   P/Q  chi 
-      //         +  chi^dag   P/Q d[N/D]  P/Q  chi 
-      //         +  chi^dag   P/Q   N/D d[P/Q] chi 
-      //
-      // P/Q is expressed as partial fraction expansion: 
-      // 
-      //           a0 + \sum_k ak/(V^dagV + bk) 
-      //  
-      // d[P/Q] is then  
-      //
-      //          \sum_k -ak [V^dagV+bk]^{-1}  [ dV^dag V + V^dag dV ] [V^dag V + bk]^{-1} 
-      //  
-      // and similar for N/D. 
-      // 
-      // Need   
-      //       MpvPhi_k   = [Vdag V + bk]^{-1} chi  
-      //       MpvPhi     = {a0 +  \sum_k ak [Vdag V + bk]^{-1} }chi   
-      //   
-      //       MfMpvPhi_k = [MdagM+bk]^{-1} MpvPhi  
-      //       MfMpvPhi   = {a0 +  \sum_k ak [Mdag M + bk]^{-1} } MpvPhi
-      // 
-      //       MpvMfMpvPhi_k = [Vdag V + bk]^{-1} MfMpvchi   
-      //  
-
-      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
-
-	const int n_f  = PowerNegHalf.poles.size();
-	const int n_pv = PowerQuarter.poles.size();
-
-	std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionRedBlackGrid());
-	std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionRedBlackGrid());
-	std::vector<FermionField> MfMpvPhi_k   (n_f ,NumOp.FermionRedBlackGrid());
-
-	FermionField      MpvPhi(NumOp.FermionRedBlackGrid());
-	FermionField    MfMpvPhi(NumOp.FermionRedBlackGrid());
-	FermionField MpvMfMpvPhi(NumOp.FermionRedBlackGrid());
-	FermionField           Y(NumOp.FermionRedBlackGrid());
-
-	GaugeField   tmp(NumOp.GaugeGrid());
-
-	NumOp.ImportGauge(U);
-	DenOp.ImportGauge(U);
-
-	SchurDifferentiableOperator<Impl> VdagV(NumOp);
-	SchurDifferentiableOperator<Impl> MdagM(DenOp);
-
-	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
-	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegHalf);
-
-	msCG_V(VdagV,PhiOdd,MpvPhi_k,MpvPhi);
-	msCG_M(MdagM,MpvPhi,MfMpvPhi_k,MfMpvPhi);
-	msCG_V(VdagV,MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi);
-
-	RealD ak;
-
-	dSdU = Zero();
-
-	// With these building blocks  
-	//  
-	//       dS/dU = 
-	//                 \sum_k -ak MfMpvPhi_k^dag      [ dM^dag M + M^dag dM ] MfMpvPhi_k         (1)
-	//             +   \sum_k -ak MpvMfMpvPhi_k^\dag  [ dV^dag V + V^dag dV ] MpvPhi_k           (2)
-	//                        -ak MpvPhi_k^dag        [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k      (3)
-
-	//(1)
-	for(int k=0;k<n_f;k++){
-	  ak = PowerNegHalf.residues[k];
-	  MdagM.Mpc(MfMpvPhi_k[k],Y);
-	  MdagM.MpcDagDeriv(tmp , MfMpvPhi_k[k], Y );  dSdU=dSdU+ak*tmp;
-	  MdagM.MpcDeriv(tmp , Y, MfMpvPhi_k[k] );  dSdU=dSdU+ak*tmp;
-	}
-	
-	//(2)
-	//(3)
-	for(int k=0;k<n_pv;k++){
-
-          ak = PowerQuarter.residues[k];
-	  
-	  VdagV.Mpc(MpvPhi_k[k],Y);
-	  VdagV.MpcDagDeriv(tmp,MpvMfMpvPhi_k[k],Y); dSdU=dSdU+ak*tmp;
-	  VdagV.MpcDeriv   (tmp,Y,MpvMfMpvPhi_k[k]);  dSdU=dSdU+ak*tmp;     
-	  
-	  VdagV.Mpc(MpvMfMpvPhi_k[k],Y);                // V as we take Ydag 
-	  VdagV.MpcDeriv   (tmp,Y, MpvPhi_k[k]); dSdU=dSdU+ak*tmp;
-	  VdagV.MpcDagDeriv(tmp,MpvPhi_k[k], Y); dSdU=dSdU+ak*tmp;
-
-	}
-
-	//dSdU = Ta(dSdU);
-
-      };
+      virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}      
     };
 
 NAMESPACE_END(Grid);
diff --git a/Grid/qcd/action/pseudofermion/PseudoFermion.h b/Grid/qcd/action/pseudofermion/PseudoFermion.h
index 2abcb216..aa8c99a3 100644
--- a/Grid/qcd/action/pseudofermion/PseudoFermion.h
+++ b/Grid/qcd/action/pseudofermion/PseudoFermion.h
@@ -40,8 +40,8 @@ directory
 #include <Grid/qcd/action/pseudofermion/OneFlavourRational.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
-#include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h>
+#include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h>
 
 #endif

From d7a2a4852d1280d9b80b17cd827073d1c3558274 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Wed, 6 Jan 2021 09:30:49 -0500
Subject: [PATCH 08/41] Reimplemented precisionChange to run on GPUs. A
 workspace containing the mapping table can be optionally precomputed and
 reused for improved performance.

---
 Grid/lattice/Lattice_transfer.h | 136 +++++++++++++++++++++++---------
 1 file changed, 99 insertions(+), 37 deletions(-)

diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h
index 91de721f..9af347e8 100644
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -998,54 +998,116 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
   });
 }
 
-//Convert a Lattice from one precision to another
-template<class VobjOut, class VobjIn>
-void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
-{
-  assert(out.Grid()->Nd() == in.Grid()->Nd());
-  for(int d=0;d<out.Grid()->Nd();d++){
-    assert(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
+//The workspace for a precision change operation allowing for the reuse of the mapping to save time on subsequent calls
+class precisionChangeWorkspace{
+  std::pair<Integer,Integer>* fmap_device; //device pointer
+public:
+  precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid){
+    //Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
+    assert(out_grid->Nd() == in_grid->Nd());
+    for(int d=0;d<out_grid->Nd();d++){
+      assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
+    }
+    int Nsimd_out = out_grid->Nsimd();
+
+    std::vector<Coordinate> out_icorrs(out_grid->Nsimd()); //reuse these
+    for(int lane=0; lane < out_grid->Nsimd(); lane++)
+      out_grid->iCoorFromIindex(out_icorrs[lane], lane);
+  
+    std::vector<std::pair<Integer,Integer> > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd
+    thread_for(out_oidx,out_grid->oSites(),{
+	Coordinate out_ocorr; 
+	out_grid->oCoorFromOindex(out_ocorr, out_oidx);
+      
+	Coordinate lcorr; //the local coordinate (common to both in and out as full coordinate)
+	for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
+	  out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr);
+	
+	  //int in_oidx = in_grid->oIndex(lcorr), in_lane = in_grid->iIndex(lcorr);
+	  //Note oIndex and OcorrFromOindex (and same for iIndex) are not inverse for checkerboarded lattice, the former coordinates being defined on the full lattice and the latter on the reduced lattice
+	  //Until this is fixed we need to circumvent the problem locally. Here I will use the coordinates defined on the reduced lattice for simplicity
+	  int in_oidx = 0, in_lane = 0;
+	  for(int d=0;d<in_grid->_ndimension;d++){
+	    in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] );
+	    in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] );
+	  }
+	  fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair<Integer,Integer>( in_oidx, in_lane );
+	}
+      });
+
+    //Copy the map to the device (if we had a way to tell if an accelerator is in use we could avoid this copy for CPU-only machines)
+    size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair<Integer,Integer>);
+    fmap_device = (std::pair<Integer,Integer>*)acceleratorAllocDevice(fmap_bytes);
+    acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes); 
   }
+
+  //Prevent moving or copying
+  precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete;
+  precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete;
+  precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete;
+  precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete;
+  
+  std::pair<Integer,Integer> const* getMap() const{ return fmap_device; }
+
+  ~precisionChangeWorkspace(){
+    acceleratorFreeDevice(fmap_device);
+  }
+};
+
+
+//Convert a lattice of one precision to another. The input workspace contains the mapping data.
+template<class VobjOut, class VobjIn>
+void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
   out.Checkerboard() = in.Checkerboard();
-  GridBase *in_grid=in.Grid();
-  GridBase *out_grid = out.Grid();
 
   typedef typename VobjOut::scalar_object SobjOut;
   typedef typename VobjIn::scalar_object SobjIn;
 
-  int ndim = out.Grid()->Nd();
-  int out_nsimd = out_grid->Nsimd();
-    
-  std::vector<Coordinate > out_icoor(out_nsimd);
-      
-  for(int lane=0; lane < out_nsimd; lane++){
-    out_icoor[lane].resize(ndim);
-    out_grid->iCoorFromIindex(out_icoor[lane], lane);
-  }
-        
-  std::vector<SobjOut> in_slex_conv(in_grid->lSites());
-  unvectorizeToLexOrdArray(in_slex_conv, in);
-    
-  autoView( out_v , out, CpuWrite);
-  thread_for(out_oidx,out_grid->oSites(),{
-    Coordinate out_ocoor(ndim);
-    out_grid->oCoorFromOindex(out_ocoor, out_oidx);
+  typedef typename SobjIn::scalar_type SfundIn; //"fundamental" complex/real data types
+  typedef typename SobjOut::scalar_type SfundOut;
 
-    ExtractPointerArray<SobjOut> ptrs(out_nsimd);      
+  constexpr int Nsimd_out = VobjOut::Nsimd();
+  constexpr int Nfund_in = sizeof(SobjIn)/sizeof(SfundIn);
+  constexpr int Nfund_out = sizeof(SobjOut)/sizeof(SfundOut); //these should be the same!
 
-    Coordinate lcoor(out_grid->Nd());
-      
-    for(int lane=0; lane < out_nsimd; lane++){
-      for(int mu=0;mu<ndim;mu++)
-	lcoor[mu] = out_ocoor[mu] + out_grid->_rdimensions[mu]*out_icoor[lane][mu];
+  static_assert(Nfund_in == Nfund_out, "Expect input and output object types to contain the same number of fundamental data but with different precision!");
+
+  std::pair<Integer,Integer> const* fmap_device = workspace.getMap();
+
+  //Do the copy/precision change
+  autoView( out_v , out, AcceleratorWrite);
+  autoView( in_v , in, AcceleratorRead);
+
+  accelerator_for(out_oidx, out.Grid()->oSites(), 1,{
+      std::pair<Integer,Integer> const* fmap_osite = fmap_device + out_oidx*Nsimd_out;
+      for(int out_lane=0; out_lane < Nsimd_out; out_lane++){      
+	int in_oidx = fmap_osite[out_lane].first;
+	int in_lane = fmap_osite[out_lane].second;
 	
-      int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions);
-      ptrs[lane] = &in_slex_conv[llex];
-    }
-    merge(out_v[out_oidx], ptrs, 0);
-  });
+	//Room for optimization here by combining the precision change with the read/write to avoid the intermediate scalar objects
+	SobjIn sobj_in = extractLane(in_lane, in_v[in_oidx]);
+	SobjOut sobj_out;
+	SfundIn tmp_in;
+	SfundOut tmp_out;
+	for(int i=0;i<Nfund_out;i++){ //the blessed way to do type punning!
+	  memcpy( (char*)(&tmp_in), (char*)(&sobj_in) + i*sizeof(SfundIn),  sizeof(SfundIn) );
+	  tmp_out = tmp_in; //the precision change
+	  memcpy( (char*)(&sobj_out) + i*sizeof(SfundOut), (char*)(&tmp_out), sizeof(SfundOut) );
+	}
+	insertLane(out_lane, out_v[out_oidx], sobj_out);
+      }
+    });
 }
 
+//Convert a Lattice from one precision to another
+//Generate the workspace in place; if multiple calls with the same mapping are performed, consider pregenerating the workspace and reusing
+template<class VobjOut, class VobjIn>
+void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
+  precisionChangeWorkspace workspace(out.Grid(), in.Grid());
+  precisionChange(out, in, workspace);
+}
+
+
 ////////////////////////////////////////////////////////////////////////////////
 // Communicate between grids
 ////////////////////////////////////////////////////////////////////////////////

From 80c14be65e337e225abb26494d2f6292105a5792 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Wed, 6 Jan 2021 09:34:44 -0500
Subject: [PATCH 09/41] Added core test to check precision change

---
 tests/core/Test_precision_change.cc | 114 ++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 tests/core/Test_precision_change.cc

diff --git a/tests/core/Test_precision_change.cc b/tests/core/Test_precision_change.cc
new file mode 100644
index 00000000..66819f31
--- /dev/null
+++ b/tests/core/Test_precision_change.cc
@@ -0,0 +1,114 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/core/Test_precision_change.cc
+
+    Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+
+
+int main (int argc, char ** argv){
+  Grid_init(&argc, &argv);
+  int Ls = 16;
+  std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() << " and Ls=" << Ls << std::endl;
+  GridCartesian* UGrid_d = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi());
+  GridCartesian* FGrid_d = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid_d);
+  GridRedBlackCartesian* FrbGrid_d = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid_d);
+
+  GridCartesian* UGrid_f = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
+  GridCartesian* FGrid_f = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid_f);
+  GridRedBlackCartesian* FrbGrid_f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid_f);
+
+
+  std::vector<int> seeds4({1, 2, 3, 4});
+  std::vector<int> seeds5({5, 6, 7, 8});
+  GridParallelRNG RNG5(FGrid_d);
+  RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG RNG4(UGrid_d);
+  RNG4.SeedFixedIntegers(seeds4);
+
+  //Gauge fields
+  LatticeGaugeFieldD Umu_d(UGrid_d);
+  LatticeGaugeFieldF Umu_f(UGrid_f);
+  LatticeGaugeFieldD Umu_d_r(UGrid_d);
+  LatticeGaugeFieldD Utmp_d(UGrid_d);
+
+  for(int i=0;i<5;i++){
+    random(RNG4, Umu_d);
+
+    precisionChange(Umu_f, Umu_d);
+    std::cout << GridLogMessage << "Norm of double-prec and single-prec gauge fields (should be ~equal): " << norm2(Umu_d) << " " << norm2(Umu_f) << std::endl;
+    precisionChange(Umu_d_r, Umu_f);
+    RealD normdiff = axpy_norm(Utmp_d, -1.0, Umu_d_r, Umu_d);
+    std::cout << GridLogMessage << "Norm of difference of back-converted double-prec gauge fields (should be ~0) = " << normdiff << std::endl;
+  }
+
+  //Fermion fields
+  LatticeFermionD psi_d(FGrid_d);
+  LatticeFermionF psi_f(FGrid_f);
+  LatticeFermionD psi_d_r(FGrid_d);
+  LatticeFermionD psi_tmp_d(FGrid_d);
+
+  for(int i=0;i<5;i++){
+    random(RNG5, psi_d);
+
+    precisionChange(psi_f, psi_d);
+    std::cout << GridLogMessage << "Norm of double-prec and single-prec fermion fields (should be ~equal): " << norm2(psi_d) << " " << norm2(psi_f) << std::endl;
+    precisionChange(psi_d_r, psi_f);
+    RealD normdiff = axpy_norm(psi_tmp_d, -1.0, psi_d_r, psi_d);
+    std::cout << GridLogMessage << "Norm of difference of back-converted double-prec fermion fields (should be ~0)= " << normdiff << std::endl;
+  }
+
+  //Checkerboarded fermion fields
+  LatticeFermionD psi_cb_d(FrbGrid_d);
+  LatticeFermionF psi_cb_f(FrbGrid_f);
+  LatticeFermionD psi_cb_d_r(FrbGrid_d);
+  LatticeFermionD psi_cb_tmp_d(FrbGrid_d);
+
+  for(int i=0;i<5;i++){
+    random(RNG5, psi_d);
+    pickCheckerboard(Odd, psi_cb_d, psi_d);
+     
+    precisionChange(psi_cb_f, psi_cb_d);
+    std::cout << GridLogMessage << "Norm of odd-cb double-prec and single-prec fermion fields (should be ~equal): " << norm2(psi_cb_d) << " " << norm2(psi_cb_f) << std::endl;
+    precisionChange(psi_cb_d_r, psi_cb_f);
+    RealD normdiff = axpy_norm(psi_cb_tmp_d, -1.0, psi_cb_d_r, psi_cb_d);
+    std::cout << GridLogMessage << "Norm of difference of back-converted odd-cb double-prec fermion fields (should be ~0)= " << normdiff << std::endl;
+
+
+    pickCheckerboard(Even, psi_cb_d, psi_d);
+     
+    precisionChange(psi_cb_f, psi_cb_d);
+    std::cout << GridLogMessage << "Norm of even-cb double-prec and single-prec fermion fields (should be ~equal): " << norm2(psi_cb_d) << " " << norm2(psi_cb_f) << std::endl;
+    precisionChange(psi_cb_d_r, psi_cb_f);
+    normdiff = axpy_norm(psi_cb_tmp_d, -1.0, psi_cb_d_r, psi_cb_d);
+    std::cout << GridLogMessage << "Norm of difference of back-converted even-cb double-prec fermion fields (should be ~0)= " << normdiff << std::endl;
+  }
+
+
+
+  Grid_finalize();
+}

From 287bac946f1175464599221d4456be9575212fa8 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Wed, 6 Jan 2021 09:50:41 -0500
Subject: [PATCH 10/41] ConjugateGradientMixedPrec now stores final true
 residual and uses the precisionChange workspaces for improved efficiency

---
 .../algorithms/iterative/ConjugateGradientMixedPrec.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
index 08942097..8f400625 100644
--- a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
@@ -48,6 +48,7 @@ NAMESPACE_BEGIN(Grid);
     Integer TotalInnerIterations; //Number of inner CG iterations
     Integer TotalOuterIterations; //Number of restarts
     Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
+    RealD TrueResidual;
 
     //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
     LinearFunction<FieldF> *guesser;
@@ -79,6 +80,11 @@ NAMESPACE_BEGIN(Grid);
     RealD stop = src_norm * Tolerance*Tolerance;
 
     GridBase* DoublePrecGrid = src_d_in.Grid();
+
+    //Generate precision change workspaces
+    precisionChangeWorkspace wk_dp_from_sp(DoublePrecGrid, SinglePrecGrid);
+    precisionChangeWorkspace wk_sp_from_dp(SinglePrecGrid, DoublePrecGrid);
+
     FieldD tmp_d(DoublePrecGrid);
     tmp_d.Checkerboard() = cb;
     
@@ -119,7 +125,7 @@ NAMESPACE_BEGIN(Grid);
       while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
 
       PrecChangeTimer.Start();
-      precisionChange(src_f, src_d);
+      precisionChange(src_f, src_d, wk_sp_from_dp);
       PrecChangeTimer.Stop();
       
       sol_f = Zero();
@@ -137,7 +143,7 @@ NAMESPACE_BEGIN(Grid);
       
       //Convert sol back to double and add to double prec solution
       PrecChangeTimer.Start();
-      precisionChange(tmp_d, sol_f);
+      precisionChange(tmp_d, sol_f, wk_dp_from_sp);
       PrecChangeTimer.Stop();
       
       axpy(sol_d, 1.0, tmp_d, sol_d);
@@ -149,6 +155,7 @@ NAMESPACE_BEGIN(Grid);
     ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
     CG_d(Linop_d, src_d_in, sol_d);
     TotalFinalStepIterations = CG_d.IterationsToComplete;
+    TrueResidual = CG_d.TrueResidual;
 
     TotalTimer.Stop();
     std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;

From 1fb41a4300346bfb4e4f315999a6bf9259e720d7 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Wed, 6 Jan 2021 11:50:56 -0500
Subject: [PATCH 11/41] Added copyLane function to Tensor_extract_merge.h which
 copies one lane of data from an input tensor object to a different lane of an
 output tensor object of potentially different precision precisionChange
 lattice function now uses copyLane to remove need for temporary scalar
 objects, reducing register footprint and significantly improving performance

---
 Grid/lattice/Lattice_transfer.h     | 26 +++---------------
 Grid/tensors/Tensor_extract_merge.h | 41 +++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 23 deletions(-)

diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h
index 9af347e8..0a5bd458 100644
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -1058,19 +1058,10 @@ public:
 //Convert a lattice of one precision to another. The input workspace contains the mapping data.
 template<class VobjOut, class VobjIn>
 void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
+  static_assert( std::is_same<typename VobjOut::DoublePrecision, typename VobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
+
   out.Checkerboard() = in.Checkerboard();
-
-  typedef typename VobjOut::scalar_object SobjOut;
-  typedef typename VobjIn::scalar_object SobjIn;
-
-  typedef typename SobjIn::scalar_type SfundIn; //"fundamental" complex/real data types
-  typedef typename SobjOut::scalar_type SfundOut;
-
   constexpr int Nsimd_out = VobjOut::Nsimd();
-  constexpr int Nfund_in = sizeof(SobjIn)/sizeof(SfundIn);
-  constexpr int Nfund_out = sizeof(SobjOut)/sizeof(SfundOut); //these should be the same!
-
-  static_assert(Nfund_in == Nfund_out, "Expect input and output object types to contain the same number of fundamental data but with different precision!");
 
   std::pair<Integer,Integer> const* fmap_device = workspace.getMap();
 
@@ -1083,18 +1074,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const pre
       for(int out_lane=0; out_lane < Nsimd_out; out_lane++){      
 	int in_oidx = fmap_osite[out_lane].first;
 	int in_lane = fmap_osite[out_lane].second;
-	
-	//Room for optimization here by combining the precision change with the read/write to avoid the intermediate scalar objects
-	SobjIn sobj_in = extractLane(in_lane, in_v[in_oidx]);
-	SobjOut sobj_out;
-	SfundIn tmp_in;
-	SfundOut tmp_out;
-	for(int i=0;i<Nfund_out;i++){ //the blessed way to do type punning!
-	  memcpy( (char*)(&tmp_in), (char*)(&sobj_in) + i*sizeof(SfundIn),  sizeof(SfundIn) );
-	  tmp_out = tmp_in; //the precision change
-	  memcpy( (char*)(&sobj_out) + i*sizeof(SfundOut), (char*)(&tmp_out), sizeof(SfundOut) );
-	}
-	insertLane(out_lane, out_v[out_oidx], sobj_out);
+	copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane);
       }
     });
 }
diff --git a/Grid/tensors/Tensor_extract_merge.h b/Grid/tensors/Tensor_extract_merge.h
index f1ded209..a563eac7 100644
--- a/Grid/tensors/Tensor_extract_merge.h
+++ b/Grid/tensors/Tensor_extract_merge.h
@@ -208,5 +208,46 @@ void merge(vobj &vec,ExtractPointerArray<sobj> &extracted, int offset)
 }
 
 
+
+//////////////////////////////////////////////////////////////////////////////////
+//Copy a single lane of a SIMD tensor type from one object to another
+//Output object must be of the same tensor type but may be of a different precision (i.e. it can have a different root data type)
+///////////////////////////////////////////////////////////////////////////////////
+template<class vobjOut, class vobjIn>
+accelerator_inline 
+void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __restrict__ vecIn, int lane_in)
+{
+  static_assert( std::is_same<typename vobjOut::DoublePrecision, typename vobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
+
+  typedef typename vobjOut::vector_type ovector_type;  
+  typedef typename vobjIn::vector_type ivector_type;  
+  constexpr int owords=sizeof(vobjOut)/sizeof(ovector_type);
+  constexpr int iwords=sizeof(vobjIn)/sizeof(ivector_type);
+  static_assert( owords == iwords, "copyLane: Expected number of vector words in input and output objects to be equal" );
+
+  typedef typename vobjOut::scalar_type oscalar_type;  
+  typedef typename vobjIn::scalar_type iscalar_type;  
+  typedef typename ExtractTypeMap<oscalar_type>::extract_type oextract_type;
+  typedef typename ExtractTypeMap<iscalar_type>::extract_type iextract_type;
+
+  typedef oextract_type * opointer;
+  typedef iextract_type * ipointer;
+
+  constexpr int oNsimd=ovector_type::Nsimd();
+  constexpr int iNsimd=ivector_type::Nsimd();
+
+  iscalar_type itmp;
+  oscalar_type otmp;
+
+  opointer __restrict__  op = (opointer)&vecOut;
+  ipointer __restrict__  ip = (ipointer)&vecIn;
+  for(int w=0;w<owords;w++){
+    memcpy( (char*)&itmp, (char*)(ip + lane_in + iNsimd*w), sizeof(iscalar_type) );
+    otmp = itmp; //potential precision change
+    memcpy( (char*)(op + lane_out + oNsimd*w), (char*)&otmp, sizeof(oscalar_type) );
+  }
+}
+
+
 NAMESPACE_END(Grid);
 

From 1b84f592738c32fe58571c349b50168e4c77cabe Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Wed, 6 Jan 2021 12:21:30 -0500
Subject: [PATCH 12/41] Added a mixed precision multishift algorithm for which
 the matrix multiplies are performed in single precision but the search
 directions are accumulated in double precision. A reliable update step is
 performed at a tunable frequency to correct the residual. A final mixed-prec
 single-shift solve is performed on each pole to perform cleanup if necessary.
 A test is provided to demonstrate the algorithm.

---
 Grid/algorithms/Algorithms.h                  |   1 +
 .../ConjugateGradientMultiShiftMixedPrec.h    | 410 ++++++++++++++++++
 tests/solver/Test_dwf_multishift_mixedprec.cc |  93 ++++
 3 files changed, 504 insertions(+)
 create mode 100644 Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
 create mode 100644 tests/solver/Test_dwf_multishift_mixedprec.cc

diff --git a/Grid/algorithms/Algorithms.h b/Grid/algorithms/Algorithms.h
index 7f27784b..47a0a92b 100644
--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@@ -54,6 +54,7 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
+#include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
 #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
diff --git a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
new file mode 100644
index 00000000..03905c63
--- /dev/null
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
@@ -0,0 +1,410 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Christopher Kelly <ckelly@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
+#define GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
+
+NAMESPACE_BEGIN(Grid);
+
+//CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision. 
+//The residual is stored in single precision, but the search directions and solution are stored in double precision. 
+//Every update_freq iterations the residual is corrected in double precision. 
+    
+//For safety the a final regular CG is applied to clean up if necessary
+
+//Linop to add shift to input linop, used in cleanup CG
+namespace ConjugateGradientMultiShiftMixedPrecSupport{
+template<typename Field>
+class ShiftedLinop: public LinearOperatorBase<Field>{
+public:
+  LinearOperatorBase<Field> &linop_base;
+  RealD shift;
+
+  ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift): linop_base(_linop_base), shift(_shift){}
+
+  void OpDiag (const Field &in, Field &out){ assert(0); }
+  void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); }
+  
+  void Op     (const Field &in, Field &out){ assert(0); }
+  void AdjOp  (const Field &in, Field &out){ assert(0); }
+
+  void HermOp(const Field &in, Field &out){
+    linop_base.HermOp(in, out);
+    axpy(out, shift, in, out);
+  }    
+
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+    HermOp(in,out);
+    ComplexD dot = innerProduct(in,out);
+    n1=real(dot);
+    n2=norm2(out);
+  }
+};
+};
+
+
+template<class FieldD, class FieldF,
+	 typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
+	 typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
+class ConjugateGradientMultiShiftMixedPrec : public OperatorMultiFunction<FieldD>,
+					     public OperatorFunction<FieldD>
+{
+public:                                                
+
+  using OperatorFunction<FieldD>::operator();
+
+  RealD   Tolerance;
+  Integer MaxIterations;
+  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
+  std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
+  int verbose;
+  MultiShiftFunction shifts;
+  std::vector<RealD> TrueResidualShift;
+
+  int ReliableUpdateFreq; //number of iterations between reliable updates
+
+  GridBase* SinglePrecGrid; //Grid for single-precision fields
+  LinearOperatorBase<FieldF> &Linop_f; //single precision
+
+  ConjugateGradientMultiShiftMixedPrec(Integer maxit, MultiShiftFunction &_shifts,
+				       GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
+				       int _ReliableUpdateFreq
+				       ) : 
+    MaxIterations(maxit),  shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq)
+  { 
+    verbose=1;
+    IterationsToCompleteShift.resize(_shifts.order);
+    TrueResidualShift.resize(_shifts.order);
+  }
+
+  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
+  {
+    GridBase *grid = src.Grid();
+    int nshift = shifts.order;
+    std::vector<FieldD> results(nshift,grid);
+    (*this)(Linop,src,results,psi);
+  }
+  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
+  {
+    int nshift = shifts.order;
+
+    (*this)(Linop,src,results);
+  
+    psi = shifts.norm*src;
+    for(int i=0;i<nshift;i++){
+      psi = psi + shifts.residues[i]*results[i];
+    }
+
+    return;
+  }
+
+  void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
+  { 
+    GridBase *DoublePrecGrid = src_d.Grid();
+    precisionChangeWorkspace wk_f_from_d(SinglePrecGrid, DoublePrecGrid);
+    precisionChangeWorkspace wk_d_from_f(DoublePrecGrid, SinglePrecGrid);
+
+    ////////////////////////////////////////////////////////////////////////
+    // Convenience references to the info stored in "MultiShiftFunction"
+    ////////////////////////////////////////////////////////////////////////
+    int nshift = shifts.order;
+
+    std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
+    std::vector<RealD> &mresidual(shifts.tolerances);
+    std::vector<RealD> alpha(nshift,1.0);
+
+    //Double precision search directions
+    FieldD p_d(DoublePrecGrid);
+    std::vector<FieldD> ps_d(nshift, DoublePrecGrid);// Search directions (double precision)
+
+    FieldD tmp_d(DoublePrecGrid);
+    FieldD r_d(DoublePrecGrid);
+    FieldD mmp_d(DoublePrecGrid);
+
+    assert(psi_d.size()==nshift);
+    assert(mass.size()==nshift);
+    assert(mresidual.size()==nshift);
+  
+    // dynamic sized arrays on stack; 2d is a pain with vector
+    RealD  bs[nshift];
+    RealD  rsq[nshift];
+    RealD  z[nshift][2];
+    int     converged[nshift];
+  
+    const int       primary =0;
+  
+    //Primary shift fields CG iteration
+    RealD a,b,c,d;
+    RealD cp,bp,qq; //prev
+  
+    // Matrix mult fields
+    FieldF r_f(SinglePrecGrid);
+    FieldF p_f(SinglePrecGrid);
+    FieldF tmp_f(SinglePrecGrid);
+    FieldF mmp_f(SinglePrecGrid);
+    FieldF src_f(SinglePrecGrid);
+    precisionChange(src_f, src_d, wk_f_from_d);
+
+    // Check lightest mass
+    for(int s=0;s<nshift;s++){
+      assert( mass[s]>= mass[primary] );
+      converged[s]=0;
+    }
+  
+    // Wire guess to zero
+    // Residuals "r" are src
+    // First search direction "p" is also src
+    cp = norm2(src_d);
+
+    // Handle trivial case of zero src.
+    if( cp == 0. ){
+      for(int s=0;s<nshift;s++){
+	psi_d[s] = Zero();
+	IterationsToCompleteShift[s] = 1;
+	TrueResidualShift[s] = 0.;
+      }
+      return;
+    }
+
+    for(int s=0;s<nshift;s++){
+      rsq[s] = cp * mresidual[s] * mresidual[s];
+      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
+      ps_d[s] = src_d;
+    }
+    // r and p for primary
+    r_f=src_f; //residual maintained in single
+    p_f=src_f;
+    p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
+  
+    //MdagM+m[0]
+    Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
+    axpy(mmp_f,mass[0],p_f,mmp_f);
+    RealD rn = norm2(p_f);
+    d += rn*mass[0];
+
+    b = -cp /d;
+  
+    // Set up the various shift variables
+    int       iz=0;
+    z[0][1-iz] = 1.0;
+    z[0][iz]   = 1.0;
+    bs[0]      = b;
+    for(int s=1;s<nshift;s++){
+      z[s][1-iz] = 1.0;
+      z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0]));
+      bs[s]      = b*z[s][iz]; 
+    }
+  
+    // r += b[0] A.p[0]
+    // c= norm(r)
+    c=axpy_norm(r_f,b,mmp_f,r_f);
+  
+    for(int s=0;s<nshift;s++) {
+      axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
+    }
+  
+    ///////////////////////////////////////
+    // Timers
+    ///////////////////////////////////////
+    GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
+
+    SolverTimer.Start();
+  
+    // Iteration loop
+    int k;
+  
+    for (k=1;k<=MaxIterations;k++){    
+      a = c /cp;
+
+      //Update double precision search direction by residual
+      PrecChangeTimer.Start();
+      precisionChange(r_d, r_f, wk_d_from_f);
+      PrecChangeTimer.Stop();
+
+      AXPYTimer.Start();
+      axpy(p_d,a,p_d,r_d); 
+
+      for(int s=0;s<nshift;s++){
+	if ( ! converged[s] ) { 
+	  if (s==0){
+	    axpy(ps_d[s],a,ps_d[s],r_d);
+	  } else{
+	    RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
+	    axpby(ps_d[s],z[s][iz],as,r_d,ps_d[s]);
+	  }
+	}
+      }
+      AXPYTimer.Stop();
+
+      PrecChangeTimer.Start();
+      precisionChange(p_f, p_d, wk_f_from_d); //get back single prec search direction for linop
+      PrecChangeTimer.Stop();
+
+      cp=c;
+      MatrixTimer.Start();  
+      Linop_f.HermOp(p_f,mmp_f); 
+      d=real(innerProduct(p_f,mmp_f));    
+      MatrixTimer.Stop();  
+
+      AXPYTimer.Start();
+      axpy(mmp_f,mass[0],p_f,mmp_f);
+      AXPYTimer.Stop();
+      RealD rn = norm2(p_f);
+      d += rn*mass[0];
+    
+      bp=b;
+      b=-cp/d;
+    
+      // Toggle the recurrence history
+      bs[0] = b;
+      iz = 1-iz;
+      ShiftTimer.Start();
+      for(int s=1;s<nshift;s++){
+	if((!converged[s])){
+	  RealD z0 = z[s][1-iz];
+	  RealD z1 = z[s][iz];
+	  z[s][iz] = z0*z1*bp
+	    / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b)); 
+	  bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
+	}
+      }
+      ShiftTimer.Stop();
+
+      //Update double precision solutions
+      AXPYTimer.Start();
+      for(int s=0;s<nshift;s++){
+	int ss = s;
+	if( (!converged[s]) ) { 
+	  axpy(psi_d[ss],-bs[s]*alpha[s],ps_d[s],psi_d[ss]);
+	}
+      }
+
+      //Perform reliable update if necessary; otherwise update residual from single-prec mmp
+      RealD c_f = axpy_norm(r_f,b,mmp_f,r_f);
+      AXPYTimer.Stop();
+
+      c = c_f;
+
+      if(k % ReliableUpdateFreq == 0){
+	//Replace r with true residual
+	MatrixTimer.Start();  
+	Linop_d.HermOp(psi_d[0],mmp_d); 
+	MatrixTimer.Stop();  
+
+	AXPYTimer.Start();
+	axpy(mmp_d,mass[0],psi_d[0],mmp_d);
+
+	RealD c_d = axpy_norm(r_d, -1.0, mmp_d, src_d);
+	AXPYTimer.Stop();
+
+	std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_f <<" with |r|^2 = "<<c_d<<std::endl;
+	
+	PrecChangeTimer.Start();
+	precisionChange(r_f, r_d, wk_f_from_d);
+	PrecChangeTimer.Stop();
+	c = c_d;
+      }
+    
+      // Convergence checks
+      int all_converged = 1;
+      for(int s=0;s<nshift;s++){
+      
+	if ( (!converged[s]) ){
+	  IterationsToCompleteShift[s] = k;
+	
+	  RealD css  = c * z[s][iz]* z[s][iz];
+	
+	  if(css<rsq[s]){
+	    if ( ! converged[s] )
+	      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
+	    converged[s]=1;
+	  } else {
+	    all_converged=0;
+	  }
+
+	}
+      }
+
+      if ( all_converged ){
+
+	SolverTimer.Stop();
+	std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
+	std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
+      
+	// Check answers 
+	for(int s=0; s < nshift; s++) { 
+	  Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
+	  axpy(tmp_d,mass[s],psi_d[s],mmp_d);
+	  axpy(r_d,-alpha[s],src_d,tmp_d);
+	  RealD rn = norm2(r_d);
+	  RealD cn = norm2(src_d);
+	  TrueResidualShift[s] = std::sqrt(rn/cn);
+	  std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
+
+	  //If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
+	  if(rn >= rsq[s]){
+	    CleanupTimer.Start();
+	    std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: performing cleanup step for shift " << s << std::endl;
+
+	    //Setup linear operators for final cleanup
+	    ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
+	    ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
+					       
+	    MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d); 
+	    cg(src_d, psi_d[s]);
+	    
+	    TrueResidualShift[s] = cg.TrueResidual;
+	    CleanupTimer.Stop();
+	  }
+	}
+
+	std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrec: Time Breakdown for body"<<std::endl;
+	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\tMatrix    " << MatrixTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\tPrecision Change " << PrecChangeTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed()     <<std::endl;
+
+	IterationsToComplete = k;	
+
+	return;
+      }
+
+   
+    }
+    // ugly hack
+    std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
+    //  assert(0);
+  }
+
+};
+NAMESPACE_END(Grid);
+#endif
diff --git a/tests/solver/Test_dwf_multishift_mixedprec.cc b/tests/solver/Test_dwf_multishift_mixedprec.cc
new file mode 100644
index 00000000..de9fc979
--- /dev/null
+++ b/tests/solver/Test_dwf_multishift_mixedprec.cc
@@ -0,0 +1,93 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_dwf_multishift_mixedprec.cc
+
+    Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc, &argv);
+
+  const int Ls = 16;
+  GridCartesian* UGrid_d = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi());
+  GridRedBlackCartesian* UrbGrid_d = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_d);
+  GridCartesian* FGrid_d = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid_d);
+  GridRedBlackCartesian* FrbGrid_d = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid_d);
+
+  GridCartesian* UGrid_f = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
+  GridRedBlackCartesian* UrbGrid_f = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_f);
+  GridCartesian* FGrid_f = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid_f);
+  GridRedBlackCartesian* FrbGrid_f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid_f);
+
+  std::vector<int> seeds4({1, 2, 3, 4});
+  std::vector<int> seeds5({5, 6, 7, 8});
+  GridParallelRNG RNG5(FGrid_d);
+  RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG RNG4(UGrid_d);
+  RNG4.SeedFixedIntegers(seeds4);
+
+  LatticeFermionD src_d(FGrid_d);
+  random(RNG5, src_d);
+
+  LatticeGaugeFieldD Umu_d(UGrid_d);
+  SU<Nc>::HotConfiguration(RNG4, Umu_d);
+
+  LatticeGaugeFieldF Umu_f(UGrid_f);
+  precisionChange(Umu_f, Umu_d);
+
+  std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() << "   Ls: " << Ls << std::endl;
+
+  RealD mass = 0.01;
+  RealD M5 = 1.8;
+  DomainWallFermionD Ddwf_d(Umu_d, *FGrid_d, *FrbGrid_d, *UGrid_d, *UrbGrid_d, mass, M5);
+  DomainWallFermionF Ddwf_f(Umu_f, *FGrid_f, *FrbGrid_f, *UGrid_f, *UrbGrid_f, mass, M5);
+
+  LatticeFermionD src_o_d(FrbGrid_d);
+  pickCheckerboard(Odd, src_o_d, src_d);
+
+  SchurDiagMooeeOperator<DomainWallFermionD, LatticeFermionD> HermOpEO_d(Ddwf_d);
+  SchurDiagMooeeOperator<DomainWallFermionF, LatticeFermionF> HermOpEO_f(Ddwf_f);
+
+  AlgRemez remez(1e-4, 64, 50);
+  int order = 15;
+  remez.generateApprox(order, 1, 2); //sqrt
+
+  MultiShiftFunction shifts(remez, 1e-10, false);
+
+  int relup_freq = 50;
+  double t1=usecond();
+  ConjugateGradientMultiShiftMixedPrec<LatticeFermionD,LatticeFermionF> mcg(10000, shifts, FrbGrid_f, HermOpEO_f, relup_freq);
+
+  std::vector<LatticeFermionD> results_o_d(order, FrbGrid_d);
+  mcg(HermOpEO_d, src_o_d, results_o_d);
+  double t2=usecond();
+
+  std::cout<<GridLogMessage << "Test: Total usec    =   "<< (t2-t1)<<std::endl;
+
+  Grid_finalize();
+}

From 901ee77b84d0b31e7ef80a892b252fc80ee8c3bc Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Wed, 20 Jan 2021 11:45:44 -0500
Subject: [PATCH 13/41] Mixed precision multishift test can now be performed
 with/without G-parity using cmdline check and can load a pregenerated
 configuration

---
 tests/solver/Test_dwf_multishift_mixedprec.cc | 74 +++++++++++++++----
 1 file changed, 61 insertions(+), 13 deletions(-)

diff --git a/tests/solver/Test_dwf_multishift_mixedprec.cc b/tests/solver/Test_dwf_multishift_mixedprec.cc
index de9fc979..ea05b8c7 100644
--- a/tests/solver/Test_dwf_multishift_mixedprec.cc
+++ b/tests/solver/Test_dwf_multishift_mixedprec.cc
@@ -29,10 +29,8 @@ Author: Christopher Kelly <ckelly@bnl.gov>
 
 using namespace Grid;
 
-int main (int argc, char ** argv)
-{
-  Grid_init(&argc, &argv);
-
+template<typename SpeciesD, typename SpeciesF>
+void run_test(int argc, char ** argv, const typename SpeciesD::ImplParams &params){
   const int Ls = 16;
   GridCartesian* UGrid_d = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi());
   GridRedBlackCartesian* UrbGrid_d = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_d);
@@ -44,6 +42,9 @@ int main (int argc, char ** argv)
   GridCartesian* FGrid_f = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid_f);
   GridRedBlackCartesian* FrbGrid_f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid_f);
 
+  typedef typename SpeciesD::FermionField FermionFieldD;
+  typedef typename SpeciesF::FermionField FermionFieldF;
+  
   std::vector<int> seeds4({1, 2, 3, 4});
   std::vector<int> seeds5({5, 6, 7, 8});
   GridParallelRNG RNG5(FGrid_d);
@@ -51,11 +52,26 @@ int main (int argc, char ** argv)
   GridParallelRNG RNG4(UGrid_d);
   RNG4.SeedFixedIntegers(seeds4);
 
-  LatticeFermionD src_d(FGrid_d);
+  FermionFieldD src_d(FGrid_d);
   random(RNG5, src_d);
 
   LatticeGaugeFieldD Umu_d(UGrid_d);
-  SU<Nc>::HotConfiguration(RNG4, Umu_d);
+
+  bool cfg_loaded=false;
+  for(int i=1;i<argc;i++){
+    if(std::string(argv[i]) == "--load_config"){
+      assert(i != argc-1);
+      std::string file = argv[i+1];
+      NerscIO io;
+      FieldMetaData metadata;
+      io.readConfiguration(Umu_d, metadata, file);
+      cfg_loaded=true;
+      break;
+    }
+  }
+
+  if(!cfg_loaded)
+    SU<Nc>::HotConfiguration(RNG4, Umu_d);
 
   LatticeGaugeFieldF Umu_f(UGrid_f);
   precisionChange(Umu_f, Umu_d);
@@ -64,14 +80,14 @@ int main (int argc, char ** argv)
 
   RealD mass = 0.01;
   RealD M5 = 1.8;
-  DomainWallFermionD Ddwf_d(Umu_d, *FGrid_d, *FrbGrid_d, *UGrid_d, *UrbGrid_d, mass, M5);
-  DomainWallFermionF Ddwf_f(Umu_f, *FGrid_f, *FrbGrid_f, *UGrid_f, *UrbGrid_f, mass, M5);
+  SpeciesD Ddwf_d(Umu_d, *FGrid_d, *FrbGrid_d, *UGrid_d, *UrbGrid_d, mass, M5, params);
+  SpeciesF Ddwf_f(Umu_f, *FGrid_f, *FrbGrid_f, *UGrid_f, *UrbGrid_f, mass, M5, params);
 
-  LatticeFermionD src_o_d(FrbGrid_d);
+  FermionFieldD src_o_d(FrbGrid_d);
   pickCheckerboard(Odd, src_o_d, src_d);
 
-  SchurDiagMooeeOperator<DomainWallFermionD, LatticeFermionD> HermOpEO_d(Ddwf_d);
-  SchurDiagMooeeOperator<DomainWallFermionF, LatticeFermionF> HermOpEO_f(Ddwf_f);
+  SchurDiagMooeeOperator<SpeciesD, FermionFieldD> HermOpEO_d(Ddwf_d);
+  SchurDiagMooeeOperator<SpeciesF, FermionFieldF> HermOpEO_f(Ddwf_f);
 
   AlgRemez remez(1e-4, 64, 50);
   int order = 15;
@@ -81,13 +97,45 @@ int main (int argc, char ** argv)
 
   int relup_freq = 50;
   double t1=usecond();
-  ConjugateGradientMultiShiftMixedPrec<LatticeFermionD,LatticeFermionF> mcg(10000, shifts, FrbGrid_f, HermOpEO_f, relup_freq);
+  ConjugateGradientMultiShiftMixedPrec<FermionFieldD,FermionFieldF> mcg(10000, shifts, FrbGrid_f, HermOpEO_f, relup_freq);
 
-  std::vector<LatticeFermionD> results_o_d(order, FrbGrid_d);
+  std::vector<FermionFieldD> results_o_d(order, FrbGrid_d);
   mcg(HermOpEO_d, src_o_d, results_o_d);
   double t2=usecond();
 
   std::cout<<GridLogMessage << "Test: Total usec    =   "<< (t2-t1)<<std::endl;
+}
+
+
+
+
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc, &argv);
+
+  bool gparity = false;
+  int gpdir;
+
+  for(int i=1;i<argc;i++){
+    std::string arg(argv[i]);
+    if(arg == "--Gparity"){
+      assert(i!=argc-1);
+      gpdir = std::stoi(argv[i+1]);
+      assert(gpdir >= 0 && gpdir <= 2); //spatial!
+      gparity = true;
+    }
+  }
+  if(gparity){
+    std::cout << "Running test with G-parity BCs in " << gpdir << " direction" << std::endl;
+    GparityWilsonImplParams params;
+    params.twists[gpdir] = 1;
+    run_test<GparityDomainWallFermionD, GparityDomainWallFermionF>(argc,argv,params);
+  }else{
+    std::cout << "Running test with periodic BCs" << std::endl;
+    WilsonImplParams params;
+    run_test<DomainWallFermionD, DomainWallFermionF>(argc,argv,params);
+  }
 
   Grid_finalize();
 }

From 7a06826cf145b15c0e02f236e063ce0157343289 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Wed, 20 Jan 2021 13:31:50 -0500
Subject: [PATCH 14/41] Added option to NerscIO to disable exit on failing
 plaquette check allowing for circumvention of factor of 2 error in
 CPS-generated G-parity config headers Adapted mixed-prec multi-shift test to
 new way to pass gauge BC directions and added cmdline option to perform the
 G-parity plaquette comparison with the corrected plaquette when loading
 config

---
 Grid/parallelIO/NerscIO.h                     |  6 ++-
 tests/solver/Test_dwf_multishift_mixedprec.cc | 37 +++++++++++++++++--
 2 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/Grid/parallelIO/NerscIO.h b/Grid/parallelIO/NerscIO.h
index 3ebdf0cc..ed86303a 100644
--- a/Grid/parallelIO/NerscIO.h
+++ b/Grid/parallelIO/NerscIO.h
@@ -39,9 +39,11 @@ using namespace Grid;
 ////////////////////////////////////////////////////////////////////////////////
 class NerscIO : public BinaryIO { 
 public:
-
   typedef Lattice<vLorentzColourMatrixD> GaugeField;
 
+  // Enable/disable exiting if the plaquette in the header does not match the value computed (default true)
+  static bool & exitOnReadPlaquetteMismatch(){ static bool v=true; return v; }
+
   static inline void truncate(std::string file){
     std::ofstream fout(file,std::ios::out);
   }
@@ -198,7 +200,7 @@ public:
       std::cerr << " nersc_csum  " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
       exit(0);
     }
-    assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
+    if(exitOnReadPlaquetteMismatch()) assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
     assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
     assert(nersc_csum == header.checksum );
       
diff --git a/tests/solver/Test_dwf_multishift_mixedprec.cc b/tests/solver/Test_dwf_multishift_mixedprec.cc
index ea05b8c7..914b86d0 100644
--- a/tests/solver/Test_dwf_multishift_mixedprec.cc
+++ b/tests/solver/Test_dwf_multishift_mixedprec.cc
@@ -29,7 +29,7 @@ Author: Christopher Kelly <ckelly@bnl.gov>
 
 using namespace Grid;
 
-template<typename SpeciesD, typename SpeciesF>
+template<typename SpeciesD, typename SpeciesF, typename GaugeStatisticsType>
 void run_test(int argc, char ** argv, const typename SpeciesD::ImplParams &params){
   const int Ls = 16;
   GridCartesian* UGrid_d = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi());
@@ -57,6 +57,15 @@ void run_test(int argc, char ** argv, const typename SpeciesD::ImplParams &param
 
   LatticeGaugeFieldD Umu_d(UGrid_d);
 
+  //CPS-created G-parity ensembles have a factor of 2 error in the plaquette that causes the read to fail unless we workaround it
+  bool gparity_plaquette_fix = false;
+  for(int i=1;i<argc;i++){
+    if(std::string(argv[i]) == "--gparity_plaquette_fix"){
+      gparity_plaquette_fix=true;
+      break;
+    }
+  }
+
   bool cfg_loaded=false;
   for(int i=1;i<argc;i++){
     if(std::string(argv[i]) == "--load_config"){
@@ -64,7 +73,22 @@ void run_test(int argc, char ** argv, const typename SpeciesD::ImplParams &param
       std::string file = argv[i+1];
       NerscIO io;
       FieldMetaData metadata;
-      io.readConfiguration(Umu_d, metadata, file);
+
+      if(gparity_plaquette_fix) NerscIO::exitOnReadPlaquetteMismatch() = false;
+
+      io.readConfiguration<GaugeStatisticsType>(Umu_d, metadata, file);
+
+      if(gparity_plaquette_fix){
+	metadata.plaquette *= 2.; //correct header value
+
+	//Get the true plaquette
+	FieldMetaData tmp;
+	GaugeStatisticsType gs; gs(Umu_d, tmp);
+	
+	std::cout << "After correction: plaqs " << tmp.plaquette << " " << metadata.plaquette << std::endl;
+	assert(fabs(tmp.plaquette -metadata.plaquette ) < 1.0e-5 );
+      }
+
       cfg_loaded=true;
       break;
     }
@@ -130,11 +154,16 @@ int main (int argc, char ** argv)
     std::cout << "Running test with G-parity BCs in " << gpdir << " direction" << std::endl;
     GparityWilsonImplParams params;
     params.twists[gpdir] = 1;
-    run_test<GparityDomainWallFermionD, GparityDomainWallFermionF>(argc,argv,params);
+    
+    std::vector<int> conj_dirs(Nd,0);
+    conj_dirs[gpdir] = 1;
+    ConjugateGimplD::setDirections(conj_dirs);
+
+    run_test<GparityDomainWallFermionD, GparityDomainWallFermionF, ConjugateGaugeStatistics>(argc,argv,params);
   }else{
     std::cout << "Running test with periodic BCs" << std::endl;
     WilsonImplParams params;
-    run_test<DomainWallFermionD, DomainWallFermionF>(argc,argv,params);
+    run_test<DomainWallFermionD, DomainWallFermionF, PeriodicGaugeStatistics>(argc,argv,params);
   }
 
   Grid_finalize();

From d161c2dc3554e97e24687bb19c4d41b3dfea7166 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Wed, 20 Jan 2021 15:42:06 -0500
Subject: [PATCH 15/41] Improved formating of timing output in mixed-prec
 multishift In test of mixed-prec multishift, added comparison against full
 double precision multishift both for timing and to cross-check the results

---
 .../ConjugateGradientMultiShiftMixedPrec.h       | 11 ++++++-----
 tests/solver/Test_dwf_multishift_mixedprec.cc    | 16 +++++++++++++++-
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
index 03905c63..5ea6d075 100644
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
@@ -386,12 +386,13 @@ public:
 	}
 
 	std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrec: Time Breakdown for body"<<std::endl;
-	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
-	std::cout << GridLogMessage << "\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
-	std::cout << GridLogMessage << "\tMatrix    " << MatrixTimer.Elapsed()     <<std::endl;
-	std::cout << GridLogMessage << "\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
-	std::cout << GridLogMessage << "\tPrecision Change " << PrecChangeTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\tSolver    " << SolverTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\t\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\t\tMatrix    " << MatrixTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\t\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
 
 	IterationsToComplete = k;	
 
diff --git a/tests/solver/Test_dwf_multishift_mixedprec.cc b/tests/solver/Test_dwf_multishift_mixedprec.cc
index 914b86d0..bdede459 100644
--- a/tests/solver/Test_dwf_multishift_mixedprec.cc
+++ b/tests/solver/Test_dwf_multishift_mixedprec.cc
@@ -127,7 +127,21 @@ void run_test(int argc, char ** argv, const typename SpeciesD::ImplParams &param
   mcg(HermOpEO_d, src_o_d, results_o_d);
   double t2=usecond();
 
-  std::cout<<GridLogMessage << "Test: Total usec    =   "<< (t2-t1)<<std::endl;
+  //Crosscheck double and mixed prec results
+  ConjugateGradientMultiShift<FermionFieldD> dmcg(10000, shifts);
+  std::vector<FermionFieldD> results_o_d_2(order, FrbGrid_d);
+  dmcg(HermOpEO_d, src_o_d, results_o_d_2);
+  double t3=usecond();
+
+  std::cout << GridLogMessage << "Comparison of mixed prec results to double prec results |mixed - double|^2 :" << std::endl;
+  FermionFieldD tmp(FrbGrid_d);
+  for(int i=0;i<order;i++){
+    RealD ndiff = axpy_norm(tmp, -1., results_o_d[i], results_o_d_2[i]);
+    std::cout << i << " " << ndiff << std::endl;
+  }
+
+  std::cout<<GridLogMessage << "Mixed precision algorithm: Total usec    =   "<< (t2-t1)<<std::endl;
+  std::cout<<GridLogMessage << "Double precision algorithm: Total usec    =   "<< (t3-t2)<<std::endl;
 }
 
 

From 6795bbca31768ca7db709d65990ea0e098ef4c0e Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Mon, 25 Jan 2021 14:22:31 -0500
Subject: [PATCH 16/41] Generalized
 GeneralEvenOddRatioRationalPseudoFermionAction such that the multi-shift CG
 algorithm can be overridden by derived classes Added a mixed-precision
 variant of GeneralEvenOddRatioRationalPseudoFermionAction and a verification
 test against double prec class Fixed non-const reference used in passing RHMC
 approx to multishift classes

---
 .../iterative/ConjugateGradientMultiShift.h   |   2 +-
 .../ConjugateGradientMultiShiftMixedPrec.h    |   2 +-
 .../GeneralEvenOddRationalRatio.h             |  69 +++++-----
 .../GeneralEvenOddRationalRatioMixedPrec.h    |  93 ++++++++++++++
 Grid/qcd/action/pseudofermion/PseudoFermion.h |   1 +
 ...st_rhmc_EOWilsonRatio_doubleVsMixedPrec.cc | 119 ++++++++++++++++++
 6 files changed, 255 insertions(+), 31 deletions(-)
 create mode 100644 Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
 create mode 100644 tests/hmc/Test_rhmc_EOWilsonRatio_doubleVsMixedPrec.cc

diff --git a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
index 63a6c55f..252e4506 100644
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
@@ -52,7 +52,7 @@ public:
   MultiShiftFunction shifts;
   std::vector<RealD> TrueResidualShift;
 
-  ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : 
+  ConjugateGradientMultiShift(Integer maxit, const MultiShiftFunction &_shifts) : 
     MaxIterations(maxit),
     shifts(_shifts)
   { 
diff --git a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
index 5ea6d075..e0cad13b 100644
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
@@ -93,7 +93,7 @@ public:
   GridBase* SinglePrecGrid; //Grid for single-precision fields
   LinearOperatorBase<FieldF> &Linop_f; //single precision
 
-  ConjugateGradientMultiShiftMixedPrec(Integer maxit, MultiShiftFunction &_shifts,
+  ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
 				       GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
 				       int _ReliableUpdateFreq
 				       ) : 
diff --git a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
index a92b6f7d..aa88e394 100644
--- a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
@@ -79,6 +79,27 @@ NAMESPACE_BEGIN(Grid);
       FermionField PhiEven; // the pseudo fermion field for this trajectory
       FermionField PhiOdd; // the pseudo fermion field for this trajectory
 
+    protected:
+      static constexpr bool Numerator = true;
+      static constexpr bool Denominator = false;
+
+      //Allow derived classes to override the multishift CG
+      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionField &in, FermionField &out){
+	SchurDifferentiableOperator<Impl> schurOp(numerator ? NumOp : DenOp);
+	ConjugateGradientMultiShift<FermionField> msCG(MaxIter, approx);
+	msCG(schurOp,in, out);
+      }
+      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionField &in, std::vector<FermionField> &out_elems, FermionField &out){
+	SchurDifferentiableOperator<Impl> schurOp(numerator ? NumOp : DenOp);
+	ConjugateGradientMultiShift<FermionField> msCG(MaxIter, approx);
+	msCG(schurOp,in, out_elems, out);
+      }
+      //Allow derived classes to override the gauge import
+      virtual void ImportGauge(const GaugeField &U){
+	NumOp.ImportGauge(U);
+	DenOp.ImportGauge(U);
+      }
+      
     public:
 
       GeneralEvenOddRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
@@ -188,22 +209,16 @@ NAMESPACE_BEGIN(Grid);
 	pickCheckerboard(Even,etaEven,eta);
 	pickCheckerboard(Odd,etaOdd,eta);
 
-	NumOp.ImportGauge(U);
-	DenOp.ImportGauge(U);
-
+	ImportGauge(U);
 
 	// MdagM^1/(2*inv_pow) eta
 	std::cout<<GridLogMessage << action_name() << " refresh: doing (M^dag M)^{1/" << 2*param.inv_pow << "} eta" << std::endl;
-	SchurDifferentiableOperator<Impl> MdagM(DenOp);
-	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,ApproxHalfPowerAction);
-	msCG_M(MdagM,etaOdd,tmp);
+	multiShiftInverse(Denominator, ApproxHalfPowerAction, param.MaxIter, etaOdd, tmp);
 
 	// VdagV^-1/(2*inv_pow) MdagM^1/(2*inv_pow) eta
 	std::cout<<GridLogMessage << action_name() << " refresh: doing (V^dag V)^{-1/" << 2*param.inv_pow << "} ( (M^dag M)^{1/" << 2*param.inv_pow << "} eta)" << std::endl;
-	SchurDifferentiableOperator<Impl> VdagV(NumOp);
-	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,ApproxNegHalfPowerAction);
-	msCG_V(VdagV,tmp,PhiOdd);
-
+	multiShiftInverse(Numerator, ApproxNegHalfPowerAction, param.MaxIter, tmp, PhiOdd);
+		
 	assert(NumOp.ConstEE() == 1);
 	assert(DenOp.ConstEE() == 1);
 	PhiEven = Zero();
@@ -215,29 +230,25 @@ NAMESPACE_BEGIN(Grid);
       //////////////////////////////////////////////////////
       virtual RealD S(const GaugeField &U) {
 	std::cout<<GridLogMessage << action_name() << " compute action: starting" << std::endl;
-	NumOp.ImportGauge(U);
-	DenOp.ImportGauge(U);
+	ImportGauge(U);
 
 	FermionField X(NumOp.FermionRedBlackGrid());
 	FermionField Y(NumOp.FermionRedBlackGrid());
 
 	// VdagV^1/(2*inv_pow) Phi
 	std::cout<<GridLogMessage << action_name() << " compute action: doing (V^dag V)^{1/" << 2*param.inv_pow << "} Phi" << std::endl;
-	SchurDifferentiableOperator<Impl> VdagV(NumOp);
-	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,ApproxHalfPowerAction);
-	msCG_V(VdagV,PhiOdd,X);
+	multiShiftInverse(Numerator, ApproxHalfPowerAction, param.MaxIter, PhiOdd,X);
 
 	// MdagM^-1/(2*inv_pow) VdagV^1/(2*inv_pow) Phi
 	std::cout<<GridLogMessage << action_name() << " compute action: doing (M^dag M)^{-1/" << 2*param.inv_pow << "} ( (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
-	SchurDifferentiableOperator<Impl> MdagM(DenOp);
-	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,ApproxNegHalfPowerAction);
-	msCG_M(MdagM,X,Y);
+	multiShiftInverse(Denominator, ApproxNegHalfPowerAction, param.MaxIter, X,Y);
 
 	// Randomly apply rational bounds checks.
 	if ( param.BoundsCheckFreq != 0 && (rand()%param.BoundsCheckFreq)==0 ) { 
 	  std::cout<<GridLogMessage << action_name() << " compute action: doing bounds check" << std::endl;
 	  FermionField gauss(NumOp.FermionRedBlackGrid());
 	  gauss = PhiOdd;
+	  SchurDifferentiableOperator<Impl> MdagM(DenOp);
 	  HighBoundCheck(MdagM,gauss,param.hi);
 	  InversePowerBoundsCheck(param.inv_pow,param.MaxIter,param.action_tolerance*100,MdagM,gauss,ApproxNegPowerAction);
 	}
@@ -295,21 +306,21 @@ NAMESPACE_BEGIN(Grid);
 
 	GaugeField   tmp(NumOp.GaugeGrid());
 
-	NumOp.ImportGauge(U);
-	DenOp.ImportGauge(U);
-
-	SchurDifferentiableOperator<Impl> VdagV(NumOp);
-	SchurDifferentiableOperator<Impl> MdagM(DenOp);
-
-	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,ApproxHalfPowerMD);
-	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,ApproxNegPowerMD);
+	ImportGauge(U);
 
 	std::cout<<GridLogMessage << action_name() << " deriv: doing (V^dag V)^{1/" << 2*param.inv_pow << "} Phi" << std::endl;
-	msCG_V(VdagV,PhiOdd,MpvPhi_k,MpvPhi);
+	multiShiftInverse(Numerator, ApproxHalfPowerMD, param.MaxIter, PhiOdd,MpvPhi_k,MpvPhi);
+
 	std::cout<<GridLogMessage << action_name() << " deriv: doing (M^dag M)^{-1/" << param.inv_pow << "} ( (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
-	msCG_M(MdagM,MpvPhi,MfMpvPhi_k,MfMpvPhi);
+	multiShiftInverse(Denominator, ApproxNegPowerMD, param.MaxIter, MpvPhi,MfMpvPhi_k,MfMpvPhi);
+
 	std::cout<<GridLogMessage << action_name() << " deriv: doing (V^dag V)^{1/" << 2*param.inv_pow << "} ( (M^dag M)^{-1/" << param.inv_pow << "} (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
-	msCG_V(VdagV,MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi);
+	multiShiftInverse(Numerator, ApproxHalfPowerMD, param.MaxIter, MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi);
+		
+
+	SchurDifferentiableOperator<Impl> MdagM(DenOp);
+	SchurDifferentiableOperator<Impl> VdagV(NumOp);
+
 
 	RealD ak;
 
diff --git a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
new file mode 100644
index 00000000..d154fda5
--- /dev/null
+++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
@@ -0,0 +1,93 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
+
+    Copyright (C) 2015
+
+    Author: Christopher Kelly <ckelly@bnl.gov>
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
+#define QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
+
+NAMESPACE_BEGIN(Grid);
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Generic rational approximation for ratios of operators utilizing the mixed precision multishift algorithm
+    // cf. GeneralEvenOddRational.h for details
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      
+    template<class ImplD, class ImplF>
+    class GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<ImplD> {
+    private:
+      typedef typename ImplD::FermionField FermionFieldD;
+      typedef typename ImplF::FermionField FermionFieldF;
+
+      FermionOperator<ImplD> & NumOpD;
+      FermionOperator<ImplD> & DenOpD;
+     
+      FermionOperator<ImplF> & NumOpF;
+      FermionOperator<ImplF> & DenOpF;
+
+      Integer ReliableUpdateFreq;
+    protected:
+
+      //Allow derived classes to override the multishift CG
+      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, FermionFieldD &out){
+	SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
+	SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
+
+	ConjugateGradientMultiShiftMixedPrec<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
+	msCG(schurOpD, in, out);
+      }
+      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector<FermionFieldD> &out_elems, FermionFieldD &out){
+	SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
+	SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
+
+	ConjugateGradientMultiShiftMixedPrec<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
+	msCG(schurOpD, in, out_elems, out);
+      }
+      //Allow derived classes to override the gauge import
+      virtual void ImportGauge(const typename ImplD::GaugeField &Ud){
+	typename ImplF::GaugeField Uf(NumOpF.GaugeGrid());
+	precisionChange(Uf, Ud);
+	
+	NumOpD.ImportGauge(Ud);
+	DenOpD.ImportGauge(Ud);
+
+	NumOpF.ImportGauge(Uf);
+	DenOpF.ImportGauge(Uf);
+      }
+      
+    public:
+      GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction(FermionOperator<ImplD>  &_NumOpD, FermionOperator<ImplD>  &_DenOpD, 
+							      FermionOperator<ImplF>  &_NumOpF, FermionOperator<ImplF>  &_DenOpF, 
+							      const RationalActionParams & p, Integer _ReliableUpdateFreq
+							      ) : GeneralEvenOddRatioRationalPseudoFermionAction<ImplD>(_NumOpD, _DenOpD, p),
+								  ReliableUpdateFreq(_ReliableUpdateFreq), NumOpD(_NumOpD), DenOpD(_DenOpD), NumOpF(_NumOpF), DenOpF(_DenOpF){}
+      
+      virtual std::string action_name(){return "GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction";}
+    };
+
+NAMESPACE_END(Grid);
+
+#endif
diff --git a/Grid/qcd/action/pseudofermion/PseudoFermion.h b/Grid/qcd/action/pseudofermion/PseudoFermion.h
index aa8c99a3..dfa94043 100644
--- a/Grid/qcd/action/pseudofermion/PseudoFermion.h
+++ b/Grid/qcd/action/pseudofermion/PseudoFermion.h
@@ -41,6 +41,7 @@ directory
 #include <Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
 #include <Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h>
+#include <Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h>
 
diff --git a/tests/hmc/Test_rhmc_EOWilsonRatio_doubleVsMixedPrec.cc b/tests/hmc/Test_rhmc_EOWilsonRatio_doubleVsMixedPrec.cc
new file mode 100644
index 00000000..fa14ffa8
--- /dev/null
+++ b/tests/hmc/Test_rhmc_EOWilsonRatio_doubleVsMixedPrec.cc
@@ -0,0 +1,119 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_rhmc_EOWilsonRatio_doubleVsMixedPrec.cc
+
+    Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+//This test ensures the mixed precision RHMC gives the same result as the regular double precision
+int main(int argc, char **argv) {
+  using namespace Grid;
+
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;  // Uses the default minimum norm
+
+  typedef WilsonImplD FermionImplPolicyD;
+  typedef WilsonFermionD FermionActionD;
+  typedef typename FermionActionD::FermionField FermionFieldD;
+
+  typedef WilsonImplF FermionImplPolicyF;
+  typedef WilsonFermionF FermionActionF;
+  typedef typename FermionActionF::FermionField FermionFieldF;
+
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  HMCWrapper TheHMC;
+  TheHMC.Resources.AddFourDimGrid("gauge");
+
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  auto GridPtrD = TheHMC.Resources.GetCartesian();
+  auto GridRBPtrD = TheHMC.Resources.GetRBCartesian();
+
+  GridCartesian* GridPtrF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
+  GridRedBlackCartesian* GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF);
+
+  // temporarily need a gauge field
+  LatticeGaugeFieldD Ud(GridPtrD);
+  LatticeGaugeFieldF Uf(GridPtrF);
+
+  Real mass = -0.77;
+  Real pv   = 0.0;
+
+  FermionActionD DenOpD(Ud, *GridPtrD, *GridRBPtrD, mass);
+  FermionActionD NumOpD(Ud, *GridPtrD, *GridRBPtrD, pv);
+
+  FermionActionF DenOpF(Uf, *GridPtrF, *GridRBPtrF, mass);
+  FermionActionF NumOpF(Uf, *GridPtrF, *GridRBPtrF, pv);
+
+  TheHMC.Resources.AddRNGs();
+  PeriodicGimplR::HotConfiguration(TheHMC.Resources.GetParallelRNG(), Ud);
+
+  std::string seed_string = "the_seed";
+
+  //Setup the pseudofermion actions
+  RationalActionParams GenParams;
+  GenParams.inv_pow = 2;
+  GenParams.lo = 1e-2;
+  GenParams.hi = 64.0;
+  GenParams.MaxIter = 1000;
+  GenParams.action_tolerance = GenParams.md_tolerance = 1e-6;
+  GenParams.action_degree = GenParams.md_degree = 6;
+  GenParams.precision = 64;
+  GenParams.BoundsCheckFreq = 20;
+
+  GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicyD> GenD(NumOpD,DenOpD,GenParams);
+  GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD, FermionImplPolicyF> GenFD(NumOpD, DenOpD, 
+													NumOpF, DenOpF, 
+													GenParams, 50);
+  TheHMC.Resources.GetParallelRNG().SeedUniqueString(seed_string);
+  GenD.refresh(Ud, TheHMC.Resources.GetParallelRNG());    
+  RealD Sd = GenD.S(Ud);
+  LatticeGaugeField derivD(Ud);
+  GenD.deriv(Ud,derivD);   
+
+  TheHMC.Resources.GetParallelRNG().SeedUniqueString(seed_string);
+  GenFD.refresh(Ud, TheHMC.Resources.GetParallelRNG());    
+  RealD Sfd = GenFD.S(Ud);
+  LatticeGaugeField derivFD(Ud);
+  GenFD.deriv(Ud,derivFD);   
+
+  //Compare
+  std::cout << "Action : " << Sd << " " << Sfd << " reldiff " << (Sd - Sfd)/Sd << std::endl;
+  
+  LatticeGaugeField diff(Ud);
+  axpy(diff, -1.0, derivD, derivFD);
+  std::cout << "Norm of difference in deriv " << sqrt(norm2(diff)) << std::endl;
+
+  Grid_finalize();
+  return 0;
+}
+

From 9c106d625a1a41489ae220649085ca0696c5c0ab Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Mon, 25 Jan 2021 15:07:44 -0500
Subject: [PATCH 17/41] Added HMC main program designed to reproduce the
 16^3x32x16 DWF+I ensembles with beta=2.13 and Gparity BCs

---
 HMC/DWF2p1fIwasakiGparity.cc | 316 +++++++++++++++++++++++++++++++++++
 1 file changed, 316 insertions(+)
 create mode 100644 HMC/DWF2p1fIwasakiGparity.cc

diff --git a/HMC/DWF2p1fIwasakiGparity.cc b/HMC/DWF2p1fIwasakiGparity.cc
new file mode 100644
index 00000000..ec1f21e2
--- /dev/null
+++ b/HMC/DWF2p1fIwasakiGparity.cc
@@ -0,0 +1,316 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./HMC/DWF2p1fIwasakiGparity.cc
+
+Copyright (C) 2015-2016
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+
+//2+1f DWF+I ensemble with G-parity BCs
+//designed to reproduce ensembles in https://arxiv.org/pdf/1908.08640.pdf
+struct RatQuoParameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters,
+				  double, bnd_lo,
+				  double, bnd_hi,
+				  Integer, action_degree,
+				  double, action_tolerance,
+				  Integer, md_degree,
+				  double, md_tolerance,
+				  Integer, reliable_update_freq,
+				  Integer, bnd_check_freq);
+  RatQuoParameters() { 
+    bnd_lo = 1e-2;
+    bnd_hi = 30;
+    action_degree = 10;
+    action_tolerance = 1e-10;
+    md_degree = 10;
+    md_tolerance = 1e-8;
+    bnd_check_freq = 20;
+    reliable_update_freq = 50;
+  }
+
+  void Export(RationalActionParams &into) const{
+    into.lo = bnd_lo;
+    into.hi = bnd_hi;
+    into.action_degree = action_degree;
+    into.action_tolerance = action_tolerance;
+    into.md_degree = md_degree;
+    into.md_tolerance = md_tolerance;
+    into.BoundsCheckFreq = bnd_check_freq;
+  }
+};
+
+
+struct EvolParameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters,
+                                  Integer, StartTrajectory,
+                                  Integer, Trajectories,
+				  Integer, SaveInterval,
+                                  bool, MetropolisTest,
+				  std::string, StartingType,
+				  std::vector<Integer>, GparityDirs,
+				  RatQuoParameters, rat_quo_l,
+				  RatQuoParameters, rat_quo_s);
+
+  EvolParameters() {
+    //For initial thermalization; afterwards user should switch Metropolis on and use StartingType=CheckpointStart
+    MetropolisTest    = false;
+    StartTrajectory   = 0;
+    Trajectories      = 50;
+    SaveInterval = 5;
+    StartingType      = "ColdStart";
+    GparityDirs.resize(3, 1); //1 for G-parity, 0 for periodic
+  }
+};
+
+bool fileExists(const std::string &fn){
+  std::ifstream f(fn);
+  return f.good();
+}
+
+int main(int argc, char **argv) {
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  // here make a routine to print all the relevant information on the run
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+  //Read the user parameters
+  EvolParameters user_params;
+  
+  if(fileExists("params.xml")){
+    std::cout << GridLogMessage << " Reading params.xml" << std::endl;
+    Grid::XmlReader rd("params.xml");
+    read(rd, "Params", user_params);
+  }else if(!GlobalSharedMemory::WorldRank){
+    std::cout << GridLogMessage << " File params.xml does not exist" << std::endl;
+    std::cout << GridLogMessage << " Writing xml template to params.xml.templ" << std::endl;
+    Grid::XmlWriter wr("params.xml.templ");
+    write(wr, "Params", user_params);
+
+    std::cout << GridLogMessage << " Done" << std::endl;
+    Grid_finalize();
+    return 0;
+  }
+
+  //Check the parameters
+  if(user_params.GparityDirs.size() != Nd-1){
+    std::cerr << "Error in input parameters: expect GparityDirs to have size = " << Nd-1 << std::endl;
+    exit(1);
+  }
+  for(int i=0;i<Nd-1;i++)
+    if(user_params.GparityDirs[i] != 0 && user_params.GparityDirs[i] != 1){
+      std::cerr << "Error in input parameters: expect GparityDirs values to be 0 (periodic) or 1 (G-parity)" << std::endl;
+      exit(1);
+    }
+
+   // Typedefs to simplify notation
+  typedef GparityDomainWallFermionD FermionActionD;
+  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
+  typedef typename FermionActionD::FermionField FermionFieldD;
+
+  typedef GparityDomainWallFermionF FermionActionF;
+  typedef typename FermionActionF::Impl_t FermionImplPolicyF;
+  typedef typename FermionActionF::FermionField FermionFieldF;
+
+  typedef GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD,FermionImplPolicyF> MixedPrecRHMC;
+
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  IntegratorParameters MD;
+  typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
+  MD.name    = std::string("MinimumNorm2");
+  MD.MDsteps = 5; //5 steps of 0.2 for GP* ensembles
+  MD.trajL   = 1.0;
+
+  HMCparameters HMCparams;
+  HMCparams.StartTrajectory  = user_params.StartTrajectory;
+  HMCparams.Trajectories     = user_params.Trajectories;
+  HMCparams.NoMetropolisUntil= 0;
+  HMCparams.StartingType     = user_params.StartingType;
+  HMCparams.MetropolisTest = user_params.MetropolisTest;
+  HMCparams.MD = MD;
+  HMCWrapper TheHMC(HMCparams);
+
+  // Grid from the command line arguments --grid and --mpi
+  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
+
+  CheckpointerParameters CPparams;
+  CPparams.config_prefix = "ckpoint_lat";
+  CPparams.rng_prefix    = "ckpoint_rng";
+  CPparams.saveInterval  = user_params.SaveInterval;
+  CPparams.format        = "IEEE64BIG";
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  //Note that checkpointing saves the RNG state so that this initialization is required only for the very first configuration
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+  //////////////////////////////////////////////
+
+  const int Ls      = 16;
+  Real beta         = 2.13;
+  Real light_mass   = 0.01;
+  Real strange_mass = 0.032;
+  Real pv_mass      = 1.0;
+  RealD M5  = 1.8;
+
+  //Setup the Grids
+  auto GridPtrD   = TheHMC.Resources.GetCartesian();
+  auto GridRBPtrD = TheHMC.Resources.GetRBCartesian();
+  auto FGridD     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrD);
+  auto FrbGridD   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrD);
+
+  GridCartesian* GridPtrF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
+  GridRedBlackCartesian* GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF);
+  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF);
+  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF);
+
+  ConjugateIwasakiGaugeActionD GaugeAction(beta);
+
+  // temporarily need a gauge field
+  LatticeGaugeFieldD Ud(GridPtrD);
+  LatticeGaugeFieldF Uf(GridPtrF);
+
+  
+  //Setup the BCs
+  FermionActionD::ImplParams Params;
+  for(int i=0;i<Nd-1;i++) Params.twists = user_params.GparityDirs[i]; //G-parity directions
+  Params.twists[Nd-1] = 1; //APBC in time direction
+
+  std::vector<int> dirs4(Nd);
+  for(int i=0;i<Nd-1;i++) dirs4[i] = user_params.GparityDirs[i];
+  dirs4[Nd-1] = 0; //periodic gauge BC in time
+
+  ConjugateGimplD::setDirections(dirs4); //gauge BC
+
+
+  ////////////////////////////////////
+  // Collect actions
+  ////////////////////////////////////
+  ActionLevel<HMCWrapper::Field> Level1(1); //light quark 
+  ActionLevel<HMCWrapper::Field> Level2(1); //strange quark
+  ActionLevel<HMCWrapper::Field> Level3(8); //gauge (8 increments per step)
+
+  ////////////////////////////////////
+  // Strange action
+  ////////////////////////////////////
+
+  //Use same parameters as used for 16GPX ensembles
+  RationalActionParams rat_act_params_s;
+
+  rat_act_params_s.inv_pow  = 4; // (M^dag M)^{1/4}
+  rat_act_params_s.precision= 60;
+  rat_act_params_s.MaxIter  = 10000;
+  user_params.rat_quo_s.Export(rat_act_params_s);
+
+  //For the 16GPX ensembles we used Hasenbusch mass splitting:
+  //  det[ (M^dag(0.032) M(0.032)) / (M^dag(1.0) M(1.0)) ]^{1/4}    * det[ (M^dag(0.01) M(0.01)) / (M^dag(1.0) M(1.0)) ]^{1/2}
+  //=
+  // [ det[ (M^dag(0.032) M(0.032)) / (M^dag(1.0) M(1.0)) ]^{1/4}  ]^3    * det[ (M^dag(0.01) M(0.01)) / (M^dag(0.032) M(0.032)) ]^{1/2} 
+
+  //I don't know if it's actually necessary for the action objects to be independent instances...
+  int n_hasenbusch_s = 3;
+  std::vector<FermionActionD*> Numerators_sD(n_hasenbusch_s);
+  std::vector<FermionActionD*> Denominators_sD(n_hasenbusch_s);
+  std::vector<FermionActionF*> Numerators_sF(n_hasenbusch_s);
+  std::vector<FermionActionF*> Denominators_sF(n_hasenbusch_s);
+
+  std::vector<MixedPrecRHMC*> Quotients_s(n_hasenbusch_s);
+
+  for(int h=0;h<n_hasenbusch_s;h++){
+    Numerators_sD[h] = new FermionActionD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD,strange_mass,M5,Params);
+    Denominators_sD[h] = new FermionActionD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD,pv_mass,  M5,Params);   
+
+    Numerators_sF[h] = new FermionActionF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,strange_mass,M5,Params);
+    Denominators_sF[h] = new FermionActionF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,pv_mass,  M5,Params);   
+
+    //note I define the numerator operator wrt how they appear in the determinant    
+    Quotients_s[h] = new MixedPrecRHMC(*Denominators_sD[h], *Numerators_sD[h], *Denominators_sF[h], *Numerators_sF[h], rat_act_params_s, user_params.rat_quo_s.reliable_update_freq); 
+    Level2.push_back(Quotients_s[h]);
+  }
+
+  /////////////////////////////////////////////////////////////
+  // Light action
+  /////////////////////////////////////////////////////////////
+ 
+  //We don't Hasenbusch the light quark directly, instead the denominator mass is set equal to the strange mass; cf above
+  FermionActionD Numerator_lD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, light_mass,M5,Params);
+  FermionActionD Denominator_lD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, strange_mass,M5,Params);
+
+  FermionActionF Numerator_lF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, light_mass,M5,Params);
+  FermionActionF Denominator_lF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, strange_mass,M5,Params);
+
+  RationalActionParams rat_act_params_l;
+  rat_act_params_l.inv_pow  = 2; // (M^dag M)^{1/2}
+  rat_act_params_l.precision= 60;
+  rat_act_params_l.MaxIter  = 10000;
+  user_params.rat_quo_l.Export(rat_act_params_l);
+ 
+  MixedPrecRHMC Quotient_l(Denominator_lD, Numerator_lD, Denominator_lF, Numerator_lF, rat_act_params_l, user_params.rat_quo_l.reliable_update_freq);
+  Level1.push_back(&Quotient_l);
+
+  /////////////////////////////////////////////////////////////
+  // Gauge action
+  /////////////////////////////////////////////////////////////
+  Level3.push_back(&GaugeAction);
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+  TheHMC.TheAction.push_back(Level3);
+  std::cout << GridLogMessage << " Action complete "<< std::endl;
+
+  /////////////////////////////////////////////////////////////
+  // HMC parameters are serialisable
+  
+  if(0){
+    TheHMC.Resources.AddRNGs();
+    ConjugateGimplR::HotConfiguration(TheHMC.Resources.GetParallelRNG(), Ud);
+    Quotient_l.refresh(Ud, TheHMC.Resources.GetParallelRNG());    
+    LatticeGaugeFieldD out(Ud);
+    std::cout << GridLogMessage << " Running the derivative "<< std::endl;
+    Quotient_l.deriv(Ud,out);    
+    std::cout << GridLogMessage << " Finished running the derivative "<< std::endl;
+    Numerator_lD.Report();
+    Denominator_lD.Report();
+  }
+
+
+  if(1){
+    std::cout << GridLogMessage << " Running the HMC "<< std::endl;
+    TheHMC.Run();  // no smearing
+  }
+
+
+  std::cout << GridLogMessage << " Done" << std::endl;
+  Grid_finalize();
+  return 0;
+} // main
+

From e6c6f82c52d1f777ba006480d8114032b37cf707 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Wed, 27 Jan 2021 11:18:41 -0500
Subject: [PATCH 18/41] Gparity DWF+I HMC main program now has option to
 specify parameter file

---
 HMC/DWF2p1fIwasakiGparity.cc | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/HMC/DWF2p1fIwasakiGparity.cc b/HMC/DWF2p1fIwasakiGparity.cc
index ec1f21e2..cfa8f05c 100644
--- a/HMC/DWF2p1fIwasakiGparity.cc
+++ b/HMC/DWF2p1fIwasakiGparity.cc
@@ -100,17 +100,26 @@ int main(int argc, char **argv) {
   // here make a routine to print all the relevant information on the run
   std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
 
+  std::string param_file = "params.xml";
+  for(int i=1;i<argc;i++){
+    if(std::string(argv[i]) == "--param_file"){
+      assert(i!=argc-1);
+      param_file = argv[i+1];
+      break;
+    }
+  }
+
   //Read the user parameters
   EvolParameters user_params;
   
-  if(fileExists("params.xml")){
-    std::cout << GridLogMessage << " Reading params.xml" << std::endl;
-    Grid::XmlReader rd("params.xml");
+  if(fileExists(param_file)){
+    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
+    Grid::XmlReader rd(param_file);
     read(rd, "Params", user_params);
   }else if(!GlobalSharedMemory::WorldRank){
-    std::cout << GridLogMessage << " File params.xml does not exist" << std::endl;
-    std::cout << GridLogMessage << " Writing xml template to params.xml.templ" << std::endl;
-    Grid::XmlWriter wr("params.xml.templ");
+    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
+    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
+    Grid::XmlWriter wr(param_file + ".templ");
     write(wr, "Params", user_params);
 
     std::cout << GridLogMessage << " Done" << std::endl;

From 6cc3ad110c60ab3bcf087dec851011a2dcef66fa Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Fri, 29 Jan 2021 12:35:00 -0500
Subject: [PATCH 19/41] Improved logging output for RHMC bounds checks In
 GenericHMCRunner, exposed functionality for initializing gauge fields and RNG
 for external use

---
 Grid/qcd/action/pseudofermion/Bounds.h        | 18 ++++++-----
 .../GeneralEvenOddRationalRatio.h             | 18 +++++------
 Grid/qcd/hmc/GenericHMCrunner.h               | 32 ++++++++++++-------
 Grid/qcd/hmc/HMCResourceManager.h             |  3 ++
 4 files changed, 42 insertions(+), 29 deletions(-)

diff --git a/Grid/qcd/action/pseudofermion/Bounds.h b/Grid/qcd/action/pseudofermion/Bounds.h
index c428d5ee..47338db7 100644
--- a/Grid/qcd/action/pseudofermion/Bounds.h
+++ b/Grid/qcd/action/pseudofermion/Bounds.h
@@ -40,10 +40,11 @@ NAMESPACE_BEGIN(Grid);
       X=X-Y;
       RealD Nd = norm2(X);
       std::cout << "************************* "<<std::endl;
-      std::cout << " noise                         = "<<Nx<<std::endl;
-      std::cout << " (MdagM^-1/2)^2  noise         = "<<Nz<<std::endl;
-      std::cout << " MdagM (MdagM^-1/2)^2  noise   = "<<Ny<<std::endl;
-      std::cout << " noise - MdagM (MdagM^-1/2)^2  noise   = "<<Nd<<std::endl;
+      std::cout << " | noise |^2                         = "<<Nx<<std::endl;
+      std::cout << " | (MdagM^-1/2)^2  noise |^2         = "<<Nz<<std::endl;
+      std::cout << " | MdagM (MdagM^-1/2)^2  noise |^2   = "<<Ny<<std::endl;
+      std::cout << " | noise - MdagM (MdagM^-1/2)^2  noise |^2  = "<<Nd<<std::endl;
+      std::cout << " | noise - MdagM (MdagM^-1/2)^2  noise|/|noise| = " << std::sqrt(Nd/Nx) << std::endl;
       std::cout << "************************* "<<std::endl;
       assert( (std::sqrt(Nd/Nx)<tol) && " InverseSqrtBoundsCheck ");
     }
@@ -90,10 +91,11 @@ NAMESPACE_BEGIN(Grid);
       X=X-Y;
       RealD Nd = norm2(X);
       std::cout << "************************* "<<std::endl;
-      std::cout << " noise                         = "<<Nx<<std::endl;
-      std::cout << " (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise         = "<<Nz<<std::endl;
-      std::cout << " MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise   = "<<Ny<<std::endl;
-      std::cout << " noise - MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise   = "<<Nd<<std::endl;
+      std::cout << " | noise |^2                         = "<<Nx<<std::endl;
+      std::cout << " | (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2        = "<<Nz<<std::endl;
+      std::cout << " | MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2   = "<<Ny<<std::endl;
+      std::cout << " | noise - MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2  = "<<Nd<<std::endl;
+      std::cout << " | noise - MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |/| noise |  = "<<std::sqrt(Nd/Nx)<<std::endl;
       std::cout << "************************* "<<std::endl;
       assert( (std::sqrt(Nd/Nx)<tol) && " InversePowerBoundsCheck ");
     }
diff --git a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
index aa88e394..d1aaeb93 100644
--- a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
@@ -179,7 +179,9 @@ NAMESPACE_BEGIN(Grid);
 	sstream << GridLogMessage << "["<<action_name()<<"] Precision          :" << param.precision <<  std::endl;
 	return sstream.str();
       }
-      
+
+      //Access the fermion field
+      const FermionField &getPhiOdd() const{ return PhiOdd; }
       
       virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
 
@@ -189,21 +191,19 @@ NAMESPACE_BEGIN(Grid);
 	//        = e^{- phi^dag  (VdagV)^1/(2*inv_pow) (MdagM)^-1/(2*inv_pow) (MdagM)^-1/(2*inv_pow)  (VdagV)^1/(2*inv_pow) phi}
 	//
 	// Phi =  (VdagV)^-1/(2*inv_pow) Mdag^{1/(2*inv_pow)} eta 
-	//
-	// P(eta) = e^{- eta^dag eta}
-	//	
-	// General gaussian random draws from   e^{x^2/(2 sig^2)} => sig^2 = 0.5.
-	// 
-	// So eta should be of width sig = 1/sqrt(2).
-
+	
 	std::cout<<GridLogMessage << action_name() << " refresh: starting" << std::endl;
-	RealD scale = std::sqrt(0.5);
 
 	FermionField eta(NumOp.FermionGrid());
 	FermionField etaOdd (NumOp.FermionRedBlackGrid());
 	FermionField etaEven(NumOp.FermionRedBlackGrid());
 	FermionField     tmp(NumOp.FermionRedBlackGrid());
 
+	// P(eta) \propto e^{- eta^dag eta}
+	//	
+	// The gaussian function draws from  P(x) \propto e^{- x^2 / 2 }    [i.e. sigma=1]
+	// Thus eta = x/sqrt{2} = x * sqrt(1/2)
+	RealD scale = std::sqrt(0.5);
 	gaussian(pRNG,eta);	eta=eta*scale;
 
 	pickCheckerboard(Even,etaEven,eta);
diff --git a/Grid/qcd/hmc/GenericHMCrunner.h b/Grid/qcd/hmc/GenericHMCrunner.h
index 98e8175a..73381351 100644
--- a/Grid/qcd/hmc/GenericHMCrunner.h
+++ b/Grid/qcd/hmc/GenericHMCrunner.h
@@ -129,18 +129,10 @@ public:
     Runner(S);
   }
 
-  //////////////////////////////////////////////////////////////////
-
-private:
-  template <class SmearingPolicy>
-  void Runner(SmearingPolicy &Smearing) {
-    auto UGrid = Resources.GetCartesian();
-    Resources.AddRNGs();
-    Field U(UGrid);
-
-    // Can move this outside?
-    typedef IntegratorType<SmearingPolicy> TheIntegrator;
-    TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing);
+  //Use the checkpointer to initialize the RNGs and the gauge field, writing the resulting gauge field into U.
+  //This is called automatically by Run but may be useful elsewhere, e.g. for integrator tuning experiments
+  void initializeGaugeFieldAndRNGs(Field &U){
+    if(!Resources.haveRNGs()) Resources.AddRNGs();
 
     if (Parameters.StartingType == "HotStart") {
       // Hot start
@@ -167,6 +159,22 @@ private:
 	<< "Valid [HotStart, ColdStart, TepidStart, CheckpointStart]\n";
       exit(1);
     }
+  }
+
+
+
+  //////////////////////////////////////////////////////////////////
+
+private:
+  template <class SmearingPolicy>
+  void Runner(SmearingPolicy &Smearing) {
+    auto UGrid = Resources.GetCartesian();
+    Field U(UGrid);
+
+    initializeGaugeFieldAndRNGs(U);
+
+    typedef IntegratorType<SmearingPolicy> TheIntegrator;
+    TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing);
 
     Smearing.set_Field(U);
 
diff --git a/Grid/qcd/hmc/HMCResourceManager.h b/Grid/qcd/hmc/HMCResourceManager.h
index 783e4890..c088cd41 100644
--- a/Grid/qcd/hmc/HMCResourceManager.h
+++ b/Grid/qcd/hmc/HMCResourceManager.h
@@ -226,6 +226,9 @@ public:
   //////////////////////////////////////////////////////
   // Random number generators
   //////////////////////////////////////////////////////
+  
+  //Return true if the RNG objects have been instantiated
+  bool haveRNGs() const{ return have_RNG; }
 
   void AddRNGs(std::string s = "") {
     // Couple the RNGs to the GridModule tagged by s

From cee6a37639c6716029cbae49356feb505014a937 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Mon, 8 Feb 2021 09:30:35 -0500
Subject: [PATCH 20/41] Added a logging tag for HMC As the integrator logger is
 active by default the cmdline option to activate had no effect. Changed
 option to *de*activate on request ("NoIntegrator") Cleaned up generating
 rational approxs in the general RHMC code As the tolerance of the rational
 approx is not related to the CG tolerance, regenerating approxs for MD and MC
 if they differ only by the CG tolerance is not necessary; this has been fixed
 In DWF+I Gparity evolution code, added cmdline options to check the rational
 approximations and compute the lowest/highest eigenvalues of M^dagM for RHMC
 tuning In the above, changed the integrator layout to a much simpler one that
 completes much faster; may need additional tuning

---
 Grid/algorithms/approx/Chebyshev.h            |   1 +
 Grid/log/Log.cc                               |   5 +-
 Grid/log/Log.h                                |   1 +
 .../GeneralEvenOddRationalRatio.h             |  48 ++--
 Grid/qcd/hmc/HMC.h                            |  28 +-
 Grid/qcd/hmc/integrators/Integrator.h         |   2 +-
 HMC/DWF2p1fIwasakiGparity.cc                  | 266 +++++++++++++-----
 7 files changed, 237 insertions(+), 114 deletions(-)

diff --git a/Grid/algorithms/approx/Chebyshev.h b/Grid/algorithms/approx/Chebyshev.h
index 584ed1d5..a3ddb75f 100644
--- a/Grid/algorithms/approx/Chebyshev.h
+++ b/Grid/algorithms/approx/Chebyshev.h
@@ -292,6 +292,7 @@ public:
 template<class Field>
 class ChebyshevLanczos : public Chebyshev<Field> {
 private:
+
   std::vector<RealD> Coeffs;
   int order;
   RealD alpha;
diff --git a/Grid/log/Log.cc b/Grid/log/Log.cc
index 9302b4cc..31f9a3f3 100644
--- a/Grid/log/Log.cc
+++ b/Grid/log/Log.cc
@@ -69,6 +69,7 @@ GridLogger GridLogDebug  (1, "Debug", GridLogColours, "PURPLE");
 GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
 GridLogger GridLogIterative  (1, "Iterative", GridLogColours, "BLUE");
 GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
+GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE");
 
 void GridLogConfigure(std::vector<std::string> &logstreams) {
   GridLogError.Active(0);
@@ -79,6 +80,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
   GridLogPerformance.Active(0);
   GridLogIntegrator.Active(1);
   GridLogColours.Active(0);
+  GridLogHMC.Active(1);
 
   for (int i = 0; i < logstreams.size(); i++) {
     if (logstreams[i] == std::string("Error"))       GridLogError.Active(1);
@@ -87,7 +89,8 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
     if (logstreams[i] == std::string("Iterative"))   GridLogIterative.Active(1);
     if (logstreams[i] == std::string("Debug"))       GridLogDebug.Active(1);
     if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
-    if (logstreams[i] == std::string("Integrator"))  GridLogIntegrator.Active(1);
+    if (logstreams[i] == std::string("NoIntegrator"))  GridLogIntegrator.Active(0);
+    if (logstreams[i] == std::string("NoHMC"))         GridLogHMC.Active(0);
     if (logstreams[i] == std::string("Colours"))     GridLogColours.Active(1);
   }
 }
diff --git a/Grid/log/Log.h b/Grid/log/Log.h
index 68693647..b1696fee 100644
--- a/Grid/log/Log.h
+++ b/Grid/log/Log.h
@@ -182,6 +182,7 @@ extern GridLogger GridLogDebug  ;
 extern GridLogger GridLogPerformance;
 extern GridLogger GridLogIterative  ;
 extern GridLogger GridLogIntegrator  ;
+extern GridLogger GridLogHMC;
 extern Colours    GridLogColours;
 
 std::string demangle(const char* name) ;
diff --git a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
index d1aaeb93..ab400ba7 100644
--- a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
@@ -79,6 +79,19 @@ NAMESPACE_BEGIN(Grid);
       FermionField PhiEven; // the pseudo fermion field for this trajectory
       FermionField PhiOdd; // the pseudo fermion field for this trajectory
 
+      //Generate the approximation to x^{1/inv_pow} (->approx)   and x^{-1/inv_pow} (-> approx_inv)  by an approx_degree degree rational approximation
+      //CG_tolerance is used to issue a warning if the approximation error is larger than the tolerance of the CG and is otherwise just stored in the MultiShiftFunction for use by the multi-shift
+      static void generateApprox(MultiShiftFunction &approx, MultiShiftFunction &approx_inv, int inv_pow, int approx_degree, double CG_tolerance, AlgRemez &remez){
+	std::cout<<GridLogMessage << "Generating degree "<< approx_degree<<" approximation for x^(1/" << inv_pow << ")"<<std::endl;
+	double error = remez.generateApprox(approx_degree,1,inv_pow);	
+	if(error > CG_tolerance)
+	  std::cout<<GridLogMessage << "WARNING: Remez approximation has a larger error " << error << " than the CG tolerance " << CG_tolerance << "! Try increasing the number of poles" << std::endl;
+	
+	approx.Init(remez, CG_tolerance,false);
+	approx_inv.Init(remez, CG_tolerance,true);
+      }
+
+
     protected:
       static constexpr bool Numerator = true;
       static constexpr bool Denominator = false;
@@ -115,38 +128,14 @@ NAMESPACE_BEGIN(Grid);
 	std::cout<<GridLogMessage << action_name() << " initialize: starting" << std::endl;
 	AlgRemez remez(param.lo,param.hi,param.precision);
 
-	int inv_pow = param.inv_pow;
-	int _2_inv_pow = 2*inv_pow;
-
 	//Generate approximations for action eval
-
-	// MdagM^(+- 1/inv_pow)
-	std::cout<<GridLogMessage << "Generating degree "<<param.action_degree<<" and tolerance " << param.action_tolerance << " for x^(1/" << inv_pow << ")"<<std::endl;
-	remez.generateApprox(param.action_degree,1,inv_pow);
-	ApproxPowerAction.Init(remez,param.action_tolerance,false);
-	ApproxNegPowerAction.Init(remez,param.action_tolerance,true);
-
-	// VdagV^(+- 1/(2*inv_pow))
-	std::cout<<GridLogMessage << "Generating degree "<<param.action_degree<<" and tolerance " << param.action_tolerance <<" for x^(1/" << _2_inv_pow << ")"<<std::endl;
-	remez.generateApprox(param.action_degree,1,_2_inv_pow);
-   	ApproxHalfPowerAction.Init(remez,param.action_tolerance,false);
-	ApproxNegHalfPowerAction.Init(remez,param.action_tolerance,true);
+	generateApprox(ApproxPowerAction, ApproxNegPowerAction, param.inv_pow, param.action_degree, param.action_tolerance, remez);
+	generateApprox(ApproxHalfPowerAction, ApproxNegHalfPowerAction, 2*param.inv_pow, param.action_degree, param.action_tolerance, remez);
 
 	//Generate approximations for MD
-	if(param.md_degree != param.action_degree || 
-	   param.md_tolerance < param.action_tolerance  //no point in finding less precise polynomial if the degree is the same
-	   ){
-	  // MdagM^(+- 1/inv_pow)
-	  std::cout<<GridLogMessage << "Generating degree "<<param.md_degree<<" and tolerance " << param.md_tolerance <<" for x^(1/" << inv_pow << ")"<<std::endl;
-	  remez.generateApprox(param.md_degree,1,inv_pow);
-	  ApproxPowerMD.Init(remez,param.md_tolerance,false);
-	  ApproxNegPowerMD.Init(remez,param.md_tolerance,true);
-
-	  // VdagV^(+- 1/(2*inv_pow))
-	  std::cout<<GridLogMessage << "Generating degree "<<param.md_degree<<" and tolerance " << param.md_tolerance <<" for x^(1/" << _2_inv_pow << ")"<<std::endl;
-	  remez.generateApprox(param.md_degree,1,_2_inv_pow);
-	  ApproxHalfPowerMD.Init(remez,param.md_tolerance,false);
-	  ApproxNegHalfPowerMD.Init(remez,param.md_tolerance,true);
+	if(param.md_degree != param.action_degree){ //note the CG tolerance is unrelated to the stopping condition of the Remez algorithm
+	  generateApprox(ApproxPowerMD, ApproxNegPowerMD, param.inv_pow, param.md_degree, param.md_tolerance, remez);
+	  generateApprox(ApproxHalfPowerMD, ApproxNegHalfPowerMD, 2*param.inv_pow, param.md_degree, param.md_tolerance, remez);
 	}else{
 	  std::cout<<GridLogMessage << "Using same rational approximations for MD as for action evaluation" << std::endl;
 	  ApproxPowerMD = ApproxPowerAction; 
@@ -156,7 +145,6 @@ NAMESPACE_BEGIN(Grid);
 
 	  ApproxHalfPowerMD = ApproxHalfPowerAction;
 	  ApproxNegHalfPowerMD = ApproxNegHalfPowerAction;
-
 	  for(int i=0;i<ApproxPowerMD.tolerances.size();i++)
 	    ApproxNegHalfPowerMD.tolerances[i] = ApproxHalfPowerMD.tolerances[i] = param.md_tolerance;
 	}
diff --git a/Grid/qcd/hmc/HMC.h b/Grid/qcd/hmc/HMC.h
index 9bf5261b..4410cb12 100644
--- a/Grid/qcd/hmc/HMC.h
+++ b/Grid/qcd/hmc/HMC.h
@@ -115,21 +115,21 @@ private:
 
     random(sRNG, rn_test);
 
-    std::cout << GridLogMessage
+    std::cout << GridLogHMC
               << "--------------------------------------------------\n";
-    std::cout << GridLogMessage << "exp(-dH) = " << prob
+    std::cout << GridLogHMC << "exp(-dH) = " << prob
               << "  Random = " << rn_test << "\n";
-    std::cout << GridLogMessage
+    std::cout << GridLogHMC
               << "Acc. Probability = " << ((prob < 1.0) ? prob : 1.0) << "\n";
 
     if ((prob > 1.0) || (rn_test <= prob)) {  // accepted
-      std::cout << GridLogMessage << "Metropolis_test -- ACCEPTED\n";
-      std::cout << GridLogMessage
+      std::cout << GridLogHMC << "Metropolis_test -- ACCEPTED\n";
+      std::cout << GridLogHMC
                 << "--------------------------------------------------\n";
       return true;
     } else {  // rejected
-      std::cout << GridLogMessage << "Metropolis_test -- REJECTED\n";
-      std::cout << GridLogMessage
+      std::cout << GridLogHMC << "Metropolis_test -- REJECTED\n";
+      std::cout << GridLogHMC
                 << "--------------------------------------------------\n";
       return false;
     }
@@ -145,7 +145,7 @@ private:
 
     std::streamsize current_precision = std::cout.precision();
     std::cout.precision(15);
-    std::cout << GridLogMessage << "Total H before trajectory = " << H0 << "\n";
+    std::cout << GridLogHMC << "Total H before trajectory = " << H0 << "\n";
     std::cout.precision(current_precision);
 
     TheIntegrator.integrate(U);
@@ -165,7 +165,7 @@ private:
 
 
     std::cout.precision(15);
-    std::cout << GridLogMessage << "Total H after trajectory  = " << H1
+    std::cout << GridLogHMC << "Total H after trajectory  = " << H1
 	      << "  dH = " << H1 - H0 << "\n";
     std::cout.precision(current_precision);
     
@@ -196,9 +196,9 @@ public:
     // Actual updates (evolve a copy Ucopy then copy back eventually)
     unsigned int FinalTrajectory = Params.Trajectories + Params.NoMetropolisUntil + Params.StartTrajectory;
     for (int traj = Params.StartTrajectory; traj < FinalTrajectory; ++traj) {
-      std::cout << GridLogMessage << "-- # Trajectory = " << traj << "\n";
+      std::cout << GridLogHMC << "-- # Trajectory = " << traj << "\n";
       if (traj < Params.StartTrajectory + Params.NoMetropolisUntil) {
-      	std::cout << GridLogMessage << "-- Thermalization" << std::endl;
+      	std::cout << GridLogHMC << "-- Thermalization" << std::endl;
       }
       
       double t0=usecond();
@@ -210,7 +210,7 @@ public:
       if (Params.MetropolisTest && traj >= Params.StartTrajectory + Params.NoMetropolisUntil) {
         accept = metropolis_test(DeltaH);
       } else {
-      	std::cout << GridLogMessage << "Skipping Metropolis test" << std::endl;
+      	std::cout << GridLogHMC << "Skipping Metropolis test" << std::endl;
       }
 
       if (accept)
@@ -219,7 +219,7 @@ public:
      
       
       double t1=usecond();
-      std::cout << GridLogMessage << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl;
+      std::cout << GridLogHMC << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl;
 
 
       for (int obs = 0; obs < Observables.size(); obs++) {
@@ -228,7 +228,7 @@ public:
       	std::cout << GridLogDebug << "Observables pointer " << Observables[obs] << std::endl;
         Observables[obs]->TrajectoryComplete(traj + 1, Ucur, sRNG, pRNG);
       }
-      std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl;
+      std::cout << GridLogHMC << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl;
     }
   }
 
diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h
index 70055754..2e9fa899 100644
--- a/Grid/qcd/hmc/integrators/Integrator.h
+++ b/Grid/qcd/hmc/integrators/Integrator.h
@@ -125,7 +125,7 @@ protected:
       force = FieldImplementation::projectForce(force); // Ta for gauge fields
       double end_force = usecond();
       Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites());
-      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << std::endl;
+      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << " Time step: " << ep << " Impulse average: " << force_abs * ep * HMC_MOMENTUM_DENOMINATOR << std::endl;
       Mom -= force * ep* HMC_MOMENTUM_DENOMINATOR;; 
       double end_full = usecond();
       double time_full  = (end_full - start_full) / 1e3;
diff --git a/HMC/DWF2p1fIwasakiGparity.cc b/HMC/DWF2p1fIwasakiGparity.cc
index cfa8f05c..32457278 100644
--- a/HMC/DWF2p1fIwasakiGparity.cc
+++ b/HMC/DWF2p1fIwasakiGparity.cc
@@ -94,6 +94,142 @@ bool fileExists(const std::string &fn){
   return f.good();
 }
 
+
+
+
+struct LanczosParameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
+				  double, alpha,
+				  double, beta,
+				  double, mu,
+				  int, ord,
+				  int, n_stop,
+				  int, n_want,
+				  int, n_use,
+				  double, tolerance);
+
+  LanczosParameters() {
+    alpha = 35;
+    beta = 5;
+    mu = 0;
+    ord = 100;
+    n_stop = 10;
+    n_want = 10;
+    n_use = 15;
+    tolerance = 1e-6;
+  }
+};
+
+
+
+template<typename FermionActionD, typename FermionFieldD>
+void computeEigenvalues(std::string param_file,
+			GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
+			FermionActionD &action, GridParallelRNG &rng){
+  
+  LanczosParameters params;
+  if(fileExists(param_file)){
+    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
+    Grid::XmlReader rd(param_file);
+    read(rd, "LanczosParameters", params);
+  }else if(!GlobalSharedMemory::WorldRank){
+    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
+    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
+    Grid::XmlWriter wr(param_file + ".templ");
+    write(wr, "LanczosParameters", params);
+  }
+
+  FermionFieldD gauss_o(rbGrid);
+  FermionFieldD gauss(Grid);
+  gaussian(rng, gauss);
+  pickCheckerboard(Odd, gauss_o, gauss);
+
+  action.ImportGauge(latt);
+
+  SchurDiagMooeeOperator<FermionActionD, FermionFieldD> hermop(action);
+  PlainHermOp<FermionFieldD> hermop_wrap(hermop);
+  //ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
+  assert(params.mu == 0.0);
+
+  Chebyshev<FermionFieldD> Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1);
+  FunctionHermOp<FermionFieldD> Cheb_wrap(Cheb, hermop);
+
+  std::cout << "IRL: alpha=" << params.alpha << " beta=" << params.beta << " mu=" << params.mu << " ord=" << params.ord << std::endl;
+  ImplicitlyRestartedLanczos<FermionFieldD> IRL(Cheb_wrap, hermop_wrap, params.n_stop, params.n_want, params.n_use, params.tolerance, 10000);
+
+  std::vector<RealD> eval(params.n_use);
+  std::vector<FermionFieldD> evec(params.n_use, rbGrid);
+  int Nconv;
+  IRL.calc(eval, evec, gauss_o, Nconv);
+
+  std::cout << "Eigenvalues:" << std::endl;
+  for(int i=0;i<params.n_want;i++){
+    std::cout << i << " " << eval[i] << std::endl;
+  }
+}
+
+
+//Check the quality of the RHMC approx
+template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
+void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
+	       FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
+	       int inv_pow, const std::string &quark_descr){
+
+  FermionFieldD gauss_o(rbGrid);
+  FermionFieldD gauss(Grid);
+  gaussian(rng, gauss);
+  pickCheckerboard(Odd, gauss_o, gauss);
+
+  numOp.ImportGauge(latt);
+  denOp.ImportGauge(latt);
+
+  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
+  SchurDifferentiableOperator<FermionImplPolicyD> MdagM(numOp);
+  SchurDifferentiableOperator<FermionImplPolicyD> VdagV(denOp);
+      
+  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+  InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegPowerAction); //use large tolerance to prevent exit on fail; we are trying to tune here!
+  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
+  InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegHalfPowerAction);
+  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+  InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegPowerAction);
+  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
+  InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegHalfPowerAction);
+  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
+
+  std::cout << "-------------------------------------------------------------------------------" << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+  InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegPowerMD); 
+  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
+  InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegHalfPowerMD);
+  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+  InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegPowerMD);
+  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
+  InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegHalfPowerMD);
+  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
+}
+
+
+
+
+
+
+
+
+
 int main(int argc, char **argv) {
   Grid_init(&argc, &argv);
   int threads = GridThread::GetThreads();
@@ -152,6 +288,7 @@ int main(int argc, char **argv) {
   //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
   IntegratorParameters MD;
   typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
+  typedef HMCWrapper::ImplPolicy GaugeImplPolicy;
   MD.name    = std::string("MinimumNorm2");
   MD.MDsteps = 5; //5 steps of 0.2 for GP* ensembles
   MD.trajL   = 1.0;
@@ -181,7 +318,7 @@ int main(int argc, char **argv) {
   RNGpar.parallel_seeds = "6 7 8 9 10";
   TheHMC.Resources.SetRNGSeeds(RNGpar);
 
-  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
+  typedef PlaquetteMod<GaugeImplPolicy> PlaqObs;
   TheHMC.Resources.AddObservable<PlaqObs>();
   //////////////////////////////////////////////
 
@@ -208,8 +345,7 @@ int main(int argc, char **argv) {
   // temporarily need a gauge field
   LatticeGaugeFieldD Ud(GridPtrD);
   LatticeGaugeFieldF Uf(GridPtrF);
-
-  
+ 
   //Setup the BCs
   FermionActionD::ImplParams Params;
   for(int i=0;i<Nd-1;i++) Params.twists = user_params.GparityDirs[i]; //G-parity directions
@@ -219,64 +355,25 @@ int main(int argc, char **argv) {
   for(int i=0;i<Nd-1;i++) dirs4[i] = user_params.GparityDirs[i];
   dirs4[Nd-1] = 0; //periodic gauge BC in time
 
-  ConjugateGimplD::setDirections(dirs4); //gauge BC
+  GaugeImplPolicy::setDirections(dirs4); //gauge BC
 
 
   ////////////////////////////////////
   // Collect actions
   ////////////////////////////////////
-  ActionLevel<HMCWrapper::Field> Level1(1); //light quark 
-  ActionLevel<HMCWrapper::Field> Level2(1); //strange quark
-  ActionLevel<HMCWrapper::Field> Level3(8); //gauge (8 increments per step)
+  ActionLevel<HMCWrapper::Field> Level1(1); //light quark + strange quark
+  ActionLevel<HMCWrapper::Field> Level2(8); //gauge (8 increments per step)
 
-  ////////////////////////////////////
-  // Strange action
-  ////////////////////////////////////
-
-  //Use same parameters as used for 16GPX ensembles
-  RationalActionParams rat_act_params_s;
-
-  rat_act_params_s.inv_pow  = 4; // (M^dag M)^{1/4}
-  rat_act_params_s.precision= 60;
-  rat_act_params_s.MaxIter  = 10000;
-  user_params.rat_quo_s.Export(rat_act_params_s);
-
-  //For the 16GPX ensembles we used Hasenbusch mass splitting:
-  //  det[ (M^dag(0.032) M(0.032)) / (M^dag(1.0) M(1.0)) ]^{1/4}    * det[ (M^dag(0.01) M(0.01)) / (M^dag(1.0) M(1.0)) ]^{1/2}
-  //=
-  // [ det[ (M^dag(0.032) M(0.032)) / (M^dag(1.0) M(1.0)) ]^{1/4}  ]^3    * det[ (M^dag(0.01) M(0.01)) / (M^dag(0.032) M(0.032)) ]^{1/2} 
-
-  //I don't know if it's actually necessary for the action objects to be independent instances...
-  int n_hasenbusch_s = 3;
-  std::vector<FermionActionD*> Numerators_sD(n_hasenbusch_s);
-  std::vector<FermionActionD*> Denominators_sD(n_hasenbusch_s);
-  std::vector<FermionActionF*> Numerators_sF(n_hasenbusch_s);
-  std::vector<FermionActionF*> Denominators_sF(n_hasenbusch_s);
-
-  std::vector<MixedPrecRHMC*> Quotients_s(n_hasenbusch_s);
-
-  for(int h=0;h<n_hasenbusch_s;h++){
-    Numerators_sD[h] = new FermionActionD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD,strange_mass,M5,Params);
-    Denominators_sD[h] = new FermionActionD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD,pv_mass,  M5,Params);   
-
-    Numerators_sF[h] = new FermionActionF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,strange_mass,M5,Params);
-    Denominators_sF[h] = new FermionActionF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,pv_mass,  M5,Params);   
-
-    //note I define the numerator operator wrt how they appear in the determinant    
-    Quotients_s[h] = new MixedPrecRHMC(*Denominators_sD[h], *Numerators_sD[h], *Denominators_sF[h], *Numerators_sF[h], rat_act_params_s, user_params.rat_quo_s.reliable_update_freq); 
-    Level2.push_back(Quotients_s[h]);
-  }
 
   /////////////////////////////////////////////////////////////
   // Light action
   /////////////////////////////////////////////////////////////
- 
-  //We don't Hasenbusch the light quark directly, instead the denominator mass is set equal to the strange mass; cf above
+
   FermionActionD Numerator_lD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, light_mass,M5,Params);
-  FermionActionD Denominator_lD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, strange_mass,M5,Params);
+  FermionActionD Denominator_lD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, pv_mass,M5,Params);
 
   FermionActionF Numerator_lF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, light_mass,M5,Params);
-  FermionActionF Denominator_lF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, strange_mass,M5,Params);
+  FermionActionF Denominator_lF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, pv_mass,M5,Params);
 
   RationalActionParams rat_act_params_l;
   rat_act_params_l.inv_pow  = 2; // (M^dag M)^{1/2}
@@ -287,36 +384,69 @@ int main(int argc, char **argv) {
   MixedPrecRHMC Quotient_l(Denominator_lD, Numerator_lD, Denominator_lF, Numerator_lF, rat_act_params_l, user_params.rat_quo_l.reliable_update_freq);
   Level1.push_back(&Quotient_l);
 
+
+  ////////////////////////////////////
+  // Strange action
+  ////////////////////////////////////
+  FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD,strange_mass,M5,Params);
+  FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, pv_mass,M5,Params);
+
+  FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,strange_mass,M5,Params);
+  FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, pv_mass,M5,Params);
+
+  RationalActionParams rat_act_params_s;
+  rat_act_params_s.inv_pow  = 4; // (M^dag M)^{1/4}
+  rat_act_params_s.precision= 60;
+  rat_act_params_s.MaxIter  = 10000;
+  user_params.rat_quo_s.Export(rat_act_params_s);
+
+  MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq); 
+  Level1.push_back(&Quotient_s);  
+
+
   /////////////////////////////////////////////////////////////
   // Gauge action
   /////////////////////////////////////////////////////////////
-  Level3.push_back(&GaugeAction);
+  Level2.push_back(&GaugeAction);
   TheHMC.TheAction.push_back(Level1);
   TheHMC.TheAction.push_back(Level2);
-  TheHMC.TheAction.push_back(Level3);
   std::cout << GridLogMessage << " Action complete "<< std::endl;
 
-  /////////////////////////////////////////////////////////////
-  // HMC parameters are serialisable
-  
-  if(0){
-    TheHMC.Resources.AddRNGs();
-    ConjugateGimplR::HotConfiguration(TheHMC.Resources.GetParallelRNG(), Ud);
-    Quotient_l.refresh(Ud, TheHMC.Resources.GetParallelRNG());    
-    LatticeGaugeFieldD out(Ud);
-    std::cout << GridLogMessage << " Running the derivative "<< std::endl;
-    Quotient_l.deriv(Ud,out);    
-    std::cout << GridLogMessage << " Finished running the derivative "<< std::endl;
-    Numerator_lD.Report();
-    Denominator_lD.Report();
+
+  //Action tuning
+  bool tune_rhmc_l=false, tune_rhmc_s=false, eigenrange_l=false, eigenrange_s=false; 
+  std::string lanc_params_l, lanc_params_s;
+  for(int i=1;i<argc;i++){
+    std::string sarg(argv[i]);
+    if(sarg == "--tune_rhmc_l") tune_rhmc_l=true;
+    else if(sarg == "--tune_rhmc_s") tune_rhmc_s=true;
+    else if(sarg == "--eigenrange_l"){
+      assert(i < argc-1);
+      eigenrange_l=true;
+      lanc_params_l = argv[i+1];
+    }
+    else if(sarg == "--eigenrange_s"){
+      assert(i < argc-1);
+      eigenrange_s=true;
+      lanc_params_s = argv[i+1];
+    }
+  }
+  if(tune_rhmc_l || tune_rhmc_s || eigenrange_l || eigenrange_s){
+    TheHMC.initializeGaugeFieldAndRNGs(Ud);
+    if(eigenrange_l) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_l, FGridD, FrbGridD, Ud, Numerator_lD, TheHMC.Resources.GetParallelRNG());
+    if(eigenrange_s) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG());
+    if(tune_rhmc_l) checkRHMC<FermionActionD, FermionFieldD, MixedPrecRHMC>(FGridD, FrbGridD, Ud, Numerator_lD, Denominator_lD, Quotient_l, TheHMC.Resources.GetParallelRNG(), 2, "light");
+    if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, MixedPrecRHMC>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange");
+
+    std::cout << GridLogMessage << " Done" << std::endl;
+    Grid_finalize();
+    return 0;
   }
 
 
-  if(1){
-    std::cout << GridLogMessage << " Running the HMC "<< std::endl;
-    TheHMC.Run();  // no smearing
-  }
-
+  //Run the HMC
+  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
+  TheHMC.Run();
 
   std::cout << GridLogMessage << " Done" << std::endl;
   Grid_finalize();

From daa095c51960230f5964c6283ddedae1f546d61b Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Tue, 9 Feb 2021 12:55:46 -0500
Subject: [PATCH 21/41] Fixed an obscure but reproducible hang in the RHMC
 caused by the bounds check being activated by a random number that wasn't
 synchronized over the nodes HMC now also reports the "L-infinity norm" of the
 impulse, aka the largest site norm

---
 .../action/pseudofermion/GeneralEvenOddRationalRatio.h |  8 +++++++-
 Grid/qcd/hmc/integrators/Integrator.h                  | 10 ++++++++--
 HMC/DWF2p1fIwasakiGparity.cc                           |  9 +++++++--
 3 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
index ab400ba7..298a46ca 100644
--- a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
@@ -232,13 +232,19 @@ NAMESPACE_BEGIN(Grid);
 	multiShiftInverse(Denominator, ApproxNegHalfPowerAction, param.MaxIter, X,Y);
 
 	// Randomly apply rational bounds checks.
-	if ( param.BoundsCheckFreq != 0 && (rand()%param.BoundsCheckFreq)==0 ) { 
+	int rcheck = rand();
+	CartesianCommunicator::BroadcastWorld(0,(void *)&rcheck,sizeof(int)); //make sure all nodes have the same number or you will sporadically hang and spend days trying to find out why (trust me - CK)
+
+	if ( param.BoundsCheckFreq != 0 && (rcheck % param.BoundsCheckFreq)==0 ) { 
 	  std::cout<<GridLogMessage << action_name() << " compute action: doing bounds check" << std::endl;
 	  FermionField gauss(NumOp.FermionRedBlackGrid());
 	  gauss = PhiOdd;
 	  SchurDifferentiableOperator<Impl> MdagM(DenOp);
+	  std::cout<<GridLogMessage << action_name() << " compute action: checking high bounds" << std::endl;
 	  HighBoundCheck(MdagM,gauss,param.hi);
+	  std::cout<<GridLogMessage << action_name() << " compute action: full approximation" << std::endl;
 	  InversePowerBoundsCheck(param.inv_pow,param.MaxIter,param.action_tolerance*100,MdagM,gauss,ApproxNegPowerAction);
+	  std::cout<<GridLogMessage << action_name() << " compute action: bounds check complete" << std::endl;
 	}
 
 	//  Phidag VdagV^1/(2*inv_pow) MdagM^-1/(2*inv_pow)  MdagM^-1/(2*inv_pow) VdagV^1/(2*inv_pow) Phi
diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h
index 2e9fa899..c7c6529e 100644
--- a/Grid/qcd/hmc/integrators/Integrator.h
+++ b/Grid/qcd/hmc/integrators/Integrator.h
@@ -124,8 +124,14 @@ protected:
       if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
       force = FieldImplementation::projectForce(force); // Ta for gauge fields
       double end_force = usecond();
-      Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites());
-      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << " Time step: " << ep << " Impulse average: " << force_abs * ep * HMC_MOMENTUM_DENOMINATOR << std::endl;
+
+      Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x]) 
+      Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    
+
+      Real max_force_abs = std::sqrt(maxLocalNorm2(force));
+      Real max_impulse_abs = max_force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    
+
+      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << " Max force: " << max_force_abs << " Time step: " << ep << " Impulse average: " << impulse_abs << " Max impulse: " << max_impulse_abs << std::endl;
       Mom -= force * ep* HMC_MOMENTUM_DENOMINATOR;; 
       double end_full = usecond();
       double time_full  = (end_full - start_full) / 1e3;
diff --git a/HMC/DWF2p1fIwasakiGparity.cc b/HMC/DWF2p1fIwasakiGparity.cc
index 32457278..d9be3fdc 100644
--- a/HMC/DWF2p1fIwasakiGparity.cc
+++ b/HMC/DWF2p1fIwasakiGparity.cc
@@ -284,6 +284,7 @@ int main(int argc, char **argv) {
   typedef typename FermionActionF::FermionField FermionFieldF;
 
   typedef GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD,FermionImplPolicyF> MixedPrecRHMC;
+  typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicyD> DoublePrecRHMC;
 
   //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
   IntegratorParameters MD;
@@ -380,8 +381,10 @@ int main(int argc, char **argv) {
   rat_act_params_l.precision= 60;
   rat_act_params_l.MaxIter  = 10000;
   user_params.rat_quo_l.Export(rat_act_params_l);
+  std::cout << GridLogMessage << " Light quark bounds check every " << rat_act_params_l.BoundsCheckFreq << " trajectories (avg)" << std::endl;
  
   MixedPrecRHMC Quotient_l(Denominator_lD, Numerator_lD, Denominator_lF, Numerator_lF, rat_act_params_l, user_params.rat_quo_l.reliable_update_freq);
+  //DoublePrecRHMC Quotient_l(Denominator_lD, Numerator_lD, rat_act_params_l);
   Level1.push_back(&Quotient_l);
 
 
@@ -399,8 +402,10 @@ int main(int argc, char **argv) {
   rat_act_params_s.precision= 60;
   rat_act_params_s.MaxIter  = 10000;
   user_params.rat_quo_s.Export(rat_act_params_s);
+  std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_l.BoundsCheckFreq << " trajectories (avg)" << std::endl;
 
   MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq); 
+  //DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s); 
   Level1.push_back(&Quotient_s);  
 
 
@@ -435,8 +440,8 @@ int main(int argc, char **argv) {
     TheHMC.initializeGaugeFieldAndRNGs(Ud);
     if(eigenrange_l) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_l, FGridD, FrbGridD, Ud, Numerator_lD, TheHMC.Resources.GetParallelRNG());
     if(eigenrange_s) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG());
-    if(tune_rhmc_l) checkRHMC<FermionActionD, FermionFieldD, MixedPrecRHMC>(FGridD, FrbGridD, Ud, Numerator_lD, Denominator_lD, Quotient_l, TheHMC.Resources.GetParallelRNG(), 2, "light");
-    if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, MixedPrecRHMC>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange");
+    if(tune_rhmc_l) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_l)>(FGridD, FrbGridD, Ud, Numerator_lD, Denominator_lD, Quotient_l, TheHMC.Resources.GetParallelRNG(), 2, "light");
+    if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_s)>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange");
 
     std::cout << GridLogMessage << " Done" << std::endl;
     Grid_finalize();

From e0f6a146d89f400d879680c1d5f6e467673aae9f Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Tue, 16 Feb 2021 10:41:52 -0500
Subject: [PATCH 22/41] To DWF+I G-parity evolution code, added ability to
 specify number of MD steps in params and an optional usage mode that reads
 the config and checks the plaq/checksum agree then exits

---
 HMC/DWF2p1fIwasakiGparity.cc | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/HMC/DWF2p1fIwasakiGparity.cc b/HMC/DWF2p1fIwasakiGparity.cc
index d9be3fdc..e9865a54 100644
--- a/HMC/DWF2p1fIwasakiGparity.cc
+++ b/HMC/DWF2p1fIwasakiGparity.cc
@@ -72,6 +72,7 @@ struct EvolParameters: Serializable {
                                   Integer, StartTrajectory,
                                   Integer, Trajectories,
 				  Integer, SaveInterval,
+				  Integer, Steps,
                                   bool, MetropolisTest,
 				  std::string, StartingType,
 				  std::vector<Integer>, GparityDirs,
@@ -86,6 +87,7 @@ struct EvolParameters: Serializable {
     SaveInterval = 5;
     StartingType      = "ColdStart";
     GparityDirs.resize(3, 1); //1 for G-parity, 0 for periodic
+    Steps = 5;
   }
 };
 
@@ -237,11 +239,14 @@ int main(int argc, char **argv) {
   std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
 
   std::string param_file = "params.xml";
+  bool file_load_check = false;
   for(int i=1;i<argc;i++){
-    if(std::string(argv[i]) == "--param_file"){
+    std::string sarg(argv[i]);
+    if(sarg == "--param_file"){
       assert(i!=argc-1);
       param_file = argv[i+1];
-      break;
+    }else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
+      file_load_check = true;
     }
   }
 
@@ -291,7 +296,7 @@ int main(int argc, char **argv) {
   typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
   typedef HMCWrapper::ImplPolicy GaugeImplPolicy;
   MD.name    = std::string("MinimumNorm2");
-  MD.MDsteps = 5; //5 steps of 0.2 for GP* ensembles
+  MD.MDsteps = user_params.Steps;
   MD.trajL   = 1.0;
 
   HMCparameters HMCparams;
@@ -358,6 +363,14 @@ int main(int argc, char **argv) {
 
   GaugeImplPolicy::setDirections(dirs4); //gauge BC
 
+  //Run optional gauge field checksum checker and exit
+  if(file_load_check){
+    TheHMC.initializeGaugeFieldAndRNGs(Ud);
+    std::cout << GridLogMessage << " Done" << std::endl;
+    Grid_finalize();
+    return 0;
+  }
+
 
   ////////////////////////////////////
   // Collect actions

From feee5ccde26e07d00e93ba56608861b8d238bdbd Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Wed, 3 Mar 2021 15:39:41 -0500
Subject: [PATCH 23/41] Added Gparity flavour Pauli matrix algebra and
 associated tensor types mirroring strategy used for Gamma matrices Added test
 program for the above

---
 Grid/GridQCDcore.h                 |   1 +
 Grid/qcd/QCD.h                     |  25 ++
 Grid/qcd/gparity/Gparity.h         |   6 +
 Grid/qcd/gparity/GparityFlavour.cc |  34 +++
 Grid/qcd/gparity/GparityFlavour.h  | 475 +++++++++++++++++++++++++++++
 tests/core/Test_gparity_flavour.cc | 177 +++++++++++
 6 files changed, 718 insertions(+)
 create mode 100644 Grid/qcd/gparity/Gparity.h
 create mode 100644 Grid/qcd/gparity/GparityFlavour.cc
 create mode 100644 Grid/qcd/gparity/GparityFlavour.h
 create mode 100644 tests/core/Test_gparity_flavour.cc

diff --git a/Grid/GridQCDcore.h b/Grid/GridQCDcore.h
index cae6f43f..065b62cd 100644
--- a/Grid/GridQCDcore.h
+++ b/Grid/GridQCDcore.h
@@ -36,6 +36,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridCore.h>
 #include <Grid/qcd/QCD.h>
 #include <Grid/qcd/spin/Spin.h>
+#include <Grid/qcd/gparity/Gparity.h>
 #include <Grid/qcd/utils/Utils.h>
 #include <Grid/qcd/representations/Representations.h>
 NAMESPACE_CHECK(GridQCDCore);
diff --git a/Grid/qcd/QCD.h b/Grid/qcd/QCD.h
index 858aead7..81356a66 100644
--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@@ -63,6 +63,7 @@ static constexpr int Ngp=2; // gparity index range
 #define ColourIndex  (2)
 #define SpinIndex    (1)
 #define LorentzIndex (0)
+#define GparityFlavourIndex (0)
 
 // Also should make these a named enum type
 static constexpr int DaggerNo=0;
@@ -87,6 +88,8 @@ template<typename T> struct isCoarsened {
 template <typename T> using IfCoarsened    = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
 template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;
 
+const int GparityFlavourTensorIndex = 3; //TensorLevel counts from the bottom!
+
 // ChrisK very keen to add extra space for Gparity doubling.
 //
 // Also add domain wall index, in a way where Wilson operator 
@@ -110,8 +113,10 @@ template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVec
     template<typename vtype> using iSpinColourSpinColourMatrix  = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;
 
 
+template<typename vtype> using iGparityFlavourVector                = iVector<iScalar<iScalar<vtype> >, Ngp>;
 template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
 template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
+template<typename vtype> using iGparityFlavourMatrix = iMatrix<iScalar<iScalar<vtype> >, Ngp>;
 
 // Spin matrix
 typedef iSpinMatrix<Complex  >          SpinMatrix;
@@ -176,6 +181,16 @@ typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
 typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
 typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;
 
+//G-parity flavour matrix
+typedef iGparityFlavourMatrix<Complex> GparityFlavourMatrix;
+typedef iGparityFlavourMatrix<ComplexF> GparityFlavourMatrixF;
+typedef iGparityFlavourMatrix<ComplexD> GparityFlavourMatrixD;
+
+typedef iGparityFlavourMatrix<vComplex> vGparityFlavourMatrix;
+typedef iGparityFlavourMatrix<vComplexF> vGparityFlavourMatrixF;
+typedef iGparityFlavourMatrix<vComplexD> vGparityFlavourMatrixD;
+
+
 // Spin vector
 typedef iSpinVector<Complex >           SpinVector;
 typedef iSpinVector<ComplexF>           SpinVectorF;
@@ -220,6 +235,16 @@ typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
 typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
 typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
 typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
+
+//G-parity flavour vector
+typedef iGparityFlavourVector<Complex >         GparityFlavourVector;
+typedef iGparityFlavourVector<ComplexF>         GparityFlavourVectorF;
+typedef iGparityFlavourVector<ComplexD>         GparityFlavourVectorD;
+
+typedef iGparityFlavourVector<vComplex >         vGparityFlavourVector;
+typedef iGparityFlavourVector<vComplexF>         vGparityFlavourVectorF;
+typedef iGparityFlavourVector<vComplexD>         vGparityFlavourVectorD;
+
     
 // singlets
 typedef iSinglet<Complex >         TComplex;     // FIXME This is painful. Tensor singlet complex type.
diff --git a/Grid/qcd/gparity/Gparity.h b/Grid/qcd/gparity/Gparity.h
new file mode 100644
index 00000000..ce1c70eb
--- /dev/null
+++ b/Grid/qcd/gparity/Gparity.h
@@ -0,0 +1,6 @@
+#ifndef GRID_GPARITY_H_
+#define GRID_GPARITY_H_
+
+#include<Grid/qcd/gparity/GparityFlavour.h>
+
+#endif
diff --git a/Grid/qcd/gparity/GparityFlavour.cc b/Grid/qcd/gparity/GparityFlavour.cc
new file mode 100644
index 00000000..4596f96b
--- /dev/null
+++ b/Grid/qcd/gparity/GparityFlavour.cc
@@ -0,0 +1,34 @@
+#include <Grid/Grid.h>
+
+NAMESPACE_BEGIN(Grid);
+
+const std::array<const GparityFlavour, 3> GparityFlavour::sigma_mu = {{
+    GparityFlavour(GparityFlavour::Algebra::SigmaX),
+    GparityFlavour(GparityFlavour::Algebra::SigmaY),
+    GparityFlavour(GparityFlavour::Algebra::SigmaZ)
+    }};
+
+const std::array<const GparityFlavour, 6> GparityFlavour::sigma_all = {{
+  GparityFlavour(GparityFlavour::Algebra::Identity),
+  GparityFlavour(GparityFlavour::Algebra::SigmaX),
+  GparityFlavour(GparityFlavour::Algebra::SigmaY),
+  GparityFlavour(GparityFlavour::Algebra::SigmaZ),
+  GparityFlavour(GparityFlavour::Algebra::ProjPlus),
+  GparityFlavour(GparityFlavour::Algebra::ProjMinus)
+}};
+
+const std::array<const char *, GparityFlavour::nSigma> GparityFlavour::name = {{
+    "SigmaX",
+    "MinusSigmaX",
+    "SigmaY",
+    "MinusSigmaY",
+    "SigmaZ",
+    "MinusSigmaZ",
+    "Identity",
+    "MinusIdentity",
+    "ProjPlus",
+    "MinusProjPlus",
+    "ProjMinus",
+    "MinusProjMinus"}};
+
+NAMESPACE_END(Grid);
diff --git a/Grid/qcd/gparity/GparityFlavour.h b/Grid/qcd/gparity/GparityFlavour.h
new file mode 100644
index 00000000..b2009235
--- /dev/null
+++ b/Grid/qcd/gparity/GparityFlavour.h
@@ -0,0 +1,475 @@
+#ifndef GRID_QCD_GPARITY_FLAVOUR_H
+#define GRID_QCD_GPARITY_FLAVOUR_H
+
+//Support for flavour-matrix operations acting on the G-parity flavour index
+
+#include <array>
+
+NAMESPACE_BEGIN(Grid);
+
+class GparityFlavour {
+  public:
+    GRID_SERIALIZABLE_ENUM(Algebra, undef,
+                           SigmaX, 0,
+			   MinusSigmaX, 1,
+                           SigmaY, 2,
+			   MinusSigmaY, 3,
+                           SigmaZ, 4,
+			   MinusSigmaZ, 5,
+			   Identity, 6,
+			   MinusIdentity, 7,
+			   ProjPlus, 8,
+			   MinusProjPlus, 9,
+			   ProjMinus, 10,
+			   MinusProjMinus, 11
+			   );
+    static constexpr unsigned int nSigma = 12;
+    static const std::array<const char *, nSigma>                name;
+    static const std::array<const GparityFlavour, 3>             sigma_mu;
+    static const std::array<const GparityFlavour, 6>            sigma_all;
+    Algebra                                                      g;
+  public:
+  accelerator GparityFlavour(Algebra initg): g(initg) {}  
+};
+
+
+
+// 0 1  x   vector
+// 1 0
+template<class vtype>
+accelerator_inline void multFlavourSigmaX(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = rhs(1);
+  ret(1) = rhs(0);
+};
+template<class vtype>
+accelerator_inline void lmultFlavourSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = rhs(1,0);
+  ret(0,1) = rhs(1,1);
+  ret(1,0) = rhs(0,0);
+  ret(1,1) = rhs(0,1);
+};
+template<class vtype>
+accelerator_inline void rmultFlavourSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = rhs(0,1);
+  ret(0,1) = rhs(0,0);
+  ret(1,0) = rhs(1,1);
+  ret(1,1) = rhs(1,0);
+};
+
+
+template<class vtype>
+accelerator_inline void multFlavourMinusSigmaX(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = -rhs(1);
+  ret(1) = -rhs(0);
+};
+template<class vtype>
+accelerator_inline void lmultFlavourMinusSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -rhs(1,0);
+  ret(0,1) = -rhs(1,1);
+  ret(1,0) = -rhs(0,0);
+  ret(1,1) = -rhs(0,1);
+};
+template<class vtype>
+accelerator_inline void rmultFlavourMinusSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -rhs(0,1);
+  ret(0,1) = -rhs(0,0);
+  ret(1,0) = -rhs(1,1);
+  ret(1,1) = -rhs(1,0);
+};
+
+
+
+
+
+// 0 -i  x   vector
+// i 0
+template<class vtype>
+accelerator_inline void multFlavourSigmaY(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = timesMinusI(rhs(1));
+  ret(1) = timesI(rhs(0));
+};
+template<class vtype>
+accelerator_inline void lmultFlavourSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = timesMinusI(rhs(1,0));
+  ret(0,1) = timesMinusI(rhs(1,1));
+  ret(1,0) = timesI(rhs(0,0));
+  ret(1,1) = timesI(rhs(0,1));
+};
+template<class vtype>
+accelerator_inline void rmultFlavourSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = timesI(rhs(0,1));
+  ret(0,1) = timesMinusI(rhs(0,0));
+  ret(1,0) = timesI(rhs(1,1));
+  ret(1,1) = timesMinusI(rhs(1,0));
+};
+
+template<class vtype>
+accelerator_inline void multFlavourMinusSigmaY(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = timesI(rhs(1));
+  ret(1) = timesMinusI(rhs(0));
+};
+template<class vtype>
+accelerator_inline void lmultFlavourMinusSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = timesI(rhs(1,0));
+  ret(0,1) = timesI(rhs(1,1));
+  ret(1,0) = timesMinusI(rhs(0,0));
+  ret(1,1) = timesMinusI(rhs(0,1));
+};
+template<class vtype>
+accelerator_inline void rmultFlavourMinusSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = timesMinusI(rhs(0,1));
+  ret(0,1) = timesI(rhs(0,0));
+  ret(1,0) = timesMinusI(rhs(1,1));
+  ret(1,1) = timesI(rhs(1,0));
+};
+
+
+
+
+
+// 1 0  x   vector
+// 0 -1
+template<class vtype>
+accelerator_inline void multFlavourSigmaZ(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = rhs(0);
+  ret(1) = -rhs(1);
+};
+template<class vtype>
+accelerator_inline void lmultFlavourSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = rhs(0,0);
+  ret(0,1) = rhs(0,1);
+  ret(1,0) = -rhs(1,0);
+  ret(1,1) = -rhs(1,1);
+};
+template<class vtype>
+accelerator_inline void rmultFlavourSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = rhs(0,0);
+  ret(0,1) = -rhs(0,1);
+  ret(1,0) = rhs(1,0);
+  ret(1,1) = -rhs(1,1);
+};
+
+
+template<class vtype>
+accelerator_inline void multFlavourMinusSigmaZ(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = -rhs(0);
+  ret(1) = rhs(1);
+};
+template<class vtype>
+accelerator_inline void lmultFlavourMinusSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -rhs(0,0);
+  ret(0,1) = -rhs(0,1);
+  ret(1,0) = rhs(1,0);
+  ret(1,1) = rhs(1,1);
+};
+template<class vtype>
+accelerator_inline void rmultFlavourMinusSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -rhs(0,0);
+  ret(0,1) = rhs(0,1);
+  ret(1,0) = -rhs(1,0);
+  ret(1,1) = rhs(1,1);
+};
+
+
+
+
+
+
+template<class vtype>
+accelerator_inline void multFlavourIdentity(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = rhs(0);
+  ret(1) = rhs(1);
+};
+template<class vtype>
+accelerator_inline void lmultFlavourIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = rhs(0,0);
+  ret(0,1) = rhs(0,1);
+  ret(1,0) = rhs(1,0);
+  ret(1,1) = rhs(1,1);
+};
+template<class vtype>
+accelerator_inline void rmultFlavourIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = rhs(0,0);
+  ret(0,1) = rhs(0,1);
+  ret(1,0) = rhs(1,0);
+  ret(1,1) = rhs(1,1);
+};
+
+template<class vtype>
+accelerator_inline void multFlavourMinusIdentity(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = -rhs(0);
+  ret(1) = -rhs(1);
+};
+template<class vtype>
+accelerator_inline void lmultFlavourMinusIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -rhs(0,0);
+  ret(0,1) = -rhs(0,1);
+  ret(1,0) = -rhs(1,0);
+  ret(1,1) = -rhs(1,1);
+};
+template<class vtype>
+accelerator_inline void rmultFlavourMinusIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -rhs(0,0);
+  ret(0,1) = -rhs(0,1);
+  ret(1,0) = -rhs(1,0);
+  ret(1,1) = -rhs(1,1);
+};
+
+
+
+
+
+//G-parity flavour projection 1/2(1+\sigma_2)
+//1 -i
+//i  1
+template<class vtype>
+accelerator_inline void multFlavourProjPlus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = 0.5*rhs(0) + 0.5*timesMinusI(rhs(1));
+  ret(1) = 0.5*timesI(rhs(0)) + 0.5*rhs(1);
+};
+template<class vtype>
+accelerator_inline void lmultFlavourProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesMinusI(rhs(1,0));
+  ret(0,1) = 0.5*rhs(0,1) + 0.5*timesMinusI(rhs(1,1));
+  ret(1,0) = 0.5*timesI(rhs(0,0)) + 0.5*rhs(1,0);
+  ret(1,1) = 0.5*timesI(rhs(0,1)) + 0.5*rhs(1,1);
+};
+template<class vtype>
+accelerator_inline void rmultFlavourProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesI(rhs(0,1));
+  ret(0,1) = 0.5*timesMinusI(rhs(0,0)) + 0.5*rhs(0,1);
+  ret(1,0) = 0.5*rhs(1,0) + 0.5*timesI(rhs(1,1));
+  ret(1,1) = 0.5*timesMinusI(rhs(1,0)) + 0.5*rhs(1,1);
+};
+
+
+template<class vtype>
+accelerator_inline void multFlavourMinusProjPlus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = -0.5*rhs(0) + 0.5*timesI(rhs(1));
+  ret(1) = 0.5*timesMinusI(rhs(0)) - 0.5*rhs(1);
+};
+template<class vtype>
+accelerator_inline void lmultFlavourMinusProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesI(rhs(1,0));
+  ret(0,1) = -0.5*rhs(0,1) + 0.5*timesI(rhs(1,1));
+  ret(1,0) = 0.5*timesMinusI(rhs(0,0)) - 0.5*rhs(1,0);
+  ret(1,1) = 0.5*timesMinusI(rhs(0,1)) - 0.5*rhs(1,1);
+};
+template<class vtype>
+accelerator_inline void rmultFlavourMinusProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesMinusI(rhs(0,1));
+  ret(0,1) = 0.5*timesI(rhs(0,0)) - 0.5*rhs(0,1);
+  ret(1,0) = -0.5*rhs(1,0) + 0.5*timesMinusI(rhs(1,1));
+  ret(1,1) = 0.5*timesI(rhs(1,0)) - 0.5*rhs(1,1);
+};
+
+
+
+
+
+//G-parity flavour projection 1/2(1-\sigma_2)
+//1 i
+//-i  1
+template<class vtype>
+accelerator_inline void multFlavourProjMinus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = 0.5*rhs(0) + 0.5*timesI(rhs(1));
+  ret(1) = 0.5*timesMinusI(rhs(0)) + 0.5*rhs(1);
+};
+template<class vtype>
+accelerator_inline void lmultFlavourProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesI(rhs(1,0));
+  ret(0,1) = 0.5*rhs(0,1) + 0.5*timesI(rhs(1,1));
+  ret(1,0) = 0.5*timesMinusI(rhs(0,0)) + 0.5*rhs(1,0);
+  ret(1,1) = 0.5*timesMinusI(rhs(0,1)) + 0.5*rhs(1,1);
+};
+template<class vtype>
+accelerator_inline void rmultFlavourProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesMinusI(rhs(0,1));
+  ret(0,1) = 0.5*timesI(rhs(0,0)) + 0.5*rhs(0,1);
+  ret(1,0) = 0.5*rhs(1,0) + 0.5*timesMinusI(rhs(1,1));
+  ret(1,1) = 0.5*timesI(rhs(1,0)) + 0.5*rhs(1,1);
+};
+
+
+template<class vtype>
+accelerator_inline void multFlavourMinusProjMinus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = -0.5*rhs(0) + 0.5*timesMinusI(rhs(1));
+  ret(1) = 0.5*timesI(rhs(0)) - 0.5*rhs(1);
+};
+template<class vtype>
+accelerator_inline void lmultFlavourMinusProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesMinusI(rhs(1,0));
+  ret(0,1) = -0.5*rhs(0,1) + 0.5*timesMinusI(rhs(1,1));
+  ret(1,0) = 0.5*timesI(rhs(0,0)) - 0.5*rhs(1,0);
+  ret(1,1) = 0.5*timesI(rhs(0,1)) - 0.5*rhs(1,1);
+};
+template<class vtype>
+accelerator_inline void rmultFlavourMinusProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesI(rhs(0,1));
+  ret(0,1) = 0.5*timesMinusI(rhs(0,0)) - 0.5*rhs(0,1);
+  ret(1,0) = -0.5*rhs(1,0) + 0.5*timesI(rhs(1,1));
+  ret(1,1) = 0.5*timesMinusI(rhs(1,0)) - 0.5*rhs(1,1);
+};
+
+
+
+
+
+
+
+
+
+
+template<class vtype> 
+accelerator_inline auto operator*(const GparityFlavour &G, const iVector<vtype, Ngp> &arg)
+->typename std::enable_if<matchGridTensorIndex<iVector<vtype, Ngp>, GparityFlavourTensorIndex>::value, iVector<vtype, Ngp>>::type
+{
+  iVector<vtype, Ngp> ret;
+
+  switch (G.g) 
+  {
+  case GparityFlavour::Algebra::SigmaX:
+    multFlavourSigmaX(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaX:
+    multFlavourMinusSigmaX(ret, arg); break;
+  case GparityFlavour::Algebra::SigmaY:
+    multFlavourSigmaY(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaY:
+    multFlavourMinusSigmaY(ret, arg); break;
+  case GparityFlavour::Algebra::SigmaZ:
+    multFlavourSigmaZ(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaZ:
+    multFlavourMinusSigmaZ(ret, arg); break;
+  case GparityFlavour::Algebra::Identity:
+    multFlavourIdentity(ret, arg); break;
+  case GparityFlavour::Algebra::MinusIdentity:
+    multFlavourMinusIdentity(ret, arg); break;
+  case GparityFlavour::Algebra::ProjPlus:
+    multFlavourProjPlus(ret, arg); break;
+  case GparityFlavour::Algebra::MinusProjPlus:
+    multFlavourMinusProjPlus(ret, arg); break;
+  case GparityFlavour::Algebra::ProjMinus:
+    multFlavourProjMinus(ret, arg); break;
+  case GparityFlavour::Algebra::MinusProjMinus:
+    multFlavourMinusProjMinus(ret, arg); break;
+  default: assert(0);
+  }
+ 
+  return ret;
+}
+
+template<class vtype> 
+accelerator_inline auto operator*(const GparityFlavour &G, const iMatrix<vtype, Ngp> &arg)
+->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Ngp>, GparityFlavourTensorIndex>::value, iMatrix<vtype, Ngp>>::type
+{
+  iMatrix<vtype, Ngp> ret;
+
+  switch (G.g) 
+  {
+  case GparityFlavour::Algebra::SigmaX:
+    lmultFlavourSigmaX(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaX:
+    lmultFlavourMinusSigmaX(ret, arg); break;
+  case GparityFlavour::Algebra::SigmaY:
+    lmultFlavourSigmaY(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaY:
+    lmultFlavourMinusSigmaY(ret, arg); break;
+  case GparityFlavour::Algebra::SigmaZ:
+    lmultFlavourSigmaZ(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaZ:
+    lmultFlavourMinusSigmaZ(ret, arg); break;
+  case GparityFlavour::Algebra::Identity:
+    lmultFlavourIdentity(ret, arg); break;
+  case GparityFlavour::Algebra::MinusIdentity:
+    lmultFlavourMinusIdentity(ret, arg); break;
+  case GparityFlavour::Algebra::ProjPlus:
+    lmultFlavourProjPlus(ret, arg); break;
+  case GparityFlavour::Algebra::MinusProjPlus:
+    lmultFlavourMinusProjPlus(ret, arg); break;
+  case GparityFlavour::Algebra::ProjMinus:
+    lmultFlavourProjMinus(ret, arg); break;
+  case GparityFlavour::Algebra::MinusProjMinus:
+    lmultFlavourMinusProjMinus(ret, arg); break;  
+  default: assert(0);
+  }
+  
+  return ret;
+}
+
+template<class vtype> 
+accelerator_inline auto operator*(const iMatrix<vtype, Ngp> &arg, const GparityFlavour &G)
+->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Ngp>, GparityFlavourTensorIndex>::value, iMatrix<vtype, Ngp>>::type
+{
+  iMatrix<vtype, Ngp> ret;
+
+  switch (G.g) 
+  {
+  case GparityFlavour::Algebra::SigmaX:
+    rmultFlavourSigmaX(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaX:
+    rmultFlavourMinusSigmaX(ret, arg); break;
+  case GparityFlavour::Algebra::SigmaY:
+    rmultFlavourSigmaY(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaY:
+    rmultFlavourMinusSigmaY(ret, arg); break;
+  case GparityFlavour::Algebra::SigmaZ:
+    rmultFlavourSigmaZ(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaZ:
+    rmultFlavourMinusSigmaZ(ret, arg); break;
+  case GparityFlavour::Algebra::Identity:
+    rmultFlavourIdentity(ret, arg); break;
+  case GparityFlavour::Algebra::MinusIdentity:
+    rmultFlavourMinusIdentity(ret, arg); break;
+  case GparityFlavour::Algebra::ProjPlus:
+    rmultFlavourProjPlus(ret, arg); break;
+  case GparityFlavour::Algebra::MinusProjPlus:
+    rmultFlavourMinusProjPlus(ret, arg); break;
+  case GparityFlavour::Algebra::ProjMinus:
+    rmultFlavourProjMinus(ret, arg); break;
+  case GparityFlavour::Algebra::MinusProjMinus:
+    rmultFlavourMinusProjMinus(ret, arg); break;
+  default: assert(0);
+  }
+
+  return ret;
+}
+
+NAMESPACE_END(Grid);
+
+#endif // include guard
diff --git a/tests/core/Test_gparity_flavour.cc b/tests/core/Test_gparity_flavour.cc
new file mode 100644
index 00000000..5b204e04
--- /dev/null
+++ b/tests/core/Test_gparity_flavour.cc
@@ -0,0 +1,177 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: ./tests/Test_gparity_flavour.cc
+
+Copyright (C) 2015-2017
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+
+static constexpr double                      tolerance = 1.0e-6;
+static std::array<GparityFlavourMatrix, GparityFlavour::nSigma> testAlgebra;
+
+void print(const GparityFlavourMatrix &g)
+{
+  for(int i = 0; i < Ngp; i++)
+  {
+    std::cout << GridLogMessage << "(";
+    for(int j=0;j<Ngp;j++){
+      if ( abs( g(i,j)()() ) == 0 ) {
+        std::cout<< " 0";
+      } else if ( abs(g(i,j)()() - Complex(0,1)) == 0){
+        std::cout<< " i";
+      } else if ( abs(g(i,j)()() + Complex(0,1)) == 0){
+        std::cout<< "-i";
+      } else if ( abs(g(i,j)()() - Complex(1,0)) == 0){
+        std::cout<< " 1";
+      } else if ( abs(g(i,j)()() + Complex(1,0)) == 0){
+        std::cout<< "-1";
+      }
+      std::cout<<((j == Ngp-1) ? ")" : "," );
+    }
+    std::cout << std::endl;
+  }
+  std::cout << GridLogMessage << std::endl;
+}
+
+void createTestAlgebra(void)
+{
+  std::array<GparityFlavourMatrix, 3> testg;
+  const Complex             I(0., 1.), mI(0., -1.);
+
+  // 0 1
+  // 1 0
+  testg[0] = Zero();
+  testg[0](0, 1)()() = 1.;
+  testg[0](1, 0)()() = 1.;
+  std::cout << GridLogMessage << "test SigmaX= " << std::endl;
+  print(testg[0]);
+
+  // 0 -i
+  // i  0
+  testg[1] = Zero();
+  testg[1](0, 1)()() = mI;
+  testg[1](1, 0)()() = I;
+  std::cout << GridLogMessage << "test SigmaY= " << std::endl;
+  print(testg[1]);
+
+  // 1  0
+  // 0 -1
+  testg[2] = Zero();
+  testg[2](0, 0)()() = 1.0;
+  testg[2](1, 1)()() = -1.0;
+  std::cout << GridLogMessage << "test SigmaZ= " << std::endl;
+  print(testg[2]);
+
+  
+#define DEFINE_TEST_G(g, exp)\
+testAlgebra[GparityFlavour::Algebra::g]        = exp; \
+testAlgebra[GparityFlavour::Algebra::Minus##g] = -exp;
+  
+  DEFINE_TEST_G(SigmaX      , testg[0]);
+  DEFINE_TEST_G(SigmaY      , testg[1]);
+  DEFINE_TEST_G(SigmaZ      , testg[2]);
+  DEFINE_TEST_G(Identity    , 1.);
+
+  GparityFlavourMatrix pplus;
+  pplus = 1.0;
+  pplus = pplus + testg[1];
+  pplus = pplus * 0.5;
+
+  DEFINE_TEST_G(ProjPlus    , pplus);
+  
+  GparityFlavourMatrix pminus;
+  pminus = 1.0;
+  pminus = pminus - testg[1];
+  pminus = pminus * 0.5;
+
+  DEFINE_TEST_G(ProjMinus    , pminus);
+
+#undef DEFINE_TEST_G
+}
+
+template <typename Expr>
+void test(const Expr &a, const Expr &b)
+{
+  if (norm2(a - b) < tolerance)
+  {
+    std::cout << "[OK] ";
+  }
+  else
+  {
+    std::cout << "[fail]" << std::endl;
+    std::cout << GridLogError << "a= " << a << std::endl;
+    std::cout << GridLogError << "is different (tolerance= " << tolerance << ") from " << std::endl;
+    std::cout << GridLogError << "b= " << b << std::endl;
+    exit(EXIT_FAILURE);
+  }
+}
+
+void checkSigma(const GparityFlavour::Algebra a, GridSerialRNG &rng)
+{
+  GparityFlavourVector v;
+  GparityFlavourMatrix m, &testg = testAlgebra[a];
+  GparityFlavour      g(a);
+  
+  random(rng, v);
+  random(rng, m);
+  
+  std::cout << GridLogMessage << "Checking " << GparityFlavour::name[a] << ": ";
+  std::cout << "vecmul ";
+  test(g*v, testg*v);
+  std::cout << "matlmul ";
+  test(g*m, testg*m);
+  std::cout << "matrmul ";
+  test(m*g, m*testg);
+  std::cout << std::endl;
+}
+
+int main(int argc, char *argv[])
+{
+  Grid_init(&argc,&argv);
+  
+  Coordinate latt_size   = GridDefaultLatt();
+  Coordinate simd_layout = GridDefaultSimd(4,vComplex::Nsimd());
+  Coordinate mpi_layout  = GridDefaultMpi();
+  
+  GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+  GridSerialRNG sRNG;
+  
+  sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+  
+  std::cout << GridLogMessage << "======== Test algebra" << std::endl;
+  createTestAlgebra();
+  std::cout << GridLogMessage << "======== Multiplication operators check" << std::endl;
+  for (int i = 0; i < GparityFlavour::nSigma; ++i)
+  {
+    checkSigma(i, sRNG);
+  }
+  std::cout << GridLogMessage << std::endl;
+  
+  Grid_finalize();
+  
+  return EXIT_SUCCESS;
+}

From 351eab02aeedc953a75d486cb12e5a5f1ce2050c Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Mon, 22 Mar 2021 14:39:17 -0400
Subject: [PATCH 24/41] Comment fix

---
 Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
index 298a46ca..b6b5e726 100644
--- a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
@@ -38,7 +38,7 @@ NAMESPACE_BEGIN(Grid);
     /* S_f = -log( det(  [M^dag M]/[V^dag V] )^{1/inv_pow}  )
            = chi^dag ( [M^dag M]/[V^dag V] )^{-1/inv_pow} chi\
 	   = chi^dag ( [V^dag V]^{-1/2} [M^dag M] [V^dag V]^{-1/2} )^{-1/inv_pow} chi\
-	   = chi^dag [V^dag V]^{1/(2*inv_pow)} [M^dag M]^{-1/inv_pow} [V^dag V]^{-1/(2*inv_pow)} chi\
+	   = chi^dag [V^dag V]^{1/(2*inv_pow)} [M^dag M]^{-1/inv_pow} [V^dag V]^{1/(2*inv_pow)} chi\
 
 	   S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
     

From e0e42873c1f5bcbf74839aff17fe27eeb37bc89a Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Wed, 14 Apr 2021 16:41:27 -0400
Subject: [PATCH 25/41] Const correctness for Lattice::Replicate

Adapted GeneralEvenOddRationalRatio and Test_rhmc_EOWilsonRatio_doubleVsMixedPrec to recent changes that require passing in serial RNG

For GeneralEvenOddRationalRatio and TwoFlavourEvenOddRatio, broke refresh into two stages, the first of which generates the random field and the second that computes the pseudofermion field.
This allows derived classes to override the generation of the random field, for example in testing.

Test_dwf_gpforce now uses Gparity in x-direction and APBC in time as opposed to G-parity in time

Added Test_action_dwf_gparity2fvs1f that compares the DWF fermion action with the 2f and the 1f (doubled-lattice) implementations of Gparity
---
 Grid/lattice/Lattice_transfer.h               |   2 +-
 .../GeneralEvenOddRationalRatio.h             |  25 +-
 .../pseudofermion/TwoFlavourEvenOddRatio.h    |  27 +-
 tests/core/Test_gparity.cc                    |   2 +-
 tests/forces/Test_dwf_gpforce.cc              |  18 +-
 tests/hmc/Test_action_dwf_gparity2fvs1f.cc    | 257 ++++++++++++++++++
 ...st_rhmc_EOWilsonRatio_doubleVsMixedPrec.cc |   4 +-
 7 files changed, 294 insertions(+), 41 deletions(-)
 create mode 100644 tests/hmc/Test_action_dwf_gparity2fvs1f.cc

diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h
index cdb6963e..7adf09d7 100644
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -777,7 +777,7 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
 
 
 template<class vobj>
-void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
+void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
 {
   typedef typename vobj::scalar_object sobj;
 
diff --git a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
index 8412610b..2b08cf49 100644
--- a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
@@ -171,7 +171,22 @@ NAMESPACE_BEGIN(Grid);
       //Access the fermion field
       const FermionField &getPhiOdd() const{ return PhiOdd; }
       
-      virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
+      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
+	std::cout<<GridLogMessage << action_name() << " refresh: starting" << std::endl;
+	FermionField eta(NumOp.FermionGrid());	
+
+	// P(eta) \propto e^{- eta^dag eta}
+	//	
+	// The gaussian function draws from  P(x) \propto e^{- x^2 / 2 }    [i.e. sigma=1]
+	// Thus eta = x/sqrt{2} = x * sqrt(1/2)
+	RealD scale = std::sqrt(0.5);
+	gaussian(pRNG,eta);	eta=eta*scale;
+
+	refresh(U,eta);
+      }
+
+      //Allow for manual specification of random field for testing
+      void refresh(const GaugeField &U, const FermionField &eta) {
 
 	// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
 	//
@@ -182,18 +197,10 @@ NAMESPACE_BEGIN(Grid);
 	
 	std::cout<<GridLogMessage << action_name() << " refresh: starting" << std::endl;
 
-	FermionField eta(NumOp.FermionGrid());
 	FermionField etaOdd (NumOp.FermionRedBlackGrid());
 	FermionField etaEven(NumOp.FermionRedBlackGrid());
 	FermionField     tmp(NumOp.FermionRedBlackGrid());
 
-	// P(eta) \propto e^{- eta^dag eta}
-	//	
-	// The gaussian function draws from  P(x) \propto e^{- x^2 / 2 }    [i.e. sigma=1]
-	// Thus eta = x/sqrt{2} = x * sqrt(1/2)
-	RealD scale = std::sqrt(0.5);
-	gaussian(pRNG,eta);	eta=eta*scale;
-
 	pickCheckerboard(Even,etaEven,eta);
 	pickCheckerboard(Odd,etaOdd,eta);
 
diff --git a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
index da628c75..1ce9445f 100644
--- a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
+++ b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
@@ -83,16 +83,7 @@ NAMESPACE_BEGIN(Grid);
 	return sstream.str();
       } 
 
-      
       virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
-
-        // P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
-        //
-        // NumOp == V
-        // DenOp == M
-        //
-        // Take phi_o = Vpcdag^{-1} Mpcdag eta_o  ; eta_o = Mpcdag^{-1} Vpcdag Phi
-        //
         // P(eta_o) = e^{- eta_o^dag eta_o}
         //
         // e^{x^2/2 sig^2} => sig^2 = 0.5.
@@ -100,12 +91,22 @@ NAMESPACE_BEGIN(Grid);
         RealD scale = std::sqrt(0.5);
 
         FermionField eta    (NumOp.FermionGrid());
+        gaussian(pRNG,eta); eta = eta * scale;
+
+	refresh(U,eta);
+      }
+	
+      void refresh(const GaugeField &U, const FermionField &eta) {
+        // P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
+        //
+        // NumOp == V
+        // DenOp == M
+        //
+        // Take phi_o = Vpcdag^{-1} Mpcdag eta_o  ; eta_o = Mpcdag^{-1} Vpcdag Phi
         FermionField etaOdd (NumOp.FermionRedBlackGrid());
         FermionField etaEven(NumOp.FermionRedBlackGrid());
         FermionField tmp    (NumOp.FermionRedBlackGrid());
 
-        gaussian(pRNG,eta);
-
         pickCheckerboard(Even,etaEven,eta);
         pickCheckerboard(Odd,etaOdd,eta);
 
@@ -125,8 +126,8 @@ NAMESPACE_BEGIN(Grid);
         DenOp.MooeeDag(etaEven,tmp);
         NumOp.MooeeInvDag(tmp,PhiEven);
 
-        PhiOdd =PhiOdd*scale;
-        PhiEven=PhiEven*scale;
+        //PhiOdd =PhiOdd*scale;
+        //PhiEven=PhiEven*scale;
         
       };
 
diff --git a/tests/core/Test_gparity.cc b/tests/core/Test_gparity.cc
index 026989f1..5bf98ba6 100644
--- a/tests/core/Test_gparity.cc
+++ b/tests/core/Test_gparity.cc
@@ -159,7 +159,7 @@ int main (int argc, char ** argv)
 
   //Coordinate grid for reference
   LatticeInteger    xcoor_1f5(FGrid_1f);
-  LatticeCoordinate(xcoor_1f5,1+nu);
+  LatticeCoordinate(xcoor_1f5,1+nu); //note '1+nu'! This is because for 5D fields the s-direction is direction 0
   Replicate(src,src_1f);
   src_1f   = where( xcoor_1f5 >= Integer(L), 2.0*src_1f,src_1f );
 
diff --git a/tests/forces/Test_dwf_gpforce.cc b/tests/forces/Test_dwf_gpforce.cc
index 1fa1c6e4..9db2c563 100644
--- a/tests/forces/Test_dwf_gpforce.cc
+++ b/tests/forces/Test_dwf_gpforce.cc
@@ -71,26 +71,14 @@ int main (int argc, char ** argv)
   ////////////////////////////////////
   RealD mass=0.2; //kills the diagonal term
   RealD M5=1.8;
-  //  const int nu = 3;
-  //  std::vector<int> twists(Nd,0); // twists[nu] = 1;
-  //  GparityDomainWallFermionR::ImplParams params;  params.twists = twists;
-  //  GparityDomainWallFermionR Ddwf(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
 
-  //  DomainWallFermionR Dw     (U,     Grid,RBGrid,mass,M5);
-
-  const int nu = 3;
+  const int nu = 0; //gparity direction
   std::vector<int> twists(Nd,0);
   twists[nu] = 1;
+  twists[Nd-1] = 1; //antiperiodic in time
   GparityDomainWallFermionR::ImplParams params;
   params.twists = twists;
-
-  /*
-  params.boundary_phases[0] = 1.0;
-  params.boundary_phases[1] = 1.0;
-  params.boundary_phases[2] = 1.0;
-  params.boundary_phases[3] =- 1.0;
-  */
-  
+ 
   GparityDomainWallFermionR Dw(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
 
   Dw.M   (phi,Mphi);
diff --git a/tests/hmc/Test_action_dwf_gparity2fvs1f.cc b/tests/hmc/Test_action_dwf_gparity2fvs1f.cc
new file mode 100644
index 00000000..830bcead
--- /dev/null
+++ b/tests/hmc/Test_action_dwf_gparity2fvs1f.cc
@@ -0,0 +1,257 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: tests/hmc/Test_action_dwf_gparity2fvs1f.cc
+
+    Copyright (C) 2015
+
+    Author: Christopher Kelly <ckelly@bnl.gov>
+    Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+
+
+
+template<typename FermionField2f, typename FermionField1f>
+void copy2fTo1fFermionField(FermionField1f &out, const FermionField2f &in, int gpdir){
+  auto f0_halfgrid = PeekIndex<GparityFlavourIndex>(in,0); //on 2f Grid
+  FermionField1f f0_fullgrid_dbl(out.Grid());
+  Replicate(f0_halfgrid, f0_fullgrid_dbl); //double it up to live on the 1f Grid
+
+  auto f1_halfgrid = PeekIndex<GparityFlavourIndex>(in,1);
+  FermionField1f f1_fullgrid_dbl(out.Grid());
+  Replicate(f1_halfgrid, f1_fullgrid_dbl);
+  
+  const Coordinate &dim_2f = in.Grid()->GlobalDimensions();
+  const Coordinate &dim_1f = out.Grid()->GlobalDimensions();
+
+  //We have to be careful for 5d fields; the s-direction is placed before the x,y,z,t and so we need to shift gpdir by 1
+  std::cout << "gpdir " << gpdir << std::endl;
+
+  gpdir+=1;
+  std::cout << "gpdir for 5D fields " << gpdir << std::endl;
+
+  std::cout << "dim_2f " << dim_2f << std::endl;
+  std::cout << "dim_1f " << dim_1f << std::endl;
+  
+  assert(dim_1f[gpdir] == 2*dim_2f[gpdir]);
+
+  LatticeInteger xcoor_1f(out.Grid()); //5d lattice integer
+  LatticeCoordinate(xcoor_1f,gpdir);
+
+  int L = dim_2f[gpdir];
+
+  out = where(xcoor_1f < L, f0_fullgrid_dbl, f1_fullgrid_dbl);
+}
+
+//Both have the same field type
+void copy2fTo1fGaugeField(LatticeGaugeField &out, const LatticeGaugeField &in, int gpdir){
+  LatticeGaugeField U_dbl(out.Grid());
+  Replicate(in, U_dbl);
+  
+  LatticeGaugeField Uconj_dbl = conjugate( U_dbl );
+
+  const Coordinate &dim_2f = in.Grid()->GlobalDimensions();
+  
+  LatticeInteger xcoor_1f(out.Grid());
+  LatticeCoordinate(xcoor_1f,gpdir);
+
+  int L = dim_2f[gpdir];
+  
+  out = where(xcoor_1f < L, U_dbl, Uconj_dbl);
+}
+
+
+std::ostream & operator<<(std::ostream &os, const Coordinate &x){
+  os << "(";
+  for(int i=0;i<x.size();i++) os << x[i] <<  (i<x.size()-1 ? " " : "");
+  os << ")";
+  return os;
+}
+
+
+int main(int argc, char **argv) {
+  using namespace Grid;
+  
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+  int Ls = 16;
+
+  Coordinate latt_2f = GridDefaultLatt();
+  Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
+  Coordinate mpi_layout = GridDefaultMpi();
+
+  int mu = 0; //Gparity direction
+
+  Coordinate latt_1f = latt_2f;
+  latt_1f[mu] *= 2;
+
+  GridCartesian         * UGrid_1f   = SpaceTimeGrid::makeFourDimGrid(latt_1f, simd_layout, mpi_layout);
+  GridRedBlackCartesian * UrbGrid_1f = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_1f);
+  GridCartesian         * FGrid_1f   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_1f);
+  GridRedBlackCartesian * FrbGrid_1f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_1f);
+
+
+  GridCartesian         * UGrid_2f   = SpaceTimeGrid::makeFourDimGrid(latt_2f, simd_layout, mpi_layout);
+  GridRedBlackCartesian * UrbGrid_2f = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_2f);
+  GridCartesian         * FGrid_2f   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_2f);
+  GridRedBlackCartesian * FrbGrid_2f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_2f);
+
+
+  std::cout << "SIMD layout " << simd_layout << std::endl;
+  std::cout << "MPI layout " << mpi_layout << std::endl;
+  std::cout << "2f dimensions " << latt_2f << std::endl;
+  std::cout << "1f dimensions " << latt_1f << std::endl;
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  GridParallelRNG          RNG5_2f(FGrid_2f);  RNG5_2f.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG4_2f(UGrid_2f);  RNG4_2f.SeedFixedIntegers(seeds4);
+
+  std::cout << "Generating hot 2f gauge configuration" << std::endl;
+  LatticeGaugeField Umu_2f(UGrid_2f);
+  SU<Nc>::HotConfiguration(RNG4_2f,Umu_2f);
+
+  std::cout << "Copying 2f->1f gauge field" << std::endl;
+  LatticeGaugeField Umu_1f(UGrid_1f);
+  copy2fTo1fGaugeField(Umu_1f, Umu_2f, mu);  
+
+  typedef GparityWilsonImplR FermionImplPolicy2f;
+  typedef GparityDomainWallFermionR FermionAction2f;
+  typedef typename FermionAction2f::FermionField FermionField2f;
+  
+  typedef WilsonImplR FermionImplPolicy1f;
+  typedef DomainWallFermionR FermionAction1f;
+  typedef typename FermionAction1f::FermionField FermionField1f;
+
+  std::cout << "Generating eta 2f" << std::endl;
+  FermionField2f eta_2f(FGrid_2f);
+  gaussian(RNG5_2f, eta_2f);
+
+  RealD scale = std::sqrt(0.5);
+  eta_2f=eta_2f*scale;
+
+  std::cout << "Copying 2f->1f eta" << std::endl;
+  FermionField1f eta_1f(FGrid_1f);
+  copy2fTo1fFermionField(eta_1f, eta_2f, mu);
+  
+  Real beta         = 2.13;
+  Real light_mass   = 0.01;
+  Real strange_mass = 0.032;
+  Real pv_mass      = 1.0;
+  RealD M5  = 1.8;
+
+  //Setup the Dirac operators
+  std::cout << "Initializing Dirac operators" << std::endl;
+  
+  FermionAction2f::ImplParams Params_2f;
+  Params_2f.twists[mu] = 1;
+  Params_2f.twists[Nd-1] = 1; //APBC in time direction
+
+  //note 'Num' and 'Den' here refer to the determinant ratio, not the operator ratio in the pseudofermion action where the two are inverted
+  //to my mind the Pauli Villars and 'denominator' are synonymous but the Grid convention has this as the 'Numerator' operator in the RHMC implementation
+  FermionAction2f NumOp_2f(Umu_2f,*FGrid_2f,*FrbGrid_2f,*UGrid_2f, *UrbGrid_2f, light_mass,M5,Params_2f); 
+  FermionAction2f DenOp_2f(Umu_2f,*FGrid_2f,*FrbGrid_2f,*UGrid_2f, *UrbGrid_2f, pv_mass, M5,Params_2f);
+
+  FermionAction1f::ImplParams Params_1f;
+  Params_1f.boundary_phases[mu] = -1; //antiperiodic in doubled lattice in GP direction
+  Params_1f.boundary_phases[Nd-1] = -1;
+  
+  FermionAction1f NumOp_1f(Umu_1f,*FGrid_1f,*FrbGrid_1f,*UGrid_1f, *UrbGrid_1f, light_mass,M5,Params_1f);
+  FermionAction1f DenOp_1f(Umu_1f,*FGrid_1f,*FrbGrid_1f,*UGrid_1f, *UrbGrid_1f, pv_mass, M5,Params_1f);
+
+  //Test the replication routines by running a CG on eta
+  double StoppingCondition = 1e-10;
+  double MaxCGIterations = 30000;
+  ConjugateGradient<FermionField2f>  CG_2f(StoppingCondition,MaxCGIterations);
+  ConjugateGradient<FermionField1f>  CG_1f(StoppingCondition,MaxCGIterations);
+
+  NumOp_1f.ImportGauge(Umu_1f);
+  NumOp_2f.ImportGauge(Umu_2f);
+
+  FermionField1f test_1f(FGrid_1f);
+  FermionField2f test_2f(FGrid_2f);
+  
+  MdagMLinearOperator<FermionAction1f, FermionField1f> Linop_1f(NumOp_1f);
+  MdagMLinearOperator<FermionAction2f, FermionField2f> Linop_2f(NumOp_2f);
+  
+  CG_1f(Linop_1f, eta_1f, test_1f);
+  CG_2f(Linop_2f, eta_2f, test_2f);
+  RealD test_1f_norm = norm2(test_1f);
+  RealD test_2f_norm = norm2(test_2f);
+
+  std::cout << "Verification of replication routines: " << test_1f_norm << " " << test_2f_norm << " " << test_1f_norm - test_2f_norm << std::endl;
+
+
+#if 1
+  typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy2f> Action2f;
+  typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy1f> Action1f;
+
+  RationalActionParams rational_params;
+  rational_params.inv_pow = 2;
+  rational_params.lo = 1e-5;
+  rational_params.hi = 32;
+  rational_params.md_degree = 16;
+  rational_params.action_degree = 16;
+
+  Action2f action_2f(DenOp_2f, NumOp_2f, rational_params);
+  Action1f action_1f(DenOp_1f, NumOp_1f, rational_params);
+#else
+  typedef TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy2f> Action2f;
+  typedef TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy1f> Action1f;
+
+  Action2f action_2f(DenOp_2f, NumOp_2f, CG_2f, CG_2f);
+  Action1f action_1f(DenOp_1f, NumOp_1f, CG_1f, CG_1f);
+#endif
+
+
+  std::cout << "Action refresh" << std::endl;
+  action_2f.refresh(Umu_2f, eta_2f);
+  action_1f.refresh(Umu_1f, eta_1f);
+
+  std::cout << "Action compute post heatbath" << std::endl;
+  RealD S_2f = action_2f.S(Umu_2f);
+  RealD S_1f = action_1f.S(Umu_1f);
+
+  std::cout << "Action comparison post heatbath" << std::endl;
+  std::cout << S_2f << " " << S_1f << " " << S_2f-S_1f << std::endl;
+
+  //Change the gauge field between refresh and action eval else the matrix and inverse matrices all cancel and we just get |eta|^2
+  SU<Nc>::HotConfiguration(RNG4_2f,Umu_2f);
+  copy2fTo1fGaugeField(Umu_1f, Umu_2f, mu);  
+
+  //Now compute the action with the new gauge field
+  std::cout << "Action compute post gauge field update" << std::endl;
+  S_2f = action_2f.S(Umu_2f);
+  S_1f = action_1f.S(Umu_1f);
+
+  std::cout << "Action comparison post gauge field update" << std::endl;
+  std::cout << S_2f << " " << S_1f << " " << S_2f-S_1f << std::endl;
+
+  Grid_finalize();
+} // main
+
+
diff --git a/tests/hmc/Test_rhmc_EOWilsonRatio_doubleVsMixedPrec.cc b/tests/hmc/Test_rhmc_EOWilsonRatio_doubleVsMixedPrec.cc
index fa14ffa8..53940b50 100644
--- a/tests/hmc/Test_rhmc_EOWilsonRatio_doubleVsMixedPrec.cc
+++ b/tests/hmc/Test_rhmc_EOWilsonRatio_doubleVsMixedPrec.cc
@@ -95,13 +95,13 @@ int main(int argc, char **argv) {
 													NumOpF, DenOpF, 
 													GenParams, 50);
   TheHMC.Resources.GetParallelRNG().SeedUniqueString(seed_string);
-  GenD.refresh(Ud, TheHMC.Resources.GetParallelRNG());    
+  GenD.refresh(Ud, TheHMC.Resources.GetSerialRNG(), TheHMC.Resources.GetParallelRNG());    
   RealD Sd = GenD.S(Ud);
   LatticeGaugeField derivD(Ud);
   GenD.deriv(Ud,derivD);   
 
   TheHMC.Resources.GetParallelRNG().SeedUniqueString(seed_string);
-  GenFD.refresh(Ud, TheHMC.Resources.GetParallelRNG());    
+  GenFD.refresh(Ud, TheHMC.Resources.GetSerialRNG(), TheHMC.Resources.GetParallelRNG());    
   RealD Sfd = GenFD.S(Ud);
   LatticeGaugeField derivFD(Ud);
   GenFD.deriv(Ud,derivFD);   

From 0e959d9b94551351dd8213beba19888d6dda54b3 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 22 Apr 2021 15:55:47 -0400
Subject: [PATCH 26/41] Update plaquette analysis

---
 scripts/hmc.sh | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/scripts/hmc.sh b/scripts/hmc.sh
index c2cd6593..4d7191ff 100755
--- a/scripts/hmc.sh
+++ b/scripts/hmc.sh
@@ -1,19 +1,27 @@
 #!/bin/bash
 
 LOG=$1
-SWEEPS=`grep dH $LOG | wc -l`
-SWEEPS=`expr $SWEEPS - 80`
+SWEEPS=`grep dH.= $LOG | wc -l`
+SWEEPS=`expr $SWEEPS - 100`
 echo
 echo $SWEEPS thermalised sweeps
 echo
-plaq=`grep Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$10} END { print S/NR} ' `
-plaqe=`grep Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$10 ; SS=SS+$10*$10 } END { print sqrt( (SS/NR - S*S/NR/NR)/NR) } ' `
+plaq=`grep Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$12} END { print S/NR} ' `
+plaqe=`grep Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$12 ; SS=SS+$12*$12 } END { print sqrt( (SS/NR - S*S/NR/NR)/NR) } ' `
 echo "Plaquette: $plaq (${plaqe})"
 echo
 
-dHv=`grep dH $LOG | tail -n $SWEEPS | awk '{ S=S+$10 ; SS=SS+$10*$10 } END { print sqrt(SS/NR) } ' `
-edH=`grep dH $LOG | tail -n $SWEEPS | awk '{ S=S+exp(-$10)} END { print S/NR} '`
-echo "<e-dH>: $edH"
+grep  Plaq $LOG | tail -n $SWEEPS | awk '{ S=S+$12/20; if(NR%20==0){ print NR/20, " ", S; S=0;} } '  > plaq.binned
+
+plaq=`cat plaq.binned  | awk '{ S=S+$2} END { print S/NR} ' `
+plaqe=`cat plaq.binned | awk '{ S=S+$2 ; SS=SS+$2*$2 } END { print sqrt( (SS/NR - S*S/NR/NR)/NR) } ' `
+echo "Binned Plaquette: $plaq (${plaqe})"
+echo
+
+dHv=`grep dH.= $LOG | tail -n $SWEEPS | awk '{ S=S+$16 ; SS=SS+$16*$16 } END { print sqrt(SS/NR) } ' `
+edH=`grep dH.= $LOG | tail -n $SWEEPS | awk '{ S=S+exp(-$16)} END { print S/NR} '`
+dedH=`grep dH.= $LOG | tail -n $SWEEPS | awk '{ S=S+exp(-$16); SS=SS+exp(-$16)*exp(-$16)} END { print sqrt( (SS/NR - S*S/NR/NR)/NR) } '`
+echo "<e-dH>: $edH (${dedH})"
 echo "<rms dH>: $dHv"
 
 TRAJ=`grep Acc $LOG | wc -l`
@@ -22,12 +30,13 @@ PACC=`expr  100 \* ${ACC} / ${TRAJ} `
 echo
 echo "Acceptance $PACC %  $ACC / $TRAJ "
 
-grep Plaq $LOG | awk '{ print $10 }' | uniq > plaq.dat
-grep dH $LOG | awk '{ print $10 }' > dH.dat
-echo set yrange [-0.2:1.0] > plot.gnu
+grep Plaq $LOG | awk '{ print $12 }' | uniq > plaq.dat
+grep dH.= $LOG | awk '{ print $16 }' > dH.dat
+echo set yrange [0.58:0.60] > plot.gnu
 echo set terminal 'pdf' >> plot.gnu
+echo "f(x) =0.588" >> plot.gnu
 echo "set output 'plaq.${LOG}.pdf'" >> plot.gnu
-echo "plot 'plaq.dat' w l, 'dH.dat' w l " >> plot.gnu
+echo "plot 'plaq.dat' w l, f(x) " >> plot.gnu
 echo
 gnuplot plot.gnu >& gnu.errs
 open plaq.${LOG}.pdf

From 0d9aa8722845c5b240445d93b009c2e9aa134dfd Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 22 Apr 2021 15:56:59 -0400
Subject: [PATCH 27/41] Reduce momentum to the GP plane

---
 tests/forces/Test_gpwilson_force.cc | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/tests/forces/Test_gpwilson_force.cc b/tests/forces/Test_gpwilson_force.cc
index d731f27a..65bd865e 100644
--- a/tests/forces/Test_gpwilson_force.cc
+++ b/tests/forces/Test_gpwilson_force.cc
@@ -64,7 +64,9 @@ int main (int argc, char ** argv)
   ////////////////////////////////////
   RealD mass=0.01; 
 
-  const int nu = 3;
+  const int nu = 1;
+  const int Lnu=latt_size[nu];
+
   std::vector<int> twists(Nd,0);  twists[nu] = 1;
   GparityWilsonFermionR::ImplParams params;  params.twists = twists;
   GparityWilsonFermionR Wil(U,*UGrid,*UrbGrid,mass,params);
@@ -84,20 +86,31 @@ int main (int argc, char ** argv)
   ////////////////////////////////////
   // Modify the gauge field a little 
   ////////////////////////////////////
-  RealD dt = 0.01;
+  RealD dt = 0.1;
 
   LatticeColourMatrix mommu(UGrid); 
+  LatticeColourMatrix zz(UGrid);
   LatticeColourMatrix forcemu(UGrid); 
   LatticeGaugeField mom(UGrid); 
   LatticeGaugeField Uprime(UGrid); 
 
+  
+  Lattice<iScalar<vInteger> > coor(UGrid);
+  LatticeCoordinate(coor,nu);
+  zz=Zero();
   for(int mu=0;mu<Nd;mu++){
 
     // Traceless antihermitian momentum; gaussian in lie alg
-    SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); 
-
+    if(mu==nu){
+      SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); 
+      mommu=where(coor==Lnu-1,mommu,zz);
+      //mommu=where(coor==0,mommu,zz);
+      //      mommu=where(coor==1,mommu,zz);
+    } else {
+      mommu=Zero();
+    }
     PokeIndex<LorentzIndex>(mom,mommu,mu);
-
+    
     // fourth order exponential approx
     autoView( mom_v, mom, CpuRead);
     autoView( U_v , U, CpuRead);
@@ -139,12 +152,14 @@ int main (int argc, char ** argv)
     // Update PF action density
     dS = dS+trace(mommu*forcemu)*dt;
   }
-
+  std::cout << "mommu"<<mommu<<std::endl;
+  std::cout << "dS" << dS<<std::endl;
+  
   ComplexD dSpred    = sum(dS);
 
   std::cout << GridLogMessage << " S      "<<S<<std::endl;
   std::cout << GridLogMessage << " Sprime "<<Sprime<<std::endl;
-  std::cout << GridLogMessage << "dS      "<<Sprime-S<<std::endl;
+  std::cout << GridLogMessage << "Delta S "<<Sprime-S<<std::endl;
   std::cout << GridLogMessage << "predict dS    "<< dSpred <<std::endl;
 
   assert( fabs(real(Sprime-S-dSpred)) < 2.0 ) ;

From 38964a4076d892bde29afd7ce3f56063480d2a9a Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 22 Apr 2021 15:57:37 -0400
Subject: [PATCH 28/41] Switch twist direction

---
 tests/hmc/Test_hmc_GparityIwasakiGauge.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/hmc/Test_hmc_GparityIwasakiGauge.cc b/tests/hmc/Test_hmc_GparityIwasakiGauge.cc
index 7f74d5d8..d4bfa0a5 100644
--- a/tests/hmc/Test_hmc_GparityIwasakiGauge.cc
+++ b/tests/hmc/Test_hmc_GparityIwasakiGauge.cc
@@ -58,7 +58,7 @@ int main(int argc, char **argv) {
   CheckpointerParameters CPparams;  
   CPparams.config_prefix = "ckpoint_EODWF_lat";
   CPparams.rng_prefix = "ckpoint_EODWF_rng";
-  CPparams.saveInterval = 5;
+  CPparams.saveInterval = 1;
   CPparams.format = "IEEE64BIG";
   
   TheHMC.Resources.LoadNerscCheckpointer(CPparams);
@@ -79,7 +79,7 @@ int main(int argc, char **argv) {
   // that have a complex construction
   // standard
   RealD beta = 2.6 ;
-  const int nu = 3;
+  const int nu = 1;
   std::vector<int> twists(Nd,0);
   twists[nu] = 1;
   ConjugateGimplD::setDirections(twists);

From 3a4f5f232420f6ff356c5c4dad84f8c5bb5e3858 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 22 Apr 2021 18:54:00 -0400
Subject: [PATCH 29/41] Merge develop, strengthen force tests

---
 tests/forces/Test_gpdwf_force.cc    | 28 ++++++++++++++++++++++++----
 tests/forces/Test_gpwilson_force.cc | 24 +++++++++++++++---------
 2 files changed, 39 insertions(+), 13 deletions(-)

diff --git a/tests/forces/Test_gpdwf_force.cc b/tests/forces/Test_gpdwf_force.cc
index d6744080..b1c0dd80 100644
--- a/tests/forces/Test_gpdwf_force.cc
+++ b/tests/forces/Test_gpdwf_force.cc
@@ -71,8 +71,10 @@ int main (int argc, char ** argv)
   RealD mass=0.01; 
   RealD M5=1.8; 
 
-  const int nu = 3;
-  std::vector<int> twists(Nd,0);  twists[nu] = 1;
+  const int nu = 1;
+  std::vector<int> twists(Nd,0);
+  twists[nu] = 1;
+  twists[3] = 1;
   GparityDomainWallFermionR::ImplParams params;  params.twists = twists;
   GparityDomainWallFermionR Ddwf(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
   Ddwf.M   (phi,Mphi);
@@ -91,16 +93,28 @@ int main (int argc, char ** argv)
   ////////////////////////////////////
   // Modify the gauge field a little 
   ////////////////////////////////////
-  RealD dt = 0.0001;
+  RealD dt = 0.01;
 
+  LatticeColourMatrix zz(UGrid); zz=Zero();
   LatticeColourMatrix mommu(UGrid); 
   LatticeColourMatrix forcemu(UGrid); 
   LatticeGaugeField mom(UGrid); 
   LatticeGaugeField Uprime(UGrid); 
 
+  const int Lnu=latt_size[nu];
+  Lattice<iScalar<vInteger> > coor(UGrid);
+  LatticeCoordinate(coor,nu);
   for(int mu=0;mu<Nd;mu++){
 
-    SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
+    // Traceless antihermitian momentum; gaussian in lie alg
+    SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu);
+    if(0){
+      if(mu==nu){
+	mommu=where(coor==Lnu-1,mommu,zz);
+      } else {
+	mommu=Zero();
+      }
+    }
 
     PokeIndex<LorentzIndex>(mom,mommu,mu);
 
@@ -125,6 +139,12 @@ int main (int argc, char ** argv)
 
   ComplexD Sprime    = innerProduct(MphiPrime   ,MphiPrime);
 
+
+  LatticeComplex lip(FGrid); lip=localInnerProduct(Mphi,Mphi);
+  LatticeComplex lipp(FGrid); lipp=localInnerProduct(MphiPrime,MphiPrime);
+  LatticeComplex dip(FGrid); dip = lipp - lip;
+  std::cout << " dip "<<dip<<std::endl;
+  
   //////////////////////////////////////////////
   // Use derivative to estimate dS
   //////////////////////////////////////////////
diff --git a/tests/forces/Test_gpwilson_force.cc b/tests/forces/Test_gpwilson_force.cc
index 65bd865e..b0b41676 100644
--- a/tests/forces/Test_gpwilson_force.cc
+++ b/tests/forces/Test_gpwilson_force.cc
@@ -67,7 +67,9 @@ int main (int argc, char ** argv)
   const int nu = 1;
   const int Lnu=latt_size[nu];
 
-  std::vector<int> twists(Nd,0);  twists[nu] = 1;
+  std::vector<int> twists(Nd,0);
+  twists[nu] = 1;
+  twists[3]=1;
   GparityWilsonFermionR::ImplParams params;  params.twists = twists;
   GparityWilsonFermionR Wil(U,*UGrid,*UrbGrid,mass,params);
   Wil.M   (phi,Mphi);
@@ -86,7 +88,7 @@ int main (int argc, char ** argv)
   ////////////////////////////////////
   // Modify the gauge field a little 
   ////////////////////////////////////
-  RealD dt = 0.1;
+  RealD dt = 0.01;
 
   LatticeColourMatrix mommu(UGrid); 
   LatticeColourMatrix zz(UGrid);
@@ -101,13 +103,13 @@ int main (int argc, char ** argv)
   for(int mu=0;mu<Nd;mu++){
 
     // Traceless antihermitian momentum; gaussian in lie alg
-    if(mu==nu){
-      SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); 
-      mommu=where(coor==Lnu-1,mommu,zz);
-      //mommu=where(coor==0,mommu,zz);
-      //      mommu=where(coor==1,mommu,zz);
-    } else {
-      mommu=Zero();
+    SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu);
+    if(0){
+      if(mu==nu){
+	mommu=where(coor==Lnu-1,mommu,zz);
+      } else {
+	mommu=Zero();
+      }
     }
     PokeIndex<LorentzIndex>(mom,mommu,mu);
     
@@ -143,6 +145,10 @@ int main (int argc, char ** argv)
     mommu=Ta(mommu)*2.0;
     PokeIndex<LorentzIndex>(UdSdU,mommu,mu);
   }
+  LatticeComplex lip(UGrid); lip=localInnerProduct(Mphi,Mphi);
+  LatticeComplex lipp(UGrid); lipp=localInnerProduct(MphiPrime,MphiPrime);
+  LatticeComplex dip(UGrid); dip = lipp - lip;
+  std::cout << " dip "<<dip<<std::endl;
 
   LatticeComplex dS(UGrid); dS = Zero();
   for(int mu=0;mu<Nd;mu++){

From 0f08364e4fd5cf6d205ceb05a1dc1b23ae344461 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 26 Apr 2021 23:18:11 +0200
Subject: [PATCH 30/41] Mom filter refresh sRNG

---
 tests/forces/Test_momentum_filter.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/forces/Test_momentum_filter.cc b/tests/forces/Test_momentum_filter.cc
index 856ea0f2..794b5fa0 100644
--- a/tests/forces/Test_momentum_filter.cc
+++ b/tests/forces/Test_momentum_filter.cc
@@ -61,7 +61,9 @@ int main (int argc, char ** argv)
   std::vector<int> seeds({1,2,3,4});
 
   GridParallelRNG          pRNG(&Grid);
+  GridSerialRNG            sRNG;
   pRNG.SeedFixedIntegers(seeds);
+  sRNG.SeedFixedIntegers(seeds);
 
   typedef PeriodicGimplR Gimpl;
   typedef WilsonGaugeAction<Gimpl> GaugeAction;
@@ -115,7 +117,7 @@ int main (int argc, char ** argv)
   
   integrator.setMomentumFilter(filter);
 
-  integrator.refresh(U, pRNG); //doesn't actually change the gauge field
+  integrator.refresh(U, sRNG, pRNG); //doesn't actually change the gauge field
 
   //Check the momentum is zero on the boundary
   const auto &P = integrator.getMomentum();

From 29ddafd0fcf8480d52f0955efd67d4dd54344201 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Fri, 30 Apr 2021 13:12:24 -0400
Subject: [PATCH 31/41] Added variant of G-parity DWF+I ensemble gen code using
 double prec RHMC

---
 HMC/DWF2p1fIwasakiGparityRHMCdouble.cc | 473 +++++++++++++++++++++++++
 1 file changed, 473 insertions(+)
 create mode 100644 HMC/DWF2p1fIwasakiGparityRHMCdouble.cc

diff --git a/HMC/DWF2p1fIwasakiGparityRHMCdouble.cc b/HMC/DWF2p1fIwasakiGparityRHMCdouble.cc
new file mode 100644
index 00000000..533ad8a2
--- /dev/null
+++ b/HMC/DWF2p1fIwasakiGparityRHMCdouble.cc
@@ -0,0 +1,473 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./HMC/DWF2p1fIwasakiGparity.cc
+
+Copyright (C) 2015-2016
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+
+//2+1f DWF+I ensemble with G-parity BCs
+//designed to reproduce ensembles in https://arxiv.org/pdf/1908.08640.pdf
+struct RatQuoParameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters,
+				  double, bnd_lo,
+				  double, bnd_hi,
+				  Integer, action_degree,
+				  double, action_tolerance,
+				  Integer, md_degree,
+				  double, md_tolerance,
+				  Integer, reliable_update_freq,
+				  Integer, bnd_check_freq);
+  RatQuoParameters() { 
+    bnd_lo = 1e-2;
+    bnd_hi = 30;
+    action_degree = 10;
+    action_tolerance = 1e-10;
+    md_degree = 10;
+    md_tolerance = 1e-8;
+    bnd_check_freq = 20;
+    reliable_update_freq = 50;
+  }
+
+  void Export(RationalActionParams &into) const{
+    into.lo = bnd_lo;
+    into.hi = bnd_hi;
+    into.action_degree = action_degree;
+    into.action_tolerance = action_tolerance;
+    into.md_degree = md_degree;
+    into.md_tolerance = md_tolerance;
+    into.BoundsCheckFreq = bnd_check_freq;
+  }
+};
+
+
+struct EvolParameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters,
+                                  Integer, StartTrajectory,
+                                  Integer, Trajectories,
+				  Integer, SaveInterval,
+				  Integer, Steps,
+                                  bool, MetropolisTest,
+				  std::string, StartingType,
+				  std::vector<Integer>, GparityDirs,
+				  RatQuoParameters, rat_quo_l,
+				  RatQuoParameters, rat_quo_s);
+
+  EvolParameters() {
+    //For initial thermalization; afterwards user should switch Metropolis on and use StartingType=CheckpointStart
+    MetropolisTest    = false;
+    StartTrajectory   = 0;
+    Trajectories      = 50;
+    SaveInterval = 5;
+    StartingType      = "ColdStart";
+    GparityDirs.resize(3, 1); //1 for G-parity, 0 for periodic
+    Steps = 5;
+  }
+};
+
+bool fileExists(const std::string &fn){
+  std::ifstream f(fn);
+  return f.good();
+}
+
+
+
+
+struct LanczosParameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
+				  double, alpha,
+				  double, beta,
+				  double, mu,
+				  int, ord,
+				  int, n_stop,
+				  int, n_want,
+				  int, n_use,
+				  double, tolerance);
+
+  LanczosParameters() {
+    alpha = 35;
+    beta = 5;
+    mu = 0;
+    ord = 100;
+    n_stop = 10;
+    n_want = 10;
+    n_use = 15;
+    tolerance = 1e-6;
+  }
+};
+
+
+
+template<typename FermionActionD, typename FermionFieldD>
+void computeEigenvalues(std::string param_file,
+			GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
+			FermionActionD &action, GridParallelRNG &rng){
+  
+  LanczosParameters params;
+  if(fileExists(param_file)){
+    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
+    Grid::XmlReader rd(param_file);
+    read(rd, "LanczosParameters", params);
+  }else if(!GlobalSharedMemory::WorldRank){
+    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
+    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
+    Grid::XmlWriter wr(param_file + ".templ");
+    write(wr, "LanczosParameters", params);
+  }
+
+  FermionFieldD gauss_o(rbGrid);
+  FermionFieldD gauss(Grid);
+  gaussian(rng, gauss);
+  pickCheckerboard(Odd, gauss_o, gauss);
+
+  action.ImportGauge(latt);
+
+  SchurDiagMooeeOperator<FermionActionD, FermionFieldD> hermop(action);
+  PlainHermOp<FermionFieldD> hermop_wrap(hermop);
+  //ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
+  assert(params.mu == 0.0);
+
+  Chebyshev<FermionFieldD> Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1);
+  FunctionHermOp<FermionFieldD> Cheb_wrap(Cheb, hermop);
+
+  std::cout << "IRL: alpha=" << params.alpha << " beta=" << params.beta << " mu=" << params.mu << " ord=" << params.ord << std::endl;
+  ImplicitlyRestartedLanczos<FermionFieldD> IRL(Cheb_wrap, hermop_wrap, params.n_stop, params.n_want, params.n_use, params.tolerance, 10000);
+
+  std::vector<RealD> eval(params.n_use);
+  std::vector<FermionFieldD> evec(params.n_use, rbGrid);
+  int Nconv;
+  IRL.calc(eval, evec, gauss_o, Nconv);
+
+  std::cout << "Eigenvalues:" << std::endl;
+  for(int i=0;i<params.n_want;i++){
+    std::cout << i << " " << eval[i] << std::endl;
+  }
+}
+
+
+//Check the quality of the RHMC approx
+template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
+void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
+	       FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
+	       int inv_pow, const std::string &quark_descr){
+
+  FermionFieldD gauss_o(rbGrid);
+  FermionFieldD gauss(Grid);
+  gaussian(rng, gauss);
+  pickCheckerboard(Odd, gauss_o, gauss);
+
+  numOp.ImportGauge(latt);
+  denOp.ImportGauge(latt);
+
+  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
+  SchurDifferentiableOperator<FermionImplPolicyD> MdagM(numOp);
+  SchurDifferentiableOperator<FermionImplPolicyD> VdagV(denOp);
+      
+  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+  InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegPowerAction); //use large tolerance to prevent exit on fail; we are trying to tune here!
+  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
+  InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegHalfPowerAction);
+  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+  InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegPowerAction);
+  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
+  InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegHalfPowerAction);
+  std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
+
+  std::cout << "-------------------------------------------------------------------------------" << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+  InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegPowerMD); 
+  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
+  InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegHalfPowerMD);
+  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+  InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegPowerMD);
+  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
+
+  std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
+  InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegHalfPowerMD);
+  std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
+}
+
+
+
+
+
+
+
+
+
+int main(int argc, char **argv) {
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  // here make a routine to print all the relevant information on the run
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+  std::string param_file = "params.xml";
+  bool file_load_check = false;
+  for(int i=1;i<argc;i++){
+    std::string sarg(argv[i]);
+    if(sarg == "--param_file"){
+      assert(i!=argc-1);
+      param_file = argv[i+1];
+    }else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
+      file_load_check = true;
+    }
+  }
+
+  //Read the user parameters
+  EvolParameters user_params;
+  
+  if(fileExists(param_file)){
+    std::cout << GridLogMessage << " Reading " << param_file << std::endl;
+    Grid::XmlReader rd(param_file);
+    read(rd, "Params", user_params);
+  }else if(!GlobalSharedMemory::WorldRank){
+    std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
+    std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
+    Grid::XmlWriter wr(param_file + ".templ");
+    write(wr, "Params", user_params);
+
+    std::cout << GridLogMessage << " Done" << std::endl;
+    Grid_finalize();
+    return 0;
+  }
+
+  //Check the parameters
+  if(user_params.GparityDirs.size() != Nd-1){
+    std::cerr << "Error in input parameters: expect GparityDirs to have size = " << Nd-1 << std::endl;
+    exit(1);
+  }
+  for(int i=0;i<Nd-1;i++)
+    if(user_params.GparityDirs[i] != 0 && user_params.GparityDirs[i] != 1){
+      std::cerr << "Error in input parameters: expect GparityDirs values to be 0 (periodic) or 1 (G-parity)" << std::endl;
+      exit(1);
+    }
+
+   // Typedefs to simplify notation
+  typedef GparityDomainWallFermionD FermionActionD;
+  typedef typename FermionActionD::Impl_t FermionImplPolicyD;
+  typedef typename FermionActionD::FermionField FermionFieldD;
+
+  typedef GparityDomainWallFermionF FermionActionF;
+  typedef typename FermionActionF::Impl_t FermionImplPolicyF;
+  typedef typename FermionActionF::FermionField FermionFieldF;
+
+  typedef GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD,FermionImplPolicyF> MixedPrecRHMC;
+  typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicyD> DoublePrecRHMC;
+
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  IntegratorParameters MD;
+  typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
+  typedef HMCWrapper::ImplPolicy GaugeImplPolicy;
+  MD.name    = std::string("MinimumNorm2");
+  MD.MDsteps = user_params.Steps;
+  MD.trajL   = 1.0;
+
+  HMCparameters HMCparams;
+  HMCparams.StartTrajectory  = user_params.StartTrajectory;
+  HMCparams.Trajectories     = user_params.Trajectories;
+  HMCparams.NoMetropolisUntil= 0;
+  HMCparams.StartingType     = user_params.StartingType;
+  HMCparams.MetropolisTest = user_params.MetropolisTest;
+  HMCparams.MD = MD;
+  HMCWrapper TheHMC(HMCparams);
+
+  // Grid from the command line arguments --grid and --mpi
+  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
+
+  CheckpointerParameters CPparams;
+  CPparams.config_prefix = "ckpoint_lat";
+  CPparams.rng_prefix    = "ckpoint_rng";
+  CPparams.saveInterval  = user_params.SaveInterval;
+  CPparams.format        = "IEEE64BIG";
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  //Note that checkpointing saves the RNG state so that this initialization is required only for the very first configuration
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  typedef PlaquetteMod<GaugeImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+  //////////////////////////////////////////////
+
+  const int Ls      = 16;
+  Real beta         = 2.13;
+  Real light_mass   = 0.01;
+  Real strange_mass = 0.032;
+  Real pv_mass      = 1.0;
+  RealD M5  = 1.8;
+
+  //Setup the Grids
+  auto GridPtrD   = TheHMC.Resources.GetCartesian();
+  auto GridRBPtrD = TheHMC.Resources.GetRBCartesian();
+  auto FGridD     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrD);
+  auto FrbGridD   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrD);
+
+  GridCartesian* GridPtrF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
+  GridRedBlackCartesian* GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF);
+  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF);
+  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF);
+
+  ConjugateIwasakiGaugeActionD GaugeAction(beta);
+
+  // temporarily need a gauge field
+  LatticeGaugeFieldD Ud(GridPtrD);
+  LatticeGaugeFieldF Uf(GridPtrF);
+ 
+  //Setup the BCs
+  FermionActionD::ImplParams Params;
+  for(int i=0;i<Nd-1;i++) Params.twists = user_params.GparityDirs[i]; //G-parity directions
+  Params.twists[Nd-1] = 1; //APBC in time direction
+
+  std::vector<int> dirs4(Nd);
+  for(int i=0;i<Nd-1;i++) dirs4[i] = user_params.GparityDirs[i];
+  dirs4[Nd-1] = 0; //periodic gauge BC in time
+
+  GaugeImplPolicy::setDirections(dirs4); //gauge BC
+
+  //Run optional gauge field checksum checker and exit
+  if(file_load_check){
+    TheHMC.initializeGaugeFieldAndRNGs(Ud);
+    std::cout << GridLogMessage << " Done" << std::endl;
+    Grid_finalize();
+    return 0;
+  }
+
+
+  ////////////////////////////////////
+  // Collect actions
+  ////////////////////////////////////
+  ActionLevel<HMCWrapper::Field> Level1(1); //light quark + strange quark
+  ActionLevel<HMCWrapper::Field> Level2(8); //gauge (8 increments per step)
+
+
+  /////////////////////////////////////////////////////////////
+  // Light action
+  /////////////////////////////////////////////////////////////
+
+  FermionActionD Numerator_lD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, light_mass,M5,Params);
+  FermionActionD Denominator_lD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, pv_mass,M5,Params);
+
+  FermionActionF Numerator_lF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, light_mass,M5,Params);
+  FermionActionF Denominator_lF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, pv_mass,M5,Params);
+
+  RationalActionParams rat_act_params_l;
+  rat_act_params_l.inv_pow  = 2; // (M^dag M)^{1/2}
+  rat_act_params_l.precision= 60;
+  rat_act_params_l.MaxIter  = 10000;
+  user_params.rat_quo_l.Export(rat_act_params_l);
+  std::cout << GridLogMessage << " Light quark bounds check every " << rat_act_params_l.BoundsCheckFreq << " trajectories (avg)" << std::endl;
+ 
+  //MixedPrecRHMC Quotient_l(Denominator_lD, Numerator_lD, Denominator_lF, Numerator_lF, rat_act_params_l, user_params.rat_quo_l.reliable_update_freq);
+  DoublePrecRHMC Quotient_l(Denominator_lD, Numerator_lD, rat_act_params_l);
+  Level1.push_back(&Quotient_l);
+
+
+  ////////////////////////////////////
+  // Strange action
+  ////////////////////////////////////
+  FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD,strange_mass,M5,Params);
+  FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*GridPtrD,*GridRBPtrD, pv_mass,M5,Params);
+
+  FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,strange_mass,M5,Params);
+  FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF, pv_mass,M5,Params);
+
+  RationalActionParams rat_act_params_s;
+  rat_act_params_s.inv_pow  = 4; // (M^dag M)^{1/4}
+  rat_act_params_s.precision= 60;
+  rat_act_params_s.MaxIter  = 10000;
+  user_params.rat_quo_s.Export(rat_act_params_s);
+  std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_l.BoundsCheckFreq << " trajectories (avg)" << std::endl;
+
+  //MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq); 
+  DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s); 
+  Level1.push_back(&Quotient_s);  
+
+
+  /////////////////////////////////////////////////////////////
+  // Gauge action
+  /////////////////////////////////////////////////////////////
+  Level2.push_back(&GaugeAction);
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+  std::cout << GridLogMessage << " Action complete "<< std::endl;
+
+
+  //Action tuning
+  bool tune_rhmc_l=false, tune_rhmc_s=false, eigenrange_l=false, eigenrange_s=false; 
+  std::string lanc_params_l, lanc_params_s;
+  for(int i=1;i<argc;i++){
+    std::string sarg(argv[i]);
+    if(sarg == "--tune_rhmc_l") tune_rhmc_l=true;
+    else if(sarg == "--tune_rhmc_s") tune_rhmc_s=true;
+    else if(sarg == "--eigenrange_l"){
+      assert(i < argc-1);
+      eigenrange_l=true;
+      lanc_params_l = argv[i+1];
+    }
+    else if(sarg == "--eigenrange_s"){
+      assert(i < argc-1);
+      eigenrange_s=true;
+      lanc_params_s = argv[i+1];
+    }
+  }
+  if(tune_rhmc_l || tune_rhmc_s || eigenrange_l || eigenrange_s){
+    TheHMC.initializeGaugeFieldAndRNGs(Ud);
+    if(eigenrange_l) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_l, FGridD, FrbGridD, Ud, Numerator_lD, TheHMC.Resources.GetParallelRNG());
+    if(eigenrange_s) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG());
+    if(tune_rhmc_l) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_l)>(FGridD, FrbGridD, Ud, Numerator_lD, Denominator_lD, Quotient_l, TheHMC.Resources.GetParallelRNG(), 2, "light");
+    if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_s)>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange");
+
+    std::cout << GridLogMessage << " Done" << std::endl;
+    Grid_finalize();
+    return 0;
+  }
+
+
+  //Run the HMC
+  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
+  TheHMC.Run();
+
+  std::cout << GridLogMessage << " Done" << std::endl;
+  Grid_finalize();
+  return 0;
+} // main
+

From 80176b1b394e343ade8baf4e879caf6ebe906739 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@bnl.gov>
Date: Tue, 4 May 2021 13:12:23 -0400
Subject: [PATCH 32/41] RHMC now outputs some initial norms to the logs Fixed
 DWF+I Gparity binaries not correctly assigning twist directions (thanks
 Peter!)

---
 Grid/algorithms/iterative/ConjugateGradientMultiShift.h | 3 +++
 HMC/DWF2p1fIwasakiGparity.cc                            | 2 +-
 HMC/DWF2p1fIwasakiGparityRHMCdouble.cc                  | 2 +-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
index 252e4506..3b079e99 100644
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
@@ -182,6 +182,9 @@ public:
     for(int s=0;s<nshift;s++) {
       axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
     }
+
+    std::cout << GridLogIterative << "ConjugateGradientMultiShift: initial rn (|src|^2) =" << rn << " qq (|MdagM src|^2) =" << qq << " d ( dot(src, [MdagM + m_0]src) ) =" << d << " c=" << c << std::endl;
+    
   
   ///////////////////////////////////////
   // Timers
diff --git a/HMC/DWF2p1fIwasakiGparity.cc b/HMC/DWF2p1fIwasakiGparity.cc
index e9865a54..c02ada51 100644
--- a/HMC/DWF2p1fIwasakiGparity.cc
+++ b/HMC/DWF2p1fIwasakiGparity.cc
@@ -354,7 +354,7 @@ int main(int argc, char **argv) {
  
   //Setup the BCs
   FermionActionD::ImplParams Params;
-  for(int i=0;i<Nd-1;i++) Params.twists = user_params.GparityDirs[i]; //G-parity directions
+  for(int i=0;i<Nd-1;i++) Params.twists[i] = user_params.GparityDirs[i]; //G-parity directions
   Params.twists[Nd-1] = 1; //APBC in time direction
 
   std::vector<int> dirs4(Nd);
diff --git a/HMC/DWF2p1fIwasakiGparityRHMCdouble.cc b/HMC/DWF2p1fIwasakiGparityRHMCdouble.cc
index 533ad8a2..58295d5e 100644
--- a/HMC/DWF2p1fIwasakiGparityRHMCdouble.cc
+++ b/HMC/DWF2p1fIwasakiGparityRHMCdouble.cc
@@ -354,7 +354,7 @@ int main(int argc, char **argv) {
  
   //Setup the BCs
   FermionActionD::ImplParams Params;
-  for(int i=0;i<Nd-1;i++) Params.twists = user_params.GparityDirs[i]; //G-parity directions
+  for(int i=0;i<Nd-1;i++) Params.twists[i] = user_params.GparityDirs[i]; //G-parity directions
   Params.twists[Nd-1] = 1; //APBC in time direction
 
   std::vector<int> dirs4(Nd);

From 53d226924a2862fc52603ca293ace658d0fe12a8 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 4 May 2021 13:44:07 -0400
Subject: [PATCH 33/41] CRC added

---
 Grid/lattice/Lattice.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Grid/lattice/Lattice.h b/Grid/lattice/Lattice.h
index 9f5f1da7..c4adf86a 100644
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@@ -46,3 +46,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_unary.h>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
+#include <Grid/lattice/Lattice_crc.h>

From 86948c6ea0f3f6ea5e38d62c03832ebeb60ff291 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 4 May 2021 13:44:38 -0400
Subject: [PATCH 34/41] CRC for finger print fields - aids debug / version diff

---
 Grid/lattice/Lattice_crc.h | 42 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 Grid/lattice/Lattice_crc.h

diff --git a/Grid/lattice/Lattice_crc.h b/Grid/lattice/Lattice_crc.h
new file mode 100644
index 00000000..a283ea8b
--- /dev/null
+++ b/Grid/lattice/Lattice_crc.h
@@ -0,0 +1,42 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_crc.h
+
+    Copyright (C) 2021
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+template<class vobj> uint32_t crc(Lattice<vobj> & buf)
+{
+  autoView( buf_v , buf, CpuRead);
+  return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
+}
+
+#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
+
+NAMESPACE_END(Grid);
+
+

From 7f6e2ee03efcfe0b2973c0a99bca3f52ce4a4198 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Wed, 5 May 2021 17:34:17 -0400
Subject: [PATCH 35/41] Drop normal_distribution, standardise

---
 Grid/lattice/Lattice_rng.h | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/Grid/lattice/Lattice_rng.h b/Grid/lattice/Lattice_rng.h
index e5e63716..5b66ff34 100644
--- a/Grid/lattice/Lattice_rng.h
+++ b/Grid/lattice/Lattice_rng.h
@@ -32,8 +32,9 @@
 #include <random>
 
 #ifdef RNG_SITMO
-#include <Grid/sitmo_rng/sitmo_prng_engine.hpp>
+#include <Grid/random/sitmo_prng_engine.hpp>
 #endif 
+#include <Grid/random/gaussian.h>
 
 #if defined(RNG_SITMO)
 #define RNG_FAST_DISCARD
@@ -142,8 +143,8 @@ public:
 
   std::vector<RngEngine>                             _generators;
   std::vector<std::uniform_real_distribution<RealD> > _uniform;
-  std::vector<std::normal_distribution<RealD> >       _gaussian;
-  std::vector<std::discrete_distribution<int32_t> >   _bernoulli;
+  std::vector<Grid::gaussian_distribution<RealD> >       _gaussian;
+  //  std::vector<std::discrete_distribution<int32_t> >   _bernoulli;
   std::vector<std::uniform_int_distribution<uint32_t> > _uid;
 
   ///////////////////////
@@ -243,8 +244,8 @@ public:
   GridSerialRNG() : GridRNGbase() {
     _generators.resize(1);
     _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
-    _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
-    _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
+    _gaussian.resize(1,gaussian_distribution<RealD>(0.0,1.0) );
+    //    _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
     _uid.resize(1,std::uniform_int_distribution<uint32_t>() );
   }
 
@@ -357,8 +358,8 @@ public:
 
     _generators.resize(_vol);
     _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
-    _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
-    _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
+    _gaussian.resize(_vol,gaussian_distribution<RealD>(0.0,1.0) );
+    //    _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
     _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
   }
 
@@ -515,11 +516,11 @@ public:
 
 template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l)   { rng.fill(l,rng._uniform);  }
 template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); }
-template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}
+//template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}
 
 template <class sobj> inline void random(GridSerialRNG &rng,sobj &l)   { rng.fill(l,rng._uniform  ); }
 template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); }
-template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }
+//template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }
 
 NAMESPACE_END(Grid);
 #endif

From 8637a9512a151656ee7ef395eddabbabe9a80a40 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Wed, 5 May 2021 17:34:54 -0400
Subject: [PATCH 36/41] Freeze Gaussian implementation

---
 Grid/random/gaussian.h | 200 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 200 insertions(+)
 create mode 100644 Grid/random/gaussian.h

diff --git a/Grid/random/gaussian.h b/Grid/random/gaussian.h
new file mode 100644
index 00000000..4b15baa2
--- /dev/null
+++ b/Grid/random/gaussian.h
@@ -0,0 +1,200 @@
+// -*- C++ -*-
+//===--------------------------- random -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Peter Boyle: Taken from libc++ in Clang/LLVM.
+// Reason is that libstdc++ and clang differ in their return order in the normal_distribution / box mueller type step.
+// standardise on one and call it "gaussian_distribution".
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <cmath>
+#include <type_traits>
+#include <initializer_list>
+#include <limits>
+#include <algorithm>
+#include <numeric>
+#include <vector>
+#include <string>
+#include <istream>
+#include <ostream>
+#include <random>
+
+// normal_distribution -> gaussian distribution
+namespace Grid {
+
+template<class _RealType = double>
+class  gaussian_distribution
+{
+public:
+    // types
+    typedef _RealType result_type;
+
+    class param_type
+    {
+        result_type __mean_;
+        result_type __stddev_;
+    public:
+        typedef gaussian_distribution distribution_type;
+
+        strong_inline
+        explicit param_type(result_type __mean = 0, result_type __stddev = 1)
+            : __mean_(__mean), __stddev_(__stddev) {}
+
+        strong_inline
+        result_type mean() const {return __mean_;}
+        strong_inline
+        result_type stddev() const {return __stddev_;}
+
+        friend strong_inline
+            bool operator==(const param_type& __x, const param_type& __y)
+            {return __x.__mean_ == __y.__mean_ && __x.__stddev_ == __y.__stddev_;}
+        friend strong_inline
+            bool operator!=(const param_type& __x, const param_type& __y)
+            {return !(__x == __y);}
+    };
+
+private:
+    param_type __p_;
+    result_type _V_;
+    bool _V_hot_;
+
+public:
+    // constructors and reset functions
+    strong_inline
+    explicit gaussian_distribution(result_type __mean = 0, result_type __stddev = 1)
+        : __p_(param_type(__mean, __stddev)), _V_hot_(false) {}
+    strong_inline
+    explicit gaussian_distribution(const param_type& __p)
+        : __p_(__p), _V_hot_(false) {}
+    strong_inline
+    void reset() {_V_hot_ = false;}
+
+    // generating functions
+    template<class _URNG>
+        strong_inline
+        result_type operator()(_URNG& __g)
+        {return (*this)(__g, __p_);}
+    template<class _URNG> result_type operator()(_URNG& __g, const param_type& __p);
+
+    // property functions
+    strong_inline
+    result_type mean() const {return __p_.mean();}
+    strong_inline
+    result_type stddev() const {return __p_.stddev();}
+
+    strong_inline
+    param_type param() const {return __p_;}
+    strong_inline
+    void param(const param_type& __p) {__p_ = __p;}
+
+    strong_inline
+    result_type min() const {return -std::numeric_limits<result_type>::infinity();}
+    strong_inline
+    result_type max() const {return std::numeric_limits<result_type>::infinity();}
+
+    friend strong_inline
+        bool operator==(const gaussian_distribution& __x,
+                        const gaussian_distribution& __y)
+        {return __x.__p_ == __y.__p_ && __x._V_hot_ == __y._V_hot_ &&
+                (!__x._V_hot_ || __x._V_ == __y._V_);}
+    friend strong_inline
+        bool operator!=(const gaussian_distribution& __x,
+                        const gaussian_distribution& __y)
+        {return !(__x == __y);}
+
+    template <class _CharT, class _Traits, class _RT>
+    friend
+    std::basic_ostream<_CharT, _Traits>&
+    operator<<(std::basic_ostream<_CharT, _Traits>& __os,
+               const gaussian_distribution<_RT>& __x);
+
+    template <class _CharT, class _Traits, class _RT>
+    friend
+    std::basic_istream<_CharT, _Traits>&
+    operator>>(std::basic_istream<_CharT, _Traits>& __is,
+               gaussian_distribution<_RT>& __x);
+};
+
+template <class _RealType>
+template<class _URNG>
+_RealType
+gaussian_distribution<_RealType>::operator()(_URNG& __g, const param_type& __p)
+{
+    result_type _Up;
+    if (_V_hot_)
+    {
+        _V_hot_ = false;
+        _Up = _V_;
+    }
+    else
+    {
+        std::uniform_real_distribution<result_type> _Uni(-1, 1);
+        result_type __u;
+        result_type __v;
+        result_type __s;
+        do
+        {
+            __u = _Uni(__g);
+            __v = _Uni(__g);
+            __s = __u * __u + __v * __v;
+        } while (__s > 1 || __s == 0);
+        result_type _Fp = _VSTD::sqrt(-2 * _VSTD::log(__s) / __s);
+        _V_ = __v * _Fp;
+        _V_hot_ = true;
+        _Up = __u * _Fp;
+    }
+    return _Up * __p.stddev() + __p.mean();
+}
+
+template <class _CharT, class _Traits, class _RT>
+std::basic_ostream<_CharT, _Traits>&
+operator<<(std::basic_ostream<_CharT, _Traits>& __os,
+           const gaussian_distribution<_RT>& __x)
+{
+    auto __save_flags = __os.flags();
+    __os.flags(std::ios_base::dec | std::ios_base::left | std::ios_base::fixed |
+               std::ios_base::scientific);
+    _CharT __sp = __os.widen(' ');
+    __os.fill(__sp);
+    __os << __x.mean() << __sp << __x.stddev() << __sp << __x._V_hot_;
+    if (__x._V_hot_)
+        __os << __sp << __x._V_;
+    __os.flags(__save_flags);
+    return __os;
+}
+
+template <class _CharT, class _Traits, class _RT>
+std::basic_istream<_CharT, _Traits>&
+operator>>(std::basic_istream<_CharT, _Traits>& __is,
+           gaussian_distribution<_RT>& __x)
+{
+    typedef gaussian_distribution<_RT> _Eng;
+    typedef typename _Eng::result_type result_type;
+    typedef typename _Eng::param_type param_type;
+    auto __save_flags = __is.flags();
+    __is.flags(std::ios_base::dec | std::ios_base::skipws);
+    result_type __mean;
+    result_type __stddev;
+    result_type _Vp = 0;
+    bool _V_hot = false;
+    __is >> __mean >> __stddev >> _V_hot;
+    if (_V_hot)
+        __is >> _Vp;
+    if (!__is.fail())
+    {
+        __x.param(param_type(__mean, __stddev));
+        __x._V_hot_ = _V_hot;
+        __x._V_ = _Vp;
+    }
+    __is.flags(__save_flags);
+    return __is;
+}
+}

From 3b734ee397d94c4aaa3c64cd51336ea440047426 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Wed, 5 May 2021 17:36:19 -0400
Subject: [PATCH 37/41] two point function example

---
 tests/core/Test_fft.cc | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/tests/core/Test_fft.cc b/tests/core/Test_fft.cc
index 212b1a35..4ae0c23b 100644
--- a/tests/core/Test_fft.cc
+++ b/tests/core/Test_fft.cc
@@ -299,12 +299,12 @@ int main (int argc, char ** argv)
     SpinColourVectorD ferm; gaussian(sRNG,ferm);
     pokeSite(ferm,src,point);
 
-    const int Ls=32;
+    const int Ls=64;
     GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,&GRID);
     GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,&GRID);
 
-    RealD mass=0.01;
-    RealD M5  =0.8;
+    RealD mass=1.0;
+    RealD M5  =0.99;
     DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5);
 
     // Momentum space prop
@@ -353,6 +353,12 @@ int main (int argc, char ** argv)
     std::cout << " Taking difference" <<std::endl;
     std::cout << "Ddwf result4 "<<norm2(result4)<<std::endl;
     std::cout << "Ddwf ref     "<<norm2(ref)<<std::endl;
+    auto twopoint = localInnerProduct(result4,result4);
+    std::vector<TComplex> pion_prop;
+    sliceSum(twopoint,pion_prop,Nd-1);
+    for(int t=0;t<pion_prop.size();t++){
+      std::cout << "Pion_prop["<<t<<"]="<<pion_prop[t]<<std::endl;
+    }
     
     diff = ref - result4;
     std::cout << "result - ref     "<<norm2(diff)<<std::endl;
@@ -383,7 +389,7 @@ int main (int argc, char ** argv)
     GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,&GRID);
     GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,&GRID);
 
-    RealD mass=0.01;
+    RealD mass=1.0;
     RealD M5  =0.8;
 
     OverlapWilsonCayleyTanhFermionD Dov(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5,1.0);

From c2ee2b5fd1e4edc101cdd63b1fa2bfd50939d92c Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Wed, 5 May 2021 17:36:38 -0400
Subject: [PATCH 38/41] Random chhanges

---
 Grid/{sitmo_rng => random}/README                | 0
 Grid/{sitmo_rng => random}/sitmo_prng_engine.hpp | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename Grid/{sitmo_rng => random}/README (100%)
 rename Grid/{sitmo_rng => random}/sitmo_prng_engine.hpp (100%)

diff --git a/Grid/sitmo_rng/README b/Grid/random/README
similarity index 100%
rename from Grid/sitmo_rng/README
rename to Grid/random/README
diff --git a/Grid/sitmo_rng/sitmo_prng_engine.hpp b/Grid/random/sitmo_prng_engine.hpp
similarity index 100%
rename from Grid/sitmo_rng/sitmo_prng_engine.hpp
rename to Grid/random/sitmo_prng_engine.hpp

From f1b8ba45e7076db3555e07a74b62e98e2f2c48b8 Mon Sep 17 00:00:00 2001
From: Quadro <Peter Boyle>
Date: Wed, 5 May 2021 19:54:21 -0400
Subject: [PATCH 39/41] Warning on GCC suppress unrelated to my code so why
 doesn't it shut up about its ABI fix

---
 Grid/DisableWarnings.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Grid/DisableWarnings.h b/Grid/DisableWarnings.h
index 4bd1edd0..a725fc76 100644
--- a/Grid/DisableWarnings.h
+++ b/Grid/DisableWarnings.h
@@ -34,6 +34,9 @@ directory
 
 #if defined __GNUC__ && __GNUC__>=6
 #pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
+#if defined __GNUC__ 
+#pragma GCC diagnostic ignored "-Wpsabi"
 #endif
 
  //disables and intel compiler specific warning (in json.hpp)

From f0e9a5299f7750f8afb159857d730fc5c3510cd6 Mon Sep 17 00:00:00 2001
From: Quadro <Peter Boyle>
Date: Wed, 5 May 2021 19:55:34 -0400
Subject: [PATCH 40/41] Happy on GCC I hope

---
 Grid/random/gaussian.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/random/gaussian.h b/Grid/random/gaussian.h
index 4b15baa2..e29d00ac 100644
--- a/Grid/random/gaussian.h
+++ b/Grid/random/gaussian.h
@@ -146,7 +146,7 @@ gaussian_distribution<_RealType>::operator()(_URNG& __g, const param_type& __p)
             __v = _Uni(__g);
             __s = __u * __u + __v * __v;
         } while (__s > 1 || __s == 0);
-        result_type _Fp = _VSTD::sqrt(-2 * _VSTD::log(__s) / __s);
+        result_type _Fp = std::sqrt(-2 * std::log(__s) / __s);
         _V_ = __v * _Fp;
         _V_hot_ = true;
         _Up = __u * _Fp;

From 1c70d8c4d94c2de4fa24045efa1b552530139ea0 Mon Sep 17 00:00:00 2001
From: Quadro <Peter Boyle>
Date: Wed, 5 May 2021 19:56:04 -0400
Subject: [PATCH 41/41] Warning remove

---
 Grid/threads/Accelerator.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 56b85c72..6f5b1066 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -192,7 +192,7 @@ inline void *acceleratorAllocShared(size_t bytes)
   auto err = cudaMallocManaged((void **)&ptr,bytes);
   if( err != cudaSuccess ) {
     ptr = (void *) NULL;
-    printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
+    printf(" cudaMallocManaged failed for %lu %s \n",bytes,cudaGetErrorString(err));
   }
   return ptr;
 };
@@ -202,7 +202,7 @@ inline void *acceleratorAllocDevice(size_t bytes)
   auto err = cudaMalloc((void **)&ptr,bytes);
   if( err != cudaSuccess ) {
     ptr = (void *) NULL;
-    printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
+    printf(" cudaMalloc failed for %lu %s \n",bytes,cudaGetErrorString(err));
   }
   return ptr;
 };