Merge branch 'feature/gparity_HMC' into feature/ddhmc

2025-08-26 07:57:09 +01:00 · 2021-05-06 20:55:03 +02:00
parent cff884929c 1c70d8c4d9
commit f776a7fe4a
51 changed files with 4291 additions and 426 deletions
--- a/Grid/DisableWarnings.h
+++ b/Grid/DisableWarnings.h
@@ -34,6 +34,9 @@ directory

 #if defined __GNUC__ && __GNUC__>=6
 #pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
+#if defined __GNUC__ 
+#pragma GCC diagnostic ignored "-Wpsabi"
 #endif

 //disables and intel compiler specific warning (in json.hpp)
--- a/Grid/GridQCDcore.h
+++ b/Grid/GridQCDcore.h
@@ -36,6 +36,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridCore.h>
 #include <Grid/qcd/QCD.h>
 #include <Grid/qcd/spin/Spin.h>
+#include <Grid/qcd/gparity/Gparity.h>
 #include <Grid/qcd/utils/Utils.h>
 #include <Grid/qcd/representations/Representations.h>
 NAMESPACE_CHECK(GridQCDCore);
--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@@ -54,6 +54,7 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
+#include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
 #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
--- a/Grid/algorithms/approx/Chebyshev.h
+++ b/Grid/algorithms/approx/Chebyshev.h
@@ -292,6 +292,7 @@ public:
 template<class Field>
 class ChebyshevLanczos : public Chebyshev<Field> {
 private:
+
  std::vector<RealD> Coeffs;
  int order;
  RealD alpha;
--- a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
@@ -48,6 +48,7 @@ NAMESPACE_BEGIN(Grid);
    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
+    RealD TrueResidual;

    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
    LinearFunction<FieldF> *guesser;
@@ -79,6 +80,11 @@ NAMESPACE_BEGIN(Grid);
    RealD stop = src_norm * Tolerance*Tolerance;

    GridBase* DoublePrecGrid = src_d_in.Grid();
+
+    //Generate precision change workspaces
+    precisionChangeWorkspace wk_dp_from_sp(DoublePrecGrid, SinglePrecGrid);
+    precisionChangeWorkspace wk_sp_from_dp(SinglePrecGrid, DoublePrecGrid);
+
    FieldD tmp_d(DoublePrecGrid);
    tmp_d.Checkerboard() = cb;
    
@@ -119,7 +125,7 @@ NAMESPACE_BEGIN(Grid);
      while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??

      PrecChangeTimer.Start();
-      precisionChange(src_f, src_d);
+      precisionChange(src_f, src_d, wk_sp_from_dp);
      PrecChangeTimer.Stop();
      
      sol_f = Zero();
@@ -137,7 +143,7 @@ NAMESPACE_BEGIN(Grid);
      
      //Convert sol back to double and add to double prec solution
      PrecChangeTimer.Start();
-      precisionChange(tmp_d, sol_f);
+      precisionChange(tmp_d, sol_f, wk_dp_from_sp);
      PrecChangeTimer.Stop();
      
      axpy(sol_d, 1.0, tmp_d, sol_d);
@@ -149,6 +155,7 @@ NAMESPACE_BEGIN(Grid);
    ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
    CG_d(Linop_d, src_d_in, sol_d);
    TotalFinalStepIterations = CG_d.IterationsToComplete;
+    TrueResidual = CG_d.TrueResidual;

    TotalTimer.Stop();
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
@@ -52,7 +52,7 @@ public:
  MultiShiftFunction shifts;
  std::vector<RealD> TrueResidualShift;

-  ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : 
+  ConjugateGradientMultiShift(Integer maxit, const MultiShiftFunction &_shifts) : 
    MaxIterations(maxit),
    shifts(_shifts)
  { 
@@ -182,6 +182,9 @@ public:
    for(int s=0;s<nshift;s++) {
      axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
    }
+
+    std::cout << GridLogIterative << "ConjugateGradientMultiShift: initial rn (|src|^2) =" << rn << " qq (|MdagM src|^2) =" << qq << " d ( dot(src, [MdagM + m_0]src) ) =" << d << " c=" << c << std::endl;
+    
  
  ///////////////////////////////////////
  // Timers
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
@@ -0,0 +1,411 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Christopher Kelly <ckelly@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
+#define GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
+
+NAMESPACE_BEGIN(Grid);
+
+//CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision. 
+//The residual is stored in single precision, but the search directions and solutions are stored in double precision. 
+//Every ReliableUpdateFreq iterations the residual is recomputed (corrected) in double precision. 
+    
+//For safety, a final (mixed-precision) CG is applied per shift to clean up if the target residual was not reached
+
+//Linop to add shift to input linop, used in cleanup CG
+namespace ConjugateGradientMultiShiftMixedPrecSupport{ //helper types used by ConjugateGradientMultiShiftMixedPrec
+template<typename Field>
+class ShiftedLinop: public LinearOperatorBase<Field>{ //wraps a linop so that HermOp applies (base HermOp + shift * identity)
+public:
+  LinearOperatorBase<Field> &linop_base; //wrapped operator; held by reference, so it must outlive this object
+  RealD shift; //real constant; shift*in is added to the base HermOp result
+
+  ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift): linop_base(_linop_base), shift(_shift){}
+
+  void OpDiag (const Field &in, Field &out){ assert(0); } //not implemented by this wrapper
+  void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); } //not implemented by this wrapper
+  void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); } //not implemented by this wrapper
+  
+  void Op     (const Field &in, Field &out){ assert(0); } //not implemented: only the Hermitian form is provided
+  void AdjOp  (const Field &in, Field &out){ assert(0); } //not implemented: only the Hermitian form is provided
+
+  void HermOp(const Field &in, Field &out){ //out = linop_base.HermOp(in) + shift*in
+    linop_base.HermOp(in, out);
+    axpy(out, shift, in, out); //out = shift*in + out
+  }    
+
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ //as HermOp, also returning n1 = Re<in,out> and n2 = |out|^2
+    HermOp(in,out);
+    ComplexD dot = innerProduct(in,out);
+    n1=real(dot);
+    n2=norm2(out);
+  }
+};
+};
+
+
+template<class FieldD, class FieldF,
+	 typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, //FieldD must report precision 2 (double, going by the field names)
+	 typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> //FieldF must report precision 1 (single, going by the field names)
+class ConjugateGradientMultiShiftMixedPrec : public OperatorMultiFunction<FieldD>,
+					     public OperatorFunction<FieldD>
+{
+public:                                                
+
+  using OperatorFunction<FieldD>::operator();
+
+  RealD   Tolerance; //not read by this solver: per-shift tolerances are taken from shifts.tolerances
+  Integer MaxIterations; //upper bound on multi-shift iterations; also passed as both iteration limits of the cleanup solver
+  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
+  std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
+  int verbose;
+  MultiShiftFunction shifts; //private copy of the shift description (order, poles, tolerances, residues, norm)
+  std::vector<RealD> TrueResidualShift; //per-shift true residual measured after the solve
+
+  int ReliableUpdateFreq; //number of iterations between reliable updates (0 disables reliable updates)
+
+  GridBase* SinglePrecGrid; //Grid for single-precision fields
+  LinearOperatorBase<FieldF> &Linop_f; //single precision
+
+  ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
+				       GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
+				       int _ReliableUpdateFreq
+				       ) : //initialisers below follow member declaration order; the iteration count starts at a defined value
+    Tolerance(0.), MaxIterations(maxit), IterationsToComplete(0), shifts(_shifts), ReliableUpdateFreq(_ReliableUpdateFreq), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f)
+  { 
+    verbose=1;
+    IterationsToCompleteShift.resize(_shifts.order);
+    TrueResidualShift.resize(_shifts.order);
+  }
+
+  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi) //single-output form: per-shift solutions are temporaries, only the combined psi is returned
+  {
+    GridBase *grid = src.Grid();
+    int nshift = shifts.order;
+    std::vector<FieldD> results(nshift,grid);
+    (*this)(Linop,src,results,psi);
+  }
+  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi) //solves every shift into results, then psi = norm*src + sum_i residues[i]*results[i]
+  {
+    int nshift = shifts.order;
+
+    (*this)(Linop,src,results);
+  
+    psi = shifts.norm*src;
+    for(int i=0;i<nshift;i++){
+      psi = psi + shifts.residues[i]*results[i];
+    }
+
+    return;
+  }
+
+  void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d) //core solver: on return psi_d[s] approximates the solution of (HermOp + poles[s]) psi = src_d
+  { 
+    GridBase *DoublePrecGrid = src_d.Grid();
+    precisionChangeWorkspace wk_f_from_d(SinglePrecGrid, DoublePrecGrid); //reused by every double->single conversion below
+    precisionChangeWorkspace wk_d_from_f(DoublePrecGrid, SinglePrecGrid); //reused by every single->double conversion below
+
+    ////////////////////////////////////////////////////////////////////////
+    // Convenience references to the info stored in "MultiShiftFunction"
+    ////////////////////////////////////////////////////////////////////////
+    int nshift = shifts.order;
+
+    std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
+    std::vector<RealD> &mresidual(shifts.tolerances);
+    std::vector<RealD> alpha(nshift,1.0);
+
+    //Double precision search directions
+    FieldD p_d(DoublePrecGrid);
+    std::vector<FieldD> ps_d(nshift, DoublePrecGrid);// Search directions (double precision)
+
+    FieldD tmp_d(DoublePrecGrid);
+    FieldD r_d(DoublePrecGrid);
+    FieldD mmp_d(DoublePrecGrid);
+
+    assert(psi_d.size()==nshift);
+    assert(mass.size()==nshift);
+    assert(mresidual.size()==nshift);
+  
+    // dynamic sized arrays on stack; 2d is a pain with vector (NB: variable-length arrays are a compiler extension, not standard C++)
+    RealD  bs[nshift];
+    RealD  rsq[nshift];
+    RealD  z[nshift][2];
+    int     converged[nshift];
+  
+    const int       primary =0;
+  
+    //Primary shift fields CG iteration
+    RealD a,b,c,d;
+    RealD cp,bp,qq; //prev
+  
+    // Matrix mult fields
+    FieldF r_f(SinglePrecGrid);
+    FieldF p_f(SinglePrecGrid);
+    FieldF tmp_f(SinglePrecGrid);
+    FieldF mmp_f(SinglePrecGrid);
+    FieldF src_f(SinglePrecGrid);
+    precisionChange(src_f, src_d, wk_f_from_d);
+
+    // Check lightest mass
+    for(int s=0;s<nshift;s++){
+      assert( mass[s]>= mass[primary] );
+      converged[s]=0;
+    }
+  
+    // Wire guess to zero
+    // Residuals "r" are src
+    // First search direction "p" is also src
+    cp = norm2(src_d);
+
+    // Handle trivial case of zero src.
+    if( cp == 0. ){
+      for(int s=0;s<nshift;s++){
+	psi_d[s] = Zero();
+	IterationsToCompleteShift[s] = 1;
+	TrueResidualShift[s] = 0.;
+      }
+      return;
+    }
+
+    for(int s=0;s<nshift;s++){
+      rsq[s] = cp * mresidual[s] * mresidual[s]; //target squared residual for shift s: |src|^2 * tolerance^2
+      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
+      ps_d[s] = src_d;
+    }
+    // r and p for primary
+    r_f=src_f; //residual maintained in single
+    p_f=src_f;
+    p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
+  
+    //MdagM+m[0]
+    Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
+    axpy(mmp_f,mass[0],p_f,mmp_f);
+    RealD rn = norm2(p_f);
+    d += rn*mass[0];
+
+    b = -cp /d;
+  
+    // Set up the various shift variables
+    int       iz=0;
+    z[0][1-iz] = 1.0;
+    z[0][iz]   = 1.0;
+    bs[0]      = b;
+    for(int s=1;s<nshift;s++){
+      z[s][1-iz] = 1.0;
+      z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0]));
+      bs[s]      = b*z[s][iz]; 
+    }
+  
+    // r += b[0] A.p[0]
+    // c= norm(r)
+    c=axpy_norm(r_f,b,mmp_f,r_f);
+  
+    for(int s=0;s<nshift;s++) {
+      axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
+    }
+  
+    ///////////////////////////////////////
+    // Timers
+    ///////////////////////////////////////
+    GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
+
+    SolverTimer.Start();
+  
+    // Iteration loop
+    int k;
+  
+    for (k=1;k<=MaxIterations;k++){    
+      a = c /cp;
+
+      //Update double precision search direction by residual
+      PrecChangeTimer.Start();
+      precisionChange(r_d, r_f, wk_d_from_f);
+      PrecChangeTimer.Stop();
+
+      AXPYTimer.Start();
+      axpy(p_d,a,p_d,r_d); 
+
+      for(int s=0;s<nshift;s++){
+	if ( ! converged[s] ) { 
+	  if (s==0){
+	    axpy(ps_d[s],a,ps_d[s],r_d);
+	  } else{
+	    RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
+	    axpby(ps_d[s],z[s][iz],as,r_d,ps_d[s]);
+	  }
+	}
+      }
+      AXPYTimer.Stop();
+
+      PrecChangeTimer.Start();
+      precisionChange(p_f, p_d, wk_f_from_d); //get back single prec search direction for linop
+      PrecChangeTimer.Stop();
+
+      cp=c;
+      MatrixTimer.Start();  
+      Linop_f.HermOp(p_f,mmp_f); 
+      d=real(innerProduct(p_f,mmp_f));    
+      MatrixTimer.Stop();  
+
+      AXPYTimer.Start();
+      axpy(mmp_f,mass[0],p_f,mmp_f);
+      AXPYTimer.Stop();
+      RealD rn = norm2(p_f);
+      d += rn*mass[0];
+    
+      bp=b;
+      b=-cp/d;
+    
+      // Toggle the recurrence history
+      bs[0] = b;
+      iz = 1-iz;
+      ShiftTimer.Start();
+      for(int s=1;s<nshift;s++){
+	if((!converged[s])){
+	  RealD z0 = z[s][1-iz];
+	  RealD z1 = z[s][iz];
+	  z[s][iz] = z0*z1*bp
+	    / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b)); 
+	  bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
+	}
+      }
+      ShiftTimer.Stop();
+
+      //Update double precision solutions
+      AXPYTimer.Start();
+      for(int s=0;s<nshift;s++){
+	int ss = s;
+	if( (!converged[s]) ) { 
+	  axpy(psi_d[ss],-bs[s]*alpha[s],ps_d[s],psi_d[ss]);
+	}
+      }
+
+      //Perform reliable update if necessary; otherwise update residual from single-prec mmp
+      RealD c_f = axpy_norm(r_f,b,mmp_f,r_f);
+      AXPYTimer.Stop();
+
+      c = c_f;
+
+      if(ReliableUpdateFreq != 0 && k % ReliableUpdateFreq == 0){ //a frequency of 0 means "never"; the guard also avoids a modulo-by-zero
+	//Replace r with true residual
+	MatrixTimer.Start();  
+	Linop_d.HermOp(psi_d[0],mmp_d); 
+	MatrixTimer.Stop();  
+
+	AXPYTimer.Start();
+	axpy(mmp_d,mass[0],psi_d[0],mmp_d);
+
+	RealD c_d = axpy_norm(r_d, -1.0, mmp_d, src_d); //r_d = src_d - (HermOp + mass[0]) psi_d[0], c_d = |r_d|^2
+	AXPYTimer.Stop();
+
+	std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_f <<" with |r|^2 = "<<c_d<<std::endl;
+	
+	PrecChangeTimer.Start();
+	precisionChange(r_f, r_d, wk_f_from_d);
+	PrecChangeTimer.Stop();
+	c = c_d;
+      }
+    
+      // Convergence checks
+      int all_converged = 1;
+      for(int s=0;s<nshift;s++){
+      
+	if ( (!converged[s]) ){
+	  IterationsToCompleteShift[s] = k;
+	
+	  RealD css  = c * z[s][iz]* z[s][iz];
+	
+	  if(css<rsq[s]){
+	    if ( ! converged[s] )
+	      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
+	    converged[s]=1;
+	  } else {
+	    all_converged=0;
+	  }
+
+	}
+      }
+
+      if ( all_converged ){
+
+	SolverTimer.Stop();
+	std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
+	std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
+      
+	// Check answers 
+	for(int s=0; s < nshift; s++) { 
+	  Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
+	  axpy(tmp_d,mass[s],psi_d[s],mmp_d);
+	  axpy(r_d,-alpha[s],src_d,tmp_d);
+	  RealD rn = norm2(r_d);
+	  RealD cn = norm2(src_d);
+	  TrueResidualShift[s] = std::sqrt(rn/cn);
+	  std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
+
+	  //If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
+	  if(rn >= rsq[s]){
+	    CleanupTimer.Start();
+	    std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: performing cleanup step for shift " << s << std::endl;
+
+	    //Setup linear operators for final cleanup
+	    ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
+	    ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
+					       
+	    MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d); 
+	    cg(src_d, psi_d[s]); //psi_d[s] from the multi-shift solve is passed in as the starting solution
+	    
+	    TrueResidualShift[s] = cg.TrueResidual;
+	    CleanupTimer.Stop();
+	  }
+	}
+
+	std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrec: Time Breakdown for body"<<std::endl;
+	std::cout << GridLogMessage << "\tSolver    " << SolverTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\t\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\t\tMatrix    " << MatrixTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\t\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
+
+	IterationsToComplete = k;	
+
+	return;
+      }
+
+   
+    }
+    // Not converged within MaxIterations: report it and return without aborting; psi_d holds the last iterates
+    std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
+    IterationsToComplete = MaxIterations; //record the iterations performed instead of leaving a stale value; the assert(0) stays disabled
+  }
+
+};
+NAMESPACE_END(Grid);
+#endif
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@@ -46,3 +46,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_unary.h>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
+#include <Grid/lattice/Lattice_crc.h>
--- a/Grid/lattice/Lattice_crc.h
+++ b/Grid/lattice/Lattice_crc.h
@@ -0,0 +1,42 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_crc.h
+
+    Copyright (C) 2021
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+template<class vobj> uint32_t crc(Lattice<vobj> & buf) //CRC32 fingerprint of the lattice data as seen through a CPU read view
+{
+  autoView( buf_v , buf, CpuRead);
+  return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites()); //checksums oSites() objects of sizeof(vobj) bytes starting at element 0; ::crc32 is not declared in this header (presumably zlib's - confirm it is included first)
+}
+
+#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl; // prints file, line, the argument text and crc(U); NOTE: the expansion already ends in ';'
+
+NAMESPACE_END(Grid);
+
+
--- a/Grid/lattice/Lattice_rng.h
+++ b/Grid/lattice/Lattice_rng.h
@@ -32,8 +32,9 @@
 #include <random>

 #ifdef RNG_SITMO
-#include <Grid/sitmo_rng/sitmo_prng_engine.hpp>
+#include <Grid/random/sitmo_prng_engine.hpp>
 #endif 
+#include <Grid/random/gaussian.h>

 #if defined(RNG_SITMO)
 #define RNG_FAST_DISCARD
@@ -142,8 +143,8 @@ public:

  std::vector<RngEngine>                             _generators;
  std::vector<std::uniform_real_distribution<RealD> > _uniform;
-  std::vector<std::normal_distribution<RealD> >       _gaussian;
-  std::vector<std::discrete_distribution<int32_t> >   _bernoulli;
+  std::vector<Grid::gaussian_distribution<RealD> >       _gaussian;
+  //  std::vector<std::discrete_distribution<int32_t> >   _bernoulli;
  std::vector<std::uniform_int_distribution<uint32_t> > _uid;

  ///////////////////////
@@ -243,8 +244,8 @@ public:
  GridSerialRNG() : GridRNGbase() {
    _generators.resize(1);
    _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
-    _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
-    _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
+    _gaussian.resize(1,gaussian_distribution<RealD>(0.0,1.0) );
+    //    _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
    _uid.resize(1,std::uniform_int_distribution<uint32_t>() );
  }

@@ -357,8 +358,8 @@ public:

    _generators.resize(_vol);
    _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
-    _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
-    _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
+    _gaussian.resize(_vol,gaussian_distribution<RealD>(0.0,1.0) );
+    //    _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
    _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
  }

@@ -515,11 +516,11 @@ public:

 template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l)   { rng.fill(l,rng._uniform);  }
 template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); }
-template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}
+//template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}

 template <class sobj> inline void random(GridSerialRNG &rng,sobj &l)   { rng.fill(l,rng._uniform  ); }
 template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); }
-template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }
+//template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }

 NAMESPACE_END(Grid);
 #endif
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -777,7 +777,7 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int


 template<class vobj>
-void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
+void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
 {
  typedef typename vobj::scalar_object sobj;

@@ -1002,54 +1002,96 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
  });
 }

-//Convert a Lattice from one precision to another
-template<class VobjOut, class VobjIn>
-void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
-{
-  assert(out.Grid()->Nd() == in.Grid()->Nd());
-  for(int d=0;d<out.Grid()->Nd();d++){
-    assert(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
-  }
-  out.Checkerboard() = in.Checkerboard();
-  GridBase *in_grid=in.Grid();
-  GridBase *out_grid = out.Grid();
-
-  typedef typename VobjOut::scalar_object SobjOut;
-  typedef typename VobjIn::scalar_object SobjIn;
-
-  int ndim = out.Grid()->Nd();
-  int out_nsimd = out_grid->Nsimd();
-    
-  std::vector<Coordinate > out_icoor(out_nsimd);
-      
-  for(int lane=0; lane < out_nsimd; lane++){
-    out_icoor[lane].resize(ndim);
-    out_grid->iCoorFromIindex(out_icoor[lane], lane);
-  }
-        
-  std::vector<SobjOut> in_slex_conv(in_grid->lSites());
-  unvectorizeToLexOrdArray(in_slex_conv, in);
-    
-  autoView( out_v , out, CpuWrite);
-  thread_for(out_oidx,out_grid->oSites(),{
-    Coordinate out_ocoor(ndim);
-    out_grid->oCoorFromOindex(out_ocoor, out_oidx);
-
-    ExtractPointerArray<SobjOut> ptrs(out_nsimd);      
-
-    Coordinate lcoor(out_grid->Nd());
-      
-    for(int lane=0; lane < out_nsimd; lane++){
-      for(int mu=0;mu<ndim;mu++)
-	lcoor[mu] = out_ocoor[mu] + out_grid->_rdimensions[mu]*out_icoor[lane][mu];
-	
-      int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions);
-      ptrs[lane] = &in_slex_conv[llex];
+//Workspace for precisionChange: precomputes, for every (outer site, SIMD lane) of the output grid, the (outer site, lane) of the input grid with the same local coordinate, and keeps that table in device memory for reuse on subsequent calls
+class precisionChangeWorkspace{
+  std::pair<Integer,Integer>* fmap_device; //device pointer to lSites() (in_oidx,in_lane) pairs indexed by out_lane + Nsimd_out*out_oidx; allocated in the constructor, freed in the destructor
+public:
+  precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid){ //both grids must have the same Nd and full dimensions (asserted below)
+    //Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
+    assert(out_grid->Nd() == in_grid->Nd());
+    for(int d=0;d<out_grid->Nd();d++){
+      assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
    }
-    merge(out_v[out_oidx], ptrs, 0);
-  });
+    int Nsimd_out = out_grid->Nsimd();
+
+    std::vector<Coordinate> out_icorrs(out_grid->Nsimd()); //reuse these
+    for(int lane=0; lane < out_grid->Nsimd(); lane++)
+      out_grid->iCoorFromIindex(out_icorrs[lane], lane);
+  
+    std::vector<std::pair<Integer,Integer> > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd
+    thread_for(out_oidx,out_grid->oSites(),{
+	Coordinate out_ocorr; 
+	out_grid->oCoorFromOindex(out_ocorr, out_oidx);
+      
+	Coordinate lcorr; //the local coordinate (common to both in and out as full coordinate)
+	for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
+	  out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr);
+	
+	  //int in_oidx = in_grid->oIndex(lcorr), in_lane = in_grid->iIndex(lcorr);
+	  //Note oIndex and OcorrFromOindex (and same for iIndex) are not inverse for checkerboarded lattice, the former coordinates being defined on the full lattice and the latter on the reduced lattice
+	  //Until this is fixed we need to circumvent the problem locally. Here I will use the coordinates defined on the reduced lattice for simplicity
+	  int in_oidx = 0, in_lane = 0;
+	  for(int d=0;d<in_grid->_ndimension;d++){
+	    in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] ); //outer-site index from the coordinate modulo the reduced dimensions
+	    in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] ); //lane index from the coordinate divided by the reduced dimensions
+	  }
+	  fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair<Integer,Integer>( in_oidx, in_lane );
+	}
+      });
+
+    //Copy the map to the device (if we had a way to tell if an accelerator is in use we could avoid this copy for CPU-only machines)
+    size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair<Integer,Integer>);
+    fmap_device = (std::pair<Integer,Integer>*)acceleratorAllocDevice(fmap_bytes);
+    acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes); 
+  }
+
+  //Prevent moving or copying (the destructor frees fmap_device, so a copy would free it twice)
+  precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete;
+  precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete;
+  precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete;
+  precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete;
+  
+  std::pair<Integer,Integer> const* getMap() const{ return fmap_device; } //non-owning device pointer; valid only while this workspace is alive
+
+  ~precisionChangeWorkspace(){
+    acceleratorFreeDevice(fmap_device); //release the buffer allocated in the constructor
+  }
+};
+
+
+//Convert a lattice of one precision to another. The input workspace contains the mapping data; it is not checked here, so it must have been built as precisionChangeWorkspace(out.Grid(), in.Grid()).
+template<class VobjOut, class VobjIn>
+void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
+  static_assert( std::is_same<typename VobjOut::DoublePrecision, typename VobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
+
+  out.Checkerboard() = in.Checkerboard(); //the output inherits the input's checkerboard
+  constexpr int Nsimd_out = VobjOut::Nsimd();
+
+  std::pair<Integer,Integer> const* fmap_device = workspace.getMap(); //raw pointer captured by the kernel below
+
+  //Do the copy/precision change
+  autoView( out_v , out, AcceleratorWrite);
+  autoView( in_v , in, AcceleratorRead);
+
+  accelerator_for(out_oidx, out.Grid()->oSites(), 1,{
+      std::pair<Integer,Integer> const* fmap_osite = fmap_device + out_oidx*Nsimd_out; //the Nsimd_out map entries for this output site
+      for(int out_lane=0; out_lane < Nsimd_out; out_lane++){      
+	int in_oidx = fmap_osite[out_lane].first; //input outer-site index
+	int in_lane = fmap_osite[out_lane].second; //input SIMD lane
+	copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane);
+      }
+    });
 }

+//Convert a Lattice from one precision to another (convenience overload without a caller-supplied workspace)
+//Generate the workspace in place; if multiple calls with the same mapping are performed, consider pregenerating the workspace and reusing
+template<class VobjOut, class VobjIn>
+void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
+  precisionChangeWorkspace workspace(out.Grid(), in.Grid()); //builds the site/lane map and copies it to the device on every call; freed when this function returns
+  precisionChange(out, in, workspace);
+}
+
+
 ////////////////////////////////////////////////////////////////////////////////
 // Communicate between grids
 ////////////////////////////////////////////////////////////////////////////////
--- a/Grid/log/Log.cc
+++ b/Grid/log/Log.cc
@@ -69,6 +69,7 @@ GridLogger GridLogDebug  (1, "Debug", GridLogColours, "PURPLE");
 GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
 GridLogger GridLogIterative  (1, "Iterative", GridLogColours, "BLUE");
 GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
+GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE");

 void GridLogConfigure(std::vector<std::string> &logstreams) {
  GridLogError.Active(0);
@@ -79,6 +80,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
  GridLogPerformance.Active(0);
  GridLogIntegrator.Active(1);
  GridLogColours.Active(0);
+  GridLogHMC.Active(1);

  for (int i = 0; i < logstreams.size(); i++) {
    if (logstreams[i] == std::string("Error"))       GridLogError.Active(1);
@@ -87,7 +89,8 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
    if (logstreams[i] == std::string("Iterative"))   GridLogIterative.Active(1);
    if (logstreams[i] == std::string("Debug"))       GridLogDebug.Active(1);
    if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
-    if (logstreams[i] == std::string("Integrator"))  GridLogIntegrator.Active(1);
+    if (logstreams[i] == std::string("NoIntegrator"))  GridLogIntegrator.Active(0);
+    if (logstreams[i] == std::string("NoHMC"))         GridLogHMC.Active(0);
    if (logstreams[i] == std::string("Colours"))     GridLogColours.Active(1);
  }
 }
--- a/Grid/log/Log.h
+++ b/Grid/log/Log.h
@@ -182,6 +182,7 @@ extern GridLogger GridLogDebug  ;
 extern GridLogger GridLogPerformance;
 extern GridLogger GridLogIterative  ;
 extern GridLogger GridLogIntegrator  ;
+extern GridLogger GridLogHMC;
 extern Colours    GridLogColours;

 std::string demangle(const char* name) ;
--- a/Grid/parallelIO/NerscIO.h
+++ b/Grid/parallelIO/NerscIO.h
@@ -39,9 +39,11 @@ using namespace Grid;
 ////////////////////////////////////////////////////////////////////////////////
 class NerscIO : public BinaryIO { 
 public:
-
  typedef Lattice<vLorentzColourMatrixD> GaugeField;

+  // Accessor for a function-local static flag (default true): while it is true the header plaquette is asserted to agree with the computed value to 1.0e-5; assign false through the returned reference to skip that assert
+  static bool & exitOnReadPlaquetteMismatch(){ static bool v=true; return v; }
+
  static inline void truncate(std::string file){
    std::ofstream fout(file,std::ios::out);
  }
@@ -198,7 +200,7 @@ public:
      std::cerr << " nersc_csum  " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
      exit(0);
    }
-    assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
+    if(exitOnReadPlaquetteMismatch()) assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
    assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
    assert(nersc_csum == header.checksum );
      
--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@@ -63,6 +63,7 @@ static constexpr int Ngp=2; // gparity index range
 #define ColourIndex  (2)
 #define SpinIndex    (1)
 #define LorentzIndex (0)
+#define GparityFlavourIndex (0)

 // Also should make these a named enum type
 static constexpr int DaggerNo=0;
@@ -87,6 +88,8 @@ template<typename T> struct isCoarsened {
 template <typename T> using IfCoarsened    = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
 template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;

+const int GparityFlavourTensorIndex = 3; //TensorLevel counts from the bottom!
+
 // ChrisK very keen to add extra space for Gparity doubling.
 //
 // Also add domain wall index, in a way where Wilson operator 
@@ -110,8 +113,10 @@ template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVec
    template<typename vtype> using iSpinColourSpinColourMatrix  = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;


+template<typename vtype> using iGparityFlavourVector                = iVector<iScalar<iScalar<vtype> >, Ngp>;
 template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
 template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
+template<typename vtype> using iGparityFlavourMatrix = iMatrix<iScalar<iScalar<vtype> >, Ngp>;

 // Spin matrix
 typedef iSpinMatrix<Complex  >          SpinMatrix;
@@ -176,6 +181,16 @@ typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
 typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
 typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;

+//G-parity flavour matrix
+typedef iGparityFlavourMatrix<Complex> GparityFlavourMatrix;
+typedef iGparityFlavourMatrix<ComplexF> GparityFlavourMatrixF;
+typedef iGparityFlavourMatrix<ComplexD> GparityFlavourMatrixD;
+
+typedef iGparityFlavourMatrix<vComplex> vGparityFlavourMatrix;
+typedef iGparityFlavourMatrix<vComplexF> vGparityFlavourMatrixF;
+typedef iGparityFlavourMatrix<vComplexD> vGparityFlavourMatrixD;
+
+
 // Spin vector
 typedef iSpinVector<Complex >           SpinVector;
 typedef iSpinVector<ComplexF>           SpinVectorF;
@@ -220,6 +235,16 @@ typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
 typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
 typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
 typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
+
+//G-parity flavour vector
+typedef iGparityFlavourVector<Complex >         GparityFlavourVector;
+typedef iGparityFlavourVector<ComplexF>         GparityFlavourVectorF;
+typedef iGparityFlavourVector<ComplexD>         GparityFlavourVectorD;
+
+typedef iGparityFlavourVector<vComplex >         vGparityFlavourVector;
+typedef iGparityFlavourVector<vComplexF>         vGparityFlavourVectorF;
+typedef iGparityFlavourVector<vComplexD>         vGparityFlavourVectorD;
+
    
 // singlets
 typedef iSinglet<Complex >         TComplex;     // FIXME This is painful. Tensor singlet complex type.
--- a/Grid/qcd/action/ActionParams.h
+++ b/Grid/qcd/action/ActionParams.h
@@ -36,7 +36,8 @@ NAMESPACE_BEGIN(Grid);

 // These can move into a params header and be given MacroMagic serialisation
 struct GparityWilsonImplParams {
-  Coordinate twists;
+  Coordinate twists; //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
+                     //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
  GparityWilsonImplParams() : twists(Nd, 0) {};
 };
  
@@ -85,6 +86,50 @@ struct StaggeredImplParams {
        precision(_precision),
        BoundsCheckFreq(_BoundsCheckFreq){};
  };
+
+
+  /*Action parameters for the generalized rational action
+    The approximation is for (M^dag M)^{1/inv_pow}
+    where inv_pow is the denominator of the fractional power.
+    Default inv_pow=2 for square root, making this equivalent to 
+    the OneFlavourRational action
+  */
+    struct RationalActionParams : Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(RationalActionParams, 
+				    int, inv_pow, 
+				    RealD, lo, //low eigenvalue bound of rational approx
+				    RealD, hi, //high eigenvalue bound of rational approx
+				    int,   MaxIter,  //maximum iterations in msCG
+				    RealD, action_tolerance,  //msCG tolerance in action evaluation
+				    int,   action_degree, //degree of the rational approx used in action evaluation
+				    RealD, md_tolerance,  //msCG tolerance in MD integration
+				    int,   md_degree, //degree of the rational approx used in MD integration
+				    int,   precision, //precision of floating point arithmetic
+				    int,   BoundsCheckFreq); //frequency the approximation is tested (with Metropolis degree/tolerance); 0 disables the check
+  // constructor: every argument has a default and is copied verbatim into the member of the same name (inv_pow=2 is the square-root case)
+  RationalActionParams(int _inv_pow = 2,
+		       RealD _lo      = 0.0, 
+		       RealD _hi      = 1.0, 
+		       int _maxit     = 1000,
+		       RealD _action_tolerance      = 1.0e-8, 
+		       int _action_degree    = 10,
+		       RealD _md_tolerance      = 1.0e-8, 
+		       int _md_degree    = 10,
+		       int _precision = 64,
+		       int _BoundsCheckFreq=20)
+    : inv_pow(_inv_pow), 
+      lo(_lo),
+      hi(_hi),
+      MaxIter(_maxit),
+      action_tolerance(_action_tolerance),
+      action_degree(_action_degree),
+      md_tolerance(_md_tolerance),
+      md_degree(_md_degree),
+      precision(_precision),
+      BoundsCheckFreq(_BoundsCheckFreq){};
+  };
+
+
  
 NAMESPACE_END(Grid);

--- a/Grid/qcd/action/fermion/GparityWilsonImpl.h
+++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h
@@ -30,6 +30,18 @@ directory

 NAMESPACE_BEGIN(Grid);

+/*
+  Policy implementation for G-parity boundary conditions
+
+  Rather than treating the gauge field as a flavored field, the Grid implementation of G-parity treats the gauge field as a regular
+  field with complex conjugate boundary conditions. In order to ensure the second flavor interacts with the conjugate links and the first
+  with the regular links we overload the functionality of doubleStore, whose purpose is to store the gauge field and the barrel-shifted gauge field
+  to avoid communicating links when applying the Dirac operator, such that the double-stored field contains also a flavor index which maps to
+  either the link or the conjugate link. This flavored field is then used by multLink to apply the correct link to a spinor.
+
+  Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
+  mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
+ */
 template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
 class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
 public:
@@ -113,7 +125,7 @@ public:
    || ((distance== 1)&&(icoor[direction]==1))
    || ((distance==-1)&&(icoor[direction]==0));

-    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world
+    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu] && mmu < Nd-1; //only if we are going around the world in a spatial direction

    //Apply the links
    int f_upper = permute_lane ? 1 : 0;
@@ -139,10 +151,10 @@ public:
    assert((distance == 1) || (distance == -1));  // nearest neighbour stencil hard code
    assert((sl == 1) || (sl == 2));

-    if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
-
+    //If this site is a global boundary site, perform the G-parity flavor twist
+    if ( mmu < Nd-1 && SE->_around_the_world && St.parameters.twists[mmu] ) {
      if ( sl == 2 ) {
-       
+	//Only do the twist for lanes on the edge of the physical node
 	ExtractBuffer<sobj> vals(Nsimd);

 	extract(chi,vals);
@@ -197,6 +209,19 @@ public:
    reg = memory;
  }

+
+  //Site by site, write 'poke_f0' into flavor 0 and 'poke_f1' into flavor 1 of component mu of the doubled gauge field Uds; no other component of Uds is written here
+  inline void pokeGparityDoubledGaugeField(DoubledGaugeField &Uds, const GaugeLinkField &poke_f0, const GaugeLinkField &poke_f1, const int mu){
+    autoView(poke_f0_v, poke_f0, CpuRead);
+    autoView(poke_f1_v, poke_f1, CpuRead);
+    autoView(Uds_v, Uds, CpuWrite); //mu indexes the doubled field, so it may exceed Nd-1 (DoubleStore passes mu+4 for the barrel-shifted links)
+    thread_foreach(ss,poke_f0_v,{
+	Uds_v[ss](0)(mu) = poke_f0_v[ss]();
+	Uds_v[ss](1)(mu) = poke_f1_v[ss]();
+      });
+  }
+    
+
  inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
  {
    conformable(Uds.Grid(),GaugeGrid);
@@ -207,14 +232,19 @@ public:
    GaugeLinkField Uconj(GaugeGrid);
   
    Lattice<iScalar<vInteger> > coor(GaugeGrid);
-        
-    for(int mu=0;mu<Nd;mu++){
-          
-      LatticeCoordinate(coor,mu);
+
+    //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
+    //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs        
+    for(int mu=0;mu<Nd-1;mu++){
+
+      if( Params.twists[mu] ){
+	LatticeCoordinate(coor,mu);
+      }
          
      U     = PeekIndex<LorentzIndex>(Umu,mu);
      Uconj = conjugate(U);
     
+      // Implement the isospin rotation sign on the boundary between f=1 and f=0
      // This phase could come from a simple bc 1,1,-1,1 ..
      int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
      if ( Params.twists[mu] ) { 
@@ -229,7 +259,7 @@ public:
 	thread_foreach(ss,U_v,{
 	    Uds_v[ss](0)(mu) = U_v[ss]();
 	    Uds_v[ss](1)(mu) = Uconj_v[ss]();
-	  });
+	});
      }
          
      U     = adj(Cshift(U    ,mu,-1));      // correct except for spanning the boundary
@@ -260,6 +290,38 @@ public:
        });
      }
    }
+
+    { //periodic / antiperiodic temporal BCs
+      int mu = Nd-1;
+      int L   = GaugeGrid->GlobalDimensions()[mu];
+      int Lmu = L - 1;
+
+      LatticeCoordinate(coor, mu);
+
+      U = PeekIndex<LorentzIndex>(Umu, mu); //Get t-directed links
+      
+      GaugeLinkField *Upoke = &U;
+
+      if(Params.twists[mu]){ //antiperiodic
+	Utmp =  where(coor == Lmu, -U, U);
+	Upoke = &Utmp;
+      }
+    
+      Uconj = conjugate(*Upoke); //second flavor interacts with conjugate links      
+      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu);
+
+      //Get the barrel-shifted field
+      Utmp = adj(Cshift(U, mu, -1)); //is a forward shift!
+      Upoke = &Utmp;
+
+      if(Params.twists[mu]){
+	U = where(coor == 0, -Utmp, Utmp);  //boundary phase
+	Upoke = &U;
+      }
+      
+      Uconj = conjugate(*Upoke);
+      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4);
+    }
  }
      
  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
@@ -298,28 +360,48 @@ public:
  inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
    assert(0);
  }
-  
+ 
  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
-
-    int Ls = Btilde.Grid()->_fdimensions[0];
-        
-    GaugeLinkField tmp(mat.Grid());
-    tmp = Zero();
+    int Ls=Btilde.Grid()->_fdimensions[0];
+    
    {
-      autoView( tmp_v , tmp, CpuWrite);
-      autoView( Atilde_v , Atilde, CpuRead);
-      autoView( Btilde_v , Btilde, CpuRead);
-      thread_for(ss,tmp.Grid()->oSites(),{
-	  for (int s = 0; s < Ls; s++) {
-	    int sF = s + Ls * ss;
-	    auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
-	    tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
-	  }
-	});
+      GridBase *GaugeGrid = mat.Grid();
+      Lattice<iScalar<vInteger> > coor(GaugeGrid);
+
+      if( Params.twists[mu] ){
+	LatticeCoordinate(coor,mu);
+      }
+
+      autoView( mat_v , mat, AcceleratorWrite);
+      autoView( Btilde_v , Btilde, AcceleratorRead);
+      autoView( Atilde_v , Atilde, AcceleratorRead);
+      accelerator_for(sss,mat.Grid()->oSites(), FermionField::vector_type::Nsimd(),{	  
+  	  int sU=sss;
+  	  typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
+  	  ColorMatrixType sum;
+  	  zeroit(sum);
+  	  for(int s=0;s<Ls;s++){
+  	    int sF = s+Ls*sU;
+  	    for(int spn=0;spn<Ns;spn++){ //sum over spin
+	      //Flavor 0
+  	      auto bb = coalescedRead(Btilde_v[sF](0)(spn) ); //color vector
+  	      auto aa = coalescedRead(Atilde_v[sF](0)(spn) );
+  	      sum = sum + outerProduct(bb,aa);
+
+  	      //Flavor 1
+  	      bb = coalescedRead(Btilde_v[sF](1)(spn) );
+  	      aa = coalescedRead(Atilde_v[sF](1)(spn) );
+  	      sum = sum + conjugate(outerProduct(bb,aa));
+  	    }
+  	  }	    
+  	  coalescedWrite(mat_v[sU](mu)(), sum);
+  	});
    }
-    PokeIndex<LorentzIndex>(mat, tmp, mu);
-    return;
  }
+
+
+  
+
  
 };

--- a/Grid/qcd/action/pseudofermion/Bounds.h
+++ b/Grid/qcd/action/pseudofermion/Bounds.h
@@ -40,13 +40,66 @@ NAMESPACE_BEGIN(Grid);
      X=X-Y;
      RealD Nd = norm2(X);
      std::cout << "************************* "<<std::endl;
-      std::cout << " noise                         = "<<Nx<<std::endl;
-      std::cout << " (MdagM^-1/2)^2  noise         = "<<Nz<<std::endl;
-      std::cout << " MdagM (MdagM^-1/2)^2  noise   = "<<Ny<<std::endl;
-      std::cout << " noise - MdagM (MdagM^-1/2)^2  noise   = "<<Nd<<std::endl;
+      std::cout << " | noise |^2                         = "<<Nx<<std::endl;
+      std::cout << " | (MdagM^-1/2)^2  noise |^2         = "<<Nz<<std::endl;
+      std::cout << " | MdagM (MdagM^-1/2)^2  noise |^2   = "<<Ny<<std::endl;
+      std::cout << " | noise - MdagM (MdagM^-1/2)^2  noise |^2  = "<<Nd<<std::endl;
+      std::cout << " | noise - MdagM (MdagM^-1/2)^2  noise|/|noise| = " << std::sqrt(Nd/Nx) << std::endl;
      std::cout << "************************* "<<std::endl;
      assert( (std::sqrt(Nd/Nx)<tol) && " InverseSqrtBoundsCheck ");
    }

+    /* For HermOp = M^dag M, test the rational approximation ApproxNegPow ~ x^{-1/inv_pow} on the noise
+       vector X = GaussNoise: apply it inv_pow times, Z = [ HermOp^{-1/inv_pow} ]^{inv_pow} X ~ HermOp^{-1} X,
+       then assert   |X - HermOp Z| / |X|  < tol.   Each application is a multishift CG limited to MaxIter.
+       inv_pow must be >= 1 (asserted): with no solve there would be no result to compare against.
+    */
+    template<class Field> void InversePowerBoundsCheck(int inv_pow,
+						       int MaxIter,double tol,
+						       LinearOperatorBase<Field> &HermOp,
+						       Field &GaussNoise,
+						       MultiShiftFunction &ApproxNegPow) 
+    {
+      GridBase *FermionGrid = GaussNoise.Grid();
+      assert( (inv_pow >= 1) && " InversePowerBoundsCheck: inv_pow must be >= 1 "); //otherwise the loop below never writes *out
+      Field X(FermionGrid);
+      Field Y(FermionGrid);
+      Field Z(FermionGrid);
+
+      Field tmp1(FermionGrid), tmp2(FermionGrid);
+
+      X=GaussNoise;
+      RealD Nx = norm2(X);
+
+      ConjugateGradientMultiShift<Field> msCG(MaxIter,ApproxNegPow);
+
+      tmp1 = X;
+      //ping-pong between tmp1 and tmp2; the swap is skipped after the last solve so 'out' holds the final result
+      Field* in = &tmp1;
+      Field* out = &tmp2;
+      for(int i=0;i<inv_pow;i++){ //apply  [ Hermop^{-1/inv_pow}  ]^{inv_pow} X =   HermOp^{-1} X
+	msCG(HermOp, *in, *out); //backwards conventions!
+	if(i!=inv_pow-1) std::swap(in, out);
+      }
+      Z = *out;
+
+      RealD Nz = norm2(Z);
+
+      HermOp.HermOp(Z,Y);
+      RealD Ny = norm2(Y);
+
+      X=X-Y;
+      RealD Nd = norm2(X);
+      std::cout << "************************* "<<std::endl;
+      std::cout << " | noise |^2                         = "<<Nx<<std::endl;
+      std::cout << " | (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2        = "<<Nz<<std::endl;
+      std::cout << " | MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2   = "<<Ny<<std::endl;
+      std::cout << " | noise - MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2  = "<<Nd<<std::endl;
+      std::cout << " | noise - MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |/| noise |  = "<<std::sqrt(Nd/Nx)<<std::endl;
+      std::cout << "************************* "<<std::endl;
+      assert( (std::sqrt(Nd/Nx)<tol) && " InversePowerBoundsCheck ");
+    }
+
+
 NAMESPACE_END(Grid);

--- a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
@@ -0,0 +1,372 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
+
+    Copyright (C) 2015
+
+    Author: Christopher Kelly <ckelly@bnl.gov>
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_H
+#define QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_H
+
+NAMESPACE_BEGIN(Grid);
+
+    /////////////////////////////////////////////////////////
+    // Generic rational approximation for ratios of operators
+    /////////////////////////////////////////////////////////
+
+    /* S_f = -log( det(  [M^dag M]/[V^dag V] )^{1/inv_pow}  )
+           = chi^dag ( [M^dag M]/[V^dag V] )^{-1/inv_pow} chi\
+	   = chi^dag ( [V^dag V]^{-1/2} [M^dag M] [V^dag V]^{-1/2} )^{-1/inv_pow} chi\
+	   = chi^dag [V^dag V]^{1/(2*inv_pow)} [M^dag M]^{-1/inv_pow} [V^dag V]^{1/(2*inv_pow)} chi\
+
+	   S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+    
+       BIG WARNING:	   
+       Here V^dag V is referred to in this code as the "numerator" operator and M^dag M is the *denominator* operator.
+       this refers to their position in the pseudofermion action, which is the *inverse* of what appears in the determinant
+       Thus for DWF the numerator operator is the Pauli-Villars operator
+
+       Here P/Q \sim R_{1/(2*inv_pow)}  ~ (V^dagV)^{1/(2*inv_pow)}  
+       Here N/D \sim R_{-1/inv_pow} ~ (M^dagM)^{-1/inv_pow}  
+    */
+      
+    template<class Impl>
+    class GeneralEvenOddRatioRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
+    public:
+
+      INHERIT_IMPL_TYPES(Impl);
+
+      typedef RationalActionParams Params;
+      Params param;
+
+      //For action evaluation
+      MultiShiftFunction ApproxPowerAction   ;  //rational approx for X^{1/inv_pow}
+      MultiShiftFunction ApproxNegPowerAction;  //rational approx for X^{-1/inv_pow}
+      MultiShiftFunction ApproxHalfPowerAction;   //rational approx for X^{1/(2*inv_pow)}
+      MultiShiftFunction ApproxNegHalfPowerAction; //rational approx for X^{-1/(2*inv_pow)}
+
+      //For the MD integration
+      MultiShiftFunction ApproxPowerMD   ;  //rational approx for X^{1/inv_pow}
+      MultiShiftFunction ApproxNegPowerMD;  //rational approx for X^{-1/inv_pow}
+      MultiShiftFunction ApproxHalfPowerMD;   //rational approx for X^{1/(2*inv_pow)}
+      MultiShiftFunction ApproxNegHalfPowerMD; //rational approx for X^{-1/(2*inv_pow)}
+
+    private:
+     
+      FermionOperator<Impl> & NumOp;// the basic operator
+      FermionOperator<Impl> & DenOp;// the basic operator
+      FermionField PhiEven; // the pseudo fermion field for this trajectory
+      FermionField PhiOdd; // the pseudo fermion field for this trajectory
+
+      //Fill 'approx' with the rational approximation to x^{1/inv_pow} and 'approx_inv' with that to x^{-1/inv_pow}; both have degree approx_degree and come from the same Remez solve
+      //CG_tolerance is only stored in the two MultiShiftFunctions for later use by the multi-shift solver; a warning is logged if the Remez error returned here is larger than it
+      static void generateApprox(MultiShiftFunction &approx, MultiShiftFunction &approx_inv, int inv_pow, int approx_degree, double CG_tolerance, AlgRemez &remez){
+	std::cout<<GridLogMessage << "Generating degree "<< approx_degree<<" approximation for x^(1/" << inv_pow << ")"<<std::endl;
+	double error = remez.generateApprox(approx_degree,1,inv_pow);	
+	if(error > CG_tolerance)
+	  std::cout<<GridLogMessage << "WARNING: Remez approximation has a larger error " << error << " than the CG tolerance " << CG_tolerance << "! Try increasing the number of poles" << std::endl;
+	
+	approx.Init(remez, CG_tolerance,false); //false: the x^{1/inv_pow} form
+	approx_inv.Init(remez, CG_tolerance,true); //true: the inverse, x^{-1/inv_pow}
+      }
+
+
+    protected:
+      static constexpr bool Numerator = true;
+      static constexpr bool Denominator = false;
+
+      //Apply the multishift CG for 'approx' to 'in', using the Schur operator built from NumOp (numerator==true) or DenOp (numerator==false); only the summed result 'out' is returned. Virtual so derived classes can override the multishift CG
+      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionField &in, FermionField &out){
+	SchurDifferentiableOperator<Impl> schurOp(numerator ? NumOp : DenOp);
+	ConjugateGradientMultiShift<FermionField> msCG(MaxIter, approx);
+	msCG(schurOp,in, out);
+      }
+      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionField &in, std::vector<FermionField> &out_elems, FermionField &out){ //same solve as the two-field overload, additionally filling out_elems (deriv sizes it with one field per pole of 'approx')
+	SchurDifferentiableOperator<Impl> schurOp(numerator ? NumOp : DenOp); //NumOp when numerator==true, DenOp otherwise
+	ConjugateGradientMultiShift<FermionField> msCG(MaxIter, approx);
+	msCG(schurOp,in, out_elems, out);
+      }
+      //Import the gauge field U into both the numerator and the denominator operator; called at the start of refresh, S and deriv. Virtual so derived classes can override the gauge import
+      virtual void ImportGauge(const GaugeField &U){
+	NumOp.ImportGauge(U);
+	DenOp.ImportGauge(U);
+      }
+      
+    public:
+
+      GeneralEvenOddRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
+						     FermionOperator<Impl>  &_DenOp, 
+						     const Params & p
+						     ) : //initialisers are listed in member declaration order (param is declared first)
+	param(p),
+	NumOp(_NumOp), 
+	DenOp(_DenOp), 
+	PhiEven(_NumOp.FermionRedBlackGrid()),
+	PhiOdd (_NumOp.FermionRedBlackGrid())
+      {
+	std::cout<<GridLogMessage << action_name() << " initialize: starting" << std::endl;
+	AlgRemez remez(param.lo,param.hi,param.precision);
+	//The single Remez generator over [lo,hi] is reused for all the approximations built below
+	//Generate approximations for action eval
+	generateApprox(ApproxPowerAction, ApproxNegPowerAction, param.inv_pow, param.action_degree, param.action_tolerance, remez);
+	generateApprox(ApproxHalfPowerAction, ApproxNegHalfPowerAction, 2*param.inv_pow, param.action_degree, param.action_tolerance, remez);
+
+	//Generate approximations for MD
+	if(param.md_degree != param.action_degree){ //note the CG tolerance is unrelated to the stopping condition of the Remez algorithm
+	  generateApprox(ApproxPowerMD, ApproxNegPowerMD, param.inv_pow, param.md_degree, param.md_tolerance, remez);
+	  generateApprox(ApproxHalfPowerMD, ApproxNegHalfPowerMD, 2*param.inv_pow, param.md_degree, param.md_tolerance, remez);
+	}else{
+	  std::cout<<GridLogMessage << "Using same rational approximations for MD as for action evaluation" << std::endl;
+	  ApproxPowerMD = ApproxPowerAction; 
+	  ApproxNegPowerMD = ApproxNegPowerAction;
+	  for(int i=0;i<ApproxPowerMD.tolerances.size();i++)
+	    ApproxNegPowerMD.tolerances[i] = ApproxPowerMD.tolerances[i] = param.md_tolerance; //used for multishift
+
+	  ApproxHalfPowerMD = ApproxHalfPowerAction;
+	  ApproxNegHalfPowerMD = ApproxNegHalfPowerAction;
+	  for(int i=0;i<ApproxHalfPowerMD.tolerances.size();i++) //bound taken from the half-power function that is actually indexed
+	    ApproxNegHalfPowerMD.tolerances[i] = ApproxHalfPowerMD.tolerances[i] = param.md_tolerance;
+	}
+
+	std::cout<<GridLogMessage << action_name() << " initialize: complete" << std::endl;
+      };
+
+      virtual std::string action_name(){return "GeneralEvenOddRatioRationalPseudoFermionAction";} //name used as the prefix of this class's log messages
+
+      virtual std::string LogParameters(){ //returns a multi-line string, one "[action_name] label : value" line per parameter
+	std::stringstream sstream; //NOTE(review): BoundsCheckFreq is the one RationalActionParams member not reported below
+	sstream << GridLogMessage << "["<<action_name()<<"] Power              : 1/" << param.inv_pow <<  std::endl;
+	sstream << GridLogMessage << "["<<action_name()<<"] Low                :" << param.lo <<  std::endl;
+	sstream << GridLogMessage << "["<<action_name()<<"] High               :" << param.hi <<  std::endl;
+	sstream << GridLogMessage << "["<<action_name()<<"] Max iterations     :" << param.MaxIter <<  std::endl;
+	sstream << GridLogMessage << "["<<action_name()<<"] Tolerance (Action) :" << param.action_tolerance <<  std::endl;
+	sstream << GridLogMessage << "["<<action_name()<<"] Degree (Action)    :" << param.action_degree <<  std::endl;
+	sstream << GridLogMessage << "["<<action_name()<<"] Tolerance (MD)     :" << param.md_tolerance <<  std::endl;
+	sstream << GridLogMessage << "["<<action_name()<<"] Degree (MD)        :" << param.md_degree <<  std::endl;
+	sstream << GridLogMessage << "["<<action_name()<<"] Precision          :" << param.precision <<  std::endl;
+	return sstream.str();
+      }
+
+      //Read-only access to the odd-checkerboard pseudofermion field PhiOdd (refresh sets PhiEven to zero)
+      const FermionField &getPhiOdd() const{ return PhiOdd; }
+      
+      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
+	std::cout<<GridLogMessage << action_name() << " refresh: starting" << std::endl;
+	FermionField eta(NumOp.FermionGrid());	
+	// Draw eta on the full fermion grid from pRNG (sRNG is not used here), then hand it to refresh(U,eta)
+	// P(eta) \propto e^{- eta^dag eta}
+	//	
+	// The gaussian function draws from  P(x) \propto e^{- x^2 / 2 }    [i.e. sigma=1]
+	// Thus eta = x/sqrt{2} = x * sqrt(1/2)
+	RealD scale = std::sqrt(0.5);
+	gaussian(pRNG,eta);	eta=eta*scale;
+
+	refresh(U,eta);
+      }
+
+      //Allow for manual specification of random field for testing
+      void refresh(const GaugeField &U, const FermionField &eta) {
+	// Builds PhiOdd from the odd checkerboard of eta on gauge field U; PhiEven is set to zero
+	// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+	//
+	// P(phi) = e^{- phi^dag (VdagV)^1/(2*inv_pow) (MdagM)^-1/inv_pow (VdagV)^1/(2*inv_pow) phi}
+	//        = e^{- phi^dag  (VdagV)^1/(2*inv_pow) (MdagM)^-1/(2*inv_pow) (MdagM)^-1/(2*inv_pow)  (VdagV)^1/(2*inv_pow) phi}
+	//
+	// Phi =  (VdagV)^-1/(2*inv_pow) (MdagM)^{1/(2*inv_pow)} eta 
+	// (etaEven is extracted below but only etaOdd enters the solves)
+	std::cout<<GridLogMessage << action_name() << " refresh: starting" << std::endl;
+
+	FermionField etaOdd (NumOp.FermionRedBlackGrid());
+	FermionField etaEven(NumOp.FermionRedBlackGrid());
+	FermionField     tmp(NumOp.FermionRedBlackGrid());
+
+	pickCheckerboard(Even,etaEven,eta);
+	pickCheckerboard(Odd,etaOdd,eta);
+
+	ImportGauge(U);
+
+	// MdagM^1/(2*inv_pow) eta
+	std::cout<<GridLogMessage << action_name() << " refresh: doing (M^dag M)^{1/" << 2*param.inv_pow << "} eta" << std::endl;
+	multiShiftInverse(Denominator, ApproxHalfPowerAction, param.MaxIter, etaOdd, tmp);
+
+	// VdagV^-1/(2*inv_pow) MdagM^1/(2*inv_pow) eta
+	std::cout<<GridLogMessage << action_name() << " refresh: doing (V^dag V)^{-1/" << 2*param.inv_pow << "} ( (M^dag M)^{1/" << 2*param.inv_pow << "} eta)" << std::endl;
+	multiShiftInverse(Numerator, ApproxNegHalfPowerAction, param.MaxIter, tmp, PhiOdd);
+		
+	assert(NumOp.ConstEE() == 1);
+	assert(DenOp.ConstEE() == 1);
+	PhiEven = Zero();
+	std::cout<<GridLogMessage << action_name() << " refresh: complete" << std::endl;
+      };
+
+      //////////////////////////////////////////////////////
+      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+      //////////////////////////////////////////////////////
+      virtual RealD S(const GaugeField &U) { //returns |Y|^2 with Y = (MdagM)^{-1/(2*inv_pow)} (VdagV)^{1/(2*inv_pow)} PhiOdd, using the action-evaluation approximations
+	std::cout<<GridLogMessage << action_name() << " compute action: starting" << std::endl;
+	ImportGauge(U);
+
+	FermionField X(NumOp.FermionRedBlackGrid());
+	FermionField Y(NumOp.FermionRedBlackGrid());
+
+	// VdagV^1/(2*inv_pow) Phi
+	std::cout<<GridLogMessage << action_name() << " compute action: doing (V^dag V)^{1/" << 2*param.inv_pow << "} Phi" << std::endl;
+	multiShiftInverse(Numerator, ApproxHalfPowerAction, param.MaxIter, PhiOdd,X);
+
+	// MdagM^-1/(2*inv_pow) VdagV^1/(2*inv_pow) Phi
+	std::cout<<GridLogMessage << action_name() << " compute action: doing (M^dag M)^{-1/" << 2*param.inv_pow << "} ( (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
+	multiShiftInverse(Denominator, ApproxNegHalfPowerAction, param.MaxIter, X,Y);
+
+	// Randomly apply rational bounds checks.
+	int rcheck = rand(); //NOTE(review): rcheck is never read below; this call only advances the rand() sequence
+	auto grid = NumOp.FermionGrid();
+        auto r=rand();
+        grid->Broadcast(0,r); //every rank takes the value drawn on rank 0 so the decision below is the same everywhere
+
+	if ( param.BoundsCheckFreq != 0 && (r % param.BoundsCheckFreq)==0 ) { 
+	  std::cout<<GridLogMessage << action_name() << " compute action: doing bounds check" << std::endl;
+	  FermionField gauss(NumOp.FermionRedBlackGrid());
+	  gauss = PhiOdd; //the current pseudofermion field is used as the test vector
+	  SchurDifferentiableOperator<Impl> MdagM(DenOp);
+	  std::cout<<GridLogMessage << action_name() << " compute action: checking high bounds" << std::endl;
+	  HighBoundCheck(MdagM,gauss,param.hi);
+	  std::cout<<GridLogMessage << action_name() << " compute action: full approximation" << std::endl;
+	  InversePowerBoundsCheck(param.inv_pow,param.MaxIter,param.action_tolerance*100,MdagM,gauss,ApproxNegPowerAction);
+	  std::cout<<GridLogMessage << action_name() << " compute action: bounds check complete" << std::endl;
+	}
+
+	//  Phidag VdagV^1/(2*inv_pow) MdagM^-1/(2*inv_pow)  MdagM^-1/(2*inv_pow) VdagV^1/(2*inv_pow) Phi
+	RealD action = norm2(Y);
+	std::cout<<GridLogMessage << action_name() << " compute action: complete" << std::endl;
+
+	return action;
+      };
+
+      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+      //
+      // Here, M is some 5D operator and V is the Pauli-Villars field
+      // N and D make up the rat. poly of the M term, and P and Q make up the rat. poly of the V term
+      //
+      // Need  
+      // dS_f/dU =  chi^dag d[P/Q]  N/D   P/Q  chi 
+      //         +  chi^dag   P/Q d[N/D]  P/Q  chi 
+      //         +  chi^dag   P/Q   N/D d[P/Q] chi 
+      //
+      // P/Q is expressed as partial fraction expansion: 
+      // 
+      //           a0 + \sum_k ak/(V^dagV + bk) 
+      //  
+      // d[P/Q] is then  
+      //
+      //          \sum_k -ak [V^dagV+bk]^{-1}  [ dV^dag V + V^dag dV ] [V^dag V + bk]^{-1} 
+      //  
+      // and similar for N/D. 
+      // 
+      // Need   
+      //       MpvPhi_k   = [Vdag V + bk]^{-1} chi  
+      //       MpvPhi     = {a0 +  \sum_k ak [Vdag V + bk]^{-1} }chi   
+      //   
+      //       MfMpvPhi_k = [MdagM+bk]^{-1} MpvPhi  
+      //       MfMpvPhi   = {a0 +  \sum_k ak [Mdag M + bk]^{-1} } MpvPhi
+      // 
+      //       MpvMfMpvPhi_k = [Vdag V + bk]^{-1} MfMpvchi   
+      //  
+
+      virtual void deriv(const GaugeField &U,GaugeField & dSdU) { //overwrites dSdU with the sum of the force terms (1)-(3) listed below, using the MD approximations
+	std::cout<<GridLogMessage << action_name() << " deriv: starting" << std::endl;
+	const int n_f  = ApproxNegPowerMD.poles.size();
+	const int n_pv = ApproxHalfPowerMD.poles.size();
+
+	std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionRedBlackGrid());
+	std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionRedBlackGrid());
+	std::vector<FermionField> MfMpvPhi_k   (n_f ,NumOp.FermionRedBlackGrid());
+
+	FermionField      MpvPhi(NumOp.FermionRedBlackGrid());
+	FermionField    MfMpvPhi(NumOp.FermionRedBlackGrid());
+	FermionField MpvMfMpvPhi(NumOp.FermionRedBlackGrid());
+	FermionField           Y(NumOp.FermionRedBlackGrid());
+
+	GaugeField   tmp(NumOp.GaugeGrid());
+
+	ImportGauge(U);
+
+	std::cout<<GridLogMessage << action_name() << " deriv: doing (V^dag V)^{1/" << 2*param.inv_pow << "} Phi" << std::endl;
+	multiShiftInverse(Numerator, ApproxHalfPowerMD, param.MaxIter, PhiOdd,MpvPhi_k,MpvPhi);
+
+	std::cout<<GridLogMessage << action_name() << " deriv: doing (M^dag M)^{-1/" << param.inv_pow << "} ( (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
+	multiShiftInverse(Denominator, ApproxNegPowerMD, param.MaxIter, MpvPhi,MfMpvPhi_k,MfMpvPhi);
+
+	std::cout<<GridLogMessage << action_name() << " deriv: doing (V^dag V)^{1/" << 2*param.inv_pow << "} ( (M^dag M)^{-1/" << param.inv_pow << "} (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
+	multiShiftInverse(Numerator, ApproxHalfPowerMD, param.MaxIter, MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi);
+		
+	//Schur operators used for the derivative terms: MdagM wraps DenOp, VdagV wraps NumOp
+	SchurDifferentiableOperator<Impl> MdagM(DenOp);
+	SchurDifferentiableOperator<Impl> VdagV(NumOp);
+
+
+	RealD ak;
+
+	dSdU = Zero();
+
+	// With these building blocks  
+	//  
+	//       dS/dU = 
+	//                 \sum_k -ak MfMpvPhi_k^dag      [ dM^dag M + M^dag dM ] MfMpvPhi_k         (1)
+	//             +   \sum_k -ak MpvMfMpvPhi_k^\dag  [ dV^dag V + V^dag dV ] MpvPhi_k           (2)
+	//                        -ak MpvPhi_k^dag        [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k      (3)
+	//NOTE(review): the formulae above carry -ak while the loops below accumulate +ak*tmp; presumably the sign lives in the Mpc*Deriv convention - confirm
+	//(1)	
+	std::cout<<GridLogMessage << action_name() << " deriv: doing dS/dU part (1)" << std::endl;
+	for(int k=0;k<n_f;k++){
+	  ak = ApproxNegPowerMD.residues[k];
+	  MdagM.Mpc(MfMpvPhi_k[k],Y);
+	  MdagM.MpcDagDeriv(tmp , MfMpvPhi_k[k], Y );  dSdU=dSdU+ak*tmp;
+	  MdagM.MpcDeriv(tmp , Y, MfMpvPhi_k[k] );  dSdU=dSdU+ak*tmp;
+	}
+	
+	//(2)
+	//(3)
+	std::cout<<GridLogMessage << action_name() << " deriv: doing dS/dU part (2)+(3)" << std::endl;
+	for(int k=0;k<n_pv;k++){
+
+          ak = ApproxHalfPowerMD.residues[k];
+	  
+	  VdagV.Mpc(MpvPhi_k[k],Y);
+	  VdagV.MpcDagDeriv(tmp,MpvMfMpvPhi_k[k],Y); dSdU=dSdU+ak*tmp;
+	  VdagV.MpcDeriv   (tmp,Y,MpvMfMpvPhi_k[k]);  dSdU=dSdU+ak*tmp;     
+	  
+	  VdagV.Mpc(MpvMfMpvPhi_k[k],Y);                // V as we take Ydag 
+	  VdagV.MpcDeriv   (tmp,Y, MpvPhi_k[k]); dSdU=dSdU+ak*tmp;
+	  VdagV.MpcDagDeriv(tmp,MpvPhi_k[k], Y); dSdU=dSdU+ak*tmp;
+
+	}
+
+	//dSdU = Ta(dSdU);
+	std::cout<<GridLogMessage << action_name() << " deriv: complete" << std::endl;
+      };
+    };
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
+++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
@@ -0,0 +1,93 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
+
+    Copyright (C) 2015
+
+    Author: Christopher Kelly <ckelly@bnl.gov>
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
+#define QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
+
+NAMESPACE_BEGIN(Grid);
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Generic rational approximation for ratios of operators utilizing the mixed precision multishift algorithm
+    // cf. GeneralEvenOddRationalRatio.h for details
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      
+    //Mixed-precision version of GeneralEvenOddRatioRationalPseudoFermionAction<ImplD>: the multishift solves
+    //go through ConjugateGradientMultiShiftMixedPrec, which is also given ImplF copies of the operators.
+    template<class ImplD, class ImplF>
+    class GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<ImplD> {
+    private:
+      typedef typename ImplD::FermionField FermionFieldD;
+      typedef typename ImplF::FermionField FermionFieldF;
+
+      //Declaration order below is the order in which the constructor initialises the members
+      FermionOperator<ImplD> & NumOpD;
+      FermionOperator<ImplD> & DenOpD;
+      FermionOperator<ImplF> & NumOpF;
+      FermionOperator<ImplF> & DenOpF;
+      Integer ReliableUpdateFreq; //passed unchanged to ConjugateGradientMultiShiftMixedPrec
+    protected:
+      //Multishift solve producing only the combined result 'out'.
+      //numerator==true uses NumOpD/NumOpF, otherwise DenOpD/DenOpF; the ImplF grid is always taken from NumOpF.
+      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, FermionFieldD &out){
+	SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
+	SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
+	ConjugateGradientMultiShiftMixedPrec<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
+	msCG(schurOpD, in, out);
+      }
+      //Same solve, additionally passing 'out_elems' to the solver (presumably the per-shift solutions - confirm)
+      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector<FermionFieldD> &out_elems, FermionFieldD &out){
+	SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
+	SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
+	ConjugateGradientMultiShiftMixedPrec<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
+	msCG(schurOpD, in, out_elems, out);
+      }
+      //Import the gauge field into all four operators; the ImplF operators receive a copy converted
+      //with precisionChange. Virtual so that derived classes can override the gauge import.
+      virtual void ImportGauge(const typename ImplD::GaugeField &Ud){
+	typename ImplF::GaugeField Uf(NumOpF.GaugeGrid());
+	precisionChange(Uf, Ud);
+	
+	NumOpD.ImportGauge(Ud);
+	DenOpD.ImportGauge(Ud);
+	NumOpF.ImportGauge(Uf);
+	DenOpF.ImportGauge(Uf);
+      }
+      
+    public:
+      GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction(FermionOperator<ImplD>  &_NumOpD, FermionOperator<ImplD>  &_DenOpD, 
+							      FermionOperator<ImplF>  &_NumOpF, FermionOperator<ImplF>  &_DenOpF, 
+							      const RationalActionParams & p, Integer _ReliableUpdateFreq
+							      ) : GeneralEvenOddRatioRationalPseudoFermionAction<ImplD>(_NumOpD, _DenOpD, p),
+								  NumOpD(_NumOpD), DenOpD(_DenOpD), NumOpF(_NumOpF), DenOpF(_DenOpF), ReliableUpdateFreq(_ReliableUpdateFreq){}
+      
+      virtual std::string action_name(){return "GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction";}
+    };
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
@@ -40,249 +40,31 @@ NAMESPACE_BEGIN(Grid);
    // Here N/D \sim R_{-1/2} ~ (M^dagM)^{-1/2}  
  
    template<class Impl>
-    class OneFlavourEvenOddRatioRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
+    class OneFlavourEvenOddRatioRationalPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<Impl> {
    public:
-
-      INHERIT_IMPL_TYPES(Impl);
-
      typedef OneFlavourRationalParams Params;
-      Params param;
-
-      MultiShiftFunction PowerHalf   ;
-      MultiShiftFunction PowerNegHalf;
-      MultiShiftFunction PowerQuarter;
-      MultiShiftFunction PowerNegQuarter;
-
    private:
-     
-      FermionOperator<Impl> & NumOp;// the basic operator
-      FermionOperator<Impl> & DenOp;// the basic operator
-      FermionField PhiEven; // the pseudo fermion field for this trajectory
-      FermionField PhiOdd; // the pseudo fermion field for this trajectory
+      static RationalActionParams transcribe(const Params &in){ //one-flavour parameter struct -> general rational parameters
+	RationalActionParams gen;
+	gen.inv_pow = 2; //the one-flavour action uses (M^dag M)^{-1/2}
+	gen.precision = in.precision;
+	gen.MaxIter = in.MaxIter;
+	gen.BoundsCheckFreq = in.BoundsCheckFreq;
+	gen.lo = in.lo;
+	gen.hi = in.hi;
+	gen.action_degree = gen.md_degree = in.degree;          //a single degree serves both the action and MD approximations
+	gen.action_tolerance = gen.md_tolerance = in.tolerance; //likewise for the tolerance
+	return gen;
+      }

    public:
-
      OneFlavourEvenOddRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
-					    FermionOperator<Impl>  &_DenOp, 
-					    Params & p
-					    ) : 
-      NumOp(_NumOp), 
-      DenOp(_DenOp), 
-      PhiOdd (_NumOp.FermionRedBlackGrid()),
-      PhiEven(_NumOp.FermionRedBlackGrid()),
-      param(p) 
-      {
-	AlgRemez remez(param.lo,param.hi,param.precision);
+							FermionOperator<Impl>  &_DenOp, 
+							const Params & p
+							) : 
+	GeneralEvenOddRatioRationalPseudoFermionAction<Impl>(_NumOp, _DenOp, transcribe(p)){}

-	// MdagM^(+- 1/2)
-	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
-	remez.generateApprox(param.degree,1,2);
-	PowerHalf.Init(remez,param.tolerance,false);
-	PowerNegHalf.Init(remez,param.tolerance,true);
-
-	// MdagM^(+- 1/4)
-	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
-	remez.generateApprox(param.degree,1,4);
-   	PowerQuarter.Init(remez,param.tolerance,false);
-	PowerNegQuarter.Init(remez,param.tolerance,true);
-      };
-
-      virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}
-
-      virtual std::string LogParameters(){
-	std::stringstream sstream;
-	sstream << GridLogMessage << "["<<action_name()<<"] Low            :" << param.lo <<  std::endl;
-	sstream << GridLogMessage << "["<<action_name()<<"] High           :" << param.hi <<  std::endl;
-	sstream << GridLogMessage << "["<<action_name()<<"] Max iterations :" << param.MaxIter <<  std::endl;
-	sstream << GridLogMessage << "["<<action_name()<<"] Tolerance      :" << param.tolerance <<  std::endl;
-	sstream << GridLogMessage << "["<<action_name()<<"] Degree         :" << param.degree <<  std::endl;
-	sstream << GridLogMessage << "["<<action_name()<<"] Precision      :" << param.precision <<  std::endl;
-	return sstream.str();
-      }
-      
-      
-      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
-
-	// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
-	//
-	// P(phi) = e^{- phi^dag (VdagV)^1/4 (MdagM)^-1/2 (VdagV)^1/4 phi}
-	//        = e^{- phi^dag  (VdagV)^1/4 (MdagM)^-1/4 (MdagM)^-1/4  (VdagV)^1/4 phi}
-	//
-	// Phi =  (VdagV)^-1/4 Mdag^{1/4} eta 
-	//
-	// P(eta) = e^{- eta^dag eta}
-	//
-	// e^{x^2/2 sig^2} => sig^2 = 0.5.
-	// 
-	// So eta should be of width sig = 1/sqrt(2).
-
-	RealD scale = std::sqrt(0.5);
-
-	FermionField eta(NumOp.FermionGrid());
-	FermionField etaOdd (NumOp.FermionRedBlackGrid());
-	FermionField etaEven(NumOp.FermionRedBlackGrid());
-	FermionField     tmp(NumOp.FermionRedBlackGrid());
-
-	gaussian(pRNG,eta);	eta=eta*scale;
-
-	pickCheckerboard(Even,etaEven,eta);
-	pickCheckerboard(Odd,etaOdd,eta);
-
-	NumOp.ImportGauge(U);
-	DenOp.ImportGauge(U);
-
-
-	// MdagM^1/4 eta
-	SchurDifferentiableOperator<Impl> MdagM(DenOp);
-	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerQuarter);
-	msCG_M(MdagM,etaOdd,tmp);
-
-	// VdagV^-1/4 MdagM^1/4 eta
-	SchurDifferentiableOperator<Impl> VdagV(NumOp);
-	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerNegQuarter);
-	msCG_V(VdagV,tmp,PhiOdd);
-
-	assert(NumOp.ConstEE() == 1);
-	assert(DenOp.ConstEE() == 1);
-	PhiEven = Zero();
-	
-      };
-
-      //////////////////////////////////////////////////////
-      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
-      //////////////////////////////////////////////////////
-      virtual RealD S(const GaugeField &U) {
-
-	NumOp.ImportGauge(U);
-	DenOp.ImportGauge(U);
-
-	FermionField X(NumOp.FermionRedBlackGrid());
-	FermionField Y(NumOp.FermionRedBlackGrid());
-
-	// VdagV^1/4 Phi
-	SchurDifferentiableOperator<Impl> VdagV(NumOp);
-	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
-	msCG_V(VdagV,PhiOdd,X);
-
-	// MdagM^-1/4 VdagV^1/4 Phi
-	SchurDifferentiableOperator<Impl> MdagM(DenOp);
-	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegQuarter);
-	msCG_M(MdagM,X,Y);
-
-	// Randomly apply rational bounds checks.
-	auto grid = NumOp.FermionGrid();
-        auto r=rand();
-        grid->Broadcast(0,r);
-        if ( (r%param.BoundsCheckFreq)==0 ) { 
-	  FermionField gauss(NumOp.FermionRedBlackGrid());
-	  gauss = PhiOdd;
-	  HighBoundCheck(MdagM,gauss,param.hi);
-	  InverseSqrtBoundsCheck(param.MaxIter,param.tolerance*100,MdagM,gauss,PowerNegHalf);
-	}
-
-	//  Phidag VdagV^1/4 MdagM^-1/4  MdagM^-1/4 VdagV^1/4 Phi
-	RealD action = norm2(Y);
-
-	return action;
-      };
-
-      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
-      //
-      // Here, M is some 5D operator and V is the Pauli-Villars field
-      // N and D makeup the rat. poly of the M term and P and & makeup the rat.poly of the denom term
-      //
-      // Need  
-      // dS_f/dU =  chi^dag d[P/Q]  N/D   P/Q  chi 
-      //         +  chi^dag   P/Q d[N/D]  P/Q  chi 
-      //         +  chi^dag   P/Q   N/D d[P/Q] chi 
-      //
-      // P/Q is expressed as partial fraction expansion: 
-      // 
-      //           a0 + \sum_k ak/(V^dagV + bk) 
-      //  
-      // d[P/Q] is then  
-      //
-      //          \sum_k -ak [V^dagV+bk]^{-1}  [ dV^dag V + V^dag dV ] [V^dag V + bk]^{-1} 
-      //  
-      // and similar for N/D. 
-      // 
-      // Need   
-      //       MpvPhi_k   = [Vdag V + bk]^{-1} chi  
-      //       MpvPhi     = {a0 +  \sum_k ak [Vdag V + bk]^{-1} }chi   
-      //   
-      //       MfMpvPhi_k = [MdagM+bk]^{-1} MpvPhi  
-      //       MfMpvPhi   = {a0 +  \sum_k ak [Mdag M + bk]^{-1} } MpvPhi
-      // 
-      //       MpvMfMpvPhi_k = [Vdag V + bk]^{-1} MfMpvchi   
-      //  
-
-      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
-
-	const int n_f  = PowerNegHalf.poles.size();
-	const int n_pv = PowerQuarter.poles.size();
-
-	std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionRedBlackGrid());
-	std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionRedBlackGrid());
-	std::vector<FermionField> MfMpvPhi_k   (n_f ,NumOp.FermionRedBlackGrid());
-
-	FermionField      MpvPhi(NumOp.FermionRedBlackGrid());
-	FermionField    MfMpvPhi(NumOp.FermionRedBlackGrid());
-	FermionField MpvMfMpvPhi(NumOp.FermionRedBlackGrid());
-	FermionField           Y(NumOp.FermionRedBlackGrid());
-
-	GaugeField   tmp(NumOp.GaugeGrid());
-
-	NumOp.ImportGauge(U);
-	DenOp.ImportGauge(U);
-
-	SchurDifferentiableOperator<Impl> VdagV(NumOp);
-	SchurDifferentiableOperator<Impl> MdagM(DenOp);
-
-	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
-	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegHalf);
-
-	msCG_V(VdagV,PhiOdd,MpvPhi_k,MpvPhi);
-	msCG_M(MdagM,MpvPhi,MfMpvPhi_k,MfMpvPhi);
-	msCG_V(VdagV,MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi);
-
-	RealD ak;
-
-	dSdU = Zero();
-
-	// With these building blocks  
-	//  
-	//       dS/dU = 
-	//                 \sum_k -ak MfMpvPhi_k^dag      [ dM^dag M + M^dag dM ] MfMpvPhi_k         (1)
-	//             +   \sum_k -ak MpvMfMpvPhi_k^\dag  [ dV^dag V + V^dag dV ] MpvPhi_k           (2)
-	//                        -ak MpvPhi_k^dag        [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k      (3)
-
-	//(1)
-	for(int k=0;k<n_f;k++){
-	  ak = PowerNegHalf.residues[k];
-	  MdagM.Mpc(MfMpvPhi_k[k],Y);
-	  MdagM.MpcDagDeriv(tmp , MfMpvPhi_k[k], Y );  dSdU=dSdU+ak*tmp;
-	  MdagM.MpcDeriv(tmp , Y, MfMpvPhi_k[k] );  dSdU=dSdU+ak*tmp;
-	}
-	
-	//(2)
-	//(3)
-	for(int k=0;k<n_pv;k++){
-
-          ak = PowerQuarter.residues[k];
-	  
-	  VdagV.Mpc(MpvPhi_k[k],Y);
-	  VdagV.MpcDagDeriv(tmp,MpvMfMpvPhi_k[k],Y); dSdU=dSdU+ak*tmp;
-	  VdagV.MpcDeriv   (tmp,Y,MpvMfMpvPhi_k[k]);  dSdU=dSdU+ak*tmp;     
-	  
-	  VdagV.Mpc(MpvMfMpvPhi_k[k],Y);                // V as we take Ydag 
-	  VdagV.MpcDeriv   (tmp,Y, MpvPhi_k[k]); dSdU=dSdU+ak*tmp;
-	  VdagV.MpcDagDeriv(tmp,MpvPhi_k[k], Y); dSdU=dSdU+ak*tmp;
-
-	}
-
-	//dSdU = Ta(dSdU);
-
-      };
+      virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}      //returns this class's name as the action name
    };

 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/pseudofermion/PseudoFermion.h
+++ b/Grid/qcd/action/pseudofermion/PseudoFermion.h
@@ -40,6 +40,8 @@ directory
 #include <Grid/qcd/action/pseudofermion/OneFlavourRational.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
+#include <Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h>
+#include <Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h>

--- a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
+++ b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
@@ -83,16 +83,7 @@ NAMESPACE_BEGIN(Grid);
 	return sstream.str();
      } 

-      
      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
-
-        // P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
-        //
-        // NumOp == V
-        // DenOp == M
-        //
-        // Take phi_o = Vpcdag^{-1} Mpcdag eta_o  ; eta_o = Mpcdag^{-1} Vpcdag Phi
-        //
        // P(eta_o) = e^{- eta_o^dag eta_o}
        //
        // e^{x^2/2 sig^2} => sig^2 = 0.5.
@@ -100,12 +91,22 @@ NAMESPACE_BEGIN(Grid);
        RealD scale = std::sqrt(0.5);

        FermionField eta    (NumOp.FermionGrid());
+        gaussian(pRNG,eta); eta = eta * scale;
+
+	refresh(U,eta);
+      }
+	
+      void refresh(const GaugeField &U, const FermionField &eta) {
+        // P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
+        //
+        // NumOp == V
+        // DenOp == M
+        //
+        // Take phi_o = Vpcdag^{-1} Mpcdag eta_o  ; eta_o = Mpcdag^{-1} Vpcdag Phi
        FermionField etaOdd (NumOp.FermionRedBlackGrid());
        FermionField etaEven(NumOp.FermionRedBlackGrid());
        FermionField tmp    (NumOp.FermionRedBlackGrid());

-        gaussian(pRNG,eta);
-
        pickCheckerboard(Even,etaEven,eta);
        pickCheckerboard(Odd,etaOdd,eta);

@@ -125,8 +126,8 @@ NAMESPACE_BEGIN(Grid);
        DenOp.MooeeDag(etaEven,tmp);
        NumOp.MooeeInvDag(tmp,PhiEven);

-        PhiOdd =PhiOdd*scale;
-        PhiEven=PhiEven*scale;
+        //PhiOdd =PhiOdd*scale;
+        //PhiEven=PhiEven*scale;
        
      };

--- a/Grid/qcd/gparity/Gparity.h
+++ b/Grid/qcd/gparity/Gparity.h
@@ -0,0 +1,6 @@
+#ifndef GRID_GPARITY_H_
+#define GRID_GPARITY_H_
+
+#include<Grid/qcd/gparity/GparityFlavour.h>
+
+#endif
--- a/Grid/qcd/gparity/GparityFlavour.cc
+++ b/Grid/qcd/gparity/GparityFlavour.cc
@@ -0,0 +1,34 @@
+#include <Grid/Grid.h>
+
+NAMESPACE_BEGIN(Grid);
+
+const std::array<const GparityFlavour, 3> GparityFlavour::sigma_mu = {{ //the three sigma matrices in X, Y, Z order
+    GparityFlavour(GparityFlavour::Algebra::SigmaX),
+    GparityFlavour(GparityFlavour::Algebra::SigmaY),
+    GparityFlavour(GparityFlavour::Algebra::SigmaZ)
+    }};
+
+const std::array<const GparityFlavour, 6> GparityFlavour::sigma_all = {{ //the six positive-sign elements; the Minus* variants are not listed
+  GparityFlavour(GparityFlavour::Algebra::Identity),
+  GparityFlavour(GparityFlavour::Algebra::SigmaX),
+  GparityFlavour(GparityFlavour::Algebra::SigmaY),
+  GparityFlavour(GparityFlavour::Algebra::SigmaZ),
+  GparityFlavour(GparityFlavour::Algebra::ProjPlus),
+  GparityFlavour(GparityFlavour::Algebra::ProjMinus)
+}};
+
+const std::array<const char *, GparityFlavour::nSigma> GparityFlavour::name = {{ //listed in the order of the Algebra enum values 0..11 (presumably indexed by them - keep in sync)
+    "SigmaX",
+    "MinusSigmaX",
+    "SigmaY",
+    "MinusSigmaY",
+    "SigmaZ",
+    "MinusSigmaZ",
+    "Identity",
+    "MinusIdentity",
+    "ProjPlus",
+    "MinusProjPlus",
+    "ProjMinus",
+    "MinusProjMinus"}};
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/gparity/GparityFlavour.h
+++ b/Grid/qcd/gparity/GparityFlavour.h
@@ -0,0 +1,475 @@
+#ifndef GRID_QCD_GPARITY_FLAVOUR_H
+#define GRID_QCD_GPARITY_FLAVOUR_H
+
+//Support for flavour-matrix operations acting on the G-parity flavour index
+
+#include <array>
+
+NAMESPACE_BEGIN(Grid);
+
+class GparityFlavour { //selects one 2x2 flavour matrix; the operator* overloads in this header switch on g
+  public:
+    GRID_SERIALIZABLE_ENUM(Algebra, undef,
+                           SigmaX, 0,
+			   MinusSigmaX, 1,
+                           SigmaY, 2,
+			   MinusSigmaY, 3,
+                           SigmaZ, 4,
+			   MinusSigmaZ, 5,
+			   Identity, 6,
+			   MinusIdentity, 7,
+			   ProjPlus, 8,
+			   MinusProjPlus, 9,
+			   ProjMinus, 10,
+			   MinusProjMinus, 11
+			   );
+    static constexpr unsigned int nSigma = 12; //number of named Algebra elements (values 0..11 above)
+    static const std::array<const char *, nSigma>                name; //one string per Algebra element, defined in GparityFlavour.cc
+    static const std::array<const GparityFlavour, 3>             sigma_mu; //SigmaX, SigmaY, SigmaZ
+    static const std::array<const GparityFlavour, 6>            sigma_all; //Identity, SigmaX/Y/Z, ProjPlus, ProjMinus
+    Algebra                                                      g; //the selected element
+  public:
+  accelerator GparityFlavour(Algebra initg): g(initg) {}  //wraps the given Algebra value
+};
+
+
+
+// 0 1  x   vector
+// 1 0
+template<class vtype> //ret = sigma_x * rhs for a flavour vector (only components 0 and 1 are touched)
+accelerator_inline void multFlavourSigmaX(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = rhs(1);
+  ret(1) = rhs(0);
+};
+template<class vtype> //ret = sigma_x * rhs for a flavour matrix (swaps the rows)
+accelerator_inline void lmultFlavourSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = rhs(1,0);
+  ret(0,1) = rhs(1,1);
+  ret(1,0) = rhs(0,0);
+  ret(1,1) = rhs(0,1);
+};
+template<class vtype> //ret = rhs * sigma_x for a flavour matrix (swaps the columns)
+accelerator_inline void rmultFlavourSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = rhs(0,1);
+  ret(0,1) = rhs(0,0);
+  ret(1,0) = rhs(1,1);
+  ret(1,1) = rhs(1,0);
+};
+
+
+template<class vtype> //ret = -sigma_x * rhs for a flavour vector
+accelerator_inline void multFlavourMinusSigmaX(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = -rhs(1);
+  ret(1) = -rhs(0);
+};
+template<class vtype> //ret = -sigma_x * rhs for a flavour matrix
+accelerator_inline void lmultFlavourMinusSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -rhs(1,0);
+  ret(0,1) = -rhs(1,1);
+  ret(1,0) = -rhs(0,0);
+  ret(1,1) = -rhs(0,1);
+};
+template<class vtype> //ret = -(rhs * sigma_x) for a flavour matrix
+accelerator_inline void rmultFlavourMinusSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -rhs(0,1);
+  ret(0,1) = -rhs(0,0);
+  ret(1,0) = -rhs(1,1);
+  ret(1,1) = -rhs(1,0);
+};
+
+
+
+
+
+// 0 -i  x   vector
+// i 0
+template<class vtype> //ret = sigma_y * rhs for a flavour vector
+accelerator_inline void multFlavourSigmaY(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = timesMinusI(rhs(1));
+  ret(1) = timesI(rhs(0));
+};
+template<class vtype> //ret = sigma_y * rhs for a flavour matrix
+accelerator_inline void lmultFlavourSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = timesMinusI(rhs(1,0));
+  ret(0,1) = timesMinusI(rhs(1,1));
+  ret(1,0) = timesI(rhs(0,0));
+  ret(1,1) = timesI(rhs(0,1));
+};
+template<class vtype> //ret = rhs * sigma_y for a flavour matrix
+accelerator_inline void rmultFlavourSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = timesI(rhs(0,1));
+  ret(0,1) = timesMinusI(rhs(0,0));
+  ret(1,0) = timesI(rhs(1,1));
+  ret(1,1) = timesMinusI(rhs(1,0));
+};
+
+template<class vtype> //ret = -sigma_y * rhs for a flavour vector
+accelerator_inline void multFlavourMinusSigmaY(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = timesI(rhs(1));
+  ret(1) = timesMinusI(rhs(0));
+};
+template<class vtype> //ret = -sigma_y * rhs for a flavour matrix
+accelerator_inline void lmultFlavourMinusSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = timesI(rhs(1,0));
+  ret(0,1) = timesI(rhs(1,1));
+  ret(1,0) = timesMinusI(rhs(0,0));
+  ret(1,1) = timesMinusI(rhs(0,1));
+};
+template<class vtype> //ret = -(rhs * sigma_y) for a flavour matrix
+accelerator_inline void rmultFlavourMinusSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = timesMinusI(rhs(0,1));
+  ret(0,1) = timesI(rhs(0,0));
+  ret(1,0) = timesMinusI(rhs(1,1));
+  ret(1,1) = timesI(rhs(1,0));
+};
+
+
+
+
+
+// 1 0  x   vector
+// 0 -1
+template<class vtype> //ret = sigma_z * rhs for a flavour vector (negates component 1)
+accelerator_inline void multFlavourSigmaZ(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = rhs(0);
+  ret(1) = -rhs(1);
+};
+template<class vtype> //ret = sigma_z * rhs for a flavour matrix (negates row 1)
+accelerator_inline void lmultFlavourSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = rhs(0,0);
+  ret(0,1) = rhs(0,1);
+  ret(1,0) = -rhs(1,0);
+  ret(1,1) = -rhs(1,1);
+};
+template<class vtype> //ret = rhs * sigma_z for a flavour matrix (negates column 1)
+accelerator_inline void rmultFlavourSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = rhs(0,0);
+  ret(0,1) = -rhs(0,1);
+  ret(1,0) = rhs(1,0);
+  ret(1,1) = -rhs(1,1);
+};
+
+
+template<class vtype> //ret = -sigma_z * rhs for a flavour vector (negates component 0)
+accelerator_inline void multFlavourMinusSigmaZ(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = -rhs(0);
+  ret(1) = rhs(1);
+};
+template<class vtype> //ret = -sigma_z * rhs for a flavour matrix (negates row 0)
+accelerator_inline void lmultFlavourMinusSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -rhs(0,0);
+  ret(0,1) = -rhs(0,1);
+  ret(1,0) = rhs(1,0);
+  ret(1,1) = rhs(1,1);
+};
+template<class vtype> //ret = -(rhs * sigma_z) for a flavour matrix (negates column 0)
+accelerator_inline void rmultFlavourMinusSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -rhs(0,0);
+  ret(0,1) = rhs(0,1);
+  ret(1,0) = -rhs(1,0);
+  ret(1,1) = rhs(1,1);
+};
+
+
+
+
+
+
+template<class vtype> //ret = rhs (identity) for a flavour vector, copied component by component
+accelerator_inline void multFlavourIdentity(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = rhs(0);
+  ret(1) = rhs(1);
+};
+template<class vtype> //ret = 1 * rhs for a flavour matrix, copied element by element
+accelerator_inline void lmultFlavourIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = rhs(0,0);
+  ret(0,1) = rhs(0,1);
+  ret(1,0) = rhs(1,0);
+  ret(1,1) = rhs(1,1);
+};
+template<class vtype> //ret = rhs * 1 for a flavour matrix; same assignments as lmultFlavourIdentity
+accelerator_inline void rmultFlavourIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = rhs(0,0);
+  ret(0,1) = rhs(0,1);
+  ret(1,0) = rhs(1,0);
+  ret(1,1) = rhs(1,1);
+};
+
+template<class vtype> //ret = -rhs for a flavour vector
+accelerator_inline void multFlavourMinusIdentity(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = -rhs(0);
+  ret(1) = -rhs(1);
+};
+template<class vtype> //ret = -rhs for a flavour matrix
+accelerator_inline void lmultFlavourMinusIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -rhs(0,0);
+  ret(0,1) = -rhs(0,1);
+  ret(1,0) = -rhs(1,0);
+  ret(1,1) = -rhs(1,1);
+};
+template<class vtype> //ret = -rhs for a flavour matrix; same assignments as lmultFlavourMinusIdentity
+accelerator_inline void rmultFlavourMinusIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -rhs(0,0);
+  ret(0,1) = -rhs(0,1);
+  ret(1,0) = -rhs(1,0);
+  ret(1,1) = -rhs(1,1);
+};
+
+
+
+
+
+//G-parity flavour projection 1/2(1+\sigma_2)
+//1/2 * ( 1 -i )
+//      ( i  1 )
+template<class vtype> //ret = 1/2(1+sigma_y) * rhs for a flavour vector
+accelerator_inline void multFlavourProjPlus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = 0.5*rhs(0) + 0.5*timesMinusI(rhs(1));
+  ret(1) = 0.5*timesI(rhs(0)) + 0.5*rhs(1);
+};
+template<class vtype> //ret = 1/2(1+sigma_y) * rhs for a flavour matrix
+accelerator_inline void lmultFlavourProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesMinusI(rhs(1,0));
+  ret(0,1) = 0.5*rhs(0,1) + 0.5*timesMinusI(rhs(1,1));
+  ret(1,0) = 0.5*timesI(rhs(0,0)) + 0.5*rhs(1,0);
+  ret(1,1) = 0.5*timesI(rhs(0,1)) + 0.5*rhs(1,1);
+};
+template<class vtype> //ret = rhs * 1/2(1+sigma_y) for a flavour matrix
+accelerator_inline void rmultFlavourProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesI(rhs(0,1));
+  ret(0,1) = 0.5*timesMinusI(rhs(0,0)) + 0.5*rhs(0,1);
+  ret(1,0) = 0.5*rhs(1,0) + 0.5*timesI(rhs(1,1));
+  ret(1,1) = 0.5*timesMinusI(rhs(1,0)) + 0.5*rhs(1,1);
+};
+
+
+template<class vtype> //ret = -1/2(1+sigma_y) * rhs for a flavour vector
+accelerator_inline void multFlavourMinusProjPlus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = -0.5*rhs(0) + 0.5*timesI(rhs(1));
+  ret(1) = 0.5*timesMinusI(rhs(0)) - 0.5*rhs(1);
+};
+template<class vtype> //ret = -1/2(1+sigma_y) * rhs for a flavour matrix
+accelerator_inline void lmultFlavourMinusProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesI(rhs(1,0));
+  ret(0,1) = -0.5*rhs(0,1) + 0.5*timesI(rhs(1,1));
+  ret(1,0) = 0.5*timesMinusI(rhs(0,0)) - 0.5*rhs(1,0);
+  ret(1,1) = 0.5*timesMinusI(rhs(0,1)) - 0.5*rhs(1,1);
+};
+template<class vtype> //ret = -(rhs * 1/2(1+sigma_y)) for a flavour matrix
+accelerator_inline void rmultFlavourMinusProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesMinusI(rhs(0,1));
+  ret(0,1) = 0.5*timesI(rhs(0,0)) - 0.5*rhs(0,1);
+  ret(1,0) = -0.5*rhs(1,0) + 0.5*timesMinusI(rhs(1,1));
+  ret(1,1) = 0.5*timesI(rhs(1,0)) - 0.5*rhs(1,1);
+};
+
+
+
+
+
+//G-parity flavour projection 1/2(1-\sigma_2)
+//1/2 * (  1  i )
+//      ( -i  1 )
+template<class vtype> //ret = 1/2(1-sigma_y) * rhs for a flavour vector
+accelerator_inline void multFlavourProjMinus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = 0.5*rhs(0) + 0.5*timesI(rhs(1));
+  ret(1) = 0.5*timesMinusI(rhs(0)) + 0.5*rhs(1);
+};
+template<class vtype> //ret = 1/2(1-sigma_y) * rhs for a flavour matrix
+accelerator_inline void lmultFlavourProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesI(rhs(1,0));
+  ret(0,1) = 0.5*rhs(0,1) + 0.5*timesI(rhs(1,1));
+  ret(1,0) = 0.5*timesMinusI(rhs(0,0)) + 0.5*rhs(1,0);
+  ret(1,1) = 0.5*timesMinusI(rhs(0,1)) + 0.5*rhs(1,1);
+};
+template<class vtype> //ret = rhs * 1/2(1-sigma_y) for a flavour matrix
+accelerator_inline void rmultFlavourProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesMinusI(rhs(0,1));
+  ret(0,1) = 0.5*timesI(rhs(0,0)) + 0.5*rhs(0,1);
+  ret(1,0) = 0.5*rhs(1,0) + 0.5*timesMinusI(rhs(1,1));
+  ret(1,1) = 0.5*timesI(rhs(1,0)) + 0.5*rhs(1,1);
+};
+
+
+template<class vtype> //ret = -1/2(1-sigma_y) * rhs for a flavour vector
+accelerator_inline void multFlavourMinusProjMinus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = -0.5*rhs(0) + 0.5*timesMinusI(rhs(1));
+  ret(1) = 0.5*timesI(rhs(0)) - 0.5*rhs(1);
+};
+template<class vtype> //ret = -1/2(1-sigma_y) * rhs for a flavour matrix
+accelerator_inline void lmultFlavourMinusProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesMinusI(rhs(1,0));
+  ret(0,1) = -0.5*rhs(0,1) + 0.5*timesMinusI(rhs(1,1));
+  ret(1,0) = 0.5*timesI(rhs(0,0)) - 0.5*rhs(1,0);
+  ret(1,1) = 0.5*timesI(rhs(0,1)) - 0.5*rhs(1,1);
+};
+template<class vtype> //ret = -(rhs * 1/2(1-sigma_y)) for a flavour matrix
+accelerator_inline void rmultFlavourMinusProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesI(rhs(0,1));
+  ret(0,1) = 0.5*timesMinusI(rhs(0,0)) - 0.5*rhs(0,1);
+  ret(1,0) = -0.5*rhs(1,0) + 0.5*timesI(rhs(1,1));
+  ret(1,1) = 0.5*timesMinusI(rhs(1,0)) - 0.5*rhs(1,1);
+};
+
+
+
+
+
+
+
+
+
+
+//Left-multiply a flavour vector by the flavour matrix selected by G.g, i.e. return G * arg.
+//The enable_if in the trailing return type removes this overload unless
+//matchGridTensorIndex<iVector<vtype, Ngp>, GparityFlavourTensorIndex>::value is true.
+//Each case forwards to the matching multFlavour* kernel, which assigns both components of ret.
+//NOTE: if assert is compiled out, an unhandled G.g returns ret without it having been assigned here.
+template<class vtype>
+accelerator_inline auto operator*(const GparityFlavour &G, const iVector<vtype, Ngp> &arg)
+->typename std::enable_if<matchGridTensorIndex<iVector<vtype, Ngp>, GparityFlavourTensorIndex>::value, iVector<vtype, Ngp>>::type
+{
+  iVector<vtype, Ngp> ret;
+
+  switch (G.g)
+  {
+  //Sigma matrices and their negatives
+  case GparityFlavour::Algebra::SigmaX:         multFlavourSigmaX(ret, arg);         break;
+  case GparityFlavour::Algebra::MinusSigmaX:    multFlavourMinusSigmaX(ret, arg);    break;
+  case GparityFlavour::Algebra::SigmaY:         multFlavourSigmaY(ret, arg);         break;
+  case GparityFlavour::Algebra::MinusSigmaY:    multFlavourMinusSigmaY(ret, arg);    break;
+  case GparityFlavour::Algebra::SigmaZ:         multFlavourSigmaZ(ret, arg);         break;
+  case GparityFlavour::Algebra::MinusSigmaZ:    multFlavourMinusSigmaZ(ret, arg);    break;
+
+  //Identity and its negative
+  case GparityFlavour::Algebra::Identity:       multFlavourIdentity(ret, arg);       break;
+  case GparityFlavour::Algebra::MinusIdentity:  multFlavourMinusIdentity(ret, arg);  break;
+
+  //Projectors 1/2(1 +/- sigma_y) and their negatives
+  case GparityFlavour::Algebra::ProjPlus:       multFlavourProjPlus(ret, arg);       break;
+  case GparityFlavour::Algebra::MinusProjPlus:  multFlavourMinusProjPlus(ret, arg);  break;
+  case GparityFlavour::Algebra::ProjMinus:      multFlavourProjMinus(ret, arg);      break;
+  case GparityFlavour::Algebra::MinusProjMinus: multFlavourMinusProjMinus(ret, arg); break;
+
+  //Any other value of G.g is treated as a usage error
+  default: assert(0);
+  }
+
+  return ret;
+}
+
+//Left-multiply a flavour matrix by the flavour matrix selected by G.g, i.e. return G * arg.
+//The enable_if in the trailing return type removes this overload unless
+//matchGridTensorIndex<iMatrix<vtype, Ngp>, GparityFlavourTensorIndex>::value is true.
+//Each case forwards to the matching lmultFlavour* kernel, which assigns all four elements of ret.
+//NOTE: if assert is compiled out, an unhandled G.g returns ret without it having been assigned here.
+template<class vtype>
+accelerator_inline auto operator*(const GparityFlavour &G, const iMatrix<vtype, Ngp> &arg)
+->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Ngp>, GparityFlavourTensorIndex>::value, iMatrix<vtype, Ngp>>::type
+{
+  iMatrix<vtype, Ngp> ret;
+
+  switch (G.g)
+  {
+  //Sigma matrices and their negatives
+  case GparityFlavour::Algebra::SigmaX:         lmultFlavourSigmaX(ret, arg);         break;
+  case GparityFlavour::Algebra::MinusSigmaX:    lmultFlavourMinusSigmaX(ret, arg);    break;
+  case GparityFlavour::Algebra::SigmaY:         lmultFlavourSigmaY(ret, arg);         break;
+  case GparityFlavour::Algebra::MinusSigmaY:    lmultFlavourMinusSigmaY(ret, arg);    break;
+  case GparityFlavour::Algebra::SigmaZ:         lmultFlavourSigmaZ(ret, arg);         break;
+  case GparityFlavour::Algebra::MinusSigmaZ:    lmultFlavourMinusSigmaZ(ret, arg);    break;
+
+  //Identity and its negative
+  case GparityFlavour::Algebra::Identity:       lmultFlavourIdentity(ret, arg);       break;
+  case GparityFlavour::Algebra::MinusIdentity:  lmultFlavourMinusIdentity(ret, arg);  break;
+
+  //Projectors 1/2(1 +/- sigma_y) and their negatives
+  case GparityFlavour::Algebra::ProjPlus:       lmultFlavourProjPlus(ret, arg);       break;
+  case GparityFlavour::Algebra::MinusProjPlus:  lmultFlavourMinusProjPlus(ret, arg);  break;
+  case GparityFlavour::Algebra::ProjMinus:      lmultFlavourProjMinus(ret, arg);      break;
+  case GparityFlavour::Algebra::MinusProjMinus: lmultFlavourMinusProjMinus(ret, arg); break;
+
+  //Any other value of G.g is treated as a usage error
+  default: assert(0);
+  }
+
+  return ret;
+}
+
+//Right-multiply a flavour matrix by the flavour matrix selected by G.g, i.e. return arg * G.
+//The enable_if in the trailing return type removes this overload unless
+//matchGridTensorIndex<iMatrix<vtype, Ngp>, GparityFlavourTensorIndex>::value is true.
+//Each case forwards to the matching rmultFlavour* kernel, which assigns all four elements of ret.
+//NOTE: if assert is compiled out, an unhandled G.g returns ret without it having been assigned here.
+template<class vtype>
+accelerator_inline auto operator*(const iMatrix<vtype, Ngp> &arg, const GparityFlavour &G)
+->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Ngp>, GparityFlavourTensorIndex>::value, iMatrix<vtype, Ngp>>::type
+{
+  iMatrix<vtype, Ngp> ret;
+
+  switch (G.g)
+  {
+  //Sigma matrices and their negatives
+  case GparityFlavour::Algebra::SigmaX:         rmultFlavourSigmaX(ret, arg);         break;
+  case GparityFlavour::Algebra::MinusSigmaX:    rmultFlavourMinusSigmaX(ret, arg);    break;
+  case GparityFlavour::Algebra::SigmaY:         rmultFlavourSigmaY(ret, arg);         break;
+  case GparityFlavour::Algebra::MinusSigmaY:    rmultFlavourMinusSigmaY(ret, arg);    break;
+  case GparityFlavour::Algebra::SigmaZ:         rmultFlavourSigmaZ(ret, arg);         break;
+  case GparityFlavour::Algebra::MinusSigmaZ:    rmultFlavourMinusSigmaZ(ret, arg);    break;
+
+  //Identity and its negative
+  case GparityFlavour::Algebra::Identity:       rmultFlavourIdentity(ret, arg);       break;
+  case GparityFlavour::Algebra::MinusIdentity:  rmultFlavourMinusIdentity(ret, arg);  break;
+
+  //Projectors 1/2(1 +/- sigma_y) and their negatives
+  case GparityFlavour::Algebra::ProjPlus:       rmultFlavourProjPlus(ret, arg);       break;
+  case GparityFlavour::Algebra::MinusProjPlus:  rmultFlavourMinusProjPlus(ret, arg);  break;
+  case GparityFlavour::Algebra::ProjMinus:      rmultFlavourProjMinus(ret, arg);      break;
+  case GparityFlavour::Algebra::MinusProjMinus: rmultFlavourMinusProjMinus(ret, arg); break;
+
+  //Any other value of G.g is treated as a usage error
+  default: assert(0);
+  }
+
+  return ret;
+}
+
+NAMESPACE_END(Grid);
+
+#endif // include guard
--- a/Grid/qcd/hmc/GenericHMCrunner.h
+++ b/Grid/qcd/hmc/GenericHMCrunner.h
@@ -129,18 +129,10 @@ public:
    Runner(S);
  }

-  //////////////////////////////////////////////////////////////////
-
-private:
-  template <class SmearingPolicy>
-  void Runner(SmearingPolicy &Smearing) {
-    auto UGrid = Resources.GetCartesian();
-    Resources.AddRNGs();
-    Field U(UGrid);
-
-    // Can move this outside?
-    typedef IntegratorType<SmearingPolicy> TheIntegrator;
-    TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing);
+  //Use the checkpointer to initialize the RNGs and the gauge field, writing the resulting gauge field into U.
+  //This is called automatically by Run but may be useful elsewhere, e.g. for integrator tuning experiments
+  void initializeGaugeFieldAndRNGs(Field &U){
+    if(!Resources.haveRNGs()) Resources.AddRNGs();

    if (Parameters.StartingType == "HotStart") {
      // Hot start
@@ -167,6 +159,22 @@ private:
 	<< "Valid [HotStart, ColdStart, TepidStart, CheckpointStart]\n";
      exit(1);
    }
+  }
+
+
+
+  //////////////////////////////////////////////////////////////////
+
+private:
+  template <class SmearingPolicy>
+  void Runner(SmearingPolicy &Smearing) {
+    auto UGrid = Resources.GetCartesian();
+    Field U(UGrid);
+
+    initializeGaugeFieldAndRNGs(U);
+
+    typedef IntegratorType<SmearingPolicy> TheIntegrator;
+    TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing);

    Smearing.set_Field(U);

--- a/Grid/qcd/hmc/HMC.h
+++ b/Grid/qcd/hmc/HMC.h
@@ -115,21 +115,21 @@ private:

    random(sRNG, rn_test);

-    std::cout << GridLogMessage
+    std::cout << GridLogHMC
              << "--------------------------------------------------\n";
-    std::cout << GridLogMessage << "exp(-dH) = " << prob
+    std::cout << GridLogHMC << "exp(-dH) = " << prob
              << "  Random = " << rn_test << "\n";
-    std::cout << GridLogMessage
+    std::cout << GridLogHMC
              << "Acc. Probability = " << ((prob < 1.0) ? prob : 1.0) << "\n";

    if ((prob > 1.0) || (rn_test <= prob)) {  // accepted
-      std::cout << GridLogMessage << "Metropolis_test -- ACCEPTED\n";
-      std::cout << GridLogMessage
+      std::cout << GridLogHMC << "Metropolis_test -- ACCEPTED\n";
+      std::cout << GridLogHMC
                << "--------------------------------------------------\n";
      return true;
    } else {  // rejected
-      std::cout << GridLogMessage << "Metropolis_test -- REJECTED\n";
-      std::cout << GridLogMessage
+      std::cout << GridLogHMC << "Metropolis_test -- REJECTED\n";
+      std::cout << GridLogHMC
                << "--------------------------------------------------\n";
      return false;
    }
@@ -145,7 +145,7 @@ private:

    std::streamsize current_precision = std::cout.precision();
    std::cout.precision(15);
-    std::cout << GridLogMessage << "Total H before trajectory = " << H0 << "\n";
+    std::cout << GridLogHMC << "Total H before trajectory = " << H0 << "\n";
    std::cout.precision(current_precision);

    TheIntegrator.integrate(U);
@@ -165,7 +165,7 @@ private:


    std::cout.precision(15);
-    std::cout << GridLogMessage << "Total H after trajectory  = " << H1
+    std::cout << GridLogHMC << "Total H after trajectory  = " << H1
 	      << "  dH = " << H1 - H0 << "\n";
    std::cout.precision(current_precision);
    
@@ -196,9 +196,9 @@ public:
    // Actual updates (evolve a copy Ucopy then copy back eventually)
    unsigned int FinalTrajectory = Params.Trajectories + Params.NoMetropolisUntil + Params.StartTrajectory;
    for (int traj = Params.StartTrajectory; traj < FinalTrajectory; ++traj) {
-      std::cout << GridLogMessage << "-- # Trajectory = " << traj << "\n";
+      std::cout << GridLogHMC << "-- # Trajectory = " << traj << "\n";
      if (traj < Params.StartTrajectory + Params.NoMetropolisUntil) {
-      	std::cout << GridLogMessage << "-- Thermalization" << std::endl;
+      	std::cout << GridLogHMC << "-- Thermalization" << std::endl;
      }
      
      double t0=usecond();
@@ -207,10 +207,10 @@ public:
      DeltaH = evolve_hmc_step(Ucopy);
      // Metropolis-Hastings test
      bool accept = true;
-      if (traj >= Params.StartTrajectory + Params.NoMetropolisUntil) {
+      if (Params.MetropolisTest && traj >= Params.StartTrajectory + Params.NoMetropolisUntil) {
        accept = metropolis_test(DeltaH);
      } else {
-      	std::cout << GridLogMessage << "Skipping Metropolis test" << std::endl;
+      	std::cout << GridLogHMC << "Skipping Metropolis test" << std::endl;
      }

      if (accept)
@@ -219,7 +219,7 @@ public:
     
      
      double t1=usecond();
-      std::cout << GridLogMessage << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl;
+      std::cout << GridLogHMC << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl;


      for (int obs = 0; obs < Observables.size(); obs++) {
@@ -228,7 +228,7 @@ public:
      	std::cout << GridLogDebug << "Observables pointer " << Observables[obs] << std::endl;
        Observables[obs]->TrajectoryComplete(traj + 1, Ucur, sRNG, pRNG);
      }
-      std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl;
+      std::cout << GridLogHMC << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl;
    }
  }

--- a/Grid/qcd/hmc/HMCResourceManager.h
+++ b/Grid/qcd/hmc/HMCResourceManager.h
@@ -226,6 +226,9 @@ public:
  //////////////////////////////////////////////////////
  // Random number generators
  //////////////////////////////////////////////////////
+  
+  //Return true if the RNG objects have been instantiated (reports the have_RNG flag, so a caller can test it before calling AddRNGs again)
+  bool haveRNGs() const{ return have_RNG; }

  void AddRNGs(std::string s = "") {
    // Couple the RNGs to the GridModule tagged by s
--- a/Grid/qcd/hmc/integrators/Integrator.h
+++ b/Grid/qcd/hmc/integrators/Integrator.h
@@ -136,8 +136,14 @@ protected:
      if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
      force = FieldImplementation::projectForce(force); // Ta for gauge fields
      double end_force = usecond();
-      Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites());
-      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << std::endl;
+
+      Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x]) 
+      Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    
+
+      Real max_force_abs = std::sqrt(maxLocalNorm2(force));
+      Real max_impulse_abs = max_force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    
+
+      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << " Max force: " << max_force_abs << " Time step: " << ep << " Impulse average: " << impulse_abs << " Max impulse: " << max_impulse_abs << std::endl;
      Mom -= force * ep* HMC_MOMENTUM_DENOMINATOR;; 
      double end_full = usecond();
      double time_full  = (end_full - start_full) / 1e3;
--- a/Grid/sitmo_rng/README
+++ b/Grid/sitmo_rng/README
--- a/Grid/random/gaussian.h
+++ b/Grid/random/gaussian.h
@@ -0,0 +1,200 @@
+// -*- C++ -*-
+//===--------------------------- random -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Peter Boyle: Taken from libc++ in Clang/LLVM.
+// Reason: libstdc++ and libc++ differ in the order in which they return the two values produced by the normal_distribution / Box-Muller type step.
+// Standardise on one implementation and call it "gaussian_distribution".
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <cmath>
+#include <type_traits>
+#include <initializer_list>
+#include <limits>
+#include <algorithm>
+#include <numeric>
+#include <vector>
+#include <string>
+#include <istream>
+#include <ostream>
+#include <random>
+
+// normal_distribution -> gaussian distribution
+namespace Grid {
+
+template<class _RealType = double>
+class  gaussian_distribution // Gaussian random number distribution: a renamed copy of libc++'s normal_distribution
+{
+public:
+    // types
+    typedef _RealType result_type;
+    // Parameter set of the distribution: a mean and a standard deviation (defaults 0 and 1).
+    class param_type
+    {
+        result_type __mean_;
+        result_type __stddev_;
+    public:
+        typedef gaussian_distribution distribution_type;
+
+        strong_inline
+        explicit param_type(result_type __mean = 0, result_type __stddev = 1)
+            : __mean_(__mean), __stddev_(__stddev) {}
+
+        strong_inline
+        result_type mean() const {return __mean_;}
+        strong_inline
+        result_type stddev() const {return __stddev_;}
+
+        friend strong_inline
+            bool operator==(const param_type& __x, const param_type& __y)
+            {return __x.__mean_ == __y.__mean_ && __x.__stddev_ == __y.__stddev_;}
+        friend strong_inline
+            bool operator!=(const param_type& __x, const param_type& __y)
+            {return !(__x == __y);}
+    };
+
+private:
+    param_type __p_;   // parameters used by the one-argument operator()
+    result_type _V_;   // second deviate of the most recently generated pair; meaningful only while _V_hot_ is true
+    bool _V_hot_;      // true when _V_ holds a deviate that has not been returned yet
+
+public:
+    // constructors and reset functions (_V_ is zero-initialised so that copying a fresh object never reads an indeterminate value)
+    strong_inline
+    explicit gaussian_distribution(result_type __mean = 0, result_type __stddev = 1)
+        : __p_(param_type(__mean, __stddev)), _V_(0), _V_hot_(false) {}
+    strong_inline
+    explicit gaussian_distribution(const param_type& __p)
+        : __p_(__p), _V_(0), _V_hot_(false) {}
+    strong_inline
+    void reset() {_V_hot_ = false;} // drop any cached deviate; the next call generates a fresh pair
+
+    // generating functions: the one-argument form uses the stored parameters, the two-argument form uses __p for that call only
+    template<class _URNG>
+        strong_inline
+        result_type operator()(_URNG& __g)
+        {return (*this)(__g, __p_);}
+    template<class _URNG> result_type operator()(_URNG& __g, const param_type& __p);
+
+    // property functions
+    strong_inline
+    result_type mean() const {return __p_.mean();}
+    strong_inline
+    result_type stddev() const {return __p_.stddev();}
+
+    strong_inline
+    param_type param() const {return __p_;}
+    strong_inline
+    void param(const param_type& __p) {__p_ = __p;}
+
+    strong_inline
+    result_type min() const {return -std::numeric_limits<result_type>::infinity();}
+    strong_inline
+    result_type max() const {return std::numeric_limits<result_type>::infinity();}
+    // Equality compares the parameters and the cache flag; the cached value itself is compared only when it is in use.
+    friend strong_inline
+        bool operator==(const gaussian_distribution& __x,
+                        const gaussian_distribution& __y)
+        {return __x.__p_ == __y.__p_ && __x._V_hot_ == __y._V_hot_ &&
+                (!__x._V_hot_ || __x._V_ == __y._V_);}
+    friend strong_inline
+        bool operator!=(const gaussian_distribution& __x,
+                        const gaussian_distribution& __y)
+        {return !(__x == __y);}
+    // Stream insertion/extraction are friends because they read and write the private cache state (_V_, _V_hot_).
+    template <class _CharT, class _Traits, class _RT>
+    friend
+    std::basic_ostream<_CharT, _Traits>&
+    operator<<(std::basic_ostream<_CharT, _Traits>& __os,
+               const gaussian_distribution<_RT>& __x);
+
+    template <class _CharT, class _Traits, class _RT>
+    friend
+    std::basic_istream<_CharT, _Traits>&
+    operator>>(std::basic_istream<_CharT, _Traits>& __is,
+               gaussian_distribution<_RT>& __x);
+};
+
+template <class _RealType>
+template<class _URNG>
+_RealType
+gaussian_distribution<_RealType>::operator()(_URNG& __g, const param_type& __p)
+{   // Returns one deviate scaled to mean __p.mean() and standard deviation __p.stddev(), drawing from generator __g (Marsaglia polar method).
+    result_type _Up;   // unit-variance, zero-mean deviate that is scaled on return
+    if (_V_hot_)
+    {   // A deviate left over from the previous pair is available: consume it without touching __g.
+        _V_hot_ = false;
+        _Up = _V_;
+    }
+    else
+    {   // Generate a new pair of deviates.
+        std::uniform_real_distribution<result_type> _Uni(-1, 1);
+        result_type __u;
+        result_type __v;
+        result_type __s;
+        do
+        {   // __u is drawn before __v; keep this order, the generated sequence depends on it
+            __u = _Uni(__g);
+            __v = _Uni(__g);
+            __s = __u * __u + __v * __v;
+        } while (__s > 1 || __s == 0);   // reject points outside the unit disc and the origin (log(0), division by 0)
+        result_type _Fp = std::sqrt(-2 * std::log(__s) / __s);
+        _V_ = __v * _Fp;   // second deviate is cached unscaled; it is scaled by the parameters of the call that consumes it
+        _V_hot_ = true;
+        _Up = __u * _Fp;   // first deviate of the pair is returned now
+    }
+    return _Up * __p.stddev() + __p.mean();
+}
+
+// Writes "mean stddev hot [cached]" separated by spaces, in decimal scientific notation with max_digits10 precision so that operator>> can restore the exact values (hexfloat output is not accepted by every standard library's operator>>).
+template <class _CharT, class _Traits, class _RT>
+std::basic_ostream<_CharT, _Traits>&
+operator<<(std::basic_ostream<_CharT, _Traits>& __os, const gaussian_distribution<_RT>& __x)
+{
+    const _CharT __sp = __os.widen(' ');
+    const auto __save_flags = __os.flags(std::ios_base::dec | std::ios_base::left | std::ios_base::scientific);
+    const auto __save_prec  = __os.precision(std::numeric_limits<_RT>::max_digits10);
+    const auto __save_fill  = __os.fill(__sp);   // flags, precision and fill of the caller's stream are all restored below
+    __os << __x.mean() << __sp << __x.stddev() << __sp << __x._V_hot_;
+    if (__x._V_hot_) __os << __sp << __x._V_;    // the cached deviate is written only when it is in use
+    __os.fill(__save_fill);
+    __os.precision(__save_prec);
+    __os.flags(__save_flags);
+    return __os;
+}
+
+// Stream extraction for gaussian_distribution: reads "mean stddev hot [cached]" in the field order used by operator<<.
+// The target distribution is modified only if no read failed; the stream's format flags are put back before returning.
+template <class _CharT, class _Traits, class _RT>
+std::basic_istream<_CharT, _Traits>&
+operator>>(std::basic_istream<_CharT, _Traits>& __is, gaussian_distribution<_RT>& __x)
+{
+    using _Dist   = gaussian_distribution<_RT>;
+    using _Real   = typename _Dist::result_type;
+    using _Params = typename _Dist::param_type;
+
+    // flags(new) returns the previous flags, which are restored at the end.
+    const auto __old_flags = __is.flags(std::ios_base::dec | std::ios_base::skipws);
+
+    _Real __m;
+    _Real __sd;
+    _Real __cached = 0;   // stays 0 when the stream carries no cached deviate
+    bool  __hot = false;
+    __is >> __m >> __sd >> __hot;
+    if (__hot) __is >> __cached;
+
+    const bool __ok = !__is.fail();
+    if (__ok) { __x.param(_Params(__m, __sd)); __x._V_hot_ = __hot; __x._V_ = __cached; }
+
+    __is.flags(__old_flags);
+    return __is;
+}
+}
--- a/Grid/sitmo_rng/sitmo_prng_engine.hpp
+++ b/Grid/sitmo_rng/sitmo_prng_engine.hpp
--- a/Grid/tensors/Tensor_extract_merge.h
+++ b/Grid/tensors/Tensor_extract_merge.h
@@ -208,5 +208,46 @@ void merge(vobj &vec,ExtractPointerArray<sobj> &extracted, int offset)
 }


+
+//////////////////////////////////////////////////////////////////////////////////
+//Copy a single lane of a SIMD tensor type from one object to another
+//Output object must be of the same tensor type but may be of a different precision (i.e. it can have a different root data type)
+//  vecOut, lane_out : destination object and the lane within it that is overwritten; its other lanes are not written
+//  vecIn,  lane_in  : source object and the lane within it that is read
+//Both references are __restrict__ qualified, so vecOut and vecIn must not alias. Lane indices are not range checked.
+///////////////////////////////////////////////////////////////////////////////////
+template<class vobjOut, class vobjIn>
+accelerator_inline 
+void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __restrict__ vecIn, int lane_in)
+{
+  static_assert( std::is_same<typename vobjOut::DoublePrecision, typename vobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
+  typedef typename vobjOut::vector_type ovector_type;
+  typedef typename vobjIn::vector_type ivector_type;
+  constexpr int owords=sizeof(vobjOut)/sizeof(ovector_type);
+  constexpr int iwords=sizeof(vobjIn)/sizeof(ivector_type);
+  static_assert( owords == iwords, "copyLane: Expected number of vector words in input and output objects to be equal" );
+
+  typedef typename vobjOut::scalar_type oscalar_type;
+  typedef typename vobjIn::scalar_type iscalar_type;
+  typedef typename ExtractTypeMap<oscalar_type>::extract_type oextract_type;
+  typedef typename ExtractTypeMap<iscalar_type>::extract_type iextract_type;
+  //Objects are addressed as arrays of extract_type; the input pointer is pointer-to-const so the constness of vecIn is not cast away
+  typedef oextract_type * opointer;
+  typedef const iextract_type * ipointer;
+  constexpr int oNsimd=ovector_type::Nsimd();
+  constexpr int iNsimd=ivector_type::Nsimd();
+
+  iscalar_type itmp;
+  oscalar_type otmp;
+  opointer __restrict__  op = (opointer)&vecOut;
+  ipointer __restrict__  ip = (ipointer)&vecIn;
+  for(int w=0;w<owords;w++){ //vector word w of lane L sits at element offset L + Nsimd*w
+    memcpy( &itmp, ip + lane_in + iNsimd*w, sizeof(iscalar_type) );
+    otmp = itmp; //potential precision change
+    memcpy( op + lane_out + oNsimd*w, &otmp, sizeof(oscalar_type) );
+  }
+}
+
+
 NAMESPACE_END(Grid);

--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -192,7 +192,7 @@ inline void *acceleratorAllocShared(size_t bytes)
  auto err = cudaMallocManaged((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
-    printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
+    printf(" cudaMallocManaged failed for %zu %s \n",bytes,cudaGetErrorString(err)); // bytes is size_t, whose printf conversion is %zu
  }
  return ptr;
 };
@@ -202,7 +202,7 @@ inline void *acceleratorAllocDevice(size_t bytes)
  auto err = cudaMalloc((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
-    printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
+    printf(" cudaMalloc failed for %zu %s \n",bytes,cudaGetErrorString(err)); // bytes is size_t, whose printf conversion is %zu
  }
  return ptr;
 };