Merge pull request #407 from giltirn/feature/dirichlet-gparity-stage

Import round 4
Imported changes from feature/gparity_HMC branch:
2026-06-18 18:03:44 +01:00 · 2022-06-22 15:23:36 -04:00 · 2022-06-22 10:27:48 -04:00 · 2022-06-06 18:45:37 -04:00 · 2022-06-02 15:30:41 -04:00 · 2022-05-25 13:29:45 -04:00
158 changed files with 17902 additions and 17301 deletions
@@ -44,22 +44,14 @@ directory
 #ifdef __NVCC__
 //disables nvcc specific warning in json.hpp
 #pragma clang diagnostic ignored "-Wdeprecated-register"
-
-#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
- //disables nvcc specific warning in json.hpp
-#pragma nv_diag_suppress unsigned_compare_with_zero
-#pragma nv_diag_suppress cast_to_qualified_type
- //disables nvcc specific warning in many files
-#pragma nv_diag_suppress esa_on_defaulted_function_ignored
-#pragma nv_diag_suppress extra_semicolon
-#else
- //disables nvcc specific warning in json.hpp
 #pragma diag_suppress unsigned_compare_with_zero
 #pragma diag_suppress cast_to_qualified_type
+
 //disables nvcc specific warning in many files
 #pragma diag_suppress esa_on_defaulted_function_ignored
 #pragma diag_suppress extra_semicolon
-#endif
+
+//Eigen only
 #endif

 // Disable vectorisation in Eigen on the Power8/9 and PowerPC
@@ -36,6 +36,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridCore.h>
 #include <Grid/qcd/QCD.h>
 #include <Grid/qcd/spin/Spin.h>
+#include <Grid/qcd/gparity/Gparity.h>
 #include <Grid/qcd/utils/Utils.h>
 #include <Grid/qcd/representations/Representations.h>
 NAMESPACE_CHECK(GridQCDCore);
@@ -14,11 +14,7 @@
 /* NVCC save and restore compile environment*/
 #ifdef __NVCC__
 #pragma push
-#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
-#pragma nv_diag_suppress code_is_unreachable
-#else
 #pragma diag_suppress code_is_unreachable
-#endif
 #pragma push_macro("__CUDA_ARCH__")
 #pragma push_macro("__NVCC__")
 #pragma push_macro("__CUDACC__")
@@ -54,6 +54,7 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
+#include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
 #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
@@ -262,7 +262,7 @@ public:
 	autoView( Tnp_v , (*Tnp), AcceleratorWrite);
 	autoView( Tnm_v , (*Tnm), AcceleratorWrite);
 	const int Nsimd = CComplex::Nsimd();
-	accelerator_for(ss, FineGrid->oSites(), Nsimd, {
+	accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
 	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
 	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
        });
@@ -264,7 +264,7 @@ public:
      auto Tnp_v = Tnp->View();
      auto Tnm_v = Tnm->View();
      constexpr int Nsimd = vector_type::Nsimd();
-      accelerator_for(ss, in.Grid()->oSites(), Nsimd, {
+      accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
 	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
 	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
      });
@@ -49,6 +49,7 @@ NAMESPACE_BEGIN(Grid);
    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
+    RealD TrueResidual;

    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
    LinearFunction<FieldF> *guesser;
@@ -68,6 +69,7 @@ NAMESPACE_BEGIN(Grid);
    }
  
  void operator() (const FieldD &src_d_in, FieldD &sol_d){
+    std::cout << GridLogMessage << "MixedPrecisionConjugateGradient: Starting mixed precision CG with outer tolerance " << Tolerance << " and inner tolerance " << InnerTolerance << std::endl;
    TotalInnerIterations = 0;
 	
    GridStopWatch TotalTimer;
@@ -97,6 +99,7 @@ NAMESPACE_BEGIN(Grid);
    FieldF sol_f(SinglePrecGrid);
    sol_f.Checkerboard() = cb;
    
+    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting initial inner CG with tolerance " << inner_tol << std::endl;
    ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
    CG_f.ErrorOnNoConverge = false;

@@ -130,6 +133,7 @@ NAMESPACE_BEGIN(Grid);
 	(*guesser)(src_f, sol_f);

      //Inner CG
+      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " << outer_iter << " starting inner CG with tolerance " << inner_tol << std::endl;
      CG_f.Tolerance = inner_tol;
      InnerCGtimer.Start();
      CG_f(Linop_f, src_f, sol_f);
@@ -150,6 +154,7 @@ NAMESPACE_BEGIN(Grid);
    ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
    CG_d(Linop_d, src_d_in, sol_d);
    TotalFinalStepIterations = CG_d.IterationsToComplete;
+    TrueResidual = CG_d.TrueResidual;

    TotalTimer.Stop();
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
@@ -52,7 +52,7 @@ public:
  MultiShiftFunction shifts;
  std::vector<RealD> TrueResidualShift;

-  ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : 
+  ConjugateGradientMultiShift(Integer maxit, const MultiShiftFunction &_shifts) : 
    MaxIterations(maxit),
    shifts(_shifts)
  { 
@@ -182,6 +182,9 @@ public:
    for(int s=0;s<nshift;s++) {
      axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
    }
+
+    std::cout << GridLogIterative << "ConjugateGradientMultiShift: initial rn (|src|^2) =" << rn << " qq (|MdagM src|^2) =" << qq << " d ( dot(src, [MdagM + m_0]src) ) =" << d << " c=" << c << std::endl;
+    
  
  ///////////////////////////////////////
  // Timers
@@ -0,0 +1,409 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Christopher Kelly <ckelly@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
+#define GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
+
+NAMESPACE_BEGIN(Grid);
+
+//CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision. 
+//The residual is stored in single precision, but the search directions and solution are stored in double precision. 
+//Every update_freq iterations the residual is corrected in double precision. 
+    
+//For safety, a final (mixed precision) CG is applied per shift to clean up if necessary
+
+//Linop to add shift to input linop, used in cleanup CG
+namespace ConjugateGradientMultiShiftMixedPrecSupport{
+//Wraps a linear operator so that HermOp also adds shift*in to the result; all other entry points assert
+template<typename Field>
+class ShiftedLinop: public LinearOperatorBase<Field>{
+public:
+  LinearOperatorBase<Field> &linop_base; //wrapped operator (held by reference)
+  RealD shift;                           //real constant multiplying 'in' in HermOp
+  ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift): linop_base(_linop_base), shift(_shift){}
+
+  //Entry points that are not provided by the shifted operator
+  void Op       (const Field &in, Field &out){ assert(0); }
+  void AdjOp    (const Field &in, Field &out){ assert(0); }
+  void OpDiag   (const Field &in, Field &out){ assert(0); }
+  void OpDir    (const Field &in, Field &out,int dir,int disp){ assert(0); }
+  void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); }
+
+  void HermOp(const Field &in, Field &out){ //out = linop_base.HermOp(in) + shift*in
+    linop_base.HermOp(in, out);
+    axpy(out, shift, in, out);
+  }
+
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ //as HermOp; n1 = Re(in,out), n2 = norm2(out)
+    this->HermOp(in,out);
+    const ComplexD in_dot_out = innerProduct(in,out);
+    n1 = real(in_dot_out);
+    n2 = norm2(out);
+  }
+};
+}
+
+
+template<class FieldD, class FieldF,
+	 typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
+	 typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
+class ConjugateGradientMultiShiftMixedPrec : public OperatorMultiFunction<FieldD>,
+					     public OperatorFunction<FieldD>
+{
+public:                                                
+
+  using OperatorFunction<FieldD>::operator();
+
+  RealD   Tolerance;
+  Integer MaxIterations;
+  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
+  std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
+  int verbose;
+  MultiShiftFunction shifts;
+  std::vector<RealD> TrueResidualShift;
+
+  int ReliableUpdateFreq; //number of iterations between reliable updates
+
+  GridBase* SinglePrecGrid; //Grid for single-precision fields
+  LinearOperatorBase<FieldF> &Linop_f; //single precision
+
+  //maxit: iteration cap of the multi-shift loop (also handed to the cleanup CG). _shifts: poles, tolerances, residues, norm and order. _SinglePrecGrid/_Linop_f: grid and operator for the single precision multiplies. _ReliableUpdateFreq: iterations between double precision residual replacements; must be non-zero
+  ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
+				       GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f, int _ReliableUpdateFreq) : 
+    //Initialisers are written in member declaration order (the order in which they run); Tolerance and IterationsToComplete get defined values instead of being left indeterminate
+    Tolerance(0.), MaxIterations(maxit), IterationsToComplete(0), verbose(1), shifts(_shifts), ReliableUpdateFreq(_ReliableUpdateFreq), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f)
+  { 
+    assert(ReliableUpdateFreq != 0); //used as the modulus in "k % ReliableUpdateFreq"; zero would be undefined behaviour
+    IterationsToCompleteShift.resize(_shifts.order);
+    TrueResidualShift.resize(_shifts.order);
+  }
+
+  //Solve every shift into temporaries and return only the combined field psi (see the four-argument overload)
+  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
+  {
+    //One temporary solution field per shift, living on the grid of the source
+    std::vector<FieldD> shift_solutions(shifts.order, src.Grid());
+    this->operator()(Linop, src, shift_solutions, psi);
+  }
+  //Run the multi-shift solve into 'results' (one field per shift), then assemble
+  //psi = shifts.norm*src + sum_i shifts.residues[i]*results[i]
+  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
+  {
+    //Per-shift solutions
+    this->operator()(Linop, src, results);
+
+    //Start from the norm term, then accumulate each residue-weighted solution
+    psi = shifts.norm*src;
+    const int npoles = shifts.order;
+    for(int pole=0; pole<npoles; ++pole)
+      psi = psi + shifts.residues[pole]*results[pole];
+  }
+
+  void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d) //Multi-shift solve: psi_d[s] receives the solution for shift s. Matrix multiplies in the loop use Linop_f (single precision); Linop_d is used for the reliable updates, the final residual check and the cleanup
+  { 
+    GridBase *DoublePrecGrid = src_d.Grid();
+
+    ////////////////////////////////////////////////////////////////////////
+    // Convenience references to the info stored in "MultiShiftFunction"
+    ////////////////////////////////////////////////////////////////////////
+    int nshift = shifts.order;
+
+    std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
+    std::vector<RealD> &mresidual(shifts.tolerances); // per-shift tolerance; squared and scaled by |src|^2 to form rsq[] below
+    std::vector<RealD> alpha(nshift,1.0); // all ones; never modified in this routine
+
+    //Double precision search directions
+    FieldD p_d(DoublePrecGrid);
+    std::vector<FieldD> ps_d(nshift, DoublePrecGrid);// Search directions (double precision)
+
+    FieldD tmp_d(DoublePrecGrid);
+    FieldD r_d(DoublePrecGrid);
+    FieldD mmp_d(DoublePrecGrid);
+
+    assert(psi_d.size()==nshift);
+    assert(mass.size()==nshift);
+    assert(mresidual.size()==nshift);
+  
+    // dynamic sized arrays on stack; 2d is a pain with vector
+    RealD  bs[nshift]; // per-shift step coefficient
+    RealD  rsq[nshift]; // per-shift target for the squared residual
+    RealD  z[nshift][2]; // two-entry history of the z recurrence, addressed with iz and 1-iz
+    int     converged[nshift]; // per-shift flag, set once the estimated residual drops below rsq[s]
+  
+    const int       primary =0;
+  
+    //Primary shift fields CG iteration
+    RealD a,b,c,d;
+    RealD cp,bp,qq; //prev
+  
+    // Matrix mult fields
+    FieldF r_f(SinglePrecGrid);
+    FieldF p_f(SinglePrecGrid);
+    FieldF tmp_f(SinglePrecGrid); // not referenced again in this routine
+    FieldF mmp_f(SinglePrecGrid);
+    FieldF src_f(SinglePrecGrid);
+    precisionChange(src_f, src_d); // single precision copy of the source
+
+    // Check lightest mass
+    for(int s=0;s<nshift;s++){
+      assert( mass[s]>= mass[primary] ); // the primary (index 0) shift must carry the smallest pole
+      converged[s]=0;
+    }
+  
+    // Wire guess to zero
+    // Residuals "r" are src
+    // First search direction "p" is also src
+    cp = norm2(src_d);
+
+    // Handle trivial case of zero src.
+    if( cp == 0. ){
+      for(int s=0;s<nshift;s++){
+	psi_d[s] = Zero();
+	IterationsToCompleteShift[s] = 1;
+	TrueResidualShift[s] = 0.;
+      }
+      return;
+    }
+
+    for(int s=0;s<nshift;s++){
+      rsq[s] = cp * mresidual[s] * mresidual[s]; // target |r|^2 = |src|^2 * tolerance^2
+      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
+      ps_d[s] = src_d;
+    }
+    // r and p for primary
+    r_f=src_f; //residual maintained in single
+    p_f=src_f;
+    p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
+  
+    //MdagM+m[0]
+    Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
+    axpy(mmp_f,mass[0],p_f,mmp_f);
+    RealD rn = norm2(p_f);
+    d += rn*mass[0];
+
+    b = -cp /d;
+  
+    // Set up the various shift variables
+    int       iz=0;
+    z[0][1-iz] = 1.0;
+    z[0][iz]   = 1.0;
+    bs[0]      = b;
+    for(int s=1;s<nshift;s++){
+      z[s][1-iz] = 1.0;
+      z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0]));
+      bs[s]      = b*z[s][iz]; 
+    }
+  
+    // r += b[0] A.p[0]
+    // c= norm(r)
+    c=axpy_norm(r_f,b,mmp_f,r_f);
+  
+    for(int s=0;s<nshift;s++) {
+      axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d); // first solution update: psi_d[s] = -bs[s]*alpha[s]*src_d
+    }
+  
+    ///////////////////////////////////////
+    // Timers
+    ///////////////////////////////////////
+    GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
+
+    SolverTimer.Start();
+  
+    // Iteration loop
+    int k;
+  
+    for (k=1;k<=MaxIterations;k++){    
+      a = c /cp;
+
+      //Update double precision search direction by residual
+      PrecChangeTimer.Start();
+      precisionChange(r_d, r_f);
+      PrecChangeTimer.Stop();
+
+      AXPYTimer.Start();
+      axpy(p_d,a,p_d,r_d); // p_d = a*p_d + r_d
+
+      for(int s=0;s<nshift;s++){
+	if ( ! converged[s] ) { 
+	  if (s==0){
+	    axpy(ps_d[s],a,ps_d[s],r_d);
+	  } else{
+	    RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
+	    axpby(ps_d[s],z[s][iz],as,r_d,ps_d[s]);
+	  }
+	}
+      }
+      AXPYTimer.Stop();
+
+      PrecChangeTimer.Start();
+      precisionChange(p_f, p_d); //get back single prec search direction for linop
+      PrecChangeTimer.Stop();
+
+      cp=c;
+      MatrixTimer.Start();  
+      Linop_f.HermOp(p_f,mmp_f); 
+      d=real(innerProduct(p_f,mmp_f));    
+      MatrixTimer.Stop();  
+
+      AXPYTimer.Start();
+      axpy(mmp_f,mass[0],p_f,mmp_f); // add the primary shift: mmp_f = mass[0]*p_f + mmp_f
+      AXPYTimer.Stop();
+      RealD rn = norm2(p_f);
+      d += rn*mass[0];
+    
+      bp=b;
+      b=-cp/d;
+    
+      // Toggle the recurrence history
+      bs[0] = b;
+      iz = 1-iz;
+      ShiftTimer.Start();
+      for(int s=1;s<nshift;s++){
+	if((!converged[s])){
+	  RealD z0 = z[s][1-iz];
+	  RealD z1 = z[s][iz];
+	  z[s][iz] = z0*z1*bp
+	    / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b)); 
+	  bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
+	}
+      }
+      ShiftTimer.Stop();
+
+      //Update double precision solutions
+      AXPYTimer.Start();
+      for(int s=0;s<nshift;s++){
+	int ss = s;
+	if( (!converged[s]) ) { 
+	  axpy(psi_d[ss],-bs[s]*alpha[s],ps_d[s],psi_d[ss]);
+	}
+      }
+
+      //Perform reliable update if necessary; otherwise update residual from single-prec mmp
+      RealD c_f = axpy_norm(r_f,b,mmp_f,r_f);
+      AXPYTimer.Stop();
+
+      c = c_f;
+
+      if(k % ReliableUpdateFreq == 0){ // ReliableUpdateFreq must be non-zero for this modulus to be defined
+	//Replace r with true residual
+	MatrixTimer.Start();  
+	Linop_d.HermOp(psi_d[0],mmp_d); 
+	MatrixTimer.Stop();  
+
+	AXPYTimer.Start();
+	axpy(mmp_d,mass[0],psi_d[0],mmp_d);
+
+	RealD c_d = axpy_norm(r_d, -1.0, mmp_d, src_d); // r_d = src_d - mmp_d, computed in double precision from the primary solution
+	AXPYTimer.Stop();
+
+	std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_f <<" with |r|^2 = "<<c_d<<std::endl;
+	
+	PrecChangeTimer.Start();
+	precisionChange(r_f, r_d);
+	PrecChangeTimer.Stop();
+	c = c_d;
+      }
+    
+      // Convergence checks
+      int all_converged = 1;
+      for(int s=0;s<nshift;s++){
+      
+	if ( (!converged[s]) ){
+	  IterationsToCompleteShift[s] = k;
+	
+	  RealD css  = c * z[s][iz]* z[s][iz]; // residual estimate for shift s: primary |r|^2 scaled by z^2
+	
+	  if(css<rsq[s]){
+	    if ( ! converged[s] )
+	      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
+	    converged[s]=1;
+	  } else {
+	    all_converged=0;
+	  }
+
+	}
+      }
+
+      if ( all_converged ){
+
+	SolverTimer.Stop();
+	std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
+	std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
+      
+	// Check answers 
+	for(int s=0; s < nshift; s++) { 
+	  Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
+	  axpy(tmp_d,mass[s],psi_d[s],mmp_d);
+	  axpy(r_d,-alpha[s],src_d,tmp_d); // r_d = (MdagM + mass[s]) psi_d[s] - alpha[s]*src_d
+	  RealD rn = norm2(r_d);
+	  RealD cn = norm2(src_d);
+	  TrueResidualShift[s] = std::sqrt(rn/cn);
+	  std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
+
+	  //If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
+	  if(rn >= rsq[s]){ // both sides are squared norms
+	    CleanupTimer.Start();
+	    std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: performing cleanup step for shift " << s << std::endl;
+
+	    //Setup linear operators for final cleanup
+	    ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
+	    ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
+					       
+	    MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d); 
+	    cg(src_d, psi_d[s]);
+	    
+	    TrueResidualShift[s] = cg.TrueResidual; // overwrite with the residual reported by the cleanup solver
+	    CleanupTimer.Stop();
+	  }
+	}
+
+	std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrec: Time Breakdown for body"<<std::endl;
+	std::cout << GridLogMessage << "\tSolver    " << SolverTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\t\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\t\tMatrix    " << MatrixTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\t\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed()     <<std::endl;
+	std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
+
+	IterationsToComplete = k;	
+
+	return;
+      }
+
+   
+    }
+    // ugly hack: reached only when MaxIterations is exhausted; returns without the residual check or cleanup above, and without setting IterationsToComplete or TrueResidualShift
+    std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
+    //  assert(0);
+  }
+
+};
+NAMESPACE_END(Grid);
+#endif
@@ -113,43 +113,7 @@ public:
    blockPromote(guess_coarse,guess,subspace);
    guess.Checkerboard() = src.Checkerboard();
  };
-
-  void operator()(const std::vector<FineField> &src,std::vector<FineField> &guess) {
-    int Nevec = (int)evec_coarse.size();
-    int Nsrc = (int)src.size();
-    // make temp variables
-    std::vector<CoarseField> src_coarse(Nsrc,evec_coarse[0].Grid());
-    std::vector<CoarseField> guess_coarse(Nsrc,evec_coarse[0].Grid());    
-    //Preporcessing
-    std::cout << GridLogMessage << "Start BlockProject for loop" << std::endl;
-    for (int j=0;j<Nsrc;j++)
-    {
-    guess_coarse[j] = Zero();
-    std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
-    blockProject(src_coarse[j],src[j],subspace);
-    }
-    //deflation set up for eigen vector batchsize 1 and source batch size equal number of sources
-    std::cout << GridLogMessage << "Start ProjectAccum for loop" << std::endl;
-    for (int i=0;i<Nevec;i++)
-    {
-      std::cout << GridLogMessage << "ProjectAccum Nvec: " << i << std::endl;
-      const CoarseField & tmp = evec_coarse[i];
-      for (int j=0;j<Nsrc;j++)
-      {
-        axpy(guess_coarse[j],TensorRemove(innerProduct(tmp,src_coarse[j])) / eval_coarse[i],tmp,guess_coarse[j]);
-      }
-    }
-    //postprocessing
-    std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl;
-    for (int j=0;j<Nsrc;j++)
-    {
-    std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
-    blockPromote(guess_coarse[j],guess[j],subspace);
-    guess[j].Checkerboard() = src[j].Checkerboard();
-    }
-  };
-
-  };
+};



@@ -44,6 +44,7 @@ public:
 				  int, MinRes);    // Must restart
 };

+//This class is the input parameter class for some testing programs
 struct LocalCoherenceLanczosParams : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
@@ -155,6 +156,7 @@ public:
      _coarse_relax_tol(coarse_relax_tol)  
  {    };

+  //evalMaxApprox: approximation of largest eval of the fine Chebyshev operator (suitably wrapped by block projection)
  int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  {
    CoarseField v(B);
@@ -181,8 +183,16 @@ public:
    if( (vv<eresid*eresid) ) conv = 1;
    return conv;
  }
-  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
+
+  //This function is called at the end of the coarse grid Lanczos. It promotes the coarse eigenvector 'B' to the fine grid,
+  //applies a smoother to the result then computes the *fine grid* eigenvalue (output as 'eval').
+
+  //evalMaxApprox should be the approximation of the largest eval of the fine Hermop. However when this function is called by IRL it actually passes the largest eval of the *Chebyshev* operator (as this is the max approx used for the TestConvergence above)
+  //As the largest eval of the Chebyshev is typically several orders of magnitude larger this makes the convergence test pass even when it should not.
+  //We therefore ignore evalMaxApprox here and use a value of 1.0 (note this value is already used by TestCoarse)
+  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)  
  {
+    evalMaxApprox = 1.0; //cf above
    GridBase *FineGrid = _subspace[0].Grid();    
    int checkerboard   = _subspace[0].Checkerboard();
    FineField fB(FineGrid);fB.Checkerboard() =checkerboard;
@@ -201,13 +211,13 @@ public:
    eval   = vnum/vden;
    fv -= eval*fB;
    RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);
-
+    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
+    
    std::cout.precision(13);
    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
 	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
-	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
+	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv << " target " << eresid*eresid
 	     <<std::endl;
-    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
    if( (vv<eresid*eresid) ) return 1;
    return 0;
  }
@@ -285,6 +295,10 @@ public:
    evals_coarse.resize(0);
  };

+  //The block inner product is the inner product on the fine grid locally summed over the blocks
+  //to give a Lattice<Scalar> on the coarse grid. This function orthonormalizes the fine-grid subspace
+  //vectors under the block inner product. This step must be performed after computing the fine grid
+  //eigenvectors and before computing the coarse grid eigenvectors.    
  void Orthogonalise(void ) {
    CoarseScalar InnerProd(_CoarseGrid);
    std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
@@ -328,6 +342,8 @@ public:
    }
  }

+  //While this method serves to check the coarse eigenvectors, it also recomputes the eigenvalues from the smoothed reconstructed eigenvectors
+  //hence the smoother can be tuned after running the coarse Lanczos by using a different smoother here
  void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) 
  {
    assert(evals_fine.size() == nbasis);
@@ -376,25 +392,31 @@ public:
    evals_fine.resize(nbasis);
    subspace.resize(nbasis,_FineGrid);
  }
+
+
+  //cheby_op: Parameters of the fine grid Chebyshev polynomial used for the Lanczos acceleration
+  //cheby_smooth: Parameters of a separate Chebyshev polynomial used after the Lanczos has completed to smooth out high frequency noise in the reconstructed fine grid eigenvectors prior to computing the eigenvalue
+  //relax: Reconstructed eigenvectors (post smoothing) are naturally not as precise as true eigenvectors. This factor acts as a multiplier on the stopping condition when determining whether the results satisfy the user provided stopping condition
  void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
 		  int Nstop, int Nk, int Nm,RealD resid, 
 		  RealD MaxIt, RealD betastp, int MinRes)
  {
-    Chebyshev<FineField>                          Cheby(cheby_op);
-    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace);
-    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
+    Chebyshev<FineField>                          Cheby(cheby_op); //Chebyshev of fine operator on fine grid
+    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace); //Fine operator on coarse grid with intermediate fine grid conversion
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace); //Chebyshev of fine operator on coarse grid with intermediate fine grid conversion
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
    //////////////////////////////////////////////////////////////////////////////////////////////////

-    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth);
-    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
+    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth); //lower order Chebyshev of fine operator on fine grid used to smooth regenerated eigenvectors
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax); 

    evals_coarse.resize(Nm);
    evec_coarse.resize(Nm,_CoarseGrid);

    CoarseField src(_CoarseGrid);     src=1.0; 

+    //Note the "tester" here is also responsible for generating the fine grid eigenvalues which are output into the "evals_coarse" array
    ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
    int Nconv=0;
    IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
@@ -405,6 +427,14 @@ public:
      std::cout << i << " Coarse eval = " << evals_coarse[i]  << std::endl;
    }
  }
+
+  //Get the fine eigenvector 'i' by reconstruction: evec_coarse[i] is block-promoted through 'subspace' into 'evec'
+  //and 'eval' receives evals_coarse[i]. 'i' must index an existing coarse eigenpair.
+  void getFineEvecEval(FineField &evec, RealD &eval, const int i) const{
+    assert(i >= 0 && i < (int)evec_coarse.size() && i < (int)evals_coarse.size()); //the operator[] accesses below are unchecked
+    blockPromote(evec_coarse[i],evec,subspace);  
+    eval = evals_coarse[i];
+  }
 };

 NAMESPACE_END(Grid);
@@ -29,6 +29,8 @@ template<class Field> class PowerMethod
      RealD vnum = real(innerProduct(src_n,tmp)); // HermOp. 
      RealD vden = norm2(src_n); 
      RealD na = vnum/vden; 
+
+      std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
      
      if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) { 
 	evalMaxApprox = na; 
@@ -53,10 +53,11 @@ public:
  // Communicator should know nothing of the physics grid, only processor grid.
  ////////////////////////////////////////////
  int              _Nprocessors;     // How many in all
-  Coordinate _processors;      // Which dimensions get relayed out over processors lanes.
  int              _processor;       // linear processor rank
-  Coordinate _processor_coor;  // linear processor coordinate
  unsigned long    _ndimension;
+  Coordinate _shm_processors;  // Which dimensions get relayed out over processors lanes.
+  Coordinate _processors;      // Which dimensions get relayed out over processors lanes.
+  Coordinate _processor_coor;  // linear processor coordinate
  static Grid_MPI_Comm      communicator_world;
  Grid_MPI_Comm             communicator;
  std::vector<Grid_MPI_Comm> communicator_halo;
@@ -97,8 +98,9 @@ public:
  int                      BossRank(void)          ;
  int                      ThisRank(void)          ;
  const Coordinate & ThisProcessorCoor(void) ;
+  const Coordinate & ShmGrid(void)  { return _shm_processors; }  ;
  const Coordinate & ProcessorGrid(void)     ;
-  int                      ProcessorCount(void)    ;
+  int                ProcessorCount(void)    ;

  ////////////////////////////////////////////////////////////////////////////////
  // very VERY rarely (Log, serial RNG) we need world without a grid
@@ -142,16 +144,16 @@ public:
 		      int bytes);
  
  double StencilSendToRecvFrom(void *xmit,
-			       int xmit_to_rank,
+			       int xmit_to_rank,int do_xmit,
 			       void *recv,
-			       int recv_from_rank,
+			       int recv_from_rank,int do_recv,
 			       int bytes,int dir);

  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 				    void *xmit,
-				    int xmit_to_rank,
+				    int xmit_to_rank,int do_xmit,
 				    void *recv,
-				    int recv_from_rank,
+				    int recv_from_rank,int do_recv,
 				    int bytes,int dir);
  
  
@@ -106,7 +106,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
  // Remap using the shared memory optimising routine
  // The remap creates a comm which must be freed
  ////////////////////////////////////////////////////
-  GlobalSharedMemory::OptimalCommunicator    (processors,optimal_comm);
+  GlobalSharedMemory::OptimalCommunicator    (processors,optimal_comm,_shm_processors);
  InitFromMPICommunicator(processors,optimal_comm);
  SetCommunicator(optimal_comm);
  ///////////////////////////////////////////////////
@@ -124,12 +124,13 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
  int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
  Coordinate parent_processor_coor(_ndimension,0);
  Coordinate parent_processors    (_ndimension,1);
-
+  Coordinate shm_processors       (_ndimension,1);
  // Can make 5d grid from 4d etc...
  int pad = _ndimension-parent_ndimension;
  for(int d=0;d<parent_ndimension;d++){
    parent_processor_coor[pad+d]=parent._processor_coor[d];
    parent_processors    [pad+d]=parent._processors[d];
+    shm_processors       [pad+d]=parent._shm_processors[d];
  }

  //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -154,6 +155,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
    ccoor[d] = parent_processor_coor[d] % processors[d];
    scoor[d] = parent_processor_coor[d] / processors[d];
    ssize[d] = parent_processors[d]     / processors[d];
+    if ( processors[d] < shm_processors[d] ) shm_processors[d] = processors[d]; // subnode splitting.
  }

  // rank within subcomm ; srank is rank of subcomm within blocks of subcomms
@@ -335,22 +337,22 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 }
 // Basic Halo comms primitive
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
-						     int dest,
+						     int dest, int dox,
 						     void *recv,
-						     int from,
+						     int from, int dor,
 						     int bytes,int dir)
 {
  std::vector<CommsRequest_t> list;
-  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
+  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,dir);
  StencilSendToRecvFromComplete(list,dir);
  return offbytes;
 }

 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
-							 int dest,
+							 int dest,int dox,
 							 void *recv,
-							 int from,
+							 int from,int dor,
 							 int bytes,int dir)
 {
  int ncomm  =communicator_halo.size();
@@ -370,31 +372,35 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  double off_node_bytes=0.0;
  int tag;

-  if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
-    tag= dir+from*32;
-    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
-    assert(ierr==0);
-    list.push_back(rrq);
-    off_node_bytes+=bytes;
+  if ( dor ) { // dor (do_recv) gates the receive side: post MPI_Irecv only when a receive is requested
+    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
+      tag= dir+from*32;
+      ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+      assert(ierr==0);
+      list.push_back(rrq);
+      off_node_bytes+=bytes;
+    }
  }
-
-  if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
-    tag= dir+_processor*32;
-    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
-    assert(ierr==0);
-    list.push_back(xrq);
-    off_node_bytes+=bytes;
-  } else {
+  
+  if ( dox ) { // dox (do_xmit) gates the transmit side: MPI_Isend, or the copy into the translated shm address
+    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
+      tag= dir+_processor*32;
+      ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+      assert(ierr==0);
+      list.push_back(xrq);
+      off_node_bytes+=bytes;
+    } else {
    // TODO : make a OMP loop on CPU, call threaded bcopy
-    void *shm = (void *) this->ShmBufferTranslate(dest,recv);
-    assert(shm!=NULL);
-    //    std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl;
-    acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
+      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
+      assert(shm!=NULL);
+      //    std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl;
+      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
+    }
+  }
+  
+  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
+    this->StencilSendToRecvFromComplete(list,dir);
  }
-
-  //  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
-  //    this->StencilSendToRecvFromComplete(list,dir);
-  //  }

  return off_node_bytes;
 }
@@ -45,12 +45,14 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
 CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank) 
  : CartesianCommunicator(processors) 
 {
+  _shm_processors = Coordinate(processors.size(),1);
  srank=0;
  SetCommunicator(communicator_world);
 }

 CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
 {
+  _shm_processors = Coordinate(processors.size(),1);
  _processors = processors;
  _ndimension = processors.size();  assert(_ndimension>=1);
  _processor_coor.resize(_ndimension);
@@ -111,18 +113,18 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
 }

 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
-						     int xmit_to_rank,
+						     int xmit_to_rank,int dox,
 						     void *recv,
-						     int recv_from_rank,
+						     int recv_from_rank,int dor,
 						     int bytes, int dir)
 {
  return 2.0*bytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
-							 int xmit_to_rank,
+							 int xmit_to_rank,int dox,
 							 void *recv,
-							 int recv_from_rank,
+							 int recv_from_rank,int dor,
 							 int bytes, int dir)
 {
  return 2.0*bytes;
@@ -93,9 +93,10 @@ public:
  // Create an optimal reordered communicator that makes MPI_Cart_create get it right
  //////////////////////////////////////////////////////////////////////////////////////
  static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
-  static void OptimalCommunicator            (const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
-  static void OptimalCommunicatorHypercube   (const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
-  static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
+  // Turns MPI_COMM_WORLD into right layout for Cartesian
+  static void OptimalCommunicator            (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims); 
+  static void OptimalCommunicatorHypercube   (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims); 
+  static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims); 
  static void GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims);
  ///////////////////////////////////////////////////
  // Provide shared memory facilities off comm world
@@ -152,7 +152,7 @@ int Log2Size(int TwoToPower,int MAXLOG2)
  }
  return log2size;
 }
-void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
+void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
 {
  //////////////////////////////////////////////////////////////////////////////
  // Look and see if it looks like an HPE 8600 based on hostname conventions
@@ -165,8 +165,8 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
  gethostname(name,namelen);
  int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;

-  if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm);
-  else                          OptimalCommunicatorSharedMemory(processors,optimal_comm);
+  if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM);
+  else                          OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM);
 }
 static inline int divides(int a,int b)
 {
@@ -221,7 +221,7 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
    dim=(dim+1) %ndimension;
  }
 }
-void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
+void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
 {
  ////////////////////////////////////////////////////////////////
  // Assert power of two shm_size.
@@ -294,7 +294,8 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
  Coordinate HyperCoor(ndimension);

  GetShmDims(WorldDims,ShmDims);
-
+  SHM = ShmDims;
+  
  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
  ////////////////////////////////////////////////////////////////
@@ -341,7 +342,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
  int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
  assert(ierr==0);
 }
-void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
+void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
 {
  ////////////////////////////////////////////////////////////////
  // Identify subblock of ranks on node spreading across dims
@@ -353,6 +354,8 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
  Coordinate ShmCoor(ndimension);    Coordinate NodeCoor(ndimension);   Coordinate WorldCoor(ndimension);

  GetShmDims(WorldDims,ShmDims);
+  SHM=ShmDims;
+
  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
  ////////////////////////////////////////////////////////////////
@@ -48,9 +48,10 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
  _ShmSetup=1;
 }

-void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
+void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
 {
  optimal_comm = WorldComm;
+  SHM = Coordinate(processors.size(),1);
 }

 ////////////////////////////////////////////////////////////////////////////////////////////
@@ -46,3 +46,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_unary.h>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
+#include <Grid/lattice/Lattice_crc.h>
@@ -0,0 +1,55 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_crc.h
+
+    Copyright (C) 2021
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+// Debug aid: prints "<s> <t> <value>" for each entry sliceSum() yields from localNorm2(f) along mu (mu==-1 means Nd()-1).
+template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1)
+{
+  typedef typename vobj::tensor_reduced::scalar_object slice_t;
+  if ( mu==-1 ) mu = f.Grid()->Nd()-1; // caller did not pick a direction
+  auto norm2 = localNorm2(f);
+  std::vector<slice_t> slices;
+  sliceSum(norm2,slices,mu);
+  for(int t=0;t<slices.size();t++){
+    std::cout << s<<" "<<t<<" "<<slices[t]<<std::endl;
+  }
+}
+
+template<class vobj> uint32_t crc(Lattice<vobj> & buf) // checksum of the lattice data seen through a CpuRead view
+{
+  autoView( buf_v , buf, CpuRead);
+  return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites()); // seed 0; sizeof(vobj)*oSites() bytes from element 0 (presumably zlib's crc32 - confirm)
+}
+
+#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl; // prints file, line, the argument's source text and crc(U); expansion already ends in ';'
+
+NAMESPACE_END(Grid);
+
+
@@ -28,9 +28,6 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #if defined(GRID_CUDA)||defined(GRID_HIP)
 #include <Grid/lattice/Lattice_reduction_gpu.h>
 #endif
-#if defined(GRID_SYCL)
-#include <Grid/lattice/Lattice_reduction_sycl.h>
-#endif

 NAMESPACE_BEGIN(Grid);

@@ -130,7 +127,7 @@ inline Double max(const Double *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
+#if defined(GRID_CUDA)||defined(GRID_HIP)
  return sum_gpu(arg,osites);
 #else
  return sum_cpu(arg,osites);
@@ -139,7 +136,7 @@ inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
+#if defined(GRID_CUDA)||defined(GRID_HIP)
  return sumD_gpu(arg,osites);
 #else
  return sumD_cpu(arg,osites);
@@ -148,7 +145,7 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
+#if defined(GRID_CUDA)||defined(GRID_HIP)
  return sumD_gpu_large(arg,osites);
 #else
  return sumD_cpu(arg,osites);
@@ -158,13 +155,13 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 {
-  Integer osites = arg.Grid()->oSites();
-#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
-  typename vobj::scalar_object ssum;
+#if defined(GRID_CUDA)||defined(GRID_HIP)
  autoView( arg_v, arg, AcceleratorRead);
-  ssum= sum_gpu(&arg_v[0],osites);
+  Integer osites = arg.Grid()->oSites();
+  auto ssum= sum_gpu(&arg_v[0],osites);
 #else
  autoView(arg_v, arg, CpuRead);
+  Integer osites = arg.Grid()->oSites();
  auto ssum= sum_cpu(&arg_v[0],osites);
 #endif  
  arg.Grid()->GlobalSum(ssum);
@@ -174,7 +171,7 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 template<class vobj>
 inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
+#if defined(GRID_CUDA)||defined(GRID_HIP)
  autoView( arg_v, arg, AcceleratorRead);
  Integer osites = arg.Grid()->oSites();
  auto ssum= sum_gpu_large(&arg_v[0],osites);
@@ -238,10 +235,11 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
  typedef decltype(innerProductD(vobj(),vobj())) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
+    
  {
    autoView( left_v , left, AcceleratorRead);
    autoView( right_v,right, AcceleratorRead);
-    // This code could read coalesce
+
    // GPU - SIMT lane compliance...
    accelerator_for( ss, sites, 1,{
 	auto x_l = left_v[ss];
@@ -1,125 +0,0 @@
-NAMESPACE_BEGIN(Grid);
-
-/////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Possibly promote to double and sum
-/////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites) 
-{
-  typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_objectD sobjD;
-  sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator);
-  sobj identity; zeroit(identity);
-  sobj ret ; 
-
-  Integer nsimd= vobj::Nsimd();
-  
-  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-     auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>());
-     cgh.parallel_for(cl::sycl::range<1>{osites},
-		      Reduction,
-		      [=] (cl::sycl::id<1> item, auto &sum) {
-      auto osite   = item[0];
-      sum +=Reduce(lat[osite]);
-     });
-   });
-  theGridAccelerator->wait();
-  ret = mysum[0];
-  free(mysum,*theGridAccelerator);
-  sobjD dret; convertType(dret,ret);
-  return dret;
-}
-
-template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
-{
-  return sumD_gpu_tensor(lat,osites);
-}
-template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
-{
-  return sumD_gpu_large(lat,osites);
-}
-
-template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
-{
-  return sumD_gpu_large(lat,osites);
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Return as same precision as input performing reduction in double precision though
-/////////////////////////////////////////////////////////////////////////////////////////////////////////
-template <class vobj>
-inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites) 
-{
-  typedef typename vobj::scalar_object sobj;
-  sobj result;
-  result = sumD_gpu(lat,osites);
-  return result;
-}
-
-template <class vobj>
-inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
-{
-  typedef typename vobj::scalar_object sobj;
-  sobj result;
-  result = sumD_gpu_large(lat,osites);
-  return result;
-}
-
-NAMESPACE_END(Grid);
-
-/*
-template<class Double> Double svm_reduce(Double *vec,uint64_t L)
-{
-  Double sumResult; zeroit(sumResult);
-  Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator);
-  Double identity;  zeroit(identity);
-  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-     auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>());
-     cgh.parallel_for(cl::sycl::range<1>{L},
-		      Reduction,
-		      [=] (cl::sycl::id<1> index, auto &sum) {
-	 sum +=vec[index];
-     });
-   });
-  theGridAccelerator->wait();
-  Double ret = d_sum[0];
-  free(d_sum,*theGridAccelerator);
-  std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl;
-  return ret;
-}
-
-template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
-{
-  typedef typename vobj::vector_type  vector;
-  typedef typename vobj::scalar_type  scalar;
-
-  typedef typename vobj::scalar_typeD scalarD;
-  typedef typename vobj::scalar_objectD sobjD;
-
-  sobjD ret;
-  scalarD *ret_p = (scalarD *)&ret;
-  
-  const int nsimd = vobj::Nsimd();
-  const int words = sizeof(vobj)/sizeof(vector);
-
-  Vector<scalar> buffer(osites*nsimd);
-  scalar *buf = &buffer[0];
-  vector *dat = (vector *)lat;
-
-  for(int w=0;w<words;w++) {
-
-    accelerator_for(ss,osites,nsimd,{
-	int lane = acceleratorSIMTlane(nsimd);
-	buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
-    });
-    //Precision change at this point is to late to gain precision
-    ret_p[w] = svm_reduce(buf,nsimd*osites);
-  }
-  return ret;
-}
-*/
@@ -855,7 +855,7 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int


 template<class vobj>
-void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
+void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
 {
  typedef typename vobj::scalar_object sobj;

@@ -69,6 +69,7 @@ GridLogger GridLogDebug  (1, "Debug", GridLogColours, "PURPLE");
 GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
 GridLogger GridLogIterative  (1, "Iterative", GridLogColours, "BLUE");
 GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
+GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE");

 void GridLogConfigure(std::vector<std::string> &logstreams) {
  GridLogError.Active(0);
@@ -79,6 +80,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
  GridLogPerformance.Active(0);
  GridLogIntegrator.Active(1);
  GridLogColours.Active(0);
+  GridLogHMC.Active(1);

  for (int i = 0; i < logstreams.size(); i++) {
    if (logstreams[i] == std::string("Error"))       GridLogError.Active(1);
@@ -87,7 +89,8 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
    if (logstreams[i] == std::string("Iterative"))   GridLogIterative.Active(1);
    if (logstreams[i] == std::string("Debug"))       GridLogDebug.Active(1);
    if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
-    if (logstreams[i] == std::string("Integrator"))  GridLogIntegrator.Active(1);
+    if (logstreams[i] == std::string("NoIntegrator"))  GridLogIntegrator.Active(0);
+    if (logstreams[i] == std::string("NoHMC"))         GridLogHMC.Active(0);
    if (logstreams[i] == std::string("Colours"))     GridLogColours.Active(1);
  }
 }
@@ -182,6 +182,7 @@ extern GridLogger GridLogDebug  ;
 extern GridLogger GridLogPerformance;
 extern GridLogger GridLogIterative  ;
 extern GridLogger GridLogIntegrator  ;
+extern GridLogger GridLogHMC;
 extern Colours    GridLogColours;

 std::string demangle(const char* name) ;
@@ -31,7 +31,6 @@ directory
 #include <fstream>
 #include <iomanip>
 #include <iostream>
-#include <string>
 #include <map>

 #include <pwd.h>
@@ -655,8 +654,7 @@ class IldgWriter : public ScidacWriter {
    // Fill ILDG header data struct
    //////////////////////////////////////////////////////
    ildgFormat ildgfmt ;
-    const std::string stNC = std::to_string( Nc ) ;
-    ildgfmt.field          = std::string("su"+stNC+"gauge");
+    ildgfmt.field     = std::string("su3gauge");

    if ( format == std::string("IEEE32BIG") ) { 
      ildgfmt.precision = 32;
@@ -873,8 +871,7 @@ class IldgReader : public GridLimeReader {
    } else { 

      assert(found_ildgFormat);
-      const std::string stNC = std::to_string( Nc ) ;
-      assert ( ildgFormat_.field == std::string("su"+stNC+"gauge") );
+      assert ( ildgFormat_.field == std::string("su3gauge") );

      ///////////////////////////////////////////////////////////////////////////////////////
      // Populate our Grid metadata as best we can
@@ -882,7 +879,7 @@ class IldgReader : public GridLimeReader {

      std::ostringstream vers; vers << ildgFormat_.version;
      FieldMetaData_.hdr_version = vers.str();
-      FieldMetaData_.data_type = std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC);
+      FieldMetaData_.data_type = std::string("4D_SU3_GAUGE_3x3"); // lowercase 'x': same spelling NerscIO reads and writes

      FieldMetaData_.nd=4;
      FieldMetaData_.dimension.resize(4);
@@ -6,8 +6,8 @@

    Copyright (C) 2015

+
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-    Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -182,8 +182,8 @@ class GaugeStatistics
 public:
  void operator()(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
  {
-    header.link_trace = WilsonLoops<Impl>::linkTrace(data);
-    header.plaquette  = WilsonLoops<Impl>::avgPlaquette(data);
+    header.link_trace=WilsonLoops<Impl>::linkTrace(data);
+    header.plaquette =WilsonLoops<Impl>::avgPlaquette(data);
  }
 };
 typedef GaugeStatistics<PeriodicGimplD> PeriodicGaugeStatistics;
@@ -203,24 +203,20 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
 //////////////////////////////////////////////////////////////////////
 inline void reconstruct3(LorentzColourMatrix & cm)
 {
-  assert( Nc < 4 && Nc > 1 ) ;
+  const int x=0;
+  const int y=1;
+  const int z=2;
  for(int mu=0;mu<Nd;mu++){
-    #if Nc == 2
-      cm(mu)()(1,0) = -adj(cm(mu)()(0,y)) ;
-      cm(mu)()(1,1) =  adj(cm(mu)()(0,x)) ;
-    #else
-      const int x=0 , y=1 , z=2 ; // a little disinenuous labelling
-      cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
-      cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
-      cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
-    #endif
+    cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
+    cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
+    cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
  }
 }

 ////////////////////////////////////////////////////////////////////////////////
 // Some data types for intermediate storage
 ////////////////////////////////////////////////////////////////////////////////
-template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, Nc-1>, Nd >;
+template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;

 typedef iLorentzColour2x3<Complex>  LorentzColour2x3;
 typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
@@ -282,6 +278,7 @@ struct GaugeSimpleMunger{

 template <class fobj, class sobj>
 struct GaugeSimpleUnmunger {
+
  void operator()(sobj &in, fobj &out) {
    for (int mu = 0; mu < Nd; mu++) {
      for (int i = 0; i < Nc; i++) {
@@ -320,8 +317,8 @@ template<class fobj,class sobj>
 struct Gauge3x2munger{
  void operator() (fobj &in,sobj &out){
    for(int mu=0;mu<Nd;mu++){
-      for(int i=0;i<Nc-1;i++){
-	for(int j=0;j<Nc;j++){
+      for(int i=0;i<2;i++){
+	for(int j=0;j<3;j++){
 	  out(mu)()(i,j) = in(mu)(i)(j);
 	}}
    }
@@ -333,8 +330,8 @@ template<class fobj,class sobj>
 struct Gauge3x2unmunger{
  void operator() (sobj &in,fobj &out){
    for(int mu=0;mu<Nd;mu++){
-      for(int i=0;i<Nc-1;i++){
-	for(int j=0;j<Nc;j++){
+      for(int i=0;i<2;i++){
+	for(int j=0;j<3;j++){
 	  out(mu)(i)(j) = in(mu)()(i,j);
 	}}
    }
@@ -9,7 +9,6 @@
    Author: Matt Spraggs <matthew.spraggs@gmail.com>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: paboyle <paboyle@ph.ed.ac.uk>
-    Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -31,8 +30,6 @@
 #ifndef GRID_NERSC_IO_H
 #define GRID_NERSC_IO_H

-#include <string>
-
 NAMESPACE_BEGIN(Grid);

 using namespace Grid;
@@ -42,9 +39,11 @@ using namespace Grid;
 ////////////////////////////////////////////////////////////////////////////////
 class NerscIO : public BinaryIO { 
 public:
-
  typedef Lattice<vLorentzColourMatrixD> GaugeField;

+  // Enable/disable exiting if the plaquette in the header does not match the value computed (default true)
+  static bool & exitOnReadPlaquetteMismatch(){ static bool v=true; return v; }
+
  static inline void truncate(std::string file){
    std::ofstream fout(file,std::ios::out);
  }
@@ -148,17 +147,15 @@ public:

    std::string format(header.floating_point);

-    const int ieee32big = (format == std::string("IEEE32BIG"));
-    const int ieee32    = (format == std::string("IEEE32"));
-    const int ieee64big = (format == std::string("IEEE64BIG"));
-    const int ieee64    = (format == std::string("IEEE64") || \
-			   format == std::string("IEEE64LITTLE"));
+    int ieee32big = (format == std::string("IEEE32BIG"));
+    int ieee32    = (format == std::string("IEEE32"));
+    int ieee64big = (format == std::string("IEEE64BIG"));
+    int ieee64    = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));

    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    // depending on datatype, set up munger;
    // munger is a function of <floating point, Real, data_type>
-    const std::string stNC = std::to_string( Nc ) ;
-    if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE") ) {
+    if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
      if ( ieee32 || ieee32big ) {
 	BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F> 
 	  (Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
@@ -169,7 +166,7 @@ public:
 	  (Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
 	   nersc_csum,scidac_csuma,scidac_csumb);
      }
-    } else if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC) ) {
+    } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
      if ( ieee32 || ieee32big ) {
 	BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF>
 	  (Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
@@ -203,7 +200,7 @@ public:
      std::cerr << " nersc_csum  " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
      exit(0);
    }
-    assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
+    if(exitOnReadPlaquetteMismatch()) assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
    assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
    assert(nersc_csum == header.checksum );
      
@@ -214,29 +211,27 @@ public:
  template<class GaugeStats=PeriodicGaugeStatistics>
  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
-					std::string ens_label = std::string("DWF"),
-					std::string ens_id = std::string("UKQCD"),
-					unsigned int sequence_number = 1)
+					std::string ens_label = std::string("DWF"))
  {
-    writeConfiguration(Umu,file,0,1,ens_label,ens_id,sequence_number);
+    writeConfiguration(Umu,file,0,1,ens_label);
  }
  template<class GaugeStats=PeriodicGaugeStatistics>
  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
 					int two_row,
 					int bits32,
-					std::string ens_label = std::string("DWF"),
-					std::string ens_id = std::string("UKQCD"),
-					unsigned int sequence_number = 1)
+					std::string ens_label = std::string("DWF"))
  {
    typedef vLorentzColourMatrixD vobj;
    typedef typename vobj::scalar_object sobj;

    FieldMetaData header;
-    header.sequence_number = sequence_number;
-    header.ensemble_id     = ens_id;
+    ///////////////////////////////////////////
+    // Following should become arguments
+    ///////////////////////////////////////////
+    header.sequence_number = 1;
+    header.ensemble_id     = std::string("UKQCD");
    header.ensemble_label  = ens_label;
-    header.hdr_version     = "1.0" ;

    typedef LorentzColourMatrixD fobj3D;
    typedef LorentzColour2x3D    fobj2D;
@@ -250,14 +245,10 @@ public:

    uint64_t offset;

-    // Sod it -- always write NcxNc double
-    header.floating_point  = std::string("IEEE64BIG");
-    const std::string stNC = std::to_string( Nc ) ;
-    if( two_row ) {
-      header.data_type = std::string("4D_SU" + stNC + "_GAUGE" );
-    } else {
-      header.data_type = std::string("4D_SU" + stNC + "_GAUGE_" + stNC + "x" + stNC );
-    }
+    // Sod it -- always write 3x3 double
+    header.floating_point = std::string("IEEE64BIG");
+    header.data_type      = std::string("4D_SU3_GAUGE_3x3");
+    GaugeSimpleUnmunger<fobj3D,sobj> munge;
    if ( grid->IsBoss() ) { 
      truncate(file);
      offset = writeHeader(header,file);
@@ -265,15 +256,8 @@ public:
    grid->Broadcast(0,(void *)&offset,sizeof(offset));

    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
-    if( two_row ) {
-      Gauge3x2unmunger<fobj2D,sobj> munge;
-      BinaryIO::writeLatticeObject<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point,
-						nersc_csum,scidac_csuma,scidac_csumb);
-    } else {
-      GaugeSimpleUnmunger<fobj3D,sobj> munge;
-      BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
-						nersc_csum,scidac_csuma,scidac_csumb);
-    }
+    BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
+					      nersc_csum,scidac_csuma,scidac_csumb);
    header.checksum = nersc_csum;
    if ( grid->IsBoss() ) { 
      writeHeader(header,file);
@@ -305,7 +289,8 @@ public:
    header.plaquette=0.0;
    MachineCharacteristics(header);

-    uint64_t offset;
+	uint64_t offset;
+  
 #ifdef RNG_RANLUX
    header.floating_point = std::string("UINT64");
    header.data_type      = std::string("RANLUX48");
@@ -345,7 +330,7 @@ public:

    GridBase *grid = parallel.Grid();

-    uint64_t offset = readHeader(file,grid,header);
+	uint64_t offset = readHeader(file,grid,header);

    FieldMetaData clone(header);

@@ -16,12 +16,8 @@

 #ifdef __NVCC__
 #pragma push
-#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
-#pragma nv_diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
-#else
 #pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
 #endif
-#endif

 #include "pugixml.h"

@@ -63,6 +63,7 @@ static constexpr int Ngp=2; // gparity index range
 #define ColourIndex  (2)
 #define SpinIndex    (1)
 #define LorentzIndex (0)
+#define GparityFlavourIndex (0)

 // Also should make these a named enum type
 static constexpr int DaggerNo=0;
@@ -87,6 +88,8 @@ template<typename T> struct isCoarsened {
 template <typename T> using IfCoarsened    = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
 template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;

+const int GparityFlavourTensorIndex = 3; //TensorLevel counts from the bottom!
+
 // ChrisK very keen to add extra space for Gparity doubling.
 //
 // Also add domain wall index, in a way where Wilson operator 
@@ -110,8 +113,10 @@ template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVec
    template<typename vtype> using iSpinColourSpinColourMatrix  = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;


+template<typename vtype> using iGparityFlavourVector                = iVector<iScalar<iScalar<vtype> >, Ngp>;
 template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
 template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
+template<typename vtype> using iGparityFlavourMatrix = iMatrix<iScalar<iScalar<vtype> >, Ngp>;

 // Spin matrix
 typedef iSpinMatrix<Complex  >          SpinMatrix;
@@ -176,6 +181,16 @@ typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
 typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
 typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;

+//G-parity flavour matrix
+typedef iGparityFlavourMatrix<Complex> GparityFlavourMatrix;
+typedef iGparityFlavourMatrix<ComplexF> GparityFlavourMatrixF;
+typedef iGparityFlavourMatrix<ComplexD> GparityFlavourMatrixD;
+
+typedef iGparityFlavourMatrix<vComplex> vGparityFlavourMatrix;
+typedef iGparityFlavourMatrix<vComplexF> vGparityFlavourMatrixF;
+typedef iGparityFlavourMatrix<vComplexD> vGparityFlavourMatrixD;
+
+
 // Spin vector
 typedef iSpinVector<Complex >           SpinVector;
 typedef iSpinVector<ComplexF>           SpinVectorF;
@@ -220,6 +235,16 @@ typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
 typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
 typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
 typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
+
+//G-parity flavour vector
+typedef iGparityFlavourVector<Complex >         GparityFlavourVector;
+typedef iGparityFlavourVector<ComplexF>         GparityFlavourVectorF;
+typedef iGparityFlavourVector<ComplexD>         GparityFlavourVectorD;
+
+typedef iGparityFlavourVector<vComplex >         vGparityFlavourVector;
+typedef iGparityFlavourVector<vComplexF>         vGparityFlavourVectorF;
+typedef iGparityFlavourVector<vComplexD>         vGparityFlavourVectorD;
+
    
 // singlets
 typedef iSinglet<Complex >         TComplex;     // FIXME This is painful. Tensor singlet complex type.
@@ -451,20 +476,9 @@ template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<Lorentz
 // Fermion <-> propagator assignements
 //////////////////////////////////////////////
 //template <class Prop, class Ferm>
-#define FAST_FERM_TO_PROP
 template <class Fimpl>
 void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
 {
-#ifdef FAST_FERM_TO_PROP
-  autoView(p_v,p,AcceleratorWrite);
-  autoView(f_v,f,AcceleratorRead);
-  accelerator_for(idx,p_v.oSites(),1,{
-      for(int ss = 0; ss < Ns; ++ss) {
-      for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
-	p_v[idx]()(ss,s)(cc,c) = f_v[idx]()(ss)(cc); // Propagator sink index is LEFT, suitable for left mult by gauge link (e.g.)
-      }}
-    });
-#else
  for(int j = 0; j < Ns; ++j)
    {
      auto pjs = peekSpin(p, j, s);
@@ -476,23 +490,12 @@ void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::Fermio
 	}
      pokeSpin(p, pjs, j, s);
    }
-#endif
 }
    
 //template <class Prop, class Ferm>
 template <class Fimpl>
 void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
 {
-#ifdef FAST_FERM_TO_PROP
-  autoView(p_v,p,AcceleratorWrite);
-  autoView(f_v,f,AcceleratorRead);
-  accelerator_for(idx,p_v.oSites(),1,{
-      for(int ss = 0; ss < Ns; ++ss) {
-      for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
-	f_v[idx]()(ss)(cc) = p_v[idx]()(ss,s)(cc,c); // LEFT index is copied across for s,c right index
-      }}
-    });
-#else
  for(int j = 0; j < Ns; ++j)
    {
      auto pjs = peekSpin(p, j, s);
@@ -504,7 +507,6 @@ void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::Propagato
 	}
      pokeSpin(f, fj, j);
    }
-#endif
 }
    
 //////////////////////////////////////////////
@@ -40,6 +40,29 @@ class Action

 public:
  bool is_smeared = false;
+  RealD deriv_norm_sum = 0.0; // running sum of the 'nrm' values passed to deriv_log()
+  RealD deriv_max_sum  = 0.0; // running sum of the 'max' values passed to deriv_log()
+  int   deriv_num      = 0;   // number of deriv_log() calls since the last reset_timer()
+  RealD deriv_us   = 0.0;     // accumulated usecond() difference between deriv_timer_start/stop
+  RealD S_us       = 0.0;     // accumulated usecond() difference between S_timer_start/stop
+  RealD refresh_us = 0.0;     // accumulated usecond() difference between refresh_timer_start/stop
+  void  reset_timer(void)        { // zero every timer and every force-logging accumulator
+    deriv_us = S_us = refresh_us = 0.0;
+    deriv_num=0;
+    deriv_norm_sum = deriv_max_sum=0.0;
+  }
+  void  deriv_log(RealD nrm, RealD max) { deriv_max_sum+=max; deriv_norm_sum+=nrm; deriv_num++;}
+  RealD deriv_max_average(void)         { return deriv_max_sum/deriv_num; };  // no guard for deriv_num==0: call deriv_log() first
+  RealD deriv_norm_average(void)        { return deriv_norm_sum/deriv_num; }; // no guard for deriv_num==0: call deriv_log() first
+  RealD deriv_timer(void)        { return deriv_us; };
+  RealD S_timer(void)            { return S_us; };       // fixed: previously returned deriv_us
+  RealD refresh_timer(void)      { return refresh_us; }; // fixed: previously returned deriv_us
+  void deriv_timer_start(void)   { deriv_us-=usecond(); } // start subtracts "now", stop adds "now": net change is the elapsed interval
+  void deriv_timer_stop(void)    { deriv_us+=usecond(); }
+  void refresh_timer_start(void) { refresh_us-=usecond(); }
+  void refresh_timer_stop(void)  { refresh_us+=usecond(); }
+  void S_timer_start(void)       { S_us-=usecond(); }
+  void S_timer_stop(void)        { S_us+=usecond(); }
  // Heatbath?
  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
  virtual RealD S(const GaugeField& U) = 0;                             // evaluate the action
@@ -37,6 +37,10 @@ NAMESPACE_CHECK(ActionSet);
 #include <Grid/qcd/action/ActionParams.h>
 NAMESPACE_CHECK(ActionParams);

+#include <Grid/qcd/action/filters/MomentumFilter.h>
+#include <Grid/qcd/action/filters/DirichletFilter.h>
+#include <Grid/qcd/action/filters/DDHMCFilter.h>
+
 ////////////////////////////////////////////
 // Gauge Actions
 ////////////////////////////////////////////
@@ -37,6 +37,7 @@ NAMESPACE_BEGIN(Grid);
 // These can move into a params header and be given MacroMagic serialisation
 struct GparityWilsonImplParams {
  Coordinate twists;
+                     //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
  GparityWilsonImplParams() : twists(Nd, 0) {};
 };
  
@@ -63,9 +64,11 @@ struct StaggeredImplParams {
 				    RealD, hi, 
 				    int,   MaxIter, 
 				    RealD, tolerance, 
+				    RealD, mdtolerance, 
 				    int,   degree, 
 				    int,   precision,
-				    int,   BoundsCheckFreq);
+				    int,   BoundsCheckFreq,
+				    RealD, BoundsCheckTol);
    
  // MaxIter and tolerance, vectors??
    
@@ -76,16 +79,62 @@ struct StaggeredImplParams {
 				RealD tol      = 1.0e-8, 
                           	int _degree    = 10,
 				int _precision = 64,
-				int _BoundsCheckFreq=20)
+				int _BoundsCheckFreq=20,
+				RealD mdtol    = 1.0e-6,
+				double _BoundsCheckTol=1e-6)
      : lo(_lo),
 	hi(_hi),
 	MaxIter(_maxit),
 	tolerance(tol),
+        mdtolerance(mdtol),
 	degree(_degree),
        precision(_precision),
-        BoundsCheckFreq(_BoundsCheckFreq){};
+        BoundsCheckFreq(_BoundsCheckFreq),
+        BoundsCheckTol(_BoundsCheckTol){};
  };
  
+  /*Action parameters for the generalized rational action
+    The approximation is for (M^dag M)^{1/inv_pow}
+    where inv_pow is the denominator of the fractional power.
+    Default inv_pow=2 for square root, making this equivalent to 
+    the OneFlavourRational action
+  */
+    struct RationalActionParams : Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(RationalActionParams, 
+				    int, inv_pow, 
+				    RealD, lo, //low eigenvalue bound of rational approx
+				    RealD, hi, //high eigenvalue bound of rational approx
+				    int,   MaxIter,  //maximum iterations in msCG
+				    RealD, action_tolerance,  //msCG tolerance in action evaluation
+				    int,   action_degree, //degree of the rational approx used in action evaluation
+				    RealD, md_tolerance,  //msCG tolerance in MD integration
+				    int,   md_degree, //degree of the rational approx used in MD integration
+				    int,   precision, //precision of floating point arithmetic
+				    int,   BoundsCheckFreq); //frequency the approximation is tested (with Metropolis degree/tolerance); 0 disables the check
+  // constructor: every argument has a default and is copied verbatim into the member of the same name
+  RationalActionParams(int _inv_pow = 2,
+		       RealD _lo      = 0.0, 
+		       RealD _hi      = 1.0, 
+		       int _maxit     = 1000,
+		       RealD _action_tolerance      = 1.0e-8, 
+		       int _action_degree    = 10,
+		       RealD _md_tolerance      = 1.0e-8, 
+		       int _md_degree    = 10,
+		       int _precision = 64,
+		       int _BoundsCheckFreq=20)
+    : inv_pow(_inv_pow), 
+      lo(_lo),
+      hi(_hi),
+      MaxIter(_maxit),
+      action_tolerance(_action_tolerance),
+      action_degree(_action_degree),
+      md_tolerance(_md_tolerance),
+      md_degree(_md_degree),
+      precision(_precision),
+      BoundsCheckFreq(_BoundsCheckFreq){};
+  };
+
+
 NAMESPACE_END(Grid);

 #endif
@@ -68,16 +68,9 @@ public:
  ///////////////////////////////////////////////////////////////
  // Support for MADWF tricks
  ///////////////////////////////////////////////////////////////
-  RealD Mass(void) { return (mass_plus + mass_minus) / 2.0; };
-  RealD MassPlus(void) { return mass_plus; };
-  RealD MassMinus(void) { return mass_minus; };
+  virtual RealD Mass(void) { return mass; };
  void  SetMass(RealD _mass) { 
-    mass_plus=mass_minus=_mass; 
-    SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs
-  } ;
-  void  SetMass(RealD _mass_plus, RealD _mass_minus) { 
-    mass_plus=_mass_plus;
-    mass_minus=_mass_minus;
+    mass=_mass; 
    SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs
  } ;
  void  P(const FermionField &psi, FermionField &chi);
@@ -115,7 +108,7 @@ public:
  void   MeooeDag5D    (const FermionField &in, FermionField &out);

  //    protected:
-  RealD mass_plus, mass_minus;
+  RealD mass;

  // Save arguments to SetCoefficientsInternal
  Vector<Coeff_t> _gamma;
@@ -1,333 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/qcd/action/fermion/WilsonCloverFermionImplementation.h
-
-    Copyright (C) 2017 - 2022
-
-    Author: paboyle <paboyle@ph.ed.ac.uk>
-    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
-    Author: Mattia Bruno <mattia.bruno@cern.ch>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-/*  END LEGAL */
-
-#pragma once
-
-#include <Grid/Grid.h>
-#include <Grid/qcd/spin/Dirac.h>
-#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
-
-////////////////////////////////////////////
-// Standard Clover
-//   (4+m0) + csw * clover_term
-// Exp Clover
-//   (4+m0) * exp(csw/(4+m0) clover_term)
-//   = (4+m0) + csw * clover_term + ...
-////////////////////////////////////////////
-
-NAMESPACE_BEGIN(Grid);
-
-
-//////////////////////////////////
-// Generic Standard Clover
-//////////////////////////////////
-
-template<class Impl>
-class CloverHelpers: public WilsonCloverHelpers<Impl> {
-public:
-
-  INHERIT_IMPL_TYPES(Impl);
-  INHERIT_CLOVER_TYPES(Impl);
-
-  typedef WilsonCloverHelpers<Impl> Helpers;
-
-  static void Instantiate(CloverField& CloverTerm, CloverField& CloverTermInv, RealD csw_t, RealD diag_mass) {
-    GridBase *grid = CloverTerm.Grid();
-    CloverTerm += diag_mass;
-
-    int lvol = grid->lSites();
-    int DimRep = Impl::Dimension;
-    {
-      autoView(CTv,CloverTerm,CpuRead);
-      autoView(CTIv,CloverTermInv,CpuWrite);
-      thread_for(site, lvol, {
-        Coordinate lcoor;
-        grid->LocalIndexToLocalCoor(site, lcoor);
-        Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
-        Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
-        typename SiteClover::scalar_object Qx = Zero(), Qxinv = Zero();
-        peekLocalSite(Qx, CTv, lcoor);
-
-        for (int j = 0; j < Ns; j++)
-          for (int k = 0; k < Ns; k++)
-            for (int a = 0; a < DimRep; a++)
-              for (int b = 0; b < DimRep; b++){
-                auto zz =  Qx()(j, k)(a, b);
-                EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
-              }
-
-        EigenInvCloverOp = EigenCloverOp.inverse();
-        for (int j = 0; j < Ns; j++)
-          for (int k = 0; k < Ns; k++)
-            for (int a = 0; a < DimRep; a++)
-              for (int b = 0; b < DimRep; b++)
-                Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
-               pokeLocalSite(Qxinv, CTIv, lcoor);
-      });
-    }
-  }
-
-  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
-    return Helpers::Cmunu(U, lambda, mu, nu);
-  }
-
-};
-
-
-//////////////////////////////////
-// Generic Exp Clover
-//////////////////////////////////
-
-template<class Impl>
-class ExpCloverHelpers: public WilsonCloverHelpers<Impl> {
-public:
-
-  INHERIT_IMPL_TYPES(Impl);
-  INHERIT_CLOVER_TYPES(Impl);
-
-  template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
-  typedef WilsonCloverHelpers<Impl> Helpers;
-
-  // Can this be avoided?
-  static void IdentityTimesC(const CloverField& in, RealD c) {
-    int DimRep = Impl::Dimension;
-
-    autoView(in_v, in, AcceleratorWrite);
-
-    accelerator_for(ss, in.Grid()->oSites(), 1, {
-      for (int sa=0; sa<Ns; sa++)
-        for (int ca=0; ca<DimRep; ca++)
-          in_v[ss]()(sa,sa)(ca,ca) = c;
-    });
-  }
-
-  static int getNMAX(RealD prec, RealD R) {
-    /* compute stop condition for exponential */
-    int NMAX=1;
-    RealD cond=R*R/2.;
-
-    while (cond*std::exp(R)>prec) {
-      NMAX++;
-      cond*=R/(double)(NMAX+1);
-    }
-    return NMAX;
-  }
-
-  static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
-  static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
-
-  static void Instantiate(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
-    GridBase* grid = Clover.Grid();
-    CloverField ExpClover(grid);
-
-    int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);
-
-    Clover *= (1.0/diag_mass);
-
-    // Taylor expansion, slow but generic
-    // Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
-    // qN = cN
-    // qn = cn + qn+1 X
-    std::vector<RealD> cn(NMAX+1);
-    cn[0] = 1.0;
-    for (int i=1; i<=NMAX; i++)
-      cn[i] = cn[i-1] / RealD(i);
-
-    ExpClover = Zero();
-    IdentityTimesC(ExpClover, cn[NMAX]);
-    for (int i=NMAX-1; i>=0; i--)
-      ExpClover = ExpClover * Clover + cn[i];
-
-    // prepare inverse
-    CloverInv = (-1.0)*Clover;
-
-    Clover = ExpClover * diag_mass;
-
-    ExpClover = Zero();
-    IdentityTimesC(ExpClover, cn[NMAX]);
-    for (int i=NMAX-1; i>=0; i--)
-      ExpClover = ExpClover * CloverInv + cn[i];
-
-    CloverInv = ExpClover * (1.0/diag_mass);
-
-  }
-
-  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
-    assert(0);
-    return lambda;
-  }
-
-};
-
-
-//////////////////////////////////
-// Compact Standard Clover
-//////////////////////////////////
-
-
-template<class Impl>
-class CompactCloverHelpers: public CompactWilsonCloverHelpers<Impl>,
-                            public WilsonCloverHelpers<Impl> {
-public:
-
-  INHERIT_IMPL_TYPES(Impl);
-  INHERIT_CLOVER_TYPES(Impl);
-  INHERIT_COMPACT_CLOVER_TYPES(Impl);
-
-  typedef WilsonCloverHelpers<Impl> Helpers;
-  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
-
-  static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
-    Clover += diag_mass;
-  }
-
-  static void InvertClover(CloverField& InvClover,
-                            const CloverDiagonalField& diagonal,
-                            const CloverTriangleField& triangle,
-                            CloverDiagonalField&       diagonalInv,
-                            CloverTriangleField&       triangleInv,
-                            bool fixedBoundaries) {
-
-    CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
-  }
-
-  // TODO: implement Cmunu for better performances with compact layout, but don't do it
-  // here, but rather in WilsonCloverHelpers.h -> CompactWilsonCloverHelpers
-  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
-    return Helpers::Cmunu(U, lambda, mu, nu);
-  }
-};
-
-//////////////////////////////////
-// Compact Exp Clover
-//////////////////////////////////
-
-template<class Impl>
-class CompactExpCloverHelpers: public CompactWilsonCloverHelpers<Impl> {
-public:
-
-  INHERIT_IMPL_TYPES(Impl);
-  INHERIT_CLOVER_TYPES(Impl);
-  INHERIT_COMPACT_CLOVER_TYPES(Impl);
-
-  template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
-  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
-
-  // Can this be avoided?
-  static void IdentityTimesC(const CloverField& in, RealD c) {
-    int DimRep = Impl::Dimension;
-
-    autoView(in_v, in, AcceleratorWrite);
-
-    accelerator_for(ss, in.Grid()->oSites(), 1, {
-      for (int sa=0; sa<Ns; sa++)
-        for (int ca=0; ca<DimRep; ca++)
-          in_v[ss]()(sa,sa)(ca,ca) = c;
-    });
-  }
-
-  static int getNMAX(RealD prec, RealD R) {
-    /* compute stop condition for exponential */
-    int NMAX=1;
-    RealD cond=R*R/2.;
-
-    while (cond*std::exp(R)>prec) {
-      NMAX++;
-      cond*=R/(double)(NMAX+1);
-    }
-    return NMAX;
-  }
-
-  static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
-  static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
-
-  static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
-
-    GridBase* grid = Clover.Grid();
-    CloverField ExpClover(grid);
-
-    int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);
-
-    Clover *= (1.0/diag_mass);
-
-    // Taylor expansion, slow but generic
-    // Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
-    // qN = cN
-    // qn = cn + qn+1 X
-    std::vector<RealD> cn(NMAX+1);
-    cn[0] = 1.0;
-    for (int i=1; i<=NMAX; i++)
-      cn[i] = cn[i-1] / RealD(i);
-
-    ExpClover = Zero();
-    IdentityTimesC(ExpClover, cn[NMAX]);
-    for (int i=NMAX-1; i>=0; i--)
-      ExpClover = ExpClover * Clover + cn[i];
-
-    // prepare inverse
-    CloverInv = (-1.0)*Clover;
-
-    Clover = ExpClover * diag_mass;
-
-    ExpClover = Zero();
-    IdentityTimesC(ExpClover, cn[NMAX]);
-    for (int i=NMAX-1; i>=0; i--)
-      ExpClover = ExpClover * CloverInv + cn[i];
-
-    CloverInv = ExpClover * (1.0/diag_mass);
-
-  }
-
-  static void InvertClover(CloverField& InvClover,
-                            const CloverDiagonalField& diagonal,
-                            const CloverTriangleField& triangle,
-                            CloverDiagonalField&       diagonalInv,
-                            CloverTriangleField&       triangleInv,
-                            bool fixedBoundaries) {
-
-    if (fixedBoundaries)
-    {
-      CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
-    }
-    else
-    {
-      CompactHelpers::ConvertLayout(InvClover, diagonalInv, triangleInv);
-    }
-  }
-
-  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
-    assert(0);
-    return lambda;
-  }
-
-};
-
-
-NAMESPACE_END(Grid);
@@ -31,7 +31,6 @@

 #include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
-#include <Grid/qcd/action/fermion/CloverHelpers.h>

 NAMESPACE_BEGIN(Grid);

@@ -86,7 +85,7 @@ NAMESPACE_BEGIN(Grid);
 //           + (2 * 1 + 4 * 1/2) triangle parts = 4 triangle parts =  60 complex words per site
 //                                                                 =  84 complex words per site

-template<class Impl, class CloverHelpers>
+template<class Impl>
 class CompactWilsonCloverFermion : public WilsonFermion<Impl>,
                                   public WilsonCloverHelpers<Impl>,
                                   public CompactWilsonCloverHelpers<Impl> {
@@ -225,7 +224,7 @@ public:
  RealD csw_t;
  RealD cF;

-  bool fixedBoundaries;
+  bool open_boundaries;

  CloverDiagonalField Diagonal,    DiagonalEven,    DiagonalOdd;
  CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
@@ -138,52 +138,38 @@ typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
 typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;

 // Clover fermions
-template <typename WImpl> using WilsonClover = WilsonCloverFermion<WImpl, CloverHelpers<WImpl>>;
-template <typename WImpl> using WilsonExpClover = WilsonCloverFermion<WImpl, ExpCloverHelpers<WImpl>>;
+typedef WilsonCloverFermion<WilsonImplR> WilsonCloverFermionR;
+typedef WilsonCloverFermion<WilsonImplF> WilsonCloverFermionF;
+typedef WilsonCloverFermion<WilsonImplD> WilsonCloverFermionD;

-typedef WilsonClover<WilsonImplR> WilsonCloverFermionR;
-typedef WilsonClover<WilsonImplF> WilsonCloverFermionF;
-typedef WilsonClover<WilsonImplD> WilsonCloverFermionD;
+typedef WilsonCloverFermion<WilsonAdjImplR> WilsonCloverAdjFermionR;
+typedef WilsonCloverFermion<WilsonAdjImplF> WilsonCloverAdjFermionF;
+typedef WilsonCloverFermion<WilsonAdjImplD> WilsonCloverAdjFermionD;

-typedef WilsonExpClover<WilsonImplR> WilsonExpCloverFermionR;
-typedef WilsonExpClover<WilsonImplF> WilsonExpCloverFermionF;
-typedef WilsonExpClover<WilsonImplD> WilsonExpCloverFermionD;
+typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
+typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
+typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;

-typedef WilsonClover<WilsonAdjImplR> WilsonCloverAdjFermionR;
-typedef WilsonClover<WilsonAdjImplF> WilsonCloverAdjFermionF;
-typedef WilsonClover<WilsonAdjImplD> WilsonCloverAdjFermionD;
-
-typedef WilsonClover<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
-typedef WilsonClover<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
-typedef WilsonClover<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
-
-typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
-typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
-typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
+typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
+typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
+typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;

 // Compact Clover fermions
-template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
-template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
+typedef CompactWilsonCloverFermion<WilsonImplR> CompactWilsonCloverFermionR;
+typedef CompactWilsonCloverFermion<WilsonImplF> CompactWilsonCloverFermionF;
+typedef CompactWilsonCloverFermion<WilsonImplD> CompactWilsonCloverFermionD;

-typedef CompactWilsonClover<WilsonImplR> CompactWilsonCloverFermionR;
-typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
-typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
+typedef CompactWilsonCloverFermion<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
+typedef CompactWilsonCloverFermion<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
+typedef CompactWilsonCloverFermion<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;

-typedef CompactWilsonExpClover<WilsonImplR> CompactWilsonExpCloverFermionR;
-typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
-typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
+typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
+typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
+typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;

-typedef CompactWilsonClover<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
-typedef CompactWilsonClover<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
-typedef CompactWilsonClover<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
-
-typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
-typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
-typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
-
-typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
-typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
-typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
+typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
+typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
+typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;

 // Domain Wall fermions
 typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
@@ -49,6 +49,8 @@ public:

  virtual FermionField &tmp(void) = 0;

+  virtual void DirichletBlock(Coordinate & _Block) { assert(0); };
+  
  GridBase * Grid(void)   { return FermionGrid(); };   // this is all the linalg routines need to know
  GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };

@@ -30,6 +30,18 @@ directory

 NAMESPACE_BEGIN(Grid);

+/*
+  Policy implementation for G-parity boundary conditions
+
+  Rather than treating the gauge field as a flavored field, the Grid implementation of G-parity treats the gauge field as a regular
+  field with complex conjugate boundary conditions. In order to ensure the second flavor interacts with the conjugate links and the first
+  with the regular links we overload the functionality of doubleStore, whose purpose is to store the gauge field and the barrel-shifted gauge field
+  to avoid communicating links when applying the Dirac operator, such that the double-stored field contains also a flavor index which maps to
+  either the link or the conjugate link. This flavored field is then used by multLink to apply the correct link to a spinor.
+
+  Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
+  mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
+ */
 template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
 class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
 public:
@@ -113,7 +125,7 @@ public:
    || ((distance== 1)&&(icoor[direction]==1))
    || ((distance==-1)&&(icoor[direction]==0));

-    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world
+    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu] && mmu < Nd-1; //only if we are going around the world in a spatial direction

    //Apply the links
    int f_upper = permute_lane ? 1 : 0;
@@ -139,10 +151,10 @@ public:
    assert((distance == 1) || (distance == -1));  // nearest neighbour stencil hard code
    assert((sl == 1) || (sl == 2));

-    if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
-
+    //If this site is a global boundary site, perform the G-parity flavor twist
+    if ( mmu < Nd-1 && SE->_around_the_world && St.parameters.twists[mmu] ) {
      if ( sl == 2 ) {
-       
+	//Only do the twist for lanes on the edge of the physical node
 	ExtractBuffer<sobj> vals(Nsimd);

 	extract(chi,vals);
@@ -197,6 +209,19 @@ public:
    reg = memory;
  }

+
+  //Poke 'poke_f0' onto flavor 0 and 'poke_f1' onto flavor 1 in direction slot mu of the doubled gauge field Uds; every other slot of Uds is left untouched
+  inline void pokeGparityDoubledGaugeField(DoubledGaugeField &Uds, const GaugeLinkField &poke_f0, const GaugeLinkField &poke_f1, const int mu){
+    autoView(poke_f0_v, poke_f0, CpuRead);  // read-only host views of the two source link fields
+    autoView(poke_f1_v, poke_f1, CpuRead);
+    autoView(Uds_v, Uds, CpuWrite);         // host write view of the destination
+    thread_foreach(ss,poke_f0_v,{ // loop runs over poke_f0's view; NOTE(review): no conformable() check here, assumes all three fields share a grid
+	Uds_v[ss](0)(mu) = poke_f0_v[ss]();   // first index 0: flavor 0
+	Uds_v[ss](1)(mu) = poke_f1_v[ss]();   // first index 1: flavor 1
+      });
+  }
+    
+
  inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
  {
    conformable(Uds.Grid(),GaugeGrid);
@@ -207,14 +232,19 @@ public:
    GaugeLinkField Uconj(GaugeGrid);
   
    Lattice<iScalar<vInteger> > coor(GaugeGrid);
-        
-    for(int mu=0;mu<Nd;mu++){
-          
-      LatticeCoordinate(coor,mu);
+
+    //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
+    //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs        
+    for(int mu=0;mu<Nd-1;mu++){
+
+      if( Params.twists[mu] ){
+	LatticeCoordinate(coor,mu);
+      }
          
      U     = PeekIndex<LorentzIndex>(Umu,mu);
      Uconj = conjugate(U);
     
+      // Implement the isospin rotation sign on the boundary between f=1 and f=0
      // This phase could come from a simple bc 1,1,-1,1 ..
      int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
      if ( Params.twists[mu] ) { 
@@ -229,7 +259,7 @@ public:
 	thread_foreach(ss,U_v,{
 	    Uds_v[ss](0)(mu) = U_v[ss]();
 	    Uds_v[ss](1)(mu) = Uconj_v[ss]();
-	  });
+	});
      }
          
      U     = adj(Cshift(U    ,mu,-1));      // correct except for spanning the boundary
@@ -260,6 +290,38 @@ public:
        });
      }
    }
+
+    { //periodic / antiperiodic temporal BCs
+      int mu = Nd-1;
+      int L   = GaugeGrid->GlobalDimensions()[mu];
+      int Lmu = L - 1;
+
+      LatticeCoordinate(coor, mu);
+
+      U = PeekIndex<LorentzIndex>(Umu, mu); //Get t-directed links
+      
+      GaugeLinkField *Upoke = &U;
+
+      if(Params.twists[mu]){ //antiperiodic
+	Utmp =  where(coor == Lmu, -U, U);
+	Upoke = &Utmp;
+      }
+    
+      Uconj = conjugate(*Upoke); //second flavor interacts with conjugate links      
+      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu);
+
+      //Get the barrel-shifted field
+      Utmp = adj(Cshift(U, mu, -1)); //is a forward shift!
+      Upoke = &Utmp;
+
+      if(Params.twists[mu]){
+	U = where(coor == 0, -Utmp, Utmp);  //boundary phase
+	Upoke = &U;
+      }
+      
+      Uconj = conjugate(*Upoke);
+      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4);
+    }
  }
      
  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
@@ -298,28 +360,48 @@ public:
  inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
    assert(0);
  }
-  
+ 
  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
-
-    int Ls = Btilde.Grid()->_fdimensions[0];
-        
-    GaugeLinkField tmp(mat.Grid());
-    tmp = Zero();
+    int Ls=Btilde.Grid()->_fdimensions[0];
+    
    {
-      autoView( tmp_v , tmp, CpuWrite);
-      autoView( Atilde_v , Atilde, CpuRead);
-      autoView( Btilde_v , Btilde, CpuRead);
-      thread_for(ss,tmp.Grid()->oSites(),{
-	  for (int s = 0; s < Ls; s++) {
-	    int sF = s + Ls * ss;
-	    auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
-	    tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
-	  }
-	});
+      GridBase *GaugeGrid = mat.Grid();
+      Lattice<iScalar<vInteger> > coor(GaugeGrid);
+
+      if( Params.twists[mu] ){
+	LatticeCoordinate(coor,mu);
+      }
+
+      autoView( mat_v , mat, AcceleratorWrite);
+      autoView( Btilde_v , Btilde, AcceleratorRead);
+      autoView( Atilde_v , Atilde, AcceleratorRead);
+      accelerator_for(sss,mat.Grid()->oSites(), FermionField::vector_type::Nsimd(),{	  
+  	  int sU=sss;
+  	  typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
+  	  ColorMatrixType sum;
+  	  zeroit(sum);
+  	  for(int s=0;s<Ls;s++){
+  	    int sF = s+Ls*sU;
+  	    for(int spn=0;spn<Ns;spn++){ //sum over spin
+	      //Flavor 0
+  	      auto bb = coalescedRead(Btilde_v[sF](0)(spn) ); //color vector
+  	      auto aa = coalescedRead(Atilde_v[sF](0)(spn) );
+  	      sum = sum + outerProduct(bb,aa);
+
+  	      //Flavor 1
+  	      bb = coalescedRead(Btilde_v[sF](1)(spn) );
+  	      aa = coalescedRead(Atilde_v[sF](1)(spn) );
+  	      sum = sum + conjugate(outerProduct(bb,aa));
+  	    }
+  	  }	    
+  	  coalescedWrite(mat_v[sU](mu)(), sum);
+  	});
    }
-    PokeIndex<LorentzIndex>(mat, tmp, mu);
-    return;
  }
+
+
+  
+
  
 };

@@ -32,7 +32,6 @@

 #include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
-#include <Grid/qcd/action/fermion/CloverHelpers.h>

 NAMESPACE_BEGIN(Grid);

@@ -52,7 +51,7 @@ NAMESPACE_BEGIN(Grid);
 // csw_r = csw_t to recover the isotropic version
 //////////////////////////////////////////////////////////////////

-template<class Impl, class CloverHelpers>
+template <class Impl>
 class WilsonCloverFermion : public WilsonFermion<Impl>,
                            public WilsonCloverHelpers<Impl>
 {
@@ -209,8 +209,6 @@ public:
 };


-////////////////////////////////////////////////////////
-
 template<class Impl> class CompactWilsonCloverHelpers {
 public:

@@ -47,6 +47,8 @@ class CompactWilsonCloverTypes {
 public:
  INHERIT_IMPL_TYPES(Impl);

+  static_assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3, "Wrong dimensions");
+
  static constexpr int Nred      = Nc * Nhs;        // 6
  static constexpr int Nblock    = Nhs;             // 2
  static constexpr int Ndiagonal = Nred;            // 6
@@ -117,19 +117,19 @@ public:
    typedef decltype(coalescedRead(*in))    sobj;
    typedef decltype(coalescedRead(*out0)) hsobj;

-    constexpr unsigned int Nsimd = vobj::Nsimd();
+    unsigned int Nsimd = vobj::Nsimd();
    unsigned int mask = Nsimd >> (type + 1);
    int lane = acceleratorSIMTlane(Nsimd);
    int j0 = lane &(~mask); // inner coor zero
    int j1 = lane |(mask) ; // inner coor one
-    const vobj *vp0 = &in[k];  // out0[j] = merge low bit of type from in[k] and in[m] 
-    const vobj *vp1 = &in[m];  // out1[j] = merge hi  bit of type from in[k] and in[m]
-    const vobj *vp = (lane&mask) ? vp1:vp0;// if my lane has high bit take vp1, low bit take vp0
-    auto sa = coalescedRead(*vp,j0); // lane to read for out 0, NB 50% read coalescing
-    auto sb = coalescedRead(*vp,j1); // lane to read for out 1
+    const vobj *vp0 = &in[k];
+    const vobj *vp1 = &in[m];
+    const vobj *vp = (lane&mask) ? vp1:vp0;
+    auto sa = coalescedRead(*vp,j0);
+    auto sb = coalescedRead(*vp,j1);
    hsobj psa, psb;
-    projector::Proj(psa,sa,mu,dag);  // spin project the result0
-    projector::Proj(psb,sb,mu,dag);  // spin project the result1
+    projector::Proj(psa,sa,mu,dag);
+    projector::Proj(psb,sb,mu,dag);
    coalescedWrite(out0[j],psa);
    coalescedWrite(out1[j],psb);
 #else
@@ -75,6 +75,10 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }

+  int Dirichlet;
+  Coordinate Block; 
+
+  /********** Deprecate timers **********/
  void Report(void);
  void ZeroCounters(void);
  double DhopCalls;
@@ -173,7 +177,18 @@ public:
 		  GridCartesian         &FourDimGrid,
 		  GridRedBlackCartesian &FourDimRedBlackGrid,
 		  double _M5,const ImplParams &p= ImplParams());
-    
+
+  virtual void DirichletBlock(Coordinate & block)
+  {
+    assert(block.size()==Nd+1);
+    if ( block[0] || block[1] || block[2] || block[3] || block[4] ){
+      Dirichlet = 1;
+      Block = block;
+      Stencil.DirichletBlock(block); 
+      StencilEven.DirichletBlock(block); 
+      StencilOdd.DirichletBlock(block);
+    }
+  }
  // Constructors
  /*
    WilsonFermion5D(int simd, 
@@ -47,7 +47,7 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
 			FiveDimRedBlackGrid,
 			FourDimGrid,
 			FourDimRedBlackGrid,_M5,p),
-  mass_plus(_mass), mass_minus(_mass)
+  mass(_mass)
 { 
 }

@@ -209,8 +209,8 @@ void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
  Vector<Coeff_t> diag (Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
-  Vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass_plus;
+  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass;
+  Vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass;
  M5D(psi,chi,chi,lower,diag,upper);
 }
 template<class Impl>
@@ -220,8 +220,8 @@ void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &D
  Vector<Coeff_t> diag = bs;
  Vector<Coeff_t> upper= cs;
  Vector<Coeff_t> lower= cs; 
-  upper[Ls-1]=-mass_minus*upper[Ls-1];
-  lower[0]   =-mass_plus*lower[0];
+  upper[Ls-1]=-mass*upper[Ls-1];
+  lower[0]   =-mass*lower[0];
  M5D(psi,psi,Din,lower,diag,upper);
 }
 // FIXME Redunant with the above routine; check this and eliminate
@@ -235,8 +235,8 @@ template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &
    upper[i]=-ceo[i];
    lower[i]=-ceo[i];
  }
-  upper[Ls-1]=-mass_minus*upper[Ls-1];
-  lower[0]   =-mass_plus*lower[0];
+  upper[Ls-1]=-mass*upper[Ls-1];
+  lower[0]   =-mass*lower[0];
  M5D(psi,psi,chi,lower,diag,upper);
 }
 template<class Impl>
@@ -250,8 +250,8 @@ void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &
    upper[i]=-cee[i];
    lower[i]=-cee[i];
  }
-  upper[Ls-1]=-mass_minus*upper[Ls-1];
-  lower[0]   =-mass_plus*lower[0];
+  upper[Ls-1]=-mass*upper[Ls-1];
+  lower[0]   =-mass*lower[0];
  M5D(psi,psi,chi,lower,diag,upper);
 }
 template<class Impl>
@@ -266,9 +266,9 @@ void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &
    // Assemble the 5d matrix
    if ( s==0 ) {
      upper[s] = -cee[s+1] ;
-      lower[s] = mass_minus*cee[Ls-1];
+      lower[s] = mass*cee[Ls-1];
    } else if ( s==(Ls-1)) { 
-      upper[s] = mass_plus*cee[0];
+      upper[s] = mass*cee[0];
      lower[s] = -cee[s-1];
    } else {
      upper[s]=-cee[s+1];
@@ -291,8 +291,8 @@ void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
  Vector<Coeff_t> diag(Ls,1.0);
  Vector<Coeff_t> upper(Ls,-1.0);
  Vector<Coeff_t> lower(Ls,-1.0);
-  upper[Ls-1]=-mass_plus*upper[Ls-1];
-  lower[0]   =-mass_minus*lower[0];
+  upper[Ls-1]=-mass*upper[Ls-1];
+  lower[0]   =-mass*lower[0];
  M5Ddag(psi,chi,chi,lower,diag,upper);
 }

@@ -307,9 +307,9 @@ void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField
  for (int s=0;s<Ls;s++){
    if ( s== 0 ) {
      upper[s] = cs[s+1];
-      lower[s] =-mass_minus*cs[Ls-1];
+      lower[s] =-mass*cs[Ls-1];
    } else if ( s==(Ls-1) ) { 
-      upper[s] =-mass_plus*cs[0];
+      upper[s] =-mass*cs[0];
      lower[s] = cs[s-1];
    } else { 
      upper[s] = cs[s+1];
@@ -552,7 +552,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
      
      lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
      
-      leem[i]=mass_minus*cee[Ls-1]/bee[0];
+      leem[i]=mass*cee[Ls-1]/bee[0];
      for(int j=0;j<i;j++) {
 	assert(bee[j+1]!=Coeff_t(0.0));
 	leem[i]*= aee[j]/bee[j+1];
@@ -560,7 +560,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
      
      uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
      
-      ueem[i]=mass_plus;
+      ueem[i]=mass;
      for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
      ueem[i]*= aee[0]/bee[0];
      
@@ -573,7 +573,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
  }
 	
  { 
-    Coeff_t delta_d=mass_minus*cee[Ls-1];
+    Coeff_t delta_d=mass*cee[Ls-1];
    for(int j=0;j<Ls-1;j++) {
      assert(bee[j] != Coeff_t(0.0));
      delta_d *= cee[j]/bee[j];
@@ -642,10 +642,6 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
 						      Current curr_type,
 						      unsigned int mu)
 {
-
-  assert(mass_plus == mass_minus);
-  RealD mass = mass_plus;
-  
 #if (!defined(GRID_HIP))
  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
@@ -781,8 +777,6 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  assert(mu>=0);
  assert(mu<Nd);

-  assert(mass_plus == mass_minus);
-  RealD mass = mass_plus;

 #if 0
  int tshift = (mu == Nd-1) ? 1 : 0;
@@ -32,23 +32,22 @@
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>

-
 NAMESPACE_BEGIN(Grid);
-template<class Impl, class CloverHelpers>
-CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(GaugeField& _Umu,
-                                                                            GridCartesian& Fgrid,
-                                                                            GridRedBlackCartesian& Hgrid,
-                                                                            const RealD _mass,
-                                                                            const RealD _csw_r,
-                                                                            const RealD _csw_t,
-                                                                            const RealD _cF,
-                                                                            const WilsonAnisotropyCoefficients& clover_anisotropy,
-                                                                            const ImplParams& impl_p)
+template<class Impl>
+CompactWilsonCloverFermion<Impl>::CompactWilsonCloverFermion(GaugeField& _Umu,
+                                                             GridCartesian& Fgrid,
+                                                             GridRedBlackCartesian& Hgrid,
+                                                             const RealD _mass,
+                                                             const RealD _csw_r,
+                                                             const RealD _csw_t,
+                                                             const RealD _cF,
+                                                             const WilsonAnisotropyCoefficients& clover_anisotropy,
+                                                             const ImplParams& impl_p)
  : WilsonBase(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
  , csw_r(_csw_r)
  , csw_t(_csw_t)
  , cF(_cF)
-  , fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0)
+  , open_boundaries(impl_p.boundary_phases[Nd-1] == 0.0)
  , Diagonal(&Fgrid),        Triangle(&Fgrid)
  , DiagonalEven(&Hgrid),    TriangleEven(&Hgrid)
  , DiagonalOdd(&Hgrid),     TriangleOdd(&Hgrid)
@@ -59,85 +58,80 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
  , BoundaryMask(&Fgrid)
  , BoundaryMaskEven(&Hgrid), BoundaryMaskOdd(&Hgrid)
 {
-  assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);
-
  csw_r *= 0.5;
  csw_t *= 0.5;
  if (clover_anisotropy.isAnisotropic)
    csw_r /= clover_anisotropy.xi_0;

  ImportGauge(_Umu);
-  if (fixedBoundaries) {
-    this->BoundaryMaskEven.Checkerboard() = Even;
-    this->BoundaryMaskOdd.Checkerboard() = Odd;
+  if (open_boundaries)
    CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
-  }
 }

-template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::Dhop(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::Dhop(in, out, dag);
-  if(fixedBoundaries) ApplyBoundaryMask(out);
+  if(open_boundaries) ApplyBoundaryMask(out);
 }

-template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::DhopOE(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::DhopOE(in, out, dag);
-  if(fixedBoundaries) ApplyBoundaryMask(out);
+  if(open_boundaries) ApplyBoundaryMask(out);
 }

-template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::DhopEO(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::DhopEO(in, out, dag);
-  if(fixedBoundaries) ApplyBoundaryMask(out);
+  if(open_boundaries) ApplyBoundaryMask(out);
 }

-template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
  WilsonBase::DhopDir(in, out, dir, disp);
-  if(this->fixedBoundaries) ApplyBoundaryMask(out);
+  if(this->open_boundaries) ApplyBoundaryMask(out);
 }

-template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
  WilsonBase::DhopDirAll(in, out);
-  if(this->fixedBoundaries) {
+  if(this->open_boundaries) {
    for(auto& o : out) ApplyBoundaryMask(o);
  }
 }

-template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField& in, FermionField& out) {
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::M(const FermionField& in, FermionField& out) {
  out.Checkerboard() = in.Checkerboard();
  WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
  Mooee(in, Tmp);
  axpy(out, 1.0, out, Tmp);
-  if(fixedBoundaries) ApplyBoundaryMask(out);
+  if(open_boundaries) ApplyBoundaryMask(out);
 }

-template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField& in, FermionField& out) {
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::Mdag(const FermionField& in, FermionField& out) {
  out.Checkerboard() = in.Checkerboard();
  WilsonBase::Dhop(in, out, DaggerYes);  // call base to save applying bc
  MooeeDag(in, Tmp);
  axpy(out, 1.0, out, Tmp);
-  if(fixedBoundaries) ApplyBoundaryMask(out);
+  if(open_boundaries) ApplyBoundaryMask(out);
 }

-template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::Meooe(const FermionField& in, FermionField& out) {
  WilsonBase::Meooe(in, out);
-  if(fixedBoundaries) ApplyBoundaryMask(out);
+  if(open_boundaries) ApplyBoundaryMask(out);
 }

-template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::MeooeDag(const FermionField& in, FermionField& out) {
  WilsonBase::MeooeDag(in, out);
-  if(fixedBoundaries) ApplyBoundaryMask(out);
+  if(open_boundaries) ApplyBoundaryMask(out);
 }

-template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField& in, FermionField& out) {
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::Mooee(const FermionField& in, FermionField& out) {
  if(in.Grid()->_isCheckerBoarded) {
    if(in.Checkerboard() == Odd) {
      MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
@@ -147,16 +141,16 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField&
  } else {
    MooeeInternal(in, out, Diagonal, Triangle);
  }
-  if(fixedBoundaries) ApplyBoundaryMask(out);
+  if(open_boundaries) ApplyBoundaryMask(out);
 }

-template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeDag(const FermionField& in, FermionField& out) {
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::MooeeDag(const FermionField& in, FermionField& out) {
  Mooee(in, out); // blocks are hermitian
 }

-template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionField& in, FermionField& out) {
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::MooeeInv(const FermionField& in, FermionField& out) {
  if(in.Grid()->_isCheckerBoarded) {
    if(in.Checkerboard() == Odd) {
      MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
@@ -166,27 +160,27 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionFiel
  } else {
    MooeeInternal(in, out, DiagonalInv, TriangleInv);
  }
-  if(fixedBoundaries) ApplyBoundaryMask(out);
+  if(open_boundaries) ApplyBoundaryMask(out);
 }

-template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInvDag(const FermionField& in, FermionField& out) {
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::MooeeInvDag(const FermionField& in, FermionField& out) {
  MooeeInv(in, out); // blocks are hermitian
 }

-template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
  DhopDir(in, out, dir, disp);
 }

-template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
  DhopDirAll(in, out);
 }

-template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
-  assert(!fixedBoundaries); // TODO check for changes required for open bc
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
+  assert(!open_boundaries); // TODO check for changes required for open bc

  // NOTE: code copied from original clover term
  conformable(X.Grid(), Y.Grid());
@@ -257,7 +251,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force,
      }
      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
-      force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu);   // checked
+      force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu);   // checked
      count++;
    }

@@ -267,18 +261,18 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force,
  force += clover_force;
 }

-template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
  assert(0);
 }

-template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
  assert(0);
 }

-template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField&        in,
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::MooeeInternal(const FermionField&        in,
                    FermionField&              out,
                    const CloverDiagonalField& diagonal,
                    const CloverTriangleField& triangle) {
@@ -291,8 +285,8 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const Fermio
  CompactHelpers::MooeeKernel(diagonal.oSites(), 1, in, out, diagonal, triangle);
 }

-template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField& _Umu) {
+template<class Impl>
+void CompactWilsonCloverFermion<Impl>::ImportGauge(const GaugeField& _Umu) {
  // NOTE: parts copied from original implementation

  // Import gauge into base class
@@ -305,7 +299,6 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
  GridBase* grid = _Umu.Grid();
  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
  CloverField TmpOriginal(grid);
-  CloverField TmpInverse(grid);

  // Compute the field strength terms mu>nu
  double t2 = usecond();
@@ -325,30 +318,22 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
  TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
  TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
  TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
-
-  // Instantiate the clover term
-  // - In case of the standard clover the mass term is added
-  // - In case of the exponential clover the clover term is exponentiated
-  double t4 = usecond();
-  CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, this->diag_mass);
+  TmpOriginal += this->diag_mass;

  // Convert the data layout of the clover term
-  double t5 = usecond();
+  double t4 = usecond();
  CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);

-  // Modify the clover term at the temporal boundaries in case of open boundary conditions
-  double t6 = usecond();
-  if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
+  // Possibly modify the boundary values
+  double t5 = usecond();
+  if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);

-  // Invert the Clover term
-  // In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved
-  // in TmpInverse can be used. In all other cases the clover term has to be explictly inverted.
-  // TODO: For now this inversion is explictly done on the CPU
-  double t7 = usecond();
-  CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries);
+  // Invert the clover term in the improved layout
+  double t6 = usecond();
+  CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv);

  // Fill the remaining clover fields
-  double t8 = usecond();
+  double t7 = usecond();
  pickCheckerboard(Even, DiagonalEven,    Diagonal);
  pickCheckerboard(Even, TriangleEven,    Triangle);
  pickCheckerboard(Odd,  DiagonalOdd,     Diagonal);
@@ -359,19 +344,20 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
  pickCheckerboard(Odd,  TriangleInvOdd,  TriangleInv);

  // Report timings
-  double t9 = usecond();
-
-  std::cout << GridLogDebug << "CompactWilsonCloverFermion::ImportGauge timings:" << std::endl;
-  std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
-  std::cout << GridLogDebug << "allocations =                " << (t2 - t1) / 1e6 << std::endl;
-  std::cout << GridLogDebug << "field strength =             " << (t3 - t2) / 1e6 << std::endl;
-  std::cout << GridLogDebug << "fill clover =                " << (t4 - t3) / 1e6 << std::endl;
-  std::cout << GridLogDebug << "instantiate clover =         " << (t5 - t4) / 1e6 << std::endl;
-  std::cout << GridLogDebug << "convert layout =             " << (t6 - t5) / 1e6 << std::endl;
-  std::cout << GridLogDebug << "modify boundaries =          " << (t7 - t6) / 1e6 << std::endl;
-  std::cout << GridLogDebug << "invert clover =              " << (t8 - t7) / 1e6 << std::endl;
-  std::cout << GridLogDebug << "pick cbs =                   " << (t9 - t8) / 1e6 << std::endl;
-  std::cout << GridLogDebug << "total =                      " << (t9 - t0) / 1e6 << std::endl;
+  double t8 = usecond();
+#if 0
+  std::cout << GridLogMessage << "CompactWilsonCloverFermion::ImportGauge timings:"
+            << " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6
+            << ", allocations = "               << (t2 - t1) / 1e6
+            << ", field strength = "            << (t3 - t2) / 1e6
+            << ", fill clover = "               << (t4 - t3) / 1e6
+            << ", convert = "                   << (t5 - t4) / 1e6
+            << ", boundaries = "                << (t6 - t5) / 1e6
+            << ", inversions = "                << (t7 - t6) / 1e6
+            << ", pick cbs = "                  << (t8 - t7) / 1e6
+            << ", total = "                     << (t8 - t0) / 1e6
+            << std::endl;
+#endif
 }

 NAMESPACE_END(Grid);
@@ -34,8 +34,8 @@

 NAMESPACE_BEGIN(Grid);

-template<class Impl, class CloverHelpers>
-WilsonCloverFermion<Impl, CloverHelpers>::WilsonCloverFermion(GaugeField&                         _Umu,
+template<class Impl>
+WilsonCloverFermion<Impl>::WilsonCloverFermion(GaugeField&                         _Umu,
                                               GridCartesian&                      Fgrid,
                                               GridRedBlackCartesian&              Hgrid,
                                               const RealD                         _mass,
@@ -74,8 +74,8 @@ WilsonCloverFermion<Impl, CloverHelpers>::WilsonCloverFermion(GaugeField&
 }

 // *NOT* EO
-template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField &in, FermionField &out)
+template <class Impl>
+void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
 {
  FermionField temp(out.Grid());

@@ -89,8 +89,8 @@ void WilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField &in, Fermion
  out += temp;
 }

-template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField &in, FermionField &out)
+template <class Impl>
+void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
 {
  FermionField temp(out.Grid());

@@ -104,8 +104,8 @@ void WilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField &in, Ferm
  out += temp;
 }

-template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField &_Umu)
+template <class Impl>
+void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
 {
  double t0 = usecond();
  WilsonFermion<Impl>::ImportGauge(_Umu);
@@ -131,11 +131,47 @@ void WilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField &_Um
  CloverTerm += Helpers::fillCloverXT(Ex) * csw_t;
  CloverTerm += Helpers::fillCloverYT(Ey) * csw_t;
  CloverTerm += Helpers::fillCloverZT(Ez) * csw_t;
-   
+  CloverTerm += diag_mass;
+
  double t4 = usecond();
-  CloverHelpers::Instantiate(CloverTerm, CloverTermInv, csw_t, this->diag_mass);
+  int lvol = _Umu.Grid()->lSites();
+  int DimRep = Impl::Dimension;

  double t5 = usecond();
+  {
+    autoView(CTv,CloverTerm,CpuRead);
+    autoView(CTIv,CloverTermInv,CpuWrite);
+    thread_for(site, lvol, {
+      Coordinate lcoor;
+      grid->LocalIndexToLocalCoor(site, lcoor);
+      Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
+      Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
+      typename SiteClover::scalar_object Qx = Zero(), Qxinv = Zero();
+      peekLocalSite(Qx, CTv, lcoor);
+      //if (csw!=0){
+      for (int j = 0; j < Ns; j++)
+	for (int k = 0; k < Ns; k++)
+	  for (int a = 0; a < DimRep; a++)
+	    for (int b = 0; b < DimRep; b++){
+	      auto zz =  Qx()(j, k)(a, b);
+	      EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
+	    }
+      //   if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
+      
+      EigenInvCloverOp = EigenCloverOp.inverse();
+      //std::cout << EigenInvCloverOp << std::endl;
+      for (int j = 0; j < Ns; j++)
+	for (int k = 0; k < Ns; k++)
+	  for (int a = 0; a < DimRep; a++)
+	    for (int b = 0; b < DimRep; b++)
+	      Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
+      //    if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
+      //  }
+      pokeLocalSite(Qxinv, CTIv, lcoor);
+    });
+  }
+
+  double t6 = usecond();
  // Separate the even and odd parts
  pickCheckerboard(Even, CloverTermEven, CloverTerm);
  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
@@ -148,44 +184,48 @@ void WilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField &_Um

  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
-  double t6 = usecond();
+  double t7 = usecond();

-  std::cout << GridLogDebug << "WilsonCloverFermion::ImportGauge timings:" << std::endl;
-  std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
-  std::cout << GridLogDebug << "allocations =                " << (t2 - t1) / 1e6 << std::endl;
-  std::cout << GridLogDebug << "field strength =             " << (t3 - t2) / 1e6 << std::endl;
-  std::cout << GridLogDebug << "fill clover =                " << (t4 - t3) / 1e6 << std::endl;
-  std::cout << GridLogDebug << "instantiation =              " << (t5 - t4) / 1e6 << std::endl;
-  std::cout << GridLogDebug << "pick cbs =                   " << (t6 - t5) / 1e6 << std::endl;
-  std::cout << GridLogDebug << "total =                      " << (t6 - t0) / 1e6 << std::endl;
+#if 0 // timing report is compiled out; the usecond() timestamps it would print are still taken
+  std::cout << GridLogMessage << "WilsonCloverFermion::ImportGauge timings:"
+            << " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6
+            << ", allocations = "               << (t2 - t1) / 1e6
+            << ", field strength = "            << (t3 - t2) / 1e6
+            << ", fill clover = "               << (t4 - t3) / 1e6
+            << ", misc = "                      << (t5 - t4) / 1e6
+            << ", inversions = "                << (t6 - t5) / 1e6
+            << ", pick cbs = "                  << (t7 - t6) / 1e6
+            << ", total = "                     << (t7 - t0) / 1e6
+            << std::endl;
+#endif // 0
 }

-template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField &in, FermionField &out)
+template <class Impl> // Mooee: delegates to MooeeInternal with dag=DaggerNo, inv=InverseNo
+void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerNo, InverseNo);
 }

-template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl, CloverHelpers>::MooeeDag(const FermionField &in, FermionField &out)
+template <class Impl> // MooeeDag: delegates to MooeeInternal with dag=DaggerYes, inv=InverseNo
+void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerYes, InverseNo);
 }

-template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionField &in, FermionField &out)
+template <class Impl> // MooeeInv: delegates to MooeeInternal with dag=DaggerNo, inv=InverseYes
+void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerNo, InverseYes);
 }

-template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInvDag(const FermionField &in, FermionField &out)
+template <class Impl> // MooeeInvDag: delegates to MooeeInternal with dag=DaggerYes, inv=InverseYes
+void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerYes, InverseYes);
 }

-template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
+template <class Impl>
+void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
 {
  out.Checkerboard() = in.Checkerboard();
  CloverField *Clover;
@@ -238,8 +278,8 @@ void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField
 } // MooeeInternal

 // Derivative parts unpreconditioned pseudofermions
-template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
+template <class Impl>
+void WilsonCloverFermion<Impl>::MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
 {
  conformable(X.Grid(), Y.Grid());
  conformable(X.Grid(), force.Grid());
@@ -309,7 +349,7 @@ void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const F
      }
      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
-      force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu);                   // checked
+      force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu);                   // checked
      count++;
    }

@@ -320,15 +360,15 @@ void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const F
 }

 // Derivative parts
-template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
+template <class Impl> // MooDeriv: unimplemented stub; the body is a bare assert(0) and mat is never written
+void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
 {
  assert(0);
 }

 // Derivative parts
-template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
+template <class Impl> // MeeDeriv: unimplemented stub; the body is a bare assert(0) and mat is never written
+void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
 {
  assert(0); // not implemented yet
 }
@@ -60,7 +60,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  UmuOdd (_FourDimRedBlackGrid),
  Lebesgue(_FourDimGrid),
  LebesgueEvenOdd(_FourDimRedBlackGrid),
-  _tmp(&FiveDimRedBlackGrid)
+  _tmp(&FiveDimRedBlackGrid),
+  Dirichlet(0) // starts at 0, so the `if ( Dirichlet )` filter in ImportGauge is skipped unless it is set later
 {
  // some assertions
  assert(FiveDimGrid._ndimension==5);
@@ -218,6 +219,14 @@ void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
 {
  GaugeField HUmu(_Umu.Grid());
  HUmu = _Umu*(-0.5);
+  if ( Dirichlet ) { // filter the rescaled links HUmu only when the Dirichlet flag is non-zero
+    std::cout << GridLogMessage << " Dirichlet BCs 5d " <<Block<<std::endl;
+    Coordinate GaugeBlock(Nd); // Nd-entry block passed to the gauge-field filter
+    for(int d=0;d<Nd;d++) GaugeBlock[d] = Block[d+1]; // skips Block[0]; NOTE(review): presumably the fifth-dimension entry, and assumes Block has at least Nd+1 entries — confirm
+    std::cout << GridLogMessage << " Dirichlet BCs 4d " <<GaugeBlock<<std::endl;
+    DirichletFilter<GaugeField> Filter(GaugeBlock);
+    Filter.applyFilter(HUmu); // applied before DoubleStore, so Umu is built from the filtered links
+  }
  Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
  pickCheckerboard(Even,UmuEven,Umu);
  pickCheckerboard(Odd ,UmuOdd,Umu);
@@ -4,13 +4,12 @@ Grid physics library, www.github.com/paboyle/Grid

 Source file: ./lib/qcd/action/fermion/WilsonFermion.cc

-Copyright (C) 2022
+Copyright (C) 2015

 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: Fabian Joswig <fabian.joswig@ed.ac.uk>

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -600,47 +599,11 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
                                                   Current curr_type,
                                                   unsigned int mu)
 {
-  if(curr_type != Current::Vector)
-  {
-    std::cout << GridLogError << "Only the conserved vector current is implemented so far." << std::endl;
-    exit(1);
-  }
-
  Gamma g5(Gamma::Algebra::Gamma5);
  conformable(_grid, q_in_1.Grid());
  conformable(_grid, q_in_2.Grid());
  conformable(_grid, q_out.Grid());
-  auto UGrid= this->GaugeGrid();
-
-  PropagatorField tmp_shifted(UGrid);
-  PropagatorField g5Lg5(UGrid);
-  PropagatorField R(UGrid);
-  PropagatorField gmuR(UGrid);
-
-    Gamma::Algebra Gmu [] = {
-    Gamma::Algebra::GammaX,
-    Gamma::Algebra::GammaY,
-    Gamma::Algebra::GammaZ,
-    Gamma::Algebra::GammaT,
-  };
-  Gamma gmu=Gamma(Gmu[mu]);
-
-  g5Lg5=g5*q_in_1*g5;
-  tmp_shifted=Cshift(q_in_2,mu,1);
-  Impl::multLinkField(R,this->Umu,tmp_shifted,mu);
-  gmuR=gmu*R;
-
-  q_out=adj(g5Lg5)*R;
-  q_out-=adj(g5Lg5)*gmuR;
-
-  tmp_shifted=Cshift(q_in_1,mu,1);
-  Impl::multLinkField(g5Lg5,this->Umu,tmp_shifted,mu);
-  g5Lg5=g5*g5Lg5*g5;
-  R=q_in_2;
-  gmuR=gmu*R;
-
-  q_out-=adj(g5Lg5)*R;
-  q_out-=adj(g5Lg5)*gmuR;
+  assert(0); // implementation removed by this change; NOTE: under NDEBUG the assert vanishes and q_out is returned unmodified
 }


@@ -654,51 +617,9 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
                                              unsigned int tmax,
 					      ComplexField &lattice_cmplx)
 {
-  if(curr_type != Current::Vector)
-  {
-    std::cout << GridLogError << "Only the conserved vector current is implemented so far." << std::endl;
-    exit(1);
-  }
-
-  int tshift = (mu == Nd-1) ? 1 : 0;
-  unsigned int LLt    = GridDefaultLatt()[Tp];
  conformable(_grid, q_in.Grid());
  conformable(_grid, q_out.Grid());
-  auto UGrid= this->GaugeGrid();
-
-  PropagatorField tmp(UGrid);
-  PropagatorField Utmp(UGrid);
-  PropagatorField L(UGrid);
-  PropagatorField zz (UGrid);
-  zz=Zero();
-  LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
-
-    Gamma::Algebra Gmu [] = {
-    Gamma::Algebra::GammaX,
-    Gamma::Algebra::GammaY,
-    Gamma::Algebra::GammaZ,
-    Gamma::Algebra::GammaT,
-  };
-  Gamma gmu=Gamma(Gmu[mu]);
-
-  tmp = Cshift(q_in,mu,1);
-  Impl::multLinkField(Utmp,this->Umu,tmp,mu);
-  tmp = ( Utmp*lattice_cmplx - gmu*Utmp*lattice_cmplx ); // Forward hop
-  tmp = where((lcoor>=tmin),tmp,zz); // Mask the time
-  q_out = where((lcoor<=tmax),tmp,zz); // Position of current complicated
-
-  tmp = q_in *lattice_cmplx;
-  tmp = Cshift(tmp,mu,-1);
-  Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
-  tmp = -( Utmp + gmu*Utmp );
-  // Mask the time
-  if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
-    unsigned int t0 = 0;
-    tmp = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
-  } else {
-    tmp = where((lcoor>=tmin+tshift),tmp,zz);
-  }
-  q_out+= where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
+  assert(0); // implementation removed by this change; NOTE: under NDEBUG the assert vanishes and q_out is returned unmodified
 }

 NAMESPACE_END(Grid);
@@ -498,7 +498,6 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDag);     return;}
 #endif
-     acceleratorFenceComputeStream();
   } else if( interior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt);    return;}
@@ -506,13 +505,11 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagInt);     return;}
 #endif
   } else if( exterior ) {
-     acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt);    return;}
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagExt);     return;}
 #endif
-     acceleratorFenceComputeStream();
   }
   assert(0 && " Kernel optimisation case not covered ");
  }
@@ -9,7 +9,6 @@
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
-    Author: Mattia Bruno <mattia.bruno@cern.ch>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -33,12 +32,10 @@
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
 #include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h>
-#include <Grid/qcd/action/fermion/CloverHelpers.h>

 NAMESPACE_BEGIN(Grid);

 #include "impl.h"
-template class CompactWilsonCloverFermion<IMPLEMENTATION, CompactCloverHelpers<IMPLEMENTATION>>; 
-template class CompactWilsonCloverFermion<IMPLEMENTATION, CompactExpCloverHelpers<IMPLEMENTATION>>; 
+template class CompactWilsonCloverFermion<IMPLEMENTATION>; // single explicit instantiation; the CloverHelpers template argument is no longer passed

 NAMESPACE_END(Grid);
@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015, 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
+
+#ifndef AVX512 // the Asm implementation header below is included only when none of these four macros is defined
+#ifndef QPX
+#ifndef A64FX
+#ifndef A64FXFIXEDSIZE
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
+#endif // A64FXFIXEDSIZE
+#endif // A64FX
+#endif // QPX
+#endif // AVX512
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h" // NOTE(review): presumably defines IMPLEMENTATION for this directory — confirm
+template class WilsonKernels<IMPLEMENTATION>; // explicit instantiation of WilsonKernels for IMPLEMENTATION
+
+NAMESPACE_END(Grid);
@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015, 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
+
+#ifndef AVX512 // the Asm implementation header below is included only when none of these four macros is defined
+#ifndef QPX
+#ifndef A64FX
+#ifndef A64FXFIXEDSIZE
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
+#endif // A64FXFIXEDSIZE
+#endif // A64FX
+#endif // QPX
+#endif // AVX512
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h" // NOTE(review): presumably defines IMPLEMENTATION for this directory — confirm
+template class WilsonKernels<IMPLEMENTATION>; // explicit instantiation of WilsonKernels for IMPLEMENTATION
+
+NAMESPACE_END(Grid);
@@ -8,8 +8,7 @@

    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
-    Author: Mattia Bruno <mattia.bruno@cern.ch>
-    
+
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
@@ -32,12 +31,10 @@
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/WilsonCloverFermion.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h>
-#include <Grid/qcd/action/fermion/CloverHelpers.h>

 NAMESPACE_BEGIN(Grid);

 #include "impl.h"
-template class WilsonCloverFermion<IMPLEMENTATION, CloverHelpers<IMPLEMENTATION>>; 
-template class WilsonCloverFermion<IMPLEMENTATION, ExpCloverHelpers<IMPLEMENTATION>>; 
+template class WilsonCloverFermion<IMPLEMENTATION>; // single explicit instantiation; the CloverHelpers template argument is no longer passed

 NAMESPACE_END(Grid);
@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015, 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
+
+#ifndef AVX512 // the Asm implementation header below is included only when none of these four macros is defined
+#ifndef QPX
+#ifndef A64FX
+#ifndef A64FXFIXEDSIZE
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
+#endif // A64FXFIXEDSIZE
+#endif // A64FX
+#endif // QPX
+#endif // AVX512
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h" // NOTE(review): presumably defines IMPLEMENTATION for this directory — confirm
+template class WilsonKernels<IMPLEMENTATION>; // explicit instantiation of WilsonKernels for IMPLEMENTATION
+
+NAMESPACE_END(Grid);
@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015, 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
+
+#ifndef AVX512 // the Asm implementation header below is included only when none of these four macros is defined
+#ifndef QPX
+#ifndef A64FX
+#ifndef A64FXFIXEDSIZE
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
+#endif // A64FXFIXEDSIZE
+#endif // A64FX
+#endif // QPX
+#endif // AVX512
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h" // NOTE(review): presumably defines IMPLEMENTATION for this directory — confirm
+template class WilsonKernels<IMPLEMENTATION>; // explicit instantiation of WilsonKernels for IMPLEMENTATION
+
+NAMESPACE_END(Grid);
@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015, 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
+
+#ifndef AVX512 // the Asm implementation header below is included only when none of these four macros is defined
+#ifndef QPX
+#ifndef A64FX
+#ifndef A64FXFIXEDSIZE
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
+#endif // A64FXFIXEDSIZE
+#endif // A64FX
+#endif // QPX
+#endif // AVX512
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h" // NOTE(review): presumably defines IMPLEMENTATION for this directory — confirm
+template class WilsonKernels<IMPLEMENTATION>; // explicit instantiation of WilsonKernels for IMPLEMENTATION
+
+NAMESPACE_END(Grid);
@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015, 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
+
+#ifndef AVX512 // the Asm implementation header below is included only when none of these four macros is defined
+#ifndef QPX
+#ifndef A64FX
+#ifndef A64FXFIXEDSIZE
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
+#endif // A64FXFIXEDSIZE
+#endif // A64FX
+#endif // QPX
+#endif // AVX512
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h" // NOTE(review): presumably defines IMPLEMENTATION for this directory — confirm
+template class WilsonKernels<IMPLEMENTATION>; // explicit instantiation of WilsonKernels for IMPLEMENTATION
+
+NAMESPACE_END(Grid);
@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015, 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
+
+#ifndef AVX512 // the Asm implementation header below is included only when none of these four macros is defined
+#ifndef QPX
+#ifndef A64FX
+#ifndef A64FXFIXEDSIZE
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
+#endif // A64FXFIXEDSIZE
+#endif // A64FX
+#endif // QPX
+#endif // AVX512
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h" // NOTE(review): presumably defines IMPLEMENTATION for this directory — confirm
+template class WilsonKernels<IMPLEMENTATION>; // explicit instantiation of WilsonKernels for IMPLEMENTATION
+
+NAMESPACE_END(Grid);
@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015, 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
+
+#ifndef AVX512 // the Asm implementation header below is included only when none of these four macros is defined
+#ifndef QPX
+#ifndef A64FX
+#ifndef A64FXFIXEDSIZE
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
+#endif // A64FXFIXEDSIZE
+#endif // A64FX
+#endif // QPX
+#endif // AVX512
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h" // NOTE(review): presumably defines IMPLEMENTATION for this directory — confirm
+template class WilsonKernels<IMPLEMENTATION>; // explicit instantiation of WilsonKernels for IMPLEMENTATION
+
+NAMESPACE_END(Grid);
@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015, 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
+
+#ifndef AVX512 // the Asm implementation header below is included only when none of these four macros is defined
+#ifndef QPX
+#ifndef A64FX
+#ifndef A64FXFIXEDSIZE
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
+#endif // A64FXFIXEDSIZE
+#endif // A64FX
+#endif // QPX
+#endif // AVX512
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h" // NOTE(review): presumably defines IMPLEMENTATION for this directory — confirm
+template class WilsonKernels<IMPLEMENTATION>; // explicit instantiation of WilsonKernels for IMPLEMENTATION
+
+NAMESPACE_END(Grid);
@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015, 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
+
+#ifndef AVX512 // the Asm implementation header below is included only when none of these four macros is defined
+#ifndef QPX
+#ifndef A64FX
+#ifndef A64FXFIXEDSIZE
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
+#endif // A64FXFIXEDSIZE
+#endif // A64FX
+#endif // QPX
+#endif // AVX512
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h" // NOTE(review): presumably defines IMPLEMENTATION for this directory — confirm
+template class WilsonKernels<IMPLEMENTATION>; // explicit instantiation of WilsonKernels for IMPLEMENTATION
+
+NAMESPACE_END(Grid);
@@ -18,10 +18,6 @@ WILSON_IMPL_LIST=" \
 	   GparityWilsonImplF \
 	   GparityWilsonImplD "

-COMPACT_WILSON_IMPL_LIST=" \
-	   WilsonImplF \
-	   WilsonImplD "
-
 DWF_IMPL_LIST=" \
 	   WilsonImplF \
 	   WilsonImplD \
@@ -44,23 +40,13 @@ EOF

 done

-CC_LIST="WilsonCloverFermionInstantiation WilsonFermionInstantiation WilsonKernelsInstantiation WilsonTMFermionInstantiation"
+CC_LIST="WilsonCloverFermionInstantiation CompactWilsonCloverFermionInstantiation WilsonFermionInstantiation WilsonKernelsInstantiation WilsonTMFermionInstantiation"

 for impl in $WILSON_IMPL_LIST
 do
 for f in $CC_LIST
 do
-  ln -f -s ../$f.cc.master $impl/$f$impl.cc
-done
-done
-
-CC_LIST="CompactWilsonCloverFermionInstantiation"
-
-for impl in $COMPACT_WILSON_IMPL_LIST
-do
-for f in $CC_LIST
-do
-  ln -f -s ../$f.cc.master $impl/$f$impl.cc
+  ln -f -s ../$f.cc.master $impl/$f$impl.cc 
 done
 done

@@ -77,14 +63,14 @@ for impl in $DWF_IMPL_LIST $GDWF_IMPL_LIST
 do
 for f in $CC_LIST
 do
-  ln -f -s ../$f.cc.master $impl/$f$impl.cc
+  ln -f -s ../$f.cc.master $impl/$f$impl.cc 
 done
 done

 # overwrite the .cc file in Gparity directories
 for impl in $GDWF_IMPL_LIST
 do
-  ln -f -s ../WilsonKernelsInstantiationGparity.cc.master $impl/WilsonKernelsInstantiation$impl.cc
+  ln -f -s ../WilsonKernelsInstantiationGparity.cc.master $impl/WilsonKernelsInstantiation$impl.cc 
 done


@@ -98,7 +84,7 @@ for impl in $STAG_IMPL_LIST
 do
 for f in $CC_LIST
 do
-  ln -f -s ../$f.cc.master $impl/$f$impl.cc
+  ln -f -s ../$f.cc.master $impl/$f$impl.cc 
 done
 done

@@ -0,0 +1,102 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/hmc/integrators/DirichletFilter.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+//--------------------------------------------------------------------
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+////////////////////////////////////////////////////
+// DDHMC filter with sub-block size B[mu]
+////////////////////////////////////////////////////
+// In-place filter: zeroes Width-thick slabs on both sides of every block boundary (Width = 1, 2 or 3 only).
+template<typename GaugeField>
+struct DDHMCFilter: public MomentumFilterBase<GaugeField>
+{
+  Coordinate Block; // block extent per direction; a zero entry switches the filter off in that direction
+  int Width;        // slab thickness in sites; values other than 1, 2, 3 match no branch in applyFilter
+  
+  DDHMCFilter(const Coordinate &_Block,int _Width=2): Block(_Block) { Width=_Width; }
+  // Applies the filter to U; directions with Block[mu]==0 or Block[mu] > global extent are left untouched.
+  void applyFilter(GaugeField &U) const override
+  {
+    GridBase *grid = U.Grid();
+    Coordinate Global=grid->GlobalDimensions();
+    GaugeField zzz(grid); zzz = Zero(); // zero field: replacement value for whole sites (all Lorentz components)
+    LatticeInteger coor(grid); // receives the mu-coordinate of every site inside the loop
+    
+    auto zzz_mu = PeekIndex<LorentzIndex>(zzz,0); // zero field of single-direction type, paired with U_mu below
+    ////////////////////////////////////////////////////
+    // Zero BDY layers
+    ////////////////////////////////////////////////////
+    std::cout<<GridLogMessage<<" DDHMC Force Filter Block "<<Block<<" width " <<Width<<std::endl;
+    for(int mu=0;mu<Nd;mu++) {
+
+      Integer B1 = Block[mu];
+      if ( B1 && (B1 <= Global[mu]) ) {
+	LatticeCoordinate(coor,mu);
+
+	////////////////////////////////
+	// OmegaBar - zero all links contained in slice B-1,0 and
+	// mu links connecting to Omega
+	////////////////////////////////
+	if ( Width==1) { // slices B1-1 and 0 zeroed entirely; mu component also zeroed on slice B1-2
+	  U    = where(mod(coor,B1)==Integer(B1-1),zzz,U);
+	  U    = where(mod(coor,B1)==Integer(0)   ,zzz,U); 
+	  auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
+	  U_mu = where(mod(coor,B1)==Integer(B1-2),zzz_mu,U_mu); 
+	  PokeIndex<LorentzIndex>(U, U_mu, mu);
+	}
+	if ( Width==2) { // slices B1-2, B1-1, 0, 1 zeroed entirely; mu component also zeroed on slice B1-3
+	  U    = where(mod(coor,B1)==Integer(B1-2),zzz,U);
+	  U    = where(mod(coor,B1)==Integer(B1-1),zzz,U);
+	  U    = where(mod(coor,B1)==Integer(0)   ,zzz,U); 
+	  U    = where(mod(coor,B1)==Integer(1)   ,zzz,U); 
+	  auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
+	  U_mu = where(mod(coor,B1)==Integer(B1-3),zzz_mu,U_mu); 
+	  PokeIndex<LorentzIndex>(U, U_mu, mu);
+	}
+	if ( Width==3) { // slices B1-3..B1-1 and 0..2 zeroed entirely; mu component also zeroed on slice B1-4
+	  U    = where(mod(coor,B1)==Integer(B1-3),zzz,U);
+	  U    = where(mod(coor,B1)==Integer(B1-2),zzz,U);
+	  U    = where(mod(coor,B1)==Integer(B1-1),zzz,U);
+	  U    = where(mod(coor,B1)==Integer(0)   ,zzz,U); 
+	  U    = where(mod(coor,B1)==Integer(1)   ,zzz,U); 
+	  U    = where(mod(coor,B1)==Integer(2)   ,zzz,U); 
+	  auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
+	  U_mu = where(mod(coor,B1)==Integer(B1-4),zzz_mu,U_mu); 
+	  PokeIndex<LorentzIndex>(U, U_mu, mu);
+	}
+      }
+
+    }
+   
+  }
+};
+
+NAMESPACE_END(Grid);
+
@@ -0,0 +1,71 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/hmc/integrators/DirichletFilter.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+//--------------------------------------------------------------------
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+// Momentum filter: zeroes the mu-direction component of P on the last slice of each block in direction mu.
+template<typename MomentaField>
+struct DirichletFilter: public MomentumFilterBase<MomentaField>
+{
+  typedef typename MomentaField::vector_type vector_type; //SIMD-vectorized complex type
+  typedef typename MomentaField::scalar_type scalar_type; //scalar complex type
+
+  typedef iScalar<iScalar<iScalar<vector_type> > >            ScalarType; //complex phase for each site
+  
+  Coordinate Block; // block extent per direction; a zero entry switches the filter off in that direction
+  
+  DirichletFilter(const Coordinate &_Block): Block(_Block){}
+  // Applies the filter to P in place; mu is filtered only if Block[mu] is nonzero and strictly below the global extent.
+  void applyFilter(MomentaField &P) const override
+  {
+    GridBase *grid = P.Grid();
+    typedef decltype(PeekIndex<LorentzIndex>(P, 0)) LatCM; // type of a single Lorentz component of P
+    ////////////////////////////////////////////////////
+    // Zero strictly links crossing between domains
+    ////////////////////////////////////////////////////
+    LatticeInteger coor(grid); // receives the mu-coordinate of every site inside the loop
+    LatCM zz(grid); zz = Zero();
+    for(int mu=0;mu<Nd;mu++) {
+      if ( (Block[mu]) && (Block[mu] < grid->GlobalDimensions()[mu] ) ) {
+	// If costly could provide Grid earlier and precompute masks
+	std::cout << " Dirichlet in mu="<<mu<<std::endl;
+	LatticeCoordinate(coor,mu);
+	auto P_mu = PeekIndex<LorentzIndex>(P, mu);
+	P_mu = where(mod(coor,Block[mu])==Integer(Block[mu]-1),zz,P_mu); // only slice Block[mu]-1 (mod Block[mu]) is zeroed
+	PokeIndex<LorentzIndex>(P, P_mu, mu);
+      }
+    }
+  }
+};
+
+
+
+NAMESPACE_END(Grid);
+
@@ -37,7 +37,7 @@ NAMESPACE_BEGIN(Grid);

 template<typename MomentaField>
 struct MomentumFilterBase{
-  virtual void applyFilter(MomentaField &P) const;
+  virtual void applyFilter(MomentaField &P) const = 0;
 };

 //Do nothing
@@ -69,6 +69,11 @@ public:
    return PeriodicBC::ShiftStaple(Link,mu);
  }

+  //Same as Cshift for periodic BCs; takes the link field, the direction mu and the shift
+  static inline GaugeLinkField CshiftLink(const GaugeLinkField &Link, int mu, int shift){
+    return PeriodicBC::CshiftLink(Link,mu,shift); // pure forwarding, arguments passed through unchanged
+  }
+
  static inline bool isPeriodicGaugeField(void) { return true; }
 };

@@ -110,6 +115,11 @@ public:
      return PeriodicBC::CovShiftBackward(Link, mu, field);
  }

+  //If mu is a conjugate BC direction
+  //Out(x) = U^dag_\mu(x-mu)  | x_\mu != 0
+  //       = U^T_\mu(L-1)  | x_\mu == 0
+  //else
+  //Out(x) = U^dag_\mu(x-mu mod L)
  static inline GaugeLinkField
  CovShiftIdentityBackward(const GaugeLinkField &Link, int mu)
  {
@@ -129,6 +139,13 @@ public:
      return PeriodicBC::CovShiftIdentityForward(Link,mu);
  }

+
+  //If mu is a conjugate BC direction
+  //Out(x) = S_\mu(x+mu)  | x_\mu != L-1
+  //       = S*_\mu(x+mu)  | x_\mu == L-1
+  //else
+  //Out(x) = S_\mu(x+mu mod L)
+  //Note: While this is used for Staples it is also applicable for shifting gauge links or gauge transformation matrices
  static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu)
  {
    assert(_conjDirs.size() == Nd);
@@ -138,6 +155,27 @@ public:
      return PeriodicBC::ShiftStaple(Link,mu);
  }

+  //Boundary-aware C-shift of gauge links / gauge transformation matrices
+  //For conjugate BC direction
+  //shift = 1
+  //Out(x) = U_\mu(x+\hat\mu)  | x_\mu != L-1
+  //       = U*_\mu(0)  | x_\mu == L-1
+  //shift = -1
+  //Out(x) = U_\mu(x-mu)  | x_\mu != 0
+  //       = U*_\mu(L-1)  | x_\mu == 0
+  //else
+  //shift = 1
+  //Out(x) = U_\mu(x+\hat\mu mod L)
+  //shift = -1
+  //Out(x) = U_\mu(x-\hat\mu mod L)
+  static inline GaugeLinkField CshiftLink(const GaugeLinkField &Link, int mu, int shift){
+    assert(_conjDirs.size() == Nd); // _conjDirs (assigned by setDirections) must hold one flag per dimension
+    if(_conjDirs[mu]) // nonzero flag: dispatch to the ConjugateBC implementation
+      return ConjugateBC::CshiftLink(Link,mu,shift);
+    else // zero flag: dispatch to the PeriodicBC implementation
+      return PeriodicBC::CshiftLink(Link,mu,shift);
+  }
+
  static inline void       setDirections(std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
  static inline std::vector<int> getDirections(void) { return _conjDirs; }
  static inline bool isPeriodicGaugeField(void) { return false; }
@@ -49,7 +49,7 @@ NAMESPACE_BEGIN(Grid);
    
    typedef Lattice<SiteLink>  LinkField;
    typedef Lattice<SiteField> Field;
-    typedef LinkField          ComplexField;
+    typedef Field              ComplexField;
  };
  
  typedef QedGImpl<vComplex> QedGImplR;
@@ -13,6 +13,31 @@ NAMESPACE_BEGIN(Grid);
      std::cout << GridLogMessage << "Pseudofermion action lamda_max "<<lambda_max<<"( bound "<<hi<<")"<<std::endl;
      assert( (lambda_max < hi) && " High Bounds Check on operator failed" );
    }
+    // Applies an order-1000 Chebyshev<Field>(lo,hi) of HermOp to GaussNoise and asserts |result|^2 < |noise|^2.
+     template<class Field> void ChebyBoundsCheck(LinearOperatorBase<Field> &HermOp,
+						 Field &GaussNoise,
+						 RealD lo,RealD hi) 
+    {
+      int orderfilter = 1000; // polynomial order handed to the Chebyshev constructor
+      Chebyshev<Field> Cheb(lo,hi,orderfilter);
+
+      GridBase *FermionGrid = GaussNoise.Grid();
+
+      Field X(FermionGrid);
+      Field Z(FermionGrid);
+
+      X=GaussNoise; // work on a copy; GaussNoise itself is not modified here
+      RealD Nx = norm2(X);
+      Cheb(HermOp,X,Z); // Z is reported below as "Cheb x noise"
+      RealD Nz = norm2(Z);
+
+      std::cout << "************************* "<<std::endl;
+      std::cout << " noise                    = "<<Nx<<std::endl;
+      std::cout << " Cheb x noise             = "<<Nz<<std::endl;
+      std::cout << " Ratio                    = "<<Nz/Nx<<std::endl;
+      std::cout << "************************* "<<std::endl;
+      assert( ((Nz/Nx)<1.0) && " ChebyBoundsCheck "); // fails when the filtered norm is not smaller than the input norm
+    }
      
    template<class Field> void InverseSqrtBoundsCheck(int MaxIter,double tol,
 						       LinearOperatorBase<Field> &HermOp,
@@ -40,13 +65,65 @@ NAMESPACE_BEGIN(Grid);
      X=X-Y;
      RealD Nd = norm2(X);
      std::cout << "************************* "<<std::endl;
-      std::cout << " noise                         = "<<Nx<<std::endl;
-      std::cout << " (MdagM^-1/2)^2  noise         = "<<Nz<<std::endl;
-      std::cout << " MdagM (MdagM^-1/2)^2  noise   = "<<Ny<<std::endl;
-      std::cout << " noise - MdagM (MdagM^-1/2)^2  noise   = "<<Nd<<std::endl;
+      std::cout << " | noise |^2                         = "<<Nx<<std::endl;
+      std::cout << " | (MdagM^-1/2)^2  noise |^2         = "<<Nz<<std::endl;
+      std::cout << " | MdagM (MdagM^-1/2)^2  noise |^2   = "<<Ny<<std::endl;
+      std::cout << " | noise - MdagM (MdagM^-1/2)^2  noise |^2  = "<<Nd<<std::endl;
+      std::cout << " | noise - MdagM (MdagM^-1/2)^2  noise|/|noise| = " << std::sqrt(Nd/Nx) << std::endl;
      std::cout << "************************* "<<std::endl;
      assert( (std::sqrt(Nd/Nx)<tol) && " InverseSqrtBoundsCheck ");
    }

+    /* For a HermOp = M^dag M, check the approximation of  HermOp^{-1/inv_pow}
+       by computing   |X -    HermOp * [ HermOp^{-1/inv_pow} ]^{inv_pow} X| / |X|  < tol
+       for noise X (aka GaussNoise).  inv_pow must be >= 1 (asserted).
+       ApproxNegPow should be the rational approximation for   X^{-1/inv_pow}
+    */
+    template<class Field> void InversePowerBoundsCheck(int inv_pow,
+						       int MaxIter,double tol,
+						       LinearOperatorBase<Field> &HermOp,
+						       Field &GaussNoise,
+						       MultiShiftFunction &ApproxNegPow) 
+    {
+      GridBase *FermionGrid = GaussNoise.Grid();
+      assert( (inv_pow >= 1) && " InversePowerBoundsCheck: inv_pow must be >= 1 "); // else the loop never writes *out
+      Field X(FermionGrid);
+      Field Y(FermionGrid);
+      Field Z(FermionGrid);
+
+      Field tmp1(FermionGrid), tmp2(FermionGrid);
+
+      X=GaussNoise;
+      RealD Nx = norm2(X);
+
+      ConjugateGradientMultiShift<Field> msCG(MaxIter,ApproxNegPow);
+
+      tmp1 = X;
+      // Ping-pong between tmp1 and tmp2: each pass reads *in and writes *out.
+      Field* in = &tmp1;
+      Field* out = &tmp2;
+      for(int i=0;i<inv_pow;i++){ //apply  [ HermOp^{-1/inv_pow}  ]^{inv_pow} X =   HermOp^{-1} X
+	msCG(HermOp, *in, *out); //backwards conventions!
+	if(i!=inv_pow-1) std::swap(in, out); // no swap after the last pass, so *out holds the final result
+      }
+      Z = *out;
+
+      RealD Nz = norm2(Z);
+
+      HermOp.HermOp(Z,Y);
+      RealD Ny = norm2(Y);
+
+      X=X-Y; // residual: noise - MdagM (MdagM^-1/inv_pow)^inv_pow noise
+      RealD Nd = norm2(X);
+      std::cout << "************************* "<<std::endl;
+      std::cout << " | noise |^2                         = "<<Nx<<std::endl;
+      std::cout << " | (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2        = "<<Nz<<std::endl;
+      std::cout << " | MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2   = "<<Ny<<std::endl;
+      std::cout << " | noise - MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2  = "<<Nd<<std::endl;
+      std::cout << " | noise - MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |/| noise |  = "<<std::sqrt(Nd/Nx)<<std::endl;
+      std::cout << "************************* "<<std::endl;
+      assert( (std::sqrt(Nd/Nx)<tol) && " InversePowerBoundsCheck ");
+    }
+
 NAMESPACE_END(Grid);

@@ -0,0 +1,163 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/pseudofermion/DomainDecomposedTwoFlavourBoundaryBoson.h
+
+    Copyright (C) 2021
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+///////////////////////////////////////
+// Two flavour boundary boson: S = phi^dag Pdag P phi (NumOp == P only; not a ratio)
+///////////////////////////////////////
+template<class ImplD,class ImplF>
+class DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion : public Action<typename ImplD::GaugeField> {
+public:
+  INHERIT_IMPL_TYPES(ImplD);
+
+private:
+  SchurFactoredFermionOperator<ImplD,ImplF> & NumOp;// the basic operator
+  RealD InnerStoppingCondition;
+  RealD ActionStoppingCondition;
+  RealD DerivativeStoppingCondition;
+  FermionField Phi; // the pseudo fermion field for this trajectory
+public:
+  // Stores the operator reference and the solver tolerances; initialisers follow member declaration order.
+  DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion(SchurFactoredFermionOperator<ImplD,ImplF>  &_NumOp,RealD _DerivativeTol, RealD _ActionTol, RealD _InnerTol=1.0e-6)
+    : NumOp(_NumOp),
+      InnerStoppingCondition(_InnerTol), ActionStoppingCondition(_ActionTol),
+      DerivativeStoppingCondition(_DerivativeTol),
+      Phi(_NumOp.FermionGrid()) {}; // Phi lives on the operator's fermion grid
+
+  virtual std::string action_name(){return "DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion";} // fixed name string of this action
+
+  virtual std::string LogParameters(){ // nothing is written to the stream, so the returned string is empty
+    std::stringstream sstream;
+    return sstream.str();
+  }  
+  // Heatbath: draws Gaussian eta (scaled by sqrt(0.5)), projects it, then sets Phi = NumOp.RInv(eta). sRNG is unused.
+  virtual void refresh(const GaugeField &U, GridSerialRNG& sRNG, GridParallelRNG& pRNG)
+  {
+    // P(phi) = e^{- phi^dag P^dag P phi}
+    //
+    // NumOp == P
+    //
+    // Take phi = P^{-1} eta  ; eta = P Phi
+    //
+    // P(eta) = e^{- eta^dag eta}
+    //
+    // e^{-x^2/(2 sig^2)} => sig^2 = 0.5.
+    // 
+    // So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707....
+    //
+    RealD scale = std::sqrt(0.5);
+
+    NumOp.tolinner=InnerStoppingCondition;
+    NumOp.tol=ActionStoppingCondition; // heatbath uses the action tolerance
+    NumOp.ImportGauge(U);
+
+    FermionField eta(NumOp.FermionGrid());
+
+    gaussian(pRNG,eta);    eta=eta*scale;
+    
+    NumOp.ProjectBoundaryBar(eta);
+    //DumpSliceNorm("eta",eta);
+    NumOp.RInv(eta,Phi); // Phi = P^{-1} eta in the notation above
+
+    //DumpSliceNorm("Phi",Phi);
+
+  };
+
+  //////////////////////////////////////////////////////
+  // S = phi^dag Pdag P phi
+  //////////////////////////////////////////////////////
+  virtual RealD S(const GaugeField &U) {
+    // Evaluated with the action tolerance; returns norm2 of NumOp.R applied to Phi.
+    NumOp.tolinner=InnerStoppingCondition;
+    NumOp.tol=ActionStoppingCondition;
+    NumOp.ImportGauge(U);
+
+    FermionField Y(NumOp.FermionGrid());
+
+    NumOp.R(Phi,Y); // Y = P Phi in the notation above
+
+    RealD action = norm2(Y);
+
+    return action;
+  };
+  // Writes the force into dSdU: sum of four DirichletFermOpD.MDeriv terms built from vectors A, B, E, F, times -1.
+  virtual void deriv(const GaugeField &U,GaugeField & dSdU)
+  {
+    NumOp.tolinner=InnerStoppingCondition;
+    NumOp.tol=DerivativeStoppingCondition; // force evaluation uses the derivative tolerance
+    NumOp.ImportGauge(U);
+
+    GridBase *fgrid = NumOp.FermionGrid();
+    GridBase *ugrid = NumOp.GaugeGrid();
+
+    FermionField  X(fgrid);
+    FermionField  Y(fgrid);
+    FermionField  tmp(fgrid);
+
+    GaugeField   force(ugrid); // scratch for each individual MDeriv contribution
+
+    FermionField DobiDdbPhi(fgrid);      // Vector A in my notes
+    FermionField DoiDdDobiDdbPhi(fgrid); // Vector B in my notes
+    FermionField DoidP_Phi(fgrid);    // Vector E in my notes
+    FermionField DobidDddDoidP_Phi(fgrid);    // Vector F in my notes
+    
+    FermionField P_Phi(fgrid);
+    
+    // P term
+    NumOp.dBoundaryBar(Phi,tmp);
+    NumOp.dOmegaBarInv(tmp,DobiDdbPhi);        // Vector A
+    NumOp.dBoundary(DobiDdbPhi,tmp);
+    NumOp.dOmegaInv(tmp,DoiDdDobiDdbPhi);      // Vector B
+    P_Phi  = Phi - DoiDdDobiDdbPhi;
+    NumOp.ProjectBoundaryBar(P_Phi);
+    
+    // P^dag P term
+    NumOp.dOmegaDagInv(P_Phi,DoidP_Phi); // Vector E
+    NumOp.dBoundaryDag(DoidP_Phi,tmp);
+    NumOp.dOmegaBarDagInv(tmp,DobidDddDoidP_Phi);   // Vector F
+    NumOp.dBoundaryBarDag(DobidDddDoidP_Phi,tmp); // NOTE(review): tmp is not read again in this function -- confirm this call is needed
+
+    X = DobiDdbPhi;
+    Y = DobidDddDoidP_Phi;
+    NumOp.DirichletFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=force;
+    NumOp.DirichletFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
+
+    X = DoiDdDobiDdbPhi;
+    Y = DoidP_Phi;
+    NumOp.DirichletFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=dSdU+force;
+    NumOp.DirichletFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
+
+    dSdU *= -1.0; // overall sign flip applied to the accumulated sum
+
+  };
+};
+
+NAMESPACE_END(Grid);
+
@@ -0,0 +1,158 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/pseudofermion/DomainDecomposedTwoFlavourBoundary.h
+
+    Copyright (C) 2021
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+///////////////////////////////////////
+// Two flavour boundary pseudofermion: S = phi^dag Rdag^-1 R^-1 phi (DenOp == R only; not a ratio)
+///////////////////////////////////////
+template<class ImplD,class ImplF>
+class DomainDecomposedBoundaryTwoFlavourPseudoFermion : public Action<typename ImplD::GaugeField> {
+public:
+  INHERIT_IMPL_TYPES(ImplD);
+
+private:
+  SchurFactoredFermionOperator<ImplD,ImplF> & DenOp;// the basic operator
+  RealD ActionStoppingCondition;
+  RealD DerivativeStoppingCondition;
+  RealD InnerStoppingCondition;
+
+  FermionField Phi; // the pseudo fermion field for this trajectory
+
+  RealD refresh_action;
+public:
+  // Stores the operator reference and the solver tolerances; initialisers follow member declaration order.
+  DomainDecomposedBoundaryTwoFlavourPseudoFermion(SchurFactoredFermionOperator<ImplD,ImplF>  &_DenOp,RealD _DerivativeTol, RealD _ActionTol, RealD _InnerTol = 1.0e-6 )
+    : DenOp(_DenOp),
+      ActionStoppingCondition(_ActionTol), DerivativeStoppingCondition(_DerivativeTol),
+      InnerStoppingCondition(_InnerTol),
+      Phi(_DenOp.FermionGrid()), refresh_action(0.0) {}; // refresh_action is overwritten by refresh(); zero until then
+      
+  virtual std::string action_name(){return "DomainDecomposedBoundaryTwoFlavourPseudoFermion";} // fixed name string of this action
+
+ 
+  virtual std::string LogParameters(){ // nothing is written to the stream, so the returned string is empty
+    std::stringstream sstream;
+    return sstream.str();
+  }  
+  // Heatbath: draws Gaussian eta (scaled by sqrt(0.5)), projects it, then sets Phi = DenOp.R(eta). sRNG is unused.
+  virtual void refresh(const GaugeField &U, GridSerialRNG& sRNG, GridParallelRNG& pRNG)
+  {
+    // P(phi) = e^{- phi^dag Rdag^-1 R^-1 phi}
+    //
+    // DenOp == R
+    //
+    // Take phi = R eta  ; eta = R^-1 Phi
+    //
+    // P(eta) = e^{- eta^dag eta}
+    //
+    // e^{-x^2/(2 sig^2)} => sig^2 = 0.5.
+    // 
+    // So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707....
+    //
+    RealD scale = std::sqrt(0.5);
+
+    DenOp.tolinner=InnerStoppingCondition;
+    DenOp.tol     =ActionStoppingCondition; // heatbath uses the action tolerance
+    DenOp.ImportGauge(U);
+
+    FermionField eta(DenOp.FermionGrid());
+
+    gaussian(pRNG,eta);    eta=eta*scale;
+    
+    DenOp.ProjectBoundaryBar(eta);
+    DenOp.R(eta,Phi); // Phi = R eta in the notation above
+    //DumpSliceNorm("Phi",Phi);
+    refresh_action = norm2(eta); // |eta|^2 after projection; stored only, not read elsewhere in this class
+  };
+
+  //////////////////////////////////////////////////////
+  // S = phi^dag Rdag^-1 R^-1 phi
+  //////////////////////////////////////////////////////
+  virtual RealD S(const GaugeField &U) {
+    // Evaluated with the action tolerance; returns norm2 of DenOp.RInv applied to Phi.
+    DenOp.tolinner=InnerStoppingCondition;
+    DenOp.tol=ActionStoppingCondition;
+    DenOp.ImportGauge(U);
+
+    FermionField X(DenOp.FermionGrid());
+
+    DenOp.RInv(Phi,X); // X = R^-1 Phi in the notation above
+
+    RealD action = norm2(X);
+
+    return action;
+  };
+  // Writes the force into dSdU: sum of two PeriodicFermOpD.MDeriv terms built from vectors C and D, times -1.
+  virtual void deriv(const GaugeField &U,GaugeField & dSdU)
+  {
+    DenOp.tolinner=InnerStoppingCondition;
+    DenOp.tol=DerivativeStoppingCondition; // force evaluation uses the derivative tolerance
+    DenOp.ImportGauge(U);
+
+    GridBase *fgrid = DenOp.FermionGrid();
+    GridBase *ugrid = DenOp.GaugeGrid();
+
+    FermionField  X(fgrid);
+    FermionField  Y(fgrid);
+    FermionField  tmp(fgrid);
+
+    GaugeField   force(ugrid); // scratch for each individual MDeriv contribution
+
+    FermionField DiDdb_Phi(fgrid);      // Vector C in my notes
+    FermionField DidRinv_Phi(fgrid);    // Vector D in my notes
+    FermionField Rinv_Phi(fgrid);
+
+//   FermionField RinvDagRinv_Phi(fgrid);
+//   FermionField DdbdDidRinv_Phi(fgrid);
+
+    // R^-1 term
+    DenOp.dBoundaryBar(Phi,tmp);
+    DenOp.Dinverse(tmp,DiDdb_Phi);            // Vector C
+    Rinv_Phi = Phi - DiDdb_Phi;
+    DenOp.ProjectBoundaryBar(Rinv_Phi); 
+ 
+    // R^-dagger R^-1 term
+    DenOp.DinverseDag(Rinv_Phi,DidRinv_Phi); // Vector D
+/*
+    DenOp.dBoundaryBarDag(DidRinv_Phi,DdbdDidRinv_Phi);
+    RinvDagRinv_Phi = Rinv_Phi - DdbdDidRinv_Phi;
+    DenOp.ProjectBoundaryBar(RinvDagRinv_Phi);
+*/
+    X = DiDdb_Phi;
+    Y = DidRinv_Phi;
+    DenOp.PeriodicFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=force;
+    DenOp.PeriodicFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
+    DumpSliceNorm("force",dSdU); // called on every force evaluation, before the sign flip; NOTE(review): looks like debug output -- confirm
+    dSdU *= -1.0; // overall sign flip applied to the accumulated sum
+  };
+};
+
+NAMESPACE_END(Grid);
+
@@ -0,0 +1,237 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/pseudofermion/DomainDecomposedTwoFlavourBoundary.h
+
+    Copyright (C) 2021
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+///////////////////////////////////////
+// Two flavour ratio
+///////////////////////////////////////
+template<class ImplD,class ImplF>
+class DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion : public Action<typename ImplD::GaugeField> {
+public:
+  INHERIT_IMPL_TYPES(ImplD);
+
+private:
+  SchurFactoredFermionOperator<ImplD,ImplF> & NumOp;// the basic operator
+  SchurFactoredFermionOperator<ImplD,ImplF> & DenOp;// the basic operator
+
+  RealD InnerStoppingCondition;
+  RealD ActionStoppingCondition;
+  RealD DerivativeStoppingCondition;
+  
+  FermionField Phi; // the pseudo fermion field for this trajectory
+
+public:
+  // Stores both operator references and the solver tolerances; initialisers follow member declaration order.
+  DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion(SchurFactoredFermionOperator<ImplD,ImplF>  &_NumOp,
+						       SchurFactoredFermionOperator<ImplD,ImplF>  &_DenOp,
+						       RealD _DerivativeTol, RealD _ActionTol, RealD _InnerTol=1.0e-6)
+    : NumOp(_NumOp), DenOp(_DenOp),
+      InnerStoppingCondition(_InnerTol),
+      ActionStoppingCondition(_ActionTol),
+      DerivativeStoppingCondition(_DerivativeTol),
+      Phi(_NumOp.PeriodicFermOpD.FermionGrid()) {}; // Phi lives on the numerator operator's fermion grid
+      
+  virtual std::string action_name(){return "DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion";} // fixed name string of this action
+ 
+  virtual std::string LogParameters(){ // nothing is written to the stream, so the returned string is empty
+    std::stringstream sstream;
+    return sstream.str();
+  }  
+  // Heatbath: Gaussian eta (scaled by sqrt(0.5)) is projected, then Phi = NumOp.RInv(DenOp.R(eta)). sRNG is unused.
+  virtual void refresh(const GaugeField &U, GridSerialRNG& sRNG, GridParallelRNG& pRNG)
+  {
+    NumOp.ImportGauge(U);
+    DenOp.ImportGauge(U);
+
+    FermionField eta(NumOp.PeriodicFermOpD.FermionGrid());
+    FermionField tmp(NumOp.PeriodicFermOpD.FermionGrid());
+
+    // P(phi) = e^{- phi^dag P^dag Rdag^-1 R^-1 P phi}
+    //
+    // NumOp == P
+    // DenOp == R
+    //
+    // Take phi = P^{-1} R eta  ; eta = R^-1 P Phi
+    //
+    // P(eta) = e^{- eta^dag eta}
+    //
+    // e^{-x^2/(2 sig^2)} => sig^2 = 0.5.
+    // 
+    // So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707....
+    //
+    RealD scale = std::sqrt(0.5);
+
+    gaussian(pRNG,eta);    eta=eta*scale;
+    
+    NumOp.ProjectBoundaryBar(eta);
+    NumOp.tolinner=InnerStoppingCondition;
+    DenOp.tolinner=InnerStoppingCondition;
+    DenOp.tol = ActionStoppingCondition; // heatbath uses the action tolerance for both operators
+    NumOp.tol = ActionStoppingCondition;
+    DenOp.R(eta,tmp);    // tmp = R eta
+    NumOp.RInv(tmp,Phi); // Phi = P^{-1} R eta in the notation above
+    DumpSliceNorm("Phi",Phi); // called on every refresh; NOTE(review): looks like debug output -- confirm
+
+  };
+
+  //////////////////////////////////////////////////////
+  // S = phi^dag Pdag Rdag^-1 R^-1 P phi
+  //////////////////////////////////////////////////////
+  virtual RealD S(const GaugeField &U) {
+    // Evaluated with the action tolerance; returns norm2 of DenOp.RInv applied to NumOp.R(Phi).
+    NumOp.ImportGauge(U);
+    DenOp.ImportGauge(U);
+
+    FermionField X(NumOp.PeriodicFermOpD.FermionGrid());
+    FermionField Y(NumOp.PeriodicFermOpD.FermionGrid());
+
+    NumOp.tolinner=InnerStoppingCondition;
+    DenOp.tolinner=InnerStoppingCondition;
+    DenOp.tol = ActionStoppingCondition;
+    NumOp.tol = ActionStoppingCondition;
+    NumOp.R(Phi,Y);   // Y = P Phi
+    DenOp.RInv(Y,X);  // X = R^-1 P Phi
+
+    RealD action = norm2(X);
+    //    std::cout << " DD boundary action is " <<action<<std::endl;
+
+    return action;
+  };
+  // Writes the force into dSdU: four NumOp.DirichletFermOpD and two DenOp.PeriodicFermOpD MDeriv terms, times -1.
+  virtual void deriv(const GaugeField &U,GaugeField & dSdU)
+  {
+    NumOp.ImportGauge(U);
+    DenOp.ImportGauge(U);
+
+    GridBase *fgrid = NumOp.PeriodicFermOpD.FermionGrid();
+    GridBase *ugrid = NumOp.PeriodicFermOpD.GaugeGrid();
+
+    FermionField  X(fgrid);
+    FermionField  Y(fgrid);
+    FermionField  tmp(fgrid);
+
+    GaugeField   force(ugrid); // scratch for each individual MDeriv contribution
+
+    FermionField DobiDdbPhi(fgrid);      // Vector A in my notes
+    FermionField DoiDdDobiDdbPhi(fgrid); // Vector B in my notes
+    FermionField DiDdbP_Phi(fgrid);      // Vector C in my notes
+    FermionField DidRinvP_Phi(fgrid);    // Vector D in my notes
+    FermionField DdbdDidRinvP_Phi(fgrid);
+    FermionField DoidRinvDagRinvP_Phi(fgrid);    // Vector E in my notes
+    FermionField DobidDddDoidRinvDagRinvP_Phi(fgrid);    // Vector F in my notes
+    
+    FermionField P_Phi(fgrid);
+    FermionField RinvP_Phi(fgrid);
+    FermionField RinvDagRinvP_Phi(fgrid);
+    FermionField PdagRinvDagRinvP_Phi(fgrid);
+
+    //    RealD action = S(U);
+    NumOp.tolinner=InnerStoppingCondition;
+    DenOp.tolinner=InnerStoppingCondition;
+    DenOp.tol = DerivativeStoppingCondition; // force evaluation uses the derivative tolerance for both operators
+    NumOp.tol = DerivativeStoppingCondition;
+    
+    // P term
+    NumOp.dBoundaryBar(Phi,tmp);
+    NumOp.dOmegaBarInv(tmp,DobiDdbPhi);        // Vector A
+    NumOp.dBoundary(DobiDdbPhi,tmp);
+    NumOp.dOmegaInv(tmp,DoiDdDobiDdbPhi);      // Vector B
+    P_Phi  = Phi - DoiDdDobiDdbPhi;
+    NumOp.ProjectBoundaryBar(P_Phi);
+
+    // R^-1 P term
+    DenOp.dBoundaryBar(P_Phi,tmp);
+    DenOp.Dinverse(tmp,DiDdbP_Phi);            // Vector C
+    RinvP_Phi = P_Phi - DiDdbP_Phi;
+    DenOp.ProjectBoundaryBar(RinvP_Phi); // Correct to here
+
+ 
+    // R^-dagger R^-1 P term
+    DenOp.DinverseDag(RinvP_Phi,DidRinvP_Phi); // Vector D
+    DenOp.dBoundaryBarDag(DidRinvP_Phi,DdbdDidRinvP_Phi);
+    RinvDagRinvP_Phi = RinvP_Phi - DdbdDidRinvP_Phi;
+    DenOp.ProjectBoundaryBar(RinvDagRinvP_Phi);
+
+    
+    // P^dag R^-dagger R^-1 P term
+    NumOp.dOmegaDagInv(RinvDagRinvP_Phi,DoidRinvDagRinvP_Phi); // Vector E
+    NumOp.dBoundaryDag(DoidRinvDagRinvP_Phi,tmp);
+    NumOp.dOmegaBarDagInv(tmp,DobidDddDoidRinvDagRinvP_Phi);   // Vector F
+    NumOp.dBoundaryBarDag(DobidDddDoidRinvDagRinvP_Phi,tmp);
+    PdagRinvDagRinvP_Phi = RinvDagRinvP_Phi- tmp; // only referenced by the commented-out checks below
+    NumOp.ProjectBoundaryBar(PdagRinvDagRinvP_Phi);
+
+    /*
+    std::cout << "S eval  "<< action << std::endl;
+    std::cout << "S - IP1 "<< innerProduct(Phi,PdagRinvDagRinvP_Phi) << std::endl;
+    std::cout << "S - IP2 "<< norm2(RinvP_Phi) << std::endl;
+
+    NumOp.R(Phi,tmp);
+    tmp = tmp - P_Phi;
+    std::cout << "diff1 "<<norm2(tmp) <<std::endl;
+    
+    
+    DenOp.RInv(P_Phi,tmp);
+    tmp = tmp - RinvP_Phi;
+    std::cout << "diff2 "<<norm2(tmp) <<std::endl;
+
+    DenOp.RDagInv(RinvP_Phi,tmp);
+    tmp  = tmp - RinvDagRinvP_Phi;
+    std::cout << "diff3 "<<norm2(tmp) <<std::endl;
+
+    DenOp.RDag(RinvDagRinvP_Phi,tmp);
+    tmp  = tmp - PdagRinvDagRinvP_Phi;
+    std::cout << "diff4 "<<norm2(tmp) <<std::endl;
+    */
+    
+    dSdU=Zero();
+    // Numerator contribution from vectors A and F
+    X = DobiDdbPhi;
+    Y = DobidDddDoidRinvDagRinvP_Phi;
+    NumOp.DirichletFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=dSdU+force;
+    NumOp.DirichletFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
+    // Numerator contribution from vectors B and E
+    X = DoiDdDobiDdbPhi;
+    Y = DoidRinvDagRinvP_Phi;
+    NumOp.DirichletFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=dSdU+force;
+    NumOp.DirichletFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
+    // Denominator contribution from vectors C and D
+    X = DiDdbP_Phi;
+    Y = DidRinvP_Phi;
+    DenOp.PeriodicFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=dSdU+force;
+    DenOp.PeriodicFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
+
+    dSdU *= -1.0; // overall sign flip applied to the accumulated sum
+
+  };
+};
+
+NAMESPACE_END(Grid);
+
@@ -44,6 +44,10 @@ NAMESPACE_BEGIN(Grid);
  // Exact one flavour implementation of DWF determinant ratio //
  ///////////////////////////////////////////////////////////////

+  //Note: using mixed prec CG for the heatbath solver in this action class will not work
+  //      because the L, R operators must have their shift coefficients updated throughout the heatbath step
+  //      You will find that the heatbath solver simply won't converge.
+  //      To use mixed precision here use the ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction variant below
  template<class Impl>
  class ExactOneFlavourRatioPseudoFermionAction : public Action<typename Impl::GaugeField>
  {
@@ -57,37 +61,60 @@ NAMESPACE_BEGIN(Grid);
      bool use_heatbath_forecasting;
      AbstractEOFAFermion<Impl>& Lop; // the basic LH operator
      AbstractEOFAFermion<Impl>& Rop; // the basic RH operator
-      SchurRedBlackDiagMooeeSolve<FermionField> SolverHB;
+      SchurRedBlackDiagMooeeSolve<FermionField> SolverHBL;
+      SchurRedBlackDiagMooeeSolve<FermionField> SolverHBR;
      SchurRedBlackDiagMooeeSolve<FermionField> SolverL;
      SchurRedBlackDiagMooeeSolve<FermionField> SolverR;
      SchurRedBlackDiagMooeeSolve<FermionField> DerivativeSolverL;
      SchurRedBlackDiagMooeeSolve<FermionField> DerivativeSolverR;
      FermionField Phi; // the pseudofermion field for this trajectory

+      RealD norm2_eta; //|eta|^2 where eta is the random gaussian field used to generate the pseudofermion field
+      bool initial_action; //true for the first call to S after refresh, for which the identity S = |eta|^2 holds provided the rational approx is good
    public:

+      //Heatbath hook: set the shift coefficients of one of the two EOFA operators to 'to'.
+      //LorR == 0 selects the L operator, any other value selects the R operator.
+      //Virtual so that derived classes can keep additional operators in step.
+      virtual void heatbathRefreshShiftCoefficients(int LorR, RealD to){
+	if(LorR == 0){
+	  Lop.RefreshShiftCoefficients(to);
+	}else{
+	  Rop.RefreshShiftCoefficients(to);
+	}
+      }
+
+
+      //Use the same solver for L,R in all cases
      ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, 
 					      AbstractEOFAFermion<Impl>& _Rop,
 					      OperatorFunction<FermionField>& CG, 
 					      Params& p, 
 					      bool use_fc=false) 
-	: ExactOneFlavourRatioPseudoFermionAction(_Lop,_Rop,CG,CG,CG,CG,CG,p,use_fc) {};
-	
+	: ExactOneFlavourRatioPseudoFermionAction(_Lop,_Rop,CG,CG,CG,CG,CG,CG,p,use_fc) {};
+
+      //Use the same solver for L,R in the heatbath but different solvers elsewhere
      ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, 
 					      AbstractEOFAFermion<Impl>& _Rop,
-					      OperatorFunction<FermionField>& HeatbathCG, 
+					      OperatorFunction<FermionField>& HeatbathCG,
+					      OperatorFunction<FermionField>& ActionCGL, OperatorFunction<FermionField>& ActionCGR, 
+					      OperatorFunction<FermionField>& DerivCGL , OperatorFunction<FermionField>& DerivCGR, 
+					      Params& p, 
+					      bool use_fc=false)
+	: ExactOneFlavourRatioPseudoFermionAction(_Lop,_Rop,HeatbathCG,HeatbathCG, ActionCGL, ActionCGR, DerivCGL,DerivCGR,p,use_fc) {};
+
+      //Use different solvers for L,R in all cases
+      ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, 
+					      AbstractEOFAFermion<Impl>& _Rop,
+					      OperatorFunction<FermionField>& HeatbathCGL, OperatorFunction<FermionField>& HeatbathCGR,
 					      OperatorFunction<FermionField>& ActionCGL, OperatorFunction<FermionField>& ActionCGR, 
 					      OperatorFunction<FermionField>& DerivCGL , OperatorFunction<FermionField>& DerivCGR, 
 					      Params& p, 
 					      bool use_fc=false) : 
        Lop(_Lop), 
 	Rop(_Rop), 
-	SolverHB(HeatbathCG,false,true),
+	SolverHBL(HeatbathCGL,false,true), SolverHBR(HeatbathCGR,false,true),
 	SolverL(ActionCGL, false, true), SolverR(ActionCGR, false, true), 
 	DerivativeSolverL(DerivCGL, false, true), DerivativeSolverR(DerivCGR, false, true), 
 	Phi(_Lop.FermionGrid()), 
 	param(p), 
-        use_heatbath_forecasting(use_fc)
+	use_heatbath_forecasting(use_fc),
+	initial_action(false)
      {
        AlgRemez remez(param.lo, param.hi, param.precision);

@@ -97,6 +124,8 @@ NAMESPACE_BEGIN(Grid);
        PowerNegHalf.Init(remez, param.tolerance, true);
      };

+      const FermionField &getPhi() const{ return Phi; } //read-only access to the pseudofermion field Phi
+
      virtual std::string action_name() { return "ExactOneFlavourRatioPseudoFermionAction"; }

      virtual std::string LogParameters() {
@@ -117,6 +146,19 @@ NAMESPACE_BEGIN(Grid);
        else{ for(int s=0; s<Ls; ++s){ axpby_ssp_pminus(out, 0.0, in, 1.0, in, s, s); } }
      }

+      //Draw the Gaussian noise field for the heatbath and pass it to refresh(U,eta).
+      //sRNG is accepted as part of the interface but is not used here.
+      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
+        // Target distribution: P(eta_o) = e^{- eta_o^dag eta_o}
+        //
+        // Matching this to e^{-x^2/(2 sig^2)} requires sig^2 = 0.5,
+        // so the drawn field is rescaled by sqrt(0.5).
+        const RealD sigma = std::sqrt(0.5);
+
+        FermionField noise(Lop.FermionGrid());
+        gaussian(pRNG,noise);
+        noise = noise * sigma;
+
+        refresh(U,noise);
+      }
+
      // EOFA heatbath: see Eqn. (29) of arXiv:1706.05843
      // We generate a Gaussian noise vector \eta, and then compute
      //  \Phi = M_{\rm EOFA}^{-1/2} * \eta
@@ -124,12 +166,10 @@ NAMESPACE_BEGIN(Grid);
      //
      // As a check of rational require \Phi^dag M_{EOFA} \Phi == eta^dag M^-1/2^dag M M^-1/2 eta = eta^dag eta
      //
-      virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
-      {
+     void refresh(const GaugeField &U, const FermionField &eta) {
        Lop.ImportGauge(U);
        Rop.ImportGauge(U);

-        FermionField eta         (Lop.FermionGrid());
        FermionField CG_src      (Lop.FermionGrid());
        FermionField CG_soln     (Lop.FermionGrid());
        FermionField Forecast_src(Lop.FermionGrid());
@@ -140,11 +180,6 @@ NAMESPACE_BEGIN(Grid);
        if(use_heatbath_forecasting){ prev_solns.reserve(param.degree); }
        ChronoForecast<AbstractEOFAFermion<Impl>, FermionField> Forecast;

-        // Seed with Gaussian noise vector (var = 0.5)
-        RealD scale = std::sqrt(0.5);
-        gaussian(pRNG,eta);
-        eta = eta * scale;
-
        // \Phi = ( \alpha_{0} + \sum_{k=1}^{N_{p}} \alpha_{l} * \gamma_{l} ) * \eta
        RealD N(PowerNegHalf.norm);
        for(int k=0; k<param.degree; ++k){ N += PowerNegHalf.residues[k] / ( 1.0 + PowerNegHalf.poles[k] ); }
@@ -160,15 +195,15 @@ NAMESPACE_BEGIN(Grid);
        tmp[1] = Zero();
        for(int k=0; k<param.degree; ++k){
          gamma_l = 1.0 / ( 1.0 + PowerNegHalf.poles[k] );
-          Lop.RefreshShiftCoefficients(-gamma_l);
+          heatbathRefreshShiftCoefficients(0, -gamma_l);
          if(use_heatbath_forecasting){ // Forecast CG guess using solutions from previous poles
            Lop.Mdag(CG_src, Forecast_src);
            CG_soln = Forecast(Lop, Forecast_src, prev_solns);
-            SolverHB(Lop, CG_src, CG_soln);
+            SolverHBL(Lop, CG_src, CG_soln);
            prev_solns.push_back(CG_soln);
          } else {
            CG_soln = Zero(); // Just use zero as the initial guess
-            SolverHB(Lop, CG_src, CG_soln);
+	    SolverHBL(Lop, CG_src, CG_soln);
          }
          Lop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
          tmp[1] = tmp[1] + ( PowerNegHalf.residues[k]*gamma_l*gamma_l*Lop.k ) * tmp[0];
@@ -187,15 +222,15 @@ NAMESPACE_BEGIN(Grid);
        if(use_heatbath_forecasting){ prev_solns.clear(); } // empirically, LH solns don't help for RH solves
        for(int k=0; k<param.degree; ++k){
          gamma_l = 1.0 / ( 1.0 + PowerNegHalf.poles[k] );
-          Rop.RefreshShiftCoefficients(-gamma_l*PowerNegHalf.poles[k]);
+	  heatbathRefreshShiftCoefficients(1, -gamma_l*PowerNegHalf.poles[k]);
          if(use_heatbath_forecasting){
            Rop.Mdag(CG_src, Forecast_src);
            CG_soln = Forecast(Rop, Forecast_src, prev_solns);
-            SolverHB(Rop, CG_src, CG_soln);
+            SolverHBR(Rop, CG_src, CG_soln);
            prev_solns.push_back(CG_soln);
          } else {
            CG_soln = Zero();
-            SolverHB(Rop, CG_src, CG_soln);
+            SolverHBR(Rop, CG_src, CG_soln);
          }
          Rop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
          tmp[1] = tmp[1] - ( PowerNegHalf.residues[k]*gamma_l*gamma_l*Rop.k ) * tmp[0];
@@ -205,49 +240,117 @@ NAMESPACE_BEGIN(Grid);
        Phi = Phi + tmp[1];

        // Reset shift coefficients for energy and force evals
-        Lop.RefreshShiftCoefficients(0.0);
-        Rop.RefreshShiftCoefficients(-1.0);
+	heatbathRefreshShiftCoefficients(0, 0.0);
+	heatbathRefreshShiftCoefficients(1, -1.0);
+
+	//Mark that the next call to S is the first after refresh
+	initial_action = true;
+

 	// Bounds check
 	RealD EtaDagEta = norm2(eta);
+	norm2_eta = EtaDagEta;
+
 	//	RealD PhiDagMPhi= norm2(eta);

      };

-      void Meofa(const GaugeField& U,const FermionField &phi, FermionField & Mphi) 
+      void Meofa(const GaugeField& U,const FermionField &in, FermionField & out) 
      {
-#if 0
        Lop.ImportGauge(U);
        Rop.ImportGauge(U);

-        FermionField spProj_Phi(Lop.FermionGrid());
-	FermionField mPhi(Lop.FermionGrid());
+        FermionField spProj_in(Lop.FermionGrid());
        std::vector<FermionField> tmp(2, Lop.FermionGrid());
-	mPhi = phi;
+	out = in;
 	
        // LH term: S = S - k <\Phi| P_{-} \Omega_{-}^{\dagger} H(mf)^{-1} \Omega_{-} P_{-} |\Phi>
-        spProj(Phi, spProj_Phi, -1, Lop.Ls);
-        Lop.Omega(spProj_Phi, tmp[0], -1, 0);
+        spProj(in, spProj_in, -1, Lop.Ls);
+        Lop.Omega(spProj_in, tmp[0], -1, 0);
        G5R5(tmp[1], tmp[0]);
        tmp[0] = Zero();
        SolverL(Lop, tmp[1], tmp[0]);
        Lop.Dtilde(tmp[0], tmp[1]); // We actually solved Cayley preconditioned system: transform back
        Lop.Omega(tmp[1], tmp[0], -1, 1);
-	mPhi = mPhi -  Lop.k * innerProduct(spProj_Phi, tmp[0]).real();
+	spProj(tmp[0], tmp[1], -1, Lop.Ls);
+
+	out = out -  Lop.k * tmp[1];

        // RH term: S = S + k <\Phi| P_{+} \Omega_{+}^{\dagger} ( H(mb)
-        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{-} P_{-} |\Phi>
-        spProj(Phi, spProj_Phi, 1, Rop.Ls);
-        Rop.Omega(spProj_Phi, tmp[0], 1, 0);
+        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} |\Phi>
+        spProj(in, spProj_in, 1, Rop.Ls);
+        Rop.Omega(spProj_in, tmp[0], 1, 0);
        G5R5(tmp[1], tmp[0]);
        tmp[0] = Zero();
        SolverR(Rop, tmp[1], tmp[0]);
        Rop.Dtilde(tmp[0], tmp[1]);
        Rop.Omega(tmp[1], tmp[0], 1, 1);
-        action += Rop.k * innerProduct(spProj_Phi, tmp[0]).real();
-#endif
+	spProj(tmp[0], tmp[1], 1, Rop.Ls);
+
+        out = out + Rop.k * tmp[1];
      }

+      //Computes out = Meofa^{-1} in. Due to the structure of EOFA, it is no more expensive to compute the inverse of Meofa
+      //To ensure correctness we can simply reuse the heatbath code (and the heatbath solvers) but use the rational approx
+      //f(x) = 1/x   which corresponds to alpha_0=0,  alpha_1=1,  beta_1=0 => gamma_1=1
+      void MeofaInv(const GaugeField &U, const FermionField &in, FermionField &out) {
+        Lop.ImportGauge(U);
+        Rop.ImportGauge(U);
+
+        FermionField CG_src      (Lop.FermionGrid());
+        FermionField CG_soln     (Lop.FermionGrid());
+        std::vector<FermionField> tmp(2, Lop.FermionGrid());
+
+        // Leading term of the heatbath formula: ( \alpha_{0} + \sum_{l} \alpha_{l} * \gamma_{l} ) * in
+	// = 1 * in   for the single-pole choice alpha_0=0, alpha_1=1, gamma_1=1
+        out = in;
+
+        // LH term (single pole, gamma_1 = 1):
+        // out = out + k P_{-} \Omega_{-}^{\dagger} ( H(mf)
+        //          - \gamma_{1} \Delta_{-}(mf,mb) P_{-} )^{-1} \Omega_{-} P_{-} in
+        spProj(in, tmp[0], -1, Lop.Ls);
+        Lop.Omega(tmp[0], tmp[1], -1, 0);
+        G5R5(CG_src, tmp[1]);
+        {
+          heatbathRefreshShiftCoefficients(0, -1.); //-gamma_1 = -1.
+
+	  CG_soln = Zero(); // Just use zero as the initial guess
+	  SolverHBL(Lop, CG_src, CG_soln); // the heatbath L solver is reused for this solve
+
+          Lop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
+          tmp[1] = Lop.k * tmp[0];
+        }
+        Lop.Omega(tmp[1], tmp[0], -1, 1);
+        spProj(tmp[0], tmp[1], -1, Lop.Ls);
+        out = out + tmp[1];
+
+        // RH term (single pole, beta_1 = 0 so the shift vanishes):
+        // out = out - k P_{+} \Omega_{+}^{\dagger} ( H(mb)
+        //          - \beta_1\gamma_{1} \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} in
+        spProj(in, tmp[0], 1, Rop.Ls); // 'in' is re-read after 'out' was updated above, so in and out must be distinct fields
+        Rop.Omega(tmp[0], tmp[1], 1, 0);
+        G5R5(CG_src, tmp[1]);
+        {
+	  heatbathRefreshShiftCoefficients(1, 0.); //-gamma_1 * beta_1 = 0
+
+	  CG_soln = Zero();
+	  SolverHBR(Rop, CG_src, CG_soln); // the heatbath R solver is reused for this solve
+
+          Rop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
+          tmp[1] = - Rop.k * tmp[0];
+        }
+        Rop.Omega(tmp[1], tmp[0], 1, 1);
+        spProj(tmp[0], tmp[1], 1, Rop.Ls);
+        out = out + tmp[1];
+
+        // Reset shift coefficients for energy and force evals (L -> 0.0, R -> -1.0, the same values the heatbath restores)
+	heatbathRefreshShiftCoefficients(0, 0.0);
+	heatbathRefreshShiftCoefficients(1, -1.0);
+      };
+
+
+
+
      // EOFA action: see Eqn. (10) of arXiv:1706.05843
      virtual RealD S(const GaugeField& U)
      {
@@ -271,7 +374,7 @@ NAMESPACE_BEGIN(Grid);
        action -= Lop.k * innerProduct(spProj_Phi, tmp[0]).real();

        // RH term: S = S + k <\Phi| P_{+} \Omega_{+}^{\dagger} ( H(mb)
-        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{-} P_{-} |\Phi>
+        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} |\Phi>
        spProj(Phi, spProj_Phi, 1, Rop.Ls);
        Rop.Omega(spProj_Phi, tmp[0], 1, 0);
        G5R5(tmp[1], tmp[0]);
@@ -281,6 +384,26 @@ NAMESPACE_BEGIN(Grid);
        Rop.Omega(tmp[1], tmp[0], 1, 1);
        action += Rop.k * innerProduct(spProj_Phi, tmp[0]).real();

+	if(initial_action){
+	  //For the first call to S after refresh,  S = |eta|^2. We can use this to ensure the rational approx is good
+	  RealD diff = action - norm2_eta;
+
+	  //S_init = eta^dag M^{-1/2} M M^{-1/2} eta
+	  //S_init - eta^dag eta =  eta^dag ( M^{-1/2} M M^{-1/2} - 1 ) eta
+
+	  //If approximate solution
+	  //S_init - eta^dag eta =  eta^dag ( [M^{-1/2}+\delta M^{-1/2}] M [M^{-1/2}+\delta M^{-1/2}] - 1 ) eta
+	  //               \approx  eta^dag ( \delta M^{-1/2} M^{1/2} + M^{1/2}\delta M^{-1/2} ) eta
+	  // We divide out |eta|^2 to remove source scaling but the tolerance on this check should still be somewhat higher than the actual approx tolerance
+	  RealD test = fabs(diff)/norm2_eta; //test the quality of the rational approx
+
+	  std::cout << GridLogMessage << action_name() << " initial action " << action << " expect " << norm2_eta << "; diff " << diff << std::endl;
+	  std::cout << GridLogMessage << action_name() << "[ eta^dag ( M^{-1/2} M M^{-1/2} - 1 ) eta ]/|eta^2| = " << test << "  expect 0 (tol " << param.BoundsCheckTol << ")" << std::endl;
+
+	  assert( ( test < param.BoundsCheckTol ) && " Initial action check failed" );
+	  initial_action = false;
+	}
+
        return action;
      };

@@ -329,6 +452,40 @@ NAMESPACE_BEGIN(Grid);
      };
  };

+  //EOFA action variant that keeps a second pair of L/R operators (ImplF, presumably the single-precision
+  //copies used inside a mixed-precision heatbath solver) in step with the ImplD operators of the base class.
+  //Every heatbath shift-coefficient refresh is applied to both pairs.
+  template<class ImplD, class ImplF>
+  class ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction : public ExactOneFlavourRatioPseudoFermionAction<ImplD>{
+  public:
+    INHERIT_IMPL_TYPES(ImplD);
+    typedef OneFlavourRationalParams Params;
+
+  private:
+    AbstractEOFAFermion<ImplF>& LopF; // the basic LH operator (ImplF copy)
+    AbstractEOFAFermion<ImplF>& RopF; // the basic RH operator (ImplF copy)
+
+  public:
+    
+    virtual std::string action_name() override { return "ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction"; }
+    
+    //Used in the heatbath, refresh the shift coefficients of the L (LorR=0) or R (LorR=1) operator
+    //The ImplF operator is updated first, then the base class updates the matching ImplD operator.
+    //'override' makes the compiler verify that this really replaces the base-class hook.
+    virtual void heatbathRefreshShiftCoefficients(int LorR, RealD to) override {
+      AbstractEOFAFermion<ImplF> &op = LorR == 0 ? LopF : RopF;
+      op.RefreshShiftCoefficients(to);
+      this->ExactOneFlavourRatioPseudoFermionAction<ImplD>::heatbathRefreshShiftCoefficients(LorR,to);
+    }
+    
+    //_LopF/_RopF : ImplF L and R operators whose shift coefficients are kept in step
+    //_LopD/_RopD : ImplD L and R operators, forwarded to the base class
+    //Remaining arguments are forwarded unchanged to the base-class constructor.
+    ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction(AbstractEOFAFermion<ImplF>& _LopF, 
+							     AbstractEOFAFermion<ImplF>& _RopF,
+							     AbstractEOFAFermion<ImplD>& _LopD, 
+							     AbstractEOFAFermion<ImplD>& _RopD,
+							     OperatorFunction<FermionField>& HeatbathCGL, OperatorFunction<FermionField>& HeatbathCGR,
+							     OperatorFunction<FermionField>& ActionCGL, OperatorFunction<FermionField>& ActionCGR, 
+							     OperatorFunction<FermionField>& DerivCGL , OperatorFunction<FermionField>& DerivCGR, 
+							     Params& p, 
+							     bool use_fc=false) : 
+      // The base class is always initialised before the members, so list it first
+      ExactOneFlavourRatioPseudoFermionAction<ImplD>(_LopD, _RopD, HeatbathCGL, HeatbathCGR, ActionCGL, ActionCGR, DerivCGL, DerivCGR, p, use_fc),
+      LopF(_LopF), RopF(_RopF)
+    {}
+  };
+
+
 NAMESPACE_END(Grid);

 #endif
@@ -0,0 +1,372 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
+
+    Copyright (C) 2015
+
+    Author: Christopher Kelly <ckelly@bnl.gov>
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_H
+#define QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_H
+
+NAMESPACE_BEGIN(Grid);
+
+    /////////////////////////////////////////////////////////
+    // Generic rational approximation for ratios of operators
+    /////////////////////////////////////////////////////////
+
+    /* S_f = -log( det(  [M^dag M]/[V^dag V] )^{1/inv_pow}  )
+           = chi^dag ( [M^dag M]/[V^dag V] )^{-1/inv_pow} chi\
+	   = chi^dag ( [V^dag V]^{-1/2} [M^dag M] [V^dag V]^{-1/2} )^{-1/inv_pow} chi\
+	   = chi^dag [V^dag V]^{1/(2*inv_pow)} [M^dag M]^{-1/inv_pow} [V^dag V]^{1/(2*inv_pow)} chi\
+
+	   S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+    
+       BIG WARNING:	   
+       Here V^dag V is referred to in this code as the "numerator" operator and M^dag M is the *denominator* operator.
+       this refers to their position in the pseudofermion action, which is the *inverse* of what appears in the determinant
+       Thus for DWF the numerator operator is the Pauli-Villars operator
+
+       Here P/Q \sim R_{1/(2*inv_pow)}  ~ (V^dagV)^{1/(2*inv_pow)}  
+       Here N/D \sim R_{-1/inv_pow} ~ (M^dagM)^{-1/inv_pow}  
+    */
+      
+    template<class Impl>
+    class GeneralEvenOddRatioRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
+    public:
+
+      INHERIT_IMPL_TYPES(Impl);
+
+      typedef RationalActionParams Params;
+      Params param;
+
+      //For action evaluation
+      MultiShiftFunction ApproxPowerAction   ;  //rational approx for X^{1/inv_pow}
+      MultiShiftFunction ApproxNegPowerAction;  //rational approx for X^{-1/inv_pow}
+      MultiShiftFunction ApproxHalfPowerAction;   //rational approx for X^{1/(2*inv_pow)}
+      MultiShiftFunction ApproxNegHalfPowerAction; //rational approx for X^{-1/(2*inv_pow)}
+
+      //For the MD integration
+      MultiShiftFunction ApproxPowerMD   ;  //rational approx for X^{1/inv_pow}
+      MultiShiftFunction ApproxNegPowerMD;  //rational approx for X^{-1/inv_pow}
+      MultiShiftFunction ApproxHalfPowerMD;   //rational approx for X^{1/(2*inv_pow)}
+      MultiShiftFunction ApproxNegHalfPowerMD; //rational approx for X^{-1/(2*inv_pow)}
+
+    private:
+     
+      FermionOperator<Impl> & NumOp;// the basic operator
+      FermionOperator<Impl> & DenOp;// the basic operator
+      FermionField PhiEven; // the pseudo fermion field for this trajectory
+      FermionField PhiOdd; // the pseudo fermion field for this trajectory
+
+      //Build degree-approx_degree rational approximations to x^{1/inv_pow} (stored in approx) and x^{-1/inv_pow} (stored in approx_inv)
+      //CG_tolerance is only compared against the Remez error to decide whether to print a warning; it is otherwise
+      //just stored in the MultiShiftFunction objects for later use by the multi-shift solver
+      static void generateApprox(MultiShiftFunction &approx, MultiShiftFunction &approx_inv, int inv_pow, int approx_degree, double CG_tolerance, AlgRemez &remez){
+	std::cout<<GridLogMessage << "Generating degree "<< approx_degree<<" approximation for x^(1/" << inv_pow << ")"<<std::endl;
+
+	const double remez_error = remez.generateApprox(approx_degree,1,inv_pow);
+	const bool worse_than_cg = remez_error > CG_tolerance;
+	if(worse_than_cg){
+	  std::cout<<GridLogMessage << "WARNING: Remez approximation has a larger error " << remez_error << " than the CG tolerance " << CG_tolerance << "! Try increasing the number of poles" << std::endl;
+	}
+
+	//Same Remez result initialises both functions; the final flag selects the inverse power
+	approx_inv.Init(remez, CG_tolerance,true);
+	approx.Init(remez, CG_tolerance,false);
+      }
+
+
+    protected:
+      static constexpr bool Numerator = true;
+      static constexpr bool Denominator = false;
+
+      //Overridable multishift CG: applies the rational function 'approx' of the Schur operator built on
+      //NumOp (numerator==true) or DenOp (numerator==false) to 'in', writing the summed result to 'out'
+      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionField &in, FermionField &out){
+	FermionOperator<Impl> &selectedOp = numerator ? NumOp : DenOp;
+	SchurDifferentiableOperator<Impl> linop(selectedOp);
+	ConjugateGradientMultiShift<FermionField> solver(MaxIter, approx);
+	solver(linop,in, out);
+      }
+      //As the overload without out_elems, but additionally passes out_elems to the solver (per-shift outputs; deriv() uses one entry per pole)
+      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionField &in, std::vector<FermionField> &out_elems, FermionField &out){
+	FermionOperator<Impl> &selectedOp = numerator ? NumOp : DenOp;
+	SchurDifferentiableOperator<Impl> linop(selectedOp);
+	ConjugateGradientMultiShift<FermionField> solver(MaxIter, approx);
+	solver(linop,in, out_elems, out);
+      }
+      //Import the gauge field U into both operators. Virtual so that derived classes can override the gauge import
+      virtual void ImportGauge(const GaugeField &U){
+	NumOp.ImportGauge(U); // numerator operator
+	DenOp.ImportGauge(U); // denominator operator
+      }
+      
+    public:
+
+      //Construct the action from the numerator and denominator operators and the rational-approximation parameters.
+      //Generates the rational approximations used for the action evaluation and for the MD force. If the MD degree
+      //equals the action degree the action approximations are copied and only their tolerances are replaced.
+      GeneralEvenOddRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
+						     FermionOperator<Impl>  &_DenOp, 
+						     const Params & p
+						     ) : 
+	// Listed in declaration order, which is the order in which members are actually initialised
+	param(p),
+	NumOp(_NumOp), 
+	DenOp(_DenOp), 
+	PhiEven(_NumOp.FermionRedBlackGrid()),
+	PhiOdd (_NumOp.FermionRedBlackGrid())
+      {
+	std::cout<<GridLogMessage << action_name() << " initialize: starting" << std::endl;
+	AlgRemez remez(param.lo,param.hi,param.precision);
+
+	//Generate approximations for action eval
+	generateApprox(ApproxPowerAction, ApproxNegPowerAction, param.inv_pow, param.action_degree, param.action_tolerance, remez);
+	generateApprox(ApproxHalfPowerAction, ApproxNegHalfPowerAction, 2*param.inv_pow, param.action_degree, param.action_tolerance, remez);
+
+	//Generate approximations for MD
+	if(param.md_degree != param.action_degree){ //note the CG tolerance is unrelated to the stopping condition of the Remez algorithm
+	  generateApprox(ApproxPowerMD, ApproxNegPowerMD, param.inv_pow, param.md_degree, param.md_tolerance, remez);
+	  generateApprox(ApproxHalfPowerMD, ApproxNegHalfPowerMD, 2*param.inv_pow, param.md_degree, param.md_tolerance, remez);
+	}else{
+	  std::cout<<GridLogMessage << "Using same rational approximations for MD as for action evaluation" << std::endl;
+
+	  //Overwrite every per-pole tolerance (used for multishift) of f with tol.
+	  //Each function is walked over its own tolerance vector so no index can run past its end.
+	  auto setTolerances = [](MultiShiftFunction &f, double tol){
+	    for(auto &t : f.tolerances) t = tol;
+	  };
+
+	  ApproxPowerMD = ApproxPowerAction; 
+	  ApproxNegPowerMD = ApproxNegPowerAction;
+	  setTolerances(ApproxPowerMD,    param.md_tolerance);
+	  setTolerances(ApproxNegPowerMD, param.md_tolerance);
+
+	  ApproxHalfPowerMD = ApproxHalfPowerAction;
+	  ApproxNegHalfPowerMD = ApproxNegHalfPowerAction;
+	  setTolerances(ApproxHalfPowerMD,    param.md_tolerance);
+	  setTolerances(ApproxNegHalfPowerMD, param.md_tolerance);
+	}
+
+	std::cout<<GridLogMessage << action_name() << " initialize: complete" << std::endl;
+      };
+
+      virtual std::string action_name(){return "GeneralEvenOddRatioRationalPseudoFermionAction";} //name used to prefix this action's log output
+
+      //Return a multi-line, human-readable listing of the rational approximation parameters, one per line
+      virtual std::string LogParameters(){
+	const std::string name = action_name();
+	std::stringstream report;
+	report << GridLogMessage << "["<<name<<"] Power              : 1/" << param.inv_pow <<  std::endl;
+	report << GridLogMessage << "["<<name<<"] Low                :" << param.lo <<  std::endl;
+	report << GridLogMessage << "["<<name<<"] High               :" << param.hi <<  std::endl;
+	report << GridLogMessage << "["<<name<<"] Max iterations     :" << param.MaxIter <<  std::endl;
+	report << GridLogMessage << "["<<name<<"] Tolerance (Action) :" << param.action_tolerance <<  std::endl;
+	report << GridLogMessage << "["<<name<<"] Degree (Action)    :" << param.action_degree <<  std::endl;
+	report << GridLogMessage << "["<<name<<"] Tolerance (MD)     :" << param.md_tolerance <<  std::endl;
+	report << GridLogMessage << "["<<name<<"] Degree (MD)        :" << param.md_degree <<  std::endl;
+	report << GridLogMessage << "["<<name<<"] Precision          :" << param.precision <<  std::endl;
+	return report.str();
+      }
+
+      //Read-only access to the odd-checkerboard pseudofermion field PhiOdd
+      const FermionField &getPhiOdd() const{ return PhiOdd; }
+      
+      //Heatbath entry point: draw the Gaussian noise field and pass it to refresh(U,eta).
+      //sRNG is part of the interface but is not used here.
+      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
+	std::cout<<GridLogMessage << action_name() << " refresh: starting" << std::endl;
+
+	// Target distribution: P(eta) \propto e^{- eta^dag eta}
+	//
+	// gaussian() draws x from P(x) \propto e^{- x^2 / 2 }   [i.e. sigma=1],
+	// so the required field is eta = x/sqrt{2} = x * sqrt(1/2)
+	const RealD rescale = std::sqrt(0.5);
+
+	FermionField noise(NumOp.FermionGrid());
+	gaussian(pRNG,noise);
+	noise = noise*rescale;
+
+	refresh(U,noise);
+      }
+
+      //Allow for manual specification of random field for testing
+      //Builds the odd-checkerboard pseudofermion field PhiOdd from the supplied noise field eta and sets PhiEven to zero.
+      //Both operators are asserted to have ConstEE() == 1.
+      void refresh(const GaugeField &U, const FermionField &eta) {
+
+	// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+	//
+	// P(phi) = e^{- phi^dag (VdagV)^1/(2*inv_pow) (MdagM)^-1/inv_pow (VdagV)^1/(2*inv_pow) phi}
+	//        = e^{- phi^dag  (VdagV)^1/(2*inv_pow) (MdagM)^-1/(2*inv_pow) (MdagM)^-1/(2*inv_pow)  (VdagV)^1/(2*inv_pow) phi}
+	//
+	// Phi =  (VdagV)^-1/(2*inv_pow) (MdagM)^{1/(2*inv_pow)} eta 
+	
+	std::cout<<GridLogMessage << action_name() << " refresh: starting" << std::endl;
+
+	FermionField etaOdd (NumOp.FermionRedBlackGrid());
+	FermionField etaEven(NumOp.FermionRedBlackGrid());
+	FermionField     tmp(NumOp.FermionRedBlackGrid());
+
+	// NOTE(review): etaEven is filled here but not read again in this function
+	pickCheckerboard(Even,etaEven,eta);
+	pickCheckerboard(Odd,etaOdd,eta);
+
+	ImportGauge(U);
+
+	// MdagM^1/(2*inv_pow) eta
+	std::cout<<GridLogMessage << action_name() << " refresh: doing (M^dag M)^{1/" << 2*param.inv_pow << "} eta" << std::endl;
+	multiShiftInverse(Denominator, ApproxHalfPowerAction, param.MaxIter, etaOdd, tmp);
+
+	// VdagV^-1/(2*inv_pow) MdagM^1/(2*inv_pow) eta
+	std::cout<<GridLogMessage << action_name() << " refresh: doing (V^dag V)^{-1/" << 2*param.inv_pow << "} ( (M^dag M)^{1/" << 2*param.inv_pow << "} eta)" << std::endl;
+	multiShiftInverse(Numerator, ApproxNegHalfPowerAction, param.MaxIter, tmp, PhiOdd);
+		
+	assert(NumOp.ConstEE() == 1);
+	assert(DenOp.ConstEE() == 1);
+	PhiEven = Zero();
+	// Final log line reports completion (it previously repeated "starting")
+	std::cout<<GridLogMessage << action_name() << " refresh: complete" << std::endl;
+      };
+
+      //////////////////////////////////////////////////////
+      // Evaluate the action S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi  for gauge field U, returned as |Y|^2 (see below)
+      //////////////////////////////////////////////////////
+      virtual RealD S(const GaugeField &U) {
+	std::cout<<GridLogMessage << action_name() << " compute action: starting" << std::endl;
+	ImportGauge(U);
+
+	FermionField X(NumOp.FermionRedBlackGrid());
+	FermionField Y(NumOp.FermionRedBlackGrid());
+
+	// X = VdagV^1/(2*inv_pow) Phi
+	std::cout<<GridLogMessage << action_name() << " compute action: doing (V^dag V)^{1/" << 2*param.inv_pow << "} Phi" << std::endl;
+	multiShiftInverse(Numerator, ApproxHalfPowerAction, param.MaxIter, PhiOdd,X);
+
+	// Y = MdagM^-1/(2*inv_pow) VdagV^1/(2*inv_pow) Phi
+	std::cout<<GridLogMessage << action_name() << " compute action: doing (M^dag M)^{-1/" << 2*param.inv_pow << "} ( (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
+	multiShiftInverse(Denominator, ApproxNegHalfPowerAction, param.MaxIter, X,Y);
+
+	// Randomly apply rational bounds checks.
+	int rcheck = rand(); // NOTE(review): rcheck is never read; only the second rand() value r below is used
+	auto grid = NumOp.FermionGrid();
+        auto r=rand();
+        grid->Broadcast(0,r); // presumably makes every rank use rank 0's value so all take the same branch - TODO confirm
+
+	if ( param.BoundsCheckFreq != 0 && (r % param.BoundsCheckFreq)==0 ) { // BoundsCheckFreq == 0 disables the check
+	  std::cout<<GridLogMessage << action_name() << " compute action: doing bounds check" << std::endl;
+	  FermionField gauss(NumOp.FermionRedBlackGrid());
+	  gauss = PhiOdd; // the check is driven by the current pseudofermion field, not by fresh noise
+	  SchurDifferentiableOperator<Impl> MdagM(DenOp);
+	  std::cout<<GridLogMessage << action_name() << " compute action: checking high bounds" << std::endl;
+	  HighBoundCheck(MdagM,gauss,param.hi);
+	  std::cout<<GridLogMessage << action_name() << " compute action: full approximation" << std::endl;
+	  InversePowerBoundsCheck(param.inv_pow,param.MaxIter,param.action_tolerance*100,MdagM,gauss,ApproxNegPowerAction); // tolerance passed is 100x the action tolerance
+	  std::cout<<GridLogMessage << action_name() << " compute action: bounds check complete" << std::endl;
+	}
+
+	//  Phidag VdagV^1/(2*inv_pow) MdagM^-1/(2*inv_pow)  MdagM^-1/(2*inv_pow) VdagV^1/(2*inv_pow) Phi  = |Y|^2
+	RealD action = norm2(Y);
+	std::cout<<GridLogMessage << action_name() << " compute action: complete" << std::endl;
+
+	return action;
+      };
+
+      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+      //
+      // Here, M is some 5D operator and V is the Pauli-Villars operator
+      // N and D make up the rational polynomial of the M term and P and Q make up the rational polynomial of the V term
+      //
+      // Need  
+      // dS_f/dU =  chi^dag d[P/Q]  N/D   P/Q  chi 
+      //         +  chi^dag   P/Q d[N/D]  P/Q  chi 
+      //         +  chi^dag   P/Q   N/D d[P/Q] chi 
+      //
+      // P/Q is expressed as partial fraction expansion: 
+      // 
+      //           a0 + \sum_k ak/(V^dagV + bk) 
+      //  
+      // d[P/Q] is then  
+      //
+      //          \sum_k -ak [V^dagV+bk]^{-1}  [ dV^dag V + V^dag dV ] [V^dag V + bk]^{-1} 
+      //  
+      // and similar for N/D. 
+      // 
+      // Need   
+      //       MpvPhi_k   = [Vdag V + bk]^{-1} chi  
+      //       MpvPhi     = {a0 +  \sum_k ak [Vdag V + bk]^{-1} }chi   
+      //   
+      //       MfMpvPhi_k = [MdagM+bk]^{-1} MpvPhi  
+      //       MfMpvPhi   = {a0 +  \sum_k ak [Mdag M + bk]^{-1} } MpvPhi
+      // 
+      //       MpvMfMpvPhi_k = [Vdag V + bk]^{-1} MfMpvchi   
+      //  
+
+      virtual void deriv(const GaugeField &U,GaugeField & dSdU) { // computes the MD force of this action for gauge field U, overwriting dSdU
+	std::cout<<GridLogMessage << action_name() << " deriv: starting" << std::endl;
+	const int n_f  = ApproxNegPowerMD.poles.size(); // number of poles of the x^{-1/inv_pow} MD approximation (applied to M^dag M)
+	const int n_pv = ApproxHalfPowerMD.poles.size(); // number of poles of the x^{1/(2*inv_pow)} MD approximation (applied to V^dag V)
+
+	std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionRedBlackGrid());
+	std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionRedBlackGrid());
+	std::vector<FermionField> MfMpvPhi_k   (n_f ,NumOp.FermionRedBlackGrid());
+
+	FermionField      MpvPhi(NumOp.FermionRedBlackGrid());
+	FermionField    MfMpvPhi(NumOp.FermionRedBlackGrid());
+	FermionField MpvMfMpvPhi(NumOp.FermionRedBlackGrid());
+	FermionField           Y(NumOp.FermionRedBlackGrid());
+
+	GaugeField   tmp(NumOp.GaugeGrid()); // scratch force, overwritten by every *Deriv call below
+
+	ImportGauge(U);
+
+	std::cout<<GridLogMessage << action_name() << " deriv: doing (V^dag V)^{1/" << 2*param.inv_pow << "} Phi" << std::endl;
+	multiShiftInverse(Numerator, ApproxHalfPowerMD, param.MaxIter, PhiOdd,MpvPhi_k,MpvPhi);
+
+	std::cout<<GridLogMessage << action_name() << " deriv: doing (M^dag M)^{-1/" << param.inv_pow << "} ( (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
+	multiShiftInverse(Denominator, ApproxNegPowerMD, param.MaxIter, MpvPhi,MfMpvPhi_k,MfMpvPhi);
+
+	std::cout<<GridLogMessage << action_name() << " deriv: doing (V^dag V)^{1/" << 2*param.inv_pow << "} ( (M^dag M)^{-1/" << param.inv_pow << "} (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
+	multiShiftInverse(Numerator, ApproxHalfPowerMD, param.MaxIter, MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi);
+		
+
+	SchurDifferentiableOperator<Impl> MdagM(DenOp); // Schur operator built on the denominator operator
+	SchurDifferentiableOperator<Impl> VdagV(NumOp); // Schur operator built on the numerator operator
+
+
+	RealD ak; // residue of the current pole
+
+	dSdU = Zero();
+
+	// With these building blocks  
+	//  
+	//       dS/dU = 
+	//                 \sum_k -ak MfMpvPhi_k^dag      [ dM^dag M + M^dag dM ] MfMpvPhi_k         (1)
+	//             +   \sum_k -ak MpvMfMpvPhi_k^\dag  [ dV^dag V + V^dag dV ] MpvPhi_k           (2)
+	//                        -ak MpvPhi_k^dag        [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k      (3)
+
+	//(1)	one pair of derivative terms per pole of the M^dag M approximation
+	std::cout<<GridLogMessage << action_name() << " deriv: doing dS/dU part (1)" << std::endl;
+	for(int k=0;k<n_f;k++){
+	  ak = ApproxNegPowerMD.residues[k];
+	  MdagM.Mpc(MfMpvPhi_k[k],Y);
+	  MdagM.MpcDagDeriv(tmp , MfMpvPhi_k[k], Y );  dSdU=dSdU+ak*tmp;
+	  MdagM.MpcDeriv(tmp , Y, MfMpvPhi_k[k] );  dSdU=dSdU+ak*tmp;
+	}
+	
+	//(2)
+	//(3) both V^dag V terms are accumulated in the same loop over the poles of the V^dag V approximation
+	std::cout<<GridLogMessage << action_name() << " deriv: doing dS/dU part (2)+(3)" << std::endl;
+	for(int k=0;k<n_pv;k++){
+
+          ak = ApproxHalfPowerMD.residues[k];
+	  
+	  VdagV.Mpc(MpvPhi_k[k],Y);
+	  VdagV.MpcDagDeriv(tmp,MpvMfMpvPhi_k[k],Y); dSdU=dSdU+ak*tmp;
+	  VdagV.MpcDeriv   (tmp,Y,MpvMfMpvPhi_k[k]);  dSdU=dSdU+ak*tmp;     
+	  
+	  VdagV.Mpc(MpvMfMpvPhi_k[k],Y);                // V as we take Ydag 
+	  VdagV.MpcDeriv   (tmp,Y, MpvPhi_k[k]); dSdU=dSdU+ak*tmp;
+	  VdagV.MpcDagDeriv(tmp,MpvPhi_k[k], Y); dSdU=dSdU+ak*tmp;
+
+	}
+
+	//dSdU = Ta(dSdU);
+	std::cout<<GridLogMessage << action_name() << " deriv: complete" << std::endl;
+      };
+    };
+
+NAMESPACE_END(Grid);
+
+#endif
@@ -0,0 +1,93 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
+
+    Copyright (C) 2015
+
+    Author: Christopher Kelly <ckelly@bnl.gov>
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
+#define QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
+
+NAMESPACE_BEGIN(Grid);
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Generic rational approximation for ratios of operators utilizing the mixed precision multishift algorithm
+    // cf. GeneralEvenOddRational.h for details
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      
+    template<class ImplD, class ImplF>
+    class GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<ImplD> {
+    private:
+      typedef typename ImplD::FermionField FermionFieldD;
+      typedef typename ImplF::FermionField FermionFieldF;
+      //Numerator and denominator operators for each of the two implementations; held by reference, not owned
+      FermionOperator<ImplD> & NumOpD;
+      FermionOperator<ImplD> & DenOpD;
+
+      FermionOperator<ImplF> & NumOpF;
+      FermionOperator<ImplF> & DenOpF;
+      //Forwarded unchanged as the last constructor argument of ConjugateGradientMultiShiftMixedPrec
+      Integer ReliableUpdateFreq;
+    protected:
+
+      //Allow derived classes to override the multishift CG
+      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, FermionFieldD &out){
+	SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
+	SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
+	//The ImplF operator and its red-black grid go to the solver's constructor; the solve itself is invoked with the ImplD operator
+	ConjugateGradientMultiShiftMixedPrec<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
+	msCG(schurOpD, in, out);
+      }
+      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector<FermionFieldD> &out_elems, FermionFieldD &out){
+	SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
+	SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
+	//Same set-up as the overload above; out_elems is additionally passed to the solver (presumably one result per pole of approx - TODO confirm)
+	ConjugateGradientMultiShiftMixedPrec<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
+	msCG(schurOpD, in, out_elems, out);
+      }
+      //Allow derived classes to override the gauge import
+      virtual void ImportGauge(const typename ImplD::GaugeField &Ud){
+	typename ImplF::GaugeField Uf(NumOpF.GaugeGrid());
+	precisionChange(Uf, Ud);
+	//Each operator imports the gauge field of its own implementation: Ud for the ImplD pair, Uf for the ImplF pair
+	NumOpD.ImportGauge(Ud);
+	DenOpD.ImportGauge(Ud);
+
+	NumOpF.ImportGauge(Uf);
+	DenOpF.ImportGauge(Uf);
+      }
+
+    public:
+      GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction(FermionOperator<ImplD>  &_NumOpD, FermionOperator<ImplD>  &_DenOpD, 
+							      FermionOperator<ImplF>  &_NumOpF, FermionOperator<ImplF>  &_DenOpF, 
+							      const RationalActionParams & p, Integer _ReliableUpdateFreq
+							      ) : GeneralEvenOddRatioRationalPseudoFermionAction<ImplD>(_NumOpD, _DenOpD, p), //members below follow their declaration order
+								  NumOpD(_NumOpD), DenOpD(_DenOpD), NumOpF(_NumOpF), DenOpF(_DenOpF), ReliableUpdateFreq(_ReliableUpdateFreq){}
+
+      virtual std::string action_name(){return "GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction";}
+    };
+
+NAMESPACE_END(Grid);
+
+#endif
@@ -40,249 +40,31 @@ NAMESPACE_BEGIN(Grid);
    // Here N/D \sim R_{-1/2} ~ (M^dagM)^{-1/2}  
  
    template<class Impl>
-    class OneFlavourEvenOddRatioRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
+    class OneFlavourEvenOddRatioRationalPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<Impl> {
    public:
-
-      INHERIT_IMPL_TYPES(Impl);
-
      typedef OneFlavourRationalParams Params;
-      Params param;
-
-      MultiShiftFunction PowerHalf   ;
-      MultiShiftFunction PowerNegHalf;
-      MultiShiftFunction PowerQuarter;
-      MultiShiftFunction PowerNegQuarter;
-
    private:
-     
-      FermionOperator<Impl> & NumOp;// the basic operator
-      FermionOperator<Impl> & DenOp;// the basic operator
-      FermionField PhiEven; // the pseudo fermion field for this trajectory
-      FermionField PhiOdd; // the pseudo fermion field for this trajectory
+      static RationalActionParams transcribe(const Params &in){
+	RationalActionParams general; //the one-flavour parameters 'in', re-expressed for the general rational-ratio base class
+	general.lo = in.lo;   general.hi = in.hi;
+	general.precision = in.precision;
+	general.MaxIter = in.MaxIter;
+	general.BoundsCheckFreq = in.BoundsCheckFreq;
+	//The single degree and tolerance held by 'in' are applied to both the action and the MD settings
+	general.action_degree = general.md_degree = in.degree;
+	general.action_tolerance = general.md_tolerance = in.tolerance;
+	general.inv_pow = 2; //the base class logs this as the power in (M^dag M)^{-1/inv_pow}
+	return general;
+      }

    public:
-
      OneFlavourEvenOddRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
-					    FermionOperator<Impl>  &_DenOp, 
-					    Params & p
-					    ) : 
-      NumOp(_NumOp), 
-      DenOp(_DenOp), 
-      PhiOdd (_NumOp.FermionRedBlackGrid()),
-      PhiEven(_NumOp.FermionRedBlackGrid()),
-      param(p) 
-      {
-	AlgRemez remez(param.lo,param.hi,param.precision);
+							FermionOperator<Impl>  &_DenOp, 
+							const Params & p
+							) : 
+	GeneralEvenOddRatioRationalPseudoFermionAction<Impl>(_NumOp, _DenOp, transcribe(p)){}

-	// MdagM^(+- 1/2)
-	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
-	remez.generateApprox(param.degree,1,2);
-	PowerHalf.Init(remez,param.tolerance,false);
-	PowerNegHalf.Init(remez,param.tolerance,true);
-
-	// MdagM^(+- 1/4)
-	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
-	remez.generateApprox(param.degree,1,4);
-   	PowerQuarter.Init(remez,param.tolerance,false);
-	PowerNegQuarter.Init(remez,param.tolerance,true);
-      };
-
-      virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}
-
-      virtual std::string LogParameters(){
-	std::stringstream sstream;
-	sstream << GridLogMessage << "["<<action_name()<<"] Low            :" << param.lo <<  std::endl;
-	sstream << GridLogMessage << "["<<action_name()<<"] High           :" << param.hi <<  std::endl;
-	sstream << GridLogMessage << "["<<action_name()<<"] Max iterations :" << param.MaxIter <<  std::endl;
-	sstream << GridLogMessage << "["<<action_name()<<"] Tolerance      :" << param.tolerance <<  std::endl;
-	sstream << GridLogMessage << "["<<action_name()<<"] Degree         :" << param.degree <<  std::endl;
-	sstream << GridLogMessage << "["<<action_name()<<"] Precision      :" << param.precision <<  std::endl;
-	return sstream.str();
-      }
-      
-      
-      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
-
-	// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
-	//
-	// P(phi) = e^{- phi^dag (VdagV)^1/4 (MdagM)^-1/2 (VdagV)^1/4 phi}
-	//        = e^{- phi^dag  (VdagV)^1/4 (MdagM)^-1/4 (MdagM)^-1/4  (VdagV)^1/4 phi}
-	//
-	// Phi =  (VdagV)^-1/4 Mdag^{1/4} eta 
-	//
-	// P(eta) = e^{- eta^dag eta}
-	//
-	// e^{x^2/2 sig^2} => sig^2 = 0.5.
-	// 
-	// So eta should be of width sig = 1/sqrt(2).
-
-	RealD scale = std::sqrt(0.5);
-
-	FermionField eta(NumOp.FermionGrid());
-	FermionField etaOdd (NumOp.FermionRedBlackGrid());
-	FermionField etaEven(NumOp.FermionRedBlackGrid());
-	FermionField     tmp(NumOp.FermionRedBlackGrid());
-
-	gaussian(pRNG,eta);	eta=eta*scale;
-
-	pickCheckerboard(Even,etaEven,eta);
-	pickCheckerboard(Odd,etaOdd,eta);
-
-	NumOp.ImportGauge(U);
-	DenOp.ImportGauge(U);
-
-
-	// MdagM^1/4 eta
-	SchurDifferentiableOperator<Impl> MdagM(DenOp);
-	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerQuarter);
-	msCG_M(MdagM,etaOdd,tmp);
-
-	// VdagV^-1/4 MdagM^1/4 eta
-	SchurDifferentiableOperator<Impl> VdagV(NumOp);
-	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerNegQuarter);
-	msCG_V(VdagV,tmp,PhiOdd);
-
-	assert(NumOp.ConstEE() == 1);
-	assert(DenOp.ConstEE() == 1);
-	PhiEven = Zero();
-	
-      };
-
-      //////////////////////////////////////////////////////
-      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
-      //////////////////////////////////////////////////////
-      virtual RealD S(const GaugeField &U) {
-
-	NumOp.ImportGauge(U);
-	DenOp.ImportGauge(U);
-
-	FermionField X(NumOp.FermionRedBlackGrid());
-	FermionField Y(NumOp.FermionRedBlackGrid());
-
-	// VdagV^1/4 Phi
-	SchurDifferentiableOperator<Impl> VdagV(NumOp);
-	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
-	msCG_V(VdagV,PhiOdd,X);
-
-	// MdagM^-1/4 VdagV^1/4 Phi
-	SchurDifferentiableOperator<Impl> MdagM(DenOp);
-	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegQuarter);
-	msCG_M(MdagM,X,Y);
-
-	// Randomly apply rational bounds checks.
-	auto grid = NumOp.FermionGrid();
-        auto r=rand();
-        grid->Broadcast(0,r);
-        if ( (r%param.BoundsCheckFreq)==0 ) { 
-	  FermionField gauss(NumOp.FermionRedBlackGrid());
-	  gauss = PhiOdd;
-	  HighBoundCheck(MdagM,gauss,param.hi);
-	  InverseSqrtBoundsCheck(param.MaxIter,param.tolerance*100,MdagM,gauss,PowerNegHalf);
-	}
-
-	//  Phidag VdagV^1/4 MdagM^-1/4  MdagM^-1/4 VdagV^1/4 Phi
-	RealD action = norm2(Y);
-
-	return action;
-      };
-
-      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
-      //
-      // Here, M is some 5D operator and V is the Pauli-Villars field
-      // N and D makeup the rat. poly of the M term and P and & makeup the rat.poly of the denom term
-      //
-      // Need  
-      // dS_f/dU =  chi^dag d[P/Q]  N/D   P/Q  chi 
-      //         +  chi^dag   P/Q d[N/D]  P/Q  chi 
-      //         +  chi^dag   P/Q   N/D d[P/Q] chi 
-      //
-      // P/Q is expressed as partial fraction expansion: 
-      // 
-      //           a0 + \sum_k ak/(V^dagV + bk) 
-      //  
-      // d[P/Q] is then  
-      //
-      //          \sum_k -ak [V^dagV+bk]^{-1}  [ dV^dag V + V^dag dV ] [V^dag V + bk]^{-1} 
-      //  
-      // and similar for N/D. 
-      // 
-      // Need   
-      //       MpvPhi_k   = [Vdag V + bk]^{-1} chi  
-      //       MpvPhi     = {a0 +  \sum_k ak [Vdag V + bk]^{-1} }chi   
-      //   
-      //       MfMpvPhi_k = [MdagM+bk]^{-1} MpvPhi  
-      //       MfMpvPhi   = {a0 +  \sum_k ak [Mdag M + bk]^{-1} } MpvPhi
-      // 
-      //       MpvMfMpvPhi_k = [Vdag V + bk]^{-1} MfMpvchi   
-      //  
-
-      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
-
-	const int n_f  = PowerNegHalf.poles.size();
-	const int n_pv = PowerQuarter.poles.size();
-
-	std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionRedBlackGrid());
-	std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionRedBlackGrid());
-	std::vector<FermionField> MfMpvPhi_k   (n_f ,NumOp.FermionRedBlackGrid());
-
-	FermionField      MpvPhi(NumOp.FermionRedBlackGrid());
-	FermionField    MfMpvPhi(NumOp.FermionRedBlackGrid());
-	FermionField MpvMfMpvPhi(NumOp.FermionRedBlackGrid());
-	FermionField           Y(NumOp.FermionRedBlackGrid());
-
-	GaugeField   tmp(NumOp.GaugeGrid());
-
-	NumOp.ImportGauge(U);
-	DenOp.ImportGauge(U);
-
-	SchurDifferentiableOperator<Impl> VdagV(NumOp);
-	SchurDifferentiableOperator<Impl> MdagM(DenOp);
-
-	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
-	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegHalf);
-
-	msCG_V(VdagV,PhiOdd,MpvPhi_k,MpvPhi);
-	msCG_M(MdagM,MpvPhi,MfMpvPhi_k,MfMpvPhi);
-	msCG_V(VdagV,MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi);
-
-	RealD ak;
-
-	dSdU = Zero();
-
-	// With these building blocks  
-	//  
-	//       dS/dU = 
-	//                 \sum_k -ak MfMpvPhi_k^dag      [ dM^dag M + M^dag dM ] MfMpvPhi_k         (1)
-	//             +   \sum_k -ak MpvMfMpvPhi_k^\dag  [ dV^dag V + V^dag dV ] MpvPhi_k           (2)
-	//                        -ak MpvPhi_k^dag        [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k      (3)
-
-	//(1)
-	for(int k=0;k<n_f;k++){
-	  ak = PowerNegHalf.residues[k];
-	  MdagM.Mpc(MfMpvPhi_k[k],Y);
-	  MdagM.MpcDagDeriv(tmp , MfMpvPhi_k[k], Y );  dSdU=dSdU+ak*tmp;
-	  MdagM.MpcDeriv(tmp , Y, MfMpvPhi_k[k] );  dSdU=dSdU+ak*tmp;
-	}
-	
-	//(2)
-	//(3)
-	for(int k=0;k<n_pv;k++){
-
-          ak = PowerQuarter.residues[k];
-	  
-	  VdagV.Mpc(MpvPhi_k[k],Y);
-	  VdagV.MpcDagDeriv(tmp,MpvMfMpvPhi_k[k],Y); dSdU=dSdU+ak*tmp;
-	  VdagV.MpcDeriv   (tmp,Y,MpvMfMpvPhi_k[k]);  dSdU=dSdU+ak*tmp;     
-	  
-	  VdagV.Mpc(MpvMfMpvPhi_k[k],Y);                // V as we take Ydag 
-	  VdagV.MpcDeriv   (tmp,Y, MpvPhi_k[k]); dSdU=dSdU+ak*tmp;
-	  VdagV.MpcDagDeriv(tmp,MpvPhi_k[k], Y); dSdU=dSdU+ak*tmp;
-
-	}
-
-	//dSdU = Ta(dSdU);
-
-      };
+      virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";} // label for this action; the general base class's deriv log lines print action_name()
    };

 NAMESPACE_END(Grid);
@@ -49,10 +49,12 @@ NAMESPACE_BEGIN(Grid);
      Params param;

      MultiShiftFunction PowerHalf   ;
-      MultiShiftFunction PowerNegHalf;
      MultiShiftFunction PowerQuarter;
+      MultiShiftFunction PowerNegHalf;
      MultiShiftFunction PowerNegQuarter;

+      MultiShiftFunction MDPowerQuarter;
+      MultiShiftFunction MDPowerNegHalf;
    private:
     
      FermionOperator<Impl> & NumOp;// the basic operator
@@ -79,6 +81,10 @@ NAMESPACE_BEGIN(Grid);
 	remez.generateApprox(param.degree,1,4);
   	PowerQuarter.Init(remez,param.tolerance,false);
 	PowerNegQuarter.Init(remez,param.tolerance,true);
+
+   	MDPowerQuarter.Init(remez,param.mdtolerance,false); // MD (deriv) solves use mdtolerance; remez still holds the x^(1/4) approximation here
+	remez.generateApprox(param.degree,1,2); // Init takes whatever remez generated last: regenerate x^(1/2) so MDPowerNegHalf is x^(-1/2) and not x^(-1/4)
+	MDPowerNegHalf.Init(remez,param.mdtolerance,true);
      };

      virtual std::string action_name(){return "OneFlavourRatioRationalPseudoFermionAction";}
@@ -204,8 +210,8 @@ NAMESPACE_BEGIN(Grid);

      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {

-	const int n_f  = PowerNegHalf.poles.size();
-	const int n_pv = PowerQuarter.poles.size();
+	const int n_f  = MDPowerNegHalf.poles.size();
+	const int n_pv = MDPowerQuarter.poles.size();

 	std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionGrid());
 	std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionGrid());
@@ -224,8 +230,8 @@ NAMESPACE_BEGIN(Grid);
 	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagM(DenOp);
 	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> VdagV(NumOp);

-	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
-	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegHalf);
+	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,MDPowerQuarter);
+	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,MDPowerNegHalf);

 	msCG_V(VdagV,Phi,MpvPhi_k,MpvPhi);
 	msCG_M(MdagM,MpvPhi,MfMpvPhi_k,MfMpvPhi);
@@ -244,7 +250,7 @@ NAMESPACE_BEGIN(Grid);

 	//(1)
 	for(int k=0;k<n_f;k++){
-	  ak = PowerNegHalf.residues[k];
+	  ak = MDPowerNegHalf.residues[k];
 	  DenOp.M(MfMpvPhi_k[k],Y);
 	  DenOp.MDeriv(tmp , MfMpvPhi_k[k], Y,DaggerYes );  dSdU=dSdU+ak*tmp;
 	  DenOp.MDeriv(tmp , Y, MfMpvPhi_k[k], DaggerNo );  dSdU=dSdU+ak*tmp;
@@ -254,7 +260,7 @@ NAMESPACE_BEGIN(Grid);
 	//(3)
 	for(int k=0;k<n_pv;k++){

-          ak = PowerQuarter.residues[k];
+          ak = MDPowerQuarter.residues[k];
 	  
 	  NumOp.M(MpvPhi_k[k],Y);
 	  NumOp.MDeriv(tmp,MpvMfMpvPhi_k[k],Y,DaggerYes); dSdU=dSdU+ak*tmp;
@@ -40,6 +40,8 @@ directory
 #include <Grid/qcd/action/pseudofermion/OneFlavourRational.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
+#include <Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h>
+#include <Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h>

@@ -75,24 +75,22 @@ NAMESPACE_BEGIN(Grid);
          conformable(_NumOp.GaugeRedBlackGrid(), _DenOp.GaugeRedBlackGrid());
        };

-      virtual std::string action_name(){return "TwoFlavourEvenOddRatioPseudoFermionAction";}
+      virtual std::string action_name(){
+	std::ostringstream label; //class name followed by the denominator and numerator operator masses
+	label << "TwoFlavourEvenOddRatioPseudoFermionAction det(" << DenOp.Mass() << ") / det(" << NumOp.Mass() << ")";
+	return label.str();
+      }

      virtual std::string LogParameters(){
 	std::stringstream sstream;
-	sstream << GridLogMessage << "["<<action_name()<<"] has no parameters" << std::endl;
+	sstream<< GridLogMessage << "["<<action_name()<<"] -- No further parameters "<<std::endl;
 	return sstream.str();
      } 

      
-      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
+      const FermionField &getPhiOdd() const{ return PhiOdd; } // read-only access to the PhiOdd member

-        // P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
-        //
-        // NumOp == V
-        // DenOp == M
-        //
-        // Take phi_o = Vpcdag^{-1} Mpcdag eta_o  ; eta_o = Mpcdag^{-1} Vpcdag Phi
-        //
+      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
        // P(eta_o) = e^{- eta_o^dag eta_o}
        //
        // e^{x^2/2 sig^2} => sig^2 = 0.5.
@@ -100,12 +98,22 @@ NAMESPACE_BEGIN(Grid);
        RealD scale = std::sqrt(0.5);

        FermionField eta    (NumOp.FermionGrid());
+        gaussian(pRNG,eta); eta = eta * scale;
+
+	refresh(U,eta);
+      }
+
+      void refresh(const GaugeField &U, const FermionField &eta) {
+
+        // P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
+        //
+        // NumOp == V
+        // DenOp == M
+        //
        FermionField etaOdd (NumOp.FermionRedBlackGrid());
        FermionField etaEven(NumOp.FermionRedBlackGrid());
        FermionField tmp    (NumOp.FermionRedBlackGrid());

-        gaussian(pRNG,eta);
-
        pickCheckerboard(Even,etaEven,eta);
        pickCheckerboard(Odd,etaOdd,eta);

@@ -124,10 +132,6 @@ NAMESPACE_BEGIN(Grid);
        // Even det factors
        DenOp.MooeeDag(etaEven,tmp);
        NumOp.MooeeInvDag(tmp,PhiEven);
-
-        PhiOdd =PhiOdd*scale;
-        PhiEven=PhiEven*scale;
-        
      };

      //////////////////////////////////////////////////////
@@ -0,0 +1,203 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/pseudofermion/TwoFlavourRatio.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+///////////////////////////////////////
+// Two flavour ratio
+///////////////////////////////////////
+template<class Impl>
+class TwoFlavourRatioEO4DPseudoFermionAction : public Action<typename Impl::GaugeField> {
+public:
+  INHERIT_IMPL_TYPES(Impl);
+
+private:
+  typedef FermionOperator<Impl> FermOp;
+  FermionOperator<Impl> & NumOp;// numerator operator, called V in the comments below
+  FermionOperator<Impl> & DenOp;// denominator operator, called M in the comments below
+
+  OperatorFunction<FermionField> &DerivativeSolver;    // wrapped by deriv() for the M^-1 solve
+  OperatorFunction<FermionField> &DerivativeDagSolver; // wrapped by deriv() for the adjoint (dag) solve
+  OperatorFunction<FermionField> &ActionSolver;        // wrapped by S()
+  OperatorFunction<FermionField> &HeatbathSolver;      // wrapped by refresh()
+
+  FermionField phi4; // the pseudo fermion field for this trajectory; constructed on NumOp.GaugeGrid()
+
+public:
+  TwoFlavourRatioEO4DPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
+					 FermionOperator<Impl>  &_DenOp, 
+					 OperatorFunction<FermionField> & DS,
+					 OperatorFunction<FermionField> & AS ) : 
+    TwoFlavourRatioEO4DPseudoFermionAction(_NumOp,_DenOp, DS,DS,AS,AS) {}; // DS doubles as the dag solver, AS as the heatbath solver
+  TwoFlavourRatioEO4DPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
+					 FermionOperator<Impl>  &_DenOp, 
+					 OperatorFunction<FermionField> & DS,
+					 OperatorFunction<FermionField> & DDS,
+					 OperatorFunction<FermionField> & AS,
+					 OperatorFunction<FermionField> & HS
+				       ) : NumOp(_NumOp),
+					   DenOp(_DenOp),
+					   DerivativeSolver(DS),
+					   DerivativeDagSolver(DDS),
+					   ActionSolver(AS),
+					   HeatbathSolver(HS),
+					   phi4(_NumOp.GaugeGrid())
+  {};
+
+  virtual std::string action_name(){return "TwoFlavourRatioEO4DPseudoFermionAction";}
+
+  virtual std::string LogParameters(){
+    std::stringstream sstream;
+    sstream << GridLogMessage << "["<<action_name()<<"] has no parameters" << std::endl;
+    return sstream.str();
+  }  
+
+  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
+
+    // P(phi) = e^{- phi^dag (V^dag M^-dag)_11  (M^-1 V)_11 phi}
+    //
+    // NumOp == V
+    // DenOp == M
+    //
+    // Take phi = (V^{-1} M)_11 eta  ; eta = (M^{-1} V)_11 Phi
+    //
+    // P(eta) = e^{- eta^dag eta}
+    //
+    // e^{x^2/2 sig^2} => sig^2 = 0.5.
+    // 
+    // So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707....
+    // (the factor is applied to phi5 after the solve rather than to eta4 directly)
+    RealD scale = std::sqrt(0.5);
+
+    FermionField eta4(NumOp.GaugeGrid());
+    FermionField eta5(NumOp.FermionGrid());
+    FermionField tmp(NumOp.FermionGrid());
+    FermionField phi5(NumOp.FermionGrid());
+
+    gaussian(pRNG,eta4);
+    NumOp.ImportFourDimPseudoFermion(eta4,eta5);
+    NumOp.ImportGauge(U);
+    DenOp.ImportGauge(U);
+
+    SchurRedBlackDiagMooeeSolve<FermionField> PrecSolve(HeatbathSolver);
+
+    DenOp.M(eta5,tmp);               // M eta
+    PrecSolve(NumOp,tmp,phi5);  // phi = V^-1 M eta
+    phi5=phi5*scale;
+    std::cout << GridLogMessage << "4d pf refresh "<< norm2(phi5)<<"\n";
+    // Project to 4d
+    NumOp.ExportFourDimPseudoFermion(phi5,phi4);
+
+  };
+
+  //////////////////////////////////////////////////////
+  // S = phi^dag (V^dag M^-dag)_11  (M^-1 V)_11 phi
+  //////////////////////////////////////////////////////
+  virtual RealD S(const GaugeField &U) {
+
+    NumOp.ImportGauge(U);
+    DenOp.ImportGauge(U);
+
+    FermionField Y4(NumOp.GaugeGrid());
+    FermionField X(NumOp.FermionGrid());
+    FermionField Y(NumOp.FermionGrid());
+    FermionField phi5(NumOp.FermionGrid());
+
+    // The solve against DenOp goes through a SchurRedBlackDiagMooeeSolve wrapper around ActionSolver
+    SchurRedBlackDiagMooeeSolve<FermionField> PrecSolve(ActionSolver);
+
+    NumOp.ImportFourDimPseudoFermion(phi4,phi5);
+    NumOp.M(phi5,X);              // X= V phi
+    PrecSolve(DenOp,X,Y);    // Y= (MdagM)^-1 Mdag V phi = M^-1 V phi
+    NumOp.ExportFourDimPseudoFermion(Y,Y4);
+
+    RealD action = norm2(Y4);
+
+    return action;
+  };
+
+  //////////////////////////////////////////////////////
+  // dS/du = 2 Re phi^dag (V^dag M^-dag)_11  (M^-1 d V)_11  phi
+  //       - 2 Re phi^dag (dV^dag M^-dag)_11  (M^-1 dM M^-1 V)_11  phi
+  //////////////////////////////////////////////////////
+  virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
+
+    NumOp.ImportGauge(U);
+    DenOp.ImportGauge(U);
+
+    // Work fields on the fermion grid; tmp4 is on the gauge grid, like phi4
+    FermionField  Y(NumOp.FermionGrid());
+    FermionField       phi(NumOp.FermionGrid());
+    FermionField      Vphi(NumOp.FermionGrid());
+    FermionField  MinvVphi(NumOp.FermionGrid());
+    FermionField      tmp4(NumOp.GaugeGrid());
+    FermionField  MdagInvMinvVphi(NumOp.FermionGrid());
+
+    GaugeField   force(NumOp.GaugeGrid());	
+
+    //phi      = phi4 imported onto the fermion grid
+    //Vphi     = V phi
+    //MinvVphi = (Mdag M)^-1 Mdag V phi = M^-1 V phi
+    NumOp.ImportFourDimPseudoFermion(phi4,phi);
+    NumOp.M(phi,Vphi);               //  V phi
+    SchurRedBlackDiagMooeeSolve<FermionField> PrecSolve(DerivativeSolver);
+    PrecSolve(DenOp,Vphi,MinvVphi);// M^-1 V phi
+    std::cout << GridLogMessage << "4d deriv solve "<< norm2(MinvVphi)<<"\n";
+
+    // Projects onto the physical space and back
+    NumOp.ExportFourDimPseudoFermion(MinvVphi,tmp4);
+    NumOp.ImportFourDimPseudoFermion(tmp4,Y);
+
+    SchurRedBlackDiagMooeeDagSolve<FermionField> PrecDagSolve(DerivativeDagSolver);
+    // MdagInvMinvVphi = M^-dag Y, where Y is the projected M^-1 V phi
+    // Need an adjoint solve
+    PrecDagSolve(DenOp,Y,MdagInvMinvVphi);
+    std::cout << GridLogMessage << "4d deriv solve dag "<< norm2(MdagInvMinvVphi)<<"\n";
+
+    // phi^dag (Vdag Mdag^-1) (M^-1 dV)  phi
+    NumOp.MDeriv(force ,MdagInvMinvVphi , phi, DaggerNo );  dSdU=force;
+  
+    // phi^dag (dVdag Mdag^-1) (M^-1 V)  phi
+    NumOp.MDeriv(force , phi, MdagInvMinvVphi ,DaggerYes  );  dSdU=dSdU+force;
+
+    //    - 2 Re phi^dag (dV^dag M^-dag)_11  (M^-1 dM M^-1 V)_11  phi
+    DenOp.MDeriv(force,MdagInvMinvVphi,MinvVphi,DaggerNo);   dSdU=dSdU-force;
+    DenOp.MDeriv(force,MinvVphi,MdagInvMinvVphi,DaggerYes);  dSdU=dSdU-force;
+
+    dSdU *= -1.0; 
+    //dSdU = - Ta(dSdU);
+    
+  };
+};
+
+NAMESPACE_END(Grid);
+
+
@@ -0,0 +1,6 @@
+#ifndef GRID_GPARITY_H_
+#define GRID_GPARITY_H_
+
+#include<Grid/qcd/gparity/GparityFlavour.h>
+
+#endif
@@ -0,0 +1,34 @@
+#include <Grid/Grid.h>
+
+NAMESPACE_BEGIN(Grid);
+
+const std::array<const GparityFlavour, 3> GparityFlavour::sigma_mu = {{ // SigmaX, SigmaY, SigmaZ
+    GparityFlavour(GparityFlavour::Algebra::SigmaX),
+    GparityFlavour(GparityFlavour::Algebra::SigmaY),
+    GparityFlavour(GparityFlavour::Algebra::SigmaZ)
+    }};
+// Identity, the three sigmas and the two projectors; none of the Minus* entries are included
+const std::array<const GparityFlavour, 6> GparityFlavour::sigma_all = {{
+  GparityFlavour(GparityFlavour::Algebra::Identity),
+  GparityFlavour(GparityFlavour::Algebra::SigmaX),
+  GparityFlavour(GparityFlavour::Algebra::SigmaY),
+  GparityFlavour(GparityFlavour::Algebra::SigmaZ),
+  GparityFlavour(GparityFlavour::Algebra::ProjPlus),
+  GparityFlavour(GparityFlavour::Algebra::ProjMinus)
+}};
+// Printable names, listed in the order of the Algebra enum values 0..11 declared in GparityFlavour.h
+const std::array<const char *, GparityFlavour::nSigma> GparityFlavour::name = {{
+    "SigmaX",
+    "MinusSigmaX",
+    "SigmaY",
+    "MinusSigmaY",
+    "SigmaZ",
+    "MinusSigmaZ",
+    "Identity",
+    "MinusIdentity",
+    "ProjPlus",
+    "MinusProjPlus",
+    "ProjMinus",
+    "MinusProjMinus"}};
+
+NAMESPACE_END(Grid);
@@ -0,0 +1,475 @@
+#ifndef GRID_QCD_GPARITY_FLAVOUR_H
+#define GRID_QCD_GPARITY_FLAVOUR_H
+
+//Support for flavour-matrix operations acting on the G-parity flavour index
+
+#include <array>
+
+NAMESPACE_BEGIN(Grid);
+
+class GparityFlavour { // tag object naming one 2x2 flavour matrix; the operator* overloads in this header switch on its member g
+  public:
+    GRID_SERIALIZABLE_ENUM(Algebra, undef,
+                           SigmaX, 0,
+			   MinusSigmaX, 1,
+                           SigmaY, 2,
+			   MinusSigmaY, 3,
+                           SigmaZ, 4,
+			   MinusSigmaZ, 5,
+			   Identity, 6,
+			   MinusIdentity, 7,
+			   ProjPlus, 8,
+			   MinusProjPlus, 9,
+			   ProjMinus, 10,
+			   MinusProjMinus, 11
+			   );
+    static constexpr unsigned int nSigma = 12; // number of enumerators listed above (values 0..11)
+    static const std::array<const char *, nSigma>                name;      // one string per enumerator
+    static const std::array<const GparityFlavour, 3>             sigma_mu;  // three-element table
+    static const std::array<const GparityFlavour, 6>            sigma_all;  // six-element table
+    Algebra                                                      g;         // which matrix this object stands for
+  public:
+  accelerator GparityFlavour(Algebra initg): g(initg) {}  // not explicit: an Algebra value converts implicitly
+};
+
+
+
+// sigma_x =  0 1   applied to a flavour vector: ret = sigma_x * rhs
+//            1 0   (ret must be a different object from rhs: ret(0) is written before rhs(0) is read)
+template<class vtype>
+accelerator_inline void multFlavourSigmaX(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = rhs(1);
+  ret(1) = rhs(0);
+};
+template<class vtype> // ret = sigma_x * rhs for a 2x2 flavour matrix (rows exchanged); ret must not alias rhs
+accelerator_inline void lmultFlavourSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = rhs(1,0);
+  ret(0,1) = rhs(1,1);
+  ret(1,0) = rhs(0,0);
+  ret(1,1) = rhs(0,1);
+};
+template<class vtype> // ret = rhs * sigma_x for a 2x2 flavour matrix (columns exchanged); ret must not alias rhs
+accelerator_inline void rmultFlavourSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = rhs(0,1);
+  ret(0,1) = rhs(0,0);
+  ret(1,0) = rhs(1,1);
+  ret(1,1) = rhs(1,0);
+};
+
+
+template<class vtype> // ret = -sigma_x * rhs for a flavour vector; ret must not alias rhs
+accelerator_inline void multFlavourMinusSigmaX(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = -rhs(1);
+  ret(1) = -rhs(0);
+};
+template<class vtype> // ret = -sigma_x * rhs for a 2x2 flavour matrix; ret must not alias rhs
+accelerator_inline void lmultFlavourMinusSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -rhs(1,0);
+  ret(0,1) = -rhs(1,1);
+  ret(1,0) = -rhs(0,0);
+  ret(1,1) = -rhs(0,1);
+};
+template<class vtype> // ret = rhs * (-sigma_x) for a 2x2 flavour matrix; ret must not alias rhs
+accelerator_inline void rmultFlavourMinusSigmaX(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -rhs(0,1);
+  ret(0,1) = -rhs(0,0);
+  ret(1,0) = -rhs(1,1);
+  ret(1,1) = -rhs(1,0);
+};
+
+
+
+
+
+// sigma_y =  0 -i   applied to a flavour vector: ret = sigma_y * rhs
+//            i  0   (ret must be a different object from rhs: ret(0) is written before rhs(0) is read)
+template<class vtype>
+accelerator_inline void multFlavourSigmaY(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = timesMinusI(rhs(1));
+  ret(1) = timesI(rhs(0));
+};
+template<class vtype> // ret = sigma_y * rhs for a 2x2 flavour matrix; ret must not alias rhs
+accelerator_inline void lmultFlavourSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = timesMinusI(rhs(1,0));
+  ret(0,1) = timesMinusI(rhs(1,1));
+  ret(1,0) = timesI(rhs(0,0));
+  ret(1,1) = timesI(rhs(0,1));
+};
+template<class vtype> // ret = rhs * sigma_y for a 2x2 flavour matrix; ret must not alias rhs
+accelerator_inline void rmultFlavourSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = timesI(rhs(0,1));
+  ret(0,1) = timesMinusI(rhs(0,0));
+  ret(1,0) = timesI(rhs(1,1));
+  ret(1,1) = timesMinusI(rhs(1,0));
+};
+
+template<class vtype> // ret = -sigma_y * rhs for a flavour vector; ret must not alias rhs
+accelerator_inline void multFlavourMinusSigmaY(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = timesI(rhs(1));
+  ret(1) = timesMinusI(rhs(0));
+};
+template<class vtype> // ret = -sigma_y * rhs for a 2x2 flavour matrix; ret must not alias rhs
+accelerator_inline void lmultFlavourMinusSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = timesI(rhs(1,0));
+  ret(0,1) = timesI(rhs(1,1));
+  ret(1,0) = timesMinusI(rhs(0,0));
+  ret(1,1) = timesMinusI(rhs(0,1));
+};
+template<class vtype> // ret = rhs * (-sigma_y) for a 2x2 flavour matrix; ret must not alias rhs
+accelerator_inline void rmultFlavourMinusSigmaY(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = timesMinusI(rhs(0,1));
+  ret(0,1) = timesI(rhs(0,0));
+  ret(1,0) = timesMinusI(rhs(1,1));
+  ret(1,1) = timesI(rhs(1,0));
+};
+
+
+
+
+
+// sigma_z =  1  0   applied to a flavour vector: ret = sigma_z * rhs
+//            0 -1   (second component changes sign)
+template<class vtype>
+accelerator_inline void multFlavourSigmaZ(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = rhs(0);
+  ret(1) = -rhs(1);
+};
+template<class vtype> // ret = sigma_z * rhs for a 2x2 flavour matrix (second row negated)
+accelerator_inline void lmultFlavourSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = rhs(0,0);
+  ret(0,1) = rhs(0,1);
+  ret(1,0) = -rhs(1,0);
+  ret(1,1) = -rhs(1,1);
+};
+template<class vtype> // ret = rhs * sigma_z for a 2x2 flavour matrix (second column negated)
+accelerator_inline void rmultFlavourSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = rhs(0,0);
+  ret(0,1) = -rhs(0,1);
+  ret(1,0) = rhs(1,0);
+  ret(1,1) = -rhs(1,1);
+};
+
+
+template<class vtype> // ret = -sigma_z * rhs for a flavour vector (first component changes sign)
+accelerator_inline void multFlavourMinusSigmaZ(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = -rhs(0);
+  ret(1) = rhs(1);
+};
+template<class vtype> // ret = -sigma_z * rhs for a 2x2 flavour matrix (first row negated)
+accelerator_inline void lmultFlavourMinusSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -rhs(0,0);
+  ret(0,1) = -rhs(0,1);
+  ret(1,0) = rhs(1,0);
+  ret(1,1) = rhs(1,1);
+};
+template<class vtype> // ret = rhs * (-sigma_z) for a 2x2 flavour matrix (first column negated)
+accelerator_inline void rmultFlavourMinusSigmaZ(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -rhs(0,0);
+  ret(0,1) = rhs(0,1);
+  ret(1,0) = -rhs(1,0);
+  ret(1,1) = rhs(1,1);
+};
+
+
+
+
+
+
+template<class vtype> // identity on a flavour vector: copies both components of rhs into ret
+accelerator_inline void multFlavourIdentity(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = rhs(0);
+  ret(1) = rhs(1);
+};
+template<class vtype> // identity * rhs for a 2x2 flavour matrix: element-wise copy
+accelerator_inline void lmultFlavourIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = rhs(0,0);
+  ret(0,1) = rhs(0,1);
+  ret(1,0) = rhs(1,0);
+  ret(1,1) = rhs(1,1);
+};
+template<class vtype> // rhs * identity for a 2x2 flavour matrix: same element-wise copy as the left version
+accelerator_inline void rmultFlavourIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = rhs(0,0);
+  ret(0,1) = rhs(0,1);
+  ret(1,0) = rhs(1,0);
+  ret(1,1) = rhs(1,1);
+};
+
+template<class vtype> // ret = -rhs for a flavour vector
+accelerator_inline void multFlavourMinusIdentity(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = -rhs(0);
+  ret(1) = -rhs(1);
+};
+template<class vtype> // ret = -rhs for a 2x2 flavour matrix (left-multiplication by minus the identity)
+accelerator_inline void lmultFlavourMinusIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -rhs(0,0);
+  ret(0,1) = -rhs(0,1);
+  ret(1,0) = -rhs(1,0);
+  ret(1,1) = -rhs(1,1);
+};
+template<class vtype> // ret = -rhs for a 2x2 flavour matrix (right-multiplication by minus the identity)
+accelerator_inline void rmultFlavourMinusIdentity(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -rhs(0,0);
+  ret(0,1) = -rhs(0,1);
+  ret(1,0) = -rhs(1,0);
+  ret(1,1) = -rhs(1,1);
+};
+
+
+
+
+
+//G-parity flavour projection P_+ = 1/2(1+\sigma_2), i.e. one half times the matrix
+//1 -i
+//i  1     (vector form: ret = P_+ * rhs; ret must not alias rhs)
+template<class vtype>
+accelerator_inline void multFlavourProjPlus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = 0.5*rhs(0) + 0.5*timesMinusI(rhs(1));
+  ret(1) = 0.5*timesI(rhs(0)) + 0.5*rhs(1);
+};
+template<class vtype> // ret = P_+ * rhs for a 2x2 flavour matrix, P_+ = 1/2(1+\sigma_2); ret must not alias rhs
+accelerator_inline void lmultFlavourProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesMinusI(rhs(1,0));
+  ret(0,1) = 0.5*rhs(0,1) + 0.5*timesMinusI(rhs(1,1));
+  ret(1,0) = 0.5*timesI(rhs(0,0)) + 0.5*rhs(1,0);
+  ret(1,1) = 0.5*timesI(rhs(0,1)) + 0.5*rhs(1,1);
+};
+template<class vtype> // ret = rhs * P_+ for a 2x2 flavour matrix, P_+ = 1/2(1+\sigma_2); ret must not alias rhs
+accelerator_inline void rmultFlavourProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesI(rhs(0,1));
+  ret(0,1) = 0.5*timesMinusI(rhs(0,0)) + 0.5*rhs(0,1);
+  ret(1,0) = 0.5*rhs(1,0) + 0.5*timesI(rhs(1,1));
+  ret(1,1) = 0.5*timesMinusI(rhs(1,0)) + 0.5*rhs(1,1);
+};
+
+
+template<class vtype> // ret = -P_+ * rhs for a flavour vector, P_+ = 1/2(1+\sigma_2); ret must not alias rhs
+accelerator_inline void multFlavourMinusProjPlus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = -0.5*rhs(0) + 0.5*timesI(rhs(1));
+  ret(1) = 0.5*timesMinusI(rhs(0)) - 0.5*rhs(1);
+};
+template<class vtype> // ret = -P_+ * rhs for a 2x2 flavour matrix; ret must not alias rhs
+accelerator_inline void lmultFlavourMinusProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesI(rhs(1,0));
+  ret(0,1) = -0.5*rhs(0,1) + 0.5*timesI(rhs(1,1));
+  ret(1,0) = 0.5*timesMinusI(rhs(0,0)) - 0.5*rhs(1,0);
+  ret(1,1) = 0.5*timesMinusI(rhs(0,1)) - 0.5*rhs(1,1);
+};
+template<class vtype> // ret = rhs * (-P_+) for a 2x2 flavour matrix; ret must not alias rhs
+accelerator_inline void rmultFlavourMinusProjPlus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesMinusI(rhs(0,1));
+  ret(0,1) = 0.5*timesI(rhs(0,0)) - 0.5*rhs(0,1);
+  ret(1,0) = -0.5*rhs(1,0) + 0.5*timesMinusI(rhs(1,1));
+  ret(1,1) = 0.5*timesI(rhs(1,0)) - 0.5*rhs(1,1);
+};
+
+
+
+
+
+//G-parity flavour projection P_- = 1/2(1-\sigma_2), i.e. one half times the matrix
+//1 i
+//-i  1    (vector form: ret = P_- * rhs; ret must not alias rhs)
+template<class vtype>
+accelerator_inline void multFlavourProjMinus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = 0.5*rhs(0) + 0.5*timesI(rhs(1));
+  ret(1) = 0.5*timesMinusI(rhs(0)) + 0.5*rhs(1);
+};
+template<class vtype> // ret = P_- * rhs for a 2x2 flavour matrix, P_- = 1/2(1-\sigma_2); ret must not alias rhs
+accelerator_inline void lmultFlavourProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesI(rhs(1,0));
+  ret(0,1) = 0.5*rhs(0,1) + 0.5*timesI(rhs(1,1));
+  ret(1,0) = 0.5*timesMinusI(rhs(0,0)) + 0.5*rhs(1,0);
+  ret(1,1) = 0.5*timesMinusI(rhs(0,1)) + 0.5*rhs(1,1);
+};
+template<class vtype> // ret = rhs * P_- for a 2x2 flavour matrix, P_- = 1/2(1-\sigma_2); ret must not alias rhs
+accelerator_inline void rmultFlavourProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = 0.5*rhs(0,0) + 0.5*timesMinusI(rhs(0,1));
+  ret(0,1) = 0.5*timesI(rhs(0,0)) + 0.5*rhs(0,1);
+  ret(1,0) = 0.5*rhs(1,0) + 0.5*timesMinusI(rhs(1,1));
+  ret(1,1) = 0.5*timesI(rhs(1,0)) + 0.5*rhs(1,1);
+};
+
+
+template<class vtype> // ret = -P_- * rhs for a flavour vector, P_- = 1/2(1-\sigma_2); ret must not alias rhs
+accelerator_inline void multFlavourMinusProjMinus(iVector<vtype, Ngp> &ret, const iVector<vtype, Ngp> &rhs)
+{
+  ret(0) = -0.5*rhs(0) + 0.5*timesMinusI(rhs(1));
+  ret(1) = 0.5*timesI(rhs(0)) - 0.5*rhs(1);
+};
+template<class vtype> // ret = -P_- * rhs for a 2x2 flavour matrix; ret must not alias rhs
+accelerator_inline void lmultFlavourMinusProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesMinusI(rhs(1,0));
+  ret(0,1) = -0.5*rhs(0,1) + 0.5*timesMinusI(rhs(1,1));
+  ret(1,0) = 0.5*timesI(rhs(0,0)) - 0.5*rhs(1,0);
+  ret(1,1) = 0.5*timesI(rhs(0,1)) - 0.5*rhs(1,1);
+};
+template<class vtype> // ret = rhs * (-P_-) for a 2x2 flavour matrix; ret must not alias rhs
+accelerator_inline void rmultFlavourMinusProjMinus(iMatrix<vtype, Ngp> &ret, const iMatrix<vtype, Ngp> &rhs)
+{
+  ret(0,0) = -0.5*rhs(0,0) + 0.5*timesI(rhs(0,1));
+  ret(0,1) = 0.5*timesMinusI(rhs(0,0)) - 0.5*rhs(0,1);
+  ret(1,0) = -0.5*rhs(1,0) + 0.5*timesI(rhs(1,1));
+  ret(1,1) = 0.5*timesMinusI(rhs(1,0)) - 0.5*rhs(1,1);
+};
+
+
+
+
+
+
+
+
+
+
+template<class vtype> // G * arg for a flavour vector: picks the multFlavour* routine named by G.g and returns its result by value
+accelerator_inline auto operator*(const GparityFlavour &G, const iVector<vtype, Ngp> &arg)
+->typename std::enable_if<matchGridTensorIndex<iVector<vtype, Ngp>, GparityFlavourTensorIndex>::value, iVector<vtype, Ngp>>::type
+{
+  iVector<vtype, Ngp> ret;
+
+  switch (G.g) // run-time dispatch on the stored enumerator
+  {
+  case GparityFlavour::Algebra::SigmaX:
+    multFlavourSigmaX(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaX:
+    multFlavourMinusSigmaX(ret, arg); break;
+  case GparityFlavour::Algebra::SigmaY:
+    multFlavourSigmaY(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaY:
+    multFlavourMinusSigmaY(ret, arg); break;
+  case GparityFlavour::Algebra::SigmaZ:
+    multFlavourSigmaZ(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaZ:
+    multFlavourMinusSigmaZ(ret, arg); break;
+  case GparityFlavour::Algebra::Identity:
+    multFlavourIdentity(ret, arg); break;
+  case GparityFlavour::Algebra::MinusIdentity:
+    multFlavourMinusIdentity(ret, arg); break;
+  case GparityFlavour::Algebra::ProjPlus:
+    multFlavourProjPlus(ret, arg); break;
+  case GparityFlavour::Algebra::MinusProjPlus:
+    multFlavourMinusProjPlus(ret, arg); break;
+  case GparityFlavour::Algebra::ProjMinus:
+    multFlavourProjMinus(ret, arg); break;
+  case GparityFlavour::Algebra::MinusProjMinus:
+    multFlavourMinusProjMinus(ret, arg); break;
+  default: assert(0); // unlisted value (e.g. undef): no routine has written ret on this path
+  }
+ 
+  return ret;
+}
+
+template<class vtype> // G * arg for a 2x2 flavour matrix: picks the lmultFlavour* (left-multiply) routine named by G.g
+accelerator_inline auto operator*(const GparityFlavour &G, const iMatrix<vtype, Ngp> &arg)
+->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Ngp>, GparityFlavourTensorIndex>::value, iMatrix<vtype, Ngp>>::type
+{
+  iMatrix<vtype, Ngp> ret;
+
+  switch (G.g) // run-time dispatch on the stored enumerator
+  {
+  case GparityFlavour::Algebra::SigmaX:
+    lmultFlavourSigmaX(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaX:
+    lmultFlavourMinusSigmaX(ret, arg); break;
+  case GparityFlavour::Algebra::SigmaY:
+    lmultFlavourSigmaY(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaY:
+    lmultFlavourMinusSigmaY(ret, arg); break;
+  case GparityFlavour::Algebra::SigmaZ:
+    lmultFlavourSigmaZ(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaZ:
+    lmultFlavourMinusSigmaZ(ret, arg); break;
+  case GparityFlavour::Algebra::Identity:
+    lmultFlavourIdentity(ret, arg); break;
+  case GparityFlavour::Algebra::MinusIdentity:
+    lmultFlavourMinusIdentity(ret, arg); break;
+  case GparityFlavour::Algebra::ProjPlus:
+    lmultFlavourProjPlus(ret, arg); break;
+  case GparityFlavour::Algebra::MinusProjPlus:
+    lmultFlavourMinusProjPlus(ret, arg); break;
+  case GparityFlavour::Algebra::ProjMinus:
+    lmultFlavourProjMinus(ret, arg); break;
+  case GparityFlavour::Algebra::MinusProjMinus:
+    lmultFlavourMinusProjMinus(ret, arg); break;  
+  default: assert(0); // unlisted value (e.g. undef): no routine has written ret on this path
+  }
+  
+  return ret;
+}
+
+template<class vtype> // arg * G for a 2x2 flavour matrix: picks the rmultFlavour* (right-multiply) routine named by G.g
+accelerator_inline auto operator*(const iMatrix<vtype, Ngp> &arg, const GparityFlavour &G)
+->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Ngp>, GparityFlavourTensorIndex>::value, iMatrix<vtype, Ngp>>::type
+{
+  iMatrix<vtype, Ngp> ret;
+
+  switch (G.g) // run-time dispatch on the stored enumerator
+  {
+  case GparityFlavour::Algebra::SigmaX:
+    rmultFlavourSigmaX(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaX:
+    rmultFlavourMinusSigmaX(ret, arg); break;
+  case GparityFlavour::Algebra::SigmaY:
+    rmultFlavourSigmaY(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaY:
+    rmultFlavourMinusSigmaY(ret, arg); break;
+  case GparityFlavour::Algebra::SigmaZ:
+    rmultFlavourSigmaZ(ret, arg); break;
+  case GparityFlavour::Algebra::MinusSigmaZ:
+    rmultFlavourMinusSigmaZ(ret, arg); break;
+  case GparityFlavour::Algebra::Identity:
+    rmultFlavourIdentity(ret, arg); break;
+  case GparityFlavour::Algebra::MinusIdentity:
+    rmultFlavourMinusIdentity(ret, arg); break;
+  case GparityFlavour::Algebra::ProjPlus:
+    rmultFlavourProjPlus(ret, arg); break;
+  case GparityFlavour::Algebra::MinusProjPlus:
+    rmultFlavourMinusProjPlus(ret, arg); break;
+  case GparityFlavour::Algebra::ProjMinus:
+    rmultFlavourProjMinus(ret, arg); break;
+  case GparityFlavour::Algebra::MinusProjMinus:
+    rmultFlavourMinusProjMinus(ret, arg); break;
+  default: assert(0); // unlisted value (e.g. undef): no routine has written ret on this path
+  }
+
+  return ret;
+}
+
+NAMESPACE_END(Grid);
+
+#endif // include guard
@@ -129,18 +129,10 @@ public:
    Runner(S);
  }

-  //////////////////////////////////////////////////////////////////
-
-private:
-  template <class SmearingPolicy>
-  void Runner(SmearingPolicy &Smearing) {
-    auto UGrid = Resources.GetCartesian();
-    Resources.AddRNGs();
-    Field U(UGrid);
-
-    // Can move this outside?
-    typedef IntegratorType<SmearingPolicy> TheIntegrator;
-    TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing);
+  //Use the checkpointer to initialize the RNGs and the gauge field, writing the resulting gauge field into U.
+  //This is called automatically by Run but may be useful elsewhere, e.g. for integrator tuning experiments
+  void initializeGaugeFieldAndRNGs(Field &U){
+    if(!Resources.haveRNGs()) Resources.AddRNGs();

    if (Parameters.StartingType == "HotStart") {
      // Hot start
@@ -159,14 +151,43 @@ private:
      Resources.GetCheckPointer()->CheckpointRestore(Parameters.StartTrajectory, U,
 						     Resources.GetSerialRNG(),
 						     Resources.GetParallelRNG());
+    } else if (Parameters.StartingType == "CheckpointStartReseed") {
+      // Same as CheckpointStart but reseeds the RNGs using the fixed integer seeding used for ColdStart and HotStart
+      // Useful for creating new evolution streams from an existing stream
+      
+      // WARNING: Unfortunately because the checkpointer doesn't presently allow us to separately restore the RNG and gauge fields we have to load
+      // an existing RNG checkpoint first; make sure one is available and named correctly
+      Resources.GetCheckPointer()->CheckpointRestore(Parameters.StartTrajectory, U,
+						     Resources.GetSerialRNG(),
+						     Resources.GetParallelRNG());
+      Resources.SeedFixedIntegers();      
    } else {
      // others
      std::cout << GridLogError << "Unrecognized StartingType\n";
      std::cout
 	<< GridLogError
-	<< "Valid [HotStart, ColdStart, TepidStart, CheckpointStart]\n";
+	<< "Valid [HotStart, ColdStart, TepidStart, CheckpointStart, CheckpointStartReseed]\n";
      exit(1);
    }
+  }
+
+
+
+  //////////////////////////////////////////////////////////////////
+
+private:
+  template <class SmearingPolicy>
+  void Runner(SmearingPolicy &Smearing) {
+    auto UGrid = Resources.GetCartesian();
+    Field U(UGrid);
+
+    initializeGaugeFieldAndRNGs(U);
+
+    typedef IntegratorType<SmearingPolicy> TheIntegrator;
+    TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing);
+
+    // Sets the momentum filter
+    MDynamics.setMomentumFilter(*(Resources.GetMomentumFilter()));

    Smearing.set_Field(U);

@@ -34,6 +34,7 @@ directory
 			    * @brief Classes for Hybrid Monte Carlo update
 			    *
 			    * @author Guido Cossu
+			    * @author Peter Boyle
 			    */
 			   //--------------------------------------------------------------------
 #pragma once
@@ -115,22 +116,17 @@ private:

    random(sRNG, rn_test);

-    std::cout << GridLogMessage
-              << "--------------------------------------------------\n";
-    std::cout << GridLogMessage << "exp(-dH) = " << prob
-              << "  Random = " << rn_test << "\n";
-    std::cout << GridLogMessage
-              << "Acc. Probability = " << ((prob < 1.0) ? prob : 1.0) << "\n";
+    std::cout << GridLogHMC << "--------------------------------------------------\n";
+    std::cout << GridLogHMC << "exp(-dH) = " << prob << "  Random = " << rn_test << "\n";
+    std::cout << GridLogHMC << "Acc. Probability = " << ((prob < 1.0) ? prob : 1.0) << "\n";

    if ((prob > 1.0) || (rn_test <= prob)) {  // accepted
-      std::cout << GridLogMessage << "Metropolis_test -- ACCEPTED\n";
-      std::cout << GridLogMessage
-                << "--------------------------------------------------\n";
+      std::cout << GridLogHMC << "Metropolis_test -- ACCEPTED\n";
+      std::cout << GridLogHMC << "--------------------------------------------------\n";
      return true;
    } else {  // rejected
-      std::cout << GridLogMessage << "Metropolis_test -- REJECTED\n";
-      std::cout << GridLogMessage
-                << "--------------------------------------------------\n";
+      std::cout << GridLogHMC << "Metropolis_test -- REJECTED\n";
+      std::cout << GridLogHMC << "--------------------------------------------------\n";
      return false;
    }
  }
@@ -139,19 +135,68 @@ private:
  // Evolution
  /////////////////////////////////////////////////////////
  RealD evolve_hmc_step(Field &U) {
-    TheIntegrator.refresh(U, sRNG, pRNG);  // set U and initialize P and phi's

-    RealD H0 = TheIntegrator.S(U);  // initial state action
+    GridBase *Grid = U.Grid(); // one trajectory: shift U, refresh, integrate; returns dH = H1 - H0
+
+    //////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Mainly for DDHMC perform a random translation of U modulo volume
+    //////////////////////////////////////////////////////////////////////////////////////////////////////
+    std::cout << GridLogMessage << "--------------------------------------------------\n";
+    std::cout << GridLogMessage << "Random shifting gauge field by [";
+    for(int d=0;d<Grid->Nd();d++) {
+
+      int L = Grid->GlobalDimensions()[d];
+
+      RealD rn_uniform;  random(sRNG, rn_uniform);
+
+      int shift = (int) (rn_uniform*L); // truncating cast; lies in [0,L) provided random() yields [0,1) - TODO confirm
+
+      std::cout << shift; // no log prefix on purpose: continues the "[...]" list started above
+      if(d<Grid->Nd()-1) std::cout <<",";
+      else               std::cout <<"]\n";
+      
+      U = Cshift(U,d,shift);
+    }
+    std::cout << GridLogMessage << "--------------------------------------------------\n";
+
+    TheIntegrator.reset_timer();
+    
+    //////////////////////////////////////////////////////////////////////////////////////////////////////
+    // set U and initialize P and phi's
+    //////////////////////////////////////////////////////////////////////////////////////////////////////
+    std::cout << GridLogMessage << "--------------------------------------------------\n";
+    std::cout << GridLogMessage << "Refresh momenta and pseudofermions\n";
+    TheIntegrator.refresh(U, sRNG, pRNG);  
+    std::cout << GridLogMessage << "--------------------------------------------------\n";
+
+    //////////////////////////////////////////////////////////////////////////////////////////////////////
+    // initial state action
+    //////////////////////////////////////////////////////////////////////////////////////////////////////
+    std::cout << GridLogMessage << "--------------------------------------------------\n";
+    std::cout << GridLogMessage << "Compute initial action\n";
+    RealD H0 = TheIntegrator.S(U);  
+    std::cout << GridLogMessage << "--------------------------------------------------\n";

    std::streamsize current_precision = std::cout.precision();
    std::cout.precision(15);
-    std::cout << GridLogMessage << "Total H before trajectory = " << H0 << "\n";
+    std::cout << GridLogHMC << "Total H before trajectory = " << H0 << "\n";
    std::cout.precision(current_precision);

+    std::cout << GridLogMessage << "--------------------------------------------------\n";
+    std::cout << GridLogMessage << " Molecular Dynamics evolution \n";
    TheIntegrator.integrate(U);
+    std::cout << GridLogMessage << "--------------------------------------------------\n";

-    RealD H1 = TheIntegrator.S(U);  // updated state action
+    //////////////////////////////////////////////////////////////////////////////////////////////////////
+    // updated state action
+    //////////////////////////////////////////////////////////////////////////////////////////////////////
+    std::cout << GridLogMessage << "--------------------------------------------------\n";
+    std::cout << GridLogMessage << "Compute final action\n";
+    RealD H1 = TheIntegrator.S(U);  
+    std::cout << GridLogMessage << "--------------------------------------------------\n";

+
+    
    ///////////////////////////////////////////////////////////
    if(0){
      std::cout << "------------------------- Reversibility test" << std::endl;
@@ -163,17 +208,16 @@ private:
    }
    ///////////////////////////////////////////////////////////

-
    std::cout.precision(15);
-    std::cout << GridLogMessage << "Total H after trajectory  = " << H1
-	      << "  dH = " << H1 - H0 << "\n";
+
+    std::cout << GridLogHMC << "--------------------------------------------------\n";
+    std::cout << GridLogHMC << "Total H after trajectory  = " << H1 << "  dH = " << H1 - H0 << "\n";
+    std::cout << GridLogHMC << "--------------------------------------------------\n";
+
    std::cout.precision(current_precision);
    
    return (H1 - H0);
  }
-  
-
-  

 public:
  /////////////////////////////////////////
@@ -195,10 +239,13 @@ public:

    // Actual updates (evolve a copy Ucopy then copy back eventually)
    unsigned int FinalTrajectory = Params.Trajectories + Params.NoMetropolisUntil + Params.StartTrajectory;
+
    for (int traj = Params.StartTrajectory; traj < FinalTrajectory; ++traj) {
-      std::cout << GridLogMessage << "-- # Trajectory = " << traj << "\n";
+
+      std::cout << GridLogHMC << "-- # Trajectory = " << traj << "\n";
+
      if (traj < Params.StartTrajectory + Params.NoMetropolisUntil) {
-      	std::cout << GridLogMessage << "-- Thermalization" << std::endl;
+      	std::cout << GridLogHMC << "-- Thermalization" << std::endl;
      }
      
      double t0=usecond();
@@ -207,20 +254,19 @@ public:
      DeltaH = evolve_hmc_step(Ucopy);
      // Metropolis-Hastings test
      bool accept = true;
-      if (traj >= Params.StartTrajectory + Params.NoMetropolisUntil) {
+      if (Params.MetropolisTest && traj >= Params.StartTrajectory + Params.NoMetropolisUntil) {
        accept = metropolis_test(DeltaH);
      } else {
-      	std::cout << GridLogMessage << "Skipping Metropolis test" << std::endl;
+      	std::cout << GridLogHMC << "Skipping Metropolis test" << std::endl;
      }

      if (accept)
        Ucur = Ucopy; 
      
-     
-      
      double t1=usecond();
-      std::cout << GridLogMessage << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl;
+      std::cout << GridLogHMC << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl;

+      TheIntegrator.print_timer();

      for (int obs = 0; obs < Observables.size(); obs++) {
      	std::cout << GridLogDebug << "Observables # " << obs << std::endl;
@@ -228,7 +274,7 @@ public:
      	std::cout << GridLogDebug << "Observables pointer " << Observables[obs] << std::endl;
        Observables[obs]->TrajectoryComplete(traj + 1, Ucur, sRNG, pRNG);
      }
-      std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl;
+      std::cout << GridLogHMC << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl;
    }
  }

@@ -80,7 +80,9 @@ public:
      std::cout << GridLogError << "Seeds not initialized" << std::endl;
      exit(1);
    }
+    std::cout << GridLogMessage << "Reseeding serial RNG with seed vector " << SerialSeeds << std::endl;
    sRNG_.SeedFixedIntegers(SerialSeeds);
+    std::cout << GridLogMessage << "Reseeding parallel RNG with seed vector " << ParallelSeeds << std::endl;
    pRNG_->SeedFixedIntegers(ParallelSeeds);
  }
 };
@@ -72,6 +72,8 @@ class HMCResourceManager {
  typedef HMCModuleBase< BaseHmcCheckpointer<ImplementationPolicy> > CheckpointerBaseModule;
  typedef HMCModuleBase< HmcObservable<typename ImplementationPolicy::Field> > ObservableBaseModule;
  typedef ActionModuleBase< Action<typename ImplementationPolicy::Field>, GridModule > ActionBaseModule;
+  typedef typename ImplementationPolicy::Field MomentaField;
+  typedef typename ImplementationPolicy::Field Field;  

  // Named storage for grid pairs (std + red-black)
  std::unordered_map<std::string, GridModule> Grids;
@@ -80,6 +82,9 @@ class HMCResourceManager {
  // SmearingModule<ImplementationPolicy> Smearing;
  std::unique_ptr<CheckpointerBaseModule> CP;

+  // Momentum filter
+  std::unique_ptr<MomentumFilterBase<typename ImplementationPolicy::Field> > Filter;
+  
  // A vector of HmcObservable modules
  std::vector<std::unique_ptr<ObservableBaseModule> > ObservablesList;

@@ -90,6 +95,7 @@ class HMCResourceManager {

  bool have_RNG;
  bool have_CheckPointer;
+  bool have_Filter;

  // NOTE: operator << is not overloaded for std::vector<string> 
  // so this function is necessary
@@ -101,7 +107,7 @@ class HMCResourceManager {


 public:
-  HMCResourceManager() : have_RNG(false), have_CheckPointer(false) {}
+  HMCResourceManager() : have_RNG(false), have_CheckPointer(false), have_Filter(false) {}

  template <class ReaderClass, class vector_type = vComplex >
  void initialize(ReaderClass &Read){
@@ -129,6 +135,7 @@ public:
    RNGModuleParameters RNGpar(Read);
    SetRNGSeeds(RNGpar);

+  
    // Observables
    auto &ObsFactory = HMC_ObservablesModuleFactory<observable_string, typename ImplementationPolicy::Field, ReaderClass>::getInstance(); 
    Read.push(observable_string);// here must check if existing...
@@ -208,6 +215,16 @@ public:
    AddGrid(s, Mod);
  }

+  void SetMomentumFilter( MomentumFilterBase<typename ImplementationPolicy::Field> * MomFilter) {
+    assert(!have_Filter);    // a filter may be installed only once
+    Filter.reset(MomFilter); // the manager takes ownership of the raw pointer
+    have_Filter = true;
+  }
+  MomentumFilterBase<typename ImplementationPolicy::Field> *GetMomentumFilter(void) {
+    // Nothing installed yet: fall back to a freshly allocated MomentumFilterNone, owned by this manager
+    if (have_Filter == false) { SetMomentumFilter(new MomentumFilterNone<typename ImplementationPolicy::Field>()); }
+    return Filter.get(); // non-owning pointer; the object stays owned by Filter
+  }

  GridCartesian* GetCartesian(std::string s = "") {
    if (s.empty()) s = Grids.begin()->first;
@@ -226,6 +243,9 @@ public:
  //////////////////////////////////////////////////////
  // Random number generators
  //////////////////////////////////////////////////////
+  
+  //Return true if the RNG objects have been instantiated (reports the have_RNG flag, which the constructor sets to false)
+  bool haveRNGs() const{ return have_RNG; }

  void AddRNGs(std::string s = "") {
    // Couple the RNGs to the GridModule tagged by s
@@ -33,7 +33,6 @@ directory
 #define INTEGRATOR_INCLUDED

 #include <memory>
-#include "MomentumFilter.h"

 NAMESPACE_BEGIN(Grid);

@@ -67,6 +66,7 @@ public:
 template <class FieldImplementation, class SmearingPolicy, class RepresentationPolicy>
 class Integrator {
 protected:
+
  typedef typename FieldImplementation::Field MomentaField;  //for readability
  typedef typename FieldImplementation::Field Field;

@@ -119,36 +119,58 @@ protected:
    }
  } update_P_hireps{};

+ 
  void update_P(MomentaField& Mom, Field& U, int level, double ep) {
    // input U actually not used in the fundamental case
    // Fundamental updates, include smearing

    for (int a = 0; a < as[level].actions.size(); ++a) {
+
      double start_full = usecond();
      Field force(U.Grid());
      conformable(U.Grid(), Mom.Grid());

      Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
      double start_force = usecond();
+      as[level].actions.at(a)->deriv_timer_start(); // brackets only the deriv() call; presumably feeds the action's own timing - TODO confirm
      as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta
+      as[level].actions.at(a)->deriv_timer_stop();

      std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
+      auto name = as[level].actions.at(a)->action_name(); // used only to label the log lines below
      if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
+
      force = FieldImplementation::projectForce(force); // Ta for gauge fields
      double end_force = usecond();
-      Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites());
-      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << std::endl;
+
+      MomFilter->applyFilter(force); // filter is now applied to each force before it is measured and added to Mom
+      std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<< std::endl;
+      //      DumpSliceNorm("force ",force,Nd-1);
+      
+      Real force_abs   = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x]) 
+      Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    // same scale factor as the Mom update below
+
+      Real force_max   = std::sqrt(maxLocalNorm2(force));
+      Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR;    // same scale factor as the Mom update below
+
+      as[level].actions.at(a)->deriv_log(force_abs,force_max); // hands the average and maximum force norms to the action
+      
+      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Force average: " << force_abs <<" "<<name<<std::endl;
+      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Force max    : " << force_max <<" "<<name<<std::endl;
+      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Fdt average  : " << impulse_abs <<" "<<name<<std::endl;
+      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Fdt max      : " << impulse_max <<" "<<name<<std::endl;
+
      Mom -= force * ep* HMC_MOMENTUM_DENOMINATOR;; 
      double end_full = usecond();
      double time_full  = (end_full - start_full) / 1e3;
      double time_force = (end_force - start_force) / 1e3;
      std::cout << GridLogMessage << "["<<level<<"]["<<a<<"] P update elapsed time: " << time_full << " ms (force: " << time_force << " ms)"  << std::endl;
+
    }

    // Force from the other representations
    as[level].apply(update_P_hireps, Representations, Mom, U, ep);

-    MomFilter->applyFilter(Mom);
  }

  void update_U(Field& U, double ep) 
@@ -162,8 +184,12 @@ protected:
  
  void update_U(MomentaField& Mom, Field& U, double ep) 
  {
+    MomentaField MomFiltered(Mom.Grid());
+    MomFiltered = Mom;
+    MomFilter->applyFilter(MomFiltered);
+
    // exponential of Mom*U in the gauge fields case
-    FieldImplementation::update_field(Mom, U, ep);
+    FieldImplementation::update_field(MomFiltered, U, ep);

    // Update the smeared fields, can be implemented as observer
    Smearer.set_Field(U);
@@ -206,6 +232,66 @@ public:
  const MomentaField & getMomentum() const{ return P; }
  

+  // Calls reset_timer() on every action held in every level of the action set.
+  void reset_timer(void)
+  {
+    for (int lvl = 0; lvl < as.size(); ++lvl) {
+      auto &level_actions = as[lvl].actions;
+      for (int idx = 0; idx < level_actions.size(); ++idx) {
+        level_actions.at(idx)->reset_timer();
+      }
+    }
+  }
+  // Logs four tables through GridLogMessage, one row per action in every level:
+  //   1-3) the refresh_us, S_us and deriv_us counters, each scaled by 1.0e-6 and labelled "s"
+  //   4)   deriv_max_average(), deriv_norm_average() and deriv_num
+  void print_timer(void)
+  {
+    const char *banner     = ":::::::::::::::::::::::::::::::::::::::::";
+    const char *long_rule  = "--------------------------- ";
+    const char *short_rule = "------------------------- ";
+
+    // Three-line section heading: a rule, the title, then another rule
+    auto heading = [&](const char *above, const char *title, const char *below) {
+      std::cout << GridLogMessage << above << std::endl;
+      std::cout << GridLogMessage << title << std::endl;
+      std::cout << GridLogMessage << below << std::endl;
+    };
+
+    // One row per action: name, [level][index], then the selected counter in seconds
+    auto timing_table = [&](auto counter_us) {
+      for (int lvl = 0; lvl < as.size(); ++lvl) {
+        for (int idx = 0; idx < as[lvl].actions.size(); ++idx) {
+          auto &act = as[lvl].actions.at(idx);
+          std::cout << GridLogMessage
+                    << act->action_name()
+                    << "[" << lvl << "][" << idx << "] "
+                    << counter_us(*act) * 1.0e-6 << " s" << std::endl;
+        }
+      }
+    };
+
+    heading(banner, " Refresh cumulative timings ", long_rule);
+    timing_table([](const auto &a) { return a.refresh_us; });
+
+    heading(long_rule, " Action cumulative timings ", long_rule);
+    timing_table([](const auto &a) { return a.S_us; });
+
+    heading(long_rule, " Force cumulative timings ", short_rule);
+    timing_table([](const auto &a) { return a.deriv_us; });
+
+    heading(long_rule, " Force average size ", short_rule);
+    for (int lvl = 0; lvl < as.size(); ++lvl) {
+      for (int idx = 0; idx < as[lvl].actions.size(); ++idx) {
+        auto &act = as[lvl].actions.at(idx);
+        std::cout << GridLogMessage
+                  << act->action_name()
+                  << "[" << lvl << "][" << idx << "] : "
+                  << " force max " << act->deriv_max_average()
+                  << " norm "      << act->deriv_norm_average()
+                  << " calls "     << act->deriv_num
+                  << std::endl;
+      }
+    }
+    std::cout << GridLogMessage << banner << std::endl;
+  }
+  
  void print_parameters()
  {
    std::cout << GridLogMessage << "[Integrator] Name : "<< integrator_name() << std::endl;
@@ -224,7 +310,6 @@ public:
      }
    }
    std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl;
-
  }

  void reverse_momenta()
@@ -249,15 +334,19 @@ public:
  void refresh(Field& U,  GridSerialRNG & sRNG, GridParallelRNG& pRNG) 
  {
    assert(P.Grid() == U.Grid());
-    std::cout << GridLogIntegrator << "Integrator refresh\n";
+    std::cout << GridLogIntegrator << "Integrator refresh" << std::endl;

+    std::cout << GridLogIntegrator << "Generating momentum" << std::endl;
    FieldImplementation::generate_momenta(P, sRNG, pRNG);

    // Update the smeared fields, can be implemented as observer
    // necessary to keep the fields updated even after a reject
    // of the Metropolis
+    std::cout << GridLogIntegrator << "Updating smeared fields" << std::endl;
    Smearer.set_Field(U);
    // Set the (eventual) representations gauge fields
+
+    std::cout << GridLogIntegrator << "Updating representations" << std::endl;
    Representations.update(U);

    // The Smearer is attached to a pointer of the gauge field
@@ -267,15 +356,19 @@ public:
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
        // get gauge field from the SmearingPolicy and
        // based on the boolean is_smeared in actionID
+	auto name = as[level].actions.at(actionID)->action_name();
+        std::cout << GridLogMessage << "refresh [" << level << "][" << actionID << "] "<<name << std::endl;
+
        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
+	as[level].actions.at(actionID)->refresh_timer_start();
        as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG);
+	as[level].actions.at(actionID)->refresh_timer_stop();
      }

      // Refresh the higher representation actions
      as[level].apply(refresh_hireps, Representations, sRNG, pRNG);
    }

-    MomFilter->applyFilter(P);
  }

  // to be used by the actionlevel class to iterate
@@ -310,7 +403,9 @@ public:
        // based on the boolean is_smeared in actionID
        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
+	        as[level].actions.at(actionID)->S_timer_start();
        Hterm = as[level].actions.at(actionID)->S(Us);
+   	        as[level].actions.at(actionID)->S_timer_stop();
        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
        H += Hterm;
      }
@@ -88,6 +88,12 @@ namespace PeriodicBC {
    return CovShiftBackward(Link,mu,arg);
  }

+  //Boundary-aware C-shift of gauge links / gauge transformation matrices.
+  //This variant simply forwards to Cshift(Link, mu, shift) and returns its result.
+  template<class gauge> Lattice<gauge>
+  CshiftLink(const Lattice<gauge> &Link, int mu, int shift)
+  {
+    Lattice<gauge> shifted = Cshift(Link, mu, shift);
+    return shifted;
+  }

 }

@@ -158,6 +164,9 @@ namespace ConjugateBC {
    //    std::cout<<"Gparity::CovCshiftBackward mu="<<mu<<std::endl;
    return Cshift(tmp,mu,-1);// moves towards positive mu
  }
+
+  //Out(x) = U^dag_\mu(x-mu)  | x_\mu != 0
+  //       = U^T_\mu(L-1)  | x_\mu == 0
  template<class gauge> Lattice<gauge>
  CovShiftIdentityBackward(const Lattice<gauge> &Link, int mu) {
    GridBase *grid = Link.Grid();
@@ -176,6 +185,9 @@ namespace ConjugateBC {
    return Link;
  }

+  //Out(x) = S_\mu(x+\hat\mu)  | x_\mu != L-1
+  //       = S*_\mu(0)  | x_\mu == L-1
+  //Note: While this is used for Staples it is also applicable for shifting gauge links or gauge transformation matrices
  template<class gauge> Lattice<gauge>
  ShiftStaple(const Lattice<gauge> &Link, int mu)
  {
@@ -208,6 +220,35 @@ namespace ConjugateBC {
    return CovShiftBackward(Link,mu,arg);
  }

+  //Boundary-aware C-shift of gauge links / gauge transformation matrices.
+  //Only shift = +1 and shift = -1 are accepted; any other value trips the assert.
+  //  shift = +1 :  Out(x) = U_\mu(x+\hat\mu)   for x_\mu != L-1
+  //                       = U*_\mu(0)          for x_\mu == L-1
+  //  shift = -1 :  Out(x) = U_\mu(x-\hat\mu)   for x_\mu != 0
+  //                       = U*_\mu(L-1)        for x_\mu == 0
+  template<class gauge> Lattice<gauge>
+  CshiftLink(const Lattice<gauge> &Link, int mu, int shift)
+  {
+    GridBase *grid = Link.Grid();
+    int lastSite = grid->GlobalDimensions()[mu] - 1; //coordinate L-1 in direction mu
+
+    Lattice<iScalar<vInteger>> coor(grid);
+    LatticeCoordinate(coor, mu);
+
+    Lattice<gauge> out(grid);
+    switch(shift){
+    case 1:
+      //Shift first, then conjugate the sites at L-1
+      out = Cshift(Link, mu, 1);
+      out = where(coor == lastSite, conjugate(out), out);
+      return out;
+    case -1:
+      //Conjugate the sites at L-1 first, then shift
+      out = Link;
+      out = where(coor == lastSite, conjugate(out), out);
+      return Cshift(out, mu, -1);
+    default:
+      assert(0 && "Invalid shift value");
+    }
+    return out; //keeps the compiler quiet about the missing return on the assert path
+  }
+
 }


--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Peter Boyle	7319d4e1ad	Merge pull request #407 from giltirn/feature/dirichlet-gparity-stage Import round 4	2022-06-22 15:23:36 -04:00
Christopher Kelly	fd933420c6	Imported changes from feature/gparity_HMC branch: Added a bounds-check function for the RHMC with arbitrary power Added a pseudofermion action for the rational ratio with an arbitrary power and a mixed-precision variant of the same. The existing one-flavor rational ratio class now uses the general class under the hood To support testing of the two-flavor even-odd ratio pseudofermion, separated the functionality of generating the random field and performing the heatbath step, and added a method to obtain the pseudofermion field Added a new HMC runner start type: CheckpointStartReseed, which reseeds the RNG from scratch, allowing for the creation of new evolution streams from an existing checkpoint. Added log output of seeds used when the RNG is seeded. EOFA changes: To support mixed-precision inversion, generalized the class to maintain a separate solver for the L and R operators in the heatbath (separate solvers are already implemented for the other stages) To support mixed-precision, the action of setting the operator shift coefficients is now maintained in a virtual function. A derived class for mixed-precision solvers ensures the coefficients are applied to both the double and single-prec operators The \|\|^2 of the random source is now stored by the heatbath and compared to the initial action when it is computed. These should be equal but may differ if the rational bounds are not chosen correctly, hence serving as a useful and free test Fixed calculation of M_eofa (previously incomplete and #if'd out) Added functionality to compute M_eofa^-1 to complement the calculation of M_eofa (both are equally expensive!) To support testing, separated the functionality of generating the random field and performing the heatbath step, and added a method to obtain the pseudofermion field Added a test program which computes the G-parity force using the 1 and 2 flavor implementations and compares the result. 
Test supports DWF, EOFA and DSDR actions, chosen by a command line option. The Mobius EOFA force test now also checks the rational approximation used for the heatbath Added a test program for the mixed precision EOFA compared to the double-prec implementation, G-parity HMC test now applied GPBC in the y direction and not the t direction (GPBC in t are no longer supported) and checkpoints after every configuration Added a test program which computes the two-flavor G-parity action (via RHMC) with both the 1 and 2 flavor implementations and checks they agree Added a test program to check the implementation of M_eofa^{-1}	2022-06-22 10:27:48 -04:00
Peter Boyle	9a9f4a111f	Merge pull request #405 from giltirn/feature/dirichlet-gparity-stage Import round 3	2022-06-06 18:45:37 -04:00
Christopher Kelly	1ad54d049d	To PeriodicBC and ConjugateBC, added a new function "CshiftLink" which performs a boundary-aware C-shift of links or products of links. For the latter, the links crossing the global boundary are complex-conjugated. To the gauge implementations, added CshiftLink functions calling into the appropriate operation for the BC in a given direction. GaugeTransform, FourierAcceleratedGaugeFixer and WilsonLoops::FieldStrength no longer implicitly assume periodic boundary conditions; instead the shifted link is obtained using CshiftLink and is aware of the gauge implementation. Added an assert-check to ensure that the gauge fixing converges within the specified number of steps. Added functionality to compute the timeslice averaged plaquette Added functionality to compute the 5LI topological charge and timeslice topological charge Added a check of the properties of the charge conjugation matrix C=-gamma_2 gamma_4 to Test_gamma Fixed const correctness for Replicate Modified Test_fft_gfix to support either conjugate or periodic BCs, optionally disabling Fourier-accelerated gauge fixing, and tuning of alpha using cmdline options	2022-06-02 15:30:41 -04:00
Peter Boyle	d3496d2fe0	Merge pull request #397 from giltirn/feature/dirichlet-gparity-stage Gparity HMC import round 2	2022-05-25 13:29:45 -04:00
Christopher Kelly	6121397587	Imported changes from feature/gparity_HMC branch: Added storage of final true residual in mixed-prec CG and enhanced log output Fixed const correctness of multi-shift constructor Added a mixed precision variant of the multi-shift algorithm that uses a single precision operator and applies periodic reliable update to the residual Added tests/solver/Test_dwf_multishift_mixedprec to test the above Fixed local coherence lanczos using the (large!) max approx to the chebyshev eval as the scale from which to judge the quality of convergence, resulting a test that always passes Added a method to local coherence lanczos class that returns the fine eval/evec pair Added iterative log output to power method Added optional disabling of the plaquette check in Nerscio to support loading old G-parity configs which have a factor of 2 error in the plaquette G-parity Dirac op no longer allows GPBC in the time direction; instead we toggle between periodic and antiperiodic Replaced thread_for G-parity 5D force insertion implementation with accelerator_for version capable of running on GPUs Generalized tests/lanczos/Test_dwf_lanczos to support regular DWF as well as Gparity, with the action chosen by a command line option Modified tests/forces/Test_dwf_gpforce,Test_gpdwf_force,Test_gpwilson_force to use GPBC a spatial direction rather than the t-direction, and antiperiodic BCs for time direction tests/core/Test_gparity now supports using APBC in time direction using command line toggle	2022-05-09 16:27:57 -04:00
Peter Boyle	0417b96896	Merge pull request #391 from giltirn/feature/dirichlet-gparity-stage First stage of import	2022-05-03 08:50:18 -04:00
Christopher Kelly	81fe4c937e	Hopefully fix link errors on Intel compilers due to having no function body for MomentumFilterBase::apply_phase	2022-04-12 09:51:59 -04:00
Christopher Kelly	f77f3a6598	Imported G-parity flavor algebra + tester from feature/gparity_HMC branch	2022-04-06 10:21:04 -04:00
Peter Boyle	239afb18fb	Merge branch 'feature/dirichlet' into feature/dirichlet-gparity	2022-04-05 16:49:32 -04:00
Peter Boyle	ef820a26cd	Bcopy on crusher compile	2022-04-05 16:49:02 -04:00
Peter Boyle	65abe4d0d3	Merge branch 'feature/dirichlet' into feature/dirichlet-gparity	2022-04-05 16:26:54 -04:00
Peter Boyle	5012adfebf	Merge branch 'develop' into feature/dirichlet	2022-04-05 16:26:19 -04:00
Peter Boyle	b808d48fa1	Tone down printing in integrator	2022-04-05 16:25:22 -04:00
Peter Boyle	83f818a99d	Updates for DDHMC	2022-04-05 16:24:34 -04:00
Peter Boyle	387397374a	Current run options	2022-03-23 16:35:11 -04:00
Peter Boyle	bb5c16b97f	New scripts	2022-03-03 17:00:37 -05:00
Peter Boyle	0d80eeb545	small DDHMC update	2022-03-03 16:56:02 -05:00
Peter Boyle	b0f4eee78b	New files	2022-03-01 19:09:13 -05:00
Peter Boyle	5340e50427	HMC running with new formulation	2022-03-01 17:10:25 -05:00
Peter Boyle	0f1c5b08a1	Dirichlet filters running on AMD and now integrated in Fermion op	2022-02-23 19:29:28 -05:00
Peter Boyle	70988e43d2	Passes multinode dirichlet test with boundaries at node boundary or at the single rank boundary	2022-02-23 01:42:14 -05:00
Peter Boyle	aab3bcb46f	Dirichlet first cut - wrong answers on dagger multiply. Struggling to get a compute node so changing systems	2022-02-22 19:58:33 +00:00
Peter Boyle	da06d15f73	Merge branch 'feature/feature/staggered-comms' into develop	2022-02-17 04:58:50 +00:00
Peter Boyle	e8b1251b8c	Staggered fix finished	2022-02-17 04:51:13 +00:00
Peter Boyle	fad5a74a4b	Bug fix to detection case	2022-02-15 10:27:39 -05:00
Peter Boyle	e83f6a6ae9	Merge branch 'develop' into feature/feature/staggered-comms	2022-02-15 08:52:39 -05:00
Azusa Yamaguchi	6283d11d50	Add the comment line to tell the existence of copied data/buffer	2022-02-08 15:22:06 +00:00
Peter Boyle	6616d5d090	Commit	2022-02-02 16:38:24 -05:00