Update to static data

Update to memory manager, never have a Cpu Open in the LRU queue. Place as evict next on CPU closure.
2026-06-19 02:13:42 +01:00 · 2021-12-07 23:41:27 +00:00 · 2021-12-07 17:26:22 -05:00
179 changed files with 2493 additions and 14963 deletions
@@ -44,22 +44,14 @@ directory
 #ifdef __NVCC__
 //disables nvcc specific warning in json.hpp
 #pragma clang diagnostic ignored "-Wdeprecated-register"
 #if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
 //disables nvcc specific warning in json.hpp
 #pragma nv_diag_suppress unsigned_compare_with_zero
 #pragma nv_diag_suppress cast_to_qualified_type
 //disables nvcc specific warning in many files
 #pragma nv_diag_suppress esa_on_defaulted_function_ignored
 #pragma nv_diag_suppress extra_semicolon
 #else
 //disables nvcc specific warning in json.hpp
 #pragma diag_suppress unsigned_compare_with_zero
 #pragma diag_suppress cast_to_qualified_type
 //disables nvcc specific warning in many files
 #pragma diag_suppress esa_on_defaulted_function_ignored
 #pragma diag_suppress extra_semicolon
-#endif
+
 //Eigen only
 #endif
 // Disable vectorisation in Eigen on the Power8/9 and PowerPC
@@ -36,7 +36,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridCore.h>
 #include <Grid/qcd/QCD.h>
 #include <Grid/qcd/spin/Spin.h>
 #include <Grid/qcd/gparity/Gparity.h>
 #include <Grid/qcd/utils/Utils.h>
 #include <Grid/qcd/representations/Representations.h>
 NAMESPACE_CHECK(GridQCDCore);
@@ -16,7 +16,6 @@
 #include <functional>
 #include <stdio.h>
 #include <stdlib.h>
 #include <strings.h>
 #include <stdio.h>
 #include <signal.h>
 #include <ctime>
@@ -14,11 +14,7 @@
 /* NVCC save and restore compile environment*/
 #ifdef __NVCC__
 #pragma push
 #if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
 #pragma nv_diag_suppress code_is_unreachable
 #else
 #pragma diag_suppress code_is_unreachable
 #endif
 #pragma push_macro("__CUDA_ARCH__")
 #pragma push_macro("__NVCC__")
 #pragma push_macro("__CUDACC__")
@@ -54,7 +54,6 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
 #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
@@ -120,9 +120,6 @@ public:
    SolverTimer.Start();
    int k;
    for (k = 1; k <= MaxIterations; k++) {
      GridStopWatch IterationTimer;
      IterationTimer.Start();
      c = cp;
      MatrixTimer.Start();
@@ -155,14 +152,8 @@ public:
      LinearCombTimer.Stop();
      LinalgTimer.Stop();
-      IterationTimer.Stop();
+      std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
      if ( (k % 500) == 0 ) {
 	std::cout << GridLogMessage << "ConjugateGradient: Iteration " << k
                << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
      } else { 
 	std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
 		  << " residual " << sqrt(cp/ssq) << " target " << Tolerance << " took " << IterationTimer.Elapsed() << std::endl;
      }
      // Stopping condition
      if (cp <= rsq) {
@@ -179,13 +170,13 @@ public:
 		  << "\tTrue residual " << true_residual
 		  << "\tTarget " << Tolerance << std::endl;
-        std::cout << GridLogMessage << "Time breakdown "<<std::endl;
+        std::cout << GridLogIterative << "Time breakdown "<<std::endl;
-	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
@@ -49,7 +49,6 @@ NAMESPACE_BEGIN(Grid);
    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
    RealD TrueResidual;
    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
    LinearFunction<FieldF> *guesser;
@@ -69,7 +68,6 @@ NAMESPACE_BEGIN(Grid);
    }
  void operator() (const FieldD &src_d_in, FieldD &sol_d){
    std::cout << GridLogMessage << "MixedPrecisionConjugateGradient: Starting mixed precision CG with outer tolerance " << Tolerance << " and inner tolerance " << InnerTolerance << std::endl;
    TotalInnerIterations = 0;
    GridStopWatch TotalTimer;
@@ -99,7 +97,6 @@ NAMESPACE_BEGIN(Grid);
    FieldF sol_f(SinglePrecGrid);
    sol_f.Checkerboard() = cb;
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting initial inner CG with tolerance " << inner_tol << std::endl;
    ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
    CG_f.ErrorOnNoConverge = false;
@@ -133,7 +130,6 @@ NAMESPACE_BEGIN(Grid);
 	(*guesser)(src_f, sol_f);
      //Inner CG
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " << outer_iter << " starting inner CG with tolerance " << inner_tol << std::endl;
      CG_f.Tolerance = inner_tol;
      InnerCGtimer.Start();
      CG_f(Linop_f, src_f, sol_f);
@@ -154,7 +150,6 @@ NAMESPACE_BEGIN(Grid);
    ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
    CG_d(Linop_d, src_d_in, sol_d);
    TotalFinalStepIterations = CG_d.IterationsToComplete;
    TrueResidual = CG_d.TrueResidual;
    TotalTimer.Stop();
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
@@ -44,7 +44,7 @@ public:
  using OperatorFunction<Field>::operator();
-  //  RealD   Tolerance;
+  RealD   Tolerance;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
@@ -52,7 +52,7 @@ public:
  MultiShiftFunction shifts;
  std::vector<RealD> TrueResidualShift;
-  ConjugateGradientMultiShift(Integer maxit, const MultiShiftFunction &_shifts) : 
+  ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : 
    MaxIterations(maxit),
    shifts(_shifts)
  { 
@@ -182,9 +182,6 @@ public:
    for(int s=0;s<nshift;s++) {
      axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
    }
    std::cout << GridLogIterative << "ConjugateGradientMultiShift: initial rn (|src|^2) =" << rn << " qq (|MdagM src|^2) =" << qq << " d ( dot(src, [MdagM + m_0]src) ) =" << d << " c=" << c << std::endl;
  ///////////////////////////////////////
  // Timers
@@ -324,8 +321,8 @@ public:
      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
-      std::cout << GridLogMessage << "\tAXPY     " << AXPYTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
-      std::cout << GridLogMessage << "\tMatrix   " << MatrixTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tMarix    " << MatrixTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
      IterationsToComplete = k;	
@@ -1,409 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Christopher Kelly <ckelly@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
 #define GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
 NAMESPACE_BEGIN(Grid);
 //CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision. 
 //The residual is stored in single precision, but the search directions and solution are stored in double precision. 
 //Every update_freq iterations the residual is corrected in double precision. 
 //For safety the a final regular CG is applied to clean up if necessary
 //Linop to add shift to input linop, used in cleanup CG
 namespace ConjugateGradientMultiShiftMixedPrecSupport{
 template<typename Field>
 class ShiftedLinop: public LinearOperatorBase<Field>{
 public:
  LinearOperatorBase<Field> &linop_base;
  RealD shift;
  ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift): linop_base(_linop_base), shift(_shift){}
  void OpDiag (const Field &in, Field &out){ assert(0); }
  void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
  void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); }
  void Op     (const Field &in, Field &out){ assert(0); }
  void AdjOp  (const Field &in, Field &out){ assert(0); }
  void HermOp(const Field &in, Field &out){
    linop_base.HermOp(in, out);
    axpy(out, shift, in, out);
  }    
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    HermOp(in,out);
    ComplexD dot = innerProduct(in,out);
    n1=real(dot);
    n2=norm2(out);
  }
 };
 };
 template<class FieldD, class FieldF,
 	 typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
 	 typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
 class ConjugateGradientMultiShiftMixedPrec : public OperatorMultiFunction<FieldD>,
 					     public OperatorFunction<FieldD>
 {
 public:                                                
  using OperatorFunction<FieldD>::operator();
  RealD   Tolerance;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
  int verbose;
  MultiShiftFunction shifts;
  std::vector<RealD> TrueResidualShift;
  int ReliableUpdateFreq; //number of iterations between reliable updates
  GridBase* SinglePrecGrid; //Grid for single-precision fields
  LinearOperatorBase<FieldF> &Linop_f; //single precision
  ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
 				       GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
 				       int _ReliableUpdateFreq
 				       ) : 
    MaxIterations(maxit),  shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq)
  { 
    verbose=1;
    IterationsToCompleteShift.resize(_shifts.order);
    TrueResidualShift.resize(_shifts.order);
  }
  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
  {
    GridBase *grid = src.Grid();
    int nshift = shifts.order;
    std::vector<FieldD> results(nshift,grid);
    (*this)(Linop,src,results,psi);
  }
  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
  {
    int nshift = shifts.order;
    (*this)(Linop,src,results);
    psi = shifts.norm*src;
    for(int i=0;i<nshift;i++){
      psi = psi + shifts.residues[i]*results[i];
    }
    return;
  }
  void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
  { 
    GridBase *DoublePrecGrid = src_d.Grid();
    ////////////////////////////////////////////////////////////////////////
    // Convenience references to the info stored in "MultiShiftFunction"
    ////////////////////////////////////////////////////////////////////////
    int nshift = shifts.order;
    std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
    std::vector<RealD> &mresidual(shifts.tolerances);
    std::vector<RealD> alpha(nshift,1.0);
    //Double precision search directions
    FieldD p_d(DoublePrecGrid);
    std::vector<FieldD> ps_d(nshift, DoublePrecGrid);// Search directions (double precision)
    FieldD tmp_d(DoublePrecGrid);
    FieldD r_d(DoublePrecGrid);
    FieldD mmp_d(DoublePrecGrid);
    assert(psi_d.size()==nshift);
    assert(mass.size()==nshift);
    assert(mresidual.size()==nshift);
    // dynamic sized arrays on stack; 2d is a pain with vector
    RealD  bs[nshift];
    RealD  rsq[nshift];
    RealD  z[nshift][2];
    int     converged[nshift];
    const int       primary =0;
    //Primary shift fields CG iteration
    RealD a,b,c,d;
    RealD cp,bp,qq; //prev
    // Matrix mult fields
    FieldF r_f(SinglePrecGrid);
    FieldF p_f(SinglePrecGrid);
    FieldF tmp_f(SinglePrecGrid);
    FieldF mmp_f(SinglePrecGrid);
    FieldF src_f(SinglePrecGrid);
    precisionChange(src_f, src_d);
    // Check lightest mass
    for(int s=0;s<nshift;s++){
      assert( mass[s]>= mass[primary] );
      converged[s]=0;
    }
    // Wire guess to zero
    // Residuals "r" are src
    // First search direction "p" is also src
    cp = norm2(src_d);
    // Handle trivial case of zero src.
    if( cp == 0. ){
      for(int s=0;s<nshift;s++){
 	psi_d[s] = Zero();
 	IterationsToCompleteShift[s] = 1;
 	TrueResidualShift[s] = 0.;
      }
      return;
    }
    for(int s=0;s<nshift;s++){
      rsq[s] = cp * mresidual[s] * mresidual[s];
      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
      ps_d[s] = src_d;
    }
    // r and p for primary
    r_f=src_f; //residual maintained in single
    p_f=src_f;
    p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
    //MdagM+m[0]
    Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
    axpy(mmp_f,mass[0],p_f,mmp_f);
    RealD rn = norm2(p_f);
    d += rn*mass[0];
    b = -cp /d;
    // Set up the various shift variables
    int       iz=0;
    z[0][1-iz] = 1.0;
    z[0][iz]   = 1.0;
    bs[0]      = b;
    for(int s=1;s<nshift;s++){
      z[s][1-iz] = 1.0;
      z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0]));
      bs[s]      = b*z[s][iz]; 
    }
    // r += b[0] A.p[0]
    // c= norm(r)
    c=axpy_norm(r_f,b,mmp_f,r_f);
    for(int s=0;s<nshift;s++) {
      axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
    }
    ///////////////////////////////////////
    // Timers
    ///////////////////////////////////////
    GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
    SolverTimer.Start();
    // Iteration loop
    int k;
    for (k=1;k<=MaxIterations;k++){    
      a = c /cp;
      //Update double precision search direction by residual
      PrecChangeTimer.Start();
      precisionChange(r_d, r_f);
      PrecChangeTimer.Stop();
      AXPYTimer.Start();
      axpy(p_d,a,p_d,r_d); 
      for(int s=0;s<nshift;s++){
 	if ( ! converged[s] ) { 
 	  if (s==0){
 	    axpy(ps_d[s],a,ps_d[s],r_d);
 	  } else{
 	    RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
 	    axpby(ps_d[s],z[s][iz],as,r_d,ps_d[s]);
 	  }
 	}
      }
      AXPYTimer.Stop();
      PrecChangeTimer.Start();
      precisionChange(p_f, p_d); //get back single prec search direction for linop
      PrecChangeTimer.Stop();
      cp=c;
      MatrixTimer.Start();  
      Linop_f.HermOp(p_f,mmp_f); 
      d=real(innerProduct(p_f,mmp_f));    
      MatrixTimer.Stop();  
      AXPYTimer.Start();
      axpy(mmp_f,mass[0],p_f,mmp_f);
      AXPYTimer.Stop();
      RealD rn = norm2(p_f);
      d += rn*mass[0];
      bp=b;
      b=-cp/d;
      // Toggle the recurrence history
      bs[0] = b;
      iz = 1-iz;
      ShiftTimer.Start();
      for(int s=1;s<nshift;s++){
 	if((!converged[s])){
 	  RealD z0 = z[s][1-iz];
 	  RealD z1 = z[s][iz];
 	  z[s][iz] = z0*z1*bp
 	    / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b)); 
 	  bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
 	}
      }
      ShiftTimer.Stop();
      //Update double precision solutions
      AXPYTimer.Start();
      for(int s=0;s<nshift;s++){
 	int ss = s;
 	if( (!converged[s]) ) { 
 	  axpy(psi_d[ss],-bs[s]*alpha[s],ps_d[s],psi_d[ss]);
 	}
      }
      //Perform reliable update if necessary; otherwise update residual from single-prec mmp
      RealD c_f = axpy_norm(r_f,b,mmp_f,r_f);
      AXPYTimer.Stop();
      c = c_f;
      if(k % ReliableUpdateFreq == 0){
 	//Replace r with true residual
 	MatrixTimer.Start();  
 	Linop_d.HermOp(psi_d[0],mmp_d); 
 	MatrixTimer.Stop();  
 	AXPYTimer.Start();
 	axpy(mmp_d,mass[0],psi_d[0],mmp_d);
 	RealD c_d = axpy_norm(r_d, -1.0, mmp_d, src_d);
 	AXPYTimer.Stop();
 	std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_f <<" with |r|^2 = "<<c_d<<std::endl;
 	PrecChangeTimer.Start();
 	precisionChange(r_f, r_d);
 	PrecChangeTimer.Stop();
 	c = c_d;
      }
      // Convergence checks
      int all_converged = 1;
      for(int s=0;s<nshift;s++){
 	if ( (!converged[s]) ){
 	  IterationsToCompleteShift[s] = k;
 	  RealD css  = c * z[s][iz]* z[s][iz];
 	  if(css<rsq[s]){
 	    if ( ! converged[s] )
 	      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
 	    converged[s]=1;
 	  } else {
 	    all_converged=0;
 	  }
 	}
      }
      if ( all_converged ){
 	SolverTimer.Stop();
 	std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
 	std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
 	// Check answers 
 	for(int s=0; s < nshift; s++) { 
 	  Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
 	  axpy(tmp_d,mass[s],psi_d[s],mmp_d);
 	  axpy(r_d,-alpha[s],src_d,tmp_d);
 	  RealD rn = norm2(r_d);
 	  RealD cn = norm2(src_d);
 	  TrueResidualShift[s] = std::sqrt(rn/cn);
 	  std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
 	  //If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
 	  if(rn >= rsq[s]){
 	    CleanupTimer.Start();
 	    std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: performing cleanup step for shift " << s << std::endl;
 	    //Setup linear operators for final cleanup
 	    ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
 	    ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
 	    MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d); 
 	    cg(src_d, psi_d[s]);
 	    TrueResidualShift[s] = cg.TrueResidual;
 	    CleanupTimer.Stop();
 	  }
 	}
 	std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrec: Time Breakdown for body"<<std::endl;
 	std::cout << GridLogMessage << "\tSolver    " << SolverTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tMatrix    " << MatrixTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
 	IterationsToComplete = k;	
 	return;
      }
    }
    // ugly hack
    std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
    //  assert(0);
  }
 };
 NAMESPACE_END(Grid);
 #endif
@@ -113,43 +113,7 @@ public:
    blockPromote(guess_coarse,guess,subspace);
    guess.Checkerboard() = src.Checkerboard();
  };
-
+};
  void operator()(const std::vector<FineField> &src,std::vector<FineField> &guess) {
    int Nevec = (int)evec_coarse.size();
    int Nsrc = (int)src.size();
    // make temp variables
    std::vector<CoarseField> src_coarse(Nsrc,evec_coarse[0].Grid());
    std::vector<CoarseField> guess_coarse(Nsrc,evec_coarse[0].Grid());    
    //Preporcessing
    std::cout << GridLogMessage << "Start BlockProject for loop" << std::endl;
    for (int j=0;j<Nsrc;j++)
    {
    guess_coarse[j] = Zero();
    std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
    blockProject(src_coarse[j],src[j],subspace);
    }
    //deflation set up for eigen vector batchsize 1 and source batch size equal number of sources
    std::cout << GridLogMessage << "Start ProjectAccum for loop" << std::endl;
    for (int i=0;i<Nevec;i++)
    {
      std::cout << GridLogMessage << "ProjectAccum Nvec: " << i << std::endl;
      const CoarseField & tmp = evec_coarse[i];
      for (int j=0;j<Nsrc;j++)
      {
        axpy(guess_coarse[j],TensorRemove(innerProduct(tmp,src_coarse[j])) / eval_coarse[i],tmp,guess_coarse[j]);
      }
    }
    //postprocessing
    std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl;
    for (int j=0;j<Nsrc;j++)
    {
    std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
    blockPromote(guess_coarse[j],guess[j],subspace);
    guess[j].Checkerboard() = src[j].Checkerboard();
    }
  };
  };
@@ -44,7 +44,6 @@ public:
 				  int, MinRes);    // Must restart
 };
 //This class is the input parameter class for some testing programs
 struct LocalCoherenceLanczosParams : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
@@ -146,24 +145,16 @@ public:
  LinearOperatorBase<FineField> &_Linop;
  RealD                             _coarse_relax_tol;
  std::vector<FineField>        &_subspace;
  int _largestEvalIdxForReport; //The convergence of the LCL is based on the evals of the coarse grid operator, not those of the underlying fine grid operator
                                //As a result we do not know what the eval range of the fine operator is until the very end, making tuning the Cheby bounds very difficult
                                //To work around this issue, every restart we separately reconstruct the fine operator eval for the lowest and highest evec and print these
                                //out alongside the evals of the coarse operator. To do so we need to know the index of the largest eval (i.e. Nstop-1)
                                //NOTE: If largestEvalIdxForReport=-1 (default) then this is not performed
  ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField>   &Poly,
 					   OperatorFunction<FineField>   &smoother,
 					   LinearOperatorBase<FineField> &Linop,
 					   std::vector<FineField>        &subspace,
-					   RealD coarse_relax_tol=5.0e3,
+					   RealD coarse_relax_tol=5.0e3) 
 					   int largestEvalIdxForReport=-1) 
    : _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace),
-      _coarse_relax_tol(coarse_relax_tol), _largestEvalIdxForReport(largestEvalIdxForReport)
+      _coarse_relax_tol(coarse_relax_tol)  
  {    };
  //evalMaxApprox: approximation of largest eval of the fine Chebyshev operator (suitably wrapped by block projection)
  int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  {
    CoarseField v(B);
@@ -186,26 +177,12 @@ public:
 	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
 	     <<std::endl;
    if(_largestEvalIdxForReport != -1 && (j==0 || j==_largestEvalIdxForReport)){
      std::cout<<GridLogIRL << "Estimating true eval of fine grid operator for eval idx " << j << std::endl;
      RealD tmp_eval;
      ReconstructEval(j,eresid,B,tmp_eval,1.0); //don't use evalMaxApprox of coarse operator! (cf below)
    }
    int conv=0;
    if( (vv<eresid*eresid) ) conv = 1;
    return conv;
  }
-
+  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  //This function is called at the end of the coarse grid Lanczos. It promotes the coarse eigenvector 'B' to the fine grid,
  //applies a smoother to the result then computes the computes the *fine grid* eigenvalue (output as 'eval').
  //evalMaxApprox should be the approximation of the largest eval of the fine Hermop. However when this function is called by IRL it actually passes the largest eval of the *Chebyshev* operator (as this is the max approx used for the TestConvergence above)
  //As the largest eval of the Chebyshev is typically several orders of magnitude larger this makes the convergence test pass even when it should not.
  //We therefore ignore evalMaxApprox here and use a value of 1.0 (note this value is already used by TestCoarse)
  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)  
  {
    evalMaxApprox = 1.0; //cf above
    GridBase *FineGrid = _subspace[0].Grid();    
    int checkerboard   = _subspace[0].Checkerboard();
    FineField fB(FineGrid);fB.Checkerboard() =checkerboard;
@@ -224,13 +201,13 @@ public:
    eval   = vnum/vden;
    fv -= eval*fB;
    RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);
-    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
+
    std::cout.precision(13);
    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
 	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
-	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv << " target " << eresid*eresid
+	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
 	     <<std::endl;
    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
    if( (vv<eresid*eresid) ) return 1;
    return 0;
  }
@@ -308,10 +285,6 @@ public:
    evals_coarse.resize(0);
  };
  //The block inner product is the inner product on the fine grid locally summed over the blocks
  //to give a Lattice<Scalar> on the coarse grid. This function orthnormalizes the fine-grid subspace
  //vectors under the block inner product. This step must be performed after computing the fine grid
  //eigenvectors and before computing the coarse grid eigenvectors.    
  void Orthogonalise(void ) {
    CoarseScalar InnerProd(_CoarseGrid);
    std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
@@ -355,8 +328,6 @@ public:
    }
  }
  //While this method serves to check the coarse eigenvectors, it also recomputes the eigenvalues from the smoothed reconstructed eigenvectors
  //hence the smoother can be tuned after running the coarse Lanczos by using a different smoother here
  void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) 
  {
    assert(evals_fine.size() == nbasis);
@@ -405,31 +376,25 @@ public:
    evals_fine.resize(nbasis);
    subspace.resize(nbasis,_FineGrid);
  }
  //cheby_op: Parameters of the fine grid Chebyshev polynomial used for the Lanczos acceleration
  //cheby_smooth: Parameters of a separate Chebyshev polynomial used after the Lanczos has completed to smooth out high frequency noise in the reconstructed fine grid eigenvectors prior to computing the eigenvalue
  //relax: Reconstructed eigenvectors (post smoothing) are naturally not as precise as true eigenvectors. This factor acts as a multiplier on the stopping condition when determining whether the results satisfy the user provided stopping condition
  void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
 		  int Nstop, int Nk, int Nm,RealD resid, 
 		  RealD MaxIt, RealD betastp, int MinRes)
  {
-    Chebyshev<FineField>                          Cheby(cheby_op); //Chebyshev of fine operator on fine grid
+    Chebyshev<FineField>                          Cheby(cheby_op);
-    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace); //Fine operator on coarse grid with intermediate fine grid conversion
+    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace);
-    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace); //Chebyshev of fine operator on coarse grid with intermediate fine grid conversion
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
    //////////////////////////////////////////////////////////////////////////////////////////////////
-    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth); //lower order Chebyshev of fine operator on fine grid used to smooth regenerated eigenvectors
+    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth);
-    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax,Nstop-1); 
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
    evals_coarse.resize(Nm);
    evec_coarse.resize(Nm,_CoarseGrid);
    CoarseField src(_CoarseGrid);     src=1.0; 
    //Note the "tester" here is also responsible for generating the fine grid eigenvalues which are output into the "evals_coarse" array
    ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
    int Nconv=0;
    IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
@@ -440,14 +405,6 @@ public:
      std::cout << i << " Coarse eval = " << evals_coarse[i]  << std::endl;
    }
  }
  //Get the fine eigenvector 'i' by reconstruction
  void getFineEvecEval(FineField &evec, RealD &eval, const int i) const{
    blockPromote(evec_coarse[i],evec,subspace);  
    eval = evals_coarse[i];
  }
 };
 NAMESPACE_END(Grid);
@@ -29,8 +29,6 @@ template<class Field> class PowerMethod
      RealD vnum = real(innerProduct(src_n,tmp)); // HermOp. 
      RealD vden = norm2(src_n); 
      RealD na = vnum/vden; 
      std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
      if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) { 
 	evalMaxApprox = na; 
@@ -113,6 +113,11 @@ private:
  static uint64_t     DeviceToHostBytes;
  static uint64_t     HostToDeviceXfer;
  static uint64_t     DeviceToHostXfer;
  static uint64_t     DeviceAccesses;
  static uint64_t     HostAccesses;
  static uint64_t     DeviceAccessBytes;
  static uint64_t     HostAccessBytes;
 private:
 #ifndef GRID_UVM
@@ -152,6 +157,7 @@ private:
  //  static void  LRUupdate(AcceleratorViewEntry &AccCache);
  static void  LRUinsert(AcceleratorViewEntry &AccCache);
  static void  LRUinsertback(AcceleratorViewEntry &AccCache);
  static void  LRUremove(AcceleratorViewEntry &AccCache);
  // manage entries in the table
@@ -23,6 +23,11 @@ uint64_t  MemoryManager::HostToDeviceBytes;
 uint64_t  MemoryManager::DeviceToHostBytes;
 uint64_t  MemoryManager::HostToDeviceXfer;
 uint64_t  MemoryManager::DeviceToHostXfer;
 uint64_t  MemoryManager::DeviceAccesses;
 uint64_t  MemoryManager::HostAccesses;
 uint64_t  MemoryManager::DeviceAccessBytes;
 uint64_t  MemoryManager::HostAccessBytes;
 ////////////////////////////////////
 // Priority ordering for unlocked entries
@@ -86,6 +91,14 @@ void  MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
  AccCache.LRU_valid = 1;
  DeviceLRUBytes+=AccCache.bytes;
 }
 void  MemoryManager::LRUinsertback(AcceleratorViewEntry &AccCache)
 {
  assert(AccCache.LRU_valid==0);
  LRU.push_back(AccCache.CpuPtr);
  AccCache.LRU_entry = --LRU.end();
  AccCache.LRU_valid = 1;
  DeviceLRUBytes+=AccCache.bytes;
 }
 void  MemoryManager::LRUremove(AcceleratorViewEntry &AccCache)
 {
  assert(AccCache.LRU_valid==1);
@@ -129,6 +142,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
  dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  if(AccCache.state==AccDirty) {
    Flush(AccCache);
  }
@@ -231,6 +245,9 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
    EntryCreate(CpuPtr,bytes,mode,hint);
  }
  DeviceAccesses++;
  DeviceAccessBytes+=bytes;
  auto AccCacheIterator = EntryLookup(CpuPtr);
  auto & AccCache = AccCacheIterator->second;
  if (!AccCache.AccPtr) {
@@ -349,6 +366,10 @@ void MemoryManager::CpuViewClose(uint64_t CpuPtr)
  assert(AccCache.accLock==0);
  AccCache.cpuLock--;
  if(AccCache.cpuLock==0) {
    LRUinsertback(AccCache);
  }
 }
 /*
 *  Action  State   StateNext         Flush    Clone
@@ -371,6 +392,9 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
    EntryCreate(CpuPtr,bytes,mode,transient);
  }
  HostAccesses++;
  HostAccessBytes+=bytes;
  auto AccCacheIterator = EntryLookup(CpuPtr);
  auto & AccCache = AccCacheIterator->second;
@@ -416,6 +440,12 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
  AccCache.transient= transient? EvictNext : 0;
  // If view is opened on host remove from LRU
  // Host close says evict next from device
  if(AccCache.LRU_valid==1){
    LRUremove(AccCache);
  }
  return AccCache.CpuPtr;
 }
 void  MemoryManager::NotifyDeletion(void *_ptr)
@@ -12,6 +12,10 @@ uint64_t  MemoryManager::HostToDeviceBytes;
 uint64_t  MemoryManager::DeviceToHostBytes;
 uint64_t  MemoryManager::HostToDeviceXfer;
 uint64_t  MemoryManager::DeviceToHostXfer;
 uint64_t  MemoryManager::DeviceAccesses;
 uint64_t  MemoryManager::HostAccesses;
 uint64_t  MemoryManager::DeviceAccessBytes;
 uint64_t  MemoryManager::HostAccessBytes;
 void  MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
 void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
@@ -53,11 +53,10 @@ public:
  // Communicator should know nothing of the physics grid, only processor grid.
  ////////////////////////////////////////////
  int              _Nprocessors;     // How many in all
  int              _processor;       // linear processor rank
  unsigned long    _ndimension;
  Coordinate _shm_processors;  // Which dimensions get relayed out over processors lanes.
  Coordinate _processors;      // Which dimensions get relayed out over processors lanes.
  int              _processor;       // linear processor rank
  Coordinate _processor_coor;  // linear processor coordinate
  unsigned long    _ndimension;
  static Grid_MPI_Comm      communicator_world;
  Grid_MPI_Comm             communicator;
  std::vector<Grid_MPI_Comm> communicator_halo;
@@ -98,9 +97,8 @@ public:
  int                      BossRank(void)          ;
  int                      ThisRank(void)          ;
  const Coordinate & ThisProcessorCoor(void) ;
  const Coordinate & ShmGrid(void)  { return _shm_processors; }  ;
  const Coordinate & ProcessorGrid(void)     ;
-  int                ProcessorCount(void)    ;
+  int                      ProcessorCount(void)    ;
  ////////////////////////////////////////////////////////////////////////////////
  // very VERY rarely (Log, serial RNG) we need world without a grid
@@ -144,16 +142,16 @@ public:
 		      int bytes);
  double StencilSendToRecvFrom(void *xmit,
-			       int xmit_to_rank,int do_xmit,
+			       int xmit_to_rank,
 			       void *recv,
-			       int recv_from_rank,int do_recv,
+			       int recv_from_rank,
 			       int bytes,int dir);
  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 				    void *xmit,
-				    int xmit_to_rank,int do_xmit,
+				    int xmit_to_rank,
 				    void *recv,
-				    int recv_from_rank,int do_recv,
+				    int recv_from_rank,
 				    int bytes,int dir);
@@ -106,7 +106,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
  // Remap using the shared memory optimising routine
  // The remap creates a comm which must be freed
  ////////////////////////////////////////////////////
-  GlobalSharedMemory::OptimalCommunicator    (processors,optimal_comm,_shm_processors);
+  GlobalSharedMemory::OptimalCommunicator    (processors,optimal_comm);
  InitFromMPICommunicator(processors,optimal_comm);
  SetCommunicator(optimal_comm);
  ///////////////////////////////////////////////////
@@ -124,13 +124,12 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
  int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
  Coordinate parent_processor_coor(_ndimension,0);
  Coordinate parent_processors    (_ndimension,1);
-  Coordinate shm_processors       (_ndimension,1);
+
  // Can make 5d grid from 4d etc...
  int pad = _ndimension-parent_ndimension;
  for(int d=0;d<parent_ndimension;d++){
    parent_processor_coor[pad+d]=parent._processor_coor[d];
    parent_processors    [pad+d]=parent._processors[d];
    shm_processors       [pad+d]=parent._shm_processors[d];
  }
  //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -155,7 +154,6 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
    ccoor[d] = parent_processor_coor[d] % processors[d];
    scoor[d] = parent_processor_coor[d] / processors[d];
    ssize[d] = parent_processors[d]     / processors[d];
    if ( processors[d] < shm_processors[d] ) shm_processors[d] = processors[d]; // subnode splitting.
  }
  // rank within subcomm ; srank is rank of subcomm within blocks of subcomms
@@ -337,22 +335,22 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 }
 // Basic Halo comms primitive
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
-						     int dest, int dox,
+						     int dest,
 						     void *recv,
-						     int from, int dor,
+						     int from,
 						     int bytes,int dir)
 {
  std::vector<CommsRequest_t> list;
-  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,dir);
+  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
  StencilSendToRecvFromComplete(list,dir);
  return offbytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
-							 int dest,int dox,
+							 int dest,
 							 void *recv,
-							 int from,int dor,
+							 int from,
 							 int bytes,int dir)
 {
  int ncomm  =communicator_halo.size();
@@ -372,33 +370,30 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  double off_node_bytes=0.0;
  int tag;
-  if ( dor ) {
+  if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
-    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
+    tag= dir+from*32;
-      tag= dir+from*32;
+    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
-      ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+    assert(ierr==0);
-      assert(ierr==0);
+    list.push_back(rrq);
-      list.push_back(rrq);
+    off_node_bytes+=bytes;
      off_node_bytes+=bytes;
    }
  }
-  
+
-  if (dox) {
+  if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
-    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
+    tag= dir+_processor*32;
-      tag= dir+_processor*32;
+    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
-      ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+    assert(ierr==0);
-      assert(ierr==0);
+    list.push_back(xrq);
-      list.push_back(xrq);
+    off_node_bytes+=bytes;
-      off_node_bytes+=bytes;
+  } else {
-    } else {
+    // TODO : make a OMP loop on CPU, call threaded bcopy
-      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
+    void *shm = (void *) this->ShmBufferTranslate(dest,recv);
-      assert(shm!=NULL);
+    assert(shm!=NULL);
-      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
+    //    std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl;
-    }
+    acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
  }
-  
+
  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
    this->StencilSendToRecvFromComplete(list,dir);
    list.resize(0);
  }
  return off_node_bytes;
@@ -45,14 +45,12 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
 CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank) 
  : CartesianCommunicator(processors) 
 {
  _shm_processors = Coordinate(processors.size(),1);
  srank=0;
  SetCommunicator(communicator_world);
 }
 CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
 {
  _shm_processors = Coordinate(processors.size(),1);
  _processors = processors;
  _ndimension = processors.size();  assert(_ndimension>=1);
  _processor_coor.resize(_ndimension);
@@ -113,18 +111,18 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
 }
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
-						     int xmit_to_rank,int dox,
+						     int xmit_to_rank,
 						     void *recv,
-						     int recv_from_rank,int dor,
+						     int recv_from_rank,
 						     int bytes, int dir)
 {
  return 2.0*bytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
-							 int xmit_to_rank,int dox,
+							 int xmit_to_rank,
 							 void *recv,
-							 int recv_from_rank,int dor,
+							 int recv_from_rank,
 							 int bytes, int dir)
 {
  return 2.0*bytes;
@@ -93,10 +93,9 @@ public:
  // Create an optimal reordered communicator that makes MPI_Cart_create get it right
  //////////////////////////////////////////////////////////////////////////////////////
  static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
-  // Turns MPI_COMM_WORLD into right layout for Cartesian
+  static void OptimalCommunicator            (const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
-  static void OptimalCommunicator            (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims); 
+  static void OptimalCommunicatorHypercube   (const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
-  static void OptimalCommunicatorHypercube   (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims); 
+  static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
  static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims); 
  static void GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims);
  ///////////////////////////////////////////////////
  // Provide shared memory facilities off comm world
@@ -152,7 +152,7 @@ int Log2Size(int TwoToPower,int MAXLOG2)
  }
  return log2size;
 }
-void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
+void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
 {
  //////////////////////////////////////////////////////////////////////////////
  // Look and see if it looks like an HPE 8600 based on hostname conventions
@@ -165,8 +165,8 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
  gethostname(name,namelen);
  int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
-  if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM);
+  if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm);
-  else                          OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM);
+  else                          OptimalCommunicatorSharedMemory(processors,optimal_comm);
 }
 static inline int divides(int a,int b)
 {
@@ -221,7 +221,7 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
    dim=(dim+1) %ndimension;
  }
 }
-void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
+void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
 {
  ////////////////////////////////////////////////////////////////
  // Assert power of two shm_size.
@@ -294,8 +294,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
  Coordinate HyperCoor(ndimension);
  GetShmDims(WorldDims,ShmDims);
-  SHM = ShmDims;
+
  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
  ////////////////////////////////////////////////////////////////
@@ -342,7 +341,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
  int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
  assert(ierr==0);
 }
-void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
+void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
 {
  ////////////////////////////////////////////////////////////////
  // Identify subblock of ranks on node spreading across dims
@@ -354,8 +353,6 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
  Coordinate ShmCoor(ndimension);    Coordinate NodeCoor(ndimension);   Coordinate WorldCoor(ndimension);
  GetShmDims(WorldDims,ShmDims);
  SHM=ShmDims;
  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
  ////////////////////////////////////////////////////////////////
@@ -48,10 +48,9 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
  _ShmSetup=1;
 }
-void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
+void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
 {
  optimal_comm = WorldComm;
  SHM = Coordinate(processors.size(),1);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////
@@ -46,4 +46,3 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_unary.h>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
 #include <Grid/lattice/Lattice_crc.h>
@@ -1,55 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_crc.h
    Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1)
 {
  auto ff = localNorm2(f);
  if ( mu==-1 ) mu = f.Grid()->Nd()-1;
  typedef typename vobj::tensor_reduced normtype;
  typedef typename normtype::scalar_object scalar;
  std::vector<scalar> sff;
  sliceSum(ff,sff,mu);
  for(int t=0;t<sff.size();t++){
    std::cout << s<<" "<<t<<" "<<sff[t]<<std::endl;
  }
 }
 template<class vobj> uint32_t crc(Lattice<vobj> & buf)
 {
  autoView( buf_v , buf, CpuRead);
  return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
 }
 #define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
 NAMESPACE_END(Grid);
@@ -125,12 +125,6 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
 //////////////////////////////////////////////////////////
 // Peek a scalar object from the SIMD array
 //////////////////////////////////////////////////////////
 template<class vobj>
 typename vobj::scalar_object peekSite(const Lattice<vobj> &l,const Coordinate &site){
  typename vobj::scalar_object s;
  peekSite(s,l,site);
  return s;
 }        
 template<class vobj,class sobj>
 void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
@@ -142,15 +142,6 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
  return sumD_cpu(arg,osites);
 #endif  
 }
 template<class vobj>
 inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
 {
 #if defined(GRID_CUDA)||defined(GRID_HIP)
  return sumD_gpu_large(arg,osites);
 #else
  return sumD_cpu(arg,osites);
 #endif  
 }
 template<class vobj>
 inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
@@ -168,22 +159,6 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
  return ssum;
 }
 template<class vobj>
 inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
 {
 #if defined(GRID_CUDA)||defined(GRID_HIP)
  autoView( arg_v, arg, AcceleratorRead);
  Integer osites = arg.Grid()->oSites();
  auto ssum= sum_gpu_large(&arg_v[0],osites);
 #else
  autoView(arg_v, arg, CpuRead);
  Integer osites = arg.Grid()->oSites();
  auto ssum= sum_cpu(&arg_v[0],osites);
 #endif
  arg.Grid()->GlobalSum(ssum);
  return ssum;
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Deterministic Reduction operations
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -232,7 +207,6 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
  const uint64_t sites = grid->oSites();
  // Might make all code paths go this way.
 #if 0
  typedef decltype(innerProductD(vobj(),vobj())) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
@@ -242,31 +216,15 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
    autoView( right_v,right, AcceleratorRead);
    // GPU - SIMT lane compliance...
-    accelerator_for( ss, sites, nsimd,{
+    accelerator_for( ss, sites, 1,{
-	auto x_l = left_v(ss);
+	auto x_l = left_v[ss];
-	auto y_l = right_v(ss);
+	auto y_l = right_v[ss];
-	coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l));
+	inner_tmp_v[ss]=innerProductD(x_l,y_l);
    });
  }
 #else
  typedef decltype(innerProduct(vobj(),vobj())) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  {
    autoView( left_v , left, AcceleratorRead);
    autoView( right_v,right, AcceleratorRead);
    // GPU - SIMT lane compliance...
    accelerator_for( ss, sites, nsimd,{
 	auto x_l = left_v(ss);
 	auto y_l = right_v(ss);
 	coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
    });
  }
 #endif
  // This is in single precision and fails some tests
-  auto anrm = sumD(inner_tmp_v,sites);  
+  auto anrm = sum(inner_tmp_v,sites);  
  nrm = anrm;
  return nrm;
 }
@@ -300,7 +258,7 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
  conformable(x,y);
  typedef typename vobj::scalar_type scalar_type;
-  //  typedef typename vobj::vector_typeD vector_type;
+  typedef typename vobj::vector_typeD vector_type;
  RealD  nrm;
  GridBase *grid = x.Grid();
@@ -312,29 +270,17 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
  autoView( x_v, x, AcceleratorRead);
  autoView( y_v, y, AcceleratorRead);
  autoView( z_v, z, AcceleratorWrite);
-#if 0
+
  typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
-  accelerator_for( ss, sites, nsimd,{
+  accelerator_for( ss, sites, 1,{
-      auto tmp = a*x_v(ss)+b*y_v(ss);
+      auto tmp = a*x_v[ss]+b*y_v[ss];
-      coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp));
+      inner_tmp_v[ss]=innerProductD(tmp,tmp);
-      coalescedWrite(z_v[ss],tmp);
+      z_v[ss]=tmp;
  });
  nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
 #else
  typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  accelerator_for( ss, sites, nsimd,{
      auto tmp = a*x_v(ss)+b*y_v(ss);
      coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
      coalescedWrite(z_v[ss],tmp);
  });
  nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
 #endif
  grid->GlobalSum(nrm);
  return nrm; 
 }
@@ -23,7 +23,7 @@ unsigned int nextPow2(Iterator x) {
 }
 template <class Iterator>
-int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
+void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
  int device;
 #ifdef GRID_CUDA
@@ -37,13 +37,13 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &
  Iterator sharedMemPerBlock   = gpu_props[device].sharedMemPerBlock;
  Iterator maxThreadsPerBlock  = gpu_props[device].maxThreadsPerBlock;
  Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;
-  /*  
+  
  std::cout << GridLogDebug << "GPU has:" << std::endl;
  std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl;
  std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl;
  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl;
  std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
-  */  
+  
  if (warpSize != WARP_SIZE) {
    std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
    exit(EXIT_FAILURE);
@@ -53,12 +53,12 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &
  threads = warpSize;
  if ( threads*sizeofsobj > sharedMemPerBlock ) {
    std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
-    return 0;
+    exit(EXIT_FAILURE);
  }
  while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
  // keep all the streaming multiprocessors busy
  blocks = nextPow2(multiProcessorCount);
-  return 1;
+  
 }
 template <class sobj, class Iterator>
@@ -198,7 +198,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
 // Possibly promote to double and sum
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites) 
+inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites) 
 {
  typedef typename vobj::scalar_objectD sobj;
  typedef decltype(lat) Iterator;
@@ -207,9 +207,7 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
  Integer size = osites*nsimd;
  Integer numThreads, numBlocks;
-  int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
+  getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
  assert(ok);
  Integer smemSize = numThreads * sizeof(sobj);
  Vector<sobj> buffer(numBlocks);
@@ -220,54 +218,6 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
  auto result = buffer_v[0];
  return result;
 }
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
 {
  typedef typename vobj::vector_type  vector;
  typedef typename vobj::scalar_typeD scalarD;
  typedef typename vobj::scalar_objectD sobj;
  sobj ret;
  scalarD *ret_p = (scalarD *)&ret;
  const int words = sizeof(vobj)/sizeof(vector);
  Vector<vector> buffer(osites);
  vector *dat = (vector *)lat;
  vector *buf = &buffer[0];
  iScalar<vector> *tbuf =(iScalar<vector> *)  &buffer[0];
  for(int w=0;w<words;w++) {
    accelerator_for(ss,osites,1,{
 	buf[ss] = dat[ss*words+w];
      });
    ret_p[w] = sumD_gpu_small(tbuf,osites);
  }
  return ret;
 }
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
 {
  typedef typename vobj::vector_type  vector;
  typedef typename vobj::scalar_typeD scalarD;
  typedef typename vobj::scalar_objectD sobj;
  sobj ret;
  Integer nsimd= vobj::Nsimd();
  Integer size = osites*nsimd;
  Integer numThreads, numBlocks;
  int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
  if ( ok ) {
    ret = sumD_gpu_small(lat,osites);
  } else {
    ret = sumD_gpu_large(lat,osites);
  }
  return ret;
 }
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Return as same precision as input performing reduction in double precision though
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -280,13 +230,6 @@ inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
  return result;
 }
-template <class vobj>
+
 inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
 {
  typedef typename vobj::scalar_object sobj;
  sobj result;
  result = sumD_gpu_large(lat,osites);
  return result;
 }
 NAMESPACE_END(Grid);
@@ -424,32 +424,9 @@ public:
    // MT implementation does not implement fast discard even though
    // in principle this is possible
    ////////////////////////////////////////////////
 #if 1
    thread_for( lidx, _grid->lSites(), {
 	int gidx;
 	int o_idx;
 	int i_idx;
 	int rank;
 	Coordinate pcoor;
 	Coordinate lcoor;
 	Coordinate gcoor;
 	_grid->LocalIndexToLocalCoor(lidx,lcoor);
 	pcoor=_grid->ThisProcessorCoor();
 	_grid->ProcessorCoorLocalCoorToGlobalCoor(pcoor,lcoor,gcoor);
 	_grid->GlobalCoorToGlobalIndex(gcoor,gidx);
 	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
 	assert(rank == _grid->ThisRank() );
 	int l_idx=generator_idx(o_idx,i_idx);
 	_generators[l_idx] = master_engine;
 	Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
    });
 #else
    // Everybody loops over global volume.
    thread_for( gidx, _grid->_gsites, {
 	// Where is it?
 	int rank;
 	int o_idx;
@@ -466,7 +443,6 @@ public:
 	  Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
 	}
    });
 #endif
 #else 
    ////////////////////////////////////////////////////////////////
    // Machine and thread decomposition dependent seeding is efficient
@@ -855,7 +855,7 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
 template<class vobj>
-void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
+void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
 {
  typedef typename vobj::scalar_object sobj;
@@ -65,34 +65,29 @@ GridLogger GridLogSolver (1, "Solver", GridLogColours, "NORMAL");
 GridLogger GridLogError  (1, "Error" , GridLogColours, "RED");
 GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
 GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
 GridLogger GridLogMemory (1, "Memory", GridLogColours, "NORMAL");
 GridLogger GridLogDebug  (1, "Debug", GridLogColours, "PURPLE");
 GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
 GridLogger GridLogIterative  (1, "Iterative", GridLogColours, "BLUE");
 GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
 GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE");
 void GridLogConfigure(std::vector<std::string> &logstreams) {
-  GridLogError.Active(1);
+  GridLogError.Active(0);
  GridLogWarning.Active(0);
  GridLogMessage.Active(1); // at least the messages should be always on
  GridLogMemory.Active(0); // at least the messages should be always on
  GridLogIterative.Active(0);
  GridLogDebug.Active(0);
  GridLogPerformance.Active(0);
  GridLogIntegrator.Active(1);
  GridLogColours.Active(0);
  GridLogHMC.Active(1);
  for (int i = 0; i < logstreams.size(); i++) {
-    if (logstreams[i] == std::string("Memory"))      GridLogMemory.Active(1);
+    if (logstreams[i] == std::string("Error"))       GridLogError.Active(1);
    if (logstreams[i] == std::string("Warning"))     GridLogWarning.Active(1);
    if (logstreams[i] == std::string("NoMessage"))   GridLogMessage.Active(0);
    if (logstreams[i] == std::string("Iterative"))   GridLogIterative.Active(1);
    if (logstreams[i] == std::string("Debug"))       GridLogDebug.Active(1);
    if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
-    if (logstreams[i] == std::string("NoIntegrator"))  GridLogIntegrator.Active(0);
+    if (logstreams[i] == std::string("Integrator"))  GridLogIntegrator.Active(1);
    if (logstreams[i] == std::string("NoHMC"))         GridLogHMC.Active(0);
    if (logstreams[i] == std::string("Colours"))     GridLogColours.Active(1);
  }
 }
@@ -182,8 +182,6 @@ extern GridLogger GridLogDebug  ;
 extern GridLogger GridLogPerformance;
 extern GridLogger GridLogIterative  ;
 extern GridLogger GridLogIntegrator  ;
 extern GridLogger GridLogHMC;
 extern GridLogger GridLogMemory;
 extern Colours    GridLogColours;
 std::string demangle(const char* name) ;
@@ -31,7 +31,6 @@ directory
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <string>
 #include <map>
 #include <pwd.h>
@@ -655,8 +654,7 @@ class IldgWriter : public ScidacWriter {
    // Fill ILDG header data struct
    //////////////////////////////////////////////////////
    ildgFormat ildgfmt ;
-    const std::string stNC = std::to_string( Nc ) ;
+    ildgfmt.field     = std::string("su3gauge");
    ildgfmt.field          = std::string("su"+stNC+"gauge");
    if ( format == std::string("IEEE32BIG") ) { 
      ildgfmt.precision = 32;
@@ -873,8 +871,7 @@ class IldgReader : public GridLimeReader {
    } else { 
      assert(found_ildgFormat);
-      const std::string stNC = std::to_string( Nc ) ;
+      assert ( ildgFormat_.field == std::string("su3gauge") );
      assert ( ildgFormat_.field == std::string("su"+stNC+"gauge") );
      ///////////////////////////////////////////////////////////////////////////////////////
      // Populate our Grid metadata as best we can
@@ -882,7 +879,7 @@ class IldgReader : public GridLimeReader {
      std::ostringstream vers; vers << ildgFormat_.version;
      FieldMetaData_.hdr_version = vers.str();
-      FieldMetaData_.data_type = std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC);
+      FieldMetaData_.data_type = std::string("4D_SU3_GAUGE_3X3");
      FieldMetaData_.nd=4;
      FieldMetaData_.dimension.resize(4);
@@ -6,8 +6,8 @@
    Copyright (C) 2015
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -182,8 +182,8 @@ class GaugeStatistics
 public:
  void operator()(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
  {
-    header.link_trace = WilsonLoops<Impl>::linkTrace(data);
+    header.link_trace=WilsonLoops<Impl>::linkTrace(data);
-    header.plaquette  = WilsonLoops<Impl>::avgPlaquette(data);
+    header.plaquette =WilsonLoops<Impl>::avgPlaquette(data);
  }
 };
 typedef GaugeStatistics<PeriodicGimplD> PeriodicGaugeStatistics;
@@ -203,24 +203,20 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
 //////////////////////////////////////////////////////////////////////
 inline void reconstruct3(LorentzColourMatrix & cm)
 {
-  assert( Nc < 4 && Nc > 1 ) ;
+  const int x=0;
  const int y=1;
  const int z=2;
  for(int mu=0;mu<Nd;mu++){
-    #if Nc == 2
+    cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
-      cm(mu)()(1,0) = -adj(cm(mu)()(0,y)) ;
+    cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
-      cm(mu)()(1,1) =  adj(cm(mu)()(0,x)) ;
+    cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
    #else
      const int x=0 , y=1 , z=2 ; // a little disinenuous labelling
      cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
      cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
      cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
    #endif
  }
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Some data types for intermediate storage
 ////////////////////////////////////////////////////////////////////////////////
-template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, Nc-1>, Nd >;
+template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;
 typedef iLorentzColour2x3<Complex>  LorentzColour2x3;
 typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
@@ -282,6 +278,7 @@ struct GaugeSimpleMunger{
 template <class fobj, class sobj>
 struct GaugeSimpleUnmunger {
  void operator()(sobj &in, fobj &out) {
    for (int mu = 0; mu < Nd; mu++) {
      for (int i = 0; i < Nc; i++) {
@@ -320,8 +317,8 @@ template<class fobj,class sobj>
 struct Gauge3x2munger{
  void operator() (fobj &in,sobj &out){
    for(int mu=0;mu<Nd;mu++){
-      for(int i=0;i<Nc-1;i++){
+      for(int i=0;i<2;i++){
-	for(int j=0;j<Nc;j++){
+	for(int j=0;j<3;j++){
 	  out(mu)()(i,j) = in(mu)(i)(j);
 	}}
    }
@@ -333,8 +330,8 @@ template<class fobj,class sobj>
 struct Gauge3x2unmunger{
  void operator() (sobj &in,fobj &out){
    for(int mu=0;mu<Nd;mu++){
-      for(int i=0;i<Nc-1;i++){
+      for(int i=0;i<2;i++){
-	for(int j=0;j<Nc;j++){
+	for(int j=0;j<3;j++){
 	  out(mu)(i)(j) = in(mu)()(i,j);
 	}}
    }
@@ -9,7 +9,6 @@
    Author: Matt Spraggs <matthew.spraggs@gmail.com>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -31,8 +30,6 @@
 #ifndef GRID_NERSC_IO_H
 #define GRID_NERSC_IO_H
 #include <string>
 NAMESPACE_BEGIN(Grid);
 using namespace Grid;
@@ -42,10 +39,8 @@ using namespace Grid;
 ////////////////////////////////////////////////////////////////////////////////
 class NerscIO : public BinaryIO { 
 public:
  typedef Lattice<vLorentzColourMatrixD> GaugeField;
-  // Enable/disable exiting if the plaquette in the header does not match the value computed (default true)
+  typedef Lattice<vLorentzColourMatrixD> GaugeField;
  static bool & exitOnReadPlaquetteMismatch(){ static bool v=true; return v; }
  static inline void truncate(std::string file){
    std::ofstream fout(file,std::ios::out);
@@ -150,17 +145,15 @@ public:
    std::string format(header.floating_point);
-    const int ieee32big = (format == std::string("IEEE32BIG"));
+    int ieee32big = (format == std::string("IEEE32BIG"));
-    const int ieee32    = (format == std::string("IEEE32"));
+    int ieee32    = (format == std::string("IEEE32"));
-    const int ieee64big = (format == std::string("IEEE64BIG"));
+    int ieee64big = (format == std::string("IEEE64BIG"));
-    const int ieee64    = (format == std::string("IEEE64") || \
+    int ieee64    = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
 			   format == std::string("IEEE64LITTLE"));
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    // depending on datatype, set up munger;
    // munger is a function of <floating point, Real, data_type>
-    const std::string stNC = std::to_string( Nc ) ;
+    if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
    if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE") ) {
      if ( ieee32 || ieee32big ) {
 	BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F> 
 	  (Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
@@ -171,7 +164,7 @@ public:
 	  (Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
 	   nersc_csum,scidac_csuma,scidac_csumb);
      }
-    } else if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC) ) {
+    } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
      if ( ieee32 || ieee32big ) {
 	BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF>
 	  (Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
@@ -205,7 +198,7 @@ public:
      std::cerr << " nersc_csum  " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
      exit(0);
    }
-    if(exitOnReadPlaquetteMismatch()) assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
+    assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
    assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
    assert(nersc_csum == header.checksum );
@@ -216,29 +209,27 @@ public:
  template<class GaugeStats=PeriodicGaugeStatistics>
  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
-					std::string ens_label = std::string("DWF"),
+					std::string ens_label = std::string("DWF"))
 					std::string ens_id = std::string("UKQCD"),
 					unsigned int sequence_number = 1)
  {
-    writeConfiguration(Umu,file,0,1,ens_label,ens_id,sequence_number);
+    writeConfiguration(Umu,file,0,1,ens_label);
  }
  template<class GaugeStats=PeriodicGaugeStatistics>
  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
 					int two_row,
 					int bits32,
-					std::string ens_label = std::string("DWF"),
+					std::string ens_label = std::string("DWF"))
 					std::string ens_id = std::string("UKQCD"),
 					unsigned int sequence_number = 1)
  {
    typedef vLorentzColourMatrixD vobj;
    typedef typename vobj::scalar_object sobj;
    FieldMetaData header;
-    header.sequence_number = sequence_number;
+    ///////////////////////////////////////////
-    header.ensemble_id     = ens_id;
+    // Following should become arguments
    ///////////////////////////////////////////
    header.sequence_number = 1;
    header.ensemble_id     = std::string("UKQCD");
    header.ensemble_label  = ens_label;
    header.hdr_version     = "1.0" ;
    typedef LorentzColourMatrixD fobj3D;
    typedef LorentzColour2x3D    fobj2D;
@@ -252,14 +243,10 @@ public:
    uint64_t offset;
-    // Sod it -- always write NcxNc double
+    // Sod it -- always write 3x3 double
-    header.floating_point  = std::string("IEEE64BIG");
+    header.floating_point = std::string("IEEE64BIG");
-    const std::string stNC = std::to_string( Nc ) ;
+    header.data_type      = std::string("4D_SU3_GAUGE_3x3");
-    if( two_row ) {
+    GaugeSimpleUnmunger<fobj3D,sobj> munge;
      header.data_type = std::string("4D_SU" + stNC + "_GAUGE" );
    } else {
      header.data_type = std::string("4D_SU" + stNC + "_GAUGE_" + stNC + "x" + stNC );
    }
    if ( grid->IsBoss() ) { 
      truncate(file);
      offset = writeHeader(header,file);
@@ -267,15 +254,8 @@ public:
    grid->Broadcast(0,(void *)&offset,sizeof(offset));
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
-    if( two_row ) {
+    BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
-      Gauge3x2unmunger<fobj2D,sobj> munge;
+					      nersc_csum,scidac_csuma,scidac_csumb);
      BinaryIO::writeLatticeObject<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point,
 						nersc_csum,scidac_csuma,scidac_csumb);
    } else {
      GaugeSimpleUnmunger<fobj3D,sobj> munge;
      BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
 						nersc_csum,scidac_csuma,scidac_csumb);
    }
    header.checksum = nersc_csum;
    if ( grid->IsBoss() ) { 
      writeHeader(header,file);
@@ -307,7 +287,8 @@ public:
    header.plaquette=0.0;
    MachineCharacteristics(header);
-    uint64_t offset;
+	uint64_t offset;
 #ifdef RNG_RANLUX
    header.floating_point = std::string("UINT64");
    header.data_type      = std::string("RANLUX48");
@@ -347,7 +328,7 @@ public:
    GridBase *grid = parallel.Grid();
-    uint64_t offset = readHeader(file,grid,header);
+	uint64_t offset = readHeader(file,grid,header);
    FieldMetaData clone(header);
@@ -72,9 +72,17 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
 inline uint64_t cyclecount(void){ 
  return 0;
 }
 #define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx")
 #define __SSC_STOP  __SSC_MARK(0x110)
 #define __SSC_START __SSC_MARK(0x111)
 #else
 #define __SSC_MARK(mark) 
 #define __SSC_STOP  
 #define __SSC_START 
 /*
 * cycle counters arch dependent
 */
@@ -39,9 +39,9 @@ NAMESPACE_BEGIN(Grid)
 // C++11 time facilities better?
 inline double usecond(void) {
  struct timeval tv;
-  tv.tv_sec = 0;
+#ifdef TIMERS_ON
  tv.tv_usec = 0;
  gettimeofday(&tv,NULL);
 #endif
  return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
 }
@@ -16,12 +16,8 @@
 #ifdef __NVCC__
 #pragma push
 #if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
 #pragma nv_diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
 #else
 #pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
 #endif
 #endif
 #include "pugixml.h"
@@ -63,7 +63,6 @@ static constexpr int Ngp=2; // gparity index range
 #define ColourIndex  (2)
 #define SpinIndex    (1)
 #define LorentzIndex (0)
 #define GparityFlavourIndex (0)
 // Also should make these a named enum type
 static constexpr int DaggerNo=0;
@@ -88,8 +87,6 @@ template<typename T> struct isCoarsened {
 template <typename T> using IfCoarsened    = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
 template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;
 const int GparityFlavourTensorIndex = 3; //TensorLevel counts from the bottom!
 // ChrisK very keen to add extra space for Gparity doubling.
 //
 // Also add domain wall index, in a way where Wilson operator 
@@ -113,10 +110,8 @@ template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVec
    template<typename vtype> using iSpinColourSpinColourMatrix  = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;
 template<typename vtype> using iGparityFlavourVector                = iVector<iScalar<iScalar<vtype> >, Ngp>;
 template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
 template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
 template<typename vtype> using iGparityFlavourMatrix = iMatrix<iScalar<iScalar<vtype> >, Ngp>;
 // Spin matrix
 typedef iSpinMatrix<Complex  >          SpinMatrix;
@@ -181,16 +176,6 @@ typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
 typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
 typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;
 //G-parity flavour matrix
 typedef iGparityFlavourMatrix<Complex> GparityFlavourMatrix;
 typedef iGparityFlavourMatrix<ComplexF> GparityFlavourMatrixF;
 typedef iGparityFlavourMatrix<ComplexD> GparityFlavourMatrixD;
 typedef iGparityFlavourMatrix<vComplex> vGparityFlavourMatrix;
 typedef iGparityFlavourMatrix<vComplexF> vGparityFlavourMatrixF;
 typedef iGparityFlavourMatrix<vComplexD> vGparityFlavourMatrixD;
 // Spin vector
 typedef iSpinVector<Complex >           SpinVector;
 typedef iSpinVector<ComplexF>           SpinVectorF;
@@ -235,16 +220,6 @@ typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
 typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
 typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
 typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
 //G-parity flavour vector
 typedef iGparityFlavourVector<Complex >         GparityFlavourVector;
 typedef iGparityFlavourVector<ComplexF>         GparityFlavourVectorF;
 typedef iGparityFlavourVector<ComplexD>         GparityFlavourVectorD;
 typedef iGparityFlavourVector<vComplex >         vGparityFlavourVector;
 typedef iGparityFlavourVector<vComplexF>         vGparityFlavourVectorF;
 typedef iGparityFlavourVector<vComplexD>         vGparityFlavourVectorD;
 // singlets
 typedef iSinglet<Complex >         TComplex;     // FIXME This is painful. Tensor singlet complex type.
@@ -40,29 +40,6 @@ class Action
 public:
  bool is_smeared = false;
  RealD deriv_norm_sum;
  RealD deriv_max_sum;
  int   deriv_num;
  RealD deriv_us;
  RealD S_us;
  RealD refresh_us;
  void  reset_timer(void)        {
    deriv_us = S_us = refresh_us = 0.0;
    deriv_num=0;
    deriv_norm_sum = deriv_max_sum=0.0;
  }
  void  deriv_log(RealD nrm, RealD max) { deriv_max_sum+=max; deriv_norm_sum+=nrm; deriv_num++;}
  RealD deriv_max_average(void)         { return deriv_max_sum/deriv_num; };
  RealD deriv_norm_average(void)        { return deriv_norm_sum/deriv_num; };
  RealD deriv_timer(void)        { return deriv_us; };
  RealD S_timer(void)            { return deriv_us; };
  RealD refresh_timer(void)      { return deriv_us; };
  void deriv_timer_start(void)   { deriv_us-=usecond(); }
  void deriv_timer_stop(void)    { deriv_us+=usecond(); }
  void refresh_timer_start(void) { refresh_us-=usecond(); }
  void refresh_timer_stop(void)  { refresh_us+=usecond(); }
  void S_timer_start(void)       { S_us-=usecond(); }
  void S_timer_stop(void)        { S_us+=usecond(); }
  // Heatbath?
  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
  virtual RealD S(const GaugeField& U) = 0;                             // evaluate the action
@@ -37,10 +37,6 @@ NAMESPACE_CHECK(ActionSet);
 #include <Grid/qcd/action/ActionParams.h>
 NAMESPACE_CHECK(ActionParams);
 #include <Grid/qcd/action/filters/MomentumFilter.h>
 #include <Grid/qcd/action/filters/DirichletFilter.h>
 #include <Grid/qcd/action/filters/DDHMCFilter.h>
 ////////////////////////////////////////////
 // Gauge Actions
 ////////////////////////////////////////////
@@ -37,33 +37,24 @@ NAMESPACE_BEGIN(Grid);
 // These can move into a params header and be given MacroMagic serialisation
 struct GparityWilsonImplParams {
  Coordinate twists;
-                     //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
+  GparityWilsonImplParams() : twists(Nd, 0) {};
  Coordinate dirichlet; // Blocksize of dirichlet BCs
  GparityWilsonImplParams() : twists(Nd, 0), dirichlet(Nd, 0) {};
 };
 struct WilsonImplParams {
  bool overlapCommsCompute;
  Coordinate dirichlet; // Blocksize of dirichlet BCs
  AcceleratorVector<Real,Nd> twist_n_2pi_L;
  AcceleratorVector<Complex,Nd> boundary_phases;
  WilsonImplParams()  {
    dirichlet.resize(Nd,0);
    boundary_phases.resize(Nd, 1.0);
      twist_n_2pi_L.resize(Nd, 0.0);
  };
  WilsonImplParams(const AcceleratorVector<Complex,Nd> phi) : boundary_phases(phi), overlapCommsCompute(false) {
    twist_n_2pi_L.resize(Nd, 0.0);
    dirichlet.resize(Nd,0);
  }
 };
 struct StaggeredImplParams {
-  Coordinate dirichlet; // Blocksize of dirichlet BCs
+  StaggeredImplParams()  {};
  StaggeredImplParams()
  {
    dirichlet.resize(Nd,0);
  };
 };
  struct OneFlavourRationalParams : Serializable {
@@ -72,11 +63,9 @@ struct StaggeredImplParams {
 				    RealD, hi, 
 				    int,   MaxIter, 
 				    RealD, tolerance, 
 				    RealD, mdtolerance, 
 				    int,   degree, 
 				    int,   precision,
-				    int,   BoundsCheckFreq,
+				    int,   BoundsCheckFreq);
 				    RealD, BoundsCheckTol);
  // MaxIter and tolerance, vectors??
@@ -87,62 +76,16 @@ struct StaggeredImplParams {
 				RealD tol      = 1.0e-8, 
                           	int _degree    = 10,
 				int _precision = 64,
-				int _BoundsCheckFreq=20,
+				int _BoundsCheckFreq=20)
 				RealD mdtol    = 1.0e-6,
 				double _BoundsCheckTol=1e-6)
      : lo(_lo),
 	hi(_hi),
 	MaxIter(_maxit),
 	tolerance(tol),
        mdtolerance(mdtol),
 	degree(_degree),
        precision(_precision),
-        BoundsCheckFreq(_BoundsCheckFreq),
+        BoundsCheckFreq(_BoundsCheckFreq){};
        BoundsCheckTol(_BoundsCheckTol){};
  };
  /*Action parameters for the generalized rational action
    The approximation is for (M^dag M)^{1/inv_pow}
    where inv_pow is the denominator of the fractional power.
    Default inv_pow=2 for square root, making this equivalent to 
    the OneFlavourRational action
  */
    struct RationalActionParams : Serializable {
    GRID_SERIALIZABLE_CLASS_MEMBERS(RationalActionParams, 
 				    int, inv_pow, 
 				    RealD, lo, //low eigenvalue bound of rational approx
 				    RealD, hi, //high eigenvalue bound of rational approx
 				    int,   MaxIter,  //maximum iterations in msCG
 				    RealD, action_tolerance,  //msCG tolerance in action evaluation
 				    int,   action_degree, //rational approx tolerance in action evaluation
 				    RealD, md_tolerance,  //msCG tolerance in MD integration
 				    int,   md_degree, //rational approx tolerance in MD integration
 				    int,   precision, //precision of floating point arithmetic
 				    int,   BoundsCheckFreq); //frequency the approximation is tested (with Metropolis degree/tolerance); 0 disables the check
  // constructor 
  RationalActionParams(int _inv_pow = 2,
 		       RealD _lo      = 0.0, 
 		       RealD _hi      = 1.0, 
 		       int _maxit     = 1000,
 		       RealD _action_tolerance      = 1.0e-8, 
 		       int _action_degree    = 10,
 		       RealD _md_tolerance      = 1.0e-8, 
 		       int _md_degree    = 10,
 		       int _precision = 64,
 		       int _BoundsCheckFreq=20)
    : inv_pow(_inv_pow), 
      lo(_lo),
      hi(_hi),
      MaxIter(_maxit),
      action_tolerance(_action_tolerance),
      action_degree(_action_degree),
      md_tolerance(_md_tolerance),
      md_degree(_md_degree),
      precision(_precision),
      BoundsCheckFreq(_BoundsCheckFreq){};
  };
 NAMESPACE_END(Grid);
 #endif
@@ -68,17 +68,9 @@ public:
  ///////////////////////////////////////////////////////////////
  // Support for MADWF tricks
  ///////////////////////////////////////////////////////////////
-  RealD Mass(void) { return (mass_plus + mass_minus) / 2.0; };
+  RealD Mass(void) { return mass; };
  RealD MassPlus(void) { return mass_plus; };
  RealD MassMinus(void) { return mass_minus; };
  void  SetMass(RealD _mass) { 
-    mass_plus=mass_minus=_mass; 
+    mass=_mass; 
    SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs
  } ;
  void  SetMass(RealD _mass_plus, RealD _mass_minus) { 
    mass_plus=_mass_plus;
    mass_minus=_mass_minus;
    SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs
  } ;
  void  P(const FermionField &psi, FermionField &chi);
@@ -116,7 +108,7 @@ public:
  void   MeooeDag5D    (const FermionField &in, FermionField &out);
  //    protected:
-  RealD mass_plus, mass_minus;
+  RealD mass;
  // Save arguments to SetCoefficientsInternal
  Vector<Coeff_t> _gamma;
@@ -1,435 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/WilsonCloverFermionImplementation.h
    Copyright (C) 2017 - 2022
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    Author: Mattia Bruno <mattia.bruno@cern.ch>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 #include <Grid/Grid.h>
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
 ////////////////////////////////////////////
 // Standard Clover
 //   (4+m0) + csw * clover_term
 // Exp Clover
 //   (4+m0) * exp(csw/(4+m0) clover_term)
 //   = (4+m0) + csw * clover_term + ...
 ////////////////////////////////////////////
 NAMESPACE_BEGIN(Grid);
 //////////////////////////////////
 // Generic Standard Clover
 //////////////////////////////////
 template<class Impl>
 class CloverHelpers: public WilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  typedef WilsonCloverHelpers<Impl> Helpers;
  static void Instantiate(CloverField& CloverTerm, CloverField& CloverTermInv, RealD csw_t, RealD diag_mass) {
    GridBase *grid = CloverTerm.Grid();
    CloverTerm += diag_mass;
    int lvol = grid->lSites();
    int DimRep = Impl::Dimension;
    {
      autoView(CTv,CloverTerm,CpuRead);
      autoView(CTIv,CloverTermInv,CpuWrite);
      thread_for(site, lvol, {
        Coordinate lcoor;
        grid->LocalIndexToLocalCoor(site, lcoor);
        Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
        Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
        typename SiteClover::scalar_object Qx = Zero(), Qxinv = Zero();
        peekLocalSite(Qx, CTv, lcoor);
        for (int j = 0; j < Ns; j++)
          for (int k = 0; k < Ns; k++)
            for (int a = 0; a < DimRep; a++)
              for (int b = 0; b < DimRep; b++){
                auto zz =  Qx()(j, k)(a, b);
                EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
              }
        EigenInvCloverOp = EigenCloverOp.inverse();
        for (int j = 0; j < Ns; j++)
          for (int k = 0; k < Ns; k++)
            for (int a = 0; a < DimRep; a++)
              for (int b = 0; b < DimRep; b++)
                Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
               pokeLocalSite(Qxinv, CTIv, lcoor);
      });
    }
  }
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    return Helpers::Cmunu(U, lambda, mu, nu);
  }
 };
 //////////////////////////////////
 // Generic Exp Clover
 //////////////////////////////////
 template<class Impl>
 class ExpCloverHelpers: public WilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
  typedef WilsonCloverHelpers<Impl> Helpers;
  // Can this be avoided?
  static void IdentityTimesC(const CloverField& in, RealD c) {
    int DimRep = Impl::Dimension;
    autoView(in_v, in, AcceleratorWrite);
    accelerator_for(ss, in.Grid()->oSites(), 1, {
      for (int sa=0; sa<Ns; sa++)
        for (int ca=0; ca<DimRep; ca++)
          in_v[ss]()(sa,sa)(ca,ca) = c;
    });
  }
  static int getNMAX(RealD prec, RealD R) {
    /* compute stop condition for exponential */
    int NMAX=1;
    RealD cond=R*R/2.;
    while (cond*std::exp(R)>prec) {
      NMAX++;
      cond*=R/(double)(NMAX+1);
    }
    return NMAX;
  }
  static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
  static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
  static void Instantiate(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
    GridBase* grid = Clover.Grid();
    CloverField ExpClover(grid);
    int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);
    Clover *= (1.0/diag_mass);
    // Taylor expansion, slow but generic
    // Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
    // qN = cN
    // qn = cn + qn+1 X
    std::vector<RealD> cn(NMAX+1);
    cn[0] = 1.0;
    for (int i=1; i<=NMAX; i++)
      cn[i] = cn[i-1] / RealD(i);
    ExpClover = Zero();
    IdentityTimesC(ExpClover, cn[NMAX]);
    for (int i=NMAX-1; i>=0; i--)
      ExpClover = ExpClover * Clover + cn[i];
    // prepare inverse
    CloverInv = (-1.0)*Clover;
    Clover = ExpClover * diag_mass;
    ExpClover = Zero();
    IdentityTimesC(ExpClover, cn[NMAX]);
    for (int i=NMAX-1; i>=0; i--)
      ExpClover = ExpClover * CloverInv + cn[i];
    CloverInv = ExpClover * (1.0/diag_mass);
  }
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    assert(0);
    return lambda;
  }
 };
 //////////////////////////////////
 // Compact Standard Clover
 //////////////////////////////////
 template<class Impl>
 class CompactCloverHelpers: public CompactWilsonCloverHelpers<Impl>,
                            public WilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);
  typedef WilsonCloverHelpers<Impl> Helpers;
  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
  static void MassTerm(CloverField& Clover, RealD diag_mass) {
    Clover += diag_mass;
  }
  static void Exponentiate_Clover(CloverDiagonalField& Diagonal,
                          CloverTriangleField& Triangle,
                          RealD csw_t, RealD diag_mass) {
    // Do nothing
  }
  // TODO: implement Cmunu for better performances with compact layout, but don't do it
  // here, but rather in WilsonCloverHelpers.h -> CompactWilsonCloverHelpers
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    return Helpers::Cmunu(U, lambda, mu, nu);
  }
 };
 //////////////////////////////////
 // Compact Exp Clover
 //////////////////////////////////
 template<class Impl>
 class CompactExpCloverHelpers: public CompactWilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);
  template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
  static void MassTerm(CloverField& Clover, RealD diag_mass) {
    // do nothing!
    // mass term is multiplied to exp(Clover) below
  }
  static int getNMAX(RealD prec, RealD R) {
    /* compute stop condition for exponential */
    int NMAX=1;
    RealD cond=R*R/2.;
    while (cond*std::exp(R)>prec) {
      NMAX++;
      cond*=R/(double)(NMAX+1);
    }
    return NMAX;
  }
  static int getNMAX(Lattice<iImplCloverDiagonal<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
  static int getNMAX(Lattice<iImplCloverDiagonal<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
  static void ExponentiateHermitean6by6(const iMatrix<ComplexD,6> &arg, const RealD& alpha, const std::vector<RealD>& cN, const int Niter, iMatrix<ComplexD,6>& dest){
  	  typedef iMatrix<ComplexD,6> mat;
  	  RealD qn[6];
  	  RealD qnold[6];
  	  RealD p[5];
  	  RealD trA2, trA3, trA4;
  	  mat A2, A3, A4, A5;
  	  A2 = alpha * alpha * arg * arg;
  	  A3 = alpha * arg * A2;
  	  A4 = A2 * A2;
  	  A5 = A2 * A3;
  	  trA2 = toReal( trace(A2) );
  	  trA3 = toReal( trace(A3) );
  	  trA4 = toReal( trace(A4));
  	  p[0] = toReal( trace(A3 * A3)) / 6.0 - 0.125 * trA4 * trA2 - trA3 * trA3 / 18.0 + trA2 * trA2 * trA2/ 48.0;
  	  p[1] = toReal( trace(A5)) / 5.0 - trA3 * trA2 / 6.0;
  	  p[2] = toReal( trace(A4)) / 4.0 - 0.125 * trA2 * trA2;
  	  p[3] = trA3 / 3.0;
  	  p[4] = 0.5 * trA2;
  	  qnold[0] = cN[Niter];
  	  qnold[1] = 0.0;
  	  qnold[2] = 0.0;
  	  qnold[3] = 0.0;
  	  qnold[4] = 0.0;
  	  qnold[5] = 0.0;
  	  for(int i = Niter-1; i >= 0; i--)
  	  {
  	   qn[0] = p[0] * qnold[5] + cN[i];
  	   qn[1] = p[1] * qnold[5] + qnold[0];
  	   qn[2] = p[2] * qnold[5] + qnold[1];
  	   qn[3] = p[3] * qnold[5] + qnold[2];
  	   qn[4] = p[4] * qnold[5] + qnold[3];
  	   qn[5] = qnold[4];
  	   qnold[0] = qn[0];
  	   qnold[1] = qn[1];
  	   qnold[2] = qn[2];
  	   qnold[3] = qn[3];
  	   qnold[4] = qn[4];
  	   qnold[5] = qn[5];
  	  }
  	  mat unit(1.0);
  	  dest = (qn[0] * unit + qn[1] * alpha * arg + qn[2] * A2 + qn[3] * A3 + qn[4] * A4 + qn[5] * A5);
    }
  static void Exponentiate_Clover(CloverDiagonalField& Diagonal, CloverTriangleField& Triangle, RealD csw_t, RealD diag_mass) {
    GridBase* grid = Diagonal.Grid();
    int NMAX = getNMAX(Diagonal, 3.*csw_t/diag_mass);
    //
    // Implementation completely in Daniel's layout
    //
    // Taylor expansion with Cayley-Hamilton recursion
    // underlying Horner scheme as above
    std::vector<RealD> cn(NMAX+1);
    cn[0] = 1.0;
    for (int i=1; i<=NMAX; i++){
      cn[i] = cn[i-1] / RealD(i);
    }
      // Taken over from Daniel's implementation
      conformable(Diagonal, Triangle);
      long lsites = grid->lSites();
    {
      typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
      typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
      typedef iMatrix<ComplexD,6> mat;
      autoView(diagonal_v,  Diagonal,  CpuRead);
      autoView(triangle_v,  Triangle,  CpuRead);
      autoView(diagonalExp_v, Diagonal, CpuWrite);
      autoView(triangleExp_v, Triangle, CpuWrite);
      thread_for(site, lsites, { // NOTE: Not on GPU because of (peek/poke)LocalSite
    	  mat srcCloverOpUL(0.0); // upper left block
    	  mat srcCloverOpLR(0.0); // lower right block
    	  mat ExpCloverOp;
        scalar_object_diagonal diagonal_tmp     = Zero();
        scalar_object_diagonal diagonal_exp_tmp = Zero();
        scalar_object_triangle triangle_tmp     = Zero();
        scalar_object_triangle triangle_exp_tmp = Zero();
        Coordinate lcoor;
        grid->LocalIndexToLocalCoor(site, lcoor);
        peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
        peekLocalSite(triangle_tmp, triangle_v, lcoor);
        int block;
        block = 0;
        for(int i = 0; i < 6; i++){
        	for(int j = 0; j < 6; j++){
        		if (i == j){
        			srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
        		}
        		else{
        			srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
        		}
        	}
        }
        block = 1;
        for(int i = 0; i < 6; i++){
          	for(int j = 0; j < 6; j++){
           		if (i == j){
           			srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
           		}
           		else{
           			srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
           		}
            }
        }
        // exp(Clover)
        ExponentiateHermitean6by6(srcCloverOpUL,1.0/diag_mass,cn,NMAX,ExpCloverOp);
        block = 0;
        for(int i = 0; i < 6; i++){
        	for(int j = 0; j < 6; j++){
            	if (i == j){
            		diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
            	}
            	else if(i < j){
            		triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
            	}
           	}
        }
        ExponentiateHermitean6by6(srcCloverOpLR,1.0/diag_mass,cn,NMAX,ExpCloverOp);
        block = 1;
        for(int i = 0; i < 6; i++){
        	for(int j = 0; j < 6; j++){
              	if (i == j){
              		diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
               	}
               	else if(i < j){
               		triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
               	}
            }
        }
        pokeLocalSite(diagonal_exp_tmp, diagonalExp_v, lcoor);
        pokeLocalSite(triangle_exp_tmp, triangleExp_v, lcoor);
      });
    }
    Diagonal *= diag_mass;
    Triangle *= diag_mass;
  }
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    assert(0);
    return lambda;
  }
 };
 NAMESPACE_END(Grid);
@@ -1,241 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion.h
    Copyright (C) 2020 - 2022
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    Author: Nils Meyer <nils.meyer@ur.de>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 #include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
 // see Grid/qcd/action/fermion/WilsonCloverFermion.h for description
 //
 // Modifications done here:
 //
 // Original: clover term = 12x12 matrix per site
 //
 // But: Only two diagonal 6x6 hermitian blocks are non-zero (also true for original, verified by running)
 // Sufficient to store/transfer only the real parts of the diagonal and one triangular part
 // 2 * (6 + 15 * 2) = 72 real or 36 complex words to be stored/transfered
 //
 // Here: Above but diagonal as complex numbers, i.e., need to store/transfer
 // 2 * (6 * 2 + 15 * 2) = 84 real or 42 complex words
 //
 // Words per site and improvement compared to original (combined with the input and output spinors):
 //
 // - Original: 2*12 + 12*12 = 168 words -> 1.00 x less
 // - Minimal:  2*12 + 36    =  60 words -> 2.80 x less
 // - Here:     2*12 + 42    =  66 words -> 2.55 x less
 //
 // These improvements directly translate to wall-clock time
 //
 // Data layout:
 //
 // - diagonal and triangle part as separate lattice fields,
 //   this was faster than as 1 combined field on all tested machines
 // - diagonal: as expected
 // - triangle: store upper right triangle in row major order
 // - graphical:
 //        0  1  2  3  4
 //           5  6  7  8
 //              9 10 11 = upper right triangle indices
 //                12 13
 //                   14
 //     0
 //        1
 //           2
 //              3       = diagonal indices
 //                 4
 //                    5
 //     0
 //     1  5
 //     2  6  9          = lower left triangle indices
 //     3  7 10 12
 //     4  8 11 13 14
 //
 // Impact on total memory consumption:
 // - Original: (2 * 1 + 8 * 1/2) 12x12 matrices = 6 12x12 matrices = 864 complex words per site
 // - Here:     (2 * 1 + 4 * 1/2) diagonal parts = 4 diagonal parts =  24 complex words per site
 //           + (2 * 1 + 4 * 1/2) triangle parts = 4 triangle parts =  60 complex words per site
 //                                                                 =  84 complex words per site
 template<class Impl, class CloverHelpers>
 class CompactWilsonCloverFermion : public WilsonFermion<Impl>,
                                   public WilsonCloverHelpers<Impl>,
                                   public CompactWilsonCloverHelpers<Impl> {
  /////////////////////////////////////////////
  // Sizes
  /////////////////////////////////////////////
 public:
  INHERIT_COMPACT_CLOVER_SIZES(Impl);
  /////////////////////////////////////////////
  // Type definitions
  /////////////////////////////////////////////
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);
  typedef WilsonFermion<Impl>              WilsonBase;
  typedef WilsonCloverHelpers<Impl>        Helpers;
  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
  /////////////////////////////////////////////
  // Constructors
  /////////////////////////////////////////////
 public:
  CompactWilsonCloverFermion(GaugeField& _Umu,
 			    GridCartesian& Fgrid,
 			    GridRedBlackCartesian& Hgrid,
 			    const RealD _mass,
 			    const RealD _csw_r = 0.0,
 			    const RealD _csw_t = 0.0,
 			    const RealD _cF = 1.0,
 			    const WilsonAnisotropyCoefficients& clover_anisotropy = WilsonAnisotropyCoefficients(),
 			    const ImplParams& impl_p = ImplParams());
  /////////////////////////////////////////////
  // Member functions (implementing interface)
  /////////////////////////////////////////////
 public:
  virtual void Instantiatable() {};
  int          ConstEE()     override { return 0; };
  int          isTrivialEE() override { return 0; };
  void Dhop(const FermionField& in, FermionField& out, int dag) override;
  void DhopOE(const FermionField& in, FermionField& out, int dag) override;
  void DhopEO(const FermionField& in, FermionField& out, int dag) override;
  void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
  void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;
  void M(const FermionField& in, FermionField& out) override;
  void Mdag(const FermionField& in, FermionField& out) override;
  void Meooe(const FermionField& in, FermionField& out) override;
  void MeooeDag(const FermionField& in, FermionField& out) override;
  void Mooee(const FermionField& in, FermionField& out) override;
  void MooeeDag(const FermionField& in, FermionField& out) override;
  void MooeeInv(const FermionField& in, FermionField& out) override;
  void MooeeInvDag(const FermionField& in, FermionField& out) override;
  void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
  void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;
  void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
  void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
  void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
  /////////////////////////////////////////////
  // Member functions (internals)
  /////////////////////////////////////////////
  void MooeeInternal(const FermionField&        in,
                     FermionField&              out,
                     const CloverDiagonalField& diagonal,
                     const CloverTriangleField& triangle);
  /////////////////////////////////////////////
  // Helpers
  /////////////////////////////////////////////
  void ImportGauge(const GaugeField& _Umu) override;
  /////////////////////////////////////////////
  // Helpers
  /////////////////////////////////////////////
 private:
  template<class Field>
  const MaskField* getCorrectMaskField(const Field &in) const {
    if(in.Grid()->_isCheckerBoarded) {
      if(in.Checkerboard() == Odd) {
        return &this->BoundaryMaskOdd;
      } else {
        return &this->BoundaryMaskEven;
      }
    } else {
      return &this->BoundaryMask;
    }
  }
  template<class Field>
  void ApplyBoundaryMask(Field& f) {
    const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
    assert(m != nullptr);
    CompactHelpers::ApplyBoundaryMask(f, *m);
  }
  /////////////////////////////////////////////
  // Member Data
  /////////////////////////////////////////////
 public:
  RealD csw_r;
  RealD csw_t;
  RealD cF;
  bool open_boundaries;
  CloverDiagonalField Diagonal,    DiagonalEven,    DiagonalOdd;
  CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
  CloverTriangleField Triangle,    TriangleEven,    TriangleOdd;
  CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
  FermionField Tmp;
  MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
 };
 NAMESPACE_END(Grid);
@@ -53,7 +53,6 @@ NAMESPACE_CHECK(Wilson);
 #include <Grid/qcd/action/fermion/WilsonTMFermion.h>       // 4d wilson like
 NAMESPACE_CHECK(WilsonTM);
 #include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
 NAMESPACE_CHECK(WilsonClover);
 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
 NAMESPACE_CHECK(Wilson5D);
@@ -138,52 +137,21 @@ typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
 typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
 // Clover fermions
-template <typename WImpl> using WilsonClover = WilsonCloverFermion<WImpl, CloverHelpers<WImpl>>;
+typedef WilsonCloverFermion<WilsonImplR> WilsonCloverFermionR;
-template <typename WImpl> using WilsonExpClover = WilsonCloverFermion<WImpl, ExpCloverHelpers<WImpl>>;
+typedef WilsonCloverFermion<WilsonImplF> WilsonCloverFermionF;
 typedef WilsonCloverFermion<WilsonImplD> WilsonCloverFermionD;
-typedef WilsonClover<WilsonImplR> WilsonCloverFermionR;
+typedef WilsonCloverFermion<WilsonAdjImplR> WilsonCloverAdjFermionR;
-typedef WilsonClover<WilsonImplF> WilsonCloverFermionF;
+typedef WilsonCloverFermion<WilsonAdjImplF> WilsonCloverAdjFermionF;
-typedef WilsonClover<WilsonImplD> WilsonCloverFermionD;
+typedef WilsonCloverFermion<WilsonAdjImplD> WilsonCloverAdjFermionD;
-typedef WilsonExpClover<WilsonImplR> WilsonExpCloverFermionR;
+typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
-typedef WilsonExpClover<WilsonImplF> WilsonExpCloverFermionF;
+typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
-typedef WilsonExpClover<WilsonImplD> WilsonExpCloverFermionD;
+typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
-typedef WilsonClover<WilsonAdjImplR> WilsonCloverAdjFermionR;
+typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
-typedef WilsonClover<WilsonAdjImplF> WilsonCloverAdjFermionF;
+typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
-typedef WilsonClover<WilsonAdjImplD> WilsonCloverAdjFermionD;
+typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
 typedef WilsonClover<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
 typedef WilsonClover<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
 typedef WilsonClover<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
 typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
 typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
 typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
 // Compact Clover fermions
 template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
 template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
 typedef CompactWilsonClover<WilsonImplR> CompactWilsonCloverFermionR;
 typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
 typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
 typedef CompactWilsonExpClover<WilsonImplR> CompactWilsonExpCloverFermionR;
 typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
 typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
 typedef CompactWilsonClover<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
 typedef CompactWilsonClover<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
 typedef CompactWilsonClover<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
 typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
 typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
 typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
 typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
 typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
 typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
 // Domain Wall fermions
 typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
@@ -49,8 +49,6 @@ public:
  virtual FermionField &tmp(void) = 0;
  virtual void DirichletBlock(const Coordinate & _Block) { assert(0); };
  GridBase * Grid(void)   { return FermionGrid(); };   // this is all the linalg routines need to know
  GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };
@@ -30,18 +30,6 @@ directory
 NAMESPACE_BEGIN(Grid);
 /*
  Policy implementation for G-parity boundary conditions
  Rather than treating the gauge field as a flavored field, the Grid implementation of G-parity treats the gauge field as a regular
  field with complex conjugate boundary conditions. In order to ensure the second flavor interacts with the conjugate links and the first
  with the regular links we overload the functionality of doubleStore, whose purpose is to store the gauge field and the barrel-shifted gauge field
  to avoid communicating links when applying the Dirac operator, such that the double-stored field contains also a flavor index which maps to
  either the link or the conjugate link. This flavored field is then used by multLink to apply the correct link to a spinor.
  Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
  mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
 */
 template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
 class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
 public:
@@ -125,7 +113,7 @@ public:
    || ((distance== 1)&&(icoor[direction]==1))
    || ((distance==-1)&&(icoor[direction]==0));
-    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu] && mmu < Nd-1; //only if we are going around the world in a spatial direction
+    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world
    //Apply the links
    int f_upper = permute_lane ? 1 : 0;
@@ -151,10 +139,10 @@ public:
    assert((distance == 1) || (distance == -1));  // nearest neighbour stencil hard code
    assert((sl == 1) || (sl == 2));
-    //If this site is an global boundary site, perform the G-parity flavor twist
+    if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
-    if ( mmu < Nd-1 && SE->_around_the_world && St.parameters.twists[mmu] ) {
+
      if ( sl == 2 ) {
-	//Only do the twist for lanes on the edge of the physical node
+       
 	ExtractBuffer<sobj> vals(Nsimd);
 	extract(chi,vals);
@@ -209,19 +197,6 @@ public:
    reg = memory;
  }
  //Poke 'poke_f0' onto flavor 0 and 'poke_f1' onto flavor 1 in direction mu of the doubled gauge field Uds
  inline void pokeGparityDoubledGaugeField(DoubledGaugeField &Uds, const GaugeLinkField &poke_f0, const GaugeLinkField &poke_f1, const int mu){
    autoView(poke_f0_v, poke_f0, CpuRead);
    autoView(poke_f1_v, poke_f1, CpuRead);
    autoView(Uds_v, Uds, CpuWrite);
    thread_foreach(ss,poke_f0_v,{
 	Uds_v[ss](0)(mu) = poke_f0_v[ss]();
 	Uds_v[ss](1)(mu) = poke_f1_v[ss]();
      });
  }
  inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
  {
    conformable(Uds.Grid(),GaugeGrid);
@@ -232,19 +207,14 @@ public:
    GaugeLinkField Uconj(GaugeGrid);
    Lattice<iScalar<vInteger> > coor(GaugeGrid);
-
+        
-    //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
+    for(int mu=0;mu<Nd;mu++){
-    //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs        
+          
-    for(int mu=0;mu<Nd-1;mu++){
+      LatticeCoordinate(coor,mu);
      if( Params.twists[mu] ){
 	LatticeCoordinate(coor,mu);
      }
      U     = PeekIndex<LorentzIndex>(Umu,mu);
      Uconj = conjugate(U);
      // Implement the isospin rotation sign on the boundary between f=1 and f=0
      // This phase could come from a simple bc 1,1,-1,1 ..
      int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
      if ( Params.twists[mu] ) { 
@@ -259,7 +229,7 @@ public:
 	thread_foreach(ss,U_v,{
 	    Uds_v[ss](0)(mu) = U_v[ss]();
 	    Uds_v[ss](1)(mu) = Uconj_v[ss]();
-	});
+	  });
      }
      U     = adj(Cshift(U    ,mu,-1));      // correct except for spanning the boundary
@@ -290,38 +260,6 @@ public:
        });
      }
    }
    { //periodic / antiperiodic temporal BCs
      int mu = Nd-1;
      int L   = GaugeGrid->GlobalDimensions()[mu];
      int Lmu = L - 1;
      LatticeCoordinate(coor, mu);
      U = PeekIndex<LorentzIndex>(Umu, mu); //Get t-directed links
      GaugeLinkField *Upoke = &U;
      if(Params.twists[mu]){ //antiperiodic
 	Utmp =  where(coor == Lmu, -U, U);
 	Upoke = &Utmp;
      }
      Uconj = conjugate(*Upoke); //second flavor interacts with conjugate links      
      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu);
      //Get the barrel-shifted field
      Utmp = adj(Cshift(U, mu, -1)); //is a forward shift!
      Upoke = &Utmp;
      if(Params.twists[mu]){
 	U = where(coor == 0, -Utmp, Utmp);  //boundary phase
 	Upoke = &U;
      }
      Uconj = conjugate(*Upoke);
      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4);
    }
  }
  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
@@ -360,48 +298,28 @@ public:
  inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
    assert(0);
  }
  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
    int Ls=Btilde.Grid()->_fdimensions[0];
    {
      GridBase *GaugeGrid = mat.Grid();
      Lattice<iScalar<vInteger> > coor(GaugeGrid);
      if( Params.twists[mu] ){
 	LatticeCoordinate(coor,mu);
      }
      autoView( mat_v , mat, AcceleratorWrite);
      autoView( Btilde_v , Btilde, AcceleratorRead);
      autoView( Atilde_v , Atilde, AcceleratorRead);
      accelerator_for(sss,mat.Grid()->oSites(), FermionField::vector_type::Nsimd(),{	  
  	  int sU=sss;
  	  typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
  	  ColorMatrixType sum;
  	  zeroit(sum);
  	  for(int s=0;s<Ls;s++){
  	    int sF = s+Ls*sU;
  	    for(int spn=0;spn<Ns;spn++){ //sum over spin
 	      //Flavor 0
  	      auto bb = coalescedRead(Btilde_v[sF](0)(spn) ); //color vector
  	      auto aa = coalescedRead(Atilde_v[sF](0)(spn) );
  	      sum = sum + outerProduct(bb,aa);
  	      //Flavor 1
  	      bb = coalescedRead(Btilde_v[sF](1)(spn) );
  	      aa = coalescedRead(Atilde_v[sF](1)(spn) );
  	      sum = sum + conjugate(outerProduct(bb,aa));
  	    }
  	  }	    
  	  coalescedWrite(mat_v[sU](mu)(), sum);
  	});
    }
  }
  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
    int Ls = Btilde.Grid()->_fdimensions[0];
    GaugeLinkField tmp(mat.Grid());
    tmp = Zero();
    {
      autoView( tmp_v , tmp, CpuWrite);
      autoView( Atilde_v , Atilde, CpuRead);
      autoView( Btilde_v , Btilde, CpuRead);
      thread_for(ss,tmp.Grid()->oSites(),{
 	  for (int s = 0; s < Ls; s++) {
 	    int sF = s + Ls * ss;
 	    auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
 	    tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
 	  }
 	});
    }
    PokeIndex<LorentzIndex>(mat, tmp, mu);
    return;
  }
 };
@@ -4,11 +4,10 @@
    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.h
-    Copyright (C) 2017 - 2022
+    Copyright (C) 2017
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: David Preti <>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -30,9 +29,7 @@
 #pragma once
-#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
+#include <Grid/Grid.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
@@ -52,16 +49,19 @@ NAMESPACE_BEGIN(Grid);
 // csw_r = csw_t to recover the isotropic version
 //////////////////////////////////////////////////////////////////
-template<class Impl, class CloverHelpers>
+template <class Impl>
-class WilsonCloverFermion : public WilsonFermion<Impl>,
+class WilsonCloverFermion : public WilsonFermion<Impl>
                            public WilsonCloverHelpers<Impl>
 {
 public:
  // Types definitions
  INHERIT_IMPL_TYPES(Impl);
-  INHERIT_CLOVER_TYPES(Impl);
+  template <typename vtype>
  using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
  typedef iImplClover<Simd> SiteCloverType;
  typedef Lattice<SiteCloverType> CloverFieldType;
-  typedef WilsonFermion<Impl>       WilsonBase;
+public:
-  typedef WilsonCloverHelpers<Impl> Helpers;
+  typedef WilsonFermion<Impl> WilsonBase;
  virtual int    ConstEE(void)     { return 0; };
  virtual void Instantiatable(void){};
@@ -72,7 +72,42 @@ public:
                      const RealD _csw_r = 0.0,
                      const RealD _csw_t = 0.0,
                      const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(),
-                      const ImplParams &impl_p = ImplParams());
+                      const ImplParams &impl_p = ImplParams()) : WilsonFermion<Impl>(_Umu,
                                                                                     Fgrid,
                                                                                     Hgrid,
                                                                                     _mass, impl_p, clover_anisotropy),
                                                                 CloverTerm(&Fgrid),
                                                                 CloverTermInv(&Fgrid),
                                                                 CloverTermEven(&Hgrid),
                                                                 CloverTermOdd(&Hgrid),
                                                                 CloverTermInvEven(&Hgrid),
                                                                 CloverTermInvOdd(&Hgrid),
                                                                 CloverTermDagEven(&Hgrid),
                                                                 CloverTermDagOdd(&Hgrid),
                                                                 CloverTermInvDagEven(&Hgrid),
                                                                 CloverTermInvDagOdd(&Hgrid)
  {
    assert(Nd == 4); // require 4 dimensions
    if (clover_anisotropy.isAnisotropic)
    {
      csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0;
      diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
    }
    else
    {
      csw_r = _csw_r * 0.5;
      diag_mass = 4.0 + _mass;
    }
    csw_t = _csw_t * 0.5;
    if (csw_r == 0)
      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
    if (csw_t == 0)
      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
    ImportGauge(_Umu);
  }
  virtual void M(const FermionField &in, FermionField &out);
  virtual void Mdag(const FermionField &in, FermionField &out);
@@ -89,21 +124,250 @@ public:
  void ImportGauge(const GaugeField &_Umu);
  // Derivative parts unpreconditioned pseudofermions
-  void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag);
+  void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
  {
    conformable(X.Grid(), Y.Grid());
    conformable(X.Grid(), force.Grid());
    GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
    GaugeField clover_force(force.Grid());
    PropagatorField Lambda(force.Grid());
-public:
+    // Guido: Here we are hitting some performance issues:
    // need to extract the components of the DoubledGaugeField
    // for each call
    // Possible solution
    // Create a vector object to store them? (cons: wasting space)
    std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
    Impl::extractLinkField(U, this->Umu);
    force = Zero();
    // Derivative of the Wilson hopping term
    this->DhopDeriv(force, X, Y, dag);
    ///////////////////////////////////////////////////////////
    // Clover term derivative
    ///////////////////////////////////////////////////////////
    Impl::outerProductImpl(Lambda, X, Y);
    //std::cout << "Lambda:" << Lambda << std::endl;
    Gamma::Algebra sigma[] = {
        Gamma::Algebra::SigmaXY,
        Gamma::Algebra::SigmaXZ,
        Gamma::Algebra::SigmaXT,
        Gamma::Algebra::MinusSigmaXY,
        Gamma::Algebra::SigmaYZ,
        Gamma::Algebra::SigmaYT,
        Gamma::Algebra::MinusSigmaXZ,
        Gamma::Algebra::MinusSigmaYZ,
        Gamma::Algebra::SigmaZT,
        Gamma::Algebra::MinusSigmaXT,
        Gamma::Algebra::MinusSigmaYT,
        Gamma::Algebra::MinusSigmaZT};
    /*
      sigma_{\mu \nu}=
      | 0         sigma[0]  sigma[1]  sigma[2] |
      | sigma[3]    0       sigma[4]  sigma[5] |
      | sigma[6]  sigma[7]     0      sigma[8] |
      | sigma[9]  sigma[10] sigma[11]   0      |
    */
    int count = 0;
    clover_force = Zero();
    for (int mu = 0; mu < 4; mu++)
    {
      force_mu = Zero();
      for (int nu = 0; nu < 4; nu++)
      {
        if (mu == nu)
        continue;
        RealD factor;
        if (nu == 4 || mu == 4)
        {
          factor = 2.0 * csw_t;
        }
        else
        {
          factor = 2.0 * csw_r;
        }
        PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
        Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
        force_mu -= factor*Cmunu(U, lambda, mu, nu);                   // checked
        count++;
      }
      pokeLorentz(clover_force, U[mu] * force_mu, mu);
    }
    //clover_force *= csw;
    force += clover_force;
  }
  // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
  GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
  {
    conformable(lambda.Grid(), U[0].Grid());
    GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());
    // insertion in upper staple
    // please check redundancy of shift operations
    // C1+
    tmp = lambda * U[nu];
    out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
    // C2+
    tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
    // C3+
    tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
    // C4+
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
    // insertion in lower staple
    // C1-
    out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
    // C2-
    tmp = adj(lambda) * U[nu];
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
    // C3-
    tmp = lambda * U[nu];
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
    // C4-
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
    return out;
  }
 protected:
  // here fixing the 4 dimensions, make it more general?
  RealD csw_r;                                               // Clover coefficient - spatial
  RealD csw_t;                                               // Clover coefficient - temporal
  RealD diag_mass;                                           // Mass term
-  CloverField CloverTerm, CloverTermInv;                     // Clover term
+  CloverFieldType CloverTerm, CloverTermInv;                 // Clover term
-  CloverField CloverTermEven, CloverTermOdd;                 // Clover term EO
+  CloverFieldType CloverTermEven, CloverTermOdd;             // Clover term EO
-  CloverField CloverTermInvEven, CloverTermInvOdd;           // Clover term Inv EO
+  CloverFieldType CloverTermInvEven, CloverTermInvOdd;       // Clover term Inv EO
-  CloverField CloverTermDagEven, CloverTermDagOdd;           // Clover term Dag EO
+  CloverFieldType CloverTermDagEven, CloverTermDagOdd;       // Clover term Dag EO
-  CloverField CloverTermInvDagEven, CloverTermInvDagOdd;     // Clover term Inv Dag EO
+  CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
 };
 public:
  // eventually these can be compressed into 6x6 blocks instead of the 12x12
  // using the DeGrand-Rossi basis for the gamma matrices
  CloverFieldType fillCloverYZ(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView(T_v,T,AcceleratorWrite);
    autoView(F_v,F,AcceleratorRead);
    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
      T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
      T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
    });
    return T;
  }
  CloverFieldType fillCloverXZ(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView(T_v, T,AcceleratorWrite);
    autoView(F_v, F,AcceleratorRead);
    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = -F_v[i]()();
      T_v[i]()(1, 0) = F_v[i]()();
      T_v[i]()(2, 3) = -F_v[i]()();
      T_v[i]()(3, 2) = F_v[i]()();
    });
    return T;
  }
  CloverFieldType fillCloverXY(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView(T_v,T,AcceleratorWrite);
    autoView(F_v,F,AcceleratorRead);
    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
      T_v[i]()(1, 1) = timesI(F_v[i]()());
      T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 3) = timesI(F_v[i]()());
    });
    return T;
  }
  CloverFieldType fillCloverXT(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView( T_v , T, AcceleratorWrite);
    autoView( F_v , F, AcceleratorRead);
    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = timesI(F_v[i]()());
      T_v[i]()(1, 0) = timesI(F_v[i]()());
      T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
    });
    return T;
  }
  CloverFieldType fillCloverYT(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView( T_v ,T,AcceleratorWrite);
    autoView( F_v ,F,AcceleratorRead);
    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = -(F_v[i]()());
      T_v[i]()(1, 0) = (F_v[i]()());
      T_v[i]()(2, 3) = (F_v[i]()());
      T_v[i]()(3, 2) = -(F_v[i]()());
    });
    return T;
  }
  CloverFieldType fillCloverZT(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView( T_v , T,AcceleratorWrite);
    autoView( F_v , F,AcceleratorRead);
    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 0) = timesI(F_v[i]()());
      T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
      T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 3) = timesI(F_v[i]()());
    });
    return T;
  }
 };
 NAMESPACE_END(Grid);
@@ -1,763 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/WilsonCloverHelpers.h
    Copyright (C) 2021 - 2022
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 // Helper routines that implement common clover functionality
 NAMESPACE_BEGIN(Grid);
 template<class Impl> class WilsonCloverHelpers {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
  {
    conformable(lambda.Grid(), U[0].Grid());
    GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());
    // insertion in upper staple
    // please check redundancy of shift operations
    // C1+
    tmp = lambda * U[nu];
    out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
    // C2+
    tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
    // C3+
    tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
    // C4+
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
    // insertion in lower staple
    // C1-
    out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
    // C2-
    tmp = adj(lambda) * U[nu];
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
    // C3-
    tmp = lambda * U[nu];
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
    // C4-
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
    return out;
  }
  static CloverField fillCloverYZ(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView(T_v,T,AcceleratorWrite);
    autoView(F_v,F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 1), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 0), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 3), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 2), coalescedRead(timesMinusI(F_v[i]()())));
    });
    return T;
  }
  static CloverField fillCloverXZ(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView(T_v, T,AcceleratorWrite);
    autoView(F_v, F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 1), coalescedRead(-F_v[i]()()));
      coalescedWrite(T_v[i]()(1, 0), coalescedRead(F_v[i]()()));
      coalescedWrite(T_v[i]()(2, 3), coalescedRead(-F_v[i]()()));
      coalescedWrite(T_v[i]()(3, 2), coalescedRead(F_v[i]()()));
    });
    return T;
  }
  static CloverField fillCloverXY(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView(T_v,T,AcceleratorWrite);
    autoView(F_v,F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 0), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 1), coalescedRead(timesI(F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 2), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 3), coalescedRead(timesI(F_v[i]()())));
    });
    return T;
  }
  static CloverField fillCloverXT(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView( T_v , T, AcceleratorWrite);
    autoView( F_v , F, AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 1), coalescedRead(timesI(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 0), coalescedRead(timesI(F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 3), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 2), coalescedRead(timesMinusI(F_v[i]()())));
    });
    return T;
  }
  static CloverField fillCloverYT(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView( T_v ,T,AcceleratorWrite);
    autoView( F_v ,F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 1), coalescedRead(-(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 0), coalescedRead((F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 3), coalescedRead((F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 2), coalescedRead(-(F_v[i]()())));
    });
    return T;
  }
  static CloverField fillCloverZT(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView( T_v , T,AcceleratorWrite);
    autoView( F_v , F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 0), coalescedRead(timesI(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 1), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 2), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 3), coalescedRead(timesI(F_v[i]()())));
    });
    return T;
  }
  template<class _Spinor>
  static accelerator_inline void multClover(_Spinor& phi, const SiteClover& C, const _Spinor& chi) {
    auto CC = coalescedRead(C);
    mult(&phi, &CC, &chi);
  }
  template<class _SpinorField>
  inline void multCloverField(_SpinorField& out, const CloverField& C, const _SpinorField& phi) {
    const int Nsimd = SiteSpinor::Nsimd();
    autoView(out_v, out, AcceleratorWrite);
    autoView(phi_v, phi, AcceleratorRead);
    autoView(C_v,   C,   AcceleratorRead);
    typedef decltype(coalescedRead(out_v[0])) calcSpinor;
    accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
      calcSpinor tmp;
      multClover(tmp,C_v[sss],phi_v(sss));
      coalescedWrite(out_v[sss],tmp);
    });
  }
 };
 ////////////////////////////////////////////////////////
 template<class Impl> class CompactWilsonCloverHelpers {
 public:
  INHERIT_COMPACT_CLOVER_SIZES(Impl);
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);
  #if 0
  static accelerator_inline typename SiteCloverTriangle::vector_type triangle_elem(const SiteCloverTriangle& triangle, int block, int i, int j) {
    assert(i != j);
    if(i < j) {
      return triangle()(block)(triangle_index(i, j));
    } else { // i > j
      return conjugate(triangle()(block)(triangle_index(i, j)));
    }
  }
  #else
  template<typename vobj>
  static accelerator_inline vobj triangle_elem(const iImplCloverTriangle<vobj>& triangle, int block, int i, int j) {
    assert(i != j);
    if(i < j) {
      return triangle()(block)(triangle_index(i, j));
    } else { // i > j
      return conjugate(triangle()(block)(triangle_index(i, j)));
    }
  }
  #endif
  static accelerator_inline int triangle_index(int i, int j) {
    if(i == j)
      return 0;
    else if(i < j)
      return Nred * (Nred - 1) / 2 - (Nred - i) * (Nred - i - 1) / 2 + j - i - 1;
    else // i > j
      return Nred * (Nred - 1) / 2 - (Nred - j) * (Nred - j - 1) / 2 + i - j - 1;
  }
  static void MooeeKernel_gpu(int                        Nsite,
                              int                        Ls,
                              const FermionField&        in,
                              FermionField&              out,
                              const CloverDiagonalField& diagonal,
                              const CloverTriangleField& triangle) {
    autoView(diagonal_v, diagonal, AcceleratorRead);
    autoView(triangle_v, triangle, AcceleratorRead);
    autoView(in_v,       in,       AcceleratorRead);
    autoView(out_v,      out,      AcceleratorWrite);
    typedef decltype(coalescedRead(out_v[0])) CalcSpinor;
    const uint64_t NN = Nsite * Ls;
    accelerator_for(ss, NN, Simd::Nsimd(), {
      int sF = ss;
      int sU = ss/Ls;
      CalcSpinor res;
      CalcSpinor in_t = in_v(sF);
      auto diagonal_t = diagonal_v(sU);
      auto triangle_t = triangle_v(sU);
      for(int block=0; block<Nhs; block++) {
        int s_start = block*Nhs;
        for(int i=0; i<Nred; i++) {
          int si = s_start + i/Nc, ci = i%Nc;
          res()(si)(ci) = diagonal_t()(block)(i) * in_t()(si)(ci);
          for(int j=0; j<Nred; j++) {
            if (j == i) continue;
            int sj = s_start + j/Nc, cj = j%Nc;
            res()(si)(ci) = res()(si)(ci) + triangle_elem(triangle_t, block, i, j) * in_t()(sj)(cj);
          };
        };
      };
      coalescedWrite(out_v[sF], res);
    });
  }
  static void MooeeKernel_cpu(int                        Nsite,
                              int                        Ls,
                              const FermionField&        in,
                              FermionField&              out,
                              const CloverDiagonalField& diagonal,
                              const CloverTriangleField& triangle) {
    autoView(diagonal_v, diagonal, CpuRead);
    autoView(triangle_v, triangle, CpuRead);
    autoView(in_v,       in,       CpuRead);
    autoView(out_v,      out,      CpuWrite);
    typedef SiteSpinor CalcSpinor;
 #if defined(A64FX) || defined(A64FXFIXEDSIZE)
 #define PREFETCH_CLOVER(BASE) {                                     \
    uint64_t base;                                                  \
    int pf_dist_L1 = 1;                                             \
    int pf_dist_L2 = -5; /* -> penalty -> disable */                \
                                                                    \
    if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) {           \
      base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0);               \
      svprfd(svptrue_b64(), (int64_t*)(base +    0), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  256), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  512), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  768), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL1STRM); \
    }                                                               \
                                                                    \
    if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) {           \
      base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0);               \
      svprfd(svptrue_b64(), (int64_t*)(base +    0), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  256), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  512), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  768), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL2STRM); \
    }                                                               \
  }
 // TODO: Implement/generalize this for other architectures
 // I played around a bit on KNL (see below) but didn't bring anything
 // #elif defined(AVX512)
 // #define PREFETCH_CLOVER(BASE) {                              \
 //     uint64_t base;                                           \
 //     int pf_dist_L1 = 1;                                      \
 //     int pf_dist_L2 = +4;                                     \
 //                                                              \
 //     if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) {    \
 //       base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0);        \
 //       _mm_prefetch((const char*)(base +    0), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +   64), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +  128), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +  192), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +  256), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +  320), _MM_HINT_T0); \
 //     }                                                        \
 //                                                              \
 //     if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) {    \
 //       base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0);        \
 //       _mm_prefetch((const char*)(base +    0), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +   64), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +  128), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +  192), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +  256), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +  320), _MM_HINT_T1); \
 //     }                                                        \
 //   }
 #else
 #define PREFETCH_CLOVER(BASE)
 #endif
    const uint64_t NN = Nsite * Ls;
    thread_for(ss, NN, {
      int sF = ss;
      int sU = ss/Ls;
      CalcSpinor res;
      CalcSpinor in_t = in_v[sF];
      auto diag_t     = diagonal_v[sU]; // "diag" instead of "diagonal" here to make code below easier to read
      auto triangle_t = triangle_v[sU];
      // upper half
      PREFETCH_CLOVER(0);
      auto in_cc_0_0 = conjugate(in_t()(0)(0)); // Nils: reduces number
      auto in_cc_0_1 = conjugate(in_t()(0)(1)); // of conjugates from
      auto in_cc_0_2 = conjugate(in_t()(0)(2)); // 30 to 20
      auto in_cc_1_0 = conjugate(in_t()(1)(0));
      auto in_cc_1_1 = conjugate(in_t()(1)(1));
      res()(0)(0) =               diag_t()(0)( 0) * in_t()(0)(0)
                  +           triangle_t()(0)( 0) * in_t()(0)(1)
                  +           triangle_t()(0)( 1) * in_t()(0)(2)
                  +           triangle_t()(0)( 2) * in_t()(1)(0)
                  +           triangle_t()(0)( 3) * in_t()(1)(1)
                  +           triangle_t()(0)( 4) * in_t()(1)(2);
      res()(0)(1) =           triangle_t()(0)( 0) * in_cc_0_0;
      res()(0)(1) =               diag_t()(0)( 1) * in_t()(0)(1)
                  +           triangle_t()(0)( 5) * in_t()(0)(2)
                  +           triangle_t()(0)( 6) * in_t()(1)(0)
                  +           triangle_t()(0)( 7) * in_t()(1)(1)
                  +           triangle_t()(0)( 8) * in_t()(1)(2)
                  + conjugate(       res()(0)( 1));
      res()(0)(2) =           triangle_t()(0)( 1) * in_cc_0_0
                  +           triangle_t()(0)( 5) * in_cc_0_1;
      res()(0)(2) =               diag_t()(0)( 2) * in_t()(0)(2)
                  +           triangle_t()(0)( 9) * in_t()(1)(0)
                  +           triangle_t()(0)(10) * in_t()(1)(1)
                  +           triangle_t()(0)(11) * in_t()(1)(2)
                  + conjugate(       res()(0)( 2));
      res()(1)(0) =           triangle_t()(0)( 2) * in_cc_0_0
                  +           triangle_t()(0)( 6) * in_cc_0_1
                  +           triangle_t()(0)( 9) * in_cc_0_2;
      res()(1)(0) =               diag_t()(0)( 3) * in_t()(1)(0)
                  +           triangle_t()(0)(12) * in_t()(1)(1)
                  +           triangle_t()(0)(13) * in_t()(1)(2)
                  + conjugate(       res()(1)( 0));
      res()(1)(1) =           triangle_t()(0)( 3) * in_cc_0_0
                  +           triangle_t()(0)( 7) * in_cc_0_1
                  +           triangle_t()(0)(10) * in_cc_0_2
                  +           triangle_t()(0)(12) * in_cc_1_0;
      res()(1)(1) =               diag_t()(0)( 4) * in_t()(1)(1)
                  +           triangle_t()(0)(14) * in_t()(1)(2)
                  + conjugate(       res()(1)( 1));
      res()(1)(2) =           triangle_t()(0)( 4) * in_cc_0_0
                  +           triangle_t()(0)( 8) * in_cc_0_1
                  +           triangle_t()(0)(11) * in_cc_0_2
                  +           triangle_t()(0)(13) * in_cc_1_0
                  +           triangle_t()(0)(14) * in_cc_1_1;
      res()(1)(2) =               diag_t()(0)( 5) * in_t()(1)(2)
                  + conjugate(       res()(1)( 2));
      vstream(out_v[sF]()(0)(0), res()(0)(0));
      vstream(out_v[sF]()(0)(1), res()(0)(1));
      vstream(out_v[sF]()(0)(2), res()(0)(2));
      vstream(out_v[sF]()(1)(0), res()(1)(0));
      vstream(out_v[sF]()(1)(1), res()(1)(1));
      vstream(out_v[sF]()(1)(2), res()(1)(2));
      // lower half
      PREFETCH_CLOVER(1);
      auto in_cc_2_0 = conjugate(in_t()(2)(0));
      auto in_cc_2_1 = conjugate(in_t()(2)(1));
      auto in_cc_2_2 = conjugate(in_t()(2)(2));
      auto in_cc_3_0 = conjugate(in_t()(3)(0));
      auto in_cc_3_1 = conjugate(in_t()(3)(1));
      res()(2)(0) =               diag_t()(1)( 0) * in_t()(2)(0)
                  +           triangle_t()(1)( 0) * in_t()(2)(1)
                  +           triangle_t()(1)( 1) * in_t()(2)(2)
                  +           triangle_t()(1)( 2) * in_t()(3)(0)
                  +           triangle_t()(1)( 3) * in_t()(3)(1)
                  +           triangle_t()(1)( 4) * in_t()(3)(2);
      res()(2)(1) =           triangle_t()(1)( 0) * in_cc_2_0;
      res()(2)(1) =               diag_t()(1)( 1) * in_t()(2)(1)
                  +           triangle_t()(1)( 5) * in_t()(2)(2)
                  +           triangle_t()(1)( 6) * in_t()(3)(0)
                  +           triangle_t()(1)( 7) * in_t()(3)(1)
                  +           triangle_t()(1)( 8) * in_t()(3)(2)
                  + conjugate(       res()(2)( 1));
      res()(2)(2) =           triangle_t()(1)( 1) * in_cc_2_0
                  +           triangle_t()(1)( 5) * in_cc_2_1;
      res()(2)(2) =               diag_t()(1)( 2) * in_t()(2)(2)
                  +           triangle_t()(1)( 9) * in_t()(3)(0)
                  +           triangle_t()(1)(10) * in_t()(3)(1)
                  +           triangle_t()(1)(11) * in_t()(3)(2)
                  + conjugate(       res()(2)( 2));
      res()(3)(0) =           triangle_t()(1)( 2) * in_cc_2_0
                  +           triangle_t()(1)( 6) * in_cc_2_1
                  +           triangle_t()(1)( 9) * in_cc_2_2;
      res()(3)(0) =               diag_t()(1)( 3) * in_t()(3)(0)
                  +           triangle_t()(1)(12) * in_t()(3)(1)
                  +           triangle_t()(1)(13) * in_t()(3)(2)
                  + conjugate(       res()(3)( 0));
      res()(3)(1) =           triangle_t()(1)( 3) * in_cc_2_0
                  +           triangle_t()(1)( 7) * in_cc_2_1
                  +           triangle_t()(1)(10) * in_cc_2_2
                  +           triangle_t()(1)(12) * in_cc_3_0;
      res()(3)(1) =               diag_t()(1)( 4) * in_t()(3)(1)
                  +           triangle_t()(1)(14) * in_t()(3)(2)
                  + conjugate(       res()(3)( 1));
      res()(3)(2) =           triangle_t()(1)( 4) * in_cc_2_0
                  +           triangle_t()(1)( 8) * in_cc_2_1
                  +           triangle_t()(1)(11) * in_cc_2_2
                  +           triangle_t()(1)(13) * in_cc_3_0
                  +           triangle_t()(1)(14) * in_cc_3_1;
      res()(3)(2) =               diag_t()(1)( 5) * in_t()(3)(2)
                  + conjugate(       res()(3)( 2));
      vstream(out_v[sF]()(2)(0), res()(2)(0));
      vstream(out_v[sF]()(2)(1), res()(2)(1));
      vstream(out_v[sF]()(2)(2), res()(2)(2));
      vstream(out_v[sF]()(3)(0), res()(3)(0));
      vstream(out_v[sF]()(3)(1), res()(3)(1));
      vstream(out_v[sF]()(3)(2), res()(3)(2));
    });
  }
  static void MooeeKernel(int                        Nsite,
                          int                        Ls,
                          const FermionField&        in,
                          FermionField&              out,
                          const CloverDiagonalField& diagonal,
                          const CloverTriangleField& triangle) {
 #if defined(GRID_CUDA) || defined(GRID_HIP)
    MooeeKernel_gpu(Nsite, Ls, in, out, diagonal, triangle);
 #else
    MooeeKernel_cpu(Nsite, Ls, in, out, diagonal, triangle);
 #endif
  }
  static void Invert(const CloverDiagonalField& diagonal,
                     const CloverTriangleField& triangle,
                     CloverDiagonalField&       diagonalInv,
                     CloverTriangleField&       triangleInv) {
    conformable(diagonal, diagonalInv);
    conformable(triangle, triangleInv);
    conformable(diagonal, triangle);
    diagonalInv.Checkerboard() = diagonal.Checkerboard();
    triangleInv.Checkerboard() = triangle.Checkerboard();
    GridBase* grid = diagonal.Grid();
    long lsites = grid->lSites();
    typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
    typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
    autoView(diagonal_v,  diagonal,  CpuRead);
    autoView(triangle_v,  triangle,  CpuRead);
    autoView(diagonalInv_v, diagonalInv, CpuWrite);
    autoView(triangleInv_v, triangleInv, CpuWrite);
    thread_for(site, lsites, { // NOTE: Not on GPU because of Eigen & (peek/poke)LocalSite
      Eigen::MatrixXcd clover_inv_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
      Eigen::MatrixXcd clover_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
      scalar_object_diagonal diagonal_tmp     = Zero();
      scalar_object_diagonal diagonal_inv_tmp = Zero();
      scalar_object_triangle triangle_tmp     = Zero();
      scalar_object_triangle triangle_inv_tmp = Zero();
      Coordinate lcoor;
      grid->LocalIndexToLocalCoor(site, lcoor);
      peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
      peekLocalSite(triangle_tmp, triangle_v, lcoor);
      // TODO: can we save time here by inverting the two 6x6 hermitian matrices separately?
      for (long s_row=0;s_row<Ns;s_row++) {
        for (long s_col=0;s_col<Ns;s_col++) {
          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
          int block       = s_row / Nhs;
          int s_row_block = s_row % Nhs;
          int s_col_block = s_col % Nhs;
          for (long c_row=0;c_row<Nc;c_row++) {
            for (long c_col=0;c_col<Nc;c_col++) {
              int i = s_row_block * Nc + c_row;
              int j = s_col_block * Nc + c_col;
              if(i == j)
                clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
              else
                clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(triangle_elem(triangle_tmp, block, i, j)));
            }
          }
        }
      }
      clover_inv_eigen = clover_eigen.inverse();
      for (long s_row=0;s_row<Ns;s_row++) {
        for (long s_col=0;s_col<Ns;s_col++) {
          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
          int block       = s_row / Nhs;
          int s_row_block = s_row % Nhs;
          int s_col_block = s_col % Nhs;
          for (long c_row=0;c_row<Nc;c_row++) {
            for (long c_col=0;c_col<Nc;c_col++) {
              int i = s_row_block * Nc + c_row;
              int j = s_col_block * Nc + c_col;
              if(i == j)
                diagonal_inv_tmp()(block)(i) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
              else if(i < j)
                triangle_inv_tmp()(block)(triangle_index(i, j)) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
              else
                continue;
            }
          }
        }
      }
      pokeLocalSite(diagonal_inv_tmp, diagonalInv_v, lcoor);
      pokeLocalSite(triangle_inv_tmp, triangleInv_v, lcoor);
    });
  }
  static void ConvertLayout(const CloverField&   full,
                            CloverDiagonalField& diagonal,
                            CloverTriangleField& triangle) {
    conformable(full, diagonal);
    conformable(full, triangle);
    diagonal.Checkerboard() = full.Checkerboard();
    triangle.Checkerboard() = full.Checkerboard();
    autoView(full_v,     full,     AcceleratorRead);
    autoView(diagonal_v, diagonal, AcceleratorWrite);
    autoView(triangle_v, triangle, AcceleratorWrite);
    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
    accelerator_for(ss, full.Grid()->oSites(), 1, {
      for(int s_row = 0; s_row < Ns; s_row++) {
        for(int s_col = 0; s_col < Ns; s_col++) {
          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
          int block       = s_row / Nhs;
          int s_row_block = s_row % Nhs;
          int s_col_block = s_col % Nhs;
          for(int c_row = 0; c_row < Nc; c_row++) {
            for(int c_col = 0; c_col < Nc; c_col++) {
              int i = s_row_block * Nc + c_row;
              int j = s_col_block * Nc + c_col;
              if(i == j)
                diagonal_v[ss]()(block)(i) = full_v[ss]()(s_row, s_col)(c_row, c_col);
              else if(i < j)
                triangle_v[ss]()(block)(triangle_index(i, j)) = full_v[ss]()(s_row, s_col)(c_row, c_col);
              else
                continue;
            }
          }
        }
      }
    });
  }
  static void ConvertLayout(const CloverDiagonalField& diagonal,
                            const CloverTriangleField& triangle,
                            CloverField&               full) {
    conformable(full, diagonal);
    conformable(full, triangle);
    full.Checkerboard() = diagonal.Checkerboard();
    full = Zero();
    autoView(diagonal_v, diagonal, AcceleratorRead);
    autoView(triangle_v, triangle, AcceleratorRead);
    autoView(full_v,     full,     AcceleratorWrite);
    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
    accelerator_for(ss, full.Grid()->oSites(), 1, {
      for(int s_row = 0; s_row < Ns; s_row++) {
        for(int s_col = 0; s_col < Ns; s_col++) {
          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
          int block       = s_row / Nhs;
          int s_row_block = s_row % Nhs;
          int s_col_block = s_col % Nhs;
          for(int c_row = 0; c_row < Nc; c_row++) {
            for(int c_col = 0; c_col < Nc; c_col++) {
              int i = s_row_block * Nc + c_row;
              int j = s_col_block * Nc + c_col;
              if(i == j)
                full_v[ss]()(s_row, s_col)(c_row, c_col) = diagonal_v[ss]()(block)(i);
              else
                full_v[ss]()(s_row, s_col)(c_row, c_col) = triangle_elem(triangle_v[ss], block, i, j);
            }
          }
        }
      }
    });
  }
  static void ModifyBoundaries(CloverDiagonalField& diagonal, CloverTriangleField& triangle, RealD csw_t, RealD cF, RealD diag_mass) {
    // Checks/grid
    double t0 = usecond();
    conformable(diagonal, triangle);
    GridBase* grid = diagonal.Grid();
    // Determine the boundary coordinates/sites
    double t1 = usecond();
    int t_dir = Nd - 1;
    Lattice<iScalar<vInteger>> t_coor(grid);
    LatticeCoordinate(t_coor, t_dir);
    int T = grid->GlobalDimensions()[t_dir];
    // Set off-diagonal parts at boundary to zero -- OK
    double t2 = usecond();
    CloverTriangleField zeroTriangle(grid);
    zeroTriangle.Checkerboard() = triangle.Checkerboard();
    zeroTriangle = Zero();
    triangle = where(t_coor == 0,   zeroTriangle, triangle);
    triangle = where(t_coor == T-1, zeroTriangle, triangle);
    // Set diagonal to unity (scaled correctly) -- OK
    double t3 = usecond();
    CloverDiagonalField tmp(grid);
    tmp.Checkerboard() = diagonal.Checkerboard();
    tmp                = -1.0 * csw_t + diag_mass;
    diagonal           = where(t_coor == 0,   tmp, diagonal);
    diagonal           = where(t_coor == T-1, tmp, diagonal);
    // Correct values next to boundary
    double t4 = usecond();
    if(cF != 1.0) {
      tmp = cF - 1.0;
      tmp += diagonal;
      diagonal = where(t_coor == 1,   tmp, diagonal);
      diagonal = where(t_coor == T-2, tmp, diagonal);
    }
    // Report timings
    double t5 = usecond();
 #if 0
    std::cout << GridLogMessage << "CompactWilsonCloverHelpers::ModifyBoundaries timings:"
              << " checks = "          << (t1 - t0) / 1e6
              << ", coordinate = "     << (t2 - t1) / 1e6
              << ", off-diag zero = "  << (t3 - t2) / 1e6
              << ", diagonal unity = " << (t4 - t3) / 1e6
              << ", near-boundary = "  << (t5 - t4) / 1e6
              << ", total = "          << (t5 - t0) / 1e6
              << std::endl;
 #endif
  }
  template<class Field, class Mask>
  static strong_inline void ApplyBoundaryMask(Field& f, const Mask& m) {
    conformable(f, m);
    auto grid  = f.Grid();
    const uint32_t Nsite = grid->oSites();
    const uint32_t Nsimd = grid->Nsimd();
    autoView(f_v, f, AcceleratorWrite);
    autoView(m_v, m, AcceleratorRead);
    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
    accelerator_for(ss, Nsite, Nsimd, {
      coalescedWrite(f_v[ss], m_v(ss) * f_v(ss));
    });
  }
  template<class MaskField>
  static void SetupMasks(MaskField& full, MaskField& even, MaskField& odd) {
    assert(even.Grid()->_isCheckerBoarded && even.Checkerboard() == Even);
    assert(odd.Grid()->_isCheckerBoarded  && odd.Checkerboard()  == Odd);
    assert(!full.Grid()->_isCheckerBoarded);
    GridBase* grid = full.Grid();
    int t_dir = Nd-1;
    Lattice<iScalar<vInteger>> t_coor(grid);
    LatticeCoordinate(t_coor, t_dir);
    int T = grid->GlobalDimensions()[t_dir];
    MaskField zeroMask(grid); zeroMask = Zero();
    full = 1.0;
    full = where(t_coor == 0,   zeroMask, full);
    full = where(t_coor == T-1, zeroMask, full);
    pickCheckerboard(Even, even, full);
    pickCheckerboard(Odd,  odd,  full);
  }
 };
 NAMESPACE_END(Grid);
@@ -1,90 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/WilsonCloverTypes.h
    Copyright (C) 2021 - 2022
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 template<class Impl>
 class WilsonCloverTypes {
 public:
  INHERIT_IMPL_TYPES(Impl);
  template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
  typedef iImplClover<Simd> SiteClover;
  typedef Lattice<SiteClover> CloverField;
 };
 template<class Impl>
 class CompactWilsonCloverTypes {
 public:
  INHERIT_IMPL_TYPES(Impl);
  static constexpr int Nred      = Nc * Nhs;        // 6
  static constexpr int Nblock    = Nhs;             // 2
  static constexpr int Ndiagonal = Nred;            // 6
  static constexpr int Ntriangle = (Nred - 1) * Nc; // 15
  template<typename vtype> using iImplCloverDiagonal = iScalar<iVector<iVector<vtype, Ndiagonal>, Nblock>>;
  template<typename vtype> using iImplCloverTriangle = iScalar<iVector<iVector<vtype, Ntriangle>, Nblock>>;
  typedef iImplCloverDiagonal<Simd> SiteCloverDiagonal;
  typedef iImplCloverTriangle<Simd> SiteCloverTriangle;
  typedef iSinglet<Simd>            SiteMask;
  typedef Lattice<SiteCloverDiagonal> CloverDiagonalField;
  typedef Lattice<SiteCloverTriangle> CloverTriangleField;
  typedef Lattice<SiteMask>           MaskField;
 };
 #define INHERIT_CLOVER_TYPES(Impl)                                 \
  typedef typename WilsonCloverTypes<Impl>::SiteClover SiteClover; \
  typedef typename WilsonCloverTypes<Impl>::CloverField CloverField;
 #define INHERIT_COMPACT_CLOVER_TYPES(Impl) \
  typedef typename CompactWilsonCloverTypes<Impl>::SiteCloverDiagonal  SiteCloverDiagonal; \
  typedef typename CompactWilsonCloverTypes<Impl>::SiteCloverTriangle  SiteCloverTriangle; \
  typedef typename CompactWilsonCloverTypes<Impl>::SiteMask            SiteMask; \
  typedef typename CompactWilsonCloverTypes<Impl>::CloverDiagonalField CloverDiagonalField; \
  typedef typename CompactWilsonCloverTypes<Impl>::CloverTriangleField CloverTriangleField; \
  typedef typename CompactWilsonCloverTypes<Impl>::MaskField           MaskField; \
  /* ugly duplication but needed inside functionality classes */ \
  template<typename vtype> using iImplCloverDiagonal = \
    iScalar<iVector<iVector<vtype, CompactWilsonCloverTypes<Impl>::Ndiagonal>, CompactWilsonCloverTypes<Impl>::Nblock>>; \
  template<typename vtype> using iImplCloverTriangle = \
    iScalar<iVector<iVector<vtype, CompactWilsonCloverTypes<Impl>::Ntriangle>, CompactWilsonCloverTypes<Impl>::Nblock>>;
 #define INHERIT_COMPACT_CLOVER_SIZES(Impl)                                    \
  static constexpr int Nred      = CompactWilsonCloverTypes<Impl>::Nred;      \
  static constexpr int Nblock    = CompactWilsonCloverTypes<Impl>::Nblock;    \
  static constexpr int Ndiagonal = CompactWilsonCloverTypes<Impl>::Ndiagonal; \
  static constexpr int Ntriangle = CompactWilsonCloverTypes<Impl>::Ntriangle;
 NAMESPACE_END(Grid);
@@ -297,7 +297,7 @@ public:
  void ZeroCountersi(void)  {  }
  void Reporti(int calls)  {  }
-  //  Vector<int> surface_list;
+  std::vector<int> surface_list;
  WilsonStencil(GridBase *grid,
 		int npoints,
@@ -307,11 +307,10 @@ public:
    : CartesianStencil<vobj,cobj,Parameters> (grid,npoints,checkerboard,directions,distances,p) 
  { 
    ZeroCountersi();
-    //    surface_list.resize(0);
+    surface_list.resize(0);
    this->same_node.resize(npoints);
  };
  /*
  void BuildSurfaceList(int Ls,int vol4){
    // find same node for SHM
@@ -332,8 +331,7 @@ public:
      }
    }
  }
-  */
+
  template < class compressor>
  void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) 
  {
@@ -75,10 +75,6 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }
  int Dirichlet;
  Coordinate Block; 
  /********** Deprecate timers **********/
  void Report(void);
  void ZeroCounters(void);
  double DhopCalls;
@@ -177,10 +173,7 @@ public:
 		  GridCartesian         &FourDimGrid,
 		  GridRedBlackCartesian &FourDimRedBlackGrid,
 		  double _M5,const ImplParams &p= ImplParams());
-
+    
  virtual void DirichletBlock(const Coordinate & block)
  {
  }
  // Constructors
  /*
    WilsonFermion5D(int simd, 
@@ -47,7 +47,7 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
 			FiveDimRedBlackGrid,
 			FourDimGrid,
 			FourDimRedBlackGrid,_M5,p),
-  mass_plus(_mass), mass_minus(_mass)
+  mass(_mass)
 { 
 }
@@ -209,8 +209,8 @@ void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
  Vector<Coeff_t> diag (Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
+  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass;
-  Vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass_plus;
+  Vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass;
  M5D(psi,chi,chi,lower,diag,upper);
 }
 template<class Impl>
@@ -220,8 +220,8 @@ void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &D
  Vector<Coeff_t> diag = bs;
  Vector<Coeff_t> upper= cs;
  Vector<Coeff_t> lower= cs; 
-  upper[Ls-1]=-mass_minus*upper[Ls-1];
+  upper[Ls-1]=-mass*upper[Ls-1];
-  lower[0]   =-mass_plus*lower[0];
+  lower[0]   =-mass*lower[0];
  M5D(psi,psi,Din,lower,diag,upper);
 }
 // FIXME Redunant with the above routine; check this and eliminate
@@ -235,8 +235,8 @@ template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &
    upper[i]=-ceo[i];
    lower[i]=-ceo[i];
  }
-  upper[Ls-1]=-mass_minus*upper[Ls-1];
+  upper[Ls-1]=-mass*upper[Ls-1];
-  lower[0]   =-mass_plus*lower[0];
+  lower[0]   =-mass*lower[0];
  M5D(psi,psi,chi,lower,diag,upper);
 }
 template<class Impl>
@@ -250,8 +250,8 @@ void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &
    upper[i]=-cee[i];
    lower[i]=-cee[i];
  }
-  upper[Ls-1]=-mass_minus*upper[Ls-1];
+  upper[Ls-1]=-mass*upper[Ls-1];
-  lower[0]   =-mass_plus*lower[0];
+  lower[0]   =-mass*lower[0];
  M5D(psi,psi,chi,lower,diag,upper);
 }
 template<class Impl>
@@ -266,9 +266,9 @@ void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &
    // Assemble the 5d matrix
    if ( s==0 ) {
      upper[s] = -cee[s+1] ;
-      lower[s] = mass_minus*cee[Ls-1];
+      lower[s] = mass*cee[Ls-1];
    } else if ( s==(Ls-1)) { 
-      upper[s] = mass_plus*cee[0];
+      upper[s] = mass*cee[0];
      lower[s] = -cee[s-1];
    } else {
      upper[s]=-cee[s+1];
@@ -291,8 +291,8 @@ void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
  Vector<Coeff_t> diag(Ls,1.0);
  Vector<Coeff_t> upper(Ls,-1.0);
  Vector<Coeff_t> lower(Ls,-1.0);
-  upper[Ls-1]=-mass_plus*upper[Ls-1];
+  upper[Ls-1]=-mass*upper[Ls-1];
-  lower[0]   =-mass_minus*lower[0];
+  lower[0]   =-mass*lower[0];
  M5Ddag(psi,chi,chi,lower,diag,upper);
 }
@@ -307,9 +307,9 @@ void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField
  for (int s=0;s<Ls;s++){
    if ( s== 0 ) {
      upper[s] = cs[s+1];
-      lower[s] =-mass_minus*cs[Ls-1];
+      lower[s] =-mass*cs[Ls-1];
    } else if ( s==(Ls-1) ) { 
-      upper[s] =-mass_plus*cs[0];
+      upper[s] =-mass*cs[0];
      lower[s] = cs[s-1];
    } else { 
      upper[s] = cs[s+1];
@@ -552,7 +552,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
      lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
-      leem[i]=mass_minus*cee[Ls-1]/bee[0];
+      leem[i]=mass*cee[Ls-1]/bee[0];
      for(int j=0;j<i;j++) {
 	assert(bee[j+1]!=Coeff_t(0.0));
 	leem[i]*= aee[j]/bee[j+1];
@@ -560,7 +560,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
      uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
-      ueem[i]=mass_plus;
+      ueem[i]=mass;
      for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
      ueem[i]*= aee[0]/bee[0];
@@ -573,7 +573,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
  }
  { 
-    Coeff_t delta_d=mass_minus*cee[Ls-1];
+    Coeff_t delta_d=mass*cee[Ls-1];
    for(int j=0;j<Ls-1;j++) {
      assert(bee[j] != Coeff_t(0.0));
      delta_d *= cee[j]/bee[j];
@@ -642,10 +642,6 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
 						      Current curr_type,
 						      unsigned int mu)
 {
  assert(mass_plus == mass_minus);
  RealD mass = mass_plus;
 #if (!defined(GRID_HIP))
  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
@@ -781,8 +777,6 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  assert(mu>=0);
  assert(mu<Nd);
  assert(mass_plus == mass_minus);
  RealD mass = mass_plus;
 #if 0
  int tshift = (mu == Nd-1) ? 1 : 0;
@@ -66,17 +66,18 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
  M5Dcalls++;
  M5Dtime-=usecond();
-  uint64_t nloop = grid->oSites();
+  uint64_t nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
-    uint64_t s = sss%Ls;
+    uint64_t ss= sss*Ls;
    uint64_t ss= sss-s;
    typedef decltype(coalescedRead(psi[0])) spinor;
    spinor tmp1, tmp2;
-    uint64_t idx_u = ss+((s+1)%Ls);
+    for(int s=0;s<Ls;s++){
-    uint64_t idx_l = ss+((s+Ls-1)%Ls);
+      uint64_t idx_u = ss+((s+1)%Ls);
-    spProj5m(tmp1,psi(idx_u));
+      uint64_t idx_l = ss+((s+Ls-1)%Ls);
-    spProj5p(tmp2,psi(idx_l));
+      spProj5m(tmp1,psi(idx_u));
-    coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
+      spProj5p(tmp2,psi(idx_l));
      coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
    }
  });
  M5Dtime+=usecond();
 }
@@ -107,17 +108,18 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
  M5Dcalls++;
  M5Dtime-=usecond();
-  uint64_t nloop = grid->oSites();
+  uint64_t nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
-    uint64_t s = sss%Ls;
+    uint64_t ss=sss*Ls;
    uint64_t ss= sss-s;
    typedef decltype(coalescedRead(psi[0])) spinor;
    spinor tmp1,tmp2;
-    uint64_t idx_u = ss+((s+1)%Ls);
+    for(int s=0;s<Ls;s++){
-    uint64_t idx_l = ss+((s+Ls-1)%Ls);
+      uint64_t idx_u = ss+((s+1)%Ls);
-    spProj5p(tmp1,psi(idx_u));
+      uint64_t idx_l = ss+((s+Ls-1)%Ls);
-    spProj5m(tmp2,psi(idx_l));
+      spProj5p(tmp1,psi(idx_u));
-    coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
+      spProj5m(tmp2,psi(idx_l));
      coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
    }
  });
  M5Dtime+=usecond();
 }
@@ -1,373 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermionImplementation.h
    Copyright (C) 2017 - 2022
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
 NAMESPACE_BEGIN(Grid);
 template<class Impl, class CloverHelpers>
 CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(GaugeField& _Umu,
                                                                            GridCartesian& Fgrid,
                                                                            GridRedBlackCartesian& Hgrid,
                                                                            const RealD _mass,
                                                                            const RealD _csw_r,
                                                                            const RealD _csw_t,
                                                                            const RealD _cF,
                                                                            const WilsonAnisotropyCoefficients& clover_anisotropy,
                                                                            const ImplParams& impl_p)
  : WilsonBase(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
  , csw_r(_csw_r)
  , csw_t(_csw_t)
  , cF(_cF)
  , open_boundaries(impl_p.boundary_phases[Nd-1] == 0.0)
  , Diagonal(&Fgrid),        Triangle(&Fgrid)
  , DiagonalEven(&Hgrid),    TriangleEven(&Hgrid)
  , DiagonalOdd(&Hgrid),     TriangleOdd(&Hgrid)
  , DiagonalInv(&Fgrid),     TriangleInv(&Fgrid)
  , DiagonalInvEven(&Hgrid), TriangleInvEven(&Hgrid)
  , DiagonalInvOdd(&Hgrid),  TriangleInvOdd(&Hgrid)
  , Tmp(&Fgrid)
  , BoundaryMask(&Fgrid)
  , BoundaryMaskEven(&Hgrid), BoundaryMaskOdd(&Hgrid)
 {
  assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);
  csw_r *= 0.5;
  csw_t *= 0.5;
  if (clover_anisotropy.isAnisotropic)
    csw_r /= clover_anisotropy.xi_0;
  ImportGauge(_Umu);
  if (open_boundaries) {
    this->BoundaryMaskEven.Checkerboard() = Even;
    this->BoundaryMaskOdd.Checkerboard() = Odd;
    CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
  }
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::Dhop(in, out, dag);
  if(open_boundaries) ApplyBoundaryMask(out);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::DhopOE(in, out, dag);
  if(open_boundaries) ApplyBoundaryMask(out);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::DhopEO(in, out, dag);
  if(open_boundaries) ApplyBoundaryMask(out);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
  WilsonBase::DhopDir(in, out, dir, disp);
  if(this->open_boundaries) ApplyBoundaryMask(out);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
  WilsonBase::DhopDirAll(in, out);
  if(this->open_boundaries) {
    for(auto& o : out) ApplyBoundaryMask(o);
  }
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField& in, FermionField& out) {
  out.Checkerboard() = in.Checkerboard();
  WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
  Mooee(in, Tmp);
  axpy(out, 1.0, out, Tmp);
  if(open_boundaries) ApplyBoundaryMask(out);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField& in, FermionField& out) {
  out.Checkerboard() = in.Checkerboard();
  WilsonBase::Dhop(in, out, DaggerYes);  // call base to save applying bc
  MooeeDag(in, Tmp);
  axpy(out, 1.0, out, Tmp);
  if(open_boundaries) ApplyBoundaryMask(out);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
  WilsonBase::Meooe(in, out);
  if(open_boundaries) ApplyBoundaryMask(out);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
  WilsonBase::MeooeDag(in, out);
  if(open_boundaries) ApplyBoundaryMask(out);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField& in, FermionField& out) {
  if(in.Grid()->_isCheckerBoarded) {
    if(in.Checkerboard() == Odd) {
      MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
    } else {
      MooeeInternal(in, out, DiagonalEven, TriangleEven);
    }
  } else {
    MooeeInternal(in, out, Diagonal, Triangle);
  }
  if(open_boundaries) ApplyBoundaryMask(out);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeDag(const FermionField& in, FermionField& out) {
  Mooee(in, out); // blocks are hermitian
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionField& in, FermionField& out) {
  if(in.Grid()->_isCheckerBoarded) {
    if(in.Checkerboard() == Odd) {
      MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
    } else {
      MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven);
    }
  } else {
    MooeeInternal(in, out, DiagonalInv, TriangleInv);
  }
  if(open_boundaries) ApplyBoundaryMask(out);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInvDag(const FermionField& in, FermionField& out) {
  MooeeInv(in, out); // blocks are hermitian
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
  DhopDir(in, out, dir, disp);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
  DhopDirAll(in, out);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
  assert(!open_boundaries); // TODO check for changes required for open bc
  // NOTE: code copied from original clover term
  conformable(X.Grid(), Y.Grid());
  conformable(X.Grid(), force.Grid());
  GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
  GaugeField clover_force(force.Grid());
  PropagatorField Lambda(force.Grid());
  // Guido: Here we are hitting some performance issues:
  // need to extract the components of the DoubledGaugeField
  // for each call
  // Possible solution
  // Create a vector object to store them? (cons: wasting space)
  std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
  Impl::extractLinkField(U, this->Umu);
  force = Zero();
  // Derivative of the Wilson hopping term
  this->DhopDeriv(force, X, Y, dag);
  ///////////////////////////////////////////////////////////
  // Clover term derivative
  ///////////////////////////////////////////////////////////
  Impl::outerProductImpl(Lambda, X, Y);
  //std::cout << "Lambda:" << Lambda << std::endl;
  Gamma::Algebra sigma[] = {
      Gamma::Algebra::SigmaXY,
      Gamma::Algebra::SigmaXZ,
      Gamma::Algebra::SigmaXT,
      Gamma::Algebra::MinusSigmaXY,
      Gamma::Algebra::SigmaYZ,
      Gamma::Algebra::SigmaYT,
      Gamma::Algebra::MinusSigmaXZ,
      Gamma::Algebra::MinusSigmaYZ,
      Gamma::Algebra::SigmaZT,
      Gamma::Algebra::MinusSigmaXT,
      Gamma::Algebra::MinusSigmaYT,
      Gamma::Algebra::MinusSigmaZT};
  /*
    sigma_{\mu \nu}=
    | 0         sigma[0]  sigma[1]  sigma[2] |
    | sigma[3]    0       sigma[4]  sigma[5] |
    | sigma[6]  sigma[7]     0      sigma[8] |
    | sigma[9]  sigma[10] sigma[11]   0      |
  */
  int count = 0;
  clover_force = Zero();
  for (int mu = 0; mu < 4; mu++)
  {
    force_mu = Zero();
    for (int nu = 0; nu < 4; nu++)
    {
      if (mu == nu)
        continue;
      RealD factor;
      if (nu == 4 || mu == 4)
      {
        factor = 2.0 * csw_t;
      }
      else
      {
        factor = 2.0 * csw_r;
      }
      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
      force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu);   // checked
      count++;
    }
    pokeLorentz(clover_force, U[mu] * force_mu, mu);
  }
  //clover_force *= csw;
  force += clover_force;
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
  assert(0);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
  assert(0);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField&        in,
                    FermionField&              out,
                    const CloverDiagonalField& diagonal,
                    const CloverTriangleField& triangle) {
  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
  out.Checkerboard() = in.Checkerboard();
  conformable(in, out);
  conformable(in, diagonal);
  conformable(in, triangle);
  CompactHelpers::MooeeKernel(diagonal.oSites(), 1, in, out, diagonal, triangle);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField& _Umu) {
  // NOTE: parts copied from original implementation
  // Import gauge into base class
  double t0 = usecond();
  WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that
  // Initialize temporary variables
  double t1 = usecond();
  conformable(_Umu.Grid(), this->GaugeGrid());
  GridBase* grid = _Umu.Grid();
  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
  CloverField TmpOriginal(grid);
  // Compute the field strength terms mu>nu
  double t2 = usecond();
  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
  WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
  WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
  // Compute the Clover Operator acting on Colour and Spin
  // multiply here by the clover coefficients for the anisotropy
  double t3 = usecond();
  TmpOriginal  = Helpers::fillCloverYZ(Bx) * csw_r;
  TmpOriginal += Helpers::fillCloverXZ(By) * csw_r;
  TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r;
  TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
  TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
  TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
  // Handle mass term based on clover policy
  CloverHelpers::MassTerm(TmpOriginal, this->diag_mass);
  // Convert the data layout of the clover term
  double t4 = usecond();
  CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
  // Exponentiate the clover (nothing happens in case of the standard clover)
  double t5 = usecond();
  CloverHelpers::Exponentiate_Clover(Diagonal, Triangle, csw_t, this->diag_mass);
  // Possible modify the boundary values
  double t6 = usecond();
  if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
  // Invert the Clover term (explicit inversion needed for the improvement in case of open boundary conditions)
  double t7 = usecond();
  CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv);
  // Fill the remaining clover fields
  double t8 = usecond();
  pickCheckerboard(Even, DiagonalEven,    Diagonal);
  pickCheckerboard(Even, TriangleEven,    Triangle);
  pickCheckerboard(Odd,  DiagonalOdd,     Diagonal);
  pickCheckerboard(Odd,  TriangleOdd,     Triangle);
  pickCheckerboard(Even, DiagonalInvEven, DiagonalInv);
  pickCheckerboard(Even, TriangleInvEven, TriangleInv);
  pickCheckerboard(Odd,  DiagonalInvOdd,  DiagonalInv);
  pickCheckerboard(Odd,  TriangleInvOdd,  TriangleInv);
  // Report timings
  double t9 = usecond();
  std::cout << GridLogDebug << "CompactWilsonCloverFermion::ImportGauge timings:" << std::endl;
  std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
  std::cout << GridLogDebug << "allocations =                " << (t2 - t1) / 1e6 << std::endl;
  std::cout << GridLogDebug << "field strength =             " << (t3 - t2) / 1e6 << std::endl;
  std::cout << GridLogDebug << "fill clover =                " << (t4 - t3) / 1e6 << std::endl;
  std::cout << GridLogDebug << "convert =                    " << (t5 - t4) / 1e6 << std::endl;
  std::cout << GridLogDebug << "exponentiation =             " << (t6 - t5) / 1e6 << std::endl;
  std::cout << GridLogDebug << "boundaries =                 " << (t7 - t6) / 1e6 << std::endl;
  std::cout << GridLogDebug << "inversions =                 " << (t8 - t7) / 1e6 << std::endl;
  std::cout << GridLogDebug << "pick cbs =                   " << (t9 - t8) / 1e6 << std::endl;
  std::cout << GridLogDebug << "total =                      " << (t9 - t0) / 1e6 << std::endl;
 }
 NAMESPACE_END(Grid);
@@ -2,13 +2,12 @@
    Grid physics library, www.github.com/paboyle/Grid
-    Source file: ./lib/qcd/action/fermion/WilsonCloverFermionImplementation.h
+    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
-    Copyright (C) 2017 - 2022
+    Copyright (C) 2017
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -34,48 +33,9 @@
 NAMESPACE_BEGIN(Grid);
 template<class Impl, class CloverHelpers>
 WilsonCloverFermion<Impl, CloverHelpers>::WilsonCloverFermion(GaugeField&                         _Umu,
                                               GridCartesian&                      Fgrid,
                                               GridRedBlackCartesian&              Hgrid,
                                               const RealD                         _mass,
                                               const RealD                         _csw_r,
                                               const RealD                         _csw_t,
                                               const WilsonAnisotropyCoefficients& clover_anisotropy,
                                               const ImplParams&                   impl_p)
  : WilsonFermion<Impl>(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
  , CloverTerm(&Fgrid)
  , CloverTermInv(&Fgrid)
  , CloverTermEven(&Hgrid)
  , CloverTermOdd(&Hgrid)
  , CloverTermInvEven(&Hgrid)
  , CloverTermInvOdd(&Hgrid)
  , CloverTermDagEven(&Hgrid)
  , CloverTermDagOdd(&Hgrid)
  , CloverTermInvDagEven(&Hgrid)
  , CloverTermInvDagOdd(&Hgrid) {
  assert(Nd == 4); // require 4 dimensions
  if(clover_anisotropy.isAnisotropic) {
    csw_r     = _csw_r * 0.5 / clover_anisotropy.xi_0;
    diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
  } else {
    csw_r     = _csw_r * 0.5;
    diag_mass = 4.0 + _mass;
  }
  csw_t = _csw_t * 0.5;
  if(csw_r == 0)
    std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
  if(csw_t == 0)
    std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
  ImportGauge(_Umu);
 }
 // *NOT* EO
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
 {
  FermionField temp(out.Grid());
@@ -89,8 +49,8 @@ void WilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField &in, Fermion
  out += temp;
 }
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
 {
  FermionField temp(out.Grid());
@@ -104,16 +64,13 @@ void WilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField &in, Ferm
  out += temp;
 }
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField &_Umu)
+void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
 {
  double t0 = usecond();
  WilsonFermion<Impl>::ImportGauge(_Umu);
  double t1 = usecond();
  GridBase *grid = _Umu.Grid();
  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
  double t2 = usecond();
  // Compute the field strength terms mu>nu
  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
@@ -122,20 +79,52 @@ void WilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField &_Um
  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
  double t3 = usecond();
  // Compute the Clover Operator acting on Colour and Spin
  // multiply here by the clover coefficients for the anisotropy
-  CloverTerm  = Helpers::fillCloverYZ(Bx) * csw_r;
+  CloverTerm  = fillCloverYZ(Bx) * csw_r;
-  CloverTerm += Helpers::fillCloverXZ(By) * csw_r;
+  CloverTerm += fillCloverXZ(By) * csw_r;
-  CloverTerm += Helpers::fillCloverXY(Bz) * csw_r;
+  CloverTerm += fillCloverXY(Bz) * csw_r;
-  CloverTerm += Helpers::fillCloverXT(Ex) * csw_t;
+  CloverTerm += fillCloverXT(Ex) * csw_t;
-  CloverTerm += Helpers::fillCloverYT(Ey) * csw_t;
+  CloverTerm += fillCloverYT(Ey) * csw_t;
-  CloverTerm += Helpers::fillCloverZT(Ez) * csw_t;
+  CloverTerm += fillCloverZT(Ez) * csw_t;
-   
+  CloverTerm += diag_mass;
-  double t4 = usecond();
+
-  CloverHelpers::Instantiate(CloverTerm, CloverTermInv, csw_t, this->diag_mass);
+  int lvol = _Umu.Grid()->lSites();
  int DimRep = Impl::Dimension;
  {
    autoView(CTv,CloverTerm,CpuRead);
    autoView(CTIv,CloverTermInv,CpuWrite);
    thread_for(site, lvol, {
      Coordinate lcoor;
      grid->LocalIndexToLocalCoor(site, lcoor);
      Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
      Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
      typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
      peekLocalSite(Qx, CTv, lcoor);
      //if (csw!=0){
      for (int j = 0; j < Ns; j++)
 	for (int k = 0; k < Ns; k++)
 	  for (int a = 0; a < DimRep; a++)
 	    for (int b = 0; b < DimRep; b++){
 	      auto zz =  Qx()(j, k)(a, b);
 	      EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
 	    }
      //   if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
      EigenInvCloverOp = EigenCloverOp.inverse();
      //std::cout << EigenInvCloverOp << std::endl;
      for (int j = 0; j < Ns; j++)
 	for (int k = 0; k < Ns; k++)
 	  for (int a = 0; a < DimRep; a++)
 	    for (int b = 0; b < DimRep; b++)
 	      Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
      //    if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
      //  }
      pokeLocalSite(Qxinv, CTIv, lcoor);
    });
  }
  double t5 = usecond();
  // Separate the even and odd parts
  pickCheckerboard(Even, CloverTermEven, CloverTerm);
  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
@@ -148,47 +137,37 @@ void WilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField &_Um
  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
  double t6 = usecond();
  std::cout << GridLogDebug << "WilsonCloverFermion::ImportGauge timings:" << std::endl;
  std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
  std::cout << GridLogDebug << "allocations =                " << (t2 - t1) / 1e6 << std::endl;
  std::cout << GridLogDebug << "field strength =             " << (t3 - t2) / 1e6 << std::endl;
  std::cout << GridLogDebug << "fill clover =                " << (t4 - t3) / 1e6 << std::endl;
  std::cout << GridLogDebug << "instantiation =              " << (t5 - t4) / 1e6 << std::endl;
  std::cout << GridLogDebug << "pick cbs =                   " << (t6 - t5) / 1e6 << std::endl;
  std::cout << GridLogDebug << "total =                      " << (t6 - t0) / 1e6 << std::endl;
 }
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerNo, InverseNo);
 }
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::MooeeDag(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerYes, InverseNo);
 }
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerNo, InverseYes);
 }
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInvDag(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerYes, InverseYes);
 }
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
+void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
 {
  out.Checkerboard() = in.Checkerboard();
-  CloverField *Clover;
+  CloverFieldType *Clover;
  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
  if (dag)
@@ -203,12 +182,12 @@ void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField
      {
        Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
      }
-      Helpers::multCloverField(out, *Clover, in);
+      out = *Clover * in;
    }
    else
    {
      Clover = (inv) ? &CloverTermInv : &CloverTerm;
-      Helpers::multCloverField(out, *Clover, in); // don't bother with adj, hermitian anyway
+      out = adj(*Clover) * in;
    }
  }
  else
@@ -226,109 +205,29 @@ void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField
        //  std::cout << "Calling clover term Even" << std::endl;
        Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
      }
-      Helpers::multCloverField(out, *Clover, in);
+      out = *Clover * in;
      //  std::cout << GridLogMessage << "*Clover.Checkerboard() "  << (*Clover).Checkerboard() << std::endl;
    }
    else
    {
      Clover = (inv) ? &CloverTermInv : &CloverTerm;
-      Helpers::multCloverField(out, *Clover, in);
+      out = *Clover * in;
    }
  }
 } // MooeeInternal
 // Derivative parts unpreconditioned pseudofermions
 template<class Impl, class CloverHelpers>
 void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
 {
  conformable(X.Grid(), Y.Grid());
  conformable(X.Grid(), force.Grid());
  GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
  GaugeField clover_force(force.Grid());
  PropagatorField Lambda(force.Grid());
  // Guido: Here we are hitting some performance issues:
  // need to extract the components of the DoubledGaugeField
  // for each call
  // Possible solution
  // Create a vector object to store them? (cons: wasting space)
  std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
  Impl::extractLinkField(U, this->Umu);
  force = Zero();
  // Derivative of the Wilson hopping term
  this->DhopDeriv(force, X, Y, dag);
  ///////////////////////////////////////////////////////////
  // Clover term derivative
  ///////////////////////////////////////////////////////////
  Impl::outerProductImpl(Lambda, X, Y);
  //std::cout << "Lambda:" << Lambda << std::endl;
  Gamma::Algebra sigma[] = {
      Gamma::Algebra::SigmaXY,
      Gamma::Algebra::SigmaXZ,
      Gamma::Algebra::SigmaXT,
      Gamma::Algebra::MinusSigmaXY,
      Gamma::Algebra::SigmaYZ,
      Gamma::Algebra::SigmaYT,
      Gamma::Algebra::MinusSigmaXZ,
      Gamma::Algebra::MinusSigmaYZ,
      Gamma::Algebra::SigmaZT,
      Gamma::Algebra::MinusSigmaXT,
      Gamma::Algebra::MinusSigmaYT,
      Gamma::Algebra::MinusSigmaZT};
  /*
    sigma_{\mu \nu}=
    | 0         sigma[0]  sigma[1]  sigma[2] |
    | sigma[3]    0       sigma[4]  sigma[5] |
    | sigma[6]  sigma[7]     0      sigma[8] |
    | sigma[9]  sigma[10] sigma[11]   0      |
  */
  int count = 0;
  clover_force = Zero();
  for (int mu = 0; mu < 4; mu++)
  {
    force_mu = Zero();
    for (int nu = 0; nu < 4; nu++)
    {
      if (mu == nu)
      continue;
      RealD factor;
      if (nu == 4 || mu == 4)
      {
        factor = 2.0 * csw_t;
      }
      else
      {
        factor = 2.0 * csw_r;
      }
      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
      force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu);                   // checked
      count++;
    }
    pokeLorentz(clover_force, U[mu] * force_mu, mu);
  }
  //clover_force *= csw;
  force += clover_force;
 }
 // Derivative parts
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
+void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
 {
  assert(0);
 }
 // Derivative parts
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
+void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
 {
  assert(0); // not implemented yet
 }
@@ -60,8 +60,7 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  UmuOdd (_FourDimRedBlackGrid),
  Lebesgue(_FourDimGrid),
  LebesgueEvenOdd(_FourDimRedBlackGrid),
-  _tmp(&FiveDimRedBlackGrid),
+  _tmp(&FiveDimRedBlackGrid)
  Dirichlet(0)
 {
  // some assertions
  assert(FiveDimGrid._ndimension==5);
@@ -92,19 +91,6 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
    assert(FourDimRedBlackGrid._simd_layout[d]  ==FourDimGrid._simd_layout[d]);
  }
  if ( p.dirichlet.size() == Nd+1) {
    Coordinate block = p.dirichlet;
    if ( block[0] || block[1] || block[2] || block[3] || block[4] ){
      Dirichlet = 1;
      Block = block;
    }
  } else {
    Coordinate block(Nd+1,0);
    Block = block;
  }
  ZeroCounters();
  if (Impl::LsVectorised) { 
    int nsimd = Simd::Nsimd();
@@ -232,14 +218,6 @@ void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
 {
  GaugeField HUmu(_Umu.Grid());
  HUmu = _Umu*(-0.5);
  if ( Dirichlet ) {
    std::cout << GridLogMessage << " Dirichlet BCs 5d " <<Block<<std::endl;
    Coordinate GaugeBlock(Nd);
    for(int d=0;d<Nd;d++) GaugeBlock[d] = Block[d+1];
    std::cout << GridLogMessage << " Dirichlet BCs 4d " <<GaugeBlock<<std::endl;
    DirichletFilter<GaugeField> Filter(GaugeBlock);
    Filter.applyFilter(HUmu);
  }
  Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
  pickCheckerboard(Even,UmuEven,Umu);
  pickCheckerboard(Odd ,UmuOdd,Umu);
@@ -4,13 +4,12 @@ Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonFermion.cc
-Copyright (C) 2022
+Copyright (C) 2015
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Fabian Joswig <fabian.joswig@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -600,47 +599,11 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
                                                   Current curr_type,
                                                   unsigned int mu)
 {
  if(curr_type != Current::Vector)
  {
    std::cout << GridLogError << "Only the conserved vector current is implemented so far." << std::endl;
    exit(1);
  }
  Gamma g5(Gamma::Algebra::Gamma5);
  conformable(_grid, q_in_1.Grid());
  conformable(_grid, q_in_2.Grid());
  conformable(_grid, q_out.Grid());
-  auto UGrid= this->GaugeGrid();
+  assert(0);
  PropagatorField tmp_shifted(UGrid);
  PropagatorField g5Lg5(UGrid);
  PropagatorField R(UGrid);
  PropagatorField gmuR(UGrid);
    Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT,
  };
  Gamma gmu=Gamma(Gmu[mu]);
  g5Lg5=g5*q_in_1*g5;
  tmp_shifted=Cshift(q_in_2,mu,1);
  Impl::multLinkField(R,this->Umu,tmp_shifted,mu);
  gmuR=gmu*R;
  q_out=adj(g5Lg5)*R;
  q_out-=adj(g5Lg5)*gmuR;
  tmp_shifted=Cshift(q_in_1,mu,1);
  Impl::multLinkField(g5Lg5,this->Umu,tmp_shifted,mu);
  g5Lg5=g5*g5Lg5*g5;
  R=q_in_2;
  gmuR=gmu*R;
  q_out-=adj(g5Lg5)*R;
  q_out-=adj(g5Lg5)*gmuR;
 }
@@ -654,51 +617,9 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
                                              unsigned int tmax,
 					      ComplexField &lattice_cmplx)
 {
  if(curr_type != Current::Vector)
  {
    std::cout << GridLogError << "Only the conserved vector current is implemented so far." << std::endl;
    exit(1);
  }
  int tshift = (mu == Nd-1) ? 1 : 0;
  unsigned int LLt    = GridDefaultLatt()[Tp];
  conformable(_grid, q_in.Grid());
  conformable(_grid, q_out.Grid());
-  auto UGrid= this->GaugeGrid();
+  assert(0);
  PropagatorField tmp(UGrid);
  PropagatorField Utmp(UGrid);
  PropagatorField L(UGrid);
  PropagatorField zz (UGrid);
  zz=Zero();
  LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
    Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT,
  };
  Gamma gmu=Gamma(Gmu[mu]);
  tmp = Cshift(q_in,mu,1);
  Impl::multLinkField(Utmp,this->Umu,tmp,mu);
  tmp = ( Utmp*lattice_cmplx - gmu*Utmp*lattice_cmplx ); // Forward hop
  tmp = where((lcoor>=tmin),tmp,zz); // Mask the time
  q_out = where((lcoor<=tmax),tmp,zz); // Position of current complicated
  tmp = q_in *lattice_cmplx;
  tmp = Cshift(tmp,mu,-1);
  Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
  tmp = -( Utmp + gmu*Utmp );
  // Mask the time
  if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
    unsigned int t0 = 0;
    tmp = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
  } else {
    tmp = where((lcoor>=tmin+tshift),tmp,zz);
  }
  q_out+= where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
 }
 NAMESPACE_END(Grid);
@@ -440,17 +440,6 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
 #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
 #define KERNEL_CALL_EXT(A)						\
  const uint64_t    NN = Nsite*Ls;					\
  const uint64_t    sz = st.surface_list.size();			\
  auto ptr = &st.surface_list[0];					\
  accelerator_forNB( ss, sz, Simd::Nsimd(), {				\
      int sF = ptr[ss];							\
      int sU = ss/Ls;							\
      WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\
    });									\
  accelerator_barrier();
 #define ASM_CALL(A)							\
  thread_for( ss, Nsite, {						\
    int sU = ss;							\
@@ -1,44 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/ qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master
    Copyright (C) 2017 - 2022
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    Author: Mattia Bruno <mattia.bruno@cern.ch>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
 #include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class CompactWilsonCloverFermion<IMPLEMENTATION, CompactCloverHelpers<IMPLEMENTATION>>; 
 template class CompactWilsonCloverFermion<IMPLEMENTATION, CompactExpCloverHelpers<IMPLEMENTATION>>; 
 NAMESPACE_END(Grid);
@@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
@@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
@@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
@@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
@@ -8,8 +8,7 @@
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
-    Author: Mattia Bruno <mattia.bruno@cern.ch>
+
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
@@ -32,12 +31,10 @@
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/WilsonCloverFermion.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
-template class WilsonCloverFermion<IMPLEMENTATION, CloverHelpers<IMPLEMENTATION>>; 
+template class WilsonCloverFermion<IMPLEMENTATION>; 
 template class WilsonCloverFermion<IMPLEMENTATION, ExpCloverHelpers<IMPLEMENTATION>>; 
 NAMESPACE_END(Grid);
@@ -1 +0,0 @@
 ../CompactWilsonCloverFermionInstantiation.cc.master
@@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
@@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
@@ -1 +0,0 @@
 ../CompactWilsonCloverFermionInstantiation.cc.master
@@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
@@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
@@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
@@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
@@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
@@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
@@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
@@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
@@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
@@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
@@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
@@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
@@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
@@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
@@ -18,10 +18,6 @@ WILSON_IMPL_LIST=" \
 	   GparityWilsonImplF \
 	   GparityWilsonImplD "
 COMPACT_WILSON_IMPL_LIST=" \
 	   WilsonImplF \
 	   WilsonImplD "
 DWF_IMPL_LIST=" \
 	   WilsonImplF \
 	   WilsonImplD \
@@ -50,17 +46,7 @@ for impl in $WILSON_IMPL_LIST
 do
 for f in $CC_LIST
 do
-  ln -f -s ../$f.cc.master $impl/$f$impl.cc
+  ln -f -s ../$f.cc.master $impl/$f$impl.cc 
 done
 done
 CC_LIST="CompactWilsonCloverFermionInstantiation"
 for impl in $COMPACT_WILSON_IMPL_LIST
 do
 for f in $CC_LIST
 do
  ln -f -s ../$f.cc.master $impl/$f$impl.cc
 done
 done
@@ -77,14 +63,14 @@ for impl in $DWF_IMPL_LIST $GDWF_IMPL_LIST
 do
 for f in $CC_LIST
 do
-  ln -f -s ../$f.cc.master $impl/$f$impl.cc
+  ln -f -s ../$f.cc.master $impl/$f$impl.cc 
 done
 done
 # overwrite the .cc file in Gparity directories
 for impl in $GDWF_IMPL_LIST
 do
-  ln -f -s ../WilsonKernelsInstantiationGparity.cc.master $impl/WilsonKernelsInstantiation$impl.cc
+  ln -f -s ../WilsonKernelsInstantiationGparity.cc.master $impl/WilsonKernelsInstantiation$impl.cc 
 done
@@ -98,7 +84,7 @@ for impl in $STAG_IMPL_LIST
 do
 for f in $CC_LIST
 do
-  ln -f -s ../$f.cc.master $impl/$f$impl.cc
+  ln -f -s ../$f.cc.master $impl/$f$impl.cc 
 done
 done
@@ -1,102 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/hmc/integrators/DirichletFilter.h
 Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 //--------------------------------------------------------------------
 #pragma once
 NAMESPACE_BEGIN(Grid);
 ////////////////////////////////////////////////////
 // DDHMC filter with sub-block size B[mu]
 ////////////////////////////////////////////////////
 template<typename GaugeField>
 struct DDHMCFilter: public MomentumFilterBase<GaugeField>
 {
  Coordinate Block;
  int Width;
  DDHMCFilter(const Coordinate &_Block,int _Width=2): Block(_Block) { Width=_Width; }
  void applyFilter(GaugeField &U) const override
  {
    GridBase *grid = U.Grid();
    Coordinate Global=grid->GlobalDimensions();
    GaugeField zzz(grid); zzz = Zero();
    LatticeInteger coor(grid); 
    auto zzz_mu = PeekIndex<LorentzIndex>(zzz,0);
    ////////////////////////////////////////////////////
    // Zero BDY layers
    ////////////////////////////////////////////////////
    std::cout<<GridLogMessage<<" DDHMC Force Filter Block "<<Block<<" width " <<Width<<std::endl;
    for(int mu=0;mu<Nd;mu++) {
      Integer B1 = Block[mu];
      if ( B1 && (B1 <= Global[mu]) ) {
 	LatticeCoordinate(coor,mu);
 	////////////////////////////////
 	// OmegaBar - zero all links contained in slice B-1,0 and
 	// mu links connecting to Omega
 	////////////////////////////////
 	if ( Width==1) { 
 	  U    = where(mod(coor,B1)==Integer(B1-1),zzz,U);
 	  U    = where(mod(coor,B1)==Integer(0)   ,zzz,U); 
 	  auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
 	  U_mu = where(mod(coor,B1)==Integer(B1-2),zzz_mu,U_mu); 
 	  PokeIndex<LorentzIndex>(U, U_mu, mu);
 	}
 	if ( Width==2) { 
 	  U    = where(mod(coor,B1)==Integer(B1-2),zzz,U);
 	  U    = where(mod(coor,B1)==Integer(B1-1),zzz,U);
 	  U    = where(mod(coor,B1)==Integer(0)   ,zzz,U); 
 	  U    = where(mod(coor,B1)==Integer(1)   ,zzz,U); 
 	  auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
 	  U_mu = where(mod(coor,B1)==Integer(B1-3),zzz_mu,U_mu); 
 	  PokeIndex<LorentzIndex>(U, U_mu, mu);
 	}
 	if ( Width==3) { 
 	  U    = where(mod(coor,B1)==Integer(B1-3),zzz,U);
 	  U    = where(mod(coor,B1)==Integer(B1-2),zzz,U);
 	  U    = where(mod(coor,B1)==Integer(B1-1),zzz,U);
 	  U    = where(mod(coor,B1)==Integer(0)   ,zzz,U); 
 	  U    = where(mod(coor,B1)==Integer(1)   ,zzz,U); 
 	  U    = where(mod(coor,B1)==Integer(2)   ,zzz,U); 
 	  auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
 	  U_mu = where(mod(coor,B1)==Integer(B1-4),zzz_mu,U_mu); 
 	  PokeIndex<LorentzIndex>(U, U_mu, mu);
 	}
      }
    }
  }
 };
 NAMESPACE_END(Grid);
@@ -1,71 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/hmc/integrators/DirichletFilter.h
 Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 //--------------------------------------------------------------------
 #pragma once
 NAMESPACE_BEGIN(Grid);
 template<typename MomentaField>
 struct DirichletFilter: public MomentumFilterBase<MomentaField>
 {
  typedef typename MomentaField::vector_type vector_type; //SIMD-vectorized complex type
  typedef typename MomentaField::scalar_type scalar_type; //scalar complex type
  typedef iScalar<iScalar<iScalar<vector_type> > >            ScalarType; //complex phase for each site
  Coordinate Block;
  DirichletFilter(const Coordinate &_Block): Block(_Block){}
  void applyFilter(MomentaField &P) const override
  {
    GridBase *grid = P.Grid();
    typedef decltype(PeekIndex<LorentzIndex>(P, 0)) LatCM;
    ////////////////////////////////////////////////////
    // Zero strictly links crossing between domains
    ////////////////////////////////////////////////////
    LatticeInteger coor(grid); 
    LatCM zz(grid); zz = Zero();
    for(int mu=0;mu<Nd;mu++) {
      if ( (Block[mu]) && (Block[mu] <= grid->GlobalDimensions()[mu] ) ) {
 	// If costly could provide Grid earlier and precompute masks
 	std::cout << GridLogMessage << " Dirichlet in mu="<<mu<<std::endl;
 	LatticeCoordinate(coor,mu);
 	auto P_mu = PeekIndex<LorentzIndex>(P, mu);
 	P_mu = where(mod(coor,Block[mu])==Integer(Block[mu]-1),zz,P_mu);
 	PokeIndex<LorentzIndex>(P, P_mu, mu);
      }
    }
  }
 };
 NAMESPACE_END(Grid);
@@ -69,11 +69,6 @@ public:
    return PeriodicBC::ShiftStaple(Link,mu);
  }
  //Same as Cshift for periodic BCs
  static inline GaugeLinkField CshiftLink(const GaugeLinkField &Link, int mu, int shift){
    return PeriodicBC::CshiftLink(Link,mu,shift);
  }
  static inline bool isPeriodicGaugeField(void) { return true; }
 };
@@ -115,11 +110,6 @@ public:
      return PeriodicBC::CovShiftBackward(Link, mu, field);
  }
  //If mu is a conjugate BC direction
  //Out(x) = U^dag_\mu(x-mu)  | x_\mu != 0
  //       = U^T_\mu(L-1)  | x_\mu == 0
  //else
  //Out(x) = U^dag_\mu(x-mu mod L)
  static inline GaugeLinkField
  CovShiftIdentityBackward(const GaugeLinkField &Link, int mu)
  {
@@ -139,13 +129,6 @@ public:
      return PeriodicBC::CovShiftIdentityForward(Link,mu);
  }
  //If mu is a conjugate BC direction
  //Out(x) = S_\mu(x+mu)  | x_\mu != L-1
  //       = S*_\mu(x+mu)  | x_\mu == L-1
  //else
  //Out(x) = S_\mu(x+mu mod L)
  //Note: While this is used for Staples it is also applicable for shifting gauge links or gauge transformation matrices
  static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu)
  {
    assert(_conjDirs.size() == Nd);
@@ -155,27 +138,6 @@ public:
      return PeriodicBC::ShiftStaple(Link,mu);
  }
  //Boundary-aware C-shift of gauge links / gauge transformation matrices
  //For conjugate BC direction
  //shift = 1
  //Out(x) = U_\mu(x+\hat\mu)  | x_\mu != L-1
  //       = U*_\mu(0)  | x_\mu == L-1
  //shift = -1
  //Out(x) = U_\mu(x-mu)  | x_\mu != 0
  //       = U*_\mu(L-1)  | x_\mu == 0
  //else
  //shift = 1
  //Out(x) = U_\mu(x+\hat\mu mod L)
  //shift = -1
  //Out(x) = U_\mu(x-\hat\mu mod L)
  static inline GaugeLinkField CshiftLink(const GaugeLinkField &Link, int mu, int shift){
    assert(_conjDirs.size() == Nd);
    if(_conjDirs[mu]) 
      return ConjugateBC::CshiftLink(Link,mu,shift);
    else     
      return PeriodicBC::CshiftLink(Link,mu,shift);
  }
  static inline void       setDirections(std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
  static inline std::vector<int> getDirections(void) { return _conjDirs; }
  static inline bool isPeriodicGaugeField(void) { return false; }
@@ -13,31 +13,6 @@ NAMESPACE_BEGIN(Grid);
      std::cout << GridLogMessage << "Pseudofermion action lamda_max "<<lambda_max<<"( bound "<<hi<<")"<<std::endl;
      assert( (lambda_max < hi) && " High Bounds Check on operator failed" );
    }
     template<class Field> void ChebyBoundsCheck(LinearOperatorBase<Field> &HermOp,
 						 Field &GaussNoise,
 						 RealD lo,RealD hi) 
    {
      int orderfilter = 1000;
      Chebyshev<Field> Cheb(lo,hi,orderfilter);
      GridBase *FermionGrid = GaussNoise.Grid();
      Field X(FermionGrid);
      Field Z(FermionGrid);
      X=GaussNoise;
      RealD Nx = norm2(X);
      Cheb(HermOp,X,Z);
      RealD Nz = norm2(Z);
      std::cout << "************************* "<<std::endl;
      std::cout << " noise                    = "<<Nx<<std::endl;
      std::cout << " Cheb x noise             = "<<Nz<<std::endl;
      std::cout << " Ratio                    = "<<Nz/Nx<<std::endl;
      std::cout << "************************* "<<std::endl;
      assert( ((Nz/Nx)<1.0) && " ChebyBoundsCheck ");
    }
    template<class Field> void InverseSqrtBoundsCheck(int MaxIter,double tol,
 						       LinearOperatorBase<Field> &HermOp,
@@ -65,65 +40,13 @@ NAMESPACE_BEGIN(Grid);
      X=X-Y;
      RealD Nd = norm2(X);
      std::cout << "************************* "<<std::endl;
-      std::cout << " | noise |^2                         = "<<Nx<<std::endl;
+      std::cout << " noise                         = "<<Nx<<std::endl;
-      std::cout << " | (MdagM^-1/2)^2  noise |^2         = "<<Nz<<std::endl;
+      std::cout << " (MdagM^-1/2)^2  noise         = "<<Nz<<std::endl;
-      std::cout << " | MdagM (MdagM^-1/2)^2  noise |^2   = "<<Ny<<std::endl;
+      std::cout << " MdagM (MdagM^-1/2)^2  noise   = "<<Ny<<std::endl;
-      std::cout << " | noise - MdagM (MdagM^-1/2)^2  noise |^2  = "<<Nd<<std::endl;
+      std::cout << " noise - MdagM (MdagM^-1/2)^2  noise   = "<<Nd<<std::endl;
      std::cout << " | noise - MdagM (MdagM^-1/2)^2  noise|/|noise| = " << std::sqrt(Nd/Nx) << std::endl;
      std::cout << "************************* "<<std::endl;
      assert( (std::sqrt(Nd/Nx)<tol) && " InverseSqrtBoundsCheck ");
    }
    /* For a HermOp = M^dag M, check the approximation of  HermOp^{-1/inv_pow}
       by computing   |X -    HermOp * [ Hermop^{-1/inv_pow} ]^{inv_pow} X|  < tol  
       for noise X (aka GaussNoise).
       ApproxNegPow should be the rational approximation for   X^{-1/inv_pow}
    */
    template<class Field> void InversePowerBoundsCheck(int inv_pow,
 						       int MaxIter,double tol,
 						       LinearOperatorBase<Field> &HermOp,
 						       Field &GaussNoise,
 						       MultiShiftFunction &ApproxNegPow) 
    {
      GridBase *FermionGrid = GaussNoise.Grid();
      Field X(FermionGrid);
      Field Y(FermionGrid);
      Field Z(FermionGrid);
      Field tmp1(FermionGrid), tmp2(FermionGrid);
      X=GaussNoise;
      RealD Nx = norm2(X);
      ConjugateGradientMultiShift<Field> msCG(MaxIter,ApproxNegPow);
      tmp1 = X;
      Field* in = &tmp1;
      Field* out = &tmp2;
      for(int i=0;i<inv_pow;i++){ //apply  [ Hermop^{-1/inv_pow}  ]^{inv_pow} X =   HermOp^{-1} X
 	msCG(HermOp, *in, *out); //backwards conventions!
 	if(i!=inv_pow-1) std::swap(in, out);
      }
      Z = *out;
      RealD Nz = norm2(Z);
      HermOp.HermOp(Z,Y);
      RealD Ny = norm2(Y);
      X=X-Y;
      RealD Nd = norm2(X);
      std::cout << "************************* "<<std::endl;
      std::cout << " | noise |^2                         = "<<Nx<<std::endl;
      std::cout << " | (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2        = "<<Nz<<std::endl;
      std::cout << " | MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2   = "<<Ny<<std::endl;
      std::cout << " | noise - MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2  = "<<Nd<<std::endl;
      std::cout << " | noise - MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |/| noise |  = "<<std::sqrt(Nd/Nx)<<std::endl;
      std::cout << "************************* "<<std::endl;
      assert( (std::sqrt(Nd/Nx)<tol) && " InversePowerBoundsCheck ");
    }
 NAMESPACE_END(Grid);
@@ -1,163 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/DomainDecomposedTwoFlavourBoundaryBoson.h
    Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////
 // Two flavour ratio
 ///////////////////////////////////////
 template<class ImplD,class ImplF>
 class DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion : public Action<typename ImplD::GaugeField> {
 public:
  INHERIT_IMPL_TYPES(ImplD);
 private:
  SchurFactoredFermionOperator<ImplD,ImplF> & NumOp;// the basic operator
  RealD InnerStoppingCondition;
  RealD ActionStoppingCondition;
  RealD DerivativeStoppingCondition;
  FermionField Phi; // the pseudo fermion field for this trajectory
 public:
  DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion(SchurFactoredFermionOperator<ImplD,ImplF>  &_NumOp,RealD _DerivativeTol, RealD _ActionTol, RealD _InnerTol=1.0e-6)
    : NumOp(_NumOp), 
      DerivativeStoppingCondition(_DerivativeTol),
      ActionStoppingCondition(_ActionTol),
      InnerStoppingCondition(_InnerTol),
      Phi(_NumOp.FermionGrid()) {};
  virtual std::string action_name(){return "DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion";}
  virtual std::string LogParameters(){
    std::stringstream sstream;
    return sstream.str();
  }  
  virtual void refresh(const GaugeField &U, GridSerialRNG& sRNG, GridParallelRNG& pRNG)
  {
    // P(phi) = e^{- phi^dag P^dag P phi}
    //
    // NumOp == P
    //
    // Take phi = P^{-1} eta  ; eta = P Phi
    //
    // P(eta) = e^{- eta^dag eta}
    //
    // e^{x^2/2 sig^2} => sig^2 = 0.5.
    // 
    // So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707....
    //
    RealD scale = std::sqrt(0.5);
    NumOp.tolinner=InnerStoppingCondition;
    NumOp.tol=ActionStoppingCondition;
    NumOp.ImportGauge(U);
    FermionField eta(NumOp.FermionGrid());
    gaussian(pRNG,eta);    eta=eta*scale;
    NumOp.ProjectBoundaryBar(eta);
    //DumpSliceNorm("eta",eta);
    NumOp.RInv(eta,Phi);
    //DumpSliceNorm("Phi",Phi);
  };
  //////////////////////////////////////////////////////
  // S = phi^dag Pdag P phi
  //////////////////////////////////////////////////////
  virtual RealD S(const GaugeField &U) {
    NumOp.tolinner=InnerStoppingCondition;
    NumOp.tol=ActionStoppingCondition;
    NumOp.ImportGauge(U);
    FermionField Y(NumOp.FermionGrid());
    NumOp.R(Phi,Y);
    RealD action = norm2(Y);
    return action;
  };
  virtual void deriv(const GaugeField &U,GaugeField & dSdU)
  {
    NumOp.tolinner=InnerStoppingCondition;
    NumOp.tol=DerivativeStoppingCondition;
    NumOp.ImportGauge(U);
    GridBase *fgrid = NumOp.FermionGrid();
    GridBase *ugrid = NumOp.GaugeGrid();
    FermionField  X(fgrid);
    FermionField  Y(fgrid);
    FermionField  tmp(fgrid);
    GaugeField   force(ugrid);	
    FermionField DobiDdbPhi(fgrid);      // Vector A in my notes
    FermionField DoiDdDobiDdbPhi(fgrid); // Vector B in my notes
    FermionField DoidP_Phi(fgrid);    // Vector E in my notes
    FermionField DobidDddDoidP_Phi(fgrid);    // Vector F in my notes
    FermionField P_Phi(fgrid);
    // P term
    NumOp.dBoundaryBar(Phi,tmp);
    NumOp.dOmegaBarInv(tmp,DobiDdbPhi);        // Vector A
    NumOp.dBoundary(DobiDdbPhi,tmp);
    NumOp.dOmegaInv(tmp,DoiDdDobiDdbPhi);      // Vector B
    P_Phi  = Phi - DoiDdDobiDdbPhi;
    NumOp.ProjectBoundaryBar(P_Phi);
    // P^dag P term
    NumOp.dOmegaDagInv(P_Phi,DoidP_Phi); // Vector E
    NumOp.dBoundaryDag(DoidP_Phi,tmp);
    NumOp.dOmegaBarDagInv(tmp,DobidDddDoidP_Phi);   // Vector F
    NumOp.dBoundaryBarDag(DobidDddDoidP_Phi,tmp);
    X = DobiDdbPhi;
    Y = DobidDddDoidP_Phi;
    NumOp.DirichletFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=force;
    NumOp.DirichletFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
    X = DoiDdDobiDdbPhi;
    Y = DoidP_Phi;
    NumOp.DirichletFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=dSdU+force;
    NumOp.DirichletFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
    dSdU *= -1.0;
  };
 };
 NAMESPACE_END(Grid);
@@ -1,158 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/DomainDecomposedTwoFlavourBoundary.h
    Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////
 // Two flavour ratio
 ///////////////////////////////////////
 template<class ImplD,class ImplF>
 class DomainDecomposedBoundaryTwoFlavourPseudoFermion : public Action<typename ImplD::GaugeField> {
 public:
  INHERIT_IMPL_TYPES(ImplD);
 private:
  SchurFactoredFermionOperator<ImplD,ImplF> & DenOp;// the basic operator
  RealD ActionStoppingCondition;
  RealD DerivativeStoppingCondition;
  RealD InnerStoppingCondition;
  FermionField Phi; // the pseudo fermion field for this trajectory
  RealD refresh_action;
 public:
  DomainDecomposedBoundaryTwoFlavourPseudoFermion(SchurFactoredFermionOperator<ImplD,ImplF>  &_DenOp,RealD _DerivativeTol, RealD _ActionTol, RealD _InnerTol = 1.0e-6 )
    : DenOp(_DenOp),
      DerivativeStoppingCondition(_DerivativeTol),
      ActionStoppingCondition(_ActionTol),
      InnerStoppingCondition(_InnerTol),
      Phi(_DenOp.FermionGrid()) {};
  virtual std::string action_name(){return "DomainDecomposedBoundaryTwoFlavourPseudoFermion";}
  virtual std::string LogParameters(){
    std::stringstream sstream;
    return sstream.str();
  }  
  virtual void refresh(const GaugeField &U, GridSerialRNG& sRNG, GridParallelRNG& pRNG)
  {
    // P(phi) = e^{- phi^dag Rdag^-1 R^-1 phi}
    //
    // DenOp == R
    //
    // Take phi = R eta  ; eta = R^-1 Phi
    //
    // P(eta) = e^{- eta^dag eta}
    //
    // e^{x^2/2 sig^2} => sig^2 = 0.5.
    // 
    // So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707....
    //
    RealD scale = std::sqrt(0.5);
    DenOp.tolinner=InnerStoppingCondition;
    DenOp.tol     =ActionStoppingCondition;
    DenOp.ImportGauge(U);
    FermionField eta(DenOp.FermionGrid());
    gaussian(pRNG,eta);    eta=eta*scale;
    DenOp.ProjectBoundaryBar(eta);
    DenOp.R(eta,Phi);
    //DumpSliceNorm("Phi",Phi);
    refresh_action = norm2(eta);
  };
  //////////////////////////////////////////////////////
  // S = phi^dag Rdag^-1 R^-1 phi
  //////////////////////////////////////////////////////
  virtual RealD S(const GaugeField &U) {
    DenOp.tolinner=InnerStoppingCondition;
    DenOp.tol=ActionStoppingCondition;
    DenOp.ImportGauge(U);
    FermionField X(DenOp.FermionGrid());
    DenOp.RInv(Phi,X);
    RealD action = norm2(X);
    return action;
  };
  virtual void deriv(const GaugeField &U,GaugeField & dSdU)
  {
    DenOp.tolinner=InnerStoppingCondition;
    DenOp.tol=DerivativeStoppingCondition;
    DenOp.ImportGauge(U);
    GridBase *fgrid = DenOp.FermionGrid();
    GridBase *ugrid = DenOp.GaugeGrid();
    FermionField  X(fgrid);
    FermionField  Y(fgrid);
    FermionField  tmp(fgrid);
    GaugeField   force(ugrid);	
    FermionField DiDdb_Phi(fgrid);      // Vector C in my notes
    FermionField DidRinv_Phi(fgrid);    // Vector D in my notes
    FermionField Rinv_Phi(fgrid);
 //   FermionField RinvDagRinv_Phi(fgrid);
 //   FermionField DdbdDidRinv_Phi(fgrid);
    // R^-1 term
    DenOp.dBoundaryBar(Phi,tmp);
    DenOp.Dinverse(tmp,DiDdb_Phi);            // Vector C
    Rinv_Phi = Phi - DiDdb_Phi;
    DenOp.ProjectBoundaryBar(Rinv_Phi); 
    // R^-dagger R^-1 term
    DenOp.DinverseDag(Rinv_Phi,DidRinv_Phi); // Vector D
 /*
    DenOp.dBoundaryBarDag(DidRinv_Phi,DdbdDidRinv_Phi);
    RinvDagRinv_Phi = Rinv_Phi - DdbdDidRinv_Phi;
    DenOp.ProjectBoundaryBar(RinvDagRinv_Phi);
 */
    X = DiDdb_Phi;
    Y = DidRinv_Phi;
    DenOp.PeriodicFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=force;
    DenOp.PeriodicFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
    DumpSliceNorm("force",dSdU);
    dSdU *= -1.0;
  };
 };
 NAMESPACE_END(Grid);
@@ -1,237 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/DomainDecomposedTwoFlavourBoundary.h
    Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////
 // Two flavour ratio
 ///////////////////////////////////////
 template<class ImplD,class ImplF>
 class DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion : public Action<typename ImplD::GaugeField> {
 public:
  INHERIT_IMPL_TYPES(ImplD);
 private:
  SchurFactoredFermionOperator<ImplD,ImplF> & NumOp;// the basic operator
  SchurFactoredFermionOperator<ImplD,ImplF> & DenOp;// the basic operator
  RealD InnerStoppingCondition;
  RealD ActionStoppingCondition;
  RealD DerivativeStoppingCondition;
  FermionField Phi; // the pseudo fermion field for this trajectory
 public:
  DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion(SchurFactoredFermionOperator<ImplD,ImplF>  &_NumOp, 
 						       SchurFactoredFermionOperator<ImplD,ImplF>  &_DenOp,
 						       RealD _DerivativeTol, RealD _ActionTol, RealD _InnerTol=1.0e-6)
    : NumOp(_NumOp), DenOp(_DenOp),
      Phi(_NumOp.PeriodicFermOpD.FermionGrid()),
      InnerStoppingCondition(_InnerTol),
      DerivativeStoppingCondition(_DerivativeTol),
      ActionStoppingCondition(_ActionTol)
  {};
  virtual std::string action_name(){return "DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion";}
  virtual std::string LogParameters(){
    std::stringstream sstream;
    return sstream.str();
  }  
  virtual void refresh(const GaugeField &U, GridSerialRNG& sRNG, GridParallelRNG& pRNG)
  {
    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);
    FermionField eta(NumOp.PeriodicFermOpD.FermionGrid());
    FermionField tmp(NumOp.PeriodicFermOpD.FermionGrid());
    // P(phi) = e^{- phi^dag P^dag Rdag^-1 R^-1 P phi}
    //
    // NumOp == P
    // DenOp == R
    //
    // Take phi = P^{-1} R eta  ; eta = R^-1 P Phi
    //
    // P(eta) = e^{- eta^dag eta}
    //
    // e^{x^2/2 sig^2} => sig^2 = 0.5.
    // 
    // So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707....
    //
    RealD scale = std::sqrt(0.5);
    gaussian(pRNG,eta);    eta=eta*scale;
    NumOp.ProjectBoundaryBar(eta);
    NumOp.tolinner=InnerStoppingCondition;
    DenOp.tolinner=InnerStoppingCondition;
    DenOp.tol = ActionStoppingCondition;
    NumOp.tol = ActionStoppingCondition;
    DenOp.R(eta,tmp);
    NumOp.RInv(tmp,Phi);
    DumpSliceNorm("Phi",Phi);
  };
  //////////////////////////////////////////////////////
  // S = phi^dag Pdag Rdag^-1 R^-1 P phi
  //////////////////////////////////////////////////////
  virtual RealD S(const GaugeField &U) {
    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);
    FermionField X(NumOp.PeriodicFermOpD.FermionGrid());
    FermionField Y(NumOp.PeriodicFermOpD.FermionGrid());
    NumOp.tolinner=InnerStoppingCondition;
    DenOp.tolinner=InnerStoppingCondition;
    DenOp.tol = ActionStoppingCondition;
    NumOp.tol = ActionStoppingCondition;
    NumOp.R(Phi,Y);
    DenOp.RInv(Y,X);
    RealD action = norm2(X);
    //    std::cout << " DD boundary action is " <<action<<std::endl;
    return action;
  };
  virtual void deriv(const GaugeField &U,GaugeField & dSdU)
  {
    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);
    GridBase *fgrid = NumOp.PeriodicFermOpD.FermionGrid();
    GridBase *ugrid = NumOp.PeriodicFermOpD.GaugeGrid();
    FermionField  X(fgrid);
    FermionField  Y(fgrid);
    FermionField  tmp(fgrid);
    GaugeField   force(ugrid);	
    FermionField DobiDdbPhi(fgrid);      // Vector A in my notes
    FermionField DoiDdDobiDdbPhi(fgrid); // Vector B in my notes
    FermionField DiDdbP_Phi(fgrid);      // Vector C in my notes
    FermionField DidRinvP_Phi(fgrid);    // Vector D in my notes
    FermionField DdbdDidRinvP_Phi(fgrid);
    FermionField DoidRinvDagRinvP_Phi(fgrid);    // Vector E in my notes
    FermionField DobidDddDoidRinvDagRinvP_Phi(fgrid);    // Vector F in my notes
    FermionField P_Phi(fgrid);
    FermionField RinvP_Phi(fgrid);
    FermionField RinvDagRinvP_Phi(fgrid);
    FermionField PdagRinvDagRinvP_Phi(fgrid);
    //    RealD action = S(U);
    NumOp.tolinner=InnerStoppingCondition;
    DenOp.tolinner=InnerStoppingCondition;
    DenOp.tol = DerivativeStoppingCondition;
    NumOp.tol = DerivativeStoppingCondition;
    // P term
    NumOp.dBoundaryBar(Phi,tmp);
    NumOp.dOmegaBarInv(tmp,DobiDdbPhi);        // Vector A
    NumOp.dBoundary(DobiDdbPhi,tmp);
    NumOp.dOmegaInv(tmp,DoiDdDobiDdbPhi);      // Vector B
    P_Phi  = Phi - DoiDdDobiDdbPhi;
    NumOp.ProjectBoundaryBar(P_Phi);
    // R^-1 P term
    DenOp.dBoundaryBar(P_Phi,tmp);
    DenOp.Dinverse(tmp,DiDdbP_Phi);            // Vector C
    RinvP_Phi = P_Phi - DiDdbP_Phi;
    DenOp.ProjectBoundaryBar(RinvP_Phi); // Correct to here
    // R^-dagger R^-1 P term
    DenOp.DinverseDag(RinvP_Phi,DidRinvP_Phi); // Vector D
    DenOp.dBoundaryBarDag(DidRinvP_Phi,DdbdDidRinvP_Phi);
    RinvDagRinvP_Phi = RinvP_Phi - DdbdDidRinvP_Phi;
    DenOp.ProjectBoundaryBar(RinvDagRinvP_Phi);
    // P^dag R^-dagger R^-1 P term
    NumOp.dOmegaDagInv(RinvDagRinvP_Phi,DoidRinvDagRinvP_Phi); // Vector E
    NumOp.dBoundaryDag(DoidRinvDagRinvP_Phi,tmp);
    NumOp.dOmegaBarDagInv(tmp,DobidDddDoidRinvDagRinvP_Phi);   // Vector F
    NumOp.dBoundaryBarDag(DobidDddDoidRinvDagRinvP_Phi,tmp);
    PdagRinvDagRinvP_Phi = RinvDagRinvP_Phi- tmp;
    NumOp.ProjectBoundaryBar(PdagRinvDagRinvP_Phi);
    /*
    std::cout << "S eval  "<< action << std::endl;
    std::cout << "S - IP1 "<< innerProduct(Phi,PdagRinvDagRinvP_Phi) << std::endl;
    std::cout << "S - IP2 "<< norm2(RinvP_Phi) << std::endl;
    NumOp.R(Phi,tmp);
    tmp = tmp - P_Phi;
    std::cout << "diff1 "<<norm2(tmp) <<std::endl;
    DenOp.RInv(P_Phi,tmp);
    tmp = tmp - RinvP_Phi;
    std::cout << "diff2 "<<norm2(tmp) <<std::endl;
    DenOp.RDagInv(RinvP_Phi,tmp);
    tmp  = tmp - RinvDagRinvP_Phi;
    std::cout << "diff3 "<<norm2(tmp) <<std::endl;
    DenOp.RDag(RinvDagRinvP_Phi,tmp);
    tmp  = tmp - PdagRinvDagRinvP_Phi;
    std::cout << "diff4 "<<norm2(tmp) <<std::endl;
    */
    dSdU=Zero();
    X = DobiDdbPhi;
    Y = DobidDddDoidRinvDagRinvP_Phi;
    NumOp.DirichletFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=dSdU+force;
    NumOp.DirichletFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
    X = DoiDdDobiDdbPhi;
    Y = DoidRinvDagRinvP_Phi;
    NumOp.DirichletFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=dSdU+force;
    NumOp.DirichletFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
    X = DiDdbP_Phi;
    Y = DidRinvP_Phi;
    DenOp.PeriodicFermOpD.MDeriv(force,Y,X,DaggerNo);    dSdU=dSdU+force;
    DenOp.PeriodicFermOpD.MDeriv(force,X,Y,DaggerYes);   dSdU=dSdU+force;
    dSdU *= -1.0;
  };
 };
 NAMESPACE_END(Grid);
@@ -44,10 +44,6 @@ NAMESPACE_BEGIN(Grid);
  // Exact one flavour implementation of DWF determinant ratio //
  ///////////////////////////////////////////////////////////////
  //Note: using mixed prec CG for the heatbath solver in this action class will not work
  //      because the L, R operators must have their shift coefficients updated throughout the heatbath step
  //      You will find that the heatbath solver simply won't converge.
  //      To use mixed precision here use the ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction variant below
  template<class Impl>
  class ExactOneFlavourRatioPseudoFermionAction : public Action<typename Impl::GaugeField>
  {
@@ -61,60 +57,37 @@ NAMESPACE_BEGIN(Grid);
      bool use_heatbath_forecasting;
      AbstractEOFAFermion<Impl>& Lop; // the basic LH operator
      AbstractEOFAFermion<Impl>& Rop; // the basic RH operator
-      SchurRedBlackDiagMooeeSolve<FermionField> SolverHBL;
+      SchurRedBlackDiagMooeeSolve<FermionField> SolverHB;
      SchurRedBlackDiagMooeeSolve<FermionField> SolverHBR;
      SchurRedBlackDiagMooeeSolve<FermionField> SolverL;
      SchurRedBlackDiagMooeeSolve<FermionField> SolverR;
      SchurRedBlackDiagMooeeSolve<FermionField> DerivativeSolverL;
      SchurRedBlackDiagMooeeSolve<FermionField> DerivativeSolverR;
      FermionField Phi; // the pseudofermion field for this trajectory
      RealD norm2_eta; //|eta|^2 where eta is the random gaussian field used to generate the pseudofermion field
      bool initial_action; //true for the first call to S after refresh, for which the identity S = |eta|^2 holds provided the rational approx is good
    public:
      //Used in the heatbath, refresh the shift coefficients of the L (LorR=0) or R (LorR=1) operator
      virtual void heatbathRefreshShiftCoefficients(int LorR, RealD to){
 	AbstractEOFAFermion<Impl>&op = LorR == 0 ? Lop : Rop;
 	op.RefreshShiftCoefficients(to);
      }
      //Use the same solver for L,R in all cases
      ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, 
 					      AbstractEOFAFermion<Impl>& _Rop,
 					      OperatorFunction<FermionField>& CG, 
 					      Params& p, 
 					      bool use_fc=false) 
-	: ExactOneFlavourRatioPseudoFermionAction(_Lop,_Rop,CG,CG,CG,CG,CG,CG,p,use_fc) {};
+	: ExactOneFlavourRatioPseudoFermionAction(_Lop,_Rop,CG,CG,CG,CG,CG,p,use_fc) {};
-
+	
      //Use the same solver for L,R in the heatbath but different solvers elsewhere
      ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, 
 					      AbstractEOFAFermion<Impl>& _Rop,
-					      OperatorFunction<FermionField>& HeatbathCG,
+					      OperatorFunction<FermionField>& HeatbathCG, 
 					      OperatorFunction<FermionField>& ActionCGL, OperatorFunction<FermionField>& ActionCGR, 
 					      OperatorFunction<FermionField>& DerivCGL , OperatorFunction<FermionField>& DerivCGR, 
 					      Params& p, 
 					      bool use_fc=false)
 	: ExactOneFlavourRatioPseudoFermionAction(_Lop,_Rop,HeatbathCG,HeatbathCG, ActionCGL, ActionCGR, DerivCGL,DerivCGR,p,use_fc) {};
      //Use different solvers for L,R in all cases
      ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, 
 					      AbstractEOFAFermion<Impl>& _Rop,
 					      OperatorFunction<FermionField>& HeatbathCGL, OperatorFunction<FermionField>& HeatbathCGR,
 					      OperatorFunction<FermionField>& ActionCGL, OperatorFunction<FermionField>& ActionCGR, 
 					      OperatorFunction<FermionField>& DerivCGL , OperatorFunction<FermionField>& DerivCGR, 
 					      Params& p, 
 					      bool use_fc=false) : 
        Lop(_Lop), 
 	Rop(_Rop), 
-	SolverHBL(HeatbathCGL,false,true), SolverHBR(HeatbathCGR,false,true),
+	SolverHB(HeatbathCG,false,true),
 	SolverL(ActionCGL, false, true), SolverR(ActionCGR, false, true), 
 	DerivativeSolverL(DerivCGL, false, true), DerivativeSolverR(DerivCGR, false, true), 
 	Phi(_Lop.FermionGrid()), 
 	param(p), 
-	use_heatbath_forecasting(use_fc),
+        use_heatbath_forecasting(use_fc)
 	initial_action(false)
      {
        AlgRemez remez(param.lo, param.hi, param.precision);
@@ -124,8 +97,6 @@ NAMESPACE_BEGIN(Grid);
        PowerNegHalf.Init(remez, param.tolerance, true);
      };
      const FermionField &getPhi() const{ return Phi; }
      virtual std::string action_name() { return "ExactOneFlavourRatioPseudoFermionAction"; }
      virtual std::string LogParameters() {
@@ -146,19 +117,6 @@ NAMESPACE_BEGIN(Grid);
        else{ for(int s=0; s<Ls; ++s){ axpby_ssp_pminus(out, 0.0, in, 1.0, in, s, s); } }
      }
      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
        // P(eta_o) = e^{- eta_o^dag eta_o}
        //
        // e^{x^2/2 sig^2} => sig^2 = 0.5.
        // 
        RealD scale = std::sqrt(0.5);
        FermionField eta    (Lop.FermionGrid());
        gaussian(pRNG,eta); eta = eta * scale;
 	refresh(U,eta);
      }
      // EOFA heatbath: see Eqn. (29) of arXiv:1706.05843
      // We generate a Gaussian noise vector \eta, and then compute
      //  \Phi = M_{\rm EOFA}^{-1/2} * \eta
@@ -166,10 +124,12 @@ NAMESPACE_BEGIN(Grid);
      //
      // As a check of rational require \Phi^dag M_{EOFA} \Phi == eta^dag M^-1/2^dag M M^-1/2 eta = eta^dag eta
      //
-     void refresh(const GaugeField &U, const FermionField &eta) {
+      virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
      {
        Lop.ImportGauge(U);
        Rop.ImportGauge(U);
        FermionField eta         (Lop.FermionGrid());
        FermionField CG_src      (Lop.FermionGrid());
        FermionField CG_soln     (Lop.FermionGrid());
        FermionField Forecast_src(Lop.FermionGrid());
@@ -180,6 +140,11 @@ NAMESPACE_BEGIN(Grid);
        if(use_heatbath_forecasting){ prev_solns.reserve(param.degree); }
        ChronoForecast<AbstractEOFAFermion<Impl>, FermionField> Forecast;
        // Seed with Gaussian noise vector (var = 0.5)
        RealD scale = std::sqrt(0.5);
        gaussian(pRNG,eta);
        eta = eta * scale;
        // \Phi = ( \alpha_{0} + \sum_{k=1}^{N_{p}} \alpha_{l} * \gamma_{l} ) * \eta
        RealD N(PowerNegHalf.norm);
        for(int k=0; k<param.degree; ++k){ N += PowerNegHalf.residues[k] / ( 1.0 + PowerNegHalf.poles[k] ); }
@@ -195,15 +160,15 @@ NAMESPACE_BEGIN(Grid);
        tmp[1] = Zero();
        for(int k=0; k<param.degree; ++k){
          gamma_l = 1.0 / ( 1.0 + PowerNegHalf.poles[k] );
-          heatbathRefreshShiftCoefficients(0, -gamma_l);
+          Lop.RefreshShiftCoefficients(-gamma_l);
          if(use_heatbath_forecasting){ // Forecast CG guess using solutions from previous poles
            Lop.Mdag(CG_src, Forecast_src);
            CG_soln = Forecast(Lop, Forecast_src, prev_solns);
-            SolverHBL(Lop, CG_src, CG_soln);
+            SolverHB(Lop, CG_src, CG_soln);
            prev_solns.push_back(CG_soln);
          } else {
            CG_soln = Zero(); // Just use zero as the initial guess
-	    SolverHBL(Lop, CG_src, CG_soln);
+            SolverHB(Lop, CG_src, CG_soln);
          }
          Lop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
          tmp[1] = tmp[1] + ( PowerNegHalf.residues[k]*gamma_l*gamma_l*Lop.k ) * tmp[0];
@@ -222,15 +187,15 @@ NAMESPACE_BEGIN(Grid);
        if(use_heatbath_forecasting){ prev_solns.clear(); } // empirically, LH solns don't help for RH solves
        for(int k=0; k<param.degree; ++k){
          gamma_l = 1.0 / ( 1.0 + PowerNegHalf.poles[k] );
-	  heatbathRefreshShiftCoefficients(1, -gamma_l*PowerNegHalf.poles[k]);
+          Rop.RefreshShiftCoefficients(-gamma_l*PowerNegHalf.poles[k]);
          if(use_heatbath_forecasting){
            Rop.Mdag(CG_src, Forecast_src);
            CG_soln = Forecast(Rop, Forecast_src, prev_solns);
-            SolverHBR(Rop, CG_src, CG_soln);
+            SolverHB(Rop, CG_src, CG_soln);
            prev_solns.push_back(CG_soln);
          } else {
            CG_soln = Zero();
-            SolverHBR(Rop, CG_src, CG_soln);
+            SolverHB(Rop, CG_src, CG_soln);
          }
          Rop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
          tmp[1] = tmp[1] - ( PowerNegHalf.residues[k]*gamma_l*gamma_l*Rop.k ) * tmp[0];
@@ -240,117 +205,49 @@ NAMESPACE_BEGIN(Grid);
        Phi = Phi + tmp[1];
        // Reset shift coefficients for energy and force evals
-	heatbathRefreshShiftCoefficients(0, 0.0);
+        Lop.RefreshShiftCoefficients(0.0);
-	heatbathRefreshShiftCoefficients(1, -1.0);
+        Rop.RefreshShiftCoefficients(-1.0);
 	//Mark that the next call to S is the first after refresh
 	initial_action = true;
 	// Bounds check
 	RealD EtaDagEta = norm2(eta);
 	norm2_eta = EtaDagEta;
 	//	RealD PhiDagMPhi= norm2(eta);
      };
-      void Meofa(const GaugeField& U,const FermionField &in, FermionField & out) 
+      void Meofa(const GaugeField& U,const FermionField &phi, FermionField & Mphi) 
      {
 #if 0
        Lop.ImportGauge(U);
        Rop.ImportGauge(U);
-        FermionField spProj_in(Lop.FermionGrid());
+        FermionField spProj_Phi(Lop.FermionGrid());
 	FermionField mPhi(Lop.FermionGrid());
        std::vector<FermionField> tmp(2, Lop.FermionGrid());
-	out = in;
+	mPhi = phi;
        // LH term: S = S - k <\Phi| P_{-} \Omega_{-}^{\dagger} H(mf)^{-1} \Omega_{-} P_{-} |\Phi>
-        spProj(in, spProj_in, -1, Lop.Ls);
+        spProj(Phi, spProj_Phi, -1, Lop.Ls);
-        Lop.Omega(spProj_in, tmp[0], -1, 0);
+        Lop.Omega(spProj_Phi, tmp[0], -1, 0);
        G5R5(tmp[1], tmp[0]);
        tmp[0] = Zero();
        SolverL(Lop, tmp[1], tmp[0]);
        Lop.Dtilde(tmp[0], tmp[1]); // We actually solved Cayley preconditioned system: transform back
        Lop.Omega(tmp[1], tmp[0], -1, 1);
-	spProj(tmp[0], tmp[1], -1, Lop.Ls);
+	mPhi = mPhi -  Lop.k * innerProduct(spProj_Phi, tmp[0]).real();
 	out = out -  Lop.k * tmp[1];
        // RH term: S = S + k <\Phi| P_{+} \Omega_{+}^{\dagger} ( H(mb)
-        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} |\Phi>
+        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{-} P_{-} |\Phi>
-        spProj(in, spProj_in, 1, Rop.Ls);
+        spProj(Phi, spProj_Phi, 1, Rop.Ls);
-        Rop.Omega(spProj_in, tmp[0], 1, 0);
+        Rop.Omega(spProj_Phi, tmp[0], 1, 0);
        G5R5(tmp[1], tmp[0]);
        tmp[0] = Zero();
        SolverR(Rop, tmp[1], tmp[0]);
        Rop.Dtilde(tmp[0], tmp[1]);
        Rop.Omega(tmp[1], tmp[0], 1, 1);
-	spProj(tmp[0], tmp[1], 1, Rop.Ls);
+        action += Rop.k * innerProduct(spProj_Phi, tmp[0]).real();
-
+#endif
        out = out + Rop.k * tmp[1];
      }
      //Due to the structure of EOFA, it is no more expensive to compute the inverse of Meofa
      //To ensure correctness we can simply reuse the heatbath code but use the rational approx
      //f(x) = 1/x   which corresponds to alpha_0=0,  alpha_1=1,  beta_1=0 => gamma_1=1
      void MeofaInv(const GaugeField &U, const FermionField &in, FermionField &out) {
        Lop.ImportGauge(U);
        Rop.ImportGauge(U);
        FermionField CG_src      (Lop.FermionGrid());
        FermionField CG_soln     (Lop.FermionGrid());
        std::vector<FermionField> tmp(2, Lop.FermionGrid());
        // \Phi = ( \alpha_{0} + \sum_{k=1}^{N_{p}} \alpha_{l} * \gamma_{l} ) * \eta
 	// = 1 * \eta
        out = in;
        // LH terms:
        // \Phi = \Phi + k \sum_{k=1}^{N_{p}} P_{-} \Omega_{-}^{\dagger} ( H(mf)
        //          - \gamma_{l} \Delta_{-}(mf,mb) P_{-} )^{-1} \Omega_{-} P_{-} \eta
        spProj(in, tmp[0], -1, Lop.Ls);
        Lop.Omega(tmp[0], tmp[1], -1, 0);
        G5R5(CG_src, tmp[1]);
        {
          heatbathRefreshShiftCoefficients(0, -1.); //-gamma_1 = -1.
 	  CG_soln = Zero(); // Just use zero as the initial guess
 	  SolverHBL(Lop, CG_src, CG_soln);
          Lop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
          tmp[1] = Lop.k * tmp[0];
        }
        Lop.Omega(tmp[1], tmp[0], -1, 1);
        spProj(tmp[0], tmp[1], -1, Lop.Ls);
        out = out + tmp[1];
        // RH terms:
        // \Phi = \Phi - k \sum_{k=1}^{N_{p}} P_{+} \Omega_{+}^{\dagger} ( H(mb)
        //          - \beta_l\gamma_{l} \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} \eta
        spProj(in, tmp[0], 1, Rop.Ls);
        Rop.Omega(tmp[0], tmp[1], 1, 0);
        G5R5(CG_src, tmp[1]);
        {
 	  heatbathRefreshShiftCoefficients(1, 0.); //-gamma_1 * beta_1 = 0
 	  CG_soln = Zero();
 	  SolverHBR(Rop, CG_src, CG_soln);
          Rop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
          tmp[1] = - Rop.k * tmp[0];
        }
        Rop.Omega(tmp[1], tmp[0], 1, 1);
        spProj(tmp[0], tmp[1], 1, Rop.Ls);
        out = out + tmp[1];
        // Reset shift coefficients for energy and force evals
 	heatbathRefreshShiftCoefficients(0, 0.0);
 	heatbathRefreshShiftCoefficients(1, -1.0);
      };
      // EOFA action: see Eqn. (10) of arXiv:1706.05843
      virtual RealD S(const GaugeField& U)
      {
@@ -374,7 +271,7 @@ NAMESPACE_BEGIN(Grid);
        action -= Lop.k * innerProduct(spProj_Phi, tmp[0]).real();
        // RH term: S = S + k <\Phi| P_{+} \Omega_{+}^{\dagger} ( H(mb)
-        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} |\Phi>
+        //               - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{-} P_{-} |\Phi>
        spProj(Phi, spProj_Phi, 1, Rop.Ls);
        Rop.Omega(spProj_Phi, tmp[0], 1, 0);
        G5R5(tmp[1], tmp[0]);
@@ -384,26 +281,6 @@ NAMESPACE_BEGIN(Grid);
        Rop.Omega(tmp[1], tmp[0], 1, 1);
        action += Rop.k * innerProduct(spProj_Phi, tmp[0]).real();
 	if(initial_action){
 	  //For the first call to S after refresh,  S = |eta|^2. We can use this to ensure the rational approx is good
 	  RealD diff = action - norm2_eta;
 	  //S_init = eta^dag M^{-1/2} M M^{-1/2} eta
 	  //S_init - eta^dag eta =  eta^dag ( M^{-1/2} M M^{-1/2} - 1 ) eta
 	  //If approximate solution
 	  //S_init - eta^dag eta =  eta^dag ( [M^{-1/2}+\delta M^{-1/2}] M [M^{-1/2}+\delta M^{-1/2}] - 1 ) eta
 	  //               \approx  eta^dag ( \delta M^{-1/2} M^{1/2} + M^{1/2}\delta M^{-1/2} ) eta
 	  // We divide out |eta|^2 to remove source scaling but the tolerance on this check should still be somewhat higher than the actual approx tolerance
 	  RealD test = fabs(diff)/norm2_eta; //test the quality of the rational approx
 	  std::cout << GridLogMessage << action_name() << " initial action " << action << " expect " << norm2_eta << "; diff " << diff << std::endl;
 	  std::cout << GridLogMessage << action_name() << "[ eta^dag ( M^{-1/2} M M^{-1/2} - 1 ) eta ]/|eta^2| = " << test << "  expect 0 (tol " << param.BoundsCheckTol << ")" << std::endl;
 	  assert( ( test < param.BoundsCheckTol ) && " Initial action check failed" );
 	  initial_action = false;
 	}
        return action;
      };
@@ -452,40 +329,6 @@ NAMESPACE_BEGIN(Grid);
      };
  };
  template<class ImplD, class ImplF>
  class ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction : public ExactOneFlavourRatioPseudoFermionAction<ImplD>{
  public:
    INHERIT_IMPL_TYPES(ImplD);
    typedef OneFlavourRationalParams Params;
  private:
    AbstractEOFAFermion<ImplF>& LopF; // the basic LH operator
    AbstractEOFAFermion<ImplF>& RopF; // the basic RH operator
  public:
    virtual std::string action_name() { return "ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction"; }
    //Used in the heatbath, refresh the shift coefficients of the L (LorR=0) or R (LorR=1) operator
    virtual void heatbathRefreshShiftCoefficients(int LorR, RealD to){
      AbstractEOFAFermion<ImplF> &op = LorR == 0 ? LopF : RopF;
      op.RefreshShiftCoefficients(to);
      this->ExactOneFlavourRatioPseudoFermionAction<ImplD>::heatbathRefreshShiftCoefficients(LorR,to);
    }
    ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction(AbstractEOFAFermion<ImplF>& _LopF, 
 							     AbstractEOFAFermion<ImplF>& _RopF,
 							     AbstractEOFAFermion<ImplD>& _LopD, 
 							     AbstractEOFAFermion<ImplD>& _RopD,
 							     OperatorFunction<FermionField>& HeatbathCGL, OperatorFunction<FermionField>& HeatbathCGR,
 							     OperatorFunction<FermionField>& ActionCGL, OperatorFunction<FermionField>& ActionCGR, 
 							     OperatorFunction<FermionField>& DerivCGL , OperatorFunction<FermionField>& DerivCGR, 
 							     Params& p, 
 							     bool use_fc=false) : 
    LopF(_LopF), RopF(_RopF), ExactOneFlavourRatioPseudoFermionAction<ImplD>(_LopD, _RopD, HeatbathCGL, HeatbathCGR, ActionCGL, ActionCGR, DerivCGL, DerivCGR, p, use_fc){}
  };
 NAMESPACE_END(Grid);
 #endif
@@ -1,372 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h
    Copyright (C) 2015
    Author: Christopher Kelly <ckelly@bnl.gov>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_H
 #define QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_H
 NAMESPACE_BEGIN(Grid);
    /////////////////////////////////////////////////////////
    // Generic rational approximation for ratios of operators
    /////////////////////////////////////////////////////////
    /* S_f = -log( det(  [M^dag M]/[V^dag V] )^{1/inv_pow}  )
           = chi^dag ( [M^dag M]/[V^dag V] )^{-1/inv_pow} chi\
 	   = chi^dag ( [V^dag V]^{-1/2} [M^dag M] [V^dag V]^{-1/2} )^{-1/inv_pow} chi\
 	   = chi^dag [V^dag V]^{1/(2*inv_pow)} [M^dag M]^{-1/inv_pow} [V^dag V]^{1/(2*inv_pow)} chi\
 	   S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
       BIG WARNING:	   
       Here V^dag V is referred to in this code as the "numerator" operator and M^dag M is the *denominator* operator.
       this refers to their position in the pseudofermion action, which is the *inverse* of what appears in the determinant
       Thus for DWF the numerator operator is the Pauli-Villars operator
       Here P/Q \sim R_{1/(2*inv_pow)}  ~ (V^dagV)^{1/(2*inv_pow)}  
       Here N/D \sim R_{-1/inv_pow} ~ (M^dagM)^{-1/inv_pow}  
    */
    template<class Impl>
    class GeneralEvenOddRatioRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
    public:
      INHERIT_IMPL_TYPES(Impl);
      typedef RationalActionParams Params;
      Params param;
      //For action evaluation
      MultiShiftFunction ApproxPowerAction   ;  //rational approx for X^{1/inv_pow}
      MultiShiftFunction ApproxNegPowerAction;  //rational approx for X^{-1/inv_pow}
      MultiShiftFunction ApproxHalfPowerAction;   //rational approx for X^{1/(2*inv_pow)}
      MultiShiftFunction ApproxNegHalfPowerAction; //rational approx for X^{-1/(2*inv_pow)}
      //For the MD integration
      MultiShiftFunction ApproxPowerMD   ;  //rational approx for X^{1/inv_pow}
      MultiShiftFunction ApproxNegPowerMD;  //rational approx for X^{-1/inv_pow}
      MultiShiftFunction ApproxHalfPowerMD;   //rational approx for X^{1/(2*inv_pow)}
      MultiShiftFunction ApproxNegHalfPowerMD; //rational approx for X^{-1/(2*inv_pow)}
    private:
      FermionOperator<Impl> & NumOp;// the basic operator
      FermionOperator<Impl> & DenOp;// the basic operator
      FermionField PhiEven; // the pseudo fermion field for this trajectory
      FermionField PhiOdd; // the pseudo fermion field for this trajectory
      //Generate the approximation to x^{1/inv_pow} (->approx)   and x^{-1/inv_pow} (-> approx_inv)  by an approx_degree degree rational approximation
      //CG_tolerance is used to issue a warning if the approximation error is larger than the tolerance of the CG and is otherwise just stored in the MultiShiftFunction for use by the multi-shift
      static void generateApprox(MultiShiftFunction &approx, MultiShiftFunction &approx_inv, int inv_pow, int approx_degree, double CG_tolerance, AlgRemez &remez){
 	std::cout<<GridLogMessage << "Generating degree "<< approx_degree<<" approximation for x^(1/" << inv_pow << ")"<<std::endl;
 	double error = remez.generateApprox(approx_degree,1,inv_pow);	
 	if(error > CG_tolerance)
 	  std::cout<<GridLogMessage << "WARNING: Remez approximation has a larger error " << error << " than the CG tolerance " << CG_tolerance << "! Try increasing the number of poles" << std::endl;
 	approx.Init(remez, CG_tolerance,false);
 	approx_inv.Init(remez, CG_tolerance,true);
      }
    protected:
      static constexpr bool Numerator = true;
      static constexpr bool Denominator = false;
      //Allow derived classes to override the multishift CG
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionField &in, FermionField &out){
 	SchurDifferentiableOperator<Impl> schurOp(numerator ? NumOp : DenOp);
 	ConjugateGradientMultiShift<FermionField> msCG(MaxIter, approx);
 	msCG(schurOp,in, out);
      }
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionField &in, std::vector<FermionField> &out_elems, FermionField &out){
 	SchurDifferentiableOperator<Impl> schurOp(numerator ? NumOp : DenOp);
 	ConjugateGradientMultiShift<FermionField> msCG(MaxIter, approx);
 	msCG(schurOp,in, out_elems, out);
      }
      //Allow derived classes to override the gauge import
      virtual void ImportGauge(const GaugeField &U){
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
      }
    public:
      GeneralEvenOddRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
 						     FermionOperator<Impl>  &_DenOp, 
 						     const Params & p
 						     ) : 
 	NumOp(_NumOp), 
 	DenOp(_DenOp), 
 	PhiOdd (_NumOp.FermionRedBlackGrid()),
 	PhiEven(_NumOp.FermionRedBlackGrid()),
 	param(p) 
      {
 	std::cout<<GridLogMessage << action_name() << " initialize: starting" << std::endl;
 	AlgRemez remez(param.lo,param.hi,param.precision);
 	//Generate approximations for action eval
 	generateApprox(ApproxPowerAction, ApproxNegPowerAction, param.inv_pow, param.action_degree, param.action_tolerance, remez);
 	generateApprox(ApproxHalfPowerAction, ApproxNegHalfPowerAction, 2*param.inv_pow, param.action_degree, param.action_tolerance, remez);
 	//Generate approximations for MD
 	if(param.md_degree != param.action_degree){ //note the CG tolerance is unrelated to the stopping condition of the Remez algorithm
 	  generateApprox(ApproxPowerMD, ApproxNegPowerMD, param.inv_pow, param.md_degree, param.md_tolerance, remez);
 	  generateApprox(ApproxHalfPowerMD, ApproxNegHalfPowerMD, 2*param.inv_pow, param.md_degree, param.md_tolerance, remez);
 	}else{
 	  std::cout<<GridLogMessage << "Using same rational approximations for MD as for action evaluation" << std::endl;
 	  ApproxPowerMD = ApproxPowerAction; 
 	  ApproxNegPowerMD = ApproxNegPowerAction;
 	  for(int i=0;i<ApproxPowerMD.tolerances.size();i++)
 	    ApproxNegPowerMD.tolerances[i] = ApproxPowerMD.tolerances[i] = param.md_tolerance; //used for multishift
 	  ApproxHalfPowerMD = ApproxHalfPowerAction;
 	  ApproxNegHalfPowerMD = ApproxNegHalfPowerAction;
 	  for(int i=0;i<ApproxPowerMD.tolerances.size();i++)
 	    ApproxNegHalfPowerMD.tolerances[i] = ApproxHalfPowerMD.tolerances[i] = param.md_tolerance;
 	}
 	std::cout<<GridLogMessage << action_name() << " initialize: complete" << std::endl;
      };
      virtual std::string action_name(){return "GeneralEvenOddRatioRationalPseudoFermionAction";}
      virtual std::string LogParameters(){
 	std::stringstream sstream;
 	sstream << GridLogMessage << "["<<action_name()<<"] Power              : 1/" << param.inv_pow <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Low                :" << param.lo <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] High               :" << param.hi <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Max iterations     :" << param.MaxIter <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Tolerance (Action) :" << param.action_tolerance <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Degree (Action)    :" << param.action_degree <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Tolerance (MD)     :" << param.md_tolerance <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Degree (MD)        :" << param.md_degree <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Precision          :" << param.precision <<  std::endl;
 	return sstream.str();
      }
      //Access the fermion field
      const FermionField &getPhiOdd() const{ return PhiOdd; }
      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
 	std::cout<<GridLogMessage << action_name() << " refresh: starting" << std::endl;
 	FermionField eta(NumOp.FermionGrid());	
 	// P(eta) \propto e^{- eta^dag eta}
 	//	
 	// The gaussian function draws from  P(x) \propto e^{- x^2 / 2 }    [i.e. sigma=1]
 	// Thus eta = x/sqrt{2} = x * sqrt(1/2)
 	RealD scale = std::sqrt(0.5);
 	gaussian(pRNG,eta);	eta=eta*scale;
 	refresh(U,eta);
      }
      //Allow for manual specification of random field for testing
      void refresh(const GaugeField &U, const FermionField &eta) {
 	// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
 	//
 	// P(phi) = e^{- phi^dag (VdagV)^1/(2*inv_pow) (MdagM)^-1/inv_pow (VdagV)^1/(2*inv_pow) phi}
 	//        = e^{- phi^dag  (VdagV)^1/(2*inv_pow) (MdagM)^-1/(2*inv_pow) (MdagM)^-1/(2*inv_pow)  (VdagV)^1/(2*inv_pow) phi}
 	//
 	// Phi =  (VdagV)^-1/(2*inv_pow) Mdag^{1/(2*inv_pow)} eta 
 	std::cout<<GridLogMessage << action_name() << " refresh: starting" << std::endl;
 	FermionField etaOdd (NumOp.FermionRedBlackGrid());
 	FermionField etaEven(NumOp.FermionRedBlackGrid());
 	FermionField     tmp(NumOp.FermionRedBlackGrid());
 	pickCheckerboard(Even,etaEven,eta);
 	pickCheckerboard(Odd,etaOdd,eta);
 	ImportGauge(U);
 	// MdagM^1/(2*inv_pow) eta
 	std::cout<<GridLogMessage << action_name() << " refresh: doing (M^dag M)^{1/" << 2*param.inv_pow << "} eta" << std::endl;
 	multiShiftInverse(Denominator, ApproxHalfPowerAction, param.MaxIter, etaOdd, tmp);
 	// VdagV^-1/(2*inv_pow) MdagM^1/(2*inv_pow) eta
 	std::cout<<GridLogMessage << action_name() << " refresh: doing (V^dag V)^{-1/" << 2*param.inv_pow << "} ( (M^dag M)^{1/" << 2*param.inv_pow << "} eta)" << std::endl;
 	multiShiftInverse(Numerator, ApproxNegHalfPowerAction, param.MaxIter, tmp, PhiOdd);
 	assert(NumOp.ConstEE() == 1);
 	assert(DenOp.ConstEE() == 1);
 	PhiEven = Zero();
 	std::cout<<GridLogMessage << action_name() << " refresh: starting" << std::endl;
      };
      //////////////////////////////////////////////////////
      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
      //////////////////////////////////////////////////////
      virtual RealD S(const GaugeField &U) {
 	std::cout<<GridLogMessage << action_name() << " compute action: starting" << std::endl;
 	ImportGauge(U);
 	FermionField X(NumOp.FermionRedBlackGrid());
 	FermionField Y(NumOp.FermionRedBlackGrid());
 	// VdagV^1/(2*inv_pow) Phi
 	std::cout<<GridLogMessage << action_name() << " compute action: doing (V^dag V)^{1/" << 2*param.inv_pow << "} Phi" << std::endl;
 	multiShiftInverse(Numerator, ApproxHalfPowerAction, param.MaxIter, PhiOdd,X);
 	// MdagM^-1/(2*inv_pow) VdagV^1/(2*inv_pow) Phi
 	std::cout<<GridLogMessage << action_name() << " compute action: doing (M^dag M)^{-1/" << 2*param.inv_pow << "} ( (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
 	multiShiftInverse(Denominator, ApproxNegHalfPowerAction, param.MaxIter, X,Y);
 	// Randomly apply rational bounds checks.
 	int rcheck = rand();
 	auto grid = NumOp.FermionGrid();
        auto r=rand();
        grid->Broadcast(0,r);
 	if ( param.BoundsCheckFreq != 0 && (r % param.BoundsCheckFreq)==0 ) { 
 	  std::cout<<GridLogMessage << action_name() << " compute action: doing bounds check" << std::endl;
 	  FermionField gauss(NumOp.FermionRedBlackGrid());
 	  gauss = PhiOdd;
 	  SchurDifferentiableOperator<Impl> MdagM(DenOp);
 	  std::cout<<GridLogMessage << action_name() << " compute action: checking high bounds" << std::endl;
 	  HighBoundCheck(MdagM,gauss,param.hi);
 	  std::cout<<GridLogMessage << action_name() << " compute action: full approximation" << std::endl;
 	  InversePowerBoundsCheck(param.inv_pow,param.MaxIter,param.action_tolerance*100,MdagM,gauss,ApproxNegPowerAction);
 	  std::cout<<GridLogMessage << action_name() << " compute action: bounds check complete" << std::endl;
 	}
 	//  Phidag VdagV^1/(2*inv_pow) MdagM^-1/(2*inv_pow)  MdagM^-1/(2*inv_pow) VdagV^1/(2*inv_pow) Phi
 	RealD action = norm2(Y);
 	std::cout<<GridLogMessage << action_name() << " compute action: complete" << std::endl;
 	return action;
      };
      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
      //
      // Here, M is some 5D operator and V is the Pauli-Villars field
      // N and D makeup the rat. poly of the M term and P and & makeup the rat.poly of the denom term
      //
      // Need  
      // dS_f/dU =  chi^dag d[P/Q]  N/D   P/Q  chi 
      //         +  chi^dag   P/Q d[N/D]  P/Q  chi 
      //         +  chi^dag   P/Q   N/D d[P/Q] chi 
      //
      // P/Q is expressed as partial fraction expansion: 
      // 
      //           a0 + \sum_k ak/(V^dagV + bk) 
      //  
      // d[P/Q] is then  
      //
      //          \sum_k -ak [V^dagV+bk]^{-1}  [ dV^dag V + V^dag dV ] [V^dag V + bk]^{-1} 
      //  
      // and similar for N/D. 
      // 
      // Need   
      //       MpvPhi_k   = [Vdag V + bk]^{-1} chi  
      //       MpvPhi     = {a0 +  \sum_k ak [Vdag V + bk]^{-1} }chi   
      //   
      //       MfMpvPhi_k = [MdagM+bk]^{-1} MpvPhi  
      //       MfMpvPhi   = {a0 +  \sum_k ak [Mdag M + bk]^{-1} } MpvPhi
      // 
      //       MpvMfMpvPhi_k = [Vdag V + bk]^{-1} MfMpvchi   
      //  
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
 	std::cout<<GridLogMessage << action_name() << " deriv: starting" << std::endl;
 	const int n_f  = ApproxNegPowerMD.poles.size();
 	const int n_pv = ApproxHalfPowerMD.poles.size();
 	std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionRedBlackGrid());
 	std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionRedBlackGrid());
 	std::vector<FermionField> MfMpvPhi_k   (n_f ,NumOp.FermionRedBlackGrid());
 	FermionField      MpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField    MfMpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField MpvMfMpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField           Y(NumOp.FermionRedBlackGrid());
 	GaugeField   tmp(NumOp.GaugeGrid());
 	ImportGauge(U);
 	std::cout<<GridLogMessage << action_name() << " deriv: doing (V^dag V)^{1/" << 2*param.inv_pow << "} Phi" << std::endl;
 	multiShiftInverse(Numerator, ApproxHalfPowerMD, param.MaxIter, PhiOdd,MpvPhi_k,MpvPhi);
 	std::cout<<GridLogMessage << action_name() << " deriv: doing (M^dag M)^{-1/" << param.inv_pow << "} ( (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
 	multiShiftInverse(Denominator, ApproxNegPowerMD, param.MaxIter, MpvPhi,MfMpvPhi_k,MfMpvPhi);
 	std::cout<<GridLogMessage << action_name() << " deriv: doing (V^dag V)^{1/" << 2*param.inv_pow << "} ( (M^dag M)^{-1/" << param.inv_pow << "} (V^dag V)^{1/" << 2*param.inv_pow << "} Phi)" << std::endl;
 	multiShiftInverse(Numerator, ApproxHalfPowerMD, param.MaxIter, MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi);
 	SchurDifferentiableOperator<Impl> MdagM(DenOp);
 	SchurDifferentiableOperator<Impl> VdagV(NumOp);
 	RealD ak;
 	dSdU = Zero();
 	// With these building blocks  
 	//  
 	//       dS/dU = 
 	//                 \sum_k -ak MfMpvPhi_k^dag      [ dM^dag M + M^dag dM ] MfMpvPhi_k         (1)
 	//             +   \sum_k -ak MpvMfMpvPhi_k^\dag  [ dV^dag V + V^dag dV ] MpvPhi_k           (2)
 	//                        -ak MpvPhi_k^dag        [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k      (3)
 	//(1)	
 	std::cout<<GridLogMessage << action_name() << " deriv: doing dS/dU part (1)" << std::endl;
 	for(int k=0;k<n_f;k++){
 	  ak = ApproxNegPowerMD.residues[k];
 	  MdagM.Mpc(MfMpvPhi_k[k],Y);
 	  MdagM.MpcDagDeriv(tmp , MfMpvPhi_k[k], Y );  dSdU=dSdU+ak*tmp;
 	  MdagM.MpcDeriv(tmp , Y, MfMpvPhi_k[k] );  dSdU=dSdU+ak*tmp;
 	}
 	//(2)
 	//(3)
 	std::cout<<GridLogMessage << action_name() << " deriv: doing dS/dU part (2)+(3)" << std::endl;
 	for(int k=0;k<n_pv;k++){
          ak = ApproxHalfPowerMD.residues[k];
 	  VdagV.Mpc(MpvPhi_k[k],Y);
 	  VdagV.MpcDagDeriv(tmp,MpvMfMpvPhi_k[k],Y); dSdU=dSdU+ak*tmp;
 	  VdagV.MpcDeriv   (tmp,Y,MpvMfMpvPhi_k[k]);  dSdU=dSdU+ak*tmp;     
 	  VdagV.Mpc(MpvMfMpvPhi_k[k],Y);                // V as we take Ydag 
 	  VdagV.MpcDeriv   (tmp,Y, MpvPhi_k[k]); dSdU=dSdU+ak*tmp;
 	  VdagV.MpcDagDeriv(tmp,MpvPhi_k[k], Y); dSdU=dSdU+ak*tmp;
 	}
 	//dSdU = Ta(dSdU);
 	std::cout<<GridLogMessage << action_name() << " deriv: complete" << std::endl;
      };
    };
 NAMESPACE_END(Grid);
 #endif
@@ -1,93 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h
    Copyright (C) 2015
    Author: Christopher Kelly <ckelly@bnl.gov>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
 #define QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
 NAMESPACE_BEGIN(Grid);
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Generic rational approximation for ratios of operators utilizing the mixed precision multishift algorithm
    // cf. GeneralEvenOddRational.h for details
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
    template<class ImplD, class ImplF>
    class GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<ImplD> {
    private:
      typedef typename ImplD::FermionField FermionFieldD;
      typedef typename ImplF::FermionField FermionFieldF;
      FermionOperator<ImplD> & NumOpD;
      FermionOperator<ImplD> & DenOpD;
      FermionOperator<ImplF> & NumOpF;
      FermionOperator<ImplF> & DenOpF;
      Integer ReliableUpdateFreq;
    protected:
      //Allow derived classes to override the multishift CG
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, FermionFieldD &out){
 	SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
 	SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
 	ConjugateGradientMultiShiftMixedPrec<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
 	msCG(schurOpD, in, out);
      }
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector<FermionFieldD> &out_elems, FermionFieldD &out){
 	SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
 	SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
 	ConjugateGradientMultiShiftMixedPrec<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
 	msCG(schurOpD, in, out_elems, out);
      }
      //Allow derived classes to override the gauge import
      virtual void ImportGauge(const typename ImplD::GaugeField &Ud){
 	typename ImplF::GaugeField Uf(NumOpF.GaugeGrid());
 	precisionChange(Uf, Ud);
 	NumOpD.ImportGauge(Ud);
 	DenOpD.ImportGauge(Ud);
 	NumOpF.ImportGauge(Uf);
 	DenOpF.ImportGauge(Uf);
      }
    public:
      GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction(FermionOperator<ImplD>  &_NumOpD, FermionOperator<ImplD>  &_DenOpD, 
 							      FermionOperator<ImplF>  &_NumOpF, FermionOperator<ImplF>  &_DenOpF, 
 							      const RationalActionParams & p, Integer _ReliableUpdateFreq
 							      ) : GeneralEvenOddRatioRationalPseudoFermionAction<ImplD>(_NumOpD, _DenOpD, p),
 								  ReliableUpdateFreq(_ReliableUpdateFreq), NumOpD(_NumOpD), DenOpD(_DenOpD), NumOpF(_NumOpF), DenOpF(_DenOpF){}
      virtual std::string action_name(){return "GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction";}
    };
 NAMESPACE_END(Grid);
 #endif
@@ -40,31 +40,249 @@ NAMESPACE_BEGIN(Grid);
    // Here N/D \sim R_{-1/2} ~ (M^dagM)^{-1/2}  
    template<class Impl>
-    class OneFlavourEvenOddRatioRationalPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<Impl> {
+    class OneFlavourEvenOddRatioRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
    public:
      INHERIT_IMPL_TYPES(Impl);
      typedef OneFlavourRationalParams Params;
      Params param;
      MultiShiftFunction PowerHalf   ;
      MultiShiftFunction PowerNegHalf;
      MultiShiftFunction PowerQuarter;
      MultiShiftFunction PowerNegQuarter;
    private:
-      static RationalActionParams transcribe(const Params &in){
+     
-	RationalActionParams out;
+      FermionOperator<Impl> & NumOp;// the basic operator
-	out.inv_pow = 2;
+      FermionOperator<Impl> & DenOp;// the basic operator
-	out.lo = in.lo;
+      FermionField PhiEven; // the pseudo fermion field for this trajectory
-	out.hi = in.hi;
+      FermionField PhiOdd; // the pseudo fermion field for this trajectory
 	out.MaxIter = in.MaxIter;
 	out.action_tolerance = out.md_tolerance = in.tolerance;
 	out.action_degree = out.md_degree = in.degree;
 	out.precision = in.precision;
 	out.BoundsCheckFreq = in.BoundsCheckFreq;
 	return out;
      }
    public:
      OneFlavourEvenOddRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
 							FermionOperator<Impl>  &_DenOp, 
 							const Params & p
 							) : 
 	GeneralEvenOddRatioRationalPseudoFermionAction<Impl>(_NumOp, _DenOp, transcribe(p)){}
-      virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}      
+      OneFlavourEvenOddRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
 					    FermionOperator<Impl>  &_DenOp, 
 					    Params & p
 					    ) : 
      NumOp(_NumOp), 
      DenOp(_DenOp), 
      PhiOdd (_NumOp.FermionRedBlackGrid()),
      PhiEven(_NumOp.FermionRedBlackGrid()),
      param(p) 
      {
 	AlgRemez remez(param.lo,param.hi,param.precision);
 	// MdagM^(+- 1/2)
 	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
 	remez.generateApprox(param.degree,1,2);
 	PowerHalf.Init(remez,param.tolerance,false);
 	PowerNegHalf.Init(remez,param.tolerance,true);
 	// MdagM^(+- 1/4)
 	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
 	remez.generateApprox(param.degree,1,4);
   	PowerQuarter.Init(remez,param.tolerance,false);
 	PowerNegQuarter.Init(remez,param.tolerance,true);
      };
      virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}
      virtual std::string LogParameters(){
 	std::stringstream sstream;
 	sstream << GridLogMessage << "["<<action_name()<<"] Low            :" << param.lo <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] High           :" << param.hi <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Max iterations :" << param.MaxIter <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Tolerance      :" << param.tolerance <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Degree         :" << param.degree <<  std::endl;
 	sstream << GridLogMessage << "["<<action_name()<<"] Precision      :" << param.precision <<  std::endl;
 	return sstream.str();
      }
      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
 	// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
 	//
 	// P(phi) = e^{- phi^dag (VdagV)^1/4 (MdagM)^-1/2 (VdagV)^1/4 phi}
 	//        = e^{- phi^dag  (VdagV)^1/4 (MdagM)^-1/4 (MdagM)^-1/4  (VdagV)^1/4 phi}
 	//
 	// Phi =  (VdagV)^-1/4 Mdag^{1/4} eta 
 	//
 	// P(eta) = e^{- eta^dag eta}
 	//
 	// e^{x^2/2 sig^2} => sig^2 = 0.5.
 	// 
 	// So eta should be of width sig = 1/sqrt(2).
 	RealD scale = std::sqrt(0.5);
 	FermionField eta(NumOp.FermionGrid());
 	FermionField etaOdd (NumOp.FermionRedBlackGrid());
 	FermionField etaEven(NumOp.FermionRedBlackGrid());
 	FermionField     tmp(NumOp.FermionRedBlackGrid());
 	gaussian(pRNG,eta);	eta=eta*scale;
 	pickCheckerboard(Even,etaEven,eta);
 	pickCheckerboard(Odd,etaOdd,eta);
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
 	// MdagM^1/4 eta
 	SchurDifferentiableOperator<Impl> MdagM(DenOp);
 	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerQuarter);
 	msCG_M(MdagM,etaOdd,tmp);
 	// VdagV^-1/4 MdagM^1/4 eta
 	SchurDifferentiableOperator<Impl> VdagV(NumOp);
 	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerNegQuarter);
 	msCG_V(VdagV,tmp,PhiOdd);
 	assert(NumOp.ConstEE() == 1);
 	assert(DenOp.ConstEE() == 1);
 	PhiEven = Zero();
      };
      //////////////////////////////////////////////////////
      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
      //////////////////////////////////////////////////////
      virtual RealD S(const GaugeField &U) {
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
 	FermionField X(NumOp.FermionRedBlackGrid());
 	FermionField Y(NumOp.FermionRedBlackGrid());
 	// VdagV^1/4 Phi
 	SchurDifferentiableOperator<Impl> VdagV(NumOp);
 	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
 	msCG_V(VdagV,PhiOdd,X);
 	// MdagM^-1/4 VdagV^1/4 Phi
 	SchurDifferentiableOperator<Impl> MdagM(DenOp);
 	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegQuarter);
 	msCG_M(MdagM,X,Y);
 	// Randomly apply rational bounds checks.
 	auto grid = NumOp.FermionGrid();
        auto r=rand();
        grid->Broadcast(0,r);
        if ( (r%param.BoundsCheckFreq)==0 ) { 
 	  FermionField gauss(NumOp.FermionRedBlackGrid());
 	  gauss = PhiOdd;
 	  HighBoundCheck(MdagM,gauss,param.hi);
 	  InverseSqrtBoundsCheck(param.MaxIter,param.tolerance*100,MdagM,gauss,PowerNegHalf);
 	}
 	//  Phidag VdagV^1/4 MdagM^-1/4  MdagM^-1/4 VdagV^1/4 Phi
 	RealD action = norm2(Y);
 	return action;
      };
      // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
      //
      // Here, M is some 5D operator and V is the Pauli-Villars field
      // N and D makeup the rat. poly of the M term and P and & makeup the rat.poly of the denom term
      //
      // Need  
      // dS_f/dU =  chi^dag d[P/Q]  N/D   P/Q  chi 
      //         +  chi^dag   P/Q d[N/D]  P/Q  chi 
      //         +  chi^dag   P/Q   N/D d[P/Q] chi 
      //
      // P/Q is expressed as partial fraction expansion: 
      // 
      //           a0 + \sum_k ak/(V^dagV + bk) 
      //  
      // d[P/Q] is then  
      //
      //          \sum_k -ak [V^dagV+bk]^{-1}  [ dV^dag V + V^dag dV ] [V^dag V + bk]^{-1} 
      //  
      // and similar for N/D. 
      // 
      // Need   
      //       MpvPhi_k   = [Vdag V + bk]^{-1} chi  
      //       MpvPhi     = {a0 +  \sum_k ak [Vdag V + bk]^{-1} }chi   
      //   
      //       MfMpvPhi_k = [MdagM+bk]^{-1} MpvPhi  
      //       MfMpvPhi   = {a0 +  \sum_k ak [Mdag M + bk]^{-1} } MpvPhi
      // 
      //       MpvMfMpvPhi_k = [Vdag V + bk]^{-1} MfMpvchi   
      //  
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
 	const int n_f  = PowerNegHalf.poles.size();
 	const int n_pv = PowerQuarter.poles.size();
 	std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionRedBlackGrid());
 	std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionRedBlackGrid());
 	std::vector<FermionField> MfMpvPhi_k   (n_f ,NumOp.FermionRedBlackGrid());
 	FermionField      MpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField    MfMpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField MpvMfMpvPhi(NumOp.FermionRedBlackGrid());
 	FermionField           Y(NumOp.FermionRedBlackGrid());
 	GaugeField   tmp(NumOp.GaugeGrid());
 	NumOp.ImportGauge(U);
 	DenOp.ImportGauge(U);
 	SchurDifferentiableOperator<Impl> VdagV(NumOp);
 	SchurDifferentiableOperator<Impl> MdagM(DenOp);
 	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
 	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegHalf);
 	msCG_V(VdagV,PhiOdd,MpvPhi_k,MpvPhi);
 	msCG_M(MdagM,MpvPhi,MfMpvPhi_k,MfMpvPhi);
 	msCG_V(VdagV,MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi);
 	RealD ak;
 	dSdU = Zero();
 	// With these building blocks  
 	//  
 	//       dS/dU = 
 	//                 \sum_k -ak MfMpvPhi_k^dag      [ dM^dag M + M^dag dM ] MfMpvPhi_k         (1)
 	//             +   \sum_k -ak MpvMfMpvPhi_k^\dag  [ dV^dag V + V^dag dV ] MpvPhi_k           (2)
 	//                        -ak MpvPhi_k^dag        [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k      (3)
 	//(1)
 	for(int k=0;k<n_f;k++){
 	  ak = PowerNegHalf.residues[k];
 	  MdagM.Mpc(MfMpvPhi_k[k],Y);
 	  MdagM.MpcDagDeriv(tmp , MfMpvPhi_k[k], Y );  dSdU=dSdU+ak*tmp;
 	  MdagM.MpcDeriv(tmp , Y, MfMpvPhi_k[k] );  dSdU=dSdU+ak*tmp;
 	}
 	//(2)
 	//(3)
 	for(int k=0;k<n_pv;k++){
          ak = PowerQuarter.residues[k];
 	  VdagV.Mpc(MpvPhi_k[k],Y);
 	  VdagV.MpcDagDeriv(tmp,MpvMfMpvPhi_k[k],Y); dSdU=dSdU+ak*tmp;
 	  VdagV.MpcDeriv   (tmp,Y,MpvMfMpvPhi_k[k]);  dSdU=dSdU+ak*tmp;     
 	  VdagV.Mpc(MpvMfMpvPhi_k[k],Y);                // V as we take Ydag 
 	  VdagV.MpcDeriv   (tmp,Y, MpvPhi_k[k]); dSdU=dSdU+ak*tmp;
 	  VdagV.MpcDagDeriv(tmp,MpvPhi_k[k], Y); dSdU=dSdU+ak*tmp;
 	}
 	//dSdU = Ta(dSdU);
      };
    };
 NAMESPACE_END(Grid);
@@ -49,12 +49,10 @@ NAMESPACE_BEGIN(Grid);
      Params param;
      MultiShiftFunction PowerHalf   ;
      MultiShiftFunction PowerQuarter;
      MultiShiftFunction PowerNegHalf;
      MultiShiftFunction PowerQuarter;
      MultiShiftFunction PowerNegQuarter;
      MultiShiftFunction MDPowerQuarter;
      MultiShiftFunction MDPowerNegHalf;
    private:
      FermionOperator<Impl> & NumOp;// the basic operator
@@ -75,13 +73,11 @@ NAMESPACE_BEGIN(Grid);
 	remez.generateApprox(param.degree,1,2);
 	PowerHalf.Init(remez,param.tolerance,false);
 	PowerNegHalf.Init(remez,param.tolerance,true);
 	MDPowerNegHalf.Init(remez,param.mdtolerance,true);
 	// MdagM^(+- 1/4)
 	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
 	remez.generateApprox(param.degree,1,4);
   	PowerQuarter.Init(remez,param.tolerance,false);
   	MDPowerQuarter.Init(remez,param.mdtolerance,false);
 	PowerNegQuarter.Init(remez,param.tolerance,true);
      };
@@ -208,8 +204,8 @@ NAMESPACE_BEGIN(Grid);
      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
-	const int n_f  = MDPowerNegHalf.poles.size();
+	const int n_f  = PowerNegHalf.poles.size();
-	const int n_pv = MDPowerQuarter.poles.size();
+	const int n_pv = PowerQuarter.poles.size();
 	std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionGrid());
 	std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionGrid());
@@ -228,8 +224,8 @@ NAMESPACE_BEGIN(Grid);
 	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagM(DenOp);
 	MdagMLinearOperator<FermionOperator<Impl> ,FermionField> VdagV(NumOp);
-	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,MDPowerQuarter);
+	ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
-	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,MDPowerNegHalf);
+	ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegHalf);
 	msCG_V(VdagV,Phi,MpvPhi_k,MpvPhi);
 	msCG_M(MdagM,MpvPhi,MfMpvPhi_k,MfMpvPhi);
@@ -248,7 +244,7 @@ NAMESPACE_BEGIN(Grid);
 	//(1)
 	for(int k=0;k<n_f;k++){
-	  ak = MDPowerNegHalf.residues[k];
+	  ak = PowerNegHalf.residues[k];
 	  DenOp.M(MfMpvPhi_k[k],Y);
 	  DenOp.MDeriv(tmp , MfMpvPhi_k[k], Y,DaggerYes );  dSdU=dSdU+ak*tmp;
 	  DenOp.MDeriv(tmp , Y, MfMpvPhi_k[k], DaggerNo );  dSdU=dSdU+ak*tmp;
@@ -258,7 +254,7 @@ NAMESPACE_BEGIN(Grid);
 	//(3)
 	for(int k=0;k<n_pv;k++){
-          ak = MDPowerQuarter.residues[k];
+          ak = PowerQuarter.residues[k];
 	  NumOp.M(MpvPhi_k[k],Y);
 	  NumOp.MDeriv(tmp,MpvMfMpvPhi_k[k],Y,DaggerYes); dSdU=dSdU+ak*tmp;
@@ -40,8 +40,6 @@ directory
 #include <Grid/qcd/action/pseudofermion/OneFlavourRational.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
 #include <Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h>
 #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
 #include <Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h>
@@ -75,22 +75,24 @@ NAMESPACE_BEGIN(Grid);
          conformable(_NumOp.GaugeRedBlackGrid(), _DenOp.GaugeRedBlackGrid());
        };
-      virtual std::string action_name(){
+      virtual std::string action_name(){return "TwoFlavourEvenOddRatioPseudoFermionAction";}
 	std::stringstream sstream;
 	sstream<<"TwoFlavourEvenOddRatioPseudoFermionAction det("<<DenOp.Mass()<<") / det("<<NumOp.Mass()<<")";
 	return sstream.str();
      }
      virtual std::string LogParameters(){
 	std::stringstream sstream;
-	sstream<< GridLogMessage << "["<<action_name()<<"] -- No further parameters "<<std::endl;
+	sstream << GridLogMessage << "["<<action_name()<<"] has no parameters" << std::endl;
 	return sstream.str();
      } 
      const FermionField &getPhiOdd() const{ return PhiOdd; }
      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
        // P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
        //
        // NumOp == V
        // DenOp == M
        //
        // Take phi_o = Vpcdag^{-1} Mpcdag eta_o  ; eta_o = Mpcdag^{-1} Vpcdag Phi
        //
        // P(eta_o) = e^{- eta_o^dag eta_o}
        //
        // e^{x^2/2 sig^2} => sig^2 = 0.5.
@@ -98,22 +100,12 @@ NAMESPACE_BEGIN(Grid);
        RealD scale = std::sqrt(0.5);
        FermionField eta    (NumOp.FermionGrid());
        gaussian(pRNG,eta); eta = eta * scale;
 	refresh(U,eta);
      }
      void refresh(const GaugeField &U, const FermionField &eta) {
        // P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
        //
        // NumOp == V
        // DenOp == M
        //
        FermionField etaOdd (NumOp.FermionRedBlackGrid());
        FermionField etaEven(NumOp.FermionRedBlackGrid());
        FermionField tmp    (NumOp.FermionRedBlackGrid());
        gaussian(pRNG,eta);
        pickCheckerboard(Even,etaEven,eta);
        pickCheckerboard(Odd,etaOdd,eta);
@@ -132,6 +124,10 @@ NAMESPACE_BEGIN(Grid);
        // Even det factors
        DenOp.MooeeDag(etaEven,tmp);
        NumOp.MooeeInvDag(tmp,PhiEven);
        PhiOdd =PhiOdd*scale;
        PhiEven=PhiEven*scale;
      };
      //////////////////////////////////////////////////////
@@ -1,203 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/TwoFlavourRatio.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////
 // Two flavour ratio
 ///////////////////////////////////////
 template<class Impl>
 class TwoFlavourRatioEO4DPseudoFermionAction : public Action<typename Impl::GaugeField> {
 public:
  INHERIT_IMPL_TYPES(Impl);
 private:
  typedef FermionOperator<Impl> FermOp;
  FermionOperator<Impl> & NumOp;// the basic operator
  FermionOperator<Impl> & DenOp;// the basic operator
  OperatorFunction<FermionField> &DerivativeSolver;
  OperatorFunction<FermionField> &DerivativeDagSolver;
  OperatorFunction<FermionField> &ActionSolver;
  OperatorFunction<FermionField> &HeatbathSolver;
  FermionField phi4; // the pseudo fermion field for this trajectory
 public:
  TwoFlavourRatioEO4DPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
 					 FermionOperator<Impl>  &_DenOp, 
 					 OperatorFunction<FermionField> & DS,
 					 OperatorFunction<FermionField> & AS ) : 
    TwoFlavourRatioEO4DPseudoFermionAction(_NumOp,_DenOp, DS,DS,AS,AS) {};
  TwoFlavourRatioEO4DPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
 					 FermionOperator<Impl>  &_DenOp, 
 					 OperatorFunction<FermionField> & DS,
 					 OperatorFunction<FermionField> & DDS,
 					 OperatorFunction<FermionField> & AS,
 					 OperatorFunction<FermionField> & HS
 				       ) : NumOp(_NumOp),
 					   DenOp(_DenOp),
 					   DerivativeSolver(DS),
 					   DerivativeDagSolver(DDS),
 					   ActionSolver(AS),
 					   HeatbathSolver(HS),
 					   phi4(_NumOp.GaugeGrid())
  {};
  virtual std::string action_name(){return "TwoFlavourRatioEO4DPseudoFermionAction";}
  virtual std::string LogParameters(){
    std::stringstream sstream;
    sstream << GridLogMessage << "["<<action_name()<<"] has no parameters" << std::endl;
    return sstream.str();
  }  
  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
    // P(phi) = e^{- phi^dag (V^dag M^-dag)_11  (M^-1 V)_11 phi}
    //
    // NumOp == V
    // DenOp == M
    //
    // Take phi = (V^{-1} M)_11 eta  ; eta = (M^{-1} V)_11 Phi
    //
    // P(eta) = e^{- eta^dag eta}
    //
    // e^{x^2/2 sig^2} => sig^2 = 0.5.
    // 
    // So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707....
    //
    RealD scale = std::sqrt(0.5);
    FermionField eta4(NumOp.GaugeGrid());
    FermionField eta5(NumOp.FermionGrid());
    FermionField tmp(NumOp.FermionGrid());
    FermionField phi5(NumOp.FermionGrid());
    gaussian(pRNG,eta4);
    NumOp.ImportFourDimPseudoFermion(eta4,eta5);
    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);
    SchurRedBlackDiagMooeeSolve<FermionField> PrecSolve(HeatbathSolver);
    DenOp.M(eta5,tmp);               // M eta
    PrecSolve(NumOp,tmp,phi5);  // phi = V^-1 M eta
    phi5=phi5*scale;
    std::cout << GridLogMessage << "4d pf refresh "<< norm2(phi5)<<"\n";
    // Project to 4d
    NumOp.ExportFourDimPseudoFermion(phi5,phi4);
  };
  //////////////////////////////////////////////////////
  // S = phi^dag (V^dag M^-dag)_11  (M^-1 V)_11 phi
  //////////////////////////////////////////////////////
  virtual RealD S(const GaugeField &U) {
    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);
    FermionField Y4(NumOp.GaugeGrid());
    FermionField X(NumOp.FermionGrid());
    FermionField Y(NumOp.FermionGrid());
    FermionField phi5(NumOp.FermionGrid());
    MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagMOp(DenOp);
    SchurRedBlackDiagMooeeSolve<FermionField> PrecSolve(ActionSolver);
    NumOp.ImportFourDimPseudoFermion(phi4,phi5);
    NumOp.M(phi5,X);              // X= V phi
    PrecSolve(DenOp,X,Y);    // Y= (MdagM)^-1 Mdag Vdag phi = M^-1 V phi
    NumOp.ExportFourDimPseudoFermion(Y,Y4);
    RealD action = norm2(Y4);
    return action;
  };
  //////////////////////////////////////////////////////
  // dS/du = 2 Re phi^dag (V^dag M^-dag)_11  (M^-1 d V)_11  phi
  //       - 2 Re phi^dag (dV^dag M^-dag)_11  (M^-1 dM M^-1 V)_11  phi
  //////////////////////////////////////////////////////
  virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);
    FermionField  X(NumOp.FermionGrid());
    FermionField  Y(NumOp.FermionGrid());
    FermionField       phi(NumOp.FermionGrid());
    FermionField      Vphi(NumOp.FermionGrid());
    FermionField  MinvVphi(NumOp.FermionGrid());
    FermionField      tmp4(NumOp.GaugeGrid());
    FermionField  MdagInvMinvVphi(NumOp.FermionGrid());
    GaugeField   force(NumOp.GaugeGrid());	
    //Y=V phi
    //X = (Mdag V phi
    //Y = (Mdag M)^-1 Mdag V phi = M^-1 V Phi
    NumOp.ImportFourDimPseudoFermion(phi4,phi);
    NumOp.M(phi,Vphi);               //  V phi
    SchurRedBlackDiagMooeeSolve<FermionField> PrecSolve(DerivativeSolver);
    PrecSolve(DenOp,Vphi,MinvVphi);// M^-1 V phi
    std::cout << GridLogMessage << "4d deriv solve "<< norm2(MinvVphi)<<"\n";
    // Projects onto the physical space and back
    NumOp.ExportFourDimPseudoFermion(MinvVphi,tmp4);
    NumOp.ImportFourDimPseudoFermion(tmp4,Y);
    SchurRedBlackDiagMooeeDagSolve<FermionField> PrecDagSolve(DerivativeDagSolver);
    // X = proj M^-dag V phi
    // Need an adjoint solve
    PrecDagSolve(DenOp,Y,MdagInvMinvVphi);
    std::cout << GridLogMessage << "4d deriv solve dag "<< norm2(MdagInvMinvVphi)<<"\n";
    // phi^dag (Vdag Mdag^-1) (M^-1 dV)  phi
    NumOp.MDeriv(force ,MdagInvMinvVphi , phi, DaggerNo );  dSdU=force;
    // phi^dag (dVdag Mdag^-1) (M^-1 V)  phi
    NumOp.MDeriv(force , phi, MdagInvMinvVphi ,DaggerYes  );  dSdU=dSdU+force;
    //    - 2 Re phi^dag (dV^dag M^-dag)_11  (M^-1 dM M^-1 V)_11  phi
    DenOp.MDeriv(force,MdagInvMinvVphi,MinvVphi,DaggerNo);   dSdU=dSdU-force;
    DenOp.MDeriv(force,MinvVphi,MdagInvMinvVphi,DaggerYes);  dSdU=dSdU-force;
    dSdU *= -1.0; 
    //dSdU = - Ta(dSdU);
  };
 };
 NAMESPACE_END(Grid);
@@ -1,6 +0,0 @@
 #ifndef GRID_GPARITY_H_
 #define GRID_GPARITY_H_
 #include<Grid/qcd/gparity/GparityFlavour.h>
 #endif
@@ -1,34 +0,0 @@
 #include <Grid/Grid.h>
 NAMESPACE_BEGIN(Grid);
 const std::array<const GparityFlavour, 3> GparityFlavour::sigma_mu = {{
    GparityFlavour(GparityFlavour::Algebra::SigmaX),
    GparityFlavour(GparityFlavour::Algebra::SigmaY),
    GparityFlavour(GparityFlavour::Algebra::SigmaZ)
    }};
 const std::array<const GparityFlavour, 6> GparityFlavour::sigma_all = {{
  GparityFlavour(GparityFlavour::Algebra::Identity),
  GparityFlavour(GparityFlavour::Algebra::SigmaX),
  GparityFlavour(GparityFlavour::Algebra::SigmaY),
  GparityFlavour(GparityFlavour::Algebra::SigmaZ),
  GparityFlavour(GparityFlavour::Algebra::ProjPlus),
  GparityFlavour(GparityFlavour::Algebra::ProjMinus)
 }};
 const std::array<const char *, GparityFlavour::nSigma> GparityFlavour::name = {{
    "SigmaX",
    "MinusSigmaX",
    "SigmaY",
    "MinusSigmaY",
    "SigmaZ",
    "MinusSigmaZ",
    "Identity",
    "MinusIdentity",
    "ProjPlus",
    "MinusProjPlus",
    "ProjMinus",
    "MinusProjMinus"}};
 NAMESPACE_END(Grid);
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Peter Boyle	59282f25ec	Update to static data	2021-12-07 23:41:27 +00:00
Peter Boyle	b0bd173899	Update to memory manager, never have a Cpu Open in the LRU queue. Place as evict next on CPU closure.	2021-12-07 17:26:22 -05:00
		`@@ -1 +0,0 @@`
			`../CompactWilsonCloverFermionInstantiation.cc.master`