Merge branch 'feature/ddhmc' of https://github.com/paboyle/Grid into feature/ddhmc

Several updates
Correct mass
2025-11-03 21:44:33 +00:00 · 2022-02-14 17:33:17 +01:00 · 2022-02-14 17:29:41 +01:00 · 2021-11-17 21:40:04 +00:00 · 2021-10-07 20:06:55 +01:00 · 2021-10-07 20:06:17 +01:00
258 changed files with 22030 additions and 21217 deletions
--- a/Grid/DisableWarnings.h
+++ b/Grid/DisableWarnings.h
@@ -34,6 +34,9 @@ directory
 #if defined __GNUC__ && __GNUC__>=6
 #pragma GCC diagnostic ignored "-Wignored-attributes"
 #endif
 #if defined __GNUC__ && __GNUC__>=6
 #pragma GCC diagnostic ignored "-Wpsabi"
 #endif
 //disables and intel compiler specific warning (in json.hpp)
@@ -44,22 +47,14 @@ directory
 #ifdef __NVCC__
 //disables nvcc specific warning in json.hpp
 #pragma clang diagnostic ignored "-Wdeprecated-register"
 #if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
 //disables nvcc specific warning in json.hpp
 #pragma nv_diag_suppress unsigned_compare_with_zero
 #pragma nv_diag_suppress cast_to_qualified_type
 //disables nvcc specific warning in many files
 #pragma nv_diag_suppress esa_on_defaulted_function_ignored
 #pragma nv_diag_suppress extra_semicolon
 #else
 //disables nvcc specific warning in json.hpp
 #pragma diag_suppress unsigned_compare_with_zero
 #pragma diag_suppress cast_to_qualified_type
 //disables nvcc specific warning in many files
 #pragma diag_suppress esa_on_defaulted_function_ignored
 #pragma diag_suppress extra_semicolon
-#endif
+
 //Eigen only
 #endif
 // Disable vectorisation in Eigen on the Power8/9 and PowerPC
--- a/Grid/GridQCDcore.h
+++ b/Grid/GridQCDcore.h
@@ -36,6 +36,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridCore.h>
 #include <Grid/qcd/QCD.h>
 #include <Grid/qcd/spin/Spin.h>
 #include <Grid/qcd/gparity/Gparity.h>
 #include <Grid/qcd/utils/Utils.h>
 #include <Grid/qcd/representations/Representations.h>
 NAMESPACE_CHECK(GridQCDCore);
--- a/Grid/GridStd.h
+++ b/Grid/GridStd.h
@@ -16,7 +16,6 @@
 #include <functional>
 #include <stdio.h>
 #include <stdlib.h>
 #include <strings.h>
 #include <stdio.h>
 #include <signal.h>
 #include <ctime>
--- a/Grid/Grid_Eigen_Dense.h
+++ b/Grid/Grid_Eigen_Dense.h
@@ -14,11 +14,7 @@
 /* NVCC save and restore compile environment*/
 #ifdef __NVCC__
 #pragma push
 #if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
 #pragma nv_diag_suppress code_is_unreachable
 #else
 #pragma diag_suppress code_is_unreachable
 #endif
 #pragma push_macro("__CUDA_ARCH__")
 #pragma push_macro("__NVCC__")
 #pragma push_macro("__CUDACC__")
--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@@ -54,6 +54,7 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
 #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -262,7 +262,7 @@ public:
 	autoView( Tnp_v , (*Tnp), AcceleratorWrite);
 	autoView( Tnm_v , (*Tnm), AcceleratorWrite);
 	const int Nsimd = CComplex::Nsimd();
-	accelerator_for(ss, FineGrid->oSites(), Nsimd, {
+	accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
 	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
 	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
        });
@@ -358,7 +358,7 @@ public:
    autoView( in_v , in, AcceleratorRead);
    autoView( out_v , out, AcceleratorWrite);
    autoView( Stencil_v  , Stencil, AcceleratorRead);
-    int npoint = geom.npoint;
+    auto& geom_v = geom;
    typedef LatticeView<Cobj> Aview;
    Vector<Aview> AcceleratorViewContainer;
@@ -380,7 +380,7 @@ public:
      int ptype;
      StencilEntry *SE;
-      for(int point=0;point<npoint;point++){
+      for(int point=0;point<geom_v.npoint;point++){
 	SE=Stencil_v.GetEntry(ptype,point,ss);
@@ -424,7 +424,7 @@ public:
    autoView( in_v , in, AcceleratorRead);
    autoView( out_v , out, AcceleratorWrite);
    autoView( Stencil_v  , Stencil, AcceleratorRead);
-    int npoint = geom.npoint;
+    auto& geom_v = geom;
    typedef LatticeView<Cobj> Aview;
    Vector<Aview> AcceleratorViewContainer;
@@ -454,7 +454,7 @@ public:
      int ptype;
      StencilEntry *SE;
-      for(int p=0;p<npoint;p++){
+      for(int p=0;p<geom_v.npoint;p++){
        int point = points_p[p];
 	SE=Stencil_v.GetEntry(ptype,point,ss);
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@@ -52,7 +52,6 @@ public:
  virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
  virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
  virtual void HermOp(const Field &in, Field &out)=0;
  virtual ~LinearOperatorBase(){};
 };
@@ -224,9 +223,14 @@ class SchurOperatorBase :  public LinearOperatorBase<Field> {
    Mpc(in,tmp);
    MpcDag(tmp,out);
  }
  virtual  void MpcMpcDag(const Field &in, Field &out) {
    Field tmp(in.Grid());
    tmp.Checkerboard() = in.Checkerboard();
    MpcDag(in,tmp);
    Mpc(tmp,out);
  }
  virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-    out.Checkerboard() = in.Checkerboard();
+    HermOp(in,out);
    MpcDagMpc(in,out);
    ComplexD dot= innerProduct(in,out); 
    n1=real(dot);
    n2=norm2(out);
@@ -277,6 +281,16 @@ template<class Matrix,class Field>
      axpy(out,-1.0,tmp,out);
    }
 };
 // Presents the Mpc MpcDag product as HermOp (the base class method MpcMpcDag
 // does the work); everything else is inherited from SchurDiagMooeeOperator.
 template<class Matrix,class Field>
 class SchurDiagMooeeDagOperator :  public SchurDiagMooeeOperator<Matrix,Field>
 {
 public:
  // Hands the matrix straight to the base-class constructor.
  SchurDiagMooeeDagOperator (Matrix &Mat)
    : SchurDiagMooeeOperator<Matrix,Field>(Mat)
  {}

  // out receives in's checkerboard, then out = MpcMpcDag(in).
  virtual void HermOp(const Field &in, Field &out)
  {
    out.Checkerboard() = in.Checkerboard();
    this->MpcMpcDag(in,out);
  }
 };
 template<class Matrix,class Field>
  class SchurDiagOneOperator :  public SchurOperatorBase<Field> {
 protected:
@@ -508,7 +522,7 @@ class SchurStaggeredOperator :  public SchurOperatorBase<Field> {
  virtual  void MpcDag   (const Field &in, Field &out){
    Mpc(in,out);
  }
-  virtual void MpcDagMpc(const Field &in, Field &out) {
+  virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
    assert(0);// Never need with staggered
  }
 };
@@ -586,7 +600,6 @@ class HermOpOperatorFunction : public OperatorFunction<Field> {
 template<typename Field>
 class PlainHermOp : public LinearFunction<Field> {
 public:
  using LinearFunction<Field>::operator();
  LinearOperatorBase<Field> &_Linop;
  PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop) 
@@ -600,7 +613,6 @@ public:
 template<typename Field>
 class FunctionHermOp : public LinearFunction<Field> {
 public:
  using LinearFunction<Field>::operator(); 
  OperatorFunction<Field>   & _poly;
  LinearOperatorBase<Field> &_Linop;
--- a/Grid/algorithms/Preconditioner.h
+++ b/Grid/algorithms/Preconditioner.h
@@ -30,19 +30,13 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
 template<class Field> using Preconditioner =  LinearFunction<Field> ;
 /*
 template<class Field> class Preconditioner :  public LinearFunction<Field> { 
  using LinearFunction<Field>::operator();
  virtual void operator()(const Field &src, Field & psi)=0;
 };
 */
 template<class Field> class TrivialPrecon :  public Preconditioner<Field> { 
 public:
-  using Preconditioner<Field>::operator();
+  void operator()(const Field &src, Field & psi){
  virtual void operator()(const Field &src, Field & psi){
    psi = src;
  }
  TrivialPrecon(void){};
--- a/Grid/algorithms/SparseMatrix.h
+++ b/Grid/algorithms/SparseMatrix.h
@@ -48,7 +48,6 @@ public:
  virtual  void Mdiag    (const Field &in, Field &out)=0;
  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0;
  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)=0;
  virtual ~SparseMatrixBase() {};
 };
 /////////////////////////////////////////////////////////////////////////////////////////////
@@ -73,7 +72,7 @@ public:
  virtual  void MeooeDag    (const Field &in, Field &out)=0;
  virtual  void MooeeDag    (const Field &in, Field &out)=0;
  virtual  void MooeeInvDag (const Field &in, Field &out)=0;
-  virtual ~CheckerBoardedSparseMatrixBase() {};
+
 };
 NAMESPACE_END(Grid);
--- a/Grid/algorithms/approx/Chebyshev.h
+++ b/Grid/algorithms/approx/Chebyshev.h
@@ -264,7 +264,7 @@ public:
      auto Tnp_v = Tnp->View();
      auto Tnm_v = Tnm->View();
      constexpr int Nsimd = vector_type::Nsimd();
-      accelerator_for(ss, in.Grid()->oSites(), Nsimd, {
+      accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
 	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
 	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
      });
@@ -292,6 +292,7 @@ public:
 template<class Field>
 class ChebyshevLanczos : public Chebyshev<Field> {
 private:
  std::vector<RealD> Coeffs;
  int order;
  RealD alpha;
--- a/Grid/algorithms/iterative/BiCGSTABMixedPrec.h
+++ b/Grid/algorithms/iterative/BiCGSTABMixedPrec.h
@@ -37,7 +37,6 @@ template<class FieldD, class FieldF, typename std::enable_if< getPrecision<Field
 class MixedPrecisionBiCGSTAB : public LinearFunction<FieldD> 
 {
  public:                                                
    using LinearFunction<FieldD>::operator();
    RealD   Tolerance;
    RealD   InnerTolerance; // Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
--- a/Grid/algorithms/iterative/ConjugateGradient.h
+++ b/Grid/algorithms/iterative/ConjugateGradient.h
@@ -102,7 +102,7 @@ public:
    // Check if guess is really REALLY good :)
    if (cp <= rsq) {
      TrueResidual = std::sqrt(a/ssq);
-      std::cout << GridLogMessage << "ConjugateGradient guess is converged already " << std::endl;
+      std::cout << GridLogMessage << "ConjugateGradient guess is converged already "<<TrueResidual<< " tol "<< Tolerance<< std::endl;
      IterationsToComplete = 0;	
      return;
    }
--- a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
@@ -36,7 +36,6 @@ NAMESPACE_BEGIN(Grid);
    typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
  class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
  public:                                                
    using LinearFunction<FieldD>::operator();
    RealD   Tolerance;
    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
@@ -49,19 +48,29 @@ NAMESPACE_BEGIN(Grid);
    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
    RealD TrueResidual;
    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
    LinearFunction<FieldF> *guesser;
-    MixedPrecisionConjugateGradient(RealD tol, 
+    MixedPrecisionConjugateGradient(RealD Tol,
 				    Integer maxinnerit, 
 				    Integer maxouterit, 
 				    GridBase* _sp_grid, 
 				    LinearOperatorBase<FieldF> &_Linop_f, 
 				    LinearOperatorBase<FieldD> &_Linop_d) :
      MixedPrecisionConjugateGradient(Tol, Tol, maxinnerit, maxouterit, _sp_grid, _Linop_f, _Linop_d) {};
    MixedPrecisionConjugateGradient(RealD Tol,
 				    RealD InnerTol,
 				    Integer maxinnerit, 
 				    Integer maxouterit, 
 				    GridBase* _sp_grid, 
 				    LinearOperatorBase<FieldF> &_Linop_f, 
 				    LinearOperatorBase<FieldD> &_Linop_d) :
      Linop_f(_Linop_f), Linop_d(_Linop_d),
-      Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
+      Tolerance(Tol), InnerTolerance(InnerTol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
-      OuterLoopNormMult(100.), guesser(NULL){ };
+      OuterLoopNormMult(100.), guesser(NULL){ assert(InnerTol < 1.0e-1);};
    void useGuesser(LinearFunction<FieldF> &g){
      guesser = &g;
@@ -80,6 +89,11 @@ NAMESPACE_BEGIN(Grid);
    RealD stop = src_norm * Tolerance*Tolerance;
    GridBase* DoublePrecGrid = src_d_in.Grid();
    //Generate precision change workspaces
    precisionChangeWorkspace wk_dp_from_sp(DoublePrecGrid, SinglePrecGrid);
    precisionChangeWorkspace wk_sp_from_dp(SinglePrecGrid, DoublePrecGrid);
    FieldD tmp_d(DoublePrecGrid);
    tmp_d.Checkerboard() = cb;
@@ -120,7 +134,7 @@ NAMESPACE_BEGIN(Grid);
      while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
      PrecChangeTimer.Start();
-      precisionChange(src_f, src_d);
+      precisionChange(src_f, src_d, wk_sp_from_dp);
      PrecChangeTimer.Stop();
      sol_f = Zero();
@@ -138,7 +152,7 @@ NAMESPACE_BEGIN(Grid);
      //Convert sol back to double and add to double prec solution
      PrecChangeTimer.Start();
-      precisionChange(tmp_d, sol_f);
+      precisionChange(tmp_d, sol_f, wk_dp_from_sp);
      PrecChangeTimer.Stop();
      axpy(sol_d, 1.0, tmp_d, sol_d);
@@ -150,6 +164,7 @@ NAMESPACE_BEGIN(Grid);
    ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
    CG_d(Linop_d, src_d_in, sol_d);
    TotalFinalStepIterations = CG_d.IterationsToComplete;
    TrueResidual = CG_d.TrueResidual;
    TotalTimer.Stop();
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
@@ -52,7 +52,7 @@ public:
  MultiShiftFunction shifts;
  std::vector<RealD> TrueResidualShift;
-  ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : 
+  ConjugateGradientMultiShift(Integer maxit, const MultiShiftFunction &_shifts) : 
    MaxIterations(maxit),
    shifts(_shifts)
  { 
@@ -183,6 +183,9 @@ public:
      axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
    }
    std::cout << GridLogIterative << "ConjugateGradientMultiShift: initial rn (|src|^2) =" << rn << " qq (|MdagM src|^2) =" << qq << " d ( dot(src, [MdagM + m_0]src) ) =" << d << " c=" << c << std::endl;
  ///////////////////////////////////////
  // Timers
  ///////////////////////////////////////
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
@@ -0,0 +1,411 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Christopher Kelly <ckelly@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
 #define GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
 NAMESPACE_BEGIN(Grid);
 //CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision. 
 //The residual is stored in single precision, but the search directions and solution are stored in double precision. 
 //Every ReliableUpdateFreq iterations the residual is corrected in double precision. 
 //For safety, a final (mixed-precision) CG is applied per shift to clean up if necessary.
 //Linop to add shift to input linop, used in cleanup CG
 namespace ConjugateGradientMultiShiftMixedPrecSupport{
 // Adapter that adds a real shift to another operator's HermOp:
 //   HermOp(in,out) : out = linop_base.HermOp(in) combined with shift*in via axpy.
 // Only HermOp and HermOpAndNorm are usable; every other LinearOperatorBase
 // entry point asserts.
 template<typename Field>
 class ShiftedLinop: public LinearOperatorBase<Field>
 {
 public:
  LinearOperatorBase<Field> &linop_base; // wrapped operator, held by reference (not owned)
  RealD shift;                           // shift applied on top of linop_base.HermOp

  ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift)
    : linop_base(_linop_base), shift(_shift)
  {}

  // Unsupported entry points: reaching any of these is a programming error.
  void OpDiag (const Field &in, Field &out)
  {
    assert(0);
  }
  void OpDir  (const Field &in, Field &out,int dir,int disp)
  {
    assert(0);
  }
  void OpDirAll  (const Field &in, std::vector<Field> &out)
  {
    assert(0);
  }
  void Op     (const Field &in, Field &out)
  {
    assert(0);
  }
  void AdjOp  (const Field &in, Field &out)
  {
    assert(0);
  }

  // Apply the wrapped HermOp, then fold shift*in into the result.
  void HermOp(const Field &in, Field &out)
  {
    linop_base.HermOp(in, out);
    axpy(out, shift, in, out);
  }

  // Shifted HermOp plus the two reductions:
  //   n1 = Re <in|out>,  n2 = |out|^2.
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)
  {
    HermOp(in,out);
    const ComplexD inner = innerProduct(in,out);
    n1 = real(inner);
    n2 = norm2(out);
  }
 };
 };
 // Multi-shift conjugate gradient with the operator applied in single precision.
 //
 // For every shift s the solver produces psi[s] with (HermOp + mass[s]) psi[s] = src,
 // where mass = shifts.poles and the per-shift relative tolerances are
 // shifts.tolerances. The residual is carried in single precision; the search
 // directions and solutions are double precision. Every ReliableUpdateFreq
 // iterations the residual of shift 0 is recomputed with the double-precision
 // operator. After convergence each shift is checked against the double-precision
 // operator and, if it misses its target, cleaned up with a
 // MixedPrecisionConjugateGradient on the shifted operator.
 template<class FieldD, class FieldF,
          typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
          typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
 class ConjugateGradientMultiShiftMixedPrec : public OperatorMultiFunction<FieldD>,
                                              public OperatorFunction<FieldD>
 {
 public:
  using OperatorFunction<FieldD>::operator();

  RealD   Tolerance;            // not read by this class; per-shift targets come from shifts.tolerances
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
  int verbose;
  MultiShiftFunction shifts;
  std::vector<RealD> TrueResidualShift;

  int ReliableUpdateFreq; //number of iterations between reliable updates
  GridBase* SinglePrecGrid; //Grid for single-precision fields
  LinearOperatorBase<FieldF> &Linop_f; //single precision

  // maxit              : iteration cap for the multi-shift loop (also handed to the cleanup CG)
  // _shifts            : poles / tolerances / residues / norm; copied into this object
  // _SinglePrecGrid    : grid on which the single-precision work fields are built
  // _Linop_f           : single-precision operator, held by reference
  // _ReliableUpdateFreq: period (in iterations) of the double-precision residual
  //                      replacement; must be non-zero because it is used as a modulus
  //
  // The initialiser list follows declaration order, and Tolerance /
  // IterationsToComplete are given defined values so they are never read
  // uninitialised.
  ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
                                       GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
                                       int _ReliableUpdateFreq
                                       ) :
    Tolerance(0.0),
    MaxIterations(maxit),
    IterationsToComplete(0),
    verbose(1),
    shifts(_shifts),
    ReliableUpdateFreq(_ReliableUpdateFreq),
    SinglePrecGrid(_SinglePrecGrid),
    Linop_f(_Linop_f)
  {
    assert(ReliableUpdateFreq != 0); // "k % ReliableUpdateFreq" below would divide by zero
    IterationsToCompleteShift.resize(_shifts.order);
    TrueResidualShift.resize(_shifts.order);
  }

  // Single-output form: solves all shifts into a temporary vector and returns
  // only the combined result in psi.
  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
  {
    GridBase *grid = src.Grid();
    int nshift = shifts.order;
    std::vector<FieldD> results(nshift,grid);
    (*this)(Linop,src,results,psi);
  }

  // Solves all shifts into results, then forms
  //   psi = shifts.norm*src + sum_i shifts.residues[i]*results[i].
  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
  {
    int nshift = shifts.order;

    (*this)(Linop,src,results);

    psi = shifts.norm*src;
    for(int i=0;i<nshift;i++){
      psi = psi + shifts.residues[i]*results[i];
    }

    return;
  }

  // Core solver.
  //   Linop_d : double-precision operator (reliable updates, final check, cleanup)
  //   src_d   : double-precision source
  //   psi_d   : one solution per shift; must already have shifts.order entries
  // On return IterationsToComplete, IterationsToCompleteShift and (when the loop
  // converged or the source was zero) TrueResidualShift are filled in.
  void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
  {
    GridBase *DoublePrecGrid = src_d.Grid();

    // Workspaces reused by every precisionChange call (destination grid first)
    precisionChangeWorkspace wk_f_from_d(SinglePrecGrid, DoublePrecGrid);
    precisionChangeWorkspace wk_d_from_f(DoublePrecGrid, SinglePrecGrid);

    ////////////////////////////////////////////////////////////////////////
    // Convenience references to the info stored in "MultiShiftFunction"
    ////////////////////////////////////////////////////////////////////////
    int nshift = shifts.order;

    std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
    std::vector<RealD> &mresidual(shifts.tolerances);
    std::vector<RealD> alpha(nshift,1.0);

    //Double precision search directions
    FieldD p_d(DoublePrecGrid);
    std::vector<FieldD> ps_d(nshift, DoublePrecGrid);// Search directions (double precision)

    FieldD tmp_d(DoublePrecGrid);
    FieldD r_d(DoublePrecGrid);
    FieldD mmp_d(DoublePrecGrid);

    assert(psi_d.size()==nshift);
    assert(mass.size()==nshift);
    assert(mresidual.size()==nshift);

    // dynamic sized arrays on stack; 2d is a pain with vector
    RealD  bs[nshift];
    RealD  rsq[nshift];
    RealD  z[nshift][2];   // two-deep recurrence history per shift, toggled by iz
    int     converged[nshift];

    const int       primary =0;

    //Primary shift fields CG iteration
    RealD a,b,c,d;
    RealD cp,bp,qq; //prev

    // Matrix mult fields
    FieldF r_f(SinglePrecGrid);
    FieldF p_f(SinglePrecGrid);
    FieldF mmp_f(SinglePrecGrid);
    FieldF src_f(SinglePrecGrid);
    precisionChange(src_f, src_d, wk_f_from_d);

    // Check lightest mass
    for(int s=0;s<nshift;s++){
      assert( mass[s]>= mass[primary] );
      converged[s]=0;
    }

    // Wire guess to zero
    // Residuals "r" are src
    // First search direction "p" is also src
    cp = norm2(src_d);

    // Handle trivial case of zero src.
    if( cp == 0. ){
      for(int s=0;s<nshift;s++){
        psi_d[s] = Zero();
        IterationsToCompleteShift[s] = 1;
        TrueResidualShift[s] = 0.;
      }
      // Keep the aggregate counter in step with the per-shift values set above
      // instead of leaving it at whatever an earlier solve stored.
      IterationsToComplete = 1;
      return;
    }

    // Per-shift stopping target on the squared residual: |src|^2 * tol[s]^2
    for(int s=0;s<nshift;s++){
      rsq[s] = cp * mresidual[s] * mresidual[s];
      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
      ps_d[s] = src_d;
    }
    // r and p for primary
    r_f=src_f; //residual maintained in single
    p_f=src_f;
    p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys

    //MdagM+m[0]
    Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
    axpy(mmp_f,mass[0],p_f,mmp_f);
    RealD rn = norm2(p_f);
    d += rn*mass[0];

    b = -cp /d;

    // Set up the various shift variables
    int       iz=0;
    z[0][1-iz] = 1.0;
    z[0][iz]   = 1.0;
    bs[0]      = b;
    for(int s=1;s<nshift;s++){
      z[s][1-iz] = 1.0;
      z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0]));
      bs[s]      = b*z[s][iz];
    }

    // r += b[0] A.p[0]
    // c= norm(r)
    c=axpy_norm(r_f,b,mmp_f,r_f);

    // First solution update: psi[s] = -bs[s]*alpha[s]*src
    for(int s=0;s<nshift;s++) {
      axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
    }

    ///////////////////////////////////////
    // Timers
    ///////////////////////////////////////
    GridStopWatch AXPYTimer, ShiftTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;

    SolverTimer.Start();

    // Iteration loop
    int k;

    for (k=1;k<=MaxIterations;k++){
      a = c /cp;

      //Update double precision search direction by residual
      PrecChangeTimer.Start();
      precisionChange(r_d, r_f, wk_d_from_f);
      PrecChangeTimer.Stop();

      AXPYTimer.Start();
      axpy(p_d,a,p_d,r_d);

      for(int s=0;s<nshift;s++){
        if ( ! converged[s] ) {
          if (s==0){
            axpy(ps_d[s],a,ps_d[s],r_d);
          } else{
            RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
            axpby(ps_d[s],z[s][iz],as,r_d,ps_d[s]);
          }
        }
      }
      AXPYTimer.Stop();

      PrecChangeTimer.Start();
      precisionChange(p_f, p_d, wk_f_from_d); //get back single prec search direction for linop
      PrecChangeTimer.Stop();

      cp=c;
      MatrixTimer.Start();
      Linop_f.HermOp(p_f,mmp_f);
      d=real(innerProduct(p_f,mmp_f));
      MatrixTimer.Stop();

      AXPYTimer.Start();
      axpy(mmp_f,mass[0],p_f,mmp_f);
      AXPYTimer.Stop();
      RealD rn = norm2(p_f);
      d += rn*mass[0];

      bp=b;
      b=-cp/d;

      // Toggle the recurrence history
      bs[0] = b;
      iz = 1-iz;
      ShiftTimer.Start();
      for(int s=1;s<nshift;s++){
        if((!converged[s])){
          RealD z0 = z[s][1-iz];
          RealD z1 = z[s][iz];
          z[s][iz] = z0*z1*bp
            / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
          bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
        }
      }
      ShiftTimer.Stop();

      //Update double precision solutions
      AXPYTimer.Start();
      for(int s=0;s<nshift;s++){
        if( (!converged[s]) ) {
          axpy(psi_d[s],-bs[s]*alpha[s],ps_d[s],psi_d[s]);
        }
      }

      //Perform reliable update if necessary; otherwise update residual from single-prec mmp
      RealD c_f = axpy_norm(r_f,b,mmp_f,r_f);
      AXPYTimer.Stop();

      c = c_f;

      if(k % ReliableUpdateFreq == 0){
        //Replace r with true residual
        MatrixTimer.Start();
        Linop_d.HermOp(psi_d[0],mmp_d);
        MatrixTimer.Stop();

        AXPYTimer.Start();
        axpy(mmp_d,mass[0],psi_d[0],mmp_d);

        RealD c_d = axpy_norm(r_d, -1.0, mmp_d, src_d);
        AXPYTimer.Stop();

        std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_f <<" with |r|^2 = "<<c_d<<std::endl;

        PrecChangeTimer.Start();
        precisionChange(r_f, r_d, wk_f_from_d);
        PrecChangeTimer.Stop();
        c = c_d;
      }

      // Convergence checks
      // A shift is declared converged once c * z[s]^2 drops below its target;
      // converged shifts are skipped in all later updates.
      int all_converged = 1;
      for(int s=0;s<nshift;s++){

        if ( (!converged[s]) ){
          IterationsToCompleteShift[s] = k;

          RealD css  = c * z[s][iz]* z[s][iz];

          if(css<rsq[s]){
            std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
            converged[s]=1;
          } else {
            all_converged=0;
          }

        }
      }

      if ( all_converged ){

        SolverTimer.Stop();
        std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
        std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;

        // Check answers
        for(int s=0; s < nshift; s++) {
          // r = (HermOp + mass[s]) psi[s] - alpha[s]*src, using the double-precision operator
          Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
          axpy(tmp_d,mass[s],psi_d[s],mmp_d);
          axpy(r_d,-alpha[s],src_d,tmp_d);
          RealD rn = norm2(r_d);
          RealD cn = norm2(src_d);
          TrueResidualShift[s] = std::sqrt(rn/cn);
          std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;

          //If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
          if(rn >= rsq[s]){
            CleanupTimer.Start();
            std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: performing cleanup step for shift " << s << std::endl;

            //Setup linear operators for final cleanup
            ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
            ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);

            MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d);
            cg(src_d, psi_d[s]);

            TrueResidualShift[s] = cg.TrueResidual;
            CleanupTimer.Stop();
          }
        }

        std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrec: Time Breakdown for body"<<std::endl;
        std::cout << GridLogMessage << "\tSolver    " << SolverTimer.Elapsed()     <<std::endl;
        std::cout << GridLogMessage << "\t\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
        std::cout << GridLogMessage << "\t\tMatrix    " << MatrixTimer.Elapsed()     <<std::endl;
        std::cout << GridLogMessage << "\t\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
        std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed()     <<std::endl;
        std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed()     <<std::endl;
        std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;

        IterationsToComplete = k;

        return;
      }

    }
    // ugly hack
    // Non-convergence is reported but not fatal. Record how many iterations were
    // actually run (k is one past the last completed iteration) so callers do not
    // read a stale or uninitialised count.
    IterationsToComplete = k-1;
    std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
    //  assert(0);
  }

 };
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/algorithms/iterative/Deflation.h
+++ b/Grid/algorithms/iterative/Deflation.h
@@ -33,19 +33,16 @@ namespace Grid {
 // Guesser that ignores the source and sets the guess to Zero().
 template<class Field>
 class ZeroGuesser: public LinearFunction<Field>
 {
 public:
  using LinearFunction<Field>::operator();

  virtual void operator()(const Field &src, Field &guess)
  {
    guess = Zero();
  }
 };
 // Guesser that leaves the guess exactly as the caller passed it in.
 template<class Field>
 class DoNothingGuesser: public LinearFunction<Field>
 {
 public:
  using LinearFunction<Field>::operator();

  virtual void operator()(const Field &src, Field &guess)
  {
    // intentionally empty
  }
 };
 // Guesser that copies the source into the guess.
 template<class Field>
 class SourceGuesser: public LinearFunction<Field>
 {
 public:
  using LinearFunction<Field>::operator();

  virtual void operator()(const Field &src, Field &guess)
  {
    guess = src;
  }
 };
@@ -60,7 +57,6 @@ private:
  const unsigned int       N;
 public:
  using LinearFunction<Field>::operator();
  DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval)
  : DeflatedGuesser(_evec, _eval, _evec.size())
@@ -91,7 +87,6 @@ private:
  const std::vector<RealD>       &eval_coarse;
 public:
  using LinearFunction<FineField>::operator();
  LocalCoherenceDeflatedGuesser(const std::vector<FineField>   &_subspace,
 				const std::vector<CoarseField> &_evec_coarse,
 				const std::vector<RealD>       &_eval_coarse)
@@ -113,43 +108,7 @@ public:
    blockPromote(guess_coarse,guess,subspace);
    guess.Checkerboard() = src.Checkerboard();
  };
-
+};
  // Batched deflation guess for several sources at once.
  //   1. project every fine source onto the coarse subspace (blockProject);
  //   2. for each coarse eigenvector i and source j accumulate
  //        guess_coarse[j] += evec_i * <evec_i|src_coarse[j]> / eval_coarse[i];
  //   3. promote every coarse guess back to the fine grid (blockPromote) and
  //      copy the source's checkerboard onto it.
  // NOTE(review): reads evec_coarse[0] and writes guess[0 .. src.size()-1] without
  // checking sizes, so it presumably expects a non-empty eigenvector set and
  // guess.size() >= src.size() — confirm at the call sites.
  void operator()(const std::vector<FineField> &src,std::vector<FineField> &guess) {
    int Nevec = (int)evec_coarse.size();
    int Nsrc = (int)src.size();
    // make temp variables
    std::vector<CoarseField> src_coarse(Nsrc,evec_coarse[0].Grid());
    std::vector<CoarseField> guess_coarse(Nsrc,evec_coarse[0].Grid());    
    //Preprocessing
    std::cout << GridLogMessage << "Start BlockProject for loop" << std::endl;
    for (int j=0;j<Nsrc;j++)
    {
    guess_coarse[j] = Zero();
    std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
    blockProject(src_coarse[j],src[j],subspace);
    }
    //deflation set up for eigen vector batchsize 1 and source batch size equal number of sources
    std::cout << GridLogMessage << "Start ProjectAccum for loop" << std::endl;
    for (int i=0;i<Nevec;i++)
    {
      std::cout << GridLogMessage << "ProjectAccum Nvec: " << i << std::endl;
      const CoarseField & tmp = evec_coarse[i];
      for (int j=0;j<Nsrc;j++)
      {
        axpy(guess_coarse[j],TensorRemove(innerProduct(tmp,src_coarse[j])) / eval_coarse[i],tmp,guess_coarse[j]);
      }
    }
    //postprocessing
    std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl;
    for (int j=0;j<Nsrc;j++)
    {
    // This loop promotes, so the progress line now says so (it previously said "BlockProject").
    std::cout << GridLogMessage << "BlockPromote iter: " << j << std::endl;
    blockPromote(guess_coarse[j],guess[j],subspace);
    guess[j].Checkerboard() = src[j].Checkerboard();
    }
  };
  };
--- a/Grid/algorithms/iterative/LocalCoherenceLanczos.h
+++ b/Grid/algorithms/iterative/LocalCoherenceLanczos.h
@@ -67,7 +67,6 @@ public:
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
  using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
@@ -98,7 +97,6 @@ public:
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
  using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
--- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
+++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
@@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
 template<class Field>
 class PrecGeneralisedConjugateResidual : public LinearFunction<Field> {
 public:                                                
-  using LinearFunction<Field>::operator();
+
  RealD   Tolerance;
  Integer MaxIterations;
  int verbose;
--- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h
+++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h
@@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
 template<class Field>
 class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
 public:                                                
-  using LinearFunction<Field>::operator();
+
  RealD   Tolerance;
  Integer MaxIterations;
  int verbose;
@@ -119,8 +119,7 @@ public:
  RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
    RealD cp;
-    ComplexD a, b;
+    ComplexD a, b, zAz;
    //    ComplexD zAz;
    RealD zAAz;
    ComplexD rq;
@@ -147,7 +146,7 @@ public:
    //////////////////////////////////
    MatTimer.Start();
    Linop.Op(psi,Az);
-    //    zAz = innerProduct(Az,psi);
+    zAz = innerProduct(Az,psi);
    zAAz= norm2(Az);
    MatTimer.Stop();
@@ -171,7 +170,7 @@ public:
    LinalgTimer.Start();
-    //    zAz = innerProduct(Az,psi);
+    zAz = innerProduct(Az,psi);
    zAAz= norm2(Az);
    //p[0],q[0],qq[0] 
@@ -213,7 +212,7 @@ public:
      MatTimer.Start();
      Linop.Op(z,Az);
      MatTimer.Stop();
-      //      zAz = innerProduct(Az,psi);
+      zAz = innerProduct(Az,psi);
      zAAz= norm2(Az);
      LinalgTimer.Start();
--- a/Grid/algorithms/iterative/SchurRedBlack.h
+++ b/Grid/algorithms/iterative/SchurRedBlack.h
@@ -40,7 +40,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
   *        (-MoeMee^{-1}   1 )   
   * L^{dag} = ( 1       Mee^{-dag} Moe^{dag} )
   *           ( 0       1                    )
-   * L^{-d}  = ( 1      -Mee^{-dag} Moe^{dag} )
+   * L^{-dag}= ( 1      -Mee^{-dag} Moe^{dag} )
   *           ( 0       1                    )
   *
   * U^-1 = (1   -Mee^{-1} Meo)
@@ -82,7 +82,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
   * c) M_oo^-dag Doo^{dag} Doo Moo^-1 phi_0 = M_oo^-dag (D_oo)^dag L^{-1}  eta_o
   *                              eta_o'     = M_oo^-dag (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
   *                              psi_o = M_oo^-1 phi_o
-   * TODO: Deflation 
+   *
   *
   */
 namespace Grid {
@@ -97,6 +98,7 @@ namespace Grid {
  protected:
    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
    OperatorFunction<Field> & _HermitianRBSolver;
    int CBfactorise;
    bool subGuess;
    bool useSolnAsInitGuess; // if true user-supplied solution vector is used as initial guess for solver
@@ -220,12 +222,19 @@ namespace Grid {
 	// Check unprec residual if possible
 	/////////////////////////////////////////////////
 	if ( ! subGuess ) {	  
-	  _Matrix.M(out[b],resid); 
+
 	  if ( this->adjoint() ) _Matrix.Mdag(out[b],resid); 
 	  else                   _Matrix.M(out[b],resid); 
 	  resid = resid-in[b];
 	  RealD ns = norm2(in[b]);
 	  RealD nr = norm2(resid);
-	  std::cout<<GridLogMessage<< "SchurRedBlackBase solver true unprec resid["<<b<<"] "<<std::sqrt(nr/ns) << std::endl;
+	  std::cout<<GridLogMessage<< "SchurRedBlackBase adjoint "<< this->adjoint() << std::endl;
 	  if ( this->adjoint() ) 
 	    std::cout<<GridLogMessage<< "SchurRedBlackBase adjoint solver true unprec resid["<<b<<"] "<<std::sqrt(nr/ns) << std::endl;
 	  else                   
 	    std::cout<<GridLogMessage<< "SchurRedBlackBase solver true unprec resid["<<b<<"] "<<std::sqrt(nr/ns) << std::endl;
 	} else {
 	  std::cout<<GridLogMessage<< "SchurRedBlackBase Guess subtracted after solve["<<b<<"] " << std::endl;
 	}
@@ -279,12 +288,21 @@ namespace Grid {
      // Verify the unprec residual
      if ( ! subGuess ) {
-        _Matrix.M(out,resid); 
+
 	std::cout<<GridLogMessage<< "SchurRedBlackBase adjoint "<< this->adjoint() << std::endl;
 	if ( this->adjoint() ) _Matrix.Mdag(out,resid); 
 	else                   _Matrix.M(out,resid); 
        resid = resid-in;
        RealD ns = norm2(in);
        RealD nr = norm2(resid);
-        std::cout<<GridLogMessage << "SchurRedBlackBase solver true unprec resid "<< std::sqrt(nr/ns) << std::endl;
+	  if ( this->adjoint() ) 
 	    std::cout<<GridLogMessage<< "SchurRedBlackBase adjoint solver true unprec resid "<<std::sqrt(nr/ns) << std::endl;
 	  else                   
 	    std::cout<<GridLogMessage<< "SchurRedBlackBase solver true unprec resid "<<std::sqrt(nr/ns) << std::endl;
      } else {
        std::cout << GridLogMessage << "SchurRedBlackBase Guess subtracted after solve." << std::endl;
      }
@@ -293,6 +311,7 @@ namespace Grid {
    /////////////////////////////////////////////////////////////
    // Override in derived. 
    /////////////////////////////////////////////////////////////
    virtual bool adjoint(void) { return false; }
    virtual void RedBlackSource  (Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)                =0;
    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)          =0;
    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)                           =0;
@@ -646,6 +665,127 @@ namespace Grid {
        this->_HermitianRBSolver(_OpEO, src_o, sol_o); 
      }
  };
  /*
   * Red black Schur decomposition
   *
   *  M = (Mee Meo) =  (1             0 )   (Mee   0               )  (1 Mee^{-1} Meo)
   *      (Moe Moo)    (Moe Mee^-1    1 )   (0   Moo-Moe Mee^-1 Meo)  (0   1         )
   *                =         L                     D                     U
   *
   * L^-1 = (1              0 )
   *        (-MoeMee^{-1}   1 )   
   * L^{dag} = ( 1       Mee^{-dag} Moe^{dag} )
   *           ( 0       1                    )
   *
   * U^-1 = (1   -Mee^{-1} Meo)
   *        (0    1           )
   * U^{dag} = ( 1                 0)
   *           (Meo^dag Mee^{-dag} 1)
   * U^{-dag} = (  1                 0)
   *            (-Meo^dag Mee^{-dag} 1)
   *
   *
   ***********************
   *     M^dag psi = eta
   ***********************
   *
   * Really for Mobius: (Wilson - easier to just use gamma 5 hermiticity)
   *
   *    Mdag psi     =         Udag  Ddag  Ldag psi = eta
   *
   * U^{-dag} = (  1                 0)
   *            (-Meo^dag Mee^{-dag} 1)
   *
   *
   * i)                D^dag phi =  (U^{-dag}  eta)
   *                        eta'_e = eta_e
   *                        eta'_o = (eta_o - Meo^dag Mee^{-dag} eta_e)
   * 
   *      phi_o = D_oo^-dag eta'_o = D_oo^-dag (eta_o - Meo^dag Mee^{-dag} eta_e)
   *
   *      phi_e = D_ee^-dag eta'_e = D_ee^-dag eta_e
   * 
   * Solve: 
   *
   *      D_oo D_oo^dag phi_o = D_oo (eta_o - Meo^dag Mee^{-dag} eta_e)
   *
   * ii) 
   *      phi = L^dag psi => psi = L^-dag phi. 
   *
   * L^{-dag} = ( 1      -Mee^{-dag} Moe^{dag} )
   *            ( 0       1                    )
   *
   *   => sol_e = M_ee^-dag * ( src_e - Moe^dag phi_o )...
   *   => sol_o = phi_o
   */
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Site diagonal has Mooee on it, but solve the Adjoint system
  //
  // Red-black Schur solver for the adjoint problem M^dag psi = eta; the three overrides below
  // implement the source preparation, the odd-checkerboard solve and the solution reconstruction
  // written out in the derivation comment immediately above.
  //
  // adjoint() returns true here; SchurRedBlackBase tests this->adjoint() to choose Mdag instead of
  // M when it recomputes the unpreconditioned residual.
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  template<class Field> class SchurRedBlackDiagMooeeDagSolve : public SchurRedBlackBase<Field> {
  public:
    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
    // Marks this solver as acting on the adjoint system (the base class default returns false).
    virtual bool adjoint(void) { return true; }
    // All three arguments are forwarded unchanged to the SchurRedBlackBase constructor.
    SchurRedBlackDiagMooeeDagSolve(OperatorFunction<Field> &HermitianRBSolver,
 				   const bool initSubGuess = false,
 				   const bool _solnAsInitGuess = false)  
      : SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};
    //////////////////////////////////////////////////////
    // Override RedBlack specialisation
    //////////////////////////////////////////////////////
    // Splits src into its even and odd halves (src_e, src_o) and then overwrites src_o with the
    // right-hand side handed to the odd-checkerboard solve:
    //   src_o <- Mpc( src_o - MeooeDag( MooeeInvDag( src_e ) ) )
    // src_e is left holding the even half of src.
    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
    {
      GridBase *grid = _Matrix.RedBlackGrid();
      // NOTE(review): fgrid is not referenced again in this function.
      GridBase *fgrid= _Matrix.Grid();
      Field   tmp(grid);
      Field  Mtmp(grid);
      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd ,src_o,src);
      /////////////////////////////////////////////////////
      // src_o = (source_o - Moe^dag MeeInvDag source_e)
      /////////////////////////////////////////////////////
      _Matrix.MooeeInvDag(src_e,tmp);  assert(  tmp.Checkerboard() ==Even);
      _Matrix.MeooeDag   (tmp,Mtmp);   assert( Mtmp.Checkerboard() ==Odd);     
      tmp=src_o-Mtmp;                  assert(  tmp.Checkerboard() ==Odd);     
      // get the right Mpc
      // Uses the non-dagger SchurDiagMooeeOperator for this final multiply; presumably this is the
      // D_oo factor in "D_oo D_oo^dag phi_o = D_oo (...)" from the derivation above -- confirm
      // against SchurDiagMooeeOperator::Mpc.
      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
      _HermOpEO.Mpc(tmp,src_o);     assert(src_o.Checkerboard() ==Odd);
    }
    // Single right-hand-side solve: wraps _Matrix in SchurDiagMooeeDagOperator and passes it,
    // together with src_o and sol_o, to the solver supplied at construction.
    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)
    {
      SchurDiagMooeeDagOperator<Matrix,Field> _HermOpEO(_Matrix);
      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
    };
    // Multiple right-hand-side overload: same operator, vectors of sources and solutions.
    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)
    {
      SchurDiagMooeeDagOperator<Matrix,Field> _HermOpEO(_Matrix);
      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
    }
    // Rebuilds the full-lattice solution sol from the odd solve result sol_o and the even source
    // half src_e: the even part is computed below, the odd part is sol_o copied in unchanged.
    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
    {
      GridBase *grid = _Matrix.RedBlackGrid();
      // NOTE(review): fgrid is not referenced again in this function.
      GridBase *fgrid= _Matrix.Grid();
      Field  sol_e(grid);
      Field  tmp(grid);
      ///////////////////////////////////////////////////
      // sol_e = M_ee^-dag * ( src_e - Moe^dag phi_o )...
      // sol_o = phi_o
      ///////////////////////////////////////////////////
      _Matrix.MeooeDag(sol_o,tmp);      assert(tmp.Checkerboard()==Even);
      tmp = src_e-tmp;                  assert(tmp.Checkerboard()==Even);
      _Matrix.MooeeInvDag(tmp,sol_e);   assert(sol_e.Checkerboard()==Even);
      // The asserts on the next two lines run after the copy and check the inputs' checkerboards
      // (sol_e is re-checked); they do not inspect sol itself.
      setCheckerboard(sol,sol_e); assert(  sol_e.Checkerboard() ==Even);
      setCheckerboard(sol,sol_o); assert(  sol_o.Checkerboard() ==Odd );
    }
  };
 }
 #endif
--- a/Grid/allocator/MemoryManager.cc
+++ b/Grid/allocator/MemoryManager.cc
@@ -159,6 +159,7 @@ void MemoryManager::Init(void)
  char * str;
  int Nc;
  int NcS;
  str= getenv("GRID_ALLOC_NCACHE_LARGE");
  if ( str ) {
--- a/Grid/allocator/MemoryManager.h
+++ b/Grid/allocator/MemoryManager.h
@@ -170,7 +170,6 @@ private:
 public:
  static void Print(void);
  static void PrintState( void* CpuPtr);
  static int   isOpen   (void* CpuPtr);
  static void  ViewClose(void* CpuPtr,ViewMode mode);
  static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
--- a/Grid/allocator/MemoryManagerCache.cc
+++ b/Grid/allocator/MemoryManagerCache.cc
@@ -474,32 +474,6 @@ int   MemoryManager::isOpen   (void* _CpuPtr)
  }
 }
 // Debug helper: prints the accelerator-cache table entry for a CPU pointer to GridLogMessage.
 // _CpuPtr is converted to a 64-bit integer and used as the lookup key. When an entry is
 // present, a header line and one row are printed (both addresses in hex, the state name, then
 // cpuLock, accLock and LRU_valid); otherwise a single "No Entry" line is printed.
 void MemoryManager::PrintState(void* _CpuPtr)
 {
  uint64_t CpuPtr = (uint64_t)_CpuPtr;
  if ( EntryPresent(CpuPtr) ){
    auto AccCacheIterator = EntryLookup(CpuPtr);
    auto & AccCache = AccCacheIterator->second;
    // Translate the state value into a printable name; str stays empty if none of the five
    // values tested below matches.
    std::string str;
    if ( AccCache.state==Empty    ) str = std::string("Empty");
    if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
    if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
    if ( AccCache.state==Consistent)str = std::string("Consistent");
    if ( AccCache.state==EvictNext) str = std::string("EvictNext");
    std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
    // std::dec is reinserted after each address so later output is not left in hex.
    std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
    << "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
    << "\t" << AccCache.cpuLock
    << "\t" << AccCache.accLock
    << "\t" << AccCache.LRU_valid<<std::endl;
  } else {
    std::cout << GridLogMessage << "No Entry in AccCache table." << std::endl; 
  }
 }
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/allocator/MemoryManagerShared.cc
+++ b/Grid/allocator/MemoryManagerShared.cc
@@ -16,10 +16,6 @@ uint64_t  MemoryManager::DeviceToHostXfer;
 // Stub definitions of the MemoryManager view/cache interface for this translation unit:
 // ViewOpen hands back the CPU pointer unchanged, isOpen always reports 0, PrintState only logs
 // a fixed message, and ViewClose, Print and NotifyDeletion have empty bodies.
 void  MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
 void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
 int   MemoryManager::isOpen   (void* CpuPtr) { return 0;}
 // CpuPtr is ignored; the message is the same for every pointer.
 void  MemoryManager::PrintState(void* CpuPtr)
 {
 std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl;
 };
 void  MemoryManager::Print(void){};
 void  MemoryManager::NotifyDeletion(void *ptr){};
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -388,21 +388,18 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
    // TODO : make a OMP loop on CPU, call threaded bcopy
    void *shm = (void *) this->ShmBufferTranslate(dest,recv);
    assert(shm!=NULL);
    //    std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl;
    acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
    acceleratorCopySynchronise(); // MPI prob slower
  }
-  //  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
+  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
-  //    this->StencilSendToRecvFromComplete(list,dir);
+    this->StencilSendToRecvFromComplete(list,dir);
-  //  }
+  }
  return off_node_bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
  //   std::cout << "Copy Synchronised\n"<<std::endl;
  acceleratorCopySynchronise();
  int nreq=list.size();
  if (nreq==0) return;
--- a/Grid/json/json.hpp
+++ b/Grid/json/json.hpp
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@@ -46,3 +46,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_unary.h>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
 #include <Grid/lattice/Lattice_crc.h>
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -88,13 +88,6 @@ public:
    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
    accessor.ViewClose();
  }
  // Helper function to print the state of this object in the AccCache
  // Forwards this object's data pointer (_odata) to MemoryManager::PrintState, which does the
  // actual printing.
  void PrintCacheState(void)
  {
    MemoryManager::PrintState(this->_odata);
  }
  /////////////////////////////////////////////////////////////////////////////////
  // Return a view object that may be dereferenced in site loops.
  // The view is trivially copy constructible and may be copied to an accelerator device
--- a/Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master
+++ b/Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master
@@ -2,14 +2,11 @@
    Grid physics library, www.github.com/paboyle/Grid 
-    Source file: ./lib/ qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master
+    Source file: ./lib/lattice/Lattice_crc.h
-    Copyright (C) 2017 - 2022
+    Copyright (C) 2021
-    Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    Author: Mattia Bruno <mattia.bruno@cern.ch>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -26,19 +23,33 @@
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
+*************************************************************************************/
 /*  END LEGAL */
-
+#pragma once
 #include <Grid/Grid.h>
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
 #include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
-#include "impl.h"
+template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1)
-template class CompactWilsonCloverFermion<IMPLEMENTATION, CompactCloverHelpers<IMPLEMENTATION>>; 
+{
-template class CompactWilsonCloverFermion<IMPLEMENTATION, CompactExpCloverHelpers<IMPLEMENTATION>>; 
+  auto ff = localNorm2(f);
  if ( mu==-1 ) mu = f.Grid()->Nd()-1;
  typedef typename vobj::tensor_reduced normtype;
  typedef typename normtype::scalar_object scalar;
  std::vector<scalar> sff;
  sliceSum(ff,sff,mu);
  for(int t=0;t<sff.size();t++){
    std::cout << s<<" "<<t<<" "<<sff[t]<<std::endl;
  }
 }
 // Returns a CRC32 fingerprint of the raw bytes of a lattice field.
 // The checksum starts from 0 and covers sizeof(vobj)*buf.oSites() bytes beginning at the first
 // element of a CpuRead view of buf, i.e. the data held by the calling process only.
 // ::crc32 is presumably the zlib routine -- confirm which header supplies it.
 template<class vobj> uint32_t crc(Lattice<vobj> & buf)
 {
  autoView( buf_v , buf, CpuRead);
  return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
 }
 // CRC(U): prints "FingerPrint <file> <line> <expression text of U> <crc(U)>" to std::cout.
 // The expansion already ends in ';', so writing "CRC(x);" leaves an extra empty statement.
 #define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
 NAMESPACE_END(Grid);
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -28,9 +28,6 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #if defined(GRID_CUDA)||defined(GRID_HIP)
 #include <Grid/lattice/Lattice_reduction_gpu.h>
 #endif
 #if defined(GRID_SYCL)
 #include <Grid/lattice/Lattice_reduction_sycl.h>
 #endif
 NAMESPACE_BEGIN(Grid);
@@ -130,7 +127,7 @@ inline Double max(const Double *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
+#if defined(GRID_CUDA)||defined(GRID_HIP)
  return sum_gpu(arg,osites);
 #else
  return sum_cpu(arg,osites);
@@ -139,45 +136,20 @@ inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
+#if defined(GRID_CUDA)||defined(GRID_HIP)
  return sumD_gpu(arg,osites);
 #else
  return sumD_cpu(arg,osites);
 #endif  
 }
 // Sums osites objects starting at arg and returns the result as vobj::scalar_objectD.
 // Compile-time dispatch: sumD_gpu_large when any of GRID_CUDA, GRID_HIP or GRID_SYCL is
 // defined, sumD_cpu otherwise.
 template<class vobj>
 inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
 {
 #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  return sumD_gpu_large(arg,osites);
 #else
  return sumD_cpu(arg,osites);
 #endif  
 }
 template<class vobj>
 inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 {
-  Integer osites = arg.Grid()->oSites();
+#if defined(GRID_CUDA)||defined(GRID_HIP)
 #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  typename vobj::scalar_object ssum;
  autoView( arg_v, arg, AcceleratorRead);
  ssum= sum_gpu(&arg_v[0],osites);
 #else
  autoView(arg_v, arg, CpuRead);
  auto ssum= sum_cpu(&arg_v[0],osites);
 #endif  
  arg.Grid()->GlobalSum(ssum);
  return ssum;
 }
 template<class vobj>
 inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
 {
 #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  autoView( arg_v, arg, AcceleratorRead);
  Integer osites = arg.Grid()->oSites();
-  auto ssum= sum_gpu_large(&arg_v[0],osites);
+  auto ssum= sum_gpu(&arg_v[0],osites);
 #else
  autoView(arg_v, arg, CpuRead);
  Integer osites = arg.Grid()->oSites();
@@ -238,10 +210,11 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
  typedef decltype(innerProductD(vobj(),vobj())) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  {
    autoView( left_v , left, AcceleratorRead);
    autoView( right_v,right, AcceleratorRead);
-    // This code could read coalesce
+
    // GPU - SIMT lane compliance...
    accelerator_for( ss, sites, 1,{
 	auto x_l = left_v[ss];
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -23,7 +23,7 @@ unsigned int nextPow2(Iterator x) {
 }
 template <class Iterator>
-int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
+void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
  int device;
 #ifdef GRID_CUDA
@@ -37,13 +37,14 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &
  Iterator sharedMemPerBlock   = gpu_props[device].sharedMemPerBlock;
  Iterator maxThreadsPerBlock  = gpu_props[device].maxThreadsPerBlock;
  Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;
-  /*  
+  
  std::cout << GridLogDebug << "GPU has:" << std::endl;
  std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl;
  std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl;
  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl;
  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << warpSize << std::endl;
  std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
-  */  
+  
  if (warpSize != WARP_SIZE) {
    std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
    exit(EXIT_FAILURE);
@@ -51,14 +52,10 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &
  // let the number of threads in a block be a multiple of 2, starting from warpSize
  threads = warpSize;
  if ( threads*sizeofsobj > sharedMemPerBlock ) {
    std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
    return 0;
  }
  while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
  // keep all the streaming multiprocessors busy
  blocks = nextPow2(multiProcessorCount);
-  return 1;
+  
 }
 template <class sobj, class Iterator>
@@ -198,7 +195,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
 // Possibly promote to double and sum
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites) 
+inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites) 
 {
  typedef typename vobj::scalar_objectD sobj;
  typedef decltype(lat) Iterator;
@@ -207,9 +204,7 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
  Integer size = osites*nsimd;
  Integer numThreads, numBlocks;
-  int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
+  getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
  assert(ok);
  Integer smemSize = numThreads * sizeof(sobj);
  Vector<sobj> buffer(numBlocks);
@@ -220,54 +215,6 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
  auto result = buffer_v[0];
  return result;
 }
 // Reduction that processes one vector-typed word of vobj at a time instead of whole objects.
 // lat is reinterpreted as a flat array of `vector` elements, `words` per site. For each word
 // index w, that word is gathered from every site into a temporary buffer, the buffer is reduced
 // with sumD_gpu_small (viewed as iScalar<vector>), and the scalar result is stored in slot w of
 // the returned object.
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
 {
  typedef typename vobj::vector_type  vector;
  typedef typename vobj::scalar_typeD scalarD;
  typedef typename vobj::scalar_objectD sobj;
  sobj ret;
  // ret is filled through ret_p, i.e. treated as an array of `words` scalarD values.
  scalarD *ret_p = (scalarD *)&ret;
  const int words = sizeof(vobj)/sizeof(vector);
  Vector<vector> buffer(osites);
  vector *dat = (vector *)lat;
  // buf and tbuf alias the same storage: buf is written by the gather, tbuf is what is reduced.
  vector *buf = &buffer[0];
  iScalar<vector> *tbuf =(iScalar<vector> *)  &buffer[0];
  for(int w=0;w<words;w++) {
    // Gather word w of every site into contiguous storage.
    accelerator_for(ss,osites,1,{
 	buf[ss] = dat[ss*words+w];
      });
    ret_p[w] = sumD_gpu_small(tbuf,osites);
  }
  return ret;
 }
 // Front end that picks between the two reduction implementations.
 // getNumBlocksAndThreads is called only for its return value here (numThreads and numBlocks are
 // not used afterwards): a non-zero result selects sumD_gpu_small, zero selects the word-by-word
 // sumD_gpu_large. In the version of getNumBlocksAndThreads that returns int, zero is returned
 // when warpSize*sizeof(sobj) exceeds sharedMemPerBlock.
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
 {
  // NOTE(review): the vector and scalarD typedefs are not used in this function.
  typedef typename vobj::vector_type  vector;
  typedef typename vobj::scalar_typeD scalarD;
  typedef typename vobj::scalar_objectD sobj;
  sobj ret;
  Integer nsimd= vobj::Nsimd();
  Integer size = osites*nsimd;
  Integer numThreads, numBlocks;
  int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
  if ( ok ) {
    ret = sumD_gpu_small(lat,osites);
  } else {
    ret = sumD_gpu_large(lat,osites);
  }
  return ret;
 }
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Return as same precision as input performing reduction in double precision though
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -280,13 +227,6 @@ inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
  return result;
 }
-template <class vobj>
+
 inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
 {
  typedef typename vobj::scalar_object sobj;
  sobj result;
  result = sumD_gpu_large(lat,osites);
  return result;
 }
 NAMESPACE_END(Grid);
--- a/Grid/lattice/Lattice_reduction_sycl.h
+++ b/Grid/lattice/Lattice_reduction_sycl.h
@@ -1,125 +0,0 @@
 NAMESPACE_BEGIN(Grid);
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Possibly promote to double and sum
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // SYCL reduction over osites objects starting at lat, returned as vobj::scalar_objectD.
 // A single sobj accumulator is allocated with malloc_shared, a reduction kernel over the range
 // [0,osites) adds Reduce(lat[osite]) into it using std::plus, the queue is waited on, and the
 // accumulator is copied out and freed before being converted with convertType.
 // NOTE(review): the accumulation is done in sobj (the input precision); the conversion to the
 // double-precision object happens only after the sum is complete.
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites) 
 {
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_objectD sobjD;
  // NOTE(review): mysum is not written before the kernel runs; whether the reduction combines
  // with the prior contents of *mysum or starts from `identity` depends on the SYCL reduction
  // semantics in use -- confirm.
  sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator);
  sobj identity; zeroit(identity);
  sobj ret ; 
  // NOTE(review): nsimd is not used below.
  Integer nsimd= vobj::Nsimd();
  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
     auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>());
     cgh.parallel_for(cl::sycl::range<1>{osites},
 		      Reduction,
 		      [=] (cl::sycl::id<1> item, auto &sum) {
      auto osite   = item[0];
      sum +=Reduce(lat[osite]);
     });
   });
  // Block until the submitted kernel has finished before reading the accumulator on the host.
  theGridAccelerator->wait();
  ret = mysum[0];
  free(mysum,*theGridAccelerator);
  sobjD dret; convertType(dret,ret);
  return dret;
 }
 // SYCL build: forwards directly to sumD_gpu_tensor.
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
 {
  return sumD_gpu_tensor(lat,osites);
 }
 // SYCL build: forwards directly to sumD_gpu_large.
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
 {
  return sumD_gpu_large(lat,osites);
 }
 // SYCL build: forwards directly to sumD_gpu_large.
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
 {
  return sumD_gpu_large(lat,osites);
 }
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Return as same precision as input performing reduction in double precision though
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Calls sumD_gpu and assigns its scalar_objectD result to a vobj::scalar_object, which is what
 // is returned; this relies on an assignment from the D type to the plain type being available.
 template <class vobj>
 inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites) 
 {
  typedef typename vobj::scalar_object sobj;
  sobj result;
  result = sumD_gpu(lat,osites);
  return result;
 }
 // Same as sum_gpu but calls sumD_gpu_large; the scalar_objectD result is assigned to a
 // vobj::scalar_object before being returned.
 template <class vobj>
 inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
 {
  typedef typename vobj::scalar_object sobj;
  sobj result;
  result = sumD_gpu_large(lat,osites);
  return result;
 }
 NAMESPACE_END(Grid);
 /*
 template<class Double> Double svm_reduce(Double *vec,uint64_t L)
 {
  Double sumResult; zeroit(sumResult);
  Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator);
  Double identity;  zeroit(identity);
  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
     auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>());
     cgh.parallel_for(cl::sycl::range<1>{L},
 		      Reduction,
 		      [=] (cl::sycl::id<1> index, auto &sum) {
 	 sum +=vec[index];
     });
   });
  theGridAccelerator->wait();
  Double ret = d_sum[0];
  free(d_sum,*theGridAccelerator);
  std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl;
  return ret;
 }
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
 {
  typedef typename vobj::vector_type  vector;
  typedef typename vobj::scalar_type  scalar;
  typedef typename vobj::scalar_typeD scalarD;
  typedef typename vobj::scalar_objectD sobjD;
  sobjD ret;
  scalarD *ret_p = (scalarD *)&ret;
  const int nsimd = vobj::Nsimd();
  const int words = sizeof(vobj)/sizeof(vector);
  Vector<scalar> buffer(osites*nsimd);
  scalar *buf = &buffer[0];
  vector *dat = (vector *)lat;
  for(int w=0;w<words;w++) {
    accelerator_for(ss,osites,nsimd,{
 	int lane = acceleratorSIMTlane(nsimd);
 	buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
    });
    //Precision change at this point is too late to gain precision
    ret_p[w] = svm_reduce(buf,nsimd*osites);
  }
  return ret;
 }
 */
--- a/Grid/lattice/Lattice_rng.h
+++ b/Grid/lattice/Lattice_rng.h
@@ -32,8 +32,9 @@
 #include <random>
 #ifdef RNG_SITMO
-#include <Grid/sitmo_rng/sitmo_prng_engine.hpp>
+#include <Grid/random/sitmo_prng_engine.hpp>
 #endif 
 #include <Grid/random/gaussian.h>
 #if defined(RNG_SITMO)
 #define RNG_FAST_DISCARD
@@ -142,7 +143,7 @@ public:
  std::vector<RngEngine>                             _generators;
  std::vector<std::uniform_real_distribution<RealD> > _uniform;
-  std::vector<std::normal_distribution<RealD> >       _gaussian;
+  std::vector<Grid::gaussian_distribution<RealD> >    _gaussian;
  std::vector<std::discrete_distribution<int32_t> >   _bernoulli;
  std::vector<std::uniform_int_distribution<uint32_t> > _uid;
@@ -243,7 +244,7 @@ public:
  GridSerialRNG() : GridRNGbase() {
    _generators.resize(1);
    _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
-    _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
+    _gaussian.resize(1,gaussian_distribution<RealD>(0.0,1.0) );
    _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
    _uid.resize(1,std::uniform_int_distribution<uint32_t>() );
  }
@@ -357,7 +358,7 @@ public:
    _generators.resize(_vol);
    _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
-    _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
+    _gaussian.resize(_vol,gaussian_distribution<RealD>(0.0,1.0) );
    _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
    _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
  }
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -85,76 +85,6 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
  });
 }
 // Accelerator-side extraction of one checkerboard: copies the sites of `full` whose parity
 // equals cb into `half`, and sets half.Checkerboard() to cb.
 //   cb               - parity to extract (compared with the low bit of the coordinate sum)
 //   half             - destination field on the checkerboarded grid (opened AcceleratorWrite)
 //   full             - source field on the full grid (opened AcceleratorRead)
 //   checker_dim_half - dimension whose coordinate is halved when forming the index into half
 template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0)
 {
  half.Checkerboard() = cb;
  autoView(half_v, half, AcceleratorWrite);
  autoView(full_v, full, AcceleratorRead);
  // Local copies of the grid geometry; presumably taken so the kernel body below does not
  // dereference the Grid objects -- confirm.
  Coordinate rdim_full             = full.Grid()->_rdimensions;
  Coordinate rdim_half             = half.Grid()->_rdimensions;
  unsigned long ndim_half          = half.Grid()->_ndimension;
  Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
  Coordinate ostride_half          = half.Grid()->_ostride;
  // One iteration per outer site of the full grid.
  accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
    Coordinate coor;
    int cbos;
    int linear=0;
    Lexicographic::CoorFromIndex(coor,ss,rdim_full);
    assert(coor.size()==ndim_half);
    // Parity of this site: sum of the coordinates in the masked dimensions, low bit only.
    for(int d=0;d<ndim_half;d++){ 
      if(checker_dim_mask_half[d]) linear += coor[d];
    }
    cbos = (linear&0x1);
    if (cbos==cb) {
      // Index into half: same coordinates, except the checkerboarded dimension is divided by 2.
      int ssh=0;
      for(int d=0;d<ndim_half;d++) {
        if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
        else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
      }
      coalescedWrite(half_v[ssh],full_v(ss));
    }
  });
 }
 // Accelerator-side insertion of one checkerboard: writes the sites of `half` into the sites of
 // `full` that have the same parity as half.Checkerboard(); other sites of `full` are not written.
 //   full             - destination field on the full grid (opened AcceleratorWrite)
 //   half             - source field on the checkerboarded grid (opened AcceleratorRead)
 //   checker_dim_half - dimension whose coordinate is halved when forming the index into half
 template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0)
 {
  int cb = half.Checkerboard();
  autoView(half_v , half, AcceleratorRead);
  autoView(full_v , full, AcceleratorWrite);
  // Local copies of the grid geometry; presumably taken so the kernel body below does not
  // dereference the Grid objects -- confirm.
  Coordinate rdim_full             = full.Grid()->_rdimensions;
  Coordinate rdim_half             = half.Grid()->_rdimensions;
  unsigned long ndim_half          = half.Grid()->_ndimension;
  Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
  Coordinate ostride_half          = half.Grid()->_ostride;
  // One iteration per outer site of the full grid.
  accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
    Coordinate coor;
    int cbos;
    int linear=0;
    Lexicographic::CoorFromIndex(coor,ss,rdim_full);
    assert(coor.size()==ndim_half);
    // Parity of this site: sum of the coordinates in the masked dimensions, low bit only.
    for(int d=0;d<ndim_half;d++){ 
      if(checker_dim_mask_half[d]) linear += coor[d];
    }
    cbos = (linear&0x1);
    if (cbos==cb) {
      // Index into half: same coordinates, except the checkerboarded dimension is divided by 2.
      int ssh=0;
      for(int d=0;d<ndim_half;d++){
        if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
        else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
      }
      coalescedWrite(full_v[ss],half_v(ssh));
    }
  });
 }
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Flexible Type Conversion for internal promotion to double as well as graceful
 // treatment of scalar-compatible types
@@ -855,7 +785,7 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
 template<class vobj>
-void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
+void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
 {
  typedef typename vobj::scalar_object sobj;
@@ -1080,54 +1010,96 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
  });
 }
-//Convert a Lattice from one precision to another
+//The workspace for a precision change operation allowing for the reuse of the mapping to save time on subsequent calls
-template<class VobjOut, class VobjIn>
+class precisionChangeWorkspace{
-void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
+  std::pair<Integer,Integer>* fmap_device; //device pointer
-{
+public:
-  assert(out.Grid()->Nd() == in.Grid()->Nd());
+  precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid){
-  for(int d=0;d<out.Grid()->Nd();d++){
+    //Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
-    assert(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
+    assert(out_grid->Nd() == in_grid->Nd());
-  }
+    for(int d=0;d<out_grid->Nd();d++){
-  out.Checkerboard() = in.Checkerboard();
+      assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
  GridBase *in_grid=in.Grid();
  GridBase *out_grid = out.Grid();
  typedef typename VobjOut::scalar_object SobjOut;
  typedef typename VobjIn::scalar_object SobjIn;
  int ndim = out.Grid()->Nd();
  int out_nsimd = out_grid->Nsimd();
  std::vector<Coordinate > out_icoor(out_nsimd);
  for(int lane=0; lane < out_nsimd; lane++){
    out_icoor[lane].resize(ndim);
    out_grid->iCoorFromIindex(out_icoor[lane], lane);
  }
  std::vector<SobjOut> in_slex_conv(in_grid->lSites());
  unvectorizeToLexOrdArray(in_slex_conv, in);
  autoView( out_v , out, CpuWrite);
  thread_for(out_oidx,out_grid->oSites(),{
    Coordinate out_ocoor(ndim);
    out_grid->oCoorFromOindex(out_ocoor, out_oidx);
    ExtractPointerArray<SobjOut> ptrs(out_nsimd);      
    Coordinate lcoor(out_grid->Nd());
    for(int lane=0; lane < out_nsimd; lane++){
      for(int mu=0;mu<ndim;mu++)
 	lcoor[mu] = out_ocoor[mu] + out_grid->_rdimensions[mu]*out_icoor[lane][mu];
      int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions);
      ptrs[lane] = &in_slex_conv[llex];
    }
-    merge(out_v[out_oidx], ptrs, 0);
+    int Nsimd_out = out_grid->Nsimd();
-  });
+
    std::vector<Coordinate> out_icorrs(out_grid->Nsimd()); //reuse these
    for(int lane=0; lane < out_grid->Nsimd(); lane++)
      out_grid->iCoorFromIindex(out_icorrs[lane], lane);
    std::vector<std::pair<Integer,Integer> > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd
    thread_for(out_oidx,out_grid->oSites(),{
 	Coordinate out_ocorr; 
 	out_grid->oCoorFromOindex(out_ocorr, out_oidx);
 	Coordinate lcorr; //the local coordinate (common to both in and out as full coordinate)
 	for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
 	  out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr);
 	  //int in_oidx = in_grid->oIndex(lcorr), in_lane = in_grid->iIndex(lcorr);
 	  //Note oIndex and OcorrFromOindex (and same for iIndex) are not inverse for checkerboarded lattice, the former coordinates being defined on the full lattice and the latter on the reduced lattice
 	  //Until this is fixed we need to circumvent the problem locally. Here I will use the coordinates defined on the reduced lattice for simplicity
 	  int in_oidx = 0, in_lane = 0;
 	  for(int d=0;d<in_grid->_ndimension;d++){
 	    in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] );
 	    in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] );
 	  }
 	  fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair<Integer,Integer>( in_oidx, in_lane );
 	}
      });
    //Copy the map to the device (if we had a way to tell if an accelerator is in use we could avoid this copy for CPU-only machines)
    size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair<Integer,Integer>);
    fmap_device = (std::pair<Integer,Integer>*)acceleratorAllocDevice(fmap_bytes);
    acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes); 
  }
  //Prevent moving or copying
  precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete;
  precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete;
  precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete;
  precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete;
  std::pair<Integer,Integer> const* getMap() const{ return fmap_device; }
  ~precisionChangeWorkspace(){
    acceleratorFreeDevice(fmap_device);
  }
 };
 //Convert a lattice of one precision to another. The input workspace contains the mapping data.
 //  out       : destination lattice; its checkerboard is set to that of `in`, and copyLane is invoked for every lane of every outer site
 //  in        : source lattice
 //  workspace : supplies the map; entry [out_lane + Nsimd_out*out_oidx] holds the (outer site, lane) of `in` to copy from
 //              (presumably it must have been constructed from out.Grid() and in.Grid(); this is not checked here - confirm with callers)
 template<class VobjOut, class VobjIn>
 void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
  static_assert( std::is_same<typename VobjOut::DoublePrecision, typename VobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
  out.Checkerboard() = in.Checkerboard();
  constexpr int Nsimd_out = VobjOut::Nsimd();
  //Pointer returned by the workspace; it is only dereferenced inside the kernel below
  std::pair<Integer,Integer> const* fmap_device = workspace.getMap();
  //Do the copy/precision change
  autoView( out_v , out, AcceleratorWrite);
  autoView( in_v , in, AcceleratorRead);
  //Loop over the output outer sites; the SIMD lanes of each site are handled by the inner loop
  accelerator_for(out_oidx, out.Grid()->oSites(), 1,{
      //Start of the Nsimd_out consecutive map entries belonging to this outer site
      std::pair<Integer,Integer> const* fmap_osite = fmap_device + out_oidx*Nsimd_out;
      for(int out_lane=0; out_lane < Nsimd_out; out_lane++){      
 	int in_oidx = fmap_osite[out_lane].first;
 	int in_lane = fmap_osite[out_lane].second;
 	copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane);
      }
    });
 }
 //Precision-changing copy of `in` into `out` using a mapping workspace created for this call only.
 //The workspace is rebuilt on every call; code that converts repeatedly between the same pair of grids
 //can construct a precisionChangeWorkspace once and pass it to the three-argument overload instead.
 template<class VobjOut, class VobjIn>
 void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
  auto out_grid = out.Grid();
  auto in_grid  = in.Grid();
  const precisionChangeWorkspace mapping(out_grid, in_grid);
  precisionChange(out, in, mapping);
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Communicate between grids
 ////////////////////////////////////////////////////////////////////////////////
--- a/Grid/log/Log.cc
+++ b/Grid/log/Log.cc
@@ -69,6 +69,7 @@ GridLogger GridLogDebug  (1, "Debug", GridLogColours, "PURPLE");
 GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
 GridLogger GridLogIterative  (1, "Iterative", GridLogColours, "BLUE");
 GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
 GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE");
 void GridLogConfigure(std::vector<std::string> &logstreams) {
  GridLogError.Active(0);
@@ -79,6 +80,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
  GridLogPerformance.Active(0);
  GridLogIntegrator.Active(1);
  GridLogColours.Active(0);
  GridLogHMC.Active(1);
  for (int i = 0; i < logstreams.size(); i++) {
    if (logstreams[i] == std::string("Error"))       GridLogError.Active(1);
@@ -87,7 +89,8 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
    if (logstreams[i] == std::string("Iterative"))   GridLogIterative.Active(1);
    if (logstreams[i] == std::string("Debug"))       GridLogDebug.Active(1);
    if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
-    if (logstreams[i] == std::string("Integrator"))  GridLogIntegrator.Active(1);
+    if (logstreams[i] == std::string("NoIntegrator"))  GridLogIntegrator.Active(0);
    if (logstreams[i] == std::string("NoHMC"))         GridLogHMC.Active(0);
    if (logstreams[i] == std::string("Colours"))     GridLogColours.Active(1);
  }
 }
--- a/Grid/log/Log.h
+++ b/Grid/log/Log.h
@@ -182,6 +182,7 @@ extern GridLogger GridLogDebug  ;
 extern GridLogger GridLogPerformance;
 extern GridLogger GridLogIterative  ;
 extern GridLogger GridLogIntegrator  ;
 extern GridLogger GridLogHMC;
 extern Colours    GridLogColours;
 std::string demangle(const char* name) ;
--- a/Grid/parallelIO/IldgIO.h
+++ b/Grid/parallelIO/IldgIO.h
@@ -31,7 +31,6 @@ directory
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <string>
 #include <map>
 #include <pwd.h>
@@ -577,8 +576,6 @@ class ScidacReader : public GridLimeReader {
    std::string rec_name(ILDG_BINARY_DATA);
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
      if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) )  ) {
  // in principle should do the line below, but that breaks backward compatibility with old data
  // skipPastObjectRecord(std::string(GRID_FIELD_NORM));
 	skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
 	return;
      }
@@ -655,8 +652,7 @@ class IldgWriter : public ScidacWriter {
    // Fill ILDG header data struct
    //////////////////////////////////////////////////////
    ildgFormat ildgfmt ;
-    const std::string stNC = std::to_string( Nc ) ;
+    ildgfmt.field     = std::string("su3gauge");
    ildgfmt.field          = std::string("su"+stNC+"gauge");
    if ( format == std::string("IEEE32BIG") ) { 
      ildgfmt.precision = 32;
@@ -873,8 +869,7 @@ class IldgReader : public GridLimeReader {
    } else { 
      assert(found_ildgFormat);
-      const std::string stNC = std::to_string( Nc ) ;
+      assert ( ildgFormat_.field == std::string("su3gauge") );
      assert ( ildgFormat_.field == std::string("su"+stNC+"gauge") );
      ///////////////////////////////////////////////////////////////////////////////////////
      // Populate our Grid metadata as best we can
@@ -882,7 +877,7 @@ class IldgReader : public GridLimeReader {
      std::ostringstream vers; vers << ildgFormat_.version;
      FieldMetaData_.hdr_version = vers.str();
-      FieldMetaData_.data_type = std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC);
+      FieldMetaData_.data_type = std::string("4D_SU3_GAUGE_3X3");
      FieldMetaData_.nd=4;
      FieldMetaData_.dimension.resize(4);
--- a/Grid/parallelIO/MetaData.h
+++ b/Grid/parallelIO/MetaData.h
@@ -6,8 +6,8 @@
    Copyright (C) 2015
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -182,8 +182,8 @@ class GaugeStatistics
 public:
  void operator()(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
  {
-    header.link_trace = WilsonLoops<Impl>::linkTrace(data);
+    header.link_trace=WilsonLoops<Impl>::linkTrace(data);
-    header.plaquette  = WilsonLoops<Impl>::avgPlaquette(data);
+    header.plaquette =WilsonLoops<Impl>::avgPlaquette(data);
  }
 };
 typedef GaugeStatistics<PeriodicGimplD> PeriodicGaugeStatistics;
@@ -203,24 +203,20 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
 //////////////////////////////////////////////////////////////////////
 inline void reconstruct3(LorentzColourMatrix & cm)
 {
-  assert( Nc < 4 && Nc > 1 ) ;
+  const int x=0;
  const int y=1;
  const int z=2;
  for(int mu=0;mu<Nd;mu++){
-    #if Nc == 2
+    cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
-      cm(mu)()(1,0) = -adj(cm(mu)()(0,y)) ;
+    cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
-      cm(mu)()(1,1) =  adj(cm(mu)()(0,x)) ;
+    cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
    #else
      const int x=0 , y=1 , z=2 ; // a little disinenuous labelling
      cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
      cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
      cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
    #endif
  }
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Some data types for intermediate storage
 ////////////////////////////////////////////////////////////////////////////////
-template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, Nc-1>, Nd >;
+template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;
 typedef iLorentzColour2x3<Complex>  LorentzColour2x3;
 typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
@@ -282,6 +278,7 @@ struct GaugeSimpleMunger{
 template <class fobj, class sobj>
 struct GaugeSimpleUnmunger {
  void operator()(sobj &in, fobj &out) {
    for (int mu = 0; mu < Nd; mu++) {
      for (int i = 0; i < Nc; i++) {
@@ -320,8 +317,8 @@ template<class fobj,class sobj>
 struct Gauge3x2munger{
  void operator() (fobj &in,sobj &out){
    for(int mu=0;mu<Nd;mu++){
-      for(int i=0;i<Nc-1;i++){
+      for(int i=0;i<2;i++){
-	for(int j=0;j<Nc;j++){
+	for(int j=0;j<3;j++){
 	  out(mu)()(i,j) = in(mu)(i)(j);
 	}}
    }
@@ -333,8 +330,8 @@ template<class fobj,class sobj>
 struct Gauge3x2unmunger{
  void operator() (sobj &in,fobj &out){
    for(int mu=0;mu<Nd;mu++){
-      for(int i=0;i<Nc-1;i++){
+      for(int i=0;i<2;i++){
-	for(int j=0;j<Nc;j++){
+	for(int j=0;j<3;j++){
 	  out(mu)(i)(j) = in(mu)()(i,j);
 	}}
    }
--- a/Grid/parallelIO/NerscIO.h
+++ b/Grid/parallelIO/NerscIO.h
@@ -9,7 +9,6 @@
    Author: Matt Spraggs <matthew.spraggs@gmail.com>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -31,8 +30,6 @@
 #ifndef GRID_NERSC_IO_H
 #define GRID_NERSC_IO_H
 #include <string>
 NAMESPACE_BEGIN(Grid);
 using namespace Grid;
@@ -42,9 +39,11 @@ using namespace Grid;
 ////////////////////////////////////////////////////////////////////////////////
 class NerscIO : public BinaryIO { 
 public:
  typedef Lattice<vLorentzColourMatrixD> GaugeField;
  // Switch controlling whether a plaquette mismatch between the header and the computed value is fatal on read.
  // Returns a reference to a single function-local flag (initially true); assign through the reference to change it.
  static bool & exitOnReadPlaquetteMismatch(){
    static bool exit_on_mismatch = true;
    return exit_on_mismatch;
  }
  // Empty the named file: opening an ofstream in plain std::ios::out mode discards any
  // existing contents (and creates the file if it is absent). The stream closes on scope exit.
  static inline void truncate(std::string file){
    std::ofstream discard;
    discard.open(file, std::ios::out);
  }
@@ -148,17 +147,15 @@ public:
    std::string format(header.floating_point);
-    const int ieee32big = (format == std::string("IEEE32BIG"));
+    int ieee32big = (format == std::string("IEEE32BIG"));
-    const int ieee32    = (format == std::string("IEEE32"));
+    int ieee32    = (format == std::string("IEEE32"));
-    const int ieee64big = (format == std::string("IEEE64BIG"));
+    int ieee64big = (format == std::string("IEEE64BIG"));
-    const int ieee64    = (format == std::string("IEEE64") || \
+    int ieee64    = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
 			   format == std::string("IEEE64LITTLE"));
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    // depending on datatype, set up munger;
    // munger is a function of <floating point, Real, data_type>
-    const std::string stNC = std::to_string( Nc ) ;
+    if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
    if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE") ) {
      if ( ieee32 || ieee32big ) {
 	BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F> 
 	  (Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
@@ -169,7 +166,7 @@ public:
 	  (Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
 	   nersc_csum,scidac_csuma,scidac_csumb);
      }
-    } else if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC) ) {
+    } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
      if ( ieee32 || ieee32big ) {
 	BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF>
 	  (Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
@@ -203,7 +200,7 @@ public:
      std::cerr << " nersc_csum  " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
      exit(0);
    }
-    assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
+    if(exitOnReadPlaquetteMismatch()) assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
    assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
    assert(nersc_csum == header.checksum );
@@ -214,29 +211,27 @@ public:
  template<class GaugeStats=PeriodicGaugeStatistics>
  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
-					std::string ens_label = std::string("DWF"),
+					std::string ens_label = std::string("DWF"))
 					std::string ens_id = std::string("UKQCD"),
 					unsigned int sequence_number = 1)
  {
-    writeConfiguration(Umu,file,0,1,ens_label,ens_id,sequence_number);
+    writeConfiguration(Umu,file,0,1,ens_label);
  }
  template<class GaugeStats=PeriodicGaugeStatistics>
  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
 					int two_row,
 					int bits32,
-					std::string ens_label = std::string("DWF"),
+					std::string ens_label = std::string("DWF"))
 					std::string ens_id = std::string("UKQCD"),
 					unsigned int sequence_number = 1)
  {
    typedef vLorentzColourMatrixD vobj;
    typedef typename vobj::scalar_object sobj;
    FieldMetaData header;
-    header.sequence_number = sequence_number;
+    ///////////////////////////////////////////
-    header.ensemble_id     = ens_id;
+    // Following should become arguments
    ///////////////////////////////////////////
    header.sequence_number = 1;
    header.ensemble_id     = std::string("UKQCD");
    header.ensemble_label  = ens_label;
    header.hdr_version     = "1.0" ;
    typedef LorentzColourMatrixD fobj3D;
    typedef LorentzColour2x3D    fobj2D;
@@ -250,14 +245,10 @@ public:
    uint64_t offset;
-    // Sod it -- always write NcxNc double
+    // Sod it -- always write 3x3 double
-    header.floating_point  = std::string("IEEE64BIG");
+    header.floating_point = std::string("IEEE64BIG");
-    const std::string stNC = std::to_string( Nc ) ;
+    header.data_type      = std::string("4D_SU3_GAUGE_3x3");
-    if( two_row ) {
+    GaugeSimpleUnmunger<fobj3D,sobj> munge;
      header.data_type = std::string("4D_SU" + stNC + "_GAUGE" );
    } else {
      header.data_type = std::string("4D_SU" + stNC + "_GAUGE_" + stNC + "x" + stNC );
    }
    if ( grid->IsBoss() ) { 
      truncate(file);
      offset = writeHeader(header,file);
@@ -265,15 +256,8 @@ public:
    grid->Broadcast(0,(void *)&offset,sizeof(offset));
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
-    if( two_row ) {
+    BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
-      Gauge3x2unmunger<fobj2D,sobj> munge;
+					      nersc_csum,scidac_csuma,scidac_csumb);
      BinaryIO::writeLatticeObject<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point,
 						nersc_csum,scidac_csuma,scidac_csumb);
    } else {
      GaugeSimpleUnmunger<fobj3D,sobj> munge;
      BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
 						nersc_csum,scidac_csuma,scidac_csumb);
    }
    header.checksum = nersc_csum;
    if ( grid->IsBoss() ) { 
      writeHeader(header,file);
@@ -305,7 +289,8 @@ public:
    header.plaquette=0.0;
    MachineCharacteristics(header);
-    uint64_t offset;
+	uint64_t offset;
 #ifdef RNG_RANLUX
    header.floating_point = std::string("UINT64");
    header.data_type      = std::string("RANLUX48");
@@ -345,7 +330,7 @@ public:
    GridBase *grid = parallel.Grid();
-    uint64_t offset = readHeader(file,grid,header);
+	uint64_t offset = readHeader(file,grid,header);
    FieldMetaData clone(header);
--- a/Grid/pugixml/pugixml.cc
+++ b/Grid/pugixml/pugixml.cc
@@ -16,12 +16,8 @@
 #ifdef __NVCC__
 #pragma push
 #if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
 #pragma nv_diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
 #else
 #pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
 #endif
 #endif
 #include "pugixml.h"
--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@@ -63,6 +63,7 @@ static constexpr int Ngp=2; // gparity index range
 #define ColourIndex  (2)
 #define SpinIndex    (1)
 #define LorentzIndex (0)
 #define GparityFlavourIndex (0)
 // Also should make these a named enum type
 static constexpr int DaggerNo=0;
@@ -87,6 +88,8 @@ template<typename T> struct isCoarsened {
 template <typename T> using IfCoarsened    = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
 template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;
 const int GparityFlavourTensorIndex = 3; //TensorLevel counts from the bottom!
 // ChrisK very keen to add extra space for Gparity doubling.
 //
 // Also add domain wall index, in a way where Wilson operator 
@@ -101,6 +104,7 @@ template<typename vtype> using iSpinMatrix                = iScalar<iMatrix<iSca
 template<typename vtype> using iColourMatrix              = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
 template<typename vtype> using iSpinColourMatrix          = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
 template<typename vtype> using iLorentzColourMatrix       = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
 template<typename vtype> using iLorentzVector             = iVector<iScalar<iScalar<vtype> >, Nd > ;
 template<typename vtype> using iDoubleStoredColourMatrix  = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
 template<typename vtype> using iSpinVector                = iScalar<iVector<iScalar<vtype>, Ns> >;
 template<typename vtype> using iColourVector              = iScalar<iScalar<iVector<vtype, Nc> > >;
@@ -110,8 +114,10 @@ template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVec
    template<typename vtype> using iSpinColourSpinColourMatrix  = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;
 template<typename vtype> using iGparityFlavourVector                = iVector<iScalar<iScalar<vtype> >, Ngp>;
 template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
 template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
 template<typename vtype> using iGparityFlavourMatrix = iMatrix<iScalar<iScalar<vtype> >, Ngp>;
 // Spin matrix
 typedef iSpinMatrix<Complex  >          SpinMatrix;
@@ -158,7 +164,16 @@ typedef iSpinColourSpinColourMatrix<vComplex >    vSpinColourSpinColourMatrix;
 typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
 typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;
-// LorentzColour
+// LorentzVector
 typedef iLorentzVector<Complex  > LorentzVector;
 typedef iLorentzVector<ComplexF > LorentzVectorF;
 typedef iLorentzVector<ComplexD > LorentzVectorD;
 typedef iLorentzVector<vComplex > vLorentzVector;
 typedef iLorentzVector<vComplexF> vLorentzVectorF;
 typedef iLorentzVector<vComplexD> vLorentzVectorD;
 // LorentzColourMatrix
 typedef iLorentzColourMatrix<Complex  > LorentzColourMatrix;
 typedef iLorentzColourMatrix<ComplexF > LorentzColourMatrixF;
 typedef iLorentzColourMatrix<ComplexD > LorentzColourMatrixD;
@@ -176,6 +191,16 @@ typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
 typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
 typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;
 //G-parity flavour matrix
 typedef iGparityFlavourMatrix<Complex> GparityFlavourMatrix;
 typedef iGparityFlavourMatrix<ComplexF> GparityFlavourMatrixF;
 typedef iGparityFlavourMatrix<ComplexD> GparityFlavourMatrixD;
 typedef iGparityFlavourMatrix<vComplex> vGparityFlavourMatrix;
 typedef iGparityFlavourMatrix<vComplexF> vGparityFlavourMatrixF;
 typedef iGparityFlavourMatrix<vComplexD> vGparityFlavourMatrixD;
 // Spin vector
 typedef iSpinVector<Complex >           SpinVector;
 typedef iSpinVector<ComplexF>           SpinVectorF;
@@ -221,6 +246,16 @@ typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
 typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
 typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
 //G-parity flavour vector
 typedef iGparityFlavourVector<Complex >         GparityFlavourVector;
 typedef iGparityFlavourVector<ComplexF>         GparityFlavourVectorF;
 typedef iGparityFlavourVector<ComplexD>         GparityFlavourVectorD;
 typedef iGparityFlavourVector<vComplex >         vGparityFlavourVector;
 typedef iGparityFlavourVector<vComplexF>         vGparityFlavourVectorF;
 typedef iGparityFlavourVector<vComplexD>         vGparityFlavourVectorD;
 // singlets
 typedef iSinglet<Complex >         TComplex;     // FIXME This is painful. Tensor singlet complex type.
 typedef iSinglet<ComplexF>         TComplexF;    // FIXME This is painful. Tensor singlet complex type.
@@ -263,6 +298,10 @@ typedef Lattice<vLorentzColourMatrix>  LatticeLorentzColourMatrix;
 typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
 typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD;
 typedef Lattice<vLorentzVector>  LatticeLorentzVector;
 typedef Lattice<vLorentzVectorF> LatticeLorentzVectorF;
 typedef Lattice<vLorentzVectorD> LatticeLorentzVectorD;
 // DoubleStored gauge field
 typedef Lattice<vDoubleStoredColourMatrix>  LatticeDoubleStoredColourMatrix;
 typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF;
@@ -451,20 +490,9 @@ template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<Lorentz
 // Fermion <-> propagator assignments
 //////////////////////////////////////////////
 //template <class Prop, class Ferm>
 #define FAST_FERM_TO_PROP
 template <class Fimpl>
 void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
 {
 #ifdef FAST_FERM_TO_PROP
  autoView(p_v,p,AcceleratorWrite);
  autoView(f_v,f,AcceleratorRead);
  accelerator_for(idx,p_v.oSites(),1,{
      for(int ss = 0; ss < Ns; ++ss) {
      for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
 	p_v[idx]()(ss,s)(cc,c) = f_v[idx]()(ss)(cc); // Propagator sink index is LEFT, suitable for left mult by gauge link (e.g.)
      }}
    });
 #else
  for(int j = 0; j < Ns; ++j)
    {
      auto pjs = peekSpin(p, j, s);
@@ -476,23 +504,12 @@ void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::Fermio
 	}
      pokeSpin(p, pjs, j, s);
    }
 #endif
 }
 //template <class Prop, class Ferm>
 // Extract one column of a propagator into a fermion field: at every site,
 // f(ss)(cc) = p(ss,s)(cc,c) for all left-hand spin ss and colour cc.
 //   f    : destination fermion field (written)
 //   p    : source propagator (read only)
 //   s, c : right-hand spin and colour indices selecting the column to copy
 template <class Fimpl>
 void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
 {
 #ifdef FAST_FERM_TO_PROP
  // p is only read and f is only written, so p takes the read view and f the write view.
  // (The access modes were previously the wrong way round.)
  autoView(p_v,p,AcceleratorRead);
  autoView(f_v,f,AcceleratorWrite);
  accelerator_for(idx,p_v.oSites(),1,{
      for(int ss = 0; ss < Ns; ++ss) {
      for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
 	f_v[idx]()(ss)(cc) = p_v[idx]()(ss,s)(cc,c); // LEFT index is copied across for s,c right index
      }}
    });
 #else
  for(int j = 0; j < Ns; ++j)
    {
      auto pjs = peekSpin(p, j, s);
@@ -504,7 +521,6 @@ void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::Propagato
 	}
      pokeSpin(f, fj, j);
    }
 #endif
 }
 //////////////////////////////////////////////
--- a/Grid/qcd/action/Action.h
+++ b/Grid/qcd/action/Action.h
@@ -30,8 +30,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#ifndef GRID_QCD_ACTION_H
+#pragma once
 #define GRID_QCD_ACTION_H
 ////////////////////////////////////////////
 // Abstract base interface
@@ -51,4 +50,4 @@ NAMESPACE_CHECK(Fermion);
 #include <Grid/qcd/action/pseudofermion/PseudoFermion.h>
 NAMESPACE_CHECK(PseudoFermion);
-#endif
+
--- a/Grid/qcd/action/ActionBase.h
+++ b/Grid/qcd/action/ActionBase.h
@@ -40,6 +40,29 @@ class Action
 public:
  bool is_smeared = false;
  RealD deriv_norm_sum;
  RealD deriv_max_sum;
  int   deriv_num;
  RealD deriv_us;
  RealD S_us;
  RealD refresh_us;
  // Zero every accumulated timing and all derivative statistics.
  void  reset_timer(void)        {
    refresh_us     = 0.0;
    S_us           = 0.0;
    deriv_us       = 0.0;
    deriv_num      = 0;
    deriv_max_sum  = 0.0;
    deriv_norm_sum = 0.0;
  }
  void  deriv_log(RealD nrm, RealD max) { deriv_max_sum+=max; deriv_norm_sum+=nrm; deriv_num++;}
  RealD deriv_max_average(void)         { return deriv_max_sum/deriv_num; };
  RealD deriv_norm_average(void)        { return deriv_norm_sum/deriv_num; };
  RealD deriv_timer(void)        { return deriv_us; };
  // Time accumulated between S_timer_start() and S_timer_stop(); reports S_us (it used to return deriv_us by mistake).
  RealD S_timer(void)            { return S_us; };
  // Time accumulated between refresh_timer_start() and refresh_timer_stop(); reports refresh_us (it used to return deriv_us by mistake).
  RealD refresh_timer(void)      { return refresh_us; };
  void deriv_timer_start(void)   { deriv_us-=usecond(); }
  void deriv_timer_stop(void)    { deriv_us+=usecond(); }
  void refresh_timer_start(void) { refresh_us-=usecond(); }
  void refresh_timer_stop(void)  { refresh_us+=usecond(); }
  void S_timer_start(void)       { S_us-=usecond(); }
  void S_timer_stop(void)        { S_us+=usecond(); }
  // Heatbath?
  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
  virtual RealD S(const GaugeField& U) = 0;                             // evaluate the action
--- a/Grid/qcd/action/ActionCore.h
+++ b/Grid/qcd/action/ActionCore.h
@@ -58,6 +58,8 @@ NAMESPACE_CHECK(Scalar);
 ////////////////////////////////////////////
 // Utility functions
 ////////////////////////////////////////////
 #include <Grid/qcd/action/domains/Domains.h>
 #include <Grid/qcd/utils/Metric.h>
 NAMESPACE_CHECK(Metric);
 #include <Grid/qcd/utils/CovariantLaplacian.h>
--- a/Grid/qcd/action/ActionParams.h
+++ b/Grid/qcd/action/ActionParams.h
@@ -36,28 +36,34 @@ NAMESPACE_BEGIN(Grid);
 // These can move into a params header and be given MacroMagic serialisation
 struct GparityWilsonImplParams {
-  Coordinate twists;
+  Coordinate twists; //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
-  GparityWilsonImplParams() : twists(Nd, 0) {};
+                     //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
  bool locally_periodic;
  GparityWilsonImplParams() : twists(Nd, 0), locally_periodic(false) {};
 };
 // Parameters for the Wilson fermion implementation.
 // Both constructors leave every member in a defined state:
 //   overlapCommsCompute = false, locally_periodic = false,
 //   twist_n_2pi_L       = Nd entries of 0.0,
 //   boundary_phases     = Nd entries of 1.0 (default constructor) or the supplied phases.
 struct WilsonImplParams {
  bool overlapCommsCompute;
  bool locally_periodic;
  AcceleratorVector<Real,Nd> twist_n_2pi_L;     // NOTE(review): presumably twist angles in units of 2*pi/L - confirm
  AcceleratorVector<Complex,Nd> boundary_phases;
  // Default: unit boundary phases and no twist.
  // overlapCommsCompute is now initialised here too; it was previously left indeterminate by this constructor.
  WilsonImplParams() : overlapCommsCompute(false), locally_periodic(false) {
    twist_n_2pi_L.resize(Nd, 0.0);
    boundary_phases.resize(Nd, 1.0);
  }
  // Construct with caller-supplied boundary phases and no twist.
  // Initialisers are listed in member declaration order, which is the order in which they run.
  WilsonImplParams(const AcceleratorVector<Complex,Nd> phi)
    : overlapCommsCompute(false), locally_periodic(false), boundary_phases(phi) {
    twist_n_2pi_L.resize(Nd, 0.0);
  }
 };
 struct StaggeredImplParams {
-  StaggeredImplParams()  {};
+  bool locally_periodic;
  StaggeredImplParams() : locally_periodic(false) {};
 };
-  struct OneFlavourRationalParams : Serializable {
+struct OneFlavourRationalParams : Serializable {
    GRID_SERIALIZABLE_CLASS_MEMBERS(OneFlavourRationalParams, 
 				    RealD, lo, 
 				    RealD, hi, 
@@ -86,6 +92,50 @@ struct StaggeredImplParams {
        BoundsCheckFreq(_BoundsCheckFreq){};
  };
  /*Action parameters for the generalized rational action
    The approximation is for (M^dag M)^{1/inv_pow}
    where inv_pow is the denominator of the fractional power.
    Default inv_pow=2 for square root, making this equivalent to 
    the OneFlavourRational action
  */
    // Serialisable parameter set for the generalised rational action.
    // The member list is generated by GRID_SERIALIZABLE_CLASS_MEMBERS; the constructor
    // below takes the values in the same order and supplies a default for every one.
    struct RationalActionParams : Serializable {
    GRID_SERIALIZABLE_CLASS_MEMBERS(RationalActionParams, 
 				    int, inv_pow, 
 				    RealD, lo, //low eigenvalue bound of rational approx
 				    RealD, hi, //high eigenvalue bound of rational approx
 				    int,   MaxIter,  //maximum iterations in msCG
 				    RealD, action_tolerance,  //msCG tolerance in action evaluation
 				    int,   action_degree, //degree of the rational approx used in action evaluation
 				    RealD, md_tolerance,  //msCG tolerance in MD integration
 				    int,   md_degree, //degree of the rational approx used in MD integration
 				    int,   precision, //precision of floating point arithmetic
 				    int,   BoundsCheckFreq); //frequency the approximation is tested (with Metropolis degree/tolerance); 0 disables the check
  // constructor: each argument initialises the member of the matching name
  // (_maxit -> MaxIter). Defaults: inv_pow=2, range [0,1], 1000 iterations,
  // tolerance 1e-8 and degree 10 for both action and MD, precision 64, bounds check every 20.
  RationalActionParams(int _inv_pow = 2,
 		       RealD _lo      = 0.0, 
 		       RealD _hi      = 1.0, 
 		       int _maxit     = 1000,
 		       RealD _action_tolerance      = 1.0e-8, 
 		       int _action_degree    = 10,
 		       RealD _md_tolerance      = 1.0e-8, 
 		       int _md_degree    = 10,
 		       int _precision = 64,
 		       int _BoundsCheckFreq=20)
    : inv_pow(_inv_pow), 
      lo(_lo),
      hi(_hi),
      MaxIter(_maxit),
      action_tolerance(_action_tolerance),
      action_degree(_action_degree),
      md_tolerance(_md_tolerance),
      md_degree(_md_degree),
      precision(_precision),
      BoundsCheckFreq(_BoundsCheckFreq){};
  };
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/action/domains/DDHMCFilter.h
+++ b/Grid/qcd/action/domains/DDHMCFilter.h
@@ -0,0 +1,52 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/domains/DDHMCFilter.h
 Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Christopher Kelly
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
 NAMESPACE_BEGIN(Grid);
 ////////////////////////////////////////////////////
 // DDHMC filter with sub-block size B[mu]
 ////////////////////////////////////////////////////
 // Momentum filter that applies DomainDecomposition::ProjectDDHMC with block size Block.
 template<typename MomentaField>
 struct DDHMCFilter: public MomentumFilterBase<MomentaField>
 {
  Coordinate Block; // sub-block extent per direction, handed to DomainDecomposition
  int Width;        // not read by this filter; kept for interface compatibility
  // Width is value-initialised to 0 so the public member never holds an
  // indeterminate value (it was previously left uninitialised).
  DDHMCFilter(const Coordinate &_Block): Block(_Block), Width(0) {}
  // Filters P in place. A DomainDecomposition is built from Block on every call.
  void applyFilter(MomentaField &P) const override
  {
    DomainDecomposition Domains(Block);
    Domains.ProjectDDHMC(P);
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/domains/DirichletFilter.h
+++ b/Grid/qcd/action/domains/DirichletFilter.h
@@ -0,0 +1,98 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/domains/DirichletFilter.h
 Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
 ////////////////////////////////////////////////////
 // Dirichlet filter with sub-block size B[mu]
 ////////////////////////////////////////////////////
 #pragma once 
 #include <Grid/qcd/action/domains/DomainDecomposition.h>
 NAMESPACE_BEGIN(Grid);
 // Momentum filter that zeroes, for every direction mu with Block[mu]!=0, the mu-links
 // sitting on sites where the site and its mu-shifted neighbour lie in different
 // domains (one in domain 0, the other in domain 1 of DomainDecomposition).
 template<typename MomentaField>
 struct DirichletFilter: public MomentumFilterBase<MomentaField>
 {
  Coordinate Block; // sub-block extent per direction; 0 disables filtering in that direction
  DirichletFilter(const Coordinate &_Block): Block(_Block) {}
  // Edge detect using domain projectors.
  // U is modified in place; only the Lorentz components mu with Block[mu]!=0 are touched.
  void applyFilter (MomentaField &U) const override
  {
    DomainDecomposition Domains(Block);
    GridBase *grid = U.Grid();
    LatticeInteger  face(grid);
    LatticeInteger  one(grid);   one = 1;
    LatticeInteger  zero(grid); zero = 0;
    LatticeInteger  omega(grid);
    LatticeInteger  omegabar(grid);
    LatticeInteger  tmp(grid);
    // Indicator fields: omega is 1 on domain 0 and 0 elsewhere, omegabar is 1 on domain 1.
    omega=one;    Domains.ProjectDomain(omega,0);
    omegabar=one; Domains.ProjectDomain(omegabar,1);
    typedef decltype(PeekIndex<LorentzIndex>(U,0)) MomentaLinkField;
    MomentaLinkField  Umu(grid);
    MomentaLinkField   zz(grid); zz=Zero();
    int dims = grid->Nd();
    assert(dims==Nd);
    for(int mu=0;mu<Nd;mu++){
      if ( Block[mu]!=0 ) {
        Umu = PeekIndex<LorentzIndex>(U,mu);
        // Site in omega whose mu-shifted neighbour is in omegabar: the two indicators sum to 2.
        // NOTE(review): assumes Cshift(f,mu,1) brings the value from x+mu to x - confirm.
        tmp = Cshift(omegabar,mu,1);
        tmp = tmp + omega;
        face = where(tmp == Integer(2),one,zero );
        // Site in omegabar whose mu-shifted neighbour is in omega; accumulate into the same mask.
        tmp = Cshift(omega,mu,1);
        tmp = tmp + omegabar;
        face = where(tmp == Integer(2),one,face );
        // Zero the mu-link wherever a domain change was detected.
        Umu = where(face,zz,Umu);
        PokeIndex<LorentzIndex>(U, Umu, mu);
      }
    }
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/domains/DomainDecomposition.h
+++ b/Grid/qcd/action/domains/DomainDecomposition.h
@@ -0,0 +1,187 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/domains/DomainDecomposition.h
 Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
 ////////////////////////////////////////////////////
 // Domain decomposition helper with sub-block size B[mu]
 ////////////////////////////////////////////////////
 #pragma once 
 NAMESPACE_BEGIN(Grid);
 struct DomainDecomposition
 {
  Coordinate Block;
  static constexpr RealD factor = 0.6;
  DomainDecomposition(const Coordinate &_Block): Block(_Block){ assert(Block.size()==Nd);};
  template<class Field>
  void ProjectDomain(Field &f,Integer domain)
  {
    GridBase *grid = f.Grid();
    int dims = grid->Nd();
    int isDWF= (dims==Nd+1);
    assert((dims==Nd)||(dims==Nd+1));
    Field   zz(grid);  zz = Zero();
    LatticeInteger coor(grid);
    LatticeInteger domaincoor(grid);
    LatticeInteger mask(grid); mask = Integer(1);
    LatticeInteger zi(grid);     zi = Integer(0);
    for(int d=0;d<Nd;d++){
      Integer B= Block[d];
      if ( B ) {
 	LatticeCoordinate(coor,d+isDWF);
 	domaincoor = mod(coor,B);
 	mask = where(domaincoor==Integer(0),zi,mask);
 	mask = where(domaincoor==Integer(B-1),zi,mask);
      }
    }
    if ( !domain )
      f = where(mask==Integer(1),f,zz);
    else 
      f = where(mask==Integer(0),f,zz);
  };
  template<class GaugeField>
  void ProjectDDHMC(GaugeField &U)
  {
    GridBase *grid = U.Grid();
    Coordinate Global=grid->GlobalDimensions();
    GaugeField zzz(grid); zzz = Zero();
    LatticeInteger coor(grid); 
    GaugeField Uorg(grid); Uorg = U;
    auto zzz_mu = PeekIndex<LorentzIndex>(zzz,0);
    ////////////////////////////////////////////////////
    // Zero BDY layers
    ////////////////////////////////////////////////////
    for(int mu=0;mu<Nd;mu++) {
      Integer B1 = Block[mu];
      if ( B1 && (B1 <= Global[mu]) ) {
 	LatticeCoordinate(coor,mu);
 	////////////////////////////////
 	// OmegaBar - zero all links contained in slice B-1,0 and
 	// mu links connecting to Omega
 	////////////////////////////////
 	U    = where(mod(coor,B1)==Integer(B1-1),zzz,U);
 	U    = where(mod(coor,B1)==Integer(0)   ,zzz,U); 
 	auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
 	U_mu = where(mod(coor,B1)==Integer(B1-2),zzz_mu,U_mu); 
 	PokeIndex<LorentzIndex>(U, U_mu, mu);
      }
    }
    ////////////////////////////////////////////
    // Omega interior slow the evolution
    // Tricky as we need to take the smallest of values imposed by each cut
    // Do them in order or largest to smallest and smallest writes last
    ////////////////////////////////////////////
    RealD f= factor;
 #if 0    
    for(int mu=0;mu<Nd;mu++) {
      Integer B1 = Block[mu];
      if ( B1 && (B1 <= Global[mu]) ) {
 	auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
 	auto Uorg_mu= PeekIndex<LorentzIndex>(Uorg,mu);
 	// In the plane
 	U = where(mod(coor,B1)==Integer(B1-5),Uorg*f,U); 
 	U = where(mod(coor,B1)==Integer(4)   ,Uorg*f,U); 
 	// Perp links
       	U_mu = where(mod(coor,B1)==Integer(B1-6),Uorg_mu*f,U_mu);
 	U_mu = where(mod(coor,B1)==Integer(4)   ,Uorg_mu*f,U_mu);
 	PokeIndex<LorentzIndex>(U, U_mu, mu);
      }
    }
 #endif
    for(int mu=0;mu<Nd;mu++) {
      Integer B1 = Block[mu];
      if ( B1 && (B1 <= Global[mu]) ) {
 	auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
 	auto Uorg_mu= PeekIndex<LorentzIndex>(Uorg,mu);
 	// In the plane
 	U = where(mod(coor,B1)==Integer(B1-4),Uorg*f*f,U); 
 	U = where(mod(coor,B1)==Integer(3)   ,Uorg*f*f,U); 
 	// Perp links
       	U_mu = where(mod(coor,B1)==Integer(B1-5),Uorg_mu*f*f,U_mu);
 	U_mu = where(mod(coor,B1)==Integer(3)   ,Uorg_mu*f*f,U_mu);
 	PokeIndex<LorentzIndex>(U, U_mu, mu);
      }
    }
    for(int mu=0;mu<Nd;mu++) {
      Integer B1 = Block[mu];
      if ( B1 && (B1 <= Global[mu]) ) {
 	auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
 	auto Uorg_mu= PeekIndex<LorentzIndex>(Uorg,mu);
 	// In the plane
 	U = where(mod(coor,B1)==Integer(B1-3),Uorg*f*f*f,U); 
 	U = where(mod(coor,B1)==Integer(2)   ,Uorg*f*f*f,U); 
 	// Perp links
       	U_mu = where(mod(coor,B1)==Integer(B1-4),Uorg_mu*f*f*f,U_mu);
 	U_mu = where(mod(coor,B1)==Integer(2)   ,Uorg_mu*f*f*f,U_mu);
 	PokeIndex<LorentzIndex>(U, U_mu, mu);
      }
    }
    for(int mu=0;mu<Nd;mu++) {
      Integer B1 = Block[mu];
      if ( B1 && (B1 <= Global[mu]) ) {
 	auto U_mu   = PeekIndex<LorentzIndex>(U,mu);
 	auto Uorg_mu= PeekIndex<LorentzIndex>(Uorg,mu);
 	// In the plane
 	U = where(mod(coor,B1)==Integer(B1-2),zzz,U); 
 	U = where(mod(coor,B1)==Integer(1)   ,zzz,U); 
 	// Perp links
 	U_mu = where(mod(coor,B1)==Integer(B1-3),Uorg_mu*f*f*f*f,U_mu);
 	U_mu = where(mod(coor,B1)==Integer(1)   ,Uorg_mu*f*f*f*f,U_mu);
 	PokeIndex<LorentzIndex>(U, U_mu, mu);
      }
    }
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/domains/Domains.h
+++ b/Grid/qcd/action/domains/Domains.h
@@ -0,0 +1,39 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/domains/Domains.h
 Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
 ////////////////////////////////////////////////////
 // Umbrella header: domain decomposition and the momentum filters built on it
 ////////////////////////////////////////////////////
 #pragma once 
 #include <Grid/qcd/action/domains/DomainDecomposition.h>
 #include <Grid/qcd/action/domains/MomentumFilter.h>
 #include <Grid/qcd/action/domains/DirichletFilter.h>
 #include <Grid/qcd/action/domains/DDHMCFilter.h>
--- a/Grid/qcd/hmc/integrators/MomentumFilter.h
+++ b/Grid/qcd/hmc/integrators/MomentumFilter.h
@@ -28,8 +28,7 @@ directory
 *************************************************************************************/
 /*  END LEGAL */
 //--------------------------------------------------------------------
-#ifndef MOMENTUM_FILTER
+#pragma once 
 #define MOMENTUM_FILTER
 NAMESPACE_BEGIN(Grid);
@@ -37,7 +36,7 @@ NAMESPACE_BEGIN(Grid);
 template<typename MomentaField>
 struct MomentumFilterBase{
-  virtual void applyFilter(MomentaField &P) const;
+  virtual void applyFilter(MomentaField &P) const = 0;
 };
 //Do nothing
@@ -90,5 +89,3 @@ struct MomentumFilterApplyPhase: public MomentumFilterBase<MomentaField>{
 NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@@ -60,6 +60,8 @@ public:
  ///////////////////////////////////////////////////////////////
  virtual void Dminus(const FermionField &psi, FermionField &chi);
  virtual void DminusDag(const FermionField &psi, FermionField &chi);
  virtual void ImportFourDimPseudoFermion(const FermionField &input,FermionField &imported);
  virtual void ExportFourDimPseudoFermion(const FermionField &solution,FermionField &exported);
  virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
  virtual void ExportPhysicalFermionSource(const FermionField &solution5d, FermionField &exported4d);
  virtual void ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d);
@@ -68,16 +70,9 @@ public:
  ///////////////////////////////////////////////////////////////
  // Support for MADWF tricks
  ///////////////////////////////////////////////////////////////
-  RealD Mass(void) { return (mass_plus + mass_minus) / 2.0; };
+  RealD Mass(void) { return mass; };
  RealD MassPlus(void) { return mass_plus; };
  RealD MassMinus(void) { return mass_minus; };
  void  SetMass(RealD _mass) { 
-    mass_plus=mass_minus=_mass; 
+    mass=_mass; 
    SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs
  } ;
  void  SetMass(RealD _mass_plus, RealD _mass_minus) { 
    mass_plus=_mass_plus;
    mass_minus=_mass_minus;
    SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs
  } ;
  void  P(const FermionField &psi, FermionField &chi);
@@ -115,7 +110,7 @@ public:
  void   MeooeDag5D    (const FermionField &in, FermionField &out);
  //    protected:
-  RealD mass_plus, mass_minus;
+  RealD mass;
  // Save arguments to SetCoefficientsInternal
  Vector<Coeff_t> _gamma;
--- a/Grid/qcd/action/fermion/CloverHelpers.h
+++ b/Grid/qcd/action/fermion/CloverHelpers.h
@@ -1,333 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/CloverHelpers.h
    Copyright (C) 2017 - 2022
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    Author: Mattia Bruno <mattia.bruno@cern.ch>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 #include <Grid/Grid.h>
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
 ////////////////////////////////////////////
 // Standard Clover
 //   (4+m0) + csw * clover_term
 // Exp Clover
 //   (4+m0) * exp(csw/(4+m0) clover_term)
 //   = (4+m0) + csw * clover_term + ...
 ////////////////////////////////////////////
 NAMESPACE_BEGIN(Grid);
 //////////////////////////////////
 // Generic Standard Clover
 //////////////////////////////////
 // Helpers for the standard clover term held as a full CloverField
 // (an (Ns*DimRep) x (Ns*DimRep) matrix per site, DimRep = Impl::Dimension).
 template<class Impl>
 class CloverHelpers: public WilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  typedef WilsonCloverHelpers<Impl> Helpers;
  // Adds diag_mass to CloverTerm in place, then fills CloverTermInv with the
  // site-by-site matrix inverse of the result, computed on the host with Eigen.
  // csw_t is accepted but not used by this implementation.
  static void Instantiate(CloverField& CloverTerm, CloverField& CloverTermInv, RealD csw_t, RealD diag_mass) {
    GridBase *grid = CloverTerm.Grid();
    CloverTerm += diag_mass;
    int lvol = grid->lSites();
    int DimRep = Impl::Dimension;
    {
      autoView(CTv,CloverTerm,CpuRead);
      autoView(CTIv,CloverTermInv,CpuWrite);
      thread_for(site, lvol, {
        Coordinate lcoor;
        grid->LocalIndexToLocalCoor(site, lcoor);
        Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
        Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
        typename SiteClover::scalar_object Qx = Zero(), Qxinv = Zero();
        peekLocalSite(Qx, CTv, lcoor);
        // Flatten the (spin j,k) x (colour a,b) site matrix into one dense Eigen matrix
        for (int j = 0; j < Ns; j++)
          for (int k = 0; k < Ns; k++)
            for (int a = 0; a < DimRep; a++)
              for (int b = 0; b < DimRep; b++){
                auto zz =  Qx()(j, k)(a, b);
                EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
              }
        EigenInvCloverOp = EigenCloverOp.inverse();
        // Copy the inverse back using the same index mapping
        for (int j = 0; j < Ns; j++)
          for (int k = 0; k < Ns; k++)
            for (int a = 0; a < DimRep; a++)
              for (int b = 0; b < DimRep; b++)
                Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
               // Runs once per site, after the copy-back loops (it is not part of the innermost loop)
               pokeLocalSite(Qxinv, CTIv, lcoor);
      });
    }
  }
  // Forwards unchanged to WilsonCloverHelpers<Impl>::Cmunu.
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    return Helpers::Cmunu(U, lambda, mu, nu);
  }
 };
 //////////////////////////////////
 // Generic Exp Clover
 //////////////////////////////////
 // Helpers for the exponentiated clover term held as a full CloverField.
 // The exponential and its inverse are both built from a truncated Taylor series.
 template<class Impl>
 class ExpCloverHelpers: public WilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
  typedef WilsonCloverHelpers<Impl> Helpers;
  // Can this be avoided?
  // Writes c onto every diagonal (spin,colour) entry of `in`; off-diagonal entries are
  // left as they are. NOTE(review): `in` is taken by const reference yet is written
  // through an AcceleratorWrite view.
  static void IdentityTimesC(const CloverField& in, RealD c) {
    int DimRep = Impl::Dimension;
    autoView(in_v, in, AcceleratorWrite);
    accelerator_for(ss, in.Grid()->oSites(), 1, {
      for (int sa=0; sa<Ns; sa++)
        for (int ca=0; ca<DimRep; ca++)
          in_v[ss]()(sa,sa)(ca,ca) = c;
    });
  }
  // Returns the truncation order for the series: starting from NMAX=1 and
  // cond=R^2/2, NMAX is incremented and cond multiplied by R/(NMAX+1) until
  // cond*exp(R) <= prec.
  static int getNMAX(RealD prec, RealD R) {
    /* compute stop condition for exponential */
    int NMAX=1;
    RealD cond=R*R/2.;
    while (cond*std::exp(R)>prec) {
      NMAX++;
      cond*=R/(double)(NMAX+1);
    }
    return NMAX;
  }
  // Overloads that pick prec from the field's scalar type: 1e-12 for vComplexD, 1e-6 for vComplexF.
  // The field argument is used for overload selection only.
  static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
  static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
  // On entry Clover holds the clover term X. On exit
  //   Clover    = diag_mass * sum_{n<=NMAX} (X/diag_mass)^n / n!
  //   CloverInv = (1/diag_mass) * sum_{n<=NMAX} (-X/diag_mass)^n / n!
  // with NMAX chosen by getNMAX for R = 3*csw_t/diag_mass.
  static void Instantiate(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
    GridBase* grid = Clover.Grid();
    CloverField ExpClover(grid);
    int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);
    Clover *= (1.0/diag_mass);
    // Taylor expansion, slow but generic
    // Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
    // qN = cN
    // qn = cn + qn+1 X
    // cn[i] = 1/i!
    std::vector<RealD> cn(NMAX+1);
    cn[0] = 1.0;
    for (int i=1; i<=NMAX; i++)
      cn[i] = cn[i-1] / RealD(i);
    ExpClover = Zero();
    IdentityTimesC(ExpClover, cn[NMAX]);
    // NOTE(review): "+ cn[i]" presumably adds cn[i] times the identity - confirm against the Lattice operators
    for (int i=NMAX-1; i>=0; i--)
      ExpClover = ExpClover * Clover + cn[i];
    // prepare inverse
    // CloverInv temporarily holds -X/diag_mass, the argument of the inverse series
    CloverInv = (-1.0)*Clover;
    Clover = ExpClover * diag_mass;
    ExpClover = Zero();
    IdentityTimesC(ExpClover, cn[NMAX]);
    for (int i=NMAX-1; i>=0; i--)
      ExpClover = ExpClover * CloverInv + cn[i];
    CloverInv = ExpClover * (1.0/diag_mass);
  }
  // Not implemented for the exponentiated clover: asserts in debug builds.
  // With assertions disabled it returns lambda unchanged.
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    assert(0);
    return lambda;
  }
 };
 //////////////////////////////////
 // Compact Standard Clover
 //////////////////////////////////
 // Helpers for the standard clover term when the operator stores it in the compact
 // (diagonal + triangle) layout.
 template<class Impl>
 class CompactCloverHelpers: public CompactWilsonCloverHelpers<Impl>,
                            public WilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);
  typedef WilsonCloverHelpers<Impl> Helpers;
  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
  // Adds diag_mass to Clover in place. CloverInv and csw_t are not used here.
  static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
    Clover += diag_mass;
  }
  // Inverts the compact representation: (diagonal, triangle) -> (diagonalInv, triangleInv)
  // via CompactHelpers::Invert. InvClover and fixedBoundaries are ignored by this variant.
  static void InvertClover(CloverField& InvClover,
                            const CloverDiagonalField& diagonal,
                            const CloverTriangleField& triangle,
                            CloverDiagonalField&       diagonalInv,
                            CloverTriangleField&       triangleInv,
                            bool fixedBoundaries) {
    CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
  }
  // TODO: implement Cmunu for better performances with compact layout, but don't do it
  // here, but rather in WilsonCloverHelpers.h -> CompactWilsonCloverHelpers
  // For now this forwards unchanged to WilsonCloverHelpers<Impl>::Cmunu.
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    return Helpers::Cmunu(U, lambda, mu, nu);
  }
 };
 //////////////////////////////////
 // Compact Exp Clover
 //////////////////////////////////
 // Helpers for the exponentiated clover term when the operator stores it in the
 // compact (diagonal + triangle) layout. IdentityTimesC, getNMAX and the series
 // construction duplicate the code in ExpCloverHelpers.
 template<class Impl>
 class CompactExpCloverHelpers: public CompactWilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);
  template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
  // Can this be avoided?
  // Writes c onto every diagonal (spin,colour) entry of `in`; off-diagonal entries are
  // left as they are. NOTE(review): `in` is taken by const reference yet is written
  // through an AcceleratorWrite view.
  static void IdentityTimesC(const CloverField& in, RealD c) {
    int DimRep = Impl::Dimension;
    autoView(in_v, in, AcceleratorWrite);
    accelerator_for(ss, in.Grid()->oSites(), 1, {
      for (int sa=0; sa<Ns; sa++)
        for (int ca=0; ca<DimRep; ca++)
          in_v[ss]()(sa,sa)(ca,ca) = c;
    });
  }
  // Returns the truncation order for the series: starting from NMAX=1 and
  // cond=R^2/2, NMAX is incremented and cond multiplied by R/(NMAX+1) until
  // cond*exp(R) <= prec.
  static int getNMAX(RealD prec, RealD R) {
    /* compute stop condition for exponential */
    int NMAX=1;
    RealD cond=R*R/2.;
    while (cond*std::exp(R)>prec) {
      NMAX++;
      cond*=R/(double)(NMAX+1);
    }
    return NMAX;
  }
  // Overloads that pick prec from the field's scalar type: 1e-12 for vComplexD, 1e-6 for vComplexF.
  // The field argument is used for overload selection only.
  static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
  static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
  // On entry Clover holds the clover term X. On exit
  //   Clover    = diag_mass * sum_{n<=NMAX} (X/diag_mass)^n / n!
  //   CloverInv = (1/diag_mass) * sum_{n<=NMAX} (-X/diag_mass)^n / n!
  // with NMAX chosen by getNMAX for R = 3*csw_t/diag_mass.
  static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
    GridBase* grid = Clover.Grid();
    CloverField ExpClover(grid);
    int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);
    Clover *= (1.0/diag_mass);
    // Taylor expansion, slow but generic
    // Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
    // qN = cN
    // qn = cn + qn+1 X
    // cn[i] = 1/i!
    std::vector<RealD> cn(NMAX+1);
    cn[0] = 1.0;
    for (int i=1; i<=NMAX; i++)
      cn[i] = cn[i-1] / RealD(i);
    ExpClover = Zero();
    IdentityTimesC(ExpClover, cn[NMAX]);
    // NOTE(review): "+ cn[i]" presumably adds cn[i] times the identity - confirm against the Lattice operators
    for (int i=NMAX-1; i>=0; i--)
      ExpClover = ExpClover * Clover + cn[i];
    // prepare inverse
    // CloverInv temporarily holds -X/diag_mass, the argument of the inverse series
    CloverInv = (-1.0)*Clover;
    Clover = ExpClover * diag_mass;
    ExpClover = Zero();
    IdentityTimesC(ExpClover, cn[NMAX]);
    for (int i=NMAX-1; i>=0; i--)
      ExpClover = ExpClover * CloverInv + cn[i];
    CloverInv = ExpClover * (1.0/diag_mass);
  }
  // Produces the compact inverse (diagonalInv, triangleInv):
  //   fixedBoundaries == true  : invert the compact fields with CompactHelpers::Invert;
  //   fixedBoundaries == false : convert InvClover into the compact layout
  //                              (presumably the series inverse from InstantiateClover - confirm at the call site).
  static void InvertClover(CloverField& InvClover,
                            const CloverDiagonalField& diagonal,
                            const CloverTriangleField& triangle,
                            CloverDiagonalField&       diagonalInv,
                            CloverTriangleField&       triangleInv,
                            bool fixedBoundaries) {
    if (fixedBoundaries)
    {
      CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
    }
    else
    {
      CompactHelpers::ConvertLayout(InvClover, diagonalInv, triangleInv);
    }
  }
  // Not implemented for the exponentiated clover: asserts in debug builds.
  // With assertions disabled it returns lambda unchanged.
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    assert(0);
    return lambda;
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h
@@ -1,241 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion.h
    Copyright (C) 2020 - 2022
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    Author: Nils Meyer <nils.meyer@ur.de>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 #include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
 // see Grid/qcd/action/fermion/WilsonCloverFermion.h for description
 //
 // Modifications done here:
 //
 // Original: clover term = 12x12 matrix per site
 //
 // But: Only two diagonal 6x6 hermitian blocks are non-zero (also true for original, verified by running)
 // Sufficient to store/transfer only the real parts of the diagonal and one triangular part
 // 2 * (6 + 15 * 2) = 72 real or 36 complex words to be stored/transfered
 //
 // Here: Above but diagonal as complex numbers, i.e., need to store/transfer
 // 2 * (6 * 2 + 15 * 2) = 84 real or 42 complex words
 //
 // Words per site and improvement compared to original (combined with the input and output spinors):
 //
 // - Original: 2*12 + 12*12 = 168 words -> 1.00 x less
 // - Minimal:  2*12 + 36    =  60 words -> 2.80 x less
 // - Here:     2*12 + 42    =  66 words -> 2.55 x less
 //
 // These improvements directly translate to wall-clock time
 //
 // Data layout:
 //
 // - diagonal and triangle part as separate lattice fields,
 //   this was faster than as 1 combined field on all tested machines
 // - diagonal: as expected
 // - triangle: store upper right triangle in row major order
 // - graphical:
 //        0  1  2  3  4
 //           5  6  7  8
 //              9 10 11 = upper right triangle indices
 //                12 13
 //                   14
 //     0
 //        1
 //           2
 //              3       = diagonal indices
 //                 4
 //                    5
 //     0
 //     1  5
 //     2  6  9          = lower left triangle indices
 //     3  7 10 12
 //     4  8 11 13 14
 //
 // Impact on total memory consumption:
 // - Original: (2 * 1 + 8 * 1/2) 12x12 matrices = 6 12x12 matrices = 864 complex words per site
 // - Here:     (2 * 1 + 4 * 1/2) diagonal parts = 4 diagonal parts =  24 complex words per site
 //           + (2 * 1 + 4 * 1/2) triangle parts = 4 triangle parts =  60 complex words per site
 //                                                                 =  84 complex words per site
 // Wilson clover fermion operator that stores the clover term in the compact
 // diagonal + triangle layout. The CloverHelpers template parameter supplies the
 // clover-specific helper policy.
 template<class Impl, class CloverHelpers>
 class CompactWilsonCloverFermion : public WilsonFermion<Impl>,
                                   public WilsonCloverHelpers<Impl>,
                                   public CompactWilsonCloverHelpers<Impl> {
  /////////////////////////////////////////////
  // Sizes
  /////////////////////////////////////////////
 public:
  INHERIT_COMPACT_CLOVER_SIZES(Impl);
  /////////////////////////////////////////////
  // Type definitions
  /////////////////////////////////////////////
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);
  typedef WilsonFermion<Impl>              WilsonBase;
  typedef WilsonCloverHelpers<Impl>        Helpers;
  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
  /////////////////////////////////////////////
  // Constructors
  /////////////////////////////////////////////
 public:
  // Defaults: csw_r = csw_t = 0, cF = 1, default anisotropy and implementation parameters.
  CompactWilsonCloverFermion(GaugeField& _Umu,
                            GridCartesian& Fgrid,
                            GridRedBlackCartesian& Hgrid,
                            const RealD _mass,
                            const RealD _csw_r = 0.0,
                            const RealD _csw_t = 0.0,
                            const RealD _cF = 1.0,
                            const WilsonAnisotropyCoefficients& clover_anisotropy = WilsonAnisotropyCoefficients(),
                            const ImplParams& impl_p = ImplParams());
  /////////////////////////////////////////////
  // Member functions (implementing interface)
  /////////////////////////////////////////////
 public:
  virtual void Instantiatable() {};
  int          ConstEE()     override { return 0; };
  int          isTrivialEE() override { return 0; };
  void Dhop(const FermionField& in, FermionField& out, int dag) override;
  void DhopOE(const FermionField& in, FermionField& out, int dag) override;
  void DhopEO(const FermionField& in, FermionField& out, int dag) override;
  void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
  void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;
  void M(const FermionField& in, FermionField& out) override;
  void Mdag(const FermionField& in, FermionField& out) override;
  void Meooe(const FermionField& in, FermionField& out) override;
  void MeooeDag(const FermionField& in, FermionField& out) override;
  void Mooee(const FermionField& in, FermionField& out) override;
  void MooeeDag(const FermionField& in, FermionField& out) override;
  void MooeeInv(const FermionField& in, FermionField& out) override;
  void MooeeInvDag(const FermionField& in, FermionField& out) override;
  void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
  void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;
  void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
  void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
  void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
  /////////////////////////////////////////////
  // Member functions (internals)
  /////////////////////////////////////////////
  void MooeeInternal(const FermionField&        in,
                     FermionField&              out,
                     const CloverDiagonalField& diagonal,
                     const CloverTriangleField& triangle);
  /////////////////////////////////////////////
  // Helpers
  /////////////////////////////////////////////
  void ImportGauge(const GaugeField& _Umu) override;
  /////////////////////////////////////////////
  // Boundary mask helpers
  /////////////////////////////////////////////
 private:
  // Selects the mask matching the field's grid: the even/odd mask for a
  // checkerboarded grid (odd if in.Checkerboard()==Odd, even otherwise),
  // else the full-grid mask. Never returns null.
  template<class Field>
  const MaskField* getCorrectMaskField(const Field &in) const {
    if(in.Grid()->_isCheckerBoarded) {
      if(in.Checkerboard() == Odd) {
        return &this->BoundaryMaskOdd;
      } else {
        return &this->BoundaryMaskEven;
      }
    } else {
      return &this->BoundaryMask;
    }
  }
  // Applies the mask chosen by getCorrectMaskField to f, in place.
  template<class Field>
  void ApplyBoundaryMask(Field& f) {
    const MaskField* m = getCorrectMaskField(f);
    assert(m != nullptr);   // the original asserted this twice; once is enough
    CompactHelpers::ApplyBoundaryMask(f, *m);
  }
  /////////////////////////////////////////////
  // Member Data
  /////////////////////////////////////////////
 public:
  RealD csw_r;
  RealD csw_t;
  RealD cF;
  bool fixedBoundaries;
  // Compact clover term and its inverse, each on the full grid and on both checkerboards
  CloverDiagonalField Diagonal,    DiagonalEven,    DiagonalOdd;
  CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
  CloverTriangleField Triangle,    TriangleEven,    TriangleOdd;
  CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
  FermionField Tmp;
  MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/DirichletFermionOperator.h
+++ b/Grid/qcd/action/fermion/DirichletFermionOperator.h
@@ -0,0 +1,185 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/DirichletFermionOperator.h
    Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 ////////////////////////////////////////////////////////////////
 // Wrap a fermion operator in Dirichlet BC's at node boundary
 //
 // The wrapper holds a reference to an existing FermionOperator and forwards
 // the FermionOperator interface to it. The only place it alters data is
 // ImportGauge(), where a DirichletFilter built from `Block` is applied to a
 // copy of the gauge field before that copy is handed to the wrapped operator.
 //
 // The wrapped operator is held by reference and is not owned: it must
 // outlive this object.
 ////////////////////////////////////////////////////////////////
 template<class Impl>
 class DirichletFermionOperator : public FermionOperator<Impl>
 {
 public:
  INHERIT_IMPL_TYPES(Impl);
  // Data members (declaration order is the initialisation order)
  int CommsMode;                       // WilsonKernelsStatic::Comms as read in the constructor
  Coordinate Block;                    // block extent per dimension
  DirichletFilter<GaugeField> Filter;  // constructed from Block; applied in ImportGauge
  FermionOperator<Impl> & FermOp;      // wrapped operator (not owned)
  // Constructor / bespoke
  //   _FermOp : operator to wrap, held by reference.
  //   _Block  : block extent per dimension; must have as many entries as the
  //             gauge grid has dimensions. It is copied, so a const reference suffices.
  // The initialisers are written in member declaration order, which is the
  // order in which they run: Block is initialised before Filter(Block).
  DirichletFermionOperator(FermionOperator<Impl> & _FermOp, const Coordinate &_Block)
    : Block(_Block), Filter(Block), FermOp(_FermOp)
  {
    // Save what the comms mode should be under normal BCs
    CommsMode = WilsonKernelsStatic::Comms;
    assert((CommsMode == WilsonKernelsStatic::CommsAndCompute)
         ||(CommsMode == WilsonKernelsStatic::CommsThenCompute));
    // Check the block size divides local lattice
    GridBase *grid = FermOp.GaugeGrid();
    int blocks_per_rank = 1;
    const Coordinate &LocalDims = grid->LocalDimensions();
    const Coordinate &GlobalDims= grid->GlobalDimensions();
    assert(Block.size()==LocalDims.size());
    const int ndim = LocalDims.size();
    for(int d=0;d<ndim;d++){
      // A dimension is only checked when its block extent is non-zero and
      // does not exceed the global extent.
      if (Block[d]&&(Block[d]<=GlobalDims[d])){
 	int r = LocalDims[d] % Block[d];
 	assert(r == 0);
 	blocks_per_rank *= (LocalDims[d] / Block[d]);
      }
    }
    // NOTE(review): the intent recorded here was "even blocks per node required
    // (could be relaxed but inefficient use of hardware as idle nodes in boundary
    // operator R)", but the assertion below only checks for a non-zero count --
    // confirm which constraint is wanted.
    assert( blocks_per_rank != 0);
    // Possible checks that SIMD lanes are used with full occupancy???
  };
  virtual ~DirichletFermionOperator(void) = default;
  // Bracket calls placed around the forwarded operator applications.
  // At present DirichletOn only asserts that the global comms mode is not
  // already CommsDirichlet; the actual mode switch is commented out, and
  // DirichletOff does nothing.
  void DirichletOn(void)   {
    assert(WilsonKernelsStatic::Comms!= WilsonKernelsStatic::CommsDirichlet);
    //    WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsDirichlet;
  }
  void DirichletOff(void)  {
    //    assert(WilsonKernelsStatic::Comms== WilsonKernelsStatic::CommsDirichlet);
    //    WilsonKernelsStatic::Comms = CommsMode;
  }
  // Implement the full interface
  virtual FermionField &tmp(void) { return FermOp.tmp(); };
  virtual GridBase *FermionGrid(void)         { return FermOp.FermionGrid(); }
  virtual GridBase *FermionRedBlackGrid(void) { return FermOp.FermionRedBlackGrid(); }
  virtual GridBase *GaugeGrid(void)           { return FermOp.GaugeGrid(); }
  virtual GridBase *GaugeRedBlackGrid(void)   { return FermOp.GaugeRedBlackGrid(); }
  // override multiply
  virtual void  M    (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.M(in,out);    DirichletOff();  };
  virtual void  Mdag (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.Mdag(in,out); DirichletOff();  };
  // half checkerboard operations
  virtual void   Meooe       (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.Meooe(in,out);    DirichletOff(); };
  virtual void   MeooeDag    (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.MeooeDag(in,out); DirichletOff(); };
  virtual void   Mooee       (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.Mooee(in,out);    DirichletOff(); };
  virtual void   MooeeDag    (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.MooeeDag(in,out); DirichletOff(); };
  virtual void   MooeeInv    (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.MooeeInv(in,out); DirichletOff(); };
  virtual void   MooeeInvDag (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.MooeeInvDag(in,out); DirichletOff(); };
  // non-hermitian hopping term; half cb or both
  virtual void Dhop  (const FermionField &in, FermionField &out,int dag) { DirichletOn(); FermOp.Dhop(in,out,dag);    DirichletOff(); };
  virtual void DhopOE(const FermionField &in, FermionField &out,int dag) { DirichletOn(); FermOp.DhopOE(in,out,dag);  DirichletOff(); };
  virtual void DhopEO(const FermionField &in, FermionField &out,int dag) { DirichletOn(); FermOp.DhopEO(in,out,dag);  DirichletOff(); };
  virtual void DhopDir(const FermionField &in, FermionField &out,int dir,int disp) { DirichletOn(); FermOp.DhopDir(in,out,dir,disp);  DirichletOff(); };
  // force terms; five routines; default to Dhop on diagonal
  // These (and Mdir/MdirAll below) are forwarded directly, without the
  // DirichletOn/DirichletOff bracket.
  virtual void MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.MDeriv(mat,U,V,dag);};
  virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.MoeDeriv(mat,U,V,dag);};
  virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.MeoDeriv(mat,U,V,dag);};
  virtual void MooDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.MooDeriv(mat,U,V,dag);};
  virtual void MeeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.MeeDeriv(mat,U,V,dag);};
  virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.DhopDeriv(mat,U,V,dag);};
  virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.DhopDerivEO(mat,U,V,dag);};
  virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.DhopDerivOE(mat,U,V,dag);};
  // Mdiag goes through this class's Mooee, i.e. inside the DirichletOn/Off bracket
  virtual void  Mdiag  (const FermionField &in, FermionField &out) { Mooee(in,out);};
  virtual void  Mdir   (const FermionField &in, FermionField &out,int dir,int disp){FermOp.Mdir(in,out,dir,disp);};
  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out)    {FermOp.MdirAll(in,out);};
  ///////////////////////////////////////////////
  // Updates gauge field during HMC
  ///////////////////////////////////////////////
  DoubledGaugeField &GetDoubledGaugeField(void){ return FermOp.GetDoubledGaugeField(); };
  DoubledGaugeField &GetDoubledGaugeFieldE(void){ return FermOp.GetDoubledGaugeFieldE(); };
  DoubledGaugeField &GetDoubledGaugeFieldO(void){ return FermOp.GetDoubledGaugeFieldO(); };
  // Import a gauge field into the wrapped operator after filtering.
  // The caller's field is const, so the filter is applied to a local copy.
  virtual void ImportGauge(const GaugeField & _U)
  {
    GaugeField U = _U;
    // Filter gauge field to apply Dirichlet
    Filter.applyFilter(U);
    FermOp.ImportGauge(U);
  }
  ///////////////////////////////////////////////
  // Physical field import/export
  ///////////////////////////////////////////////
  virtual void Dminus(const FermionField &psi, FermionField &chi)    { FermOp.Dminus(psi,chi); }
  virtual void DminusDag(const FermionField &psi, FermionField &chi) { FermOp.DminusDag(psi,chi); }
  virtual void ImportFourDimPseudoFermion(const FermionField &input,FermionField &imported)   { FermOp.ImportFourDimPseudoFermion(input,imported);}
  virtual void ExportFourDimPseudoFermion(const FermionField &solution,FermionField &exported){ FermOp.ExportFourDimPseudoFermion(solution,exported);}
  virtual void ImportPhysicalFermionSource(const FermionField &input,FermionField &imported)  { FermOp.ImportPhysicalFermionSource(input,imported);}
  virtual void ImportUnphysicalFermion(const FermionField &input,FermionField &imported)      { FermOp.ImportUnphysicalFermion(input,imported);}
  virtual void ExportPhysicalFermionSolution(const FermionField &solution,FermionField &exported) {FermOp.ExportPhysicalFermionSolution(solution,exported);}
  virtual void ExportPhysicalFermionSource(const FermionField &solution,FermionField &exported)   {FermOp.ExportPhysicalFermionSource(solution,exported);}
  //////////////////////////////////////////////////////////////////////
  // Should never be used
  // Each of these is assert(0): it aborts when assertions are enabled and
  // does nothing (outputs left untouched) when they are compiled out.
  //////////////////////////////////////////////////////////////////////
  virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);};
  virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary,std::vector<double> twist) {assert(0);}
  virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) { assert(0);}
  virtual void ContractConservedCurrent(PropagatorField &q_in_1,
 					PropagatorField &q_in_2,
 					PropagatorField &q_out,
 					PropagatorField &phys_src,
 					Current curr_type,
 					unsigned int mu)
  {assert(0);};
  virtual void SeqConservedCurrent(PropagatorField &q_in, 
 				   PropagatorField &q_out,
 				   PropagatorField &phys_src,
 				   Current curr_type,
 				   unsigned int mu,
 				   unsigned int tmin, 
 				   unsigned int tmax,
 				   ComplexField &lattice_cmplx)
  {assert(0);};
  // Only reimplemented in Wilson5D
  // Default to just a zero correlation function
  virtual void ContractJ5q(FermionField &q_in   ,ComplexField &J5q) { J5q=Zero(); };
  virtual void ContractJ5q(PropagatorField &q_in,ComplexField &J5q) { J5q=Zero(); };
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/Fermion.h
+++ b/Grid/qcd/action/fermion/Fermion.h
@@ -53,7 +53,6 @@ NAMESPACE_CHECK(Wilson);
 #include <Grid/qcd/action/fermion/WilsonTMFermion.h>       // 4d wilson like
 NAMESPACE_CHECK(WilsonTM);
 #include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
 NAMESPACE_CHECK(WilsonClover);
 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
 NAMESPACE_CHECK(Wilson5D);
@@ -102,6 +101,12 @@ NAMESPACE_CHECK(WilsonTM5);
 #include <Grid/qcd/action/fermion/PauliVillarsInverters.h>
 #include <Grid/qcd/action/fermion/Reconstruct5Dprop.h>
 #include <Grid/qcd/action/fermion/MADWF.h>
 ////////////////////////////////////////////////////////////////////
 // DDHMC related 
 ////////////////////////////////////////////////////////////////////
 #include <Grid/qcd/action/fermion/DirichletFermionOperator.h>
 #include <Grid/qcd/action/fermion/SchurFactoredFermionOperator.h>
 NAMESPACE_CHECK(DWFutils);
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -138,52 +143,21 @@ typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
 typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
 // Clover fermions
-template <typename WImpl> using WilsonClover = WilsonCloverFermion<WImpl, CloverHelpers<WImpl>>;
+typedef WilsonCloverFermion<WilsonImplR> WilsonCloverFermionR;
-template <typename WImpl> using WilsonExpClover = WilsonCloverFermion<WImpl, ExpCloverHelpers<WImpl>>;
+typedef WilsonCloverFermion<WilsonImplF> WilsonCloverFermionF;
 typedef WilsonCloverFermion<WilsonImplD> WilsonCloverFermionD;
-typedef WilsonClover<WilsonImplR> WilsonCloverFermionR;
+typedef WilsonCloverFermion<WilsonAdjImplR> WilsonCloverAdjFermionR;
-typedef WilsonClover<WilsonImplF> WilsonCloverFermionF;
+typedef WilsonCloverFermion<WilsonAdjImplF> WilsonCloverAdjFermionF;
-typedef WilsonClover<WilsonImplD> WilsonCloverFermionD;
+typedef WilsonCloverFermion<WilsonAdjImplD> WilsonCloverAdjFermionD;
-typedef WilsonExpClover<WilsonImplR> WilsonExpCloverFermionR;
+typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
-typedef WilsonExpClover<WilsonImplF> WilsonExpCloverFermionF;
+typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
-typedef WilsonExpClover<WilsonImplD> WilsonExpCloverFermionD;
+typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
-typedef WilsonClover<WilsonAdjImplR> WilsonCloverAdjFermionR;
+typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
-typedef WilsonClover<WilsonAdjImplF> WilsonCloverAdjFermionF;
+typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
-typedef WilsonClover<WilsonAdjImplD> WilsonCloverAdjFermionD;
+typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
 typedef WilsonClover<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
 typedef WilsonClover<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
 typedef WilsonClover<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
 typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
 typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
 typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
 // Compact Clover fermions
 template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
 template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
 typedef CompactWilsonClover<WilsonImplR> CompactWilsonCloverFermionR;
 typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
 typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
 typedef CompactWilsonExpClover<WilsonImplR> CompactWilsonExpCloverFermionR;
 typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
 typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
 typedef CompactWilsonClover<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
 typedef CompactWilsonClover<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
 typedef CompactWilsonClover<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
 typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
 typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
 typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
 typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
 typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
 typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
 // Domain Wall fermions
 typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
--- a/Grid/qcd/action/fermion/FermionCore.h
+++ b/Grid/qcd/action/fermion/FermionCore.h
@@ -25,8 +25,7 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#ifndef  GRID_QCD_FERMION_CORE_H
+#pragma once
 #define  GRID_QCD_FERMION_CORE_H
 #include <Grid/GridCore.h>
 #include <Grid/GridQCDcore.h>
@@ -45,4 +44,3 @@ NAMESPACE_CHECK(FermionOperator);
 #include <Grid/qcd/action/fermion/StaggeredKernels.h>        //used by all wilson type fermions
 NAMESPACE_CHECK(Kernels);
 #endif
--- a/Grid/qcd/action/fermion/FermionOperator.h
+++ b/Grid/qcd/action/fermion/FermionOperator.h
@@ -140,6 +140,9 @@ public:
  // Updates gauge field during HMC
  ///////////////////////////////////////////////
  virtual void ImportGauge(const GaugeField & _U)=0;
  virtual DoubledGaugeField &GetDoubledGaugeField(void)  =0;
  virtual DoubledGaugeField &GetDoubledGaugeFieldE(void)  =0;
  virtual DoubledGaugeField &GetDoubledGaugeFieldO(void)  =0;
  //////////////////////////////////////////////////////////////////////
  // Conserved currents, either contract at sink or insert sequentially.
@@ -171,6 +174,16 @@ public:
      ///////////////////////////////////////////////
      virtual void Dminus(const FermionField &psi, FermionField &chi)    { chi=psi; }
      virtual void DminusDag(const FermionField &psi, FermionField &chi) { chi=psi; }
      virtual void ImportFourDimPseudoFermion(const FermionField &input,FermionField &imported)
      {
 	imported = input;
      };
      virtual void ExportFourDimPseudoFermion(const FermionField &solution,FermionField &exported)
      {
 	exported=solution;
      };
      virtual void ImportPhysicalFermionSource(const FermionField &input,FermionField &imported)
      {
 	imported = input;
--- a/Grid/qcd/action/fermion/GparityWilsonImpl.h
+++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h
@@ -30,6 +30,18 @@ directory
 NAMESPACE_BEGIN(Grid);
 /*
  Policy implementation for G-parity boundary conditions
  Rather than treating the gauge field as a flavored field, the Grid implementation of G-parity treats the gauge field as a regular
  field with complex conjugate boundary conditions. In order to ensure the second flavor interacts with the conjugate links and the first
  with the regular links we overload the functionality of doubleStore, whose purpose is to store the gauge field and the barrel-shifted gauge field
  to avoid communicating links when applying the Dirac operator, such that the double-stored field contains also a flavor index which maps to
  either the link or the conjugate link. This flavored field is then used by multLink to apply the correct link to a spinor.
  Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
  mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
 */
 template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
 class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
 public:
@@ -113,7 +125,7 @@ public:
    || ((distance== 1)&&(icoor[direction]==1))
    || ((distance==-1)&&(icoor[direction]==0));
-    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world
+    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu] && mmu < Nd-1; //only if we are going around the world in a spatial direction
    //Apply the links
    int f_upper = permute_lane ? 1 : 0;
@@ -139,10 +151,10 @@ public:
    assert((distance == 1) || (distance == -1));  // nearest neighbour stencil hard code
    assert((sl == 1) || (sl == 2));
-    if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
+    //If this site is a global boundary site, perform the G-parity flavor twist
-
+    if ( mmu < Nd-1 && SE->_around_the_world && St.parameters.twists[mmu] ) {
      if ( sl == 2 ) {
-       
+	//Only do the twist for lanes on the edge of the physical node
 	ExtractBuffer<sobj> vals(Nsimd);
 	extract(chi,vals);
@@ -197,6 +209,19 @@ public:
    reg = memory;
  }
  //Write link field 'poke_f0' into flavor slot 0 and 'poke_f1' into flavor slot 1, for direction index mu, of the doubled gauge field Uds
  inline void pokeGparityDoubledGaugeField(DoubledGaugeField &Uds, const GaugeLinkField &poke_f0, const GaugeLinkField &poke_f1, const int mu){
    //Host-side views: read access on the two sources, write access on the destination
    autoView(src0_v, poke_f0, CpuRead);
    autoView(src1_v, poke_f1, CpuRead);
    autoView(dst_v, Uds, CpuWrite);
    //Iterate over the sites of the flavor-0 source view
    thread_foreach(site,src0_v,{
 	dst_v[site](0)(mu) = src0_v[site]();
 	dst_v[site](1)(mu) = src1_v[site]();
      });
  }
  inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
  {
    conformable(Uds.Grid(),GaugeGrid);
@@ -208,13 +233,18 @@ public:
    Lattice<iScalar<vInteger> > coor(GaugeGrid);
-    for(int mu=0;mu<Nd;mu++){
+    //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
    //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs        
    for(int mu=0;mu<Nd-1;mu++){
-      LatticeCoordinate(coor,mu);
+      if( Params.twists[mu] ){
 	LatticeCoordinate(coor,mu);
      }
      U     = PeekIndex<LorentzIndex>(Umu,mu);
      Uconj = conjugate(U);
      // Implement the isospin rotation sign on the boundary between f=1 and f=0
      // This phase could come from a simple bc 1,1,-1,1 ..
      int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
      if ( Params.twists[mu] ) { 
@@ -229,7 +259,7 @@ public:
 	thread_foreach(ss,U_v,{
 	    Uds_v[ss](0)(mu) = U_v[ss]();
 	    Uds_v[ss](1)(mu) = Uconj_v[ss]();
-	  });
+	});
      }
      U     = adj(Cshift(U    ,mu,-1));      // correct except for spanning the boundary
@@ -260,6 +290,38 @@ public:
        });
      }
    }
    { //periodic / antiperiodic temporal BCs
      int mu = Nd-1;
      int L   = GaugeGrid->GlobalDimensions()[mu];
      int Lmu = L - 1;
      LatticeCoordinate(coor, mu);
      U = PeekIndex<LorentzIndex>(Umu, mu); //Get t-directed links
      GaugeLinkField *Upoke = &U;
      if(Params.twists[mu]){ //antiperiodic
 	Utmp =  where(coor == Lmu, -U, U);
 	Upoke = &Utmp;
      }
      Uconj = conjugate(*Upoke); //second flavor interacts with conjugate links      
      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu);
      //Get the barrel-shifted field
      Utmp = adj(Cshift(U, mu, -1)); //is a forward shift!
      Upoke = &Utmp;
      if(Params.twists[mu]){
 	U = where(coor == 0, -Utmp, Utmp);  //boundary phase
 	Upoke = &U;
      }
      Uconj = conjugate(*Upoke);
      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4);
    }
  }
  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
@@ -300,27 +362,47 @@ public:
  }
  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
    int Ls=Btilde.Grid()->_fdimensions[0];
    int Ls = Btilde.Grid()->_fdimensions[0];
    GaugeLinkField tmp(mat.Grid());
    tmp = Zero();
    {
-      autoView( tmp_v , tmp, CpuWrite);
+      GridBase *GaugeGrid = mat.Grid();
-      autoView( Atilde_v , Atilde, CpuRead);
+      Lattice<iScalar<vInteger> > coor(GaugeGrid);
-      autoView( Btilde_v , Btilde, CpuRead);
+
-      thread_for(ss,tmp.Grid()->oSites(),{
+      if( Params.twists[mu] ){
-	  for (int s = 0; s < Ls; s++) {
+	LatticeCoordinate(coor,mu);
-	    int sF = s + Ls * ss;
+      }
-	    auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
+
-	    tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
+      autoView( mat_v , mat, AcceleratorWrite);
-	  }
+      autoView( Btilde_v , Btilde, AcceleratorRead);
-	});
+      autoView( Atilde_v , Atilde, AcceleratorRead);
      accelerator_for(sss,mat.Grid()->oSites(), FermionField::vector_type::Nsimd(),{	  
  	  int sU=sss;
  	  typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
  	  ColorMatrixType sum;
  	  zeroit(sum);
  	  for(int s=0;s<Ls;s++){
  	    int sF = s+Ls*sU;
  	    for(int spn=0;spn<Ns;spn++){ //sum over spin
 	      //Flavor 0
  	      auto bb = coalescedRead(Btilde_v[sF](0)(spn) ); //color vector
  	      auto aa = coalescedRead(Atilde_v[sF](0)(spn) );
  	      sum = sum + outerProduct(bb,aa);
  	      //Flavor 1
  	      bb = coalescedRead(Btilde_v[sF](1)(spn) );
  	      aa = coalescedRead(Atilde_v[sF](1)(spn) );
  	      sum = sum + conjugate(outerProduct(bb,aa));
  	    }
  	  }	    
  	  coalescedWrite(mat_v[sU](mu)(), sum);
  	});
    }
    PokeIndex<LorentzIndex>(mat, tmp, mu);
    return;
  }
 };
 typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffReal> GparityWilsonImplR;  // Real.. whichever prec
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
@@ -141,8 +141,11 @@ public:
  void ImportGauge(const GaugeField &_Uthin, const GaugeField &_Ufat);
  void ImportGaugeSimple(const GaugeField &_UUU    ,const GaugeField &_U);
  void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
-  DoubledGaugeField &GetU(void)   { return Umu ; } ;
+  virtual DoubledGaugeField &GetDoubledGaugeField(void)  override { return Umu; };
-  DoubledGaugeField &GetUUU(void) { return UUUmu; };
+  virtual DoubledGaugeField &GetDoubledGaugeFieldE(void) override { return UmuEven; };
  virtual DoubledGaugeField &GetDoubledGaugeFieldO(void) override { return UmuOdd; };
  virtual DoubledGaugeField &GetU(void)   { return Umu ; } ;
  virtual DoubledGaugeField &GetUUU(void) { return UUUmu; };
  void CopyGaugeCheckerboards(void);
  ///////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@@ -160,16 +160,19 @@ public:
 			       RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0,
 			     const ImplParams &p= ImplParams());
-    // DoubleStore gauge field in operator
+  // DoubleStore gauge field in operator
-    void ImportGauge      (const GaugeField &_Uthin ) { assert(0); }
+  void ImportGauge      (const GaugeField &_Uthin ) { assert(0); }
  void ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat);
-    void ImportGaugeSimple(const GaugeField &_UUU,const GaugeField &_U);
+  void ImportGaugeSimple(const GaugeField &_UUU,const GaugeField &_U);
-    void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
+  void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
-    // Give a reference; can be used to do an assignment or copy back out after import
+  // Give a reference; can be used to do an assignment or copy back out after import
-    // if Carleton wants to cache them and not use the ImportSimple
+  // if Carleton wants to cache them and not use the ImportSimple
-    DoubledGaugeField &GetU(void)   { return Umu ; } ;
+  virtual DoubledGaugeField &GetDoubledGaugeField(void)  override { return Umu; };
-    DoubledGaugeField &GetUUU(void) { return UUUmu; };
+  virtual DoubledGaugeField &GetDoubledGaugeFieldE(void) override { return UmuEven; };
-    void CopyGaugeCheckerboards(void);
+  virtual DoubledGaugeField &GetDoubledGaugeFieldO(void) override { return UmuOdd; };
  DoubledGaugeField &GetU(void)   { return Umu ; } ;
  DoubledGaugeField &GetUUU(void) { return UUUmu; };
  void CopyGaugeCheckerboards(void);
  ///////////////////////////////////////////////////////////////
  // Data members require to support the functionality
--- a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
@@ -135,6 +135,9 @@ public:
  // DoubleStore impl dependent
  void ImportGauge      (const GaugeField &_U );
  DoubledGaugeField &GetDoubledGaugeField(void){ return Umu; };
  DoubledGaugeField &GetDoubledGaugeFieldE(void){ return UmuEven; };
  DoubledGaugeField &GetDoubledGaugeFieldO(void){ return UmuOdd; };
  DoubledGaugeField &GetU(void)   { return Umu ; } ;
  void CopyGaugeCheckerboards(void);
--- a/Grid/qcd/action/fermion/SchurFactoredFermionOperator.h
+++ b/Grid/qcd/action/fermion/SchurFactoredFermionOperator.h
@@ -0,0 +1,534 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/SchurFactoredFermionOperator.h
    Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 #include <Grid/qcd/utils/MixedPrecisionOperatorFunction.h>
 #include <Grid/qcd/action/domains/Domains.h>
 NAMESPACE_BEGIN(Grid);
  ////////////////////////////////////////////////////////
  // Some explanation of class structure for domain decomposition:
  //
  // Need a dirichlet operator for two flavour determinant - acts on both Omega and OmegaBar.
  //
  // Possible gain if the global sums and CG are run independently?? Could measure this.
  //
  // Types of operations
  //
  // 1) assemble local det dOmega det dOmegaBar pseudofermion
  //
  // - DirichletFermionOperator - can either do a global solve, or independent/per cell coefficients.
  //
  // 2) assemble dOmegaInverse and dOmegaBarInverse in R
  //
  // - DirichletFermionOperator - can also be used to 
  //                                       - need two or more cells per node. Options
  //                                       - a) solve one cell at a time, no new code, CopyRegion and reduced /split Grids
  //                                       - b) solve multiple cells in parallel. predicated dslash implementation
  //
  //                                       - b) has more parallelism; experience with block solvers suggests it might not be algorithmically inefficient
  //                                         a) has more cache friendly and easier code.
  //                                         b) is easy to implement in a "trial" or inefficient code with projection.
  //
  // 3)  Additional functionality for domain operations
  //
  // - SchurFactoredFermionOperator  - Need a DDHMC utility - whether used in two flavour or one flavour 
  //
  // - dBoundary - needs non-dirichlet operator
  // - Contains one Dirichlet Op, and one non-Dirichlet op. Implements dBoundary etc...
  // - The Dirichlet ops can be passed to dOmega(Bar) solvers etc...
  //
  ////////////////////////////////////////////////////////
 template<class ImplD,class ImplF>
 class SchurFactoredFermionOperator : public ImplD
 {
  INHERIT_IMPL_TYPES(ImplD);
  typedef typename ImplF::FermionField FermionFieldF;
  typedef typename ImplD::FermionField FermionFieldD;
  typedef SchurDiagMooeeOperator<FermionOperator<ImplD>,FermionFieldD> LinearOperatorD;
  typedef SchurDiagMooeeOperator<FermionOperator<ImplF>,FermionFieldF> LinearOperatorF;
  typedef SchurDiagMooeeDagOperator<FermionOperator<ImplD>,FermionFieldD> LinearOperatorDagD;
  typedef SchurDiagMooeeDagOperator<FermionOperator<ImplF>,FermionFieldF> LinearOperatorDagF;
  typedef MixedPrecisionConjugateGradientOperatorFunction<FermionOperator<ImplD>,
 							  FermionOperator<ImplF>,
 							  LinearOperatorD,
 							  LinearOperatorF> MxPCG;
  typedef MixedPrecisionConjugateGradientOperatorFunction<FermionOperator<ImplD>,
 							  FermionOperator<ImplF>,
 							  LinearOperatorDagD,
 							  LinearOperatorDagF> MxDagPCG;
 public:
  // Fermion grid, taken from the periodic double precision operator.
  GridBase *FermionGrid(void)
  {
    return PeriodicFermOpD.FermionGrid();
  }
  // Gauge grid, taken from the periodic double precision operator.
  GridBase *GaugeGrid(void)
  {
    return PeriodicFermOpD.GaugeGrid();
  }
  FermionOperator<ImplD> & DirichletFermOpD;
  FermionOperator<ImplF> & DirichletFermOpF;
  FermionOperator<ImplD> & PeriodicFermOpD; 
  FermionOperator<ImplF> & PeriodicFermOpF; 
  LinearOperatorD DirichletLinOpD;
  LinearOperatorF DirichletLinOpF;
  LinearOperatorD PeriodicLinOpD;
  LinearOperatorF PeriodicLinOpF;
  LinearOperatorDagD DirichletLinOpDagD;
  LinearOperatorDagF DirichletLinOpDagF;
  LinearOperatorDagD PeriodicLinOpDagD;
  LinearOperatorDagF PeriodicLinOpDagF;
  // Can tinker with these in the pseudofermion for force vs. action solves
  Integer maxinnerit;
  Integer maxouterit;
  RealD tol;
  RealD tolinner;
  Coordinate Block;
  DomainDecomposition Domains;
  // Bind the periodic and Dirichlet operators (double and single precision), build the
  // Schur linear operators (and their daggered variants) on top of them, and set the
  // default solver controls.
  //   _PeriodicFermOpD/F  : operators bound to the Periodic* reference members
  //   _DirichletFermOpD/F : operators bound to the Dirichlet* reference members
  //   _Block              : copied into Block, which is then used to construct Domains
  // The initialiser list is written in member declaration order, which is the order in
  // which the members are actually initialised: references first, then the LinOp
  // members built from them, then the solver controls, then Block and Domains(Block).
  SchurFactoredFermionOperator(FermionOperator<ImplD>  & _PeriodicFermOpD,
 			       FermionOperator<ImplF>  & _PeriodicFermOpF,
 			       FermionOperator<ImplD>  & _DirichletFermOpD,
 			       FermionOperator<ImplF>  & _DirichletFermOpF,
 			       Coordinate &_Block)
    : DirichletFermOpD(_DirichletFermOpD),
      DirichletFermOpF(_DirichletFermOpF),
      PeriodicFermOpD(_PeriodicFermOpD),
      PeriodicFermOpF(_PeriodicFermOpF),
      DirichletLinOpD(DirichletFermOpD),
      DirichletLinOpF(DirichletFermOpF),
      PeriodicLinOpD(PeriodicFermOpD),
      PeriodicLinOpF(PeriodicFermOpF),
      DirichletLinOpDagD(DirichletFermOpD),
      DirichletLinOpDagF(DirichletFermOpF),
      PeriodicLinOpDagD(PeriodicFermOpD),
      PeriodicLinOpDagF(PeriodicFermOpF),
      maxinnerit(1000),
      maxouterit(10),
      tol(1.0e-10),
      tolinner(1.0e-6),
      Block(_Block),
      Domains(Block)
  {
    // The periodic and Dirichlet operators of a given precision must share a fermion grid
    assert(PeriodicFermOpD.FermionGrid() == DirichletFermOpD.FermionGrid());
    assert(PeriodicFermOpF.FermionGrid() == DirichletFermOpF.FermionGrid());
  };
  enum Domain { Omega=0, OmegaBar=1 };
  void ImportGauge(const GaugeField &Umu)
  {
    // Single precision will update in the mixed prec CG
    PeriodicFermOpD.ImportGauge(Umu);
    GaugeField dUmu(Umu.Grid());
    dUmu=Umu;
    //    DirchletBCs(dUmu);
    DirichletFilter<GaugeField> Filter(Block);
    Filter.applyFilter(dUmu);
    DirichletFermOpD.ImportGauge(dUmu);
  }
 /*
  void ProjectBoundaryBothDomains (FermionField &f,int sgn)
  {
    assert((sgn==1)||(sgn==-1));
    Real rsgn = sgn;
    Gamma::Algebra Gmu [] = {
      Gamma::Algebra::GammaX,
      Gamma::Algebra::GammaY,
      Gamma::Algebra::GammaZ,
      Gamma::Algebra::GammaT
    };
    GridBase *grid = f.Grid();
    LatticeInteger  coor(grid);
    LatticeInteger  face(grid);
    LatticeInteger  one(grid); one = 1;
    LatticeInteger  zero(grid); zero = 0;
    LatticeInteger nface(grid); nface=Zero();
    FermionField projected(grid); projected=Zero();
    FermionField sp_proj  (grid);
    int dims = grid->Nd();
    int isDWF= (dims==Nd+1);
    assert((dims==Nd)||(dims==Nd+1));
    Coordinate Global=grid->GlobalDimensions();
    for(int mu=0;mu<Nd;mu++){
      if ( Block[mu] <= Global[mu+isDWF] ) {
 	// need to worry about DWF 5th dim first
 	LatticeCoordinate(coor,mu+isDWF); 
 	face = where(mod(coor,Block[mu]) == Integer(0),one,zero );
 	nface = nface + face;
 	Gamma G(Gmu[mu]);
 	// Lower face receives (1-gamma)/2 in normal forward hopping term
 	sp_proj  = 0.5*(f-G*f*rsgn);
 	projected= where(face,sp_proj,projected);
 	//projected= where(face,f,projected);
 	face = where(mod(coor,Block[mu]) == Integer(Block[mu]-1) ,one,zero );
 	nface = nface + face;
 	// Upper face receives (1+gamma)/2 in normal backward hopping term
 	sp_proj = 0.5*(f+G*f*rsgn);
 	projected= where(face,sp_proj,projected);
 	//projected= where(face,f,projected);
      }
    }
    // Initial Zero() where nface==0.
    // Keep the spin projected faces where nface==1
    // Full spinor where nface>=2
    projected = where(nface>Integer(1),f,projected);
    f=projected;
  }
 */
  // Replace f, in place, by its projection onto the sites adjacent to a domain boundary.
  //   f   : field to project (overwritten with the result)
  //   sgn : must be +1 or -1; multiplies the gamma term in the spin projectors
  // Outline of what the code below does:
  //   - builds integer masks omega / omegabar (value one inside domain 0 / domain 1)
  //   - for each direction with a non-zero block size not exceeding the global extent,
  //     marks sites whose shifted neighbour lies in the other domain as a "face"
  //   - face sites found with shift -1 receive 0.5*(f - G*f*sgn), with shift +1 receive
  //     0.5*(f + G*f*sgn); a later direction overwrites an earlier one on the same site
  //   - sites counted as a face more than once keep the full, unprojected f
  //   - all remaining sites are zero
  void ProjectBoundaryBothDomains (FermionField &f,int sgn)
  {
    assert((sgn==1)||(sgn==-1));
    Real rsgn = sgn;
    // Gamma matrix for each of the Nd directions, indexed by mmu in the loop below
    Gamma::Algebra Gmu [] = {
      Gamma::Algebra::GammaX,
      Gamma::Algebra::GammaY,
      Gamma::Algebra::GammaZ,
      Gamma::Algebra::GammaT
    };
    GridBase *grid = f.Grid();
    LatticeInteger  coor(grid);   // NOTE(review): not referenced again in this function
    LatticeInteger  face(grid);
    LatticeInteger  one(grid);   one = 1;
    LatticeInteger  zero(grid); zero = 0;
    LatticeInteger  omega(grid);
    LatticeInteger  omegabar(grid);
    LatticeInteger  tmp(grid);
    // Domain masks: start from one everywhere and project onto domain 0 / domain 1
    omega=one;    Domains.ProjectDomain(omega,0);
    omegabar=one; Domains.ProjectDomain(omegabar,1);
    LatticeInteger nface(grid); nface=Zero();   // per-site count of face hits
    FermionField projected(grid); projected=Zero();
    FermionField sp_proj  (grid);
    int dims = grid->Nd();
    int isDWF= (dims==Nd+1);   // one extra grid dimension; offsets the direction index below
    assert((dims==Nd)||(dims==Nd+1));
    Coordinate Global=grid->GlobalDimensions();
    for(int mmu=0;mmu<Nd;mmu++){
      Gamma G(Gmu[mmu]);
      // need to worry about DWF 5th dim first
      int mu = mmu+isDWF;
      // Skip directions with a zero block size or a block larger than the global extent
      if ( Block[mmu] && (Block[mmu] <= Global[mu]) ) {
 	// Lower face receives (1-gamma)/2 in normal forward hopping term
 	tmp = Cshift(omegabar,mu,-1);
 	tmp = tmp + omega;
 	face = where(tmp == Integer(2),one,zero );
 	tmp = Cshift(omega,mu,-1);
 	tmp = tmp + omegabar;
 	face = where(tmp == Integer(2),one,face );
 	nface = nface + face;
 	sp_proj  = 0.5*(f-G*f*rsgn);
 	projected= where(face,sp_proj,projected);
 	// Upper face receives (1+gamma)/2 in normal backward hopping term
 	tmp = Cshift(omegabar,mu,1);
 	tmp = tmp + omega;
 	face = where(tmp == Integer(2),one,zero );
 	tmp = Cshift(omega,mu,1);
 	tmp = tmp + omegabar;
 	face = where(tmp == Integer(2),one,face );
 	nface = nface + face;
 	sp_proj = 0.5*(f+G*f*rsgn);
 	projected= where(face,sp_proj,projected);
      }
    }
    // Initial Zero() where nface==0.
    // Keep the spin projected faces where nface==1
    // Full spinor where nface>=2
    projected = where(nface>Integer(1),f,projected);
    f=projected;
  }
  // Restrict f, in place, to one domain by delegating to the DomainDecomposition member.
  //   domain : domain index handed to Domains.ProjectDomain (see enum Domain)
  void ProjectDomain(FermionField &f,int domain)
  {
    Domains.ProjectDomain(f,domain);
 /*
    Earlier in-class implementation, kept for reference:

    GridBase *grid = f.Grid();
    int dims = grid->Nd();
    int isDWF= (dims==Nd+1);
    assert((dims==Nd)||(dims==Nd+1));
    FermionField zz(grid); zz=Zero();
    LatticeInteger coor(grid);
    LatticeInteger domaincb(grid); domaincb=Zero();
    for(int d=0;d<Nd;d++){
      LatticeCoordinate(coor,d+isDWF);
      domaincb = domaincb + div(coor,Block[d]);
    }
    f = where(mod(domaincb,2)==Integer(domain),f,zz);
 */
  }
  // Restrict f, in place, to the OmegaBar domain.
  void ProjectOmegaBar   (FermionField &f)
  {
    ProjectDomain(f,OmegaBar);
  }
  // Restrict f, in place, to the Omega domain.
  void ProjectOmega      (FermionField &f)
  {
    ProjectDomain(f,Omega);
  }
  // See my notes(!).
  // Notation: Following Luscher, we introduce projectors $\hPdb$ with both spinor and space structure
  // projecting all spinor elements in $\Omega$ connected by $\Ddb$ to $\bar{\Omega}$,
  // Boundary projection (sign +1) followed by restriction to Omega; f is overwritten.
  void ProjectBoundaryBar(FermionField &f)
  {
    const int sgn = 1;
    ProjectBoundaryBothDomains(f,sgn);
    ProjectDomain(f,Omega);
  }
  // and $\hPd$ projecting all spinor elements in $\bar{\Omega}$ connected by $\Dd$ to $\Omega$.
  // Boundary projection (sign +1) followed by restriction to OmegaBar; f is overwritten.
  void ProjectBoundary   (FermionField &f)
  {
    const int sgn = 1;
    ProjectBoundaryBothDomains(f,sgn);
    ProjectDomain(f,OmegaBar);
    //    DumpSliceNorm("ProjectBoundary",f,f.Grid()->Nd()-1);
  }
  // out = P_Omega  M_periodic  P_OmegaBar  in
  // The projection is applied to a copy, so in itself is not modified.
  void dBoundary    (FermionField &in,FermionField &out)
  {
    FermionField src(in);
    ProjectDomain(src,OmegaBar);
    PeriodicFermOpD.M(src,out);
    ProjectDomain(out,Omega);
  }
  // out = P_OmegaBar  Mdag_periodic  P_Omega  in
  // The projection is applied to a copy, so in itself is not modified.
  void dBoundaryDag (FermionField &in,FermionField &out)
  {
    FermionField src(in);
    ProjectDomain(src,Omega);
    PeriodicFermOpD.Mdag(src,out);
    ProjectDomain(out,OmegaBar);
  }
  // out = P_OmegaBar  M_periodic  P_Omega  in
  // The projection is applied to a copy, so in itself is not modified.
  void dBoundaryBar (FermionField &in,FermionField &out)
  {
    FermionField src(in);
    ProjectDomain(src,Omega);
    PeriodicFermOpD.M(src,out);
    ProjectDomain(out,OmegaBar);
  }
  // out = P_Omega  Mdag_periodic  P_OmegaBar  in
  // The projection is applied to a copy, so in itself is not modified.
  void dBoundaryBarDag (FermionField &in,FermionField &out)
  {
    FermionField src(in);
    ProjectDomain(src,OmegaBar);
    PeriodicFermOpD.Mdag(src,out);
    ProjectDomain(out,Omega);
  }
  // out = P_Omega  M_dirichlet  P_Omega  in
  // The projection is applied to a copy, so in itself is not modified.
  void dOmega       (FermionField &in,FermionField &out)
  {
    FermionField src(in);
    ProjectDomain(src,Omega);
    DirichletFermOpD.M(src,out);
    ProjectDomain(out,Omega);
  }
  // out = P_OmegaBar  M_dirichlet  P_OmegaBar  in
  // The projection is applied to a copy, so in itself is not modified.
  void dOmegaBar    (FermionField &in,FermionField &out)
  {
    FermionField src(in);
    ProjectDomain(src,OmegaBar);
    DirichletFermOpD.M(src,out);
    ProjectDomain(out,OmegaBar);
  }
  // out = P_Omega  Mdag_dirichlet  P_Omega  in
  // The projection is applied to a copy, so in itself is not modified.
  void dOmegaDag       (FermionField &in,FermionField &out)
  {
    FermionField src(in);
    ProjectDomain(src,Omega);
    DirichletFermOpD.Mdag(src,out);
    ProjectDomain(out,Omega);
  }
  // out = P_OmegaBar  Mdag_dirichlet  P_OmegaBar  in
  // The projection is applied to a copy, so in itself is not modified.
  void dOmegaBarDag    (FermionField &in,FermionField &out)
  {
    FermionField src(in);
    ProjectDomain(src,OmegaBar);
    DirichletFermOpD.Mdag(src,out);
    ProjectDomain(out,OmegaBar);
  }
  // Project a copy of in onto Omega, run the combined Dirichlet solve on it, and keep
  // only the Omega part of the solution in out.
  void dOmegaInv   (FermionField &in,FermionField &out)
  {
    FermionField src(in);
    ProjectDomain(src,Omega);
    dOmegaInvAndOmegaBarInv(src,out); // Inefficient warning
    ProjectDomain(out,Omega);
  }
  // Project a copy of in onto OmegaBar, run the combined Dirichlet solve on it, and keep
  // only the OmegaBar part of the solution in out.
  void dOmegaBarInv(FermionField &in,FermionField &out)
  {
    FermionField src(in);
    ProjectDomain(src,OmegaBar);
    dOmegaInvAndOmegaBarInv(src,out);
    ProjectDomain(out,OmegaBar);
  }
  // Project a copy of in onto Omega, run the combined daggered Dirichlet solve on it,
  // and keep only the Omega part of the solution in out.
  void dOmegaDagInv   (FermionField &in,FermionField &out)
  {
    FermionField src(in);
    ProjectDomain(src,Omega);
    dOmegaDagInvAndOmegaBarDagInv(src,out);
    ProjectDomain(out,Omega);
  }
  // Project a copy of in onto OmegaBar, run the combined daggered Dirichlet solve on it,
  // and keep only the OmegaBar part of the solution in out.
  void dOmegaBarDagInv(FermionField &in,FermionField &out)
  {
    FermionField src(in);
    ProjectDomain(src,OmegaBar);
    dOmegaDagInvAndOmegaBarDagInv(src,out);
    ProjectDomain(out,OmegaBar);
  }
  // Solve with the Dirichlet operator on the whole lattice (no domain projection here):
  // a mixed precision CG, configured from the member solver controls, is wrapped in a
  // red-black Schur solve and applied to in, writing the solution to out.
  void dOmegaInvAndOmegaBarInv(FermionField &in,FermionField &out)
  {
    MxPCG MixedCG(tol, tolinner, maxinnerit, maxouterit,
                  DirichletFermOpF.FermionRedBlackGrid(),
                  DirichletFermOpF, DirichletFermOpD,
                  DirichletLinOpF,  DirichletLinOpD);

    SchurRedBlackDiagMooeeSolve<FermionField> RedBlackSolve(MixedCG);
    RedBlackSolve(DirichletFermOpD,in,out);
  }
  // Daggered counterpart of the combined Dirichlet solve: a mixed precision CG built on
  // the daggered Dirichlet linear operators is wrapped in the daggered red-black Schur
  // solve and applied to in, writing the solution to out.
  void dOmegaDagInvAndOmegaBarDagInv(FermionField &in,FermionField &out)
  {
    MxDagPCG MixedDagCG(tol, tolinner, maxinnerit, maxouterit,
                        DirichletFermOpF.FermionRedBlackGrid(),
                        DirichletFermOpF,   DirichletFermOpD,
                        DirichletLinOpDagF, DirichletLinOpDagD);

    SchurRedBlackDiagMooeeDagSolve<FermionField> RedBlackDagSolve(MixedDagCG);
    RedBlackDagSolve(DirichletFermOpD,in,out);
  }
  // Rdag = Pdbar - DdbarDag DomegabarDagInv  DdDag DomegaDagInv Pdbar 
  // out = P in - dBoundaryBarDag dOmegaBarDagInv dBoundaryDag dOmegaDagInv P in,
  // where P is ProjectBoundaryBar. Two scratch fields are ping-ponged through the chain.
  void RDag(FermionField &in,FermionField &out)
  {
    GridBase *fgrid = PeriodicFermOpD.FermionGrid();
    FermionField bufA(fgrid);
    FermionField bufB(fgrid);

    out = in;
    ProjectBoundaryBar(out);

    dOmegaDagInv   (out ,bufA);
    dBoundaryDag   (bufA,bufB);
    dOmegaBarDagInv(bufB,bufA);
    dBoundaryBarDag(bufA,bufB);

    out = out - bufB;
  }
  // R = Pdbar - Pdbar DomegaInv Dd DomegabarInv Ddbar
  // out = P ( in - dOmegaInv dBoundary dOmegaBarInv dBoundaryBar P in ),
  // where P is ProjectBoundaryBar. out doubles as the holder of P in for the first step.
  void R(FermionField &in,FermionField &out)
  {
    GridBase *fgrid = PeriodicFermOpD.FermionGrid();
    FermionField bufA(fgrid);
    FermionField bufB(fgrid);

    out = in;
    ProjectBoundaryBar(out);

    dBoundaryBar(out ,bufA);
    dOmegaBarInv(bufA,bufB);
    dBoundary   (bufB,bufA);
    dOmegaInv   (bufA,bufB);

    out = in - bufB ;
    ProjectBoundaryBar(out);
    //    DumpSliceNorm("R",out,out.Grid()->Nd()-1);
  }
  // RInv = Pdbar - Pdbar Dinv Ddbar
  // out = P ( in - Dinverse dBoundaryBar in ), where P is ProjectBoundaryBar.
  // out is used as scratch for the dBoundaryBar result before being overwritten.
  void RInv(FermionField &in,FermionField &out)
  {
    FermionField solved(PeriodicFermOpD.FermionGrid());

    dBoundaryBar(in,out);
    Dinverse(out,solved);

    out = in - solved;
    ProjectBoundaryBar(out);
  }
  // RDagInv = Pdbar - DdbarDag DinvDag Pdbar
  // out = P in - dBoundaryBarDag DinverseDag P in, where P is ProjectBoundaryBar.
  // out is used as scratch for the DinverseDag result before being overwritten.
  void RDagInv(FermionField &in,FermionField &out)
  {
    GridBase *fgrid = PeriodicFermOpD.FermionGrid();
    FermionField correction(fgrid);
    FermionField projected(fgrid);

    projected = in;
    ProjectBoundaryBar(projected);

    DinverseDag(projected,out);
    dBoundaryBarDag(out,correction);

    out = projected - correction;
  }
  // Non-dirichlet inverter using red-black preconditioning
  // Solve with the periodic operator: a mixed precision CG, configured from the member
  // solver controls, is wrapped in a red-black Schur solve and applied to in, writing
  // the solution to out.
  void Dinverse(FermionField &in,FermionField &out)
  {
    MxPCG MixedCG(tol, tolinner, maxinnerit, maxouterit,
                  PeriodicFermOpF.FermionRedBlackGrid(),
                  PeriodicFermOpF, PeriodicFermOpD,
                  PeriodicLinOpF,  PeriodicLinOpD);

    SchurRedBlackDiagMooeeSolve<FermionField> RedBlackSolve(MixedCG);
    RedBlackSolve(PeriodicFermOpD,in,out);
  }
  // Daggered counterpart of Dinverse: a mixed precision CG built on the daggered periodic
  // linear operators is wrapped in the daggered red-black Schur solve and applied to in,
  // writing the solution to out.
  void DinverseDag(FermionField &in,FermionField &out)
  {
    MxDagPCG MixedDagCG(tol, tolinner, maxinnerit, maxouterit,
                        PeriodicFermOpF.FermionRedBlackGrid(),
                        PeriodicFermOpF,   PeriodicFermOpD,
                        PeriodicLinOpDagF, PeriodicLinOpDagD);

    SchurRedBlackDiagMooeeDagSolve<FermionField> RedBlackDagSolve(MixedDagCG);
    RedBlackDagSolve(PeriodicFermOpD,in,out);
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h
@@ -4,11 +4,10 @@
    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.h
-    Copyright (C) 2017 - 2022
+    Copyright (C) 2017
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: David Preti <>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -30,9 +29,7 @@
 #pragma once
-#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
+#include <Grid/Grid.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
@@ -52,16 +49,19 @@ NAMESPACE_BEGIN(Grid);
 // csw_r = csw_t to recover the isotropic version
 //////////////////////////////////////////////////////////////////
-template<class Impl, class CloverHelpers>
+template <class Impl>
-class WilsonCloverFermion : public WilsonFermion<Impl>,
+class WilsonCloverFermion : public WilsonFermion<Impl>
                            public WilsonCloverHelpers<Impl>
 {
 public:
  // Types definitions
  INHERIT_IMPL_TYPES(Impl);
-  INHERIT_CLOVER_TYPES(Impl);
+  template <typename vtype>
  using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
  typedef iImplClover<Simd> SiteCloverType;
  typedef Lattice<SiteCloverType> CloverFieldType;
-  typedef WilsonFermion<Impl>       WilsonBase;
+public:
-  typedef WilsonCloverHelpers<Impl> Helpers;
+  typedef WilsonFermion<Impl> WilsonBase;
  virtual int    ConstEE(void)     { return 0; };
  virtual void Instantiatable(void){};
@@ -72,7 +72,42 @@ public:
                      const RealD _csw_r = 0.0,
                      const RealD _csw_t = 0.0,
                      const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(),
-                      const ImplParams &impl_p = ImplParams());
+                      const ImplParams &impl_p = ImplParams()) : WilsonFermion<Impl>(_Umu,
                                                                                     Fgrid,
                                                                                     Hgrid,
                                                                                     _mass, impl_p, clover_anisotropy),
                                                                 CloverTerm(&Fgrid),
                                                                 CloverTermInv(&Fgrid),
                                                                 CloverTermEven(&Hgrid),
                                                                 CloverTermOdd(&Hgrid),
                                                                 CloverTermInvEven(&Hgrid),
                                                                 CloverTermInvOdd(&Hgrid),
                                                                 CloverTermDagEven(&Hgrid),
                                                                 CloverTermDagOdd(&Hgrid),
                                                                 CloverTermInvDagEven(&Hgrid),
                                                                 CloverTermInvDagOdd(&Hgrid)
  {
    assert(Nd == 4); // require 4 dimensions
    if (clover_anisotropy.isAnisotropic)
    {
      csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0;
      diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
    }
    else
    {
      csw_r = _csw_r * 0.5;
      diag_mass = 4.0 + _mass;
    }
    csw_t = _csw_t * 0.5;
    if (csw_r == 0)
      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
    if (csw_t == 0)
      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
    ImportGauge(_Umu);
  }
  virtual void M(const FermionField &in, FermionField &out);
  virtual void Mdag(const FermionField &in, FermionField &out);
@@ -89,21 +124,250 @@ public:
  void ImportGauge(const GaugeField &_Umu);
  // Derivative parts unpreconditioned pseudofermions
-  void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag);
+  void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
  {
    // Overwrite force with the derivative of the Wilson hopping term (DhopDeriv) plus
    // the derivative of the clover term built from the outer product of X and Y.
    //   force : output; zeroed below before anything is accumulated
    //   X, Y  : fermion fields, required to be conformable with each other and with force
    //   dag   : forwarded unchanged to DhopDeriv
    conformable(X.Grid(), Y.Grid());
    conformable(X.Grid(), force.Grid());
    GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
    GaugeField clover_force(force.Grid());
    PropagatorField Lambda(force.Grid());
-public:
+    // Guido: Here we are hitting some performance issues:
    // need to extract the components of the DoubledGaugeField
    // for each call
    // Possible solution
    // Create a vector object to store them? (cons: wasting space)
    std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
    Impl::extractLinkField(U, this->Umu);
    force = Zero();
    // Derivative of the Wilson hopping term
    this->DhopDeriv(force, X, Y, dag);
    ///////////////////////////////////////////////////////////
    // Clover term derivative
    ///////////////////////////////////////////////////////////
    Impl::outerProductImpl(Lambda, X, Y);
    //std::cout << "Lambda:" << Lambda << std::endl;
    Gamma::Algebra sigma[] = {
        Gamma::Algebra::SigmaXY,
        Gamma::Algebra::SigmaXZ,
        Gamma::Algebra::SigmaXT,
        Gamma::Algebra::MinusSigmaXY,
        Gamma::Algebra::SigmaYZ,
        Gamma::Algebra::SigmaYT,
        Gamma::Algebra::MinusSigmaXZ,
        Gamma::Algebra::MinusSigmaYZ,
        Gamma::Algebra::SigmaZT,
        Gamma::Algebra::MinusSigmaXT,
        Gamma::Algebra::MinusSigmaYT,
        Gamma::Algebra::MinusSigmaZT};
    /*
      sigma_{\mu \nu}=
      | 0         sigma[0]  sigma[1]  sigma[2] |
      | sigma[3]    0       sigma[4]  sigma[5] |
      | sigma[6]  sigma[7]     0      sigma[8] |
      | sigma[9]  sigma[10] sigma[11]   0      |
    */
    // The time direction is the last of the four directions (row/column 3 of the
    // sigma table above holds the *T entries).
    const int tdir = Nd - 1;
    int count = 0; // walks the sigma table in (mu,nu) order, skipping the diagonal
    clover_force = Zero();
    for (int mu = 0; mu < 4; mu++)
    {
      force_mu = Zero();
      for (int nu = 0; nu < 4; nu++)
      {
        if (mu == nu)
          continue;
        // Planes that contain the time direction take the temporal coefficient, all
        // others the spatial one. mu and nu only run over 0..3, so the test must be
        // against index 3; a comparison with 4 can never match and would silently
        // apply csw_r to every plane.
        RealD factor;
        if (nu == tdir || mu == tdir)
        {
          factor = 2.0 * csw_t;
        }
        else
        {
          factor = 2.0 * csw_r;
        }
        PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
        Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
        force_mu -= factor*Cmunu(U, lambda, mu, nu);                   // checked
        count++;
      }
      pokeLorentz(clover_force, U[mu] * force_mu, mu);
    }
    //clover_force *= csw;
    force += clover_force;
  }
  // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
  // Returns the sum of eight staple-like terms in the (mu,nu) plane, each with lambda
  // (or adj(lambda)) inserted at a different position: four "upper" terms added (C1+..C4+)
  // and four "lower" terms subtracted (C1-..C4-).
  //   U      : gauge links, indexed by direction; U[0] must be conformable with lambda
  //   lambda : link-valued field inserted into the staples
  //   mu, nu : the two directions defining the plane
  GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
  {
    conformable(lambda.Grid(), U[0].Grid());
    // tmp holds the link with lambda already multiplied in, before it enters a staple
    GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());
    // insertion in upper staple
    // please check redundancy of shift operations
    // C1+
    tmp = lambda * U[nu];
    out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
    // C2+
    tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
    // C3+
    tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
    // C4+
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
    // insertion in lower staple
    // C1-
    out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
    // C2-
    tmp = adj(lambda) * U[nu];
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
    // C3-
    tmp = lambda * U[nu];
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
    // C4-
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
    return out;
  }
 protected:
  // here fixing the 4 dimensions, make it more general?
  RealD csw_r;                                               // Clover coefficient - spatial
  RealD csw_t;                                               // Clover coefficient - temporal
  RealD diag_mass;                                           // Mass term
-  CloverField CloverTerm, CloverTermInv;                     // Clover term
+  CloverFieldType CloverTerm, CloverTermInv;                 // Clover term
-  CloverField CloverTermEven, CloverTermOdd;                 // Clover term EO
+  CloverFieldType CloverTermEven, CloverTermOdd;             // Clover term EO
-  CloverField CloverTermInvEven, CloverTermInvOdd;           // Clover term Inv EO
+  CloverFieldType CloverTermInvEven, CloverTermInvOdd;       // Clover term Inv EO
-  CloverField CloverTermDagEven, CloverTermDagOdd;           // Clover term Dag EO
+  CloverFieldType CloverTermDagEven, CloverTermDagOdd;       // Clover term Dag EO
-  CloverField CloverTermInvDagEven, CloverTermInvDagOdd;     // Clover term Inv Dag EO
+  CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
 };
 public:
  // eventually these can be compressed into 6x6 blocks instead of the 12x12
  // using the DeGrand-Rossi basis for the gamma matrices
  // Build a clover-type field on F's grid that is zero except for the four spin
  // entries written below, each set to -i times the colour matrix of F at that site.
  // The site loop is bounded by the grid T was allocated on, so the loop extent always
  // matches the views being indexed, independent of which grid CloverTerm lives on.
  CloverFieldType fillCloverYZ(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView(T_v,T,AcceleratorWrite);
    autoView(F_v,F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
      T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
      T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
    });
    return T;
  }
  // Build a clover-type field on F's grid that is zero except for the four spin
  // entries written below: (0,1) and (2,3) receive -F, (1,0) and (3,2) receive +F.
  // The site loop is bounded by the grid T was allocated on, so the loop extent always
  // matches the views being indexed, independent of which grid CloverTerm lives on.
  CloverFieldType fillCloverXZ(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView(T_v, T,AcceleratorWrite);
    autoView(F_v, F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = -F_v[i]()();
      T_v[i]()(1, 0) = F_v[i]()();
      T_v[i]()(2, 3) = -F_v[i]()();
      T_v[i]()(3, 2) = F_v[i]()();
    });
    return T;
  }
  // Build a clover-type field on F's grid that is zero except for the spin diagonal:
  // entries (0,0) and (2,2) receive -i*F, entries (1,1) and (3,3) receive +i*F.
  // The site loop is bounded by the grid T was allocated on, so the loop extent always
  // matches the views being indexed, independent of which grid CloverTerm lives on.
  CloverFieldType fillCloverXY(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView(T_v,T,AcceleratorWrite);
    autoView(F_v,F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),1,
    {
      T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
      T_v[i]()(1, 1) = timesI(F_v[i]()());
      T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 3) = timesI(F_v[i]()());
    });
    return T;
  }
  // Build a clover-type field on F's grid that is zero except for the four spin
  // entries written below: (0,1) and (1,0) receive +i*F, (2,3) and (3,2) receive -i*F.
  // The site loop is bounded by the grid T was allocated on, so the loop extent always
  // matches the views being indexed, independent of which grid CloverTerm lives on.
  CloverFieldType fillCloverXT(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView( T_v , T, AcceleratorWrite);
    autoView( F_v , F, AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = timesI(F_v[i]()());
      T_v[i]()(1, 0) = timesI(F_v[i]()());
      T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
    });
    return T;
  }
  // Build a clover-type field on F's grid that is zero except for the four spin
  // entries written below: (0,1) and (3,2) receive -F, (1,0) and (2,3) receive +F.
  // The site loop is bounded by the grid T was allocated on, so the loop extent always
  // matches the views being indexed, independent of which grid CloverTerm lives on.
  CloverFieldType fillCloverYT(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView( T_v ,T,AcceleratorWrite);
    autoView( F_v ,F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = -(F_v[i]()());
      T_v[i]()(1, 0) = (F_v[i]()());
      T_v[i]()(2, 3) = (F_v[i]()());
      T_v[i]()(3, 2) = -(F_v[i]()());
    });
    return T;
  }
  // Build a clover-type field on F's grid that is zero except for the spin diagonal:
  // entries (0,0) and (3,3) receive +i*F, entries (1,1) and (2,2) receive -i*F.
  // The site loop is bounded by the grid T was allocated on, so the loop extent always
  // matches the views being indexed, independent of which grid CloverTerm lives on.
  CloverFieldType fillCloverZT(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView( T_v , T,AcceleratorWrite);
    autoView( F_v , F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),1,
    {
      T_v[i]()(0, 0) = timesI(F_v[i]()());
      T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
      T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 3) = timesI(F_v[i]()());
    });
    return T;
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonCloverHelpers.h
+++ b/Grid/qcd/action/fermion/WilsonCloverHelpers.h
@@ -1,763 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/WilsonCloverHelpers.h
    Copyright (C) 2021 - 2022
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 // Helper routines that implement common clover functionality
 NAMESPACE_BEGIN(Grid);
 template<class Impl> class WilsonCloverHelpers {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
  // Accumulates the eight insertions of lambda into the staples of the (mu,nu)
  // plane: four "upper" terms (C1+..C4+) are added and four "lower" terms
  // (C1-..C4-) are subtracted.
  //   U      : gauge links indexed by direction; only U[mu] and U[nu] are read
  //   lambda : field inserted into the staples, must be conformable with U[0]
  //   mu, nu : the two directions spanning the plane
  // Returns the accumulated field, defined on lambda's grid.
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
  {
    conformable(lambda.Grid(), U[0].Grid());
    GaugeLinkField staples(lambda.Grid());
    GaugeLinkField insertion(lambda.Grid());
    // Aliases only; no copies of the links are made.
    GaugeLinkField &Umu = U[mu];
    GaugeLinkField &Unu = U[nu];
    // ---- upper staple ----
    // (original author's note: please check redundancy of shift operations)
    // C1+ : lambda multiplies the first nu-link from the left
    insertion = lambda * Unu;
    staples = Impl::ShiftStaple(
                Impl::CovShiftForward(insertion, nu,
                  Impl::CovShiftBackward(Umu, mu,
                    Impl::CovShiftIdentityBackward(Unu, nu))),
                mu);
    // C2+ : adj(lambda), shifted along mu, multiplies the mu-link from the right
    insertion = Umu * Impl::ShiftStaple(adj(lambda), mu);
    staples += Impl::ShiftStaple(
                 Impl::CovShiftForward(Unu, nu,
                   Impl::CovShiftBackward(insertion, mu,
                     Impl::CovShiftIdentityBackward(Unu, nu))),
                 mu);
    // C3+ : adj(lambda), shifted along nu, multiplies the closing nu-link from the right
    insertion = Unu * Impl::ShiftStaple(adj(lambda), nu);
    staples += Impl::ShiftStaple(
                 Impl::CovShiftForward(Unu, nu,
                   Impl::CovShiftBackward(Umu, mu,
                     Impl::CovShiftIdentityBackward(insertion, nu))),
                 mu);
    // C4+ : plain upper staple multiplied by lambda from the right
    staples += Impl::ShiftStaple(
                 Impl::CovShiftForward(Unu, nu,
                   Impl::CovShiftBackward(Umu, mu,
                     Impl::CovShiftIdentityBackward(Unu, nu))),
                 mu) * lambda;
    // ---- lower staple ----
    // C1- : shifted lambda multiplies the plain lower staple from the left
    staples -= Impl::ShiftStaple(lambda, mu)
             * Impl::ShiftStaple(
                 Impl::CovShiftBackward(Unu, nu,
                   Impl::CovShiftBackward(Umu, mu, Unu)),
                 mu);
    // C2- : adj(lambda) multiplies the first nu-link from the left
    insertion = adj(lambda) * Unu;
    staples -= Impl::ShiftStaple(
                 Impl::CovShiftBackward(insertion, nu,
                   Impl::CovShiftBackward(Umu, mu, Unu)),
                 mu);
    // C3- : lambda multiplies the innermost nu-link from the left
    insertion = lambda * Unu;
    staples -= Impl::ShiftStaple(
                 Impl::CovShiftBackward(Unu, nu,
                   Impl::CovShiftBackward(Umu, mu, insertion)),
                 mu);
    // C4- : plain lower staple multiplied by lambda from the right
    staples -= Impl::ShiftStaple(
                 Impl::CovShiftBackward(Unu, nu,
                   Impl::CovShiftBackward(Umu, mu, Unu)),
                 mu) * lambda;
    return staples;
  }
  // YZ clover contribution built from the field-strength component F.
  // The returned field (on F's grid) is zero except for the spin entries
  // (0,1), (1,0), (2,3) and (3,2), which all receive timesMinusI(F).
  static CloverField fillCloverYZ(const GaugeLinkField &F)
  {
    CloverField clover(F.Grid());
    clover = Zero();
    autoView(clover_v, clover, AcceleratorWrite);
    autoView(field_v,  F,      AcceleratorRead);
    accelerator_for(ss, clover.Grid()->oSites(), CloverField::vector_type::Nsimd(),
    {
      // All four populated entries carry the same value, so form it once.
      auto minusIF = timesMinusI(field_v[ss]()());
      coalescedWrite(clover_v[ss]()(0, 1), coalescedRead(minusIF));
      coalescedWrite(clover_v[ss]()(1, 0), coalescedRead(minusIF));
      coalescedWrite(clover_v[ss]()(2, 3), coalescedRead(minusIF));
      coalescedWrite(clover_v[ss]()(3, 2), coalescedRead(minusIF));
    });
    return clover;
  }
  // XZ clover contribution built from the field-strength component F.
  // The returned field (on F's grid) is zero except for the spin entries
  // (0,1) and (2,3), which receive -F, and (1,0) and (3,2), which receive +F.
  static CloverField fillCloverXZ(const GaugeLinkField &F)
  {
    CloverField clover(F.Grid());
    clover = Zero();
    autoView(clover_v, clover, AcceleratorWrite);
    autoView(field_v,  F,      AcceleratorRead);
    accelerator_for(ss, clover.Grid()->oSites(), CloverField::vector_type::Nsimd(),
    {
      // Only two distinct values occur; form each of them once.
      auto plusF  = field_v[ss]()();
      auto minusF = -field_v[ss]()();
      coalescedWrite(clover_v[ss]()(0, 1), coalescedRead(minusF));
      coalescedWrite(clover_v[ss]()(1, 0), coalescedRead(plusF));
      coalescedWrite(clover_v[ss]()(2, 3), coalescedRead(minusF));
      coalescedWrite(clover_v[ss]()(3, 2), coalescedRead(plusF));
    });
    return clover;
  }
  // XY clover contribution built from the field-strength component F.
  // The returned field (on F's grid) is zero except on the spin diagonal:
  // (0,0) and (2,2) receive timesMinusI(F), (1,1) and (3,3) receive timesI(F).
  static CloverField fillCloverXY(const GaugeLinkField &F)
  {
    CloverField clover(F.Grid());
    clover = Zero();
    autoView(clover_v, clover, AcceleratorWrite);
    autoView(field_v,  F,      AcceleratorRead);
    accelerator_for(ss, clover.Grid()->oSites(), CloverField::vector_type::Nsimd(),
    {
      // Only two distinct values occur; form each of them once.
      auto plusIF  = timesI(field_v[ss]()());
      auto minusIF = timesMinusI(field_v[ss]()());
      coalescedWrite(clover_v[ss]()(0, 0), coalescedRead(minusIF));
      coalescedWrite(clover_v[ss]()(1, 1), coalescedRead(plusIF));
      coalescedWrite(clover_v[ss]()(2, 2), coalescedRead(minusIF));
      coalescedWrite(clover_v[ss]()(3, 3), coalescedRead(plusIF));
    });
    return clover;
  }
  // XT clover contribution built from the field-strength component F.
  // The returned field (on F's grid) is zero except for the spin entries
  // (0,1) and (1,0), which receive timesI(F), and (2,3) and (3,2), which
  // receive timesMinusI(F).
  static CloverField fillCloverXT(const GaugeLinkField &F)
  {
    CloverField clover(F.Grid());
    clover = Zero();
    autoView(clover_v, clover, AcceleratorWrite);
    autoView(field_v,  F,      AcceleratorRead);
    accelerator_for(ss, clover.Grid()->oSites(), CloverField::vector_type::Nsimd(),
    {
      // Only two distinct values occur; form each of them once.
      auto plusIF  = timesI(field_v[ss]()());
      auto minusIF = timesMinusI(field_v[ss]()());
      coalescedWrite(clover_v[ss]()(0, 1), coalescedRead(plusIF));
      coalescedWrite(clover_v[ss]()(1, 0), coalescedRead(plusIF));
      coalescedWrite(clover_v[ss]()(2, 3), coalescedRead(minusIF));
      coalescedWrite(clover_v[ss]()(3, 2), coalescedRead(minusIF));
    });
    return clover;
  }
  // YT clover contribution built from the field-strength component F.
  // The returned field (on F's grid) is zero except for the spin entries
  // (0,1) and (3,2), which receive -F, and (1,0) and (2,3), which receive +F.
  static CloverField fillCloverYT(const GaugeLinkField &F)
  {
    CloverField clover(F.Grid());
    clover = Zero();
    autoView(clover_v, clover, AcceleratorWrite);
    autoView(field_v,  F,      AcceleratorRead);
    accelerator_for(ss, clover.Grid()->oSites(), CloverField::vector_type::Nsimd(),
    {
      // Only two distinct values occur; form each of them once.
      auto plusF  = (field_v[ss]()());
      auto minusF = -(field_v[ss]()());
      coalescedWrite(clover_v[ss]()(0, 1), coalescedRead(minusF));
      coalescedWrite(clover_v[ss]()(1, 0), coalescedRead(plusF));
      coalescedWrite(clover_v[ss]()(2, 3), coalescedRead(plusF));
      coalescedWrite(clover_v[ss]()(3, 2), coalescedRead(minusF));
    });
    return clover;
  }
  // ZT clover contribution built from the field-strength component F.
  // The returned field (on F's grid) is zero except on the spin diagonal:
  // (0,0) and (3,3) receive timesI(F), (1,1) and (2,2) receive timesMinusI(F).
  static CloverField fillCloverZT(const GaugeLinkField &F)
  {
    CloverField clover(F.Grid());
    clover = Zero();
    autoView(clover_v, clover, AcceleratorWrite);
    autoView(field_v,  F,      AcceleratorRead);
    accelerator_for(ss, clover.Grid()->oSites(), CloverField::vector_type::Nsimd(),
    {
      // Only two distinct values occur; form each of them once.
      auto plusIF  = timesI(field_v[ss]()());
      auto minusIF = timesMinusI(field_v[ss]()());
      coalescedWrite(clover_v[ss]()(0, 0), coalescedRead(plusIF));
      coalescedWrite(clover_v[ss]()(1, 1), coalescedRead(minusIF));
      coalescedWrite(clover_v[ss]()(2, 2), coalescedRead(minusIF));
      coalescedWrite(clover_v[ss]()(3, 3), coalescedRead(plusIF));
    });
    return clover;
  }
  // Site-level kernel: reads the clover site C through coalescedRead and hands
  // it, together with chi, to mult(), which stores its result in phi
  // (presumably phi = C * chi -- confirm against the tensor mult() overloads).
  template<class _Spinor>
  static accelerator_inline void multClover(_Spinor& phi, const SiteClover& C, const _Spinor& chi) {
    auto cloverSite = coalescedRead(C);
    mult(&phi, &cloverSite, &chi);
  }
  // Applies the clover field C to the spinor field phi site by site and stores
  // the result in out; each site is processed by multClover().
  //   out : destination field, written through an AcceleratorWrite view
  //   C   : clover field, read only
  //   phi : source spinor field, read only
  // The function uses no instance state, so it is declared static like every
  // other routine of this helper class. It can then be called without an
  // object, while calls made through an instance or a derived class keep working.
  template<class _SpinorField>
  static inline void multCloverField(_SpinorField& out, const CloverField& C, const _SpinorField& phi) {
    const int Nsimd = SiteSpinor::Nsimd();
    autoView(out_v, out, AcceleratorWrite);
    autoView(phi_v, phi, AcceleratorRead);
    autoView(C_v,   C,   AcceleratorRead);
    // Per-thread spinor type as produced by coalescedRead on the output view.
    typedef decltype(coalescedRead(out_v[0])) calcSpinor;
    accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
      calcSpinor tmp;
      multClover(tmp,C_v[sss],phi_v(sss));
      coalescedWrite(out_v[sss],tmp);
    });
  }
 };
 ////////////////////////////////////////////////////////
 template<class Impl> class CompactWilsonCloverHelpers {
 public:
  INHERIT_COMPACT_CLOVER_SIZES(Impl);
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);
  // Returns element (i, j), i != j, of the given block from the compact
  // triangle storage. The entry at triangle_index(i, j) is returned as stored
  // when i < j and complex-conjugated otherwise.
  #if 0
  static accelerator_inline typename SiteCloverTriangle::vector_type triangle_elem(const SiteCloverTriangle& triangle, int block, int i, int j) {
    assert(i != j);
    const auto& stored = triangle()(block)(triangle_index(i, j));
    if(i < j) return stored;
    return conjugate(stored); // i > j
  }
  #else
  template<typename vobj>
  static accelerator_inline vobj triangle_elem(const iImplCloverTriangle<vobj>& triangle, int block, int i, int j) {
    assert(i != j);
    const auto& stored = triangle()(block)(triangle_index(i, j));
    if(i < j) return stored;
    return conjugate(stored); // i > j
  }
  #endif
  // Maps an off-diagonal pair (i, j) of an Nred x Nred block onto its position
  // in the row-major list of strictly-upper-triangular entries. The mapping is
  // symmetric in (i, j); for i == j the function returns 0.
  static accelerator_inline int triangle_index(int i, int j) {
    if(i == j) return 0;
    const int lo = (i < j) ? i : j; // row inside the upper triangle
    const int hi = (i < j) ? j : i; // column inside the upper triangle
    return Nred * (Nred - 1) / 2 - (Nred - lo) * (Nred - lo - 1) / 2 + hi - lo - 1;
  }
  static void MooeeKernel_gpu(int                        Nsite,
                              int                        Ls,
                              const FermionField&        in,
                              FermionField&              out,
                              const CloverDiagonalField& diagonal,
                              const CloverTriangleField& triangle) {
    autoView(diagonal_v, diagonal, AcceleratorRead);
    autoView(triangle_v, triangle, AcceleratorRead);
    autoView(in_v,       in,       AcceleratorRead);
    autoView(out_v,      out,      AcceleratorWrite);
    typedef decltype(coalescedRead(out_v[0])) CalcSpinor;
    const uint64_t NN = Nsite * Ls;
    accelerator_for(ss, NN, Simd::Nsimd(), {
      int sF = ss;
      int sU = ss/Ls;
      CalcSpinor res;
      CalcSpinor in_t = in_v(sF);
      auto diagonal_t = diagonal_v(sU);
      auto triangle_t = triangle_v(sU);
      for(int block=0; block<Nhs; block++) {
        int s_start = block*Nhs;
        for(int i=0; i<Nred; i++) {
          int si = s_start + i/Nc, ci = i%Nc;
          res()(si)(ci) = diagonal_t()(block)(i) * in_t()(si)(ci);
          for(int j=0; j<Nred; j++) {
            if (j == i) continue;
            int sj = s_start + j/Nc, cj = j%Nc;
            res()(si)(ci) = res()(si)(ci) + triangle_elem(triangle_t, block, i, j) * in_t()(sj)(cj);
          };
        };
      };
      coalescedWrite(out_v[sF], res);
    });
  }
  // CPU implementation of the compact clover multiplication.
  //   Nsite, Ls : the loop runs over Nsite * Ls fermion sites; fermion site ss
  //               uses the clover data of site ss / Ls
  //   in / out  : source and destination fermion fields
  //   diagonal  : per-site diagonal entries, two blocks of six
  //   triangle  : per-site stored triangle entries, two blocks of fifteen
  // The arithmetic is unrolled by hand for two blocks of six components
  // (spins 0,1 and spins 2,3 with three colours each). Only one triangle of
  // each block is stored; the contribution of the other triangle to a row is
  // first accumulated as sum(triangle * conjugate(in)) into res and then added
  // back as conjugate(res), so the order of the statements below matters.
  static void MooeeKernel_cpu(int                        Nsite,
                              int                        Ls,
                              const FermionField&        in,
                              FermionField&              out,
                              const CloverDiagonalField& diagonal,
                              const CloverTriangleField& triangle) {
    autoView(diagonal_v, diagonal, CpuRead);
    autoView(triangle_v, triangle, CpuRead);
    autoView(in_v,       in,       CpuRead);
    autoView(out_v,      out,      CpuWrite);
    typedef SiteSpinor CalcSpinor;
    // PREFETCH_CLOVER(BASE) expands to software prefetches on A64FX builds and
    // to nothing otherwise. It refers to diag_t, sU and Nsite from the loop body.
    // NOTE(review): the macro is not #undef'd in this function, so it stays
    // defined for whatever follows in the translation unit.
 #if defined(A64FX) || defined(A64FXFIXEDSIZE)
 #define PREFETCH_CLOVER(BASE) {                                     \
    uint64_t base;                                                  \
    int pf_dist_L1 = 1;                                             \
    int pf_dist_L2 = -5; /* -> penalty -> disable */                \
                                                                    \
    if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) {           \
      base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0);               \
      svprfd(svptrue_b64(), (int64_t*)(base +    0), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  256), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  512), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  768), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL1STRM); \
    }                                                               \
                                                                    \
    if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) {           \
      base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0);               \
      svprfd(svptrue_b64(), (int64_t*)(base +    0), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  256), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  512), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  768), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL2STRM); \
    }                                                               \
  }
 // TODO: Implement/generalize this for other architectures
 // I played around a bit on KNL (see below) but didn't bring anything
 // #elif defined(AVX512)
 // #define PREFETCH_CLOVER(BASE) {                              \
 //     uint64_t base;                                           \
 //     int pf_dist_L1 = 1;                                      \
 //     int pf_dist_L2 = +4;                                     \
 //                                                              \
 //     if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) {    \
 //       base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0);        \
 //       _mm_prefetch((const char*)(base +    0), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +   64), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +  128), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +  192), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +  256), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +  320), _MM_HINT_T0); \
 //     }                                                        \
 //                                                              \
 //     if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) {    \
 //       base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0);        \
 //       _mm_prefetch((const char*)(base +    0), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +   64), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +  128), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +  192), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +  256), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +  320), _MM_HINT_T1); \
 //     }                                                        \
 //   }
 #else
 #define PREFETCH_CLOVER(BASE)
 #endif
    // NOTE(review): Nsite and Ls are both int, so the product is formed in int
    // before it is widened to uint64_t.
    const uint64_t NN = Nsite * Ls;
    thread_for(ss, NN, {
      int sF = ss;    // fermion site
      int sU = ss/Ls; // clover site shared by Ls consecutive fermion sites
      CalcSpinor res;
      CalcSpinor in_t = in_v[sF];
      auto diag_t     = diagonal_v[sU]; // "diag" instead of "diagonal" here to make code below easier to read
      auto triangle_t = triangle_v[sU];
      // upper half
      PREFETCH_CLOVER(0);
      auto in_cc_0_0 = conjugate(in_t()(0)(0)); // Nils: reduces number
      auto in_cc_0_1 = conjugate(in_t()(0)(1)); // of conjugates from
      auto in_cc_0_2 = conjugate(in_t()(0)(2)); // 30 to 20
      auto in_cc_1_0 = conjugate(in_t()(1)(0));
      auto in_cc_1_1 = conjugate(in_t()(1)(1));
      // Row 0 only needs stored entries. Every later row first gathers
      // triangle * conjugate(in) for the columns left of the diagonal and then
      // adds the conjugate of that partial sum to the stored-entry part.
      res()(0)(0) =               diag_t()(0)( 0) * in_t()(0)(0)
                  +           triangle_t()(0)( 0) * in_t()(0)(1)
                  +           triangle_t()(0)( 1) * in_t()(0)(2)
                  +           triangle_t()(0)( 2) * in_t()(1)(0)
                  +           triangle_t()(0)( 3) * in_t()(1)(1)
                  +           triangle_t()(0)( 4) * in_t()(1)(2);
      res()(0)(1) =           triangle_t()(0)( 0) * in_cc_0_0;
      res()(0)(1) =               diag_t()(0)( 1) * in_t()(0)(1)
                  +           triangle_t()(0)( 5) * in_t()(0)(2)
                  +           triangle_t()(0)( 6) * in_t()(1)(0)
                  +           triangle_t()(0)( 7) * in_t()(1)(1)
                  +           triangle_t()(0)( 8) * in_t()(1)(2)
                  + conjugate(       res()(0)( 1));
      res()(0)(2) =           triangle_t()(0)( 1) * in_cc_0_0
                  +           triangle_t()(0)( 5) * in_cc_0_1;
      res()(0)(2) =               diag_t()(0)( 2) * in_t()(0)(2)
                  +           triangle_t()(0)( 9) * in_t()(1)(0)
                  +           triangle_t()(0)(10) * in_t()(1)(1)
                  +           triangle_t()(0)(11) * in_t()(1)(2)
                  + conjugate(       res()(0)( 2));
      res()(1)(0) =           triangle_t()(0)( 2) * in_cc_0_0
                  +           triangle_t()(0)( 6) * in_cc_0_1
                  +           triangle_t()(0)( 9) * in_cc_0_2;
      res()(1)(0) =               diag_t()(0)( 3) * in_t()(1)(0)
                  +           triangle_t()(0)(12) * in_t()(1)(1)
                  +           triangle_t()(0)(13) * in_t()(1)(2)
                  + conjugate(       res()(1)( 0));
      res()(1)(1) =           triangle_t()(0)( 3) * in_cc_0_0
                  +           triangle_t()(0)( 7) * in_cc_0_1
                  +           triangle_t()(0)(10) * in_cc_0_2
                  +           triangle_t()(0)(12) * in_cc_1_0;
      res()(1)(1) =               diag_t()(0)( 4) * in_t()(1)(1)
                  +           triangle_t()(0)(14) * in_t()(1)(2)
                  + conjugate(       res()(1)( 1));
      res()(1)(2) =           triangle_t()(0)( 4) * in_cc_0_0
                  +           triangle_t()(0)( 8) * in_cc_0_1
                  +           triangle_t()(0)(11) * in_cc_0_2
                  +           triangle_t()(0)(13) * in_cc_1_0
                  +           triangle_t()(0)(14) * in_cc_1_1;
      res()(1)(2) =               diag_t()(0)( 5) * in_t()(1)(2)
                  + conjugate(       res()(1)( 2));
      // store the six components of the upper block
      vstream(out_v[sF]()(0)(0), res()(0)(0));
      vstream(out_v[sF]()(0)(1), res()(0)(1));
      vstream(out_v[sF]()(0)(2), res()(0)(2));
      vstream(out_v[sF]()(1)(0), res()(1)(0));
      vstream(out_v[sF]()(1)(1), res()(1)(1));
      vstream(out_v[sF]()(1)(2), res()(1)(2));
      // lower half
      PREFETCH_CLOVER(1);
      auto in_cc_2_0 = conjugate(in_t()(2)(0));
      auto in_cc_2_1 = conjugate(in_t()(2)(1));
      auto in_cc_2_2 = conjugate(in_t()(2)(2));
      auto in_cc_3_0 = conjugate(in_t()(3)(0));
      auto in_cc_3_1 = conjugate(in_t()(3)(1));
      // same pattern as the upper half, using block 1 and spins 2,3
      res()(2)(0) =               diag_t()(1)( 0) * in_t()(2)(0)
                  +           triangle_t()(1)( 0) * in_t()(2)(1)
                  +           triangle_t()(1)( 1) * in_t()(2)(2)
                  +           triangle_t()(1)( 2) * in_t()(3)(0)
                  +           triangle_t()(1)( 3) * in_t()(3)(1)
                  +           triangle_t()(1)( 4) * in_t()(3)(2);
      res()(2)(1) =           triangle_t()(1)( 0) * in_cc_2_0;
      res()(2)(1) =               diag_t()(1)( 1) * in_t()(2)(1)
                  +           triangle_t()(1)( 5) * in_t()(2)(2)
                  +           triangle_t()(1)( 6) * in_t()(3)(0)
                  +           triangle_t()(1)( 7) * in_t()(3)(1)
                  +           triangle_t()(1)( 8) * in_t()(3)(2)
                  + conjugate(       res()(2)( 1));
      res()(2)(2) =           triangle_t()(1)( 1) * in_cc_2_0
                  +           triangle_t()(1)( 5) * in_cc_2_1;
      res()(2)(2) =               diag_t()(1)( 2) * in_t()(2)(2)
                  +           triangle_t()(1)( 9) * in_t()(3)(0)
                  +           triangle_t()(1)(10) * in_t()(3)(1)
                  +           triangle_t()(1)(11) * in_t()(3)(2)
                  + conjugate(       res()(2)( 2));
      res()(3)(0) =           triangle_t()(1)( 2) * in_cc_2_0
                  +           triangle_t()(1)( 6) * in_cc_2_1
                  +           triangle_t()(1)( 9) * in_cc_2_2;
      res()(3)(0) =               diag_t()(1)( 3) * in_t()(3)(0)
                  +           triangle_t()(1)(12) * in_t()(3)(1)
                  +           triangle_t()(1)(13) * in_t()(3)(2)
                  + conjugate(       res()(3)( 0));
      res()(3)(1) =           triangle_t()(1)( 3) * in_cc_2_0
                  +           triangle_t()(1)( 7) * in_cc_2_1
                  +           triangle_t()(1)(10) * in_cc_2_2
                  +           triangle_t()(1)(12) * in_cc_3_0;
      res()(3)(1) =               diag_t()(1)( 4) * in_t()(3)(1)
                  +           triangle_t()(1)(14) * in_t()(3)(2)
                  + conjugate(       res()(3)( 1));
      res()(3)(2) =           triangle_t()(1)( 4) * in_cc_2_0
                  +           triangle_t()(1)( 8) * in_cc_2_1
                  +           triangle_t()(1)(11) * in_cc_2_2
                  +           triangle_t()(1)(13) * in_cc_3_0
                  +           triangle_t()(1)(14) * in_cc_3_1;
      res()(3)(2) =               diag_t()(1)( 5) * in_t()(3)(2)
                  + conjugate(       res()(3)( 2));
      // store the six components of the lower block
      vstream(out_v[sF]()(2)(0), res()(2)(0));
      vstream(out_v[sF]()(2)(1), res()(2)(1));
      vstream(out_v[sF]()(2)(2), res()(2)(2));
      vstream(out_v[sF]()(3)(0), res()(3)(0));
      vstream(out_v[sF]()(3)(1), res()(3)(1));
      vstream(out_v[sF]()(3)(2), res()(3)(2));
    });
  }
  // Entry point for the compact clover multiplication: forwards all arguments
  // unchanged to the accelerator kernel on CUDA and HIP builds and to the CPU
  // kernel on every other build.
  static void MooeeKernel(int                        Nsite,
                          int                        Ls,
                          const FermionField&        in,
                          FermionField&              out,
                          const CloverDiagonalField& diagonal,
                          const CloverTriangleField& triangle) {
 #if !defined(GRID_CUDA) && !defined(GRID_HIP)
    MooeeKernel_cpu(Nsite, Ls, in, out, diagonal, triangle);
 #else
    MooeeKernel_gpu(Nsite, Ls, in, out, diagonal, triangle);
 #endif
  }
  static void Invert(const CloverDiagonalField& diagonal,
                     const CloverTriangleField& triangle,
                     CloverDiagonalField&       diagonalInv,
                     CloverTriangleField&       triangleInv) {
    conformable(diagonal, diagonalInv);
    conformable(triangle, triangleInv);
    conformable(diagonal, triangle);
    diagonalInv.Checkerboard() = diagonal.Checkerboard();
    triangleInv.Checkerboard() = triangle.Checkerboard();
    GridBase* grid = diagonal.Grid();
    long lsites = grid->lSites();
    typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
    typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
    autoView(diagonal_v,  diagonal,  CpuRead);
    autoView(triangle_v,  triangle,  CpuRead);
    autoView(diagonalInv_v, diagonalInv, CpuWrite);
    autoView(triangleInv_v, triangleInv, CpuWrite);
    thread_for(site, lsites, { // NOTE: Not on GPU because of Eigen & (peek/poke)LocalSite
      Eigen::MatrixXcd clover_inv_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
      Eigen::MatrixXcd clover_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
      scalar_object_diagonal diagonal_tmp     = Zero();
      scalar_object_diagonal diagonal_inv_tmp = Zero();
      scalar_object_triangle triangle_tmp     = Zero();
      scalar_object_triangle triangle_inv_tmp = Zero();
      Coordinate lcoor;
      grid->LocalIndexToLocalCoor(site, lcoor);
      peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
      peekLocalSite(triangle_tmp, triangle_v, lcoor);
      // TODO: can we save time here by inverting the two 6x6 hermitian matrices separately?
      for (long s_row=0;s_row<Ns;s_row++) {
        for (long s_col=0;s_col<Ns;s_col++) {
          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
          int block       = s_row / Nhs;
          int s_row_block = s_row % Nhs;
          int s_col_block = s_col % Nhs;
          for (long c_row=0;c_row<Nc;c_row++) {
            for (long c_col=0;c_col<Nc;c_col++) {
              int i = s_row_block * Nc + c_row;
              int j = s_col_block * Nc + c_col;
              if(i == j)
                clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
              else
                clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(triangle_elem(triangle_tmp, block, i, j)));
            }
          }
        }
      }
      clover_inv_eigen = clover_eigen.inverse();
      for (long s_row=0;s_row<Ns;s_row++) {
        for (long s_col=0;s_col<Ns;s_col++) {
          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
          int block       = s_row / Nhs;
          int s_row_block = s_row % Nhs;
          int s_col_block = s_col % Nhs;
          for (long c_row=0;c_row<Nc;c_row++) {
            for (long c_col=0;c_col<Nc;c_col++) {
              int i = s_row_block * Nc + c_row;
              int j = s_col_block * Nc + c_col;
              if(i == j)
                diagonal_inv_tmp()(block)(i) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
              else if(i < j)
                triangle_inv_tmp()(block)(triangle_index(i, j)) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
              else
                continue;
            }
          }
        }
      }
      pokeLocalSite(diagonal_inv_tmp, diagonalInv_v, lcoor);
      pokeLocalSite(triangle_inv_tmp, triangleInv_v, lcoor);
    });
  }
  // Full -> compact conversion. For each site, the entries of the two diagonal
  // spin blocks of `full` are copied into compact storage: block-diagonal
  // entries go to `diagonal`, entries with i < j go to `triangle` at
  // triangle_index(i, j); entries with i > j are not stored.
  // Both outputs inherit the checkerboard of `full`.
  static void ConvertLayout(const CloverField&   full,
                            CloverDiagonalField& diagonal,
                            CloverTriangleField& triangle) {
    conformable(full, diagonal);
    conformable(full, triangle);
    diagonal.Checkerboard() = full.Checkerboard();
    triangle.Checkerboard() = full.Checkerboard();
    autoView(full_v,     full,     AcceleratorRead);
    autoView(diagonal_v, diagonal, AcceleratorWrite);
    autoView(triangle_v, triangle, AcceleratorWrite);
    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
    accelerator_for(ss, full.Grid()->oSites(), 1, {
      for(int s_row = 0; s_row < Ns; s_row++) {
        for(int s_col = 0; s_col < Ns; s_col++) {
          const bool outsideBlock = abs(s_row - s_col) > 1 || s_row + s_col == 3;
          if(outsideBlock) continue;
          const int block   = s_row / Nhs;
          const int rowBase = (s_row % Nhs) * Nc;
          const int colBase = (s_col % Nhs) * Nc;
          for(int c_row = 0; c_row < Nc; c_row++) {
            for(int c_col = 0; c_col < Nc; c_col++) {
              const int i = rowBase + c_row;
              const int j = colBase + c_col;
              if(i > j) continue; // lower triangle is not stored
              if(i == j) {
                diagonal_v[ss]()(block)(i) = full_v[ss]()(s_row, s_col)(c_row, c_col);
              } else {
                triangle_v[ss]()(block)(triangle_index(i, j)) = full_v[ss]()(s_row, s_col)(c_row, c_col);
              }
            }
          }
        }
      }
    });
  }
  // Compact -> full conversion. `full` is zeroed and takes the checkerboard of
  // `diagonal`; then, for each site, both diagonal spin blocks are filled from
  // `diagonal` (i == j) and from triangle_elem() (i != j). Entries outside the
  // two blocks remain zero.
  static void ConvertLayout(const CloverDiagonalField& diagonal,
                            const CloverTriangleField& triangle,
                            CloverField&               full) {
    conformable(full, diagonal);
    conformable(full, triangle);
    full.Checkerboard() = diagonal.Checkerboard();
    full = Zero();
    autoView(diagonal_v, diagonal, AcceleratorRead);
    autoView(triangle_v, triangle, AcceleratorRead);
    autoView(full_v,     full,     AcceleratorWrite);
    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
    accelerator_for(ss, full.Grid()->oSites(), 1, {
      for(int s_row = 0; s_row < Ns; s_row++) {
        for(int s_col = 0; s_col < Ns; s_col++) {
          const bool outsideBlock = abs(s_row - s_col) > 1 || s_row + s_col == 3;
          if(outsideBlock) continue;
          const int block   = s_row / Nhs;
          const int rowBase = (s_row % Nhs) * Nc;
          const int colBase = (s_col % Nhs) * Nc;
          for(int c_row = 0; c_row < Nc; c_row++) {
            for(int c_col = 0; c_col < Nc; c_col++) {
              const int i = rowBase + c_row;
              const int j = colBase + c_col;
              if(i != j) {
                full_v[ss]()(s_row, s_col)(c_row, c_col) = triangle_elem(triangle_v[ss], block, i, j);
              } else {
                full_v[ss]()(s_row, s_col)(c_row, c_col) = diagonal_v[ss]()(block)(i);
              }
            }
          }
        }
      }
    });
  }
  // Adjusts the compact clover fields on and next to the boundaries of the
  // last lattice direction (index Nd-1, global extent Nt):
  //   - triangle entries on slices 0 and Nt-1 are set to zero
  //   - diagonal entries on slices 0 and Nt-1 are set to -1.0 * csw_t + diag_mass
  //   - if cF != 1.0, (cF - 1.0) is added to the diagonal on slices 1 and Nt-2
  // The t0..t5 time stamps feed the disabled timing report at the end.
  static void ModifyBoundaries(CloverDiagonalField& diagonal, CloverTriangleField& triangle, RealD csw_t, RealD cF, RealD diag_mass) {
    // Stage 0: consistency check and grid lookup
    double t0 = usecond();
    conformable(diagonal, triangle);
    GridBase* grid = diagonal.Grid();
    // Stage 1: coordinate field along the last direction and its global extent
    double t1 = usecond();
    const int timeDir = Nd - 1;
    Lattice<iScalar<vInteger>> timeCoor(grid);
    LatticeCoordinate(timeCoor, timeDir);
    const int Nt = grid->GlobalDimensions()[timeDir];
    // Stage 2: remove the off-diagonal parts on both boundary slices
    double t2 = usecond();
    CloverTriangleField noTriangle(grid);
    noTriangle.Checkerboard() = triangle.Checkerboard();
    noTriangle = Zero();
    triangle = where(timeCoor == 0,    noTriangle, triangle);
    triangle = where(timeCoor == Nt-1, noTriangle, triangle);
    // Stage 3: constant diagonal on both boundary slices
    double t3 = usecond();
    const auto boundaryValue = -1.0 * csw_t + diag_mass;
    CloverDiagonalField replacement(grid);
    replacement.Checkerboard() = diagonal.Checkerboard();
    replacement = boundaryValue;
    diagonal = where(timeCoor == 0,    replacement, diagonal);
    diagonal = where(timeCoor == Nt-1, replacement, diagonal);
    // Stage 4: shift the diagonal on the slices adjacent to the boundaries
    double t4 = usecond();
    if(cF != 1.0) {
      const auto shift = cF - 1.0;
      replacement  = shift;
      replacement += diagonal;
      diagonal = where(timeCoor == 1,    replacement, diagonal);
      diagonal = where(timeCoor == Nt-2, replacement, diagonal);
    }
    // Stage 5: optional timing report
    double t5 = usecond();
 #if 0
    std::cout << GridLogMessage << "CompactWilsonCloverHelpers::ModifyBoundaries timings:"
              << " checks = "          << (t1 - t0) / 1e6
              << ", coordinate = "     << (t2 - t1) / 1e6
              << ", off-diag zero = "  << (t3 - t2) / 1e6
              << ", diagonal unity = " << (t4 - t3) / 1e6
              << ", near-boundary = "  << (t5 - t4) / 1e6
              << ", total = "          << (t5 - t0) / 1e6
              << std::endl;
 #endif
  }
  // Multiplies the field f in place, site by site, by the mask m (f <- m * f).
  // f and m must be conformable.
  template<class Field, class Mask>
  static strong_inline void ApplyBoundaryMask(Field& f, const Mask& m) {
    conformable(f, m);
    auto fieldGrid = f.Grid();
    const uint32_t nOuterSites = fieldGrid->oSites();
    const uint32_t nSimdLanes  = fieldGrid->Nsimd();
    autoView(field_v, f, AcceleratorWrite);
    autoView(mask_v,  m, AcceleratorRead);
    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
    accelerator_for(ss, nOuterSites, nSimdLanes, {
      coalescedWrite(field_v[ss], mask_v(ss) * field_v(ss));
    });
  }
  template<class MaskField>
  static void SetupMasks(MaskField& full, MaskField& even, MaskField& odd) {
    assert(even.Grid()->_isCheckerBoarded && even.Checkerboard() == Even);
    assert(odd.Grid()->_isCheckerBoarded  && odd.Checkerboard()  == Odd);
    assert(!full.Grid()->_isCheckerBoarded);
    GridBase* grid = full.Grid();
    int t_dir = Nd-1;
    Lattice<iScalar<vInteger>> t_coor(grid);
    LatticeCoordinate(t_coor, t_dir);
    int T = grid->GlobalDimensions()[t_dir];
    MaskField zeroMask(grid); zeroMask = Zero();
    full = 1.0;
    full = where(t_coor == 0,   zeroMask, full);
    full = where(t_coor == T-1, zeroMask, full);
    pickCheckerboard(Even, even, full);
    pickCheckerboard(Odd,  odd,  full);
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonCloverTypes.h
+++ b/Grid/qcd/action/fermion/WilsonCloverTypes.h
@@ -1,90 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/WilsonCloverTypes.h
    Copyright (C) 2021 - 2022
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 // Type bundle for the standard (non-compact) clover term: the per-site clover
 // object and the lattice field built from it.
 template<class Impl>
 class WilsonCloverTypes {
 public:
  INHERIT_IMPL_TYPES(Impl);

  // Ns x Ns matrix whose entries are Impl::Dimension x Impl::Dimension matrices
  template <typename vtype>
  using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;

  using SiteClover  = iImplClover<Simd>;
  using CloverField = Lattice<SiteClover>;
 };
 // Type bundle for the compact clover term, which is stored as Nblock blocks,
 // each split into Ndiagonal diagonal entries and Ntriangle triangle entries.
 template<class Impl>
 class CompactWilsonCloverTypes {
 public:
  INHERIT_IMPL_TYPES(Impl);

  static constexpr int Nred      = Nc * Nhs;        // 6
  static constexpr int Nblock    = Nhs;             // 2
  static constexpr int Ndiagonal = Nred;            // 6
  static constexpr int Ntriangle = (Nred - 1) * Nc; // 15

  template<typename vtype>
  using iImplCloverDiagonal = iScalar<iVector<iVector<vtype, Ndiagonal>, Nblock>>;
  template<typename vtype>
  using iImplCloverTriangle = iScalar<iVector<iVector<vtype, Ntriangle>, Nblock>>;

  // Per-site objects
  using SiteCloverDiagonal = iImplCloverDiagonal<Simd>;
  using SiteCloverTriangle = iImplCloverTriangle<Simd>;
  using SiteMask           = iSinglet<Simd>;

  // Lattice fields
  using CloverDiagonalField = Lattice<SiteCloverDiagonal>;
  using CloverTriangleField = Lattice<SiteCloverTriangle>;
  using MaskField           = Lattice<SiteMask>;
 };
 // Re-exports SiteClover and CloverField from WilsonCloverTypes<Impl> into the
 // scope where the macro is expanded.
 #define INHERIT_CLOVER_TYPES(Impl)                                     \
  using SiteClover  = typename WilsonCloverTypes<Impl>::SiteClover;    \
  using CloverField = typename WilsonCloverTypes<Impl>::CloverField;
 // Re-exports the site/field types of CompactWilsonCloverTypes<Impl> and
 // re-declares its two alias templates in the scope where the macro is expanded.
 #define INHERIT_COMPACT_CLOVER_TYPES(Impl) \
  using SiteCloverDiagonal  = typename CompactWilsonCloverTypes<Impl>::SiteCloverDiagonal;  \
  using SiteCloverTriangle  = typename CompactWilsonCloverTypes<Impl>::SiteCloverTriangle;  \
  using SiteMask            = typename CompactWilsonCloverTypes<Impl>::SiteMask;            \
  using CloverDiagonalField = typename CompactWilsonCloverTypes<Impl>::CloverDiagonalField; \
  using CloverTriangleField = typename CompactWilsonCloverTypes<Impl>::CloverTriangleField; \
  using MaskField           = typename CompactWilsonCloverTypes<Impl>::MaskField;           \
  /* alias templates are repeated here because they are needed inside functionality classes */ \
  template<typename vtype> using iImplCloverDiagonal = \
    iScalar<iVector<iVector<vtype, CompactWilsonCloverTypes<Impl>::Ndiagonal>, CompactWilsonCloverTypes<Impl>::Nblock>>; \
  template<typename vtype> using iImplCloverTriangle = \
    iScalar<iVector<iVector<vtype, CompactWilsonCloverTypes<Impl>::Ntriangle>, CompactWilsonCloverTypes<Impl>::Nblock>>;
 // Re-exports the size constants of CompactWilsonCloverTypes<Impl> into the
 // scope where the macro is expanded.
 #define INHERIT_COMPACT_CLOVER_SIZES(Impl)                                  \
  static constexpr int Nred{CompactWilsonCloverTypes<Impl>::Nred};           \
  static constexpr int Nblock{CompactWilsonCloverTypes<Impl>::Nblock};       \
  static constexpr int Ndiagonal{CompactWilsonCloverTypes<Impl>::Ndiagonal}; \
  static constexpr int Ntriangle{CompactWilsonCloverTypes<Impl>::Ntriangle};
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@@ -117,19 +117,19 @@ public:
    typedef decltype(coalescedRead(*in))    sobj;
    typedef decltype(coalescedRead(*out0)) hsobj;
-    constexpr unsigned int Nsimd = vobj::Nsimd();
+    unsigned int Nsimd = vobj::Nsimd();
    unsigned int mask = Nsimd >> (type + 1);
    int lane = acceleratorSIMTlane(Nsimd);
    int j0 = lane &(~mask); // inner coor zero
    int j1 = lane |(mask) ; // inner coor one
-    const vobj *vp0 = &in[k];  // out0[j] = merge low bit of type from in[k] and in[m] 
+    const vobj *vp0 = &in[k];
-    const vobj *vp1 = &in[m];  // out1[j] = merge hi  bit of type from in[k] and in[m]
+    const vobj *vp1 = &in[m];
-    const vobj *vp = (lane&mask) ? vp1:vp0;// if my lane has high bit take vp1, low bit take vp0
+    const vobj *vp = (lane&mask) ? vp1:vp0;
-    auto sa = coalescedRead(*vp,j0); // lane to read for out 0, NB 50% read coalescing
+    auto sa = coalescedRead(*vp,j0);
-    auto sb = coalescedRead(*vp,j1); // lane to read for out 1
+    auto sb = coalescedRead(*vp,j1);
    hsobj psa, psb;
-    projector::Proj(psa,sa,mu,dag);  // spin project the result0
+    projector::Proj(psa,sa,mu,dag);
-    projector::Proj(psb,sb,mu,dag);  // spin project the result1
+    projector::Proj(psb,sb,mu,dag);
    coalescedWrite(out0[j],psa);
    coalescedWrite(out1[j],psb);
 #else
@@ -303,8 +303,10 @@ public:
 		int npoints,
 		int checkerboard,
 		const std::vector<int> &directions,
-		const std::vector<int> &distances,Parameters p)  
+		const std::vector<int> &distances,
-    : CartesianStencil<vobj,cobj,Parameters> (grid,npoints,checkerboard,directions,distances,p) 
+		bool locally_periodic,
 		Parameters p)  
    : CartesianStencil<vobj,cobj,Parameters> (grid,npoints,checkerboard,directions,distances,locally_periodic,p)
  {
    ZeroCountersi();
    surface_list.resize(0);
--- a/Grid/qcd/action/fermion/WilsonFermion.h
+++ b/Grid/qcd/action/fermion/WilsonFermion.h
@@ -146,8 +146,11 @@ public:
  void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
                    const FermionField &in, FermionField &out, int dag);
  void DhopInternalDirichletComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
 				  const FermionField &in, FermionField &out, int dag);
  void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-                    const FermionField &in, FermionField &out, int dag);
+				   const FermionField &in, FermionField &out, int dag);
  // Constructor
  WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
@@ -157,6 +160,9 @@ public:
  // DoubleStore impl dependent
  void ImportGauge(const GaugeField &_Umu);
  DoubledGaugeField &GetDoubledGaugeField(void){ return Umu; };
  DoubledGaugeField &GetDoubledGaugeFieldE(void){ return UmuEven; };
  DoubledGaugeField &GetDoubledGaugeFieldO(void){ return UmuOdd; };
  ///////////////////////////////////////////////////////////////
  // Data members required to support the functionality
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@@ -166,6 +166,13 @@ public:
 			       FermionField &out,
 			       int dag);
  void DhopInternalDirichletComms(StencilImpl & st,
 				  LebesgueOrder &lo,
 				  DoubledGaugeField &U,
 				  const FermionField &in, 
 				  FermionField &out,
 				  int dag);
  // Constructors
  WilsonFermion5D(GaugeField &_Umu,
 		  GridCartesian         &FiveDimGrid,
@@ -174,19 +181,11 @@ public:
 		  GridRedBlackCartesian &FourDimRedBlackGrid,
 		  double _M5,const ImplParams &p= ImplParams());
  // Constructors
  /*
    WilsonFermion5D(int simd, 
    GaugeField &_Umu,
    GridCartesian         &FiveDimGrid,
    GridRedBlackCartesian &FiveDimRedBlackGrid,
    GridCartesian         &FourDimGrid,
    double _M5,const ImplParams &p= ImplParams());
  */
  // DoubleStore
  void ImportGauge(const GaugeField &_Umu);
-    
+  DoubledGaugeField &GetDoubledGaugeField(void){ return Umu; };
  DoubledGaugeField &GetDoubledGaugeFieldE(void){ return UmuEven; };
  DoubledGaugeField &GetDoubledGaugeFieldO(void){ return UmuOdd; };
  ///////////////////////////////////////////////////////////////
  // Data members required to support the functionality
  ///////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/WilsonKernels.h
+++ b/Grid/qcd/action/fermion/WilsonKernels.h
@@ -39,7 +39,7 @@ NAMESPACE_BEGIN(Grid);
 class WilsonKernelsStatic { 
 public:
  enum { OptGeneric, OptHandUnroll, OptInlineAsm };
-  enum { CommsAndCompute, CommsThenCompute };
+  enum { CommsAndCompute, CommsThenCompute, CommsDirichlet };
  static int Opt;  
  static int Comms;
 };
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -47,7 +47,7 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
 			FiveDimRedBlackGrid,
 			FourDimGrid,
 			FourDimRedBlackGrid,_M5,p),
-  mass_plus(_mass), mass_minus(_mass)
+  mass(_mass)
 { 
 }
@@ -112,7 +112,6 @@ void CayleyFermion5D<Impl>::ImportUnphysicalFermion(const FermionField &input4d,
  axpby_ssp_pminus(tmp, 0., tmp, 1., tmp, Ls-1, Ls-1);
  imported5d=tmp;
 }
 template<class Impl>  
 void CayleyFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
 {
@@ -127,6 +126,37 @@ void CayleyFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &inpu
  axpby_ssp_pminus(tmp, 0., tmp, 1., tmp, Ls-1, Ls-1);
  Dminus(tmp,imported5d);
 }
 ////////////////////////////////////////////////////
 // Added for fourD pseudofermion det estimation
 ////////////////////////////////////////////////////
 // Builds a 5d field from a 4d one: input4d is inserted into slices 0 and Ls-1
 // of a zeroed 5d field, slice 0 is then passed through axpby_ssp_pminus and
 // slice Ls-1 through axpby_ssp_pplus, and the result is copied to imported5d.
 template<class Impl>
 void CayleyFermion5D<Impl>::ImportFourDimPseudoFermion(const FermionField &input4d,
                                                        FermionField &imported5d)
 {
  const int Ls = this->Ls;
  FermionField work(this->FermionGrid());

  conformable(imported5d.Grid(),this->FermionGrid());
  conformable(input4d.Grid()   ,this->GaugeGrid());

  work = Zero();

  // Same 4d source on both walls
  InsertSlice(input4d, work, 0   , 0);
  InsertSlice(input4d, work, Ls-1, 0);

  // Wall projections, performed in place on the work field
  axpby_ssp_pminus(work, 0., work, 1., work, 0, 0);
  axpby_ssp_pplus (work, 0., work, 1., work, Ls-1, Ls-1);

  imported5d = work;
 }
 // Collapses a 5d field to 4d: slice 0 of a work copy receives the
 // axpby_ssp_pminus result of slice 0 of solution5d plus the axpby_ssp_pplus
 // result of slice Ls-1 of solution5d, and that slice is extracted to exported4d.
 template<class Impl>
 void CayleyFermion5D<Impl>::ExportFourDimPseudoFermion(const FermionField &solution5d,
                                                        FermionField &exported4d)
 {
  const int Ls = this->Ls;
  FermionField work(this->FermionGrid());
  work = solution5d;

  conformable(solution5d.Grid(),this->FermionGrid());
  conformable(exported4d.Grid(),this->GaugeGrid());

  // work[0]  = pminus part of solution5d[0]
  axpby_ssp_pminus(work, 0., solution5d, 1., solution5d, 0, 0);
  // work[0] += pplus part of solution5d[Ls-1]
  axpby_ssp_pplus (work, 1., work      , 1., solution5d, 0, Ls-1);

  ExtractSlice(exported4d, work, 0, 0);
 }
 // Dminus
 template<class Impl>  
 void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
 {
@@ -209,8 +239,8 @@ void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
  Vector<Coeff_t> diag (Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
+  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass;
-  Vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass_plus;
+  Vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass;
  M5D(psi,chi,chi,lower,diag,upper);
 }
 template<class Impl>
@@ -220,8 +250,8 @@ void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &D
  Vector<Coeff_t> diag = bs;
  Vector<Coeff_t> upper= cs;
  Vector<Coeff_t> lower= cs; 
-  upper[Ls-1]=-mass_minus*upper[Ls-1];
+  upper[Ls-1]=-mass*upper[Ls-1];
-  lower[0]   =-mass_plus*lower[0];
+  lower[0]   =-mass*lower[0];
  M5D(psi,psi,Din,lower,diag,upper);
 }
 // FIXME Redundant with the above routine; check this and eliminate
@@ -235,8 +265,8 @@ template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &
    upper[i]=-ceo[i];
    lower[i]=-ceo[i];
  }
-  upper[Ls-1]=-mass_minus*upper[Ls-1];
+  upper[Ls-1]=-mass*upper[Ls-1];
-  lower[0]   =-mass_plus*lower[0];
+  lower[0]   =-mass*lower[0];
  M5D(psi,psi,chi,lower,diag,upper);
 }
 template<class Impl>
@@ -250,8 +280,8 @@ void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &
    upper[i]=-cee[i];
    lower[i]=-cee[i];
  }
-  upper[Ls-1]=-mass_minus*upper[Ls-1];
+  upper[Ls-1]=-mass*upper[Ls-1];
-  lower[0]   =-mass_plus*lower[0];
+  lower[0]   =-mass*lower[0];
  M5D(psi,psi,chi,lower,diag,upper);
 }
 template<class Impl>
@@ -266,9 +296,9 @@ void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &
    // Assemble the 5d matrix
    if ( s==0 ) {
      upper[s] = -cee[s+1] ;
-      lower[s] = mass_minus*cee[Ls-1];
+      lower[s] = mass*cee[Ls-1];
    } else if ( s==(Ls-1)) { 
-      upper[s] = mass_plus*cee[0];
+      upper[s] = mass*cee[0];
      lower[s] = -cee[s-1];
    } else {
      upper[s]=-cee[s+1];
@@ -291,8 +321,8 @@ void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
  Vector<Coeff_t> diag(Ls,1.0);
  Vector<Coeff_t> upper(Ls,-1.0);
  Vector<Coeff_t> lower(Ls,-1.0);
-  upper[Ls-1]=-mass_plus*upper[Ls-1];
+  upper[Ls-1]=-mass*upper[Ls-1];
-  lower[0]   =-mass_minus*lower[0];
+  lower[0]   =-mass*lower[0];
  M5Ddag(psi,chi,chi,lower,diag,upper);
 }
@@ -307,9 +337,9 @@ void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField
  for (int s=0;s<Ls;s++){
    if ( s== 0 ) {
      upper[s] = cs[s+1];
-      lower[s] =-mass_minus*cs[Ls-1];
+      lower[s] =-mass*cs[Ls-1];
    } else if ( s==(Ls-1) ) { 
-      upper[s] =-mass_plus*cs[0];
+      upper[s] =-mass*cs[0];
      lower[s] = cs[s-1];
    } else { 
      upper[s] = cs[s+1];
@@ -552,7 +582,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
      lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
-      leem[i]=mass_minus*cee[Ls-1]/bee[0];
+      leem[i]=mass*cee[Ls-1]/bee[0];
      for(int j=0;j<i;j++) {
 	assert(bee[j+1]!=Coeff_t(0.0));
 	leem[i]*= aee[j]/bee[j+1];
@@ -560,7 +590,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
      uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
-      ueem[i]=mass_plus;
+      ueem[i]=mass;
      for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
      ueem[i]*= aee[0]/bee[0];
@@ -573,7 +603,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
  }
  { 
-    Coeff_t delta_d=mass_minus*cee[Ls-1];
+    Coeff_t delta_d=mass*cee[Ls-1];
    for(int j=0;j<Ls-1;j++) {
      assert(bee[j] != Coeff_t(0.0));
      delta_d *= cee[j]/bee[j];
@@ -642,10 +672,6 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
 						      Current curr_type,
 						      unsigned int mu)
 {
  assert(mass_plus == mass_minus);
  RealD mass = mass_plus;
 #if (!defined(GRID_HIP))
  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
@@ -781,8 +807,6 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  assert(mu>=0);
  assert(mu<Nd);
  assert(mass_plus == mass_minus);
  RealD mass = mass_plus;
 #if 0
  int tshift = (mu == Nd-1) ? 1 : 0;
@@ -834,7 +858,6 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 #if (!defined(GRID_HIP))
  int tshift = (mu == Nd-1) ? 1 : 0;
  unsigned int LLt    = GridDefaultLatt()[Tp];
  ////////////////////////////////////////////////
  // GENERAL CAYLEY CASE
  ////////////////////////////////////////////////
@@ -887,7 +910,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  }
  std::vector<RealD> G_s(Ls,1.0);
-  RealD sign = 1.0; // sign flip for vector/tadpole
+  RealD sign = 1; // sign flip for vector/tadpole
  if ( curr_type == Current::Axial ) {
    for(int s=0;s<Ls/2;s++){
      G_s[s] = -1.0;
@@ -897,7 +920,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
    auto b=this->_b;
    auto c=this->_c;
    if ( b == 1 && c == 0 ) {
-      sign = -1.0;    
+      sign = -1;    
    }
    else {
      std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl;
@@ -941,13 +964,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
    tmp    = Cshift(tmp,mu,-1);
    Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
    tmp = -G_s[s]*( Utmp + gmu*Utmp );
-    // Mask the time
+    tmp    = where((lcoor>=tmin+tshift),tmp,zz); // Mask the time 
    if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
      unsigned int t0 = 0;
      tmp    = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
    } else {
      tmp    = where((lcoor>=tmin+tshift),tmp,zz);
    }
    L_Q   += where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
    InsertSlice(L_Q, q_out, s , 0);
--- a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
@@ -1,377 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermionImplementation.h
    Copyright (C) 2017 - 2022
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
 NAMESPACE_BEGIN(Grid);
 // Constructor. Forwards grids, mass, implementation parameters and anisotropy
 // to WilsonBase, allocates all clover/mask fields, rescales the clover
 // coefficients, imports the gauge field and, when the boundary phase in
 // direction Nd-1 is exactly zero (fixedBoundaries), sets up the boundary masks.
 template<class Impl, class CloverHelpers>
 CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(
     GaugeField&                         _Umu,
     GridCartesian&                      Fgrid,
     GridRedBlackCartesian&              Hgrid,
     const RealD                         _mass,
     const RealD                         _csw_r,
     const RealD                         _csw_t,
     const RealD                         _cF,
     const WilsonAnisotropyCoefficients& clover_anisotropy,
     const ImplParams&                   impl_p)
  : WilsonBase(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
  , csw_r(_csw_r)
  , csw_t(_csw_t)
  , cF(_cF)
  , fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0)
  , Diagonal(&Fgrid)
  , Triangle(&Fgrid)
  , DiagonalEven(&Hgrid)
  , TriangleEven(&Hgrid)
  , DiagonalOdd(&Hgrid)
  , TriangleOdd(&Hgrid)
  , DiagonalInv(&Fgrid)
  , TriangleInv(&Fgrid)
  , DiagonalInvEven(&Hgrid)
  , TriangleInvEven(&Hgrid)
  , DiagonalInvOdd(&Hgrid)
  , TriangleInvOdd(&Hgrid)
  , Tmp(&Fgrid)
  , BoundaryMask(&Fgrid)
  , BoundaryMaskEven(&Hgrid)
  , BoundaryMaskOdd(&Hgrid)
 {
  assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);

  // Both coefficients carry a factor 1/2; the spatial one is additionally
  // divided by xi_0 in the anisotropic case.
  csw_r *= 0.5;
  csw_t *= 0.5;
  if (clover_anisotropy.isAnisotropic) {
    csw_r /= clover_anisotropy.xi_0;
  }

  ImportGauge(_Umu);

  if (fixedBoundaries) {
    // SetupMasks asserts on the checkerboards, so they are set first
    this->BoundaryMaskEven.Checkerboard() = Even;
    this->BoundaryMaskOdd.Checkerboard()  = Odd;
    CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
  }
 }
 // Forwards to WilsonBase::Dhop, then masks `out` with ApplyBoundaryMask when
 // fixedBoundaries is set.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::Dhop(const FermionField& in,
                                                            FermionField&       out,
                                                            int                 dag)
 {
  WilsonBase::Dhop(in, out, dag);
  if (fixedBoundaries) {
    ApplyBoundaryMask(out);
  }
 }
 // Forwards to WilsonBase::DhopOE, then masks `out` with ApplyBoundaryMask when
 // fixedBoundaries is set.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopOE(const FermionField& in,
                                                              FermionField&       out,
                                                              int                 dag)
 {
  WilsonBase::DhopOE(in, out, dag);
  if (fixedBoundaries) {
    ApplyBoundaryMask(out);
  }
 }
 // Forwards to WilsonBase::DhopEO, then masks `out` with ApplyBoundaryMask when
 // fixedBoundaries is set.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopEO(const FermionField& in,
                                                              FermionField&       out,
                                                              int                 dag)
 {
  WilsonBase::DhopEO(in, out, dag);
  if (fixedBoundaries) {
    ApplyBoundaryMask(out);
  }
 }
 // Forwards to WilsonBase::DhopDir for direction `dir` and displacement `disp`,
 // then masks `out` with ApplyBoundaryMask when fixedBoundaries is set.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDir(const FermionField& in,
                                                               FermionField&       out,
                                                               int                 dir,
                                                               int                 disp)
 {
  WilsonBase::DhopDir(in, out, dir, disp);
  if (this->fixedBoundaries) {
    ApplyBoundaryMask(out);
  }
 }
 // Forwards to WilsonBase::DhopDirAll, then masks every field in `out` with
 // ApplyBoundaryMask when fixedBoundaries is set.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDirAll(const FermionField&        in,
                                                                  std::vector<FermionField>& out)
 {
  WilsonBase::DhopDirAll(in, out);
  if (!this->fixedBoundaries) return;

  for (auto& result : out) {
    ApplyBoundaryMask(result);
  }
 }
 // out = WilsonBase::Dhop(in) + Mooee(in), masked when fixedBoundaries is set.
 // The base-class Dhop is used directly (skipping this class's masked Dhop);
 // the mask is applied to the final sum instead.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField& in,
                                                         FermionField&       out)
 {
  out.Checkerboard() = in.Checkerboard();

  WilsonBase::Dhop(in, out, DaggerNo);
  Mooee(in, Tmp);
  axpy(out, 1.0, out, Tmp);

  if (fixedBoundaries) {
    ApplyBoundaryMask(out);
  }
 }
 // out = WilsonBase::Dhop(in, DaggerYes) + MooeeDag(in), masked when
 // fixedBoundaries is set. The base-class Dhop is used directly (skipping this
 // class's masked Dhop); the mask is applied to the final sum instead.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField& in,
                                                            FermionField&       out)
 {
  out.Checkerboard() = in.Checkerboard();

  WilsonBase::Dhop(in, out, DaggerYes);
  MooeeDag(in, Tmp);
  axpy(out, 1.0, out, Tmp);

  if (fixedBoundaries) {
    ApplyBoundaryMask(out);
  }
 }
 // Forwards to WilsonBase::Meooe, then masks `out` with ApplyBoundaryMask when
 // fixedBoundaries is set.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::Meooe(const FermionField& in,
                                                             FermionField&       out)
 {
  WilsonBase::Meooe(in, out);
  if (fixedBoundaries) {
    ApplyBoundaryMask(out);
  }
 }
 // Forwards to WilsonBase::MeooeDag, then masks `out` with ApplyBoundaryMask
 // when fixedBoundaries is set.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeooeDag(const FermionField& in,
                                                                FermionField&       out)
 {
  WilsonBase::MeooeDag(in, out);
  if (fixedBoundaries) {
    ApplyBoundaryMask(out);
  }
 }
 // Applies the clover term to `in`. The diagonal/triangle pair is chosen from
 // the input: full-lattice fields for a non-checkerboarded grid, otherwise the
 // odd or even fields according to in.Checkerboard(). The result is masked when
 // fixedBoundaries is set.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField& in,
                                                             FermionField&       out)
 {
  if (!in.Grid()->_isCheckerBoarded) {
    MooeeInternal(in, out, Diagonal, Triangle);
  } else if (in.Checkerboard() == Odd) {
    MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
  } else {
    MooeeInternal(in, out, DiagonalEven, TriangleEven);
  }

  if (fixedBoundaries) {
    ApplyBoundaryMask(out);
  }
 }
 // Same as Mooee: the blocks are hermitian, so no separate dagger is needed.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeDag(const FermionField& in,
                                                                FermionField&       out)
 {
  Mooee(in, out);
 }
 // Applies the inverse clover term to `in`. The inverse diagonal/triangle pair
 // is chosen from the input: full-lattice fields for a non-checkerboarded grid,
 // otherwise the odd or even fields according to in.Checkerboard(). The result
 // is masked when fixedBoundaries is set.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionField& in,
                                                                FermionField&       out)
 {
  if (!in.Grid()->_isCheckerBoarded) {
    MooeeInternal(in, out, DiagonalInv, TriangleInv);
  } else if (in.Checkerboard() == Odd) {
    MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
  } else {
    MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven);
  }

  if (fixedBoundaries) {
    ApplyBoundaryMask(out);
  }
 }
 // Same as MooeeInv: the blocks are hermitian, so no separate dagger is needed.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInvDag(const FermionField& in,
                                                                   FermionField&       out)
 {
  MooeeInv(in, out);
 }
 // Thin forwarder to DhopDir with the same arguments.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdir(const FermionField& in,
                                                            FermionField&       out,
                                                            int                 dir,
                                                            int                 disp)
 {
  DhopDir(in, out, dir, disp);
 }
 // Thin forwarder to DhopDirAll with the same arguments.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MdirAll(const FermionField&        in,
                                                               std::vector<FermionField>& out)
 {
  DhopDirAll(in, out);
 }
 // Computes the gauge force of the operator for the fermion pair (X, Y):
 //   force = DhopDeriv(X, Y, dag) + clover-term derivative.
 // The clover part loops over all ordered pairs mu != nu, traces the spin of
 // sigma_{mu nu} * (outer product of X and Y) and accumulates
 // -factor * Cmunu(U, lambda, mu, nu), where factor is 2*csw_t for planes that
 // contain the temporal direction (index Nd-1) and 2*csw_r otherwise.
 // Not available with fixed (open) boundaries: asserts !fixedBoundaries.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
  assert(!fixedBoundaries); // TODO check for changes required for open bc
  // NOTE: code copied from original clover term
  conformable(X.Grid(), Y.Grid());
  conformable(X.Grid(), force.Grid());
  GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
  GaugeField clover_force(force.Grid());
  PropagatorField Lambda(force.Grid());
  // Guido: Here we are hitting some performance issues:
  // need to extract the components of the DoubledGaugeField
  // for each call
  // Possible solution
  // Create a vector object to store them? (cons: wasting space)
  std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
  Impl::extractLinkField(U, this->Umu);
  force = Zero();
  // Derivative of the Wilson hopping term
  this->DhopDeriv(force, X, Y, dag);
  ///////////////////////////////////////////////////////////
  // Clover term derivative
  ///////////////////////////////////////////////////////////
  Impl::outerProductImpl(Lambda, X, Y);
  //std::cout << "Lambda:" << Lambda << std::endl;
  Gamma::Algebra sigma[] = {
      Gamma::Algebra::SigmaXY,
      Gamma::Algebra::SigmaXZ,
      Gamma::Algebra::SigmaXT,
      Gamma::Algebra::MinusSigmaXY,
      Gamma::Algebra::SigmaYZ,
      Gamma::Algebra::SigmaYT,
      Gamma::Algebra::MinusSigmaXZ,
      Gamma::Algebra::MinusSigmaYZ,
      Gamma::Algebra::SigmaZT,
      Gamma::Algebra::MinusSigmaXT,
      Gamma::Algebra::MinusSigmaYT,
      Gamma::Algebra::MinusSigmaZT};
  /*
    sigma_{\mu \nu}=
    | 0         sigma[0]  sigma[1]  sigma[2] |
    | sigma[3]    0       sigma[4]  sigma[5] |
    | sigma[6]  sigma[7]     0      sigma[8] |
    | sigma[9]  sigma[10] sigma[11]   0      |
  */
  // Index of the temporal direction; mu and nu below run over 0..3, so the
  // former comparison against 4 could never select csw_t.
  const int t_dir = Nd - 1;
  int count = 0; // running index into sigma[], row-major over (mu, nu) with mu != nu
  clover_force = Zero();
  for (int mu = 0; mu < 4; mu++)
  {
    force_mu = Zero();
    for (int nu = 0; nu < 4; nu++)
    {
      if (mu == nu)
        continue;
      // Temporal planes use csw_t, purely spatial planes use csw_r
      RealD factor;
      if (nu == t_dir || mu == t_dir)
      {
        factor = 2.0 * csw_t;
      }
      else
      {
        factor = 2.0 * csw_r;
      }
      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
      force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu);   // checked
      count++;
    }
    pokeLorentz(clover_force, U[mu] * force_mu, mu);
  }
  //clover_force *= csw;
  force += clover_force;
 }
 // Not implemented: always trips assert(0).
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField&         mat,
                                                                const FermionField& U,
                                                                const FermionField& V,
                                                                int                 dag)
 {
  assert(0);
 }
 // Not implemented: always trips assert(0).
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField&         mat,
                                                                const FermionField& U,
                                                                const FermionField& V,
                                                                int                 dag)
 {
  assert(0);
 }
 // Shared worker for Mooee/MooeeInv: checks that `in` has a valid checkerboard
 // and is conformable with `out`, `diagonal` and `triangle`, copies the
 // checkerboard to `out`, and runs CompactHelpers::MooeeKernel over
 // diagonal.oSites() sites.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(
     const FermionField&        in,
     FermionField&              out,
     const CloverDiagonalField& diagonal,
     const CloverTriangleField& triangle)
 {
  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
  out.Checkerboard() = in.Checkerboard();

  conformable(in, out);
  conformable(in, diagonal);
  conformable(in, triangle);

  CompactHelpers::MooeeKernel(diagonal.oSites(), 1, in, out, diagonal, triangle);
 }
 // Imports the gauge field _Umu and rebuilds every clover field of this object:
 //  1. forwards _Umu to WilsonBase::ImportGauge,
 //  2. computes the six field-strength components and assembles the clover
 //     term, weighting the B (spatial) parts with csw_r and the E (temporal)
 //     parts with csw_t,
 //  3. instantiates the clover term, converts it to the compact
 //     Diagonal/Triangle layout, and modifies it at the temporal boundaries
 //     when fixedBoundaries is set,
 //  4. inverts it and splits the term and its inverse into even/odd
 //     checkerboards.
 // The time spent in each stage is printed to GridLogDebug; the usecond()
 // calls between stages exist only for that report.
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField& _Umu) {
  // NOTE: parts copied from original implementation
  // Import gauge into base class
  double t0 = usecond();
  WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that
  // Initialize temporary variables
  double t1 = usecond();
  conformable(_Umu.Grid(), this->GaugeGrid());
  GridBase* grid = _Umu.Grid();
  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
  CloverField TmpOriginal(grid);
  CloverField TmpInverse(grid);
  // Compute the field strength terms mu>nu
  double t2 = usecond();
  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
  WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
  WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
  // Compute the Clover Operator acting on Colour and Spin
  // multiply here by the clover coefficients for the anisotropy
  double t3 = usecond();
  TmpOriginal  = Helpers::fillCloverYZ(Bx) * csw_r;
  TmpOriginal += Helpers::fillCloverXZ(By) * csw_r;
  TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r;
  TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
  TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
  TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
  // Instantiate the clover term
  // - In case of the standard clover the mass term is added
  // - In case of the exponential clover the clover term is exponentiated
  double t4 = usecond();
  CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, this->diag_mass);
  // Convert the data layout of the clover term
  double t5 = usecond();
  CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
  // Modify the clover term at the temporal boundaries in case of open boundary conditions
  double t6 = usecond();
  if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
  // Invert the Clover term
  // In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved
  // in TmpInverse can be used. In all other cases the clover term has to be explicitly inverted.
  // TODO: For now this inversion is explicitly done on the CPU
  double t7 = usecond();
  CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries);
  // Fill the remaining clover fields
  double t8 = usecond();
  pickCheckerboard(Even, DiagonalEven,    Diagonal);
  pickCheckerboard(Even, TriangleEven,    Triangle);
  pickCheckerboard(Odd,  DiagonalOdd,     Diagonal);
  pickCheckerboard(Odd,  TriangleOdd,     Triangle);
  pickCheckerboard(Even, DiagonalInvEven, DiagonalInv);
  pickCheckerboard(Even, TriangleInvEven, TriangleInv);
  pickCheckerboard(Odd,  DiagonalInvOdd,  DiagonalInv);
  pickCheckerboard(Odd,  TriangleInvOdd,  TriangleInv);
  // Report timings
  double t9 = usecond();
  std::cout << GridLogDebug << "CompactWilsonCloverFermion::ImportGauge timings:" << std::endl;
  std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
  std::cout << GridLogDebug << "allocations =                " << (t2 - t1) / 1e6 << std::endl;
  std::cout << GridLogDebug << "field strength =             " << (t3 - t2) / 1e6 << std::endl;
  std::cout << GridLogDebug << "fill clover =                " << (t4 - t3) / 1e6 << std::endl;
  std::cout << GridLogDebug << "instantiate clover =         " << (t5 - t4) / 1e6 << std::endl;
  std::cout << GridLogDebug << "convert layout =             " << (t6 - t5) / 1e6 << std::endl;
  std::cout << GridLogDebug << "modify boundaries =          " << (t7 - t6) / 1e6 << std::endl;
  std::cout << GridLogDebug << "invert clover =              " << (t8 - t7) / 1e6 << std::endl;
  std::cout << GridLogDebug << "pick cbs =                   " << (t9 - t8) / 1e6 << std::endl;
  std::cout << GridLogDebug << "total =                      " << (t9 - t0) / 1e6 << std::endl;
 }
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
@@ -2,13 +2,12 @@
    Grid physics library, www.github.com/paboyle/Grid
-    Source file: ./lib/qcd/action/fermion/WilsonCloverFermionImplementation.h
+    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
-    Copyright (C) 2017 - 2022
+    Copyright (C) 2017
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -34,48 +33,9 @@
 NAMESPACE_BEGIN(Grid);
 // Constructs the Wilson clover operator.
 //
 // The gauge field, both grids, the mass, the implementation parameters and the
 // anisotropy coefficients are forwarded to the WilsonFermion base. The full
 // clover term and its inverse are allocated on Fgrid; all checkerboarded
 // variants are allocated on Hgrid. The clover coefficients are stored
 // multiplied by 1/2, and the spatial one is additionally divided by xi_0 when
 // the action is anisotropic. A warning is logged for a vanishing coefficient,
 // and finally the gauge field is imported via ImportGauge.
 template<class Impl, class CloverHelpers>
 WilsonCloverFermion<Impl, CloverHelpers>::WilsonCloverFermion(GaugeField&                         _Umu,
                                               GridCartesian&                      Fgrid,
                                               GridRedBlackCartesian&              Hgrid,
                                               const RealD                         _mass,
                                               const RealD                         _csw_r,
                                               const RealD                         _csw_t,
                                               const WilsonAnisotropyCoefficients& clover_anisotropy,
                                               const ImplParams&                   impl_p)
  : WilsonFermion<Impl>(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy),
    CloverTerm(&Fgrid),
    CloverTermInv(&Fgrid),
    CloverTermEven(&Hgrid),
    CloverTermOdd(&Hgrid),
    CloverTermInvEven(&Hgrid),
    CloverTermInvOdd(&Hgrid),
    CloverTermDagEven(&Hgrid),
    CloverTermDagOdd(&Hgrid),
    CloverTermInvDagEven(&Hgrid),
    CloverTermInvDagOdd(&Hgrid)
 {
  // The clover construction is only written for four dimensions.
  assert(Nd == 4);

  // The temporal coefficient does not depend on the anisotropy branch.
  csw_t = _csw_t * 0.5;

  if(!clover_anisotropy.isAnisotropic) {
    // Isotropic action: plain halved coefficient and the usual 4 + m diagonal.
    csw_r     = _csw_r * 0.5;
    diag_mass = 4.0 + _mass;
  } else {
    // Anisotropic action: spatial coefficient and diagonal are rescaled by xi_0.
    csw_r     = _csw_r * 0.5 / clover_anisotropy.xi_0;
    diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
  }

  // A zero coefficient is allowed, but it is flagged to the user.
  if(csw_r == 0) {
    std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
  }
  if(csw_t == 0) {
    std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
  }

  ImportGauge(_Umu);
 }
 // *NOT* EO
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
 {
  FermionField temp(out.Grid());
@@ -89,8 +49,8 @@ void WilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField &in, Fermion
  out += temp;
 }
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
 {
  FermionField temp(out.Grid());
@@ -104,16 +64,13 @@ void WilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField &in, Ferm
  out += temp;
 }
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField &_Umu)
+void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
 {
  double t0 = usecond();
  WilsonFermion<Impl>::ImportGauge(_Umu);
  double t1 = usecond();
  GridBase *grid = _Umu.Grid();
  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
  double t2 = usecond();
  // Compute the field strength terms mu>nu
  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
@@ -122,20 +79,52 @@ void WilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField &_Um
  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
  double t3 = usecond();
  // Compute the Clover Operator acting on Colour and Spin
  // multiply here by the clover coefficients for the anisotropy
-  CloverTerm  = Helpers::fillCloverYZ(Bx) * csw_r;
+  CloverTerm  = fillCloverYZ(Bx) * csw_r;
-  CloverTerm += Helpers::fillCloverXZ(By) * csw_r;
+  CloverTerm += fillCloverXZ(By) * csw_r;
-  CloverTerm += Helpers::fillCloverXY(Bz) * csw_r;
+  CloverTerm += fillCloverXY(Bz) * csw_r;
-  CloverTerm += Helpers::fillCloverXT(Ex) * csw_t;
+  CloverTerm += fillCloverXT(Ex) * csw_t;
-  CloverTerm += Helpers::fillCloverYT(Ey) * csw_t;
+  CloverTerm += fillCloverYT(Ey) * csw_t;
-  CloverTerm += Helpers::fillCloverZT(Ez) * csw_t;
+  CloverTerm += fillCloverZT(Ez) * csw_t;
  CloverTerm += diag_mass;
-  double t4 = usecond();
+  int lvol = _Umu.Grid()->lSites();
-  CloverHelpers::Instantiate(CloverTerm, CloverTermInv, csw_t, this->diag_mass);
+  int DimRep = Impl::Dimension;
  {
    autoView(CTv,CloverTerm,CpuRead);
    autoView(CTIv,CloverTermInv,CpuWrite);
    thread_for(site, lvol, {
      Coordinate lcoor;
      grid->LocalIndexToLocalCoor(site, lcoor);
      Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
      Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
      typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
      peekLocalSite(Qx, CTv, lcoor);
      //if (csw!=0){
      for (int j = 0; j < Ns; j++)
 	for (int k = 0; k < Ns; k++)
 	  for (int a = 0; a < DimRep; a++)
 	    for (int b = 0; b < DimRep; b++){
 	      auto zz =  Qx()(j, k)(a, b);
 	      EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
 	    }
      //   if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
      EigenInvCloverOp = EigenCloverOp.inverse();
      //std::cout << EigenInvCloverOp << std::endl;
      for (int j = 0; j < Ns; j++)
 	for (int k = 0; k < Ns; k++)
 	  for (int a = 0; a < DimRep; a++)
 	    for (int b = 0; b < DimRep; b++)
 	      Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
      //    if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
      //  }
      pokeLocalSite(Qxinv, CTIv, lcoor);
    });
  }
  double t5 = usecond();
  // Separate the even and odd parts
  pickCheckerboard(Even, CloverTermEven, CloverTerm);
  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
@@ -148,47 +137,37 @@ void WilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField &_Um
  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
  double t6 = usecond();
  std::cout << GridLogDebug << "WilsonCloverFermion::ImportGauge timings:" << std::endl;
  std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
  std::cout << GridLogDebug << "allocations =                " << (t2 - t1) / 1e6 << std::endl;
  std::cout << GridLogDebug << "field strength =             " << (t3 - t2) / 1e6 << std::endl;
  std::cout << GridLogDebug << "fill clover =                " << (t4 - t3) / 1e6 << std::endl;
  std::cout << GridLogDebug << "instantiation =              " << (t5 - t4) / 1e6 << std::endl;
  std::cout << GridLogDebug << "pick cbs =                   " << (t6 - t5) / 1e6 << std::endl;
  std::cout << GridLogDebug << "total =                      " << (t6 - t0) / 1e6 << std::endl;
 }
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerNo, InverseNo);
 }
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::MooeeDag(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerYes, InverseNo);
 }
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerNo, InverseYes);
 }
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInvDag(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerYes, InverseYes);
 }
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
+void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
 {
  out.Checkerboard() = in.Checkerboard();
-  CloverField *Clover;
+  CloverFieldType *Clover;
  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
  if (dag)
@@ -203,12 +182,12 @@ void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField
      {
        Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
      }
-      Helpers::multCloverField(out, *Clover, in);
+      out = *Clover * in;
    }
    else
    {
      Clover = (inv) ? &CloverTermInv : &CloverTerm;
-      Helpers::multCloverField(out, *Clover, in); // don't bother with adj, hermitian anyway
+      out = adj(*Clover) * in;
    }
  }
  else
@@ -226,109 +205,29 @@ void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField
        //  std::cout << "Calling clover term Even" << std::endl;
        Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
      }
-      Helpers::multCloverField(out, *Clover, in);
+      out = *Clover * in;
      //  std::cout << GridLogMessage << "*Clover.Checkerboard() "  << (*Clover).Checkerboard() << std::endl;
    }
    else
    {
      Clover = (inv) ? &CloverTermInv : &CloverTerm;
-      Helpers::multCloverField(out, *Clover, in);
+      out = *Clover * in;
    }
  }
 } // MooeeInternal
 // Derivative parts unpreconditioned pseudofermions
 // Derivative parts unpreconditioned pseudofermions
 //
 // Computes the gauge force of the unpreconditioned operator for the fermion
 // pair (X, Y): the Wilson hopping-term derivative (DhopDeriv) plus the
 // derivative of the clover term.
 //
 //   force : output; zeroed here, then filled with hopping + clover contributions
 //   X, Y  : fermion fields; must live on the same grid as force
 //   dag   : forwarded to DhopDeriv only
 //
 // For every ordered pair mu != nu the spin trace of sigma_{mu nu} * (X outer Y)
 // is fed to CloverHelpers::Cmunu. Planes containing the time direction are
 // weighted with csw_t, purely spatial planes with csw_r, matching the weights
 // ImportGauge applies to the E and B field-strength components.
 template<class Impl, class CloverHelpers>
 void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
 {
  conformable(X.Grid(), Y.Grid());
  conformable(X.Grid(), force.Grid());
  GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
  GaugeField clover_force(force.Grid());
  PropagatorField Lambda(force.Grid());
  // Guido: Here we are hitting some performance issues:
  // need to extract the components of the DoubledGaugeField
  // for each call
  // Possible solution
  // Create a vector object to store them? (cons: wasting space)
  std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
  Impl::extractLinkField(U, this->Umu);
  force = Zero();
  // Derivative of the Wilson hopping term
  this->DhopDeriv(force, X, Y, dag);
  ///////////////////////////////////////////////////////////
  // Clover term derivative
  ///////////////////////////////////////////////////////////
  Impl::outerProductImpl(Lambda, X, Y);
  //std::cout << "Lambda:" << Lambda << std::endl;
  // Row-major list of the off-diagonal sigma_{mu nu}; it is consumed in exactly
  // the order in which the (mu, nu) loops below visit the pairs with mu != nu.
  Gamma::Algebra sigma[] = {
      Gamma::Algebra::SigmaXY,
      Gamma::Algebra::SigmaXZ,
      Gamma::Algebra::SigmaXT,
      Gamma::Algebra::MinusSigmaXY,
      Gamma::Algebra::SigmaYZ,
      Gamma::Algebra::SigmaYT,
      Gamma::Algebra::MinusSigmaXZ,
      Gamma::Algebra::MinusSigmaYZ,
      Gamma::Algebra::SigmaZT,
      Gamma::Algebra::MinusSigmaXT,
      Gamma::Algebra::MinusSigmaYT,
      Gamma::Algebra::MinusSigmaZT};
  /*
    sigma_{\mu \nu}=
    | 0         sigma[0]  sigma[1]  sigma[2] |
    | sigma[3]    0       sigma[4]  sigma[5] |
    | sigma[6]  sigma[7]     0      sigma[8] |
    | sigma[9]  sigma[10] sigma[11]   0      |
  */
  int count = 0;
  clover_force = Zero();
  for (int mu = 0; mu < 4; mu++)
  {
    force_mu = Zero();
    for (int nu = 0; nu < 4; nu++)
    {
      if (mu == nu)
        continue;
      // Directions are indexed 0..3, so the time direction is Tdir, not 4.
      // The previous test against 4 could never be true, which meant csw_t
      // was silently ignored for the temporal planes.
      RealD factor;
      if (nu == Tdir || mu == Tdir)
      {
        factor = 2.0 * csw_t;
      }
      else
      {
        factor = 2.0 * csw_r;
      }
      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
      force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu);                   // checked
      count++;
    }
    pokeLorentz(clover_force, U[mu] * force_mu, mu);
  }
  //clover_force *= csw;
  force += clover_force;
 }
 // Derivative parts
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
+void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
 {
  assert(0);
 }
 // Derivative parts
-template<class Impl, class CloverHelpers>
+template <class Impl>
-void WilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
+void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
 {
  assert(0); // not implemented yet
 }
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@@ -51,9 +51,9 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
  _FourDimGrid        (&FourDimGrid),
  _FourDimRedBlackGrid(&FourDimRedBlackGrid),
-  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements,p),
+  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements,p.locally_periodic,p),
-  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements,p), // source is Even
+  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements,p.locally_periodic,p), // source is Even
-  StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements,p), // source is Odd
+  StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements,p.locally_periodic,p), // source is Odd
  M5(_M5),
  Umu(_FourDimGrid),
  UmuEven(_FourDimRedBlackGrid),
@@ -361,10 +361,21 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
                                         const FermionField &in, FermionField &out,int dag)
 {
  DhopTotalTime-=usecond();
-  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
+
  assert(  (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
 	 ||(WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
         ||(WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsDirichlet) );
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) {
    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
-  else 
+  }
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute ) {
    DhopInternalSerialComms(st,lo,U,in,out,dag);
  }
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsDirichlet ) {
    DhopInternalDirichletComms(st,lo,U,in,out,dag);
  }
  DhopTotalTime+=usecond();
 }
@@ -431,6 +442,30 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
  DhopComputeTime2+=usecond();
 }
 // Dirichlet variant of the 5D hopping term.
 //
 // This routine issues no communication calls of its own: it runs the selected
 // Wilson kernel (daggered or not, chosen by dag) over all U.oSites() sites,
 // handing it the stencil's existing comm buffer, and accumulates the elapsed
 // time into DhopComputeTime. The lo argument is not used here.
 // NOTE(review): the trailing (1,0) kernel arguments are presumably the
 // interior/exterior selectors -- confirm against the kernel signature.
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalDirichletComms(StencilImpl & st, LebesgueOrder &lo,
                                                        DoubledGaugeField & U,
                                                        const FermionField &in, FermionField &out,int dag)
 {
  Compressor compressor(dag);                       // constructed but otherwise unused here
  int rdim0     = in.Grid()->_rdimensions[0];       // leading reduced dimension of the input grid
  int gaugeSites = U.Grid()->oSites();              // kept from the original; not read below

  /////////////////////////////
  // Compute on every site
  /////////////////////////////
  int kernelOpt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know

  DhopComputeTime -= usecond();
  if (dag != DaggerYes) {
    Kernels::DhopKernel(kernelOpt, st, U, st.CommBuf(), rdim0, U.oSites(), in, out, 1, 0);
  } else {
    Kernels::DhopDagKernel(kernelOpt, st, U, st.CommBuf(), rdim0, U.oSites(), in, out, 1, 0);
  }
  // Wait for the kernel before stopping the timer.
  accelerator_barrier();
  DhopComputeTime += usecond();
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@@ -4,13 +4,12 @@ Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonFermion.cc
-Copyright (C) 2022
+Copyright (C) 2015
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Fabian Joswig <fabian.joswig@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -48,9 +47,9 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
    Kernels(p),
    _grid(&Fgrid),
    _cbgrid(&Hgrid),
-    Stencil(&Fgrid, npoint, Even, directions, displacements,p),
+    Stencil(&Fgrid, npoint, Even, directions, displacements,p.locally_periodic,p),
-    StencilEven(&Hgrid, npoint, Even, directions,displacements,p),  // source is Even
+    StencilEven(&Hgrid, npoint, Even, directions,displacements,p.locally_periodic,p),  // source is Even
-    StencilOdd(&Hgrid, npoint, Odd, directions,displacements,p),  // source is Odd
+    StencilOdd(&Hgrid, npoint, Odd, directions,displacements,p.locally_periodic,p),  // source is Odd
    mass(_mass),
    Lebesgue(_grid),
    LebesgueEvenOdd(_cbgrid),
@@ -489,12 +488,21 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
                                       FermionField &out, int dag)
 {
  DhopTotalTime-=usecond();
-#ifdef GRID_OMP
+
-  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
+  assert(  (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
 	 ||(WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
         ||(WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsDirichlet) );
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) {
    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
-  else
+  }
-#endif
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute ) {
    DhopInternalSerial(st,lo,U,in,out,dag);
  }
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsDirichlet ) {
    DhopInternalDirichletComms(st,lo,U,in,out,dag);
  }
  DhopTotalTime+=usecond();
 }
@@ -563,6 +571,29 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
  DhopComputeTime2+=usecond();
 };
 // Dirichlet variant of the 4D hopping term.
 //
 // This routine issues no communication calls of its own: after checking that
 // dag is one of DaggerNo / DaggerYes, it runs the matching Wilson kernel over
 // all U.oSites() sites with the stencil's existing comm buffer and adds the
 // elapsed time to DhopComputeTime. The lo argument is not used here.
 // NOTE(review): unlike the 5D counterpart there is no accelerator_barrier()
 // before the timer is stopped -- confirm whether that is intentional.
 template <class Impl>
 void WilsonFermion<Impl>::DhopInternalDirichletComms(StencilImpl &st, LebesgueOrder &lo,
                                                      DoubledGaugeField &U,
                                                      const FermionField &in,
                                                      FermionField &out, int dag)
 {
  assert((dag == DaggerNo) || (dag == DaggerYes));

  Compressor compressor(dag);            // constructed but otherwise unused here
  int gaugeSites = U.Grid()->oSites();   // kept from the original; not read below

  /////////////////////////////
  // Compute on every site
  /////////////////////////////
  int kernelOpt = WilsonKernelsStatic::Opt;

  DhopComputeTime -= usecond();
  if (dag != DaggerYes) {
    Kernels::DhopKernel(kernelOpt, st, U, st.CommBuf(), 1, U.oSites(), in, out, 1, 0);
  } else {
    Kernels::DhopDagKernel(kernelOpt, st, U, st.CommBuf(), 1, U.oSites(), in, out, 1, 0);
  }
  DhopComputeTime += usecond();
 };
 template <class Impl>
 void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
@@ -600,47 +631,11 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
                                                   Current curr_type,
                                                   unsigned int mu)
 {
  if(curr_type != Current::Vector)
  {
    std::cout << GridLogError << "Only the conserved vector current is implemented so far." << std::endl;
    exit(1);
  }
  Gamma g5(Gamma::Algebra::Gamma5);
  conformable(_grid, q_in_1.Grid());
  conformable(_grid, q_in_2.Grid());
  conformable(_grid, q_out.Grid());
-  auto UGrid= this->GaugeGrid();
+  assert(0);
  PropagatorField tmp_shifted(UGrid);
  PropagatorField g5Lg5(UGrid);
  PropagatorField R(UGrid);
  PropagatorField gmuR(UGrid);
    Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT,
  };
  Gamma gmu=Gamma(Gmu[mu]);
  g5Lg5=g5*q_in_1*g5;
  tmp_shifted=Cshift(q_in_2,mu,1);
  Impl::multLinkField(R,this->Umu,tmp_shifted,mu);
  gmuR=gmu*R;
  q_out=adj(g5Lg5)*R;
  q_out-=adj(g5Lg5)*gmuR;
  tmp_shifted=Cshift(q_in_1,mu,1);
  Impl::multLinkField(g5Lg5,this->Umu,tmp_shifted,mu);
  g5Lg5=g5*g5Lg5*g5;
  R=q_in_2;
  gmuR=gmu*R;
  q_out-=adj(g5Lg5)*R;
  q_out-=adj(g5Lg5)*gmuR;
 }
@@ -654,51 +649,9 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
                                              unsigned int tmax,
 					      ComplexField &lattice_cmplx)
 {
  if(curr_type != Current::Vector)
  {
    std::cout << GridLogError << "Only the conserved vector current is implemented so far." << std::endl;
    exit(1);
  }
  int tshift = (mu == Nd-1) ? 1 : 0;
  unsigned int LLt    = GridDefaultLatt()[Tp];
  conformable(_grid, q_in.Grid());
  conformable(_grid, q_out.Grid());
-  auto UGrid= this->GaugeGrid();
+  assert(0);
  PropagatorField tmp(UGrid);
  PropagatorField Utmp(UGrid);
  PropagatorField L(UGrid);
  PropagatorField zz (UGrid);
  zz=Zero();
  LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
    Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT,
  };
  Gamma gmu=Gamma(Gmu[mu]);
  tmp = Cshift(q_in,mu,1);
  Impl::multLinkField(Utmp,this->Umu,tmp,mu);
  tmp = ( Utmp*lattice_cmplx - gmu*Utmp*lattice_cmplx ); // Forward hop
  tmp = where((lcoor>=tmin),tmp,zz); // Mask the time
  q_out = where((lcoor<=tmax),tmp,zz); // Position of current complicated
  tmp = q_in *lattice_cmplx;
  tmp = Cshift(tmp,mu,-1);
  Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
  tmp = -( Utmp + gmu*Utmp );
  // Mask the time
  if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
    unsigned int t0 = 0;
    tmp = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
  } else {
    tmp = where((lcoor>=tmin+tshift),tmp,zz);
  }
  q_out+= where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
 }
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h
@@ -77,23 +77,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define REGISTER
 #ifdef GRID_SIMT
-#define LOAD_CHIMU(Ptype)		\
+#define LOAD_CHIMU(ptype)		\
  {const SiteSpinor & ref (in[offset]);	\
-    Chimu_00=coalescedReadPermute<Ptype>(ref()(0)(0),perm,lane);	\
+    Chimu_00=coalescedReadPermute<ptype>(ref()(0)(0),perm,lane);	\
-    Chimu_01=coalescedReadPermute<Ptype>(ref()(0)(1),perm,lane);		\
+    Chimu_01=coalescedReadPermute<ptype>(ref()(0)(1),perm,lane);		\
-    Chimu_02=coalescedReadPermute<Ptype>(ref()(0)(2),perm,lane);		\
+    Chimu_02=coalescedReadPermute<ptype>(ref()(0)(2),perm,lane);		\
-    Chimu_10=coalescedReadPermute<Ptype>(ref()(1)(0),perm,lane);		\
+    Chimu_10=coalescedReadPermute<ptype>(ref()(1)(0),perm,lane);		\
-    Chimu_11=coalescedReadPermute<Ptype>(ref()(1)(1),perm,lane);		\
+    Chimu_11=coalescedReadPermute<ptype>(ref()(1)(1),perm,lane);		\
-    Chimu_12=coalescedReadPermute<Ptype>(ref()(1)(2),perm,lane);		\
+    Chimu_12=coalescedReadPermute<ptype>(ref()(1)(2),perm,lane);		\
-    Chimu_20=coalescedReadPermute<Ptype>(ref()(2)(0),perm,lane);		\
+    Chimu_20=coalescedReadPermute<ptype>(ref()(2)(0),perm,lane);		\
-    Chimu_21=coalescedReadPermute<Ptype>(ref()(2)(1),perm,lane);		\
+    Chimu_21=coalescedReadPermute<ptype>(ref()(2)(1),perm,lane);		\
-    Chimu_22=coalescedReadPermute<Ptype>(ref()(2)(2),perm,lane);		\
+    Chimu_22=coalescedReadPermute<ptype>(ref()(2)(2),perm,lane);		\
-    Chimu_30=coalescedReadPermute<Ptype>(ref()(3)(0),perm,lane);		\
+    Chimu_30=coalescedReadPermute<ptype>(ref()(3)(0),perm,lane);		\
-    Chimu_31=coalescedReadPermute<Ptype>(ref()(3)(1),perm,lane);		\
+    Chimu_31=coalescedReadPermute<ptype>(ref()(3)(1),perm,lane);		\
-    Chimu_32=coalescedReadPermute<Ptype>(ref()(3)(2),perm,lane);	}
+    Chimu_32=coalescedReadPermute<ptype>(ref()(3)(2),perm,lane);	}
 #define PERMUTE_DIR(dir) ;
 #else
-#define LOAD_CHIMU(Ptype)		\
+#define LOAD_CHIMU(ptype)		\
  {const SiteSpinor & ref (in[offset]);	\
    Chimu_00=ref()(0)(0);\
    Chimu_01=ref()(0)(1);\
@@ -109,12 +109,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    Chimu_32=ref()(3)(2);}
 #define PERMUTE_DIR(dir)			\
-  permute##dir(Chi_00,Chi_00);			\
+  permute##dir(Chi_00,Chi_00);	\
-  permute##dir(Chi_01,Chi_01);			\
+      permute##dir(Chi_01,Chi_01);\
-  permute##dir(Chi_02,Chi_02);			\
+      permute##dir(Chi_02,Chi_02);\
-  permute##dir(Chi_10,Chi_10);			\
+      permute##dir(Chi_10,Chi_10);	\
-  permute##dir(Chi_11,Chi_11);			\
+      permute##dir(Chi_11,Chi_11);\
-  permute##dir(Chi_12,Chi_12);
+      permute##dir(Chi_12,Chi_12);
 #endif
@@ -371,91 +371,88 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  result_32-= UChi_12;
 #define HAND_STENCIL_LEGB(PROJ,PERM,DIR,RECON)	\
-  {int ptype;					\
+  SE=st.GetEntry(ptype,DIR,ss);			\
-   SE=st.GetEntry(ptype,DIR,ss);		\
+  offset = SE->_offset;				\
-   auto offset = SE->_offset;			\
+  local  = SE->_is_local;			\
-   auto local  = SE->_is_local;			\
+  perm   = SE->_permute;			\
-   auto perm   = SE->_permute;			\
+  if ( local ) {				\
-   if ( local ) {				\
+    LOAD_CHIMU(PERM);				\
-     LOAD_CHIMU(PERM);				\
+    PROJ;					\
-     PROJ;					\
+    if ( perm) {				\
-     if ( perm) {				\
+      PERMUTE_DIR(PERM);			\
-       PERMUTE_DIR(PERM);			\
+    }						\
-     }						\
+  } else {					\
-   } else {					\
+    LOAD_CHI;					\
-     LOAD_CHI;					\
+  }						\
-   }						\
+  acceleratorSynchronise();			\
-   acceleratorSynchronise();			\
+  MULT_2SPIN(DIR);				\
-   MULT_2SPIN(DIR);				\
+  RECON;					
   RECON;					}
-#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)		\
+#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)	\
-  { SE=&st_p[DIR+8*ss];						\
+  SE=&st_p[DIR+8*ss];				\
-  auto ptype=st_perm[DIR];					\
+  ptype=st_perm[DIR];				\
-  auto offset = SE->_offset;					\
+  offset = SE->_offset;				\
-  auto local  = SE->_is_local;					\
+  local  = SE->_is_local;			\
-  auto perm   = SE->_permute;					\
+  perm   = SE->_permute;			\
-  if ( local ) {						\
+  if ( local ) {				\
-    LOAD_CHIMU(PERM);						\
+    LOAD_CHIMU(PERM);				\
-    PROJ;							\
+    PROJ;					\
-    if ( perm) {						\
+    if ( perm) {				\
-      PERMUTE_DIR(PERM);					\
+      PERMUTE_DIR(PERM);			\
-    }								\
+    }						\
-  } else {							\
+  } else {					\
-    LOAD_CHI;							\
+    LOAD_CHI;					\
-  }								\
+  }						\
-  acceleratorSynchronise();					\
+  acceleratorSynchronise();			\
-  MULT_2SPIN(DIR);						\
+  MULT_2SPIN(DIR);				\
-  RECON;					}
+  RECON;					
 #define HAND_STENCIL_LEGA(PROJ,PERM,DIR,RECON)				\
-  { SE=&st_p[DIR+8*ss];							\
+  SE=&st_p[DIR+8*ss];							\
-    auto ptype=st_perm[DIR];						\
+  ptype=st_perm[DIR];							\
-    /*SE=st.GetEntry(ptype,DIR,ss);*/					\
+ /*SE=st.GetEntry(ptype,DIR,ss);*/					\
-    auto offset = SE->_offset;						\
+  offset = SE->_offset;				\
-    auto perm   = SE->_permute;						\
+  perm   = SE->_permute;			\
-    LOAD_CHIMU(PERM);							\
+  LOAD_CHIMU(PERM);				\
-    PROJ;								\
+  PROJ;						\
-    MULT_2SPIN(DIR);							\
+  MULT_2SPIN(DIR);				\
-    RECON;					}
+  RECON;					
 #define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON)	\
-  { int ptype;						\
+  SE=st.GetEntry(ptype,DIR,ss);			\
-  SE=st.GetEntry(ptype,DIR,ss);				\
+  offset = SE->_offset;				\
-  auto offset = SE->_offset;					\
+  local  = SE->_is_local;			\
-  auto local  = SE->_is_local;					\
+  perm   = SE->_permute;			\
-  auto perm   = SE->_permute;					\
+  if ( local ) {				\
-  if ( local ) {						\
+    LOAD_CHIMU(PERM);				\
-    LOAD_CHIMU(PERM);						\
+    PROJ;					\
-    PROJ;							\
+    if ( perm) {				\
-    if ( perm) {						\
+      PERMUTE_DIR(PERM);			\
-      PERMUTE_DIR(PERM);					\
+    }						\
-    }								\
+  } else if ( st.same_node[DIR] ) {		\
-  } else if ( st.same_node[DIR] ) {				\
+    LOAD_CHI;					\
-    LOAD_CHI;							\
+  }						\
-  }								\
+  acceleratorSynchronise();			\
-  acceleratorSynchronise();					\
+  if (local || st.same_node[DIR] ) {		\
-  if (local || st.same_node[DIR] ) {				\
+    MULT_2SPIN(DIR);				\
-    MULT_2SPIN(DIR);						\
+    RECON;					\
-    RECON;							\
+  }						\
-  }								\
+  acceleratorSynchronise();			
  acceleratorSynchronise();			}
 #define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON)	\
-  { int ptype;						\
+  SE=st.GetEntry(ptype,DIR,ss);			\
-  SE=st.GetEntry(ptype,DIR,ss);				\
+  offset = SE->_offset;				\
-  auto offset = SE->_offset;				\
+  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
-  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {		\
+    LOAD_CHI;					\
-    LOAD_CHI;						\
+    MULT_2SPIN(DIR);				\
-    MULT_2SPIN(DIR);					\
+    RECON;					\
-    RECON;						\
+    nmu++;					\
-    nmu++;						\
+  }						\
-  }							\
+  acceleratorSynchronise();			
  acceleratorSynchronise();			}
-#define HAND_RESULT(ss)					\
+#define HAND_RESULT(ss)				\
-  {							\
+  {						\
-    SiteSpinor & ref (out[ss]);				\
+    SiteSpinor & ref (out[ss]);			\
    coalescedWrite(ref()(0)(0),result_00,lane);		\
    coalescedWrite(ref()(0)(1),result_01,lane);		\
    coalescedWrite(ref()(0)(2),result_02,lane);		\
@@ -566,6 +563,7 @@ WilsonKernels<Impl>::HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p,
  HAND_DECLARATIONS(Simt);
  int offset,local,perm, ptype;
  StencilEntry *SE;
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
@@ -595,7 +593,9 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
  HAND_DECLARATIONS(Simt);
  int offset,local,perm, ptype;
  StencilEntry *SE;
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
@@ -623,6 +623,8 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
  HAND_DECLARATIONS(Simt);
  StencilEntry *SE;
  int offset,local,perm, ptype;
  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
@@ -638,8 +640,8 @@ template<class Impl>  accelerator_inline void
 WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  //  auto st_p = st._entries_p;						
+  auto st_p = st._entries_p;						
-  //  auto st_perm = st._permute_type;					
+  auto st_perm = st._permute_type;					
 // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@@ -650,6 +652,7 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si
  HAND_DECLARATIONS(Simt);
  int offset,local,perm, ptype;
  StencilEntry *SE;
  ZERO_RESULT;
  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
@@ -667,8 +670,8 @@ template<class Impl> accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  //  auto st_p = st._entries_p;						
+  auto st_p = st._entries_p;						
-  //  auto st_perm = st._permute_type;					
+  auto st_perm = st._permute_type;					
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
@@ -679,6 +682,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
  HAND_DECLARATIONS(Simt);
  StencilEntry *SE;
  int offset,local,perm, ptype;
  ZERO_RESULT;
  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
@@ -695,8 +699,8 @@ template<class Impl>  accelerator_inline void
 WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  //  auto st_p = st._entries_p;						
+  auto st_p = st._entries_p;						
-  //  auto st_perm = st._permute_type;					
+  auto st_perm = st._permute_type;					
 // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@@ -707,7 +711,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si
  HAND_DECLARATIONS(Simt);
-  //  int offset, ptype;
+  int offset, ptype;
  StencilEntry *SE;
  int nmu=0;
  ZERO_RESULT;
@@ -726,8 +730,8 @@ template<class Impl>  accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  //  auto st_p = st._entries_p;						
+  auto st_p = st._entries_p;						
-  //  auto st_perm = st._permute_type;					
+  auto st_perm = st._permute_type;					
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
@@ -738,7 +742,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldVi
  HAND_DECLARATIONS(Simt);
  StencilEntry *SE;
-  //  int offset, ptype;
+  int offset, ptype;
  int nmu=0;
  ZERO_RESULT;
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -498,7 +498,6 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDag);     return;}
 #endif
     acceleratorFenceComputeStream();
   } else if( interior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt);    return;}
@@ -506,13 +505,11 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagInt);     return;}
 #endif
   } else if( exterior ) {
     acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt);    return;}
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagExt);     return;}
 #endif
     acceleratorFenceComputeStream();
   }
   assert(0 && " Kernel optimisation case not covered ");
  }
--- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
@@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
@@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
@@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
@@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonCloverFermionInstantiation.cc.master
+++ b/Grid/qcd/action/fermion/instantiation/WilsonCloverFermionInstantiation.cc.master
@@ -8,7 +8,6 @@
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Mattia Bruno <mattia.bruno@cern.ch>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -32,12 +31,10 @@
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/WilsonCloverFermion.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
-template class WilsonCloverFermion<IMPLEMENTATION, CloverHelpers<IMPLEMENTATION>>; 
+template class WilsonCloverFermion<IMPLEMENTATION>; 
 template class WilsonCloverFermion<IMPLEMENTATION, ExpCloverHelpers<IMPLEMENTATION>>; 
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplD/CompactWilsonCloverFermionInstantiationWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD/CompactWilsonCloverFermionInstantiationWilsonImplD.cc
@@ -1 +0,0 @@
 ../CompactWilsonCloverFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
@@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
@@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplF/CompactWilsonCloverFermionInstantiationWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplF/CompactWilsonCloverFermionInstantiationWilsonImplF.cc
@@ -1 +0,0 @@
 ../CompactWilsonCloverFermionInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
@@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
@@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
@@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
@@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
@@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
@@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
@@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
@@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
@@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
@@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
@@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
@@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
@@ -1 +0,0 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
@@ -0,0 +1,51 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh
+++ b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh
@@ -18,10 +18,6 @@ WILSON_IMPL_LIST=" \
 	   GparityWilsonImplF \
 	   GparityWilsonImplD "
 COMPACT_WILSON_IMPL_LIST=" \
 	   WilsonImplF \
 	   WilsonImplD "
 DWF_IMPL_LIST=" \
 	   WilsonImplF \
 	   WilsonImplD \
@@ -54,16 +50,6 @@ do
 done
 done
 CC_LIST="CompactWilsonCloverFermionInstantiation"
 for impl in $COMPACT_WILSON_IMPL_LIST
 do
 for f in $CC_LIST
 do
  ln -f -s ../$f.cc.master $impl/$f$impl.cc
 done
 done
 CC_LIST=" \
  CayleyFermion5DInstantiation \
  ContinuedFractionFermion5DInstantiation \
--- a/Show More
+++ b/Show More
		`@@ -1 +0,0 @@`
			`../CompactWilsonCloverFermionInstantiation.cc.master`