Merge 32e6d58356 into ffd7301649

Updated masked / fthmc smeared config container
Merge branch 'develop' of https://github.com/paboyle/Grid into develop
2026-07-23 20:13:27 +01:00 · 2023-06-02 01:59:29 -07:00 · 2023-06-01 06:23:02 -04:00 · 2023-06-01 06:22:33 -04:00 · 2023-06-01 06:15:08 -04:00 · 2023-06-01 06:14:28 -04:00
445 changed files with 47142 additions and 15939 deletions
@@ -0,0 +1,54 @@
 name: Bug report
 description: Report a bug.
 title: "<insert title>"
 labels: [bug]
 body:
  - type: markdown
    attributes:
      value: >
        Thank you for taking the time to file a bug report.
        Please check that the code is pointing to the HEAD of develop
        or any commit in master which is tagged with a version number.
  - type: textarea
    attributes:
      label: "Describe the issue:"
      description: >
        Describe the issue and any previous attempt to solve it.
    validations:
      required: true
  - type: textarea
    attributes:
      label: "Code example:"
      description: >
        If relevant, show how to reproduce the issue using a minimal working
        example.
      placeholder: |
        << your code here >>
      render: shell
    validations:
      required: false
  - type: textarea
    attributes:
      label: "Target platform:"
      description: >
        Give a description of the target platform (CPU, network, compiler).
        Please give the full CPU part description, using for example
        `cat /proc/cpuinfo | grep 'model name' | uniq` (Linux)
        or `sysctl machdep.cpu.brand_string` (macOS) and the full output
        the `--version` option of your compiler.
    validations:
      required: true
  - type: textarea
    attributes:
      label: "Configure options:"
      description: >
        Please give the exact configure command used and attach
        `config.log`, `grid.config.summary` and the output of `make V=1`.
      render: shell
    validations:
      required: true
@@ -44,14 +44,22 @@ directory
 #ifdef __NVCC__
 //disables nvcc specific warning in json.hpp
 #pragma clang diagnostic ignored "-Wdeprecated-register"
 #ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
 //disables nvcc specific warning in json.hpp
 #pragma nv_diag_suppress unsigned_compare_with_zero
 #pragma nv_diag_suppress cast_to_qualified_type
 //disables nvcc specific warning in many files
 #pragma nv_diag_suppress esa_on_defaulted_function_ignored
 #pragma nv_diag_suppress extra_semicolon
 #else
 //disables nvcc specific warning in json.hpp
 #pragma diag_suppress unsigned_compare_with_zero
 #pragma diag_suppress cast_to_qualified_type
 //disables nvcc specific warning in many files
 #pragma diag_suppress esa_on_defaulted_function_ignored
 #pragma diag_suppress extra_semicolon
-
+#endif
 //Eigen only
 #endif
 // Disable vectorisation in Eigen on the Power8/9 and PowerPC
@@ -44,9 +44,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridStd.h>
 #include <Grid/threads/Pragmas.h>
 #include <Grid/perfmon/Timer.h>
-#include <Grid/perfmon/PerfCount.h>
+//#include <Grid/perfmon/PerfCount.h>
 #include <Grid/util/Util.h>
 #include <Grid/log/Log.h>
 #include <Grid/perfmon/Tracing.h>
 #include <Grid/allocator/Allocator.h>
 #include <Grid/simd/Simd.h>
 #include <Grid/threads/ThreadReduction.h>
@@ -36,6 +36,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridCore.h>
 #include <Grid/qcd/QCD.h>
 #include <Grid/qcd/spin/Spin.h>
 #include <Grid/qcd/gparity/Gparity.h>
 #include <Grid/qcd/utils/Utils.h>
 #include <Grid/qcd/representations/Representations.h>
 NAMESPACE_CHECK(GridQCDCore);
@@ -16,6 +16,7 @@
 #include <functional>
 #include <stdio.h>
 #include <stdlib.h>
 #include <strings.h>
 #include <stdio.h>
 #include <signal.h>
 #include <ctime>
@@ -14,7 +14,11 @@
 /* NVCC save and restore compile environment*/
 #ifdef __NVCC__
 #pragma push
 #ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
 #pragma nv_diag_suppress code_is_unreachable
 #else
 #pragma diag_suppress code_is_unreachable
 #endif
 #pragma push_macro("__CUDA_ARCH__")
 #pragma push_macro("__NVCC__")
 #pragma push_macro("__CUDACC__")
@@ -54,6 +54,8 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h>
 #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
@@ -262,7 +262,7 @@ public:
 	autoView( Tnp_v , (*Tnp), AcceleratorWrite);
 	autoView( Tnm_v , (*Tnm), AcceleratorWrite);
 	const int Nsimd = CComplex::Nsimd();
-	accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
+	accelerator_for(ss, FineGrid->oSites(), Nsimd, {
 	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
 	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
        });
@@ -324,9 +324,9 @@ public:
  GridBase*        _cbgrid;
  int hermitian;
-  CartesianStencil<siteVector,siteVector,int> Stencil; 
+  CartesianStencil<siteVector,siteVector,DefaultImplParams> Stencil; 
-  CartesianStencil<siteVector,siteVector,int> StencilEven;
+  CartesianStencil<siteVector,siteVector,DefaultImplParams> StencilEven;
-  CartesianStencil<siteVector,siteVector,int> StencilOdd;
+  CartesianStencil<siteVector,siteVector,DefaultImplParams> StencilOdd;
  std::vector<CoarseMatrix> A;
  std::vector<CoarseMatrix> Aeven;
@@ -358,7 +358,7 @@ public:
    autoView( in_v , in, AcceleratorRead);
    autoView( out_v , out, AcceleratorWrite);
    autoView( Stencil_v  , Stencil, AcceleratorRead);
-    auto& geom_v = geom;
+    int npoint = geom.npoint;
    typedef LatticeView<Cobj> Aview;
    Vector<Aview> AcceleratorViewContainer;
@@ -380,7 +380,7 @@ public:
      int ptype;
      StencilEntry *SE;
-      for(int point=0;point<geom_v.npoint;point++){
+      for(int point=0;point<npoint;point++){
 	SE=Stencil_v.GetEntry(ptype,point,ss);
@@ -424,7 +424,7 @@ public:
    autoView( in_v , in, AcceleratorRead);
    autoView( out_v , out, AcceleratorWrite);
    autoView( Stencil_v  , Stencil, AcceleratorRead);
-    auto& geom_v = geom;
+    int npoint = geom.npoint;
    typedef LatticeView<Cobj> Aview;
    Vector<Aview> AcceleratorViewContainer;
@@ -454,7 +454,7 @@ public:
      int ptype;
      StencilEntry *SE;
-      for(int p=0;p<geom_v.npoint;p++){
+      for(int p=0;p<npoint;p++){
        int point = points_p[p];
 	SE=Stencil_v.GetEntry(ptype,point,ss);
@@ -631,7 +631,7 @@ public:
    assert(Aself != nullptr);
  }
-  void DselfInternal(CartesianStencil<siteVector,siteVector,int> &st, CoarseMatrix &a,
+  void DselfInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, CoarseMatrix &a,
                       const CoarseVector &in, CoarseVector &out, int dag) {
    int point = geom.npoint-1;
    autoView( out_v, out, AcceleratorWrite);
@@ -694,7 +694,7 @@ public:
    }
  }
-  void DhopInternal(CartesianStencil<siteVector,siteVector,int> &st, std::vector<CoarseMatrix> &a,
+  void DhopInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, std::vector<CoarseMatrix> &a,
                    const CoarseVector &in, CoarseVector &out, int dag) {
    SimpleCompressor<siteVector> compressor;
@@ -784,9 +784,9 @@ public:
    _cbgrid(new GridRedBlackCartesian(&CoarseGrid)),
    geom(CoarseGrid._ndimension),
    hermitian(hermitian_),
-    Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
+    Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
-    StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements,0),
+    StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements),
-    StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
+    StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements),
    A(geom.npoint,&CoarseGrid),
    Aeven(geom.npoint,_cbgrid),
    Aodd(geom.npoint,_cbgrid),
@@ -804,9 +804,9 @@ public:
    _cbgrid(&CoarseRBGrid),
    geom(CoarseGrid._ndimension),
    hermitian(hermitian_),
-    Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
+    Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
-    StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
+    StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements),
-    StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
+    StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements),
    A(geom.npoint,&CoarseGrid),
    Aeven(geom.npoint,&CoarseRBGrid),
    Aodd(geom.npoint,&CoarseRBGrid),
@@ -52,6 +52,7 @@ public:
  virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
  virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
  virtual void HermOp(const Field &in, Field &out)=0;
  virtual ~LinearOperatorBase(){};
 };
@@ -507,7 +508,7 @@ class SchurStaggeredOperator :  public SchurOperatorBase<Field> {
  virtual  void MpcDag   (const Field &in, Field &out){
    Mpc(in,out);
  }
-  virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
+  virtual void MpcDagMpc(const Field &in, Field &out) {
    assert(0);// Never need with staggered
  }
 };
@@ -525,6 +526,7 @@ public:
      (*this)(Linop,in[k],out[k]);
    }
  };
  virtual ~OperatorFunction(){};
 };
 template<class Field> class LinearFunction {
@@ -540,6 +542,7 @@ public:
      (*this)(in[i], out[i]);
    }
  }
  virtual ~LinearFunction(){};
 };
 template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {
@@ -585,6 +588,7 @@ class HermOpOperatorFunction : public OperatorFunction<Field> {
 template<typename Field>
 class PlainHermOp : public LinearFunction<Field> {
 public:
  using LinearFunction<Field>::operator();
  LinearOperatorBase<Field> &_Linop;
  PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop) 
@@ -598,6 +602,7 @@ public:
 template<typename Field>
 class FunctionHermOp : public LinearFunction<Field> {
 public:
  using LinearFunction<Field>::operator(); 
  OperatorFunction<Field>   & _poly;
  LinearOperatorBase<Field> &_Linop;
@@ -30,13 +30,19 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
 template<class Field> using Preconditioner =  LinearFunction<Field> ;
 /*
 template<class Field> class Preconditioner :  public LinearFunction<Field> {
  using LinearFunction<Field>::operator();
  virtual void operator()(const Field &src, Field & psi)=0;
 };
 */
 template<class Field> class TrivialPrecon :  public Preconditioner<Field> { 
 public:
-  void operator()(const Field &src, Field & psi){
+  using Preconditioner<Field>::operator();
  virtual void operator()(const Field &src, Field & psi){
    psi = src;
  }
  TrivialPrecon(void){};
@@ -48,6 +48,7 @@ public:
  virtual  void Mdiag    (const Field &in, Field &out)=0;
  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0;
  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)=0;
  virtual ~SparseMatrixBase() {};
 };
 /////////////////////////////////////////////////////////////////////////////////////////////
@@ -72,7 +73,7 @@ public:
  virtual  void MeooeDag    (const Field &in, Field &out)=0;
  virtual  void MooeeDag    (const Field &in, Field &out)=0;
  virtual  void MooeeInvDag (const Field &in, Field &out)=0;
-
+  virtual ~CheckerBoardedSparseMatrixBase() {};
 };
 NAMESPACE_END(Grid);
@@ -258,26 +258,12 @@ public:
    for(int n=2;n<order;n++){
      Linop.HermOp(*Tn,y);
 #if 0
      auto y_v = y.View();
      auto Tn_v = Tn->View();
      auto Tnp_v = Tnp->View();
      auto Tnm_v = Tnm->View();
      constexpr int Nsimd = vector_type::Nsimd();
      accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
 	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
 	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
      });
      if ( Coeffs[n] != 0.0) {
 	axpy(out,Coeffs[n],*Tnp,out);
      }
 #else
      axpby(y,xscale,mscale,y,(*Tn));
      axpby(*Tnp,2.0,-1.0,y,(*Tnm));
      if ( Coeffs[n] != 0.0) {
 	axpy(out,Coeffs[n],*Tnp,out);
      }
-#endif
+
      // Cycle pointers to avoid copies
      Field *swizzle = Tnm;
      Tnm    =Tn;
@@ -37,6 +37,7 @@ template<class FieldD, class FieldF, typename std::enable_if< getPrecision<Field
 class MixedPrecisionBiCGSTAB : public LinearFunction<FieldD> 
 {
  public:
    using LinearFunction<FieldD>::operator();
    RealD   Tolerance;
    RealD   InnerTolerance; // Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
@@ -58,6 +58,7 @@ public:
  void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
    GRID_TRACE("ConjugateGradient");
    psi.Checkerboard() = src.Checkerboard();
    conformable(psi, src);
@@ -117,9 +118,13 @@ public:
    GridStopWatch MatrixTimer;
    GridStopWatch SolverTimer;
    RealD usecs = -usecond();
    SolverTimer.Start();
    int k;
    for (k = 1; k <= MaxIterations; k++) {
      GridStopWatch IterationTimer;
      IterationTimer.Start();
      c = cp;
      MatrixTimer.Start();
@@ -152,31 +157,41 @@ public:
      LinearCombTimer.Stop();
      LinalgTimer.Stop();
-      std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
+      IterationTimer.Stop();
      if ( (k % 500) == 0 ) {
 	std::cout << GridLogMessage << "ConjugateGradient: Iteration " << k
                << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
      } else { 
 	std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
 		  << " residual " << sqrt(cp/ssq) << " target " << Tolerance << " took " << IterationTimer.Elapsed() << std::endl;
      }
      // Stopping condition
      if (cp <= rsq) {
 	usecs +=usecond();
        SolverTimer.Stop();
        Linop.HermOpAndNorm(psi, mmp, d, qq);
        p = mmp - src;
-
+	GridBase *grid = src.Grid();
 	RealD DwfFlops = (1452. )*grid->gSites()*4*k
   	               + (8+4+8+4+4)*12*grid->gSites()*k; // CG linear algebra
        RealD srcnorm = std::sqrt(norm2(src));
        RealD resnorm = std::sqrt(norm2(p));
        RealD true_residual = resnorm / srcnorm;
        std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k 
 		  << "\tComputed residual " << std::sqrt(cp / ssq)
 		  << "\tTrue residual " << true_residual
 		  << "\tTarget " << Tolerance << std::endl;
-        std::cout << GridLogIterative << "Time breakdown "<<std::endl;
+        std::cout << GridLogMessage << "Time breakdown "<<std::endl;
-	std::cout << GridLogIterative << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
+	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
-	std::cout << GridLogIterative << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
+	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
-	std::cout << GridLogIterative << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
+	std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
-	std::cout << GridLogIterative << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
+	std::cout << GridLogMessage << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
-	std::cout << GridLogIterative << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
+	std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
-	std::cout << GridLogIterative << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
+	std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
 	std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
@@ -36,6 +36,7 @@ NAMESPACE_BEGIN(Grid);
    typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
  class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
  public:
    using LinearFunction<FieldD>::operator();
    RealD   Tolerance;
    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
@@ -48,6 +49,7 @@ NAMESPACE_BEGIN(Grid);
    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
    RealD TrueResidual;
    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
    LinearFunction<FieldF> *guesser;
@@ -67,6 +69,7 @@ NAMESPACE_BEGIN(Grid);
    }
  void operator() (const FieldD &src_d_in, FieldD &sol_d){
    std::cout << GridLogMessage << "MixedPrecisionConjugateGradient: Starting mixed precision CG with outer tolerance " << Tolerance << " and inner tolerance " << InnerTolerance << std::endl;
    TotalInnerIterations = 0;
    GridStopWatch TotalTimer;
@@ -96,6 +99,7 @@ NAMESPACE_BEGIN(Grid);
    FieldF sol_f(SinglePrecGrid);
    sol_f.Checkerboard() = cb;
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting initial inner CG with tolerance " << inner_tol << std::endl;
    ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
    CG_f.ErrorOnNoConverge = false;
@@ -105,6 +109,9 @@ NAMESPACE_BEGIN(Grid);
    Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
    precisionChangeWorkspace pc_wk_sp_to_dp(DoublePrecGrid, SinglePrecGrid);
    precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, DoublePrecGrid);
    for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
      //Compute double precision rsd and also new RHS vector.
      Linop_d.HermOp(sol_d, tmp_d);
@@ -119,7 +126,7 @@ NAMESPACE_BEGIN(Grid);
      while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
      PrecChangeTimer.Start();
-      precisionChange(src_f, src_d);
+      precisionChange(src_f, src_d, pc_wk_dp_to_sp);
      PrecChangeTimer.Stop();
      sol_f = Zero();
@@ -129,6 +136,7 @@ NAMESPACE_BEGIN(Grid);
 	(*guesser)(src_f, sol_f);
      //Inner CG
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " << outer_iter << " starting inner CG with tolerance " << inner_tol << std::endl;
      CG_f.Tolerance = inner_tol;
      InnerCGtimer.Start();
      CG_f(Linop_f, src_f, sol_f);
@@ -137,7 +145,7 @@ NAMESPACE_BEGIN(Grid);
      //Convert sol back to double and add to double prec solution
      PrecChangeTimer.Start();
-      precisionChange(tmp_d, sol_f);
+      precisionChange(tmp_d, sol_f, pc_wk_sp_to_dp);
      PrecChangeTimer.Stop();
      axpy(sol_d, 1.0, tmp_d, sol_d);
@@ -149,6 +157,7 @@ NAMESPACE_BEGIN(Grid);
    ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
    CG_d(Linop_d, src_d_in, sol_d);
    TotalFinalStepIterations = CG_d.IterationsToComplete;
    TrueResidual = CG_d.TrueResidual;
    TotalTimer.Stop();
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
@@ -0,0 +1,213 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
    Copyright (C) 2015
    Author: Raoul Hodgson <raoul.hodgson@ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
 #define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
 NAMESPACE_BEGIN(Grid);
 //Mixed precision restarted defect correction CG
 template<class FieldD,class FieldF, 
  typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
  typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
 class MixedPrecisionConjugateGradientBatched : public LinearFunction<FieldD> {
 public:
  using LinearFunction<FieldD>::operator();
  RealD   Tolerance;
  RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
  Integer MaxInnerIterations;
  Integer MaxOuterIterations;
  Integer MaxPatchupIterations;
  GridBase* SinglePrecGrid; //Grid for single-precision fields
  RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
  LinearOperatorBase<FieldF> &Linop_f;
  LinearOperatorBase<FieldD> &Linop_d;
  //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
  LinearFunction<FieldF> *guesser;
  bool updateResidual;
  MixedPrecisionConjugateGradientBatched(RealD tol, 
          Integer maxinnerit, 
          Integer maxouterit, 
          Integer maxpatchit,
          GridBase* _sp_grid, 
          LinearOperatorBase<FieldF> &_Linop_f, 
          LinearOperatorBase<FieldD> &_Linop_d,
          bool _updateResidual=true) :
    Linop_f(_Linop_f), Linop_d(_Linop_d),
    Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid),
    OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { };
  void useGuesser(LinearFunction<FieldF> &g){
    guesser = &g;
  }
  void operator() (const FieldD &src_d_in, FieldD &sol_d){
    std::vector<FieldD> srcs_d_in{src_d_in};
    std::vector<FieldD> sols_d{sol_d};
    (*this)(srcs_d_in,sols_d);
    sol_d = sols_d[0];
  }
  void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){
    assert(src_d_in.size() == sol_d.size());
    int NBatch = src_d_in.size();
    std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl;
    Integer TotalOuterIterations = 0; //Number of restarts
    std::vector<Integer> TotalInnerIterations(NBatch,0);     //Number of inner CG iterations
    std::vector<Integer> TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step
    GridStopWatch TotalTimer;
    TotalTimer.Start();
    GridStopWatch InnerCGtimer;
    GridStopWatch PrecChangeTimer;
    int cb = src_d_in[0].Checkerboard();
    std::vector<RealD> src_norm;
    std::vector<RealD> norm;
    std::vector<RealD> stop;
    GridBase* DoublePrecGrid = src_d_in[0].Grid();
    FieldD tmp_d(DoublePrecGrid);
    tmp_d.Checkerboard() = cb;
    FieldD tmp2_d(DoublePrecGrid);
    tmp2_d.Checkerboard() = cb;
    std::vector<FieldD> src_d;
    std::vector<FieldF> src_f;
    std::vector<FieldF> sol_f;
    for (int i=0; i<NBatch; i++) {
      sol_d[i].Checkerboard() = cb;
      src_norm.push_back(norm2(src_d_in[i]));
      norm.push_back(0.);
      stop.push_back(src_norm[i] * Tolerance*Tolerance);
      src_d.push_back(src_d_in[i]); //source for next inner iteration, computed from residual during operation
      src_f.push_back(SinglePrecGrid);
      src_f[i].Checkerboard() = cb;
      sol_f.push_back(SinglePrecGrid);
      sol_f[i].Checkerboard() = cb;
    }
    RealD inner_tol = InnerTolerance;
    ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
    CG_f.ErrorOnNoConverge = false;
    Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
    for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
      std::cout << GridLogMessage << std::endl;
      std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl;
      bool allConverged = true;
      for (int i=0; i<NBatch; i++) {
        //Compute double precision rsd and also new RHS vector.
        Linop_d.HermOp(sol_d[i], tmp_d);
        norm[i] = axpy_norm(src_d[i], -1., tmp_d, src_d_in[i]); //src_d is residual vector
        std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Outer iteration " << outer_iter <<" solve " << i << " residual "<< norm[i] << " target "<< stop[i] <<std::endl;
        PrecChangeTimer.Start();
        precisionChange(src_f[i], src_d[i]);
        PrecChangeTimer.Stop();
        sol_f[i] = Zero();
        if(norm[i] > OuterLoopNormMult * stop[i]) {
          allConverged = false;
        }
      }
      if (allConverged) break;
      if (updateResidual) {
        RealD normMax = *std::max_element(std::begin(norm), std::end(norm));
        RealD stopMax = *std::max_element(std::begin(stop), std::end(stop));
        while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
        CG_f.Tolerance = inner_tol;
      }
      //Optionally improve inner solver guess (eg using known eigenvectors)
      if(guesser != NULL) {
        (*guesser)(src_f, sol_f);
      }
      for (int i=0; i<NBatch; i++) {
        //Inner CG
        InnerCGtimer.Start();
        CG_f(Linop_f, src_f[i], sol_f[i]);
        InnerCGtimer.Stop();
        TotalInnerIterations[i] += CG_f.IterationsToComplete;
        //Convert sol back to double and add to double prec solution
        PrecChangeTimer.Start();
        precisionChange(tmp_d, sol_f[i]);
        PrecChangeTimer.Stop();
        axpy(sol_d[i], 1.0, tmp_d, sol_d[i]);
      }
    }
    //Final trial CG
    std::cout << GridLogMessage << std::endl;
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Starting final patch-up double-precision solve"<<std::endl;
    for (int i=0; i<NBatch; i++) {
      ConjugateGradient<FieldD> CG_d(Tolerance, MaxPatchupIterations);
      CG_d(Linop_d, src_d_in[i], sol_d[i]);
      TotalFinalStepIterations[i] += CG_d.IterationsToComplete;
    }
    TotalTimer.Stop();
    std::cout << GridLogMessage << std::endl;
    for (int i=0; i<NBatch; i++) {
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: solve " << i << " Inner CG iterations " << TotalInnerIterations[i] << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations[i] << std::endl;
    }
    std::cout << GridLogMessage << std::endl;
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
  }
 };
 NAMESPACE_END(Grid);
 #endif
@@ -44,7 +44,7 @@ public:
  using OperatorFunction<Field>::operator();
-  RealD   Tolerance;
+  //  RealD   Tolerance;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
@@ -52,7 +52,7 @@ public:
  MultiShiftFunction shifts;
  std::vector<RealD> TrueResidualShift;
-  ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : 
+  ConjugateGradientMultiShift(Integer maxit, const MultiShiftFunction &_shifts) : 
    MaxIterations(maxit),
    shifts(_shifts)
  { 
@@ -84,6 +84,7 @@ public:
  void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi)
  {
    GRID_TRACE("ConjugateGradientMultiShift");
    GridBase *grid = src.Grid();
@@ -183,6 +184,9 @@ public:
      axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
    }
    std::cout << GridLogIterative << "ConjugateGradientMultiShift: initial rn (|src|^2) =" << rn << " qq (|MdagM src|^2) =" << qq << " d ( dot(src, [MdagM + m_0]src) ) =" << d << " c=" << c << std::endl;
  ///////////////////////////////////////
  // Timers
  ///////////////////////////////////////
@@ -322,7 +326,7 @@ public:
      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tAXPY     " << AXPYTimer.Elapsed()     <<std::endl;
-      std::cout << GridLogMessage << "\tMarix    " << MatrixTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tMatrix   " << MatrixTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
      IterationsToComplete = k;	
@@ -0,0 +1,373 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Christopher Kelly <ckelly@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 //CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision. 
 //The residual is stored in single precision, but the search directions and solution are stored in double precision. 
 //Every update_freq iterations the residual is corrected in double precision. 
 //For safety the a final regular CG is applied to clean up if necessary
 //PB Pure single, then double fixup
 template<class FieldD, class FieldF,
 	 typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
 	 typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
 class ConjugateGradientMultiShiftMixedPrecCleanup : public OperatorMultiFunction<FieldD>,
 					     public OperatorFunction<FieldD>
 {
 public:                                                
  using OperatorFunction<FieldD>::operator();
  RealD   Tolerance;
  Integer MaxIterationsMshift;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
  int verbose;
  MultiShiftFunction shifts;
  std::vector<RealD> TrueResidualShift;
  int ReliableUpdateFreq; //number of iterations between reliable updates
  GridBase* SinglePrecGrid; //Grid for single-precision fields
  LinearOperatorBase<FieldF> &Linop_f; //single precision
  ConjugateGradientMultiShiftMixedPrecCleanup(Integer maxit, const MultiShiftFunction &_shifts,
 				       GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
 				       int _ReliableUpdateFreq) : 
    MaxIterationsMshift(maxit),  shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
    MaxIterations(20000)
  { 
    verbose=1;
    IterationsToCompleteShift.resize(_shifts.order);
    TrueResidualShift.resize(_shifts.order);
  }
  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
  {
    GridBase *grid = src.Grid();
    int nshift = shifts.order;
    std::vector<FieldD> results(nshift,grid);
    (*this)(Linop,src,results,psi);
  }
  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
  {
    int nshift = shifts.order;
    (*this)(Linop,src,results);
    psi = shifts.norm*src;
    for(int i=0;i<nshift;i++){
      psi = psi + shifts.residues[i]*results[i];
    }
    return;
  }
  void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
  { 
    GRID_TRACE("ConjugateGradientMultiShiftMixedPrecCleanup");
    GridBase *DoublePrecGrid = src_d.Grid();
    ////////////////////////////////////////////////////////////////////////
    // Convenience references to the info stored in "MultiShiftFunction"
    ////////////////////////////////////////////////////////////////////////
    int nshift = shifts.order;
    std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
    std::vector<RealD> &mresidual(shifts.tolerances);
    std::vector<RealD> alpha(nshift,1.0);
    //Double precision search directions
    FieldD p_d(DoublePrecGrid);
    std::vector<FieldF> ps_f (nshift, SinglePrecGrid);// Search directions (single precision)
    std::vector<FieldF> psi_f(nshift, SinglePrecGrid);// solutions (single precision)
    FieldD tmp_d(DoublePrecGrid);
    FieldD r_d(DoublePrecGrid);
    FieldF r_f(SinglePrecGrid);
    FieldD mmp_d(DoublePrecGrid);
    assert(psi_d.size()==nshift);
    assert(mass.size()==nshift);
    assert(mresidual.size()==nshift);
    // dynamic sized arrays on stack; 2d is a pain with vector
    RealD  bs[nshift];
    RealD  rsq[nshift];
    RealD  rsqf[nshift];
    RealD  z[nshift][2];
    int     converged[nshift];
    const int       primary =0;
    //Primary shift fields CG iteration
    RealD a,b,c,d;
    RealD cp,bp,qq; //prev
    // Matrix mult fields
    FieldF p_f(SinglePrecGrid);
    FieldF mmp_f(SinglePrecGrid);
    // Check lightest mass
    for(int s=0;s<nshift;s++){
      assert( mass[s]>= mass[primary] );
      converged[s]=0;
    }
    // Wire guess to zero
    // Residuals "r" are src
    // First search direction "p" is also src
    cp = norm2(src_d);
    // Handle trivial case of zero src.
    if( cp == 0. ){
      for(int s=0;s<nshift;s++){
 	psi_d[s] = Zero();
 	psi_f[s] = Zero();
 	IterationsToCompleteShift[s] = 1;
 	TrueResidualShift[s] = 0.;
      }
      return;
    }
    for(int s=0;s<nshift;s++){
      rsq[s] = cp * mresidual[s] * mresidual[s];
      rsqf[s] =rsq[s];
      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
      //      ps_d[s] = src_d;
      precisionChange(ps_f[s],src_d);
    }
    // r and p for primary
    p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
    r_d = p_d;
    //MdagM+m[0]
    precisionChange(p_f,p_d);
    Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
    precisionChange(tmp_d,mmp_f);
    Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
    tmp_d = tmp_d - mmp_d;
    std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
    //    assert(norm2(tmp_d)< 1.0e-4);
    axpy(mmp_d,mass[0],p_d,mmp_d);
    RealD rn = norm2(p_d);
    d += rn*mass[0];
    b = -cp /d;
    // Set up the various shift variables
    int       iz=0;
    z[0][1-iz] = 1.0;
    z[0][iz]   = 1.0;
    bs[0]      = b;
    for(int s=1;s<nshift;s++){
      z[s][1-iz] = 1.0;
      z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0]));
      bs[s]      = b*z[s][iz]; 
    }
    // r += b[0] A.p[0]
    // c= norm(r)
    c=axpy_norm(r_d,b,mmp_d,r_d);
    for(int s=0;s<nshift;s++) {
      axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
      precisionChange(psi_f[s],psi_d[s]);
    }
    ///////////////////////////////////////
    // Timers
    ///////////////////////////////////////
    GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
    SolverTimer.Start();
    // Iteration loop
    int k;
    for (k=1;k<=MaxIterationsMshift;k++){    
      a = c /cp;
      AXPYTimer.Start();
      axpy(p_d,a,p_d,r_d); 
      AXPYTimer.Stop();
      PrecChangeTimer.Start();
      precisionChange(r_f, r_d);
      PrecChangeTimer.Stop();
      AXPYTimer.Start();
      for(int s=0;s<nshift;s++){
 	if ( ! converged[s] ) { 
 	  if (s==0){
 	    axpy(ps_f[s],a,ps_f[s],r_f);
 	  } else{
 	    RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
 	    axpby(ps_f[s],z[s][iz],as,r_f,ps_f[s]);
 	  }
 	}
      }
      AXPYTimer.Stop();
      cp=c;
      PrecChangeTimer.Start();
      precisionChange(p_f, p_d); //get back single prec search direction for linop
      PrecChangeTimer.Stop();
      MatrixTimer.Start();  
      Linop_f.HermOp(p_f,mmp_f);
      MatrixTimer.Stop();  
      PrecChangeTimer.Start();
      precisionChange(mmp_d, mmp_f); // From Float to Double
      PrecChangeTimer.Stop();
      d=real(innerProduct(p_d,mmp_d));    
      axpy(mmp_d,mass[0],p_d,mmp_d);
      RealD rn = norm2(p_d);
      d += rn*mass[0];
      bp=b;
      b=-cp/d;
      // Toggle the recurrence history
      bs[0] = b;
      iz = 1-iz;
      ShiftTimer.Start();
      for(int s=1;s<nshift;s++){
 	if((!converged[s])){
 	  RealD z0 = z[s][1-iz];
 	  RealD z1 = z[s][iz];
 	  z[s][iz] = z0*z1*bp
 	    / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b)); 
 	  bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
 	}
      }
      ShiftTimer.Stop();
      //Update single precision solutions
      AXPYTimer.Start();
      for(int s=0;s<nshift;s++){
 	int ss = s;
 	if( (!converged[s]) ) { 
 	  axpy(psi_f[ss],-bs[s]*alpha[s],ps_f[s],psi_f[ss]);
 	}
      }
      c = axpy_norm(r_d,b,mmp_d,r_d);
      AXPYTimer.Stop();
      // Convergence checks
      int all_converged = 1;
      for(int s=0;s<nshift;s++){
 	if ( (!converged[s]) ){
 	  IterationsToCompleteShift[s] = k;
 	  RealD css  = c * z[s][iz]* z[s][iz];
 	  if(css<rsqf[s]){
 	    if ( ! converged[s] )
 	      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
 	    converged[s]=1;
 	  } else {
 	    all_converged=0;
 	  }
 	}
      }
      if ( all_converged || k == MaxIterationsMshift-1){
 	SolverTimer.Stop();
 	for(int s=0;s<nshift;s++){
 	  precisionChange(psi_d[s],psi_f[s]);
 	}
 	if ( all_converged ){
 	  std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: All shifts have converged iteration "<<k<<std::endl;
 	  std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Checking solutions"<<std::endl;
 	} else {
 	  std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Not all shifts have converged iteration "<<k<<std::endl;
 	}
 	// Check answers 
 	for(int s=0; s < nshift; s++) { 
 	  Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
 	  axpy(tmp_d,mass[s],psi_d[s],mmp_d);
 	  axpy(r_d,-alpha[s],src_d,tmp_d);
 	  RealD rn = norm2(r_d);
 	  RealD cn = norm2(src_d);
 	  TrueResidualShift[s] = std::sqrt(rn/cn);
 	  std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
 	  //If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
 	  if(rn >= rsq[s]){
 	    CleanupTimer.Start();
 	    std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: performing cleanup step for shift " << s << std::endl;
 	    //Setup linear operators for final cleanup
 	    ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
 	    ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
 	    MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d); 
 	    cg(src_d, psi_d[s]);
 	    TrueResidualShift[s] = cg.TrueResidual;
 	    CleanupTimer.Stop();
 	  }
 	}
 	std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrecCleanup: Time Breakdown for body"<<std::endl;
 	std::cout << GridLogMessage << "\tSolver    " << SolverTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tMatrix    " << MatrixTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
 	IterationsToComplete = k;	
 	return;
      }
    }
    std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
    assert(0);
  }
 };
 NAMESPACE_END(Grid);
@@ -0,0 +1,416 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Christopher Kelly <ckelly@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
 #define GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
 NAMESPACE_BEGIN(Grid);
 //CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision. 
 //The residual is stored in single precision, but the search directions and solution are stored in double precision. 
 //Every update_freq iterations the residual is corrected in double precision. 
 //For safety the a final regular CG is applied to clean up if necessary
 //Linop to add shift to input linop, used in cleanup CG
 namespace ConjugateGradientMultiShiftMixedPrecSupport{
 template<typename Field>
 class ShiftedLinop: public LinearOperatorBase<Field>{
 public:
  LinearOperatorBase<Field> &linop_base;
  RealD shift;
  ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift): linop_base(_linop_base), shift(_shift){}
  void OpDiag (const Field &in, Field &out){ assert(0); }
  void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
  void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); }
  void Op     (const Field &in, Field &out){ assert(0); }
  void AdjOp  (const Field &in, Field &out){ assert(0); }
  void HermOp(const Field &in, Field &out){
    linop_base.HermOp(in, out);
    axpy(out, shift, in, out);
  }    
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    HermOp(in,out);
    ComplexD dot = innerProduct(in,out);
    n1=real(dot);
    n2=norm2(out);
  }
 };
 };
 template<class FieldD, class FieldF,
 	 typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
 	 typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
 class ConjugateGradientMultiShiftMixedPrec : public OperatorMultiFunction<FieldD>,
 					     public OperatorFunction<FieldD>
 {
 public:                                                
  using OperatorFunction<FieldD>::operator();
  RealD   Tolerance;
  Integer MaxIterationsMshift;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
  int verbose;
  MultiShiftFunction shifts;
  std::vector<RealD> TrueResidualShift;
  int ReliableUpdateFreq; //number of iterations between reliable updates
  GridBase* SinglePrecGrid; //Grid for single-precision fields
  LinearOperatorBase<FieldF> &Linop_f; //single precision
  ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
 				       GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
 				       int _ReliableUpdateFreq) : 
    MaxIterationsMshift(maxit),  shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
    MaxIterations(20000)
  { 
    verbose=1;
    IterationsToCompleteShift.resize(_shifts.order);
    TrueResidualShift.resize(_shifts.order);
  }
  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
  {
    GridBase *grid = src.Grid();
    int nshift = shifts.order;
    std::vector<FieldD> results(nshift,grid);
    (*this)(Linop,src,results,psi);
  }
  void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
  {
    int nshift = shifts.order;
    (*this)(Linop,src,results);
    psi = shifts.norm*src;
    for(int i=0;i<nshift;i++){
      psi = psi + shifts.residues[i]*results[i];
    }
    return;
  }
  void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
  { 
    GRID_TRACE("ConjugateGradientMultiShiftMixedPrec");
    GridBase *DoublePrecGrid = src_d.Grid();
    precisionChangeWorkspace pc_wk_s_to_d(DoublePrecGrid,SinglePrecGrid);
    precisionChangeWorkspace pc_wk_d_to_s(SinglePrecGrid,DoublePrecGrid);
    ////////////////////////////////////////////////////////////////////////
    // Convenience references to the info stored in "MultiShiftFunction"
    ////////////////////////////////////////////////////////////////////////
    int nshift = shifts.order;
    std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
    std::vector<RealD> &mresidual(shifts.tolerances);
    std::vector<RealD> alpha(nshift,1.0);
    //Double precision search directions
    FieldD p_d(DoublePrecGrid);
    std::vector<FieldD> ps_d(nshift, DoublePrecGrid);// Search directions (double precision)
    FieldD tmp_d(DoublePrecGrid);
    FieldD r_d(DoublePrecGrid);
    FieldD mmp_d(DoublePrecGrid);
    assert(psi_d.size()==nshift);
    assert(mass.size()==nshift);
    assert(mresidual.size()==nshift);
    // dynamic sized arrays on stack; 2d is a pain with vector
    RealD  bs[nshift];
    RealD  rsq[nshift];
    RealD  rsqf[nshift];
    RealD  z[nshift][2];
    int     converged[nshift];
    const int       primary =0;
    //Primary shift fields CG iteration
    RealD a,b,c,d;
    RealD cp,bp,qq; //prev
    // Matrix mult fields
    FieldF p_f(SinglePrecGrid);
    FieldF mmp_f(SinglePrecGrid);
    // Check lightest mass
    for(int s=0;s<nshift;s++){
      assert( mass[s]>= mass[primary] );
      converged[s]=0;
    }
    // Wire guess to zero
    // Residuals "r" are src
    // First search direction "p" is also src
    cp = norm2(src_d);
    // Handle trivial case of zero src.
    if( cp == 0. ){
      for(int s=0;s<nshift;s++){
 	psi_d[s] = Zero();
 	IterationsToCompleteShift[s] = 1;
 	TrueResidualShift[s] = 0.;
      }
      return;
    }
    for(int s=0;s<nshift;s++){
      rsq[s] = cp * mresidual[s] * mresidual[s];
      rsqf[s] =rsq[s];
      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
      ps_d[s] = src_d;
    }
    // r and p for primary
    p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
    r_d = p_d;
    //MdagM+m[0]
    precisionChange(p_f, p_d, pc_wk_d_to_s);
    Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
    precisionChange(tmp_d, mmp_f, pc_wk_s_to_d);
    Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
    tmp_d = tmp_d - mmp_d;
    std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
    assert(norm2(tmp_d)< 1.0);
    axpy(mmp_d,mass[0],p_d,mmp_d);
    RealD rn = norm2(p_d);
    d += rn*mass[0];
    b = -cp /d;
    // Set up the various shift variables
    int       iz=0;
    z[0][1-iz] = 1.0;
    z[0][iz]   = 1.0;
    bs[0]      = b;
    for(int s=1;s<nshift;s++){
      z[s][1-iz] = 1.0;
      z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0]));
      bs[s]      = b*z[s][iz]; 
    }
    // r += b[0] A.p[0]
    // c= norm(r)
    c=axpy_norm(r_d,b,mmp_d,r_d);
    for(int s=0;s<nshift;s++) {
      axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
    }
    ///////////////////////////////////////
    // Timers
    ///////////////////////////////////////
    GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
    SolverTimer.Start();
    // Iteration loop
    int k;
    for (k=1;k<=MaxIterationsMshift;k++){    
      a = c /cp;
      AXPYTimer.Start();
      axpy(p_d,a,p_d,r_d); 
      for(int s=0;s<nshift;s++){
 	if ( ! converged[s] ) { 
 	  if (s==0){
 	    axpy(ps_d[s],a,ps_d[s],r_d);
 	  } else{
 	    RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
 	    axpby(ps_d[s],z[s][iz],as,r_d,ps_d[s]);
 	  }
 	}
      }
      AXPYTimer.Stop();
      PrecChangeTimer.Start();
      precisionChange(p_f, p_d, pc_wk_d_to_s); //get back single prec search direction for linop
      PrecChangeTimer.Stop();
      cp=c;
      MatrixTimer.Start();  
      Linop_f.HermOp(p_f,mmp_f);
      MatrixTimer.Stop();  
      PrecChangeTimer.Start();
      precisionChange(mmp_d, mmp_f, pc_wk_s_to_d); // From Float to Double
      PrecChangeTimer.Stop();
      AXPYTimer.Start();
      d=real(innerProduct(p_d,mmp_d));    
      axpy(mmp_d,mass[0],p_d,mmp_d);
      AXPYTimer.Stop();
      RealD rn = norm2(p_d);
      d += rn*mass[0];
      bp=b;
      b=-cp/d;
      // Toggle the recurrence history
      bs[0] = b;
      iz = 1-iz;
      ShiftTimer.Start();
      for(int s=1;s<nshift;s++){
 	if((!converged[s])){
 	  RealD z0 = z[s][1-iz];
 	  RealD z1 = z[s][iz];
 	  z[s][iz] = z0*z1*bp
 	    / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b)); 
 	  bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
 	}
      }
      ShiftTimer.Stop();
      //Update double precision solutions
      AXPYTimer.Start();
      for(int s=0;s<nshift;s++){
 	int ss = s;
 	if( (!converged[s]) ) { 
 	  axpy(psi_d[ss],-bs[s]*alpha[s],ps_d[s],psi_d[ss]);
 	}
      }
      //Perform reliable update if necessary; otherwise update residual from single-prec mmp
      c = axpy_norm(r_d,b,mmp_d,r_d);
      AXPYTimer.Stop();
      if(k % ReliableUpdateFreq == 0){
 	RealD c_old = c;
 	//Replace r with true residual
 	MatrixTimer.Start();  
 	Linop_d.HermOp(psi_d[0],mmp_d); 
 	MatrixTimer.Stop();  
 	AXPYTimer.Start();
 	axpy(mmp_d,mass[0],psi_d[0],mmp_d);
 	c = axpy_norm(r_d, -1.0, mmp_d, src_d);
 	AXPYTimer.Stop();
 	std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_old <<" with |r|^2 = "<<c<<std::endl;
      }
      // Convergence checks
      int all_converged = 1;
      for(int s=0;s<nshift;s++){
 	if ( (!converged[s]) ){
 	  IterationsToCompleteShift[s] = k;
 	  RealD css  = c * z[s][iz]* z[s][iz];
 	  if(css<rsqf[s]){
 	    if ( ! converged[s] )
 	      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
 	    converged[s]=1;
 	  } else {
 	    all_converged=0;
 	  }
 	}
      }
      if ( all_converged || k == MaxIterationsMshift-1){
 	SolverTimer.Stop();
 	if ( all_converged ){
 	  std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
 	  std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
 	} else {
 	  std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Not all shifts have converged iteration "<<k<<std::endl;
 	}
 	// Check answers 
 	for(int s=0; s < nshift; s++) { 
 	  Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
 	  axpy(tmp_d,mass[s],psi_d[s],mmp_d);
 	  axpy(r_d,-alpha[s],src_d,tmp_d);
 	  RealD rn = norm2(r_d);
 	  RealD cn = norm2(src_d);
 	  TrueResidualShift[s] = std::sqrt(rn/cn);
 	  std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
 	  //If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
 	  if(rn >= rsq[s]){
 	    CleanupTimer.Start();
 	    std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: performing cleanup step for shift " << s << std::endl;
 	    //Setup linear operators for final cleanup
 	    ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
 	    ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
 	    MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d); 
 	    cg(src_d, psi_d[s]);
 	    TrueResidualShift[s] = cg.TrueResidual;
 	    CleanupTimer.Stop();
 	  }
 	}
 	std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrec: Time Breakdown for body"<<std::endl;
 	std::cout << GridLogMessage << "\tSolver    " << SolverTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tMatrix    " << MatrixTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed()     <<std::endl;
 	std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
 	IterationsToComplete = k;	
 	return;
      }
    }
    std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
    assert(0);
  }
 };
 NAMESPACE_END(Grid);
 #endif
@@ -48,7 +48,7 @@ public:
  LinearOperatorBase<FieldF> &Linop_f;
  LinearOperatorBase<FieldD> &Linop_d;
  GridBase* SinglePrecGrid;
-  RealD Delta; //reliable update parameter
+  RealD Delta; //reliable update parameter. A reliable update is performed when the residual drops by a factor of Delta relative to its value at the last update
  //Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
  LinearOperatorBase<FieldF> *Linop_fallback;
@@ -65,7 +65,9 @@ public:
      ErrorOnNoConverge(err_on_no_conv),
      DoFinalCleanup(true),
      Linop_fallback(NULL)
-  {};
+  {
    assert(Delta > 0. && Delta < 1. && "Expect  0 < Delta < 1");
  };
  void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
    Linop_fallback = &_Linop_fallback;
@@ -73,6 +75,7 @@ public:
  }
  void operator()(const FieldD &src, FieldD &psi) {
    GRID_TRACE("ConjugateGradientReliableUpdate");
    LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
    bool using_fallback = false;
@@ -115,9 +118,12 @@ public:
    }
    //Single prec initialization
    precisionChangeWorkspace pc_wk_sp_to_dp(src.Grid(), SinglePrecGrid);
    precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, src.Grid());
    FieldF r_f(SinglePrecGrid);
    r_f.Checkerboard() = r.Checkerboard();
-    precisionChange(r_f, r);
+    precisionChange(r_f, r, pc_wk_dp_to_sp);
    FieldF psi_f(r_f);
    psi_f = Zero();
@@ -133,6 +139,7 @@ public:
    GridStopWatch LinalgTimer;
    GridStopWatch MatrixTimer;
    GridStopWatch SolverTimer;
    GridStopWatch PrecChangeTimer;
    SolverTimer.Start();
    int k = 0;
@@ -172,7 +179,9 @@ public:
      // Stopping condition
      if (cp <= rsq) {
 	//Although not written in the paper, I assume that I have to add on the final solution
-	precisionChange(mmp, psi_f);
+	PrecChangeTimer.Start();
 	precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
 	PrecChangeTimer.Stop();
 	psi = psi + mmp;
@@ -193,6 +202,9 @@ public:
 	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
 	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
 	std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
 	std::cout << GridLogMessage << "\tPrecChange " << PrecChangeTimer.Elapsed() <<std::endl;
 	std::cout << GridLogMessage << "\tPrecChange avg time " << PrecChangeTimer.Elapsed()/(2*l+1) <<std::endl;
 	IterationsToComplete = k;	
 	ReliableUpdatesPerformed = l;
@@ -213,14 +225,21 @@ public:
      else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
 	std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
 		  << cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
-	precisionChange(mmp, psi_f);
+	PrecChangeTimer.Start();
 	precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
 	PrecChangeTimer.Stop();
 	psi = psi + mmp;
 	MatrixTimer.Start();
 	Linop_d.HermOpAndNorm(psi, mmp, d, qq);
 	MatrixTimer.Stop();
 	r = src - mmp;
 	psi_f = Zero();
-	precisionChange(r_f, r);
+	PrecChangeTimer.Start();
 	precisionChange(r_f, r, pc_wk_dp_to_sp);
 	PrecChangeTimer.Stop();
 	cp = norm2(r);
 	MaxResidSinceLastRelUp = cp;
@@ -33,16 +33,19 @@ namespace Grid {
 template<class Field>
 class ZeroGuesser: public LinearFunction<Field> {
 public:
  using LinearFunction<Field>::operator();
    virtual void operator()(const Field &src, Field &guess) { guess = Zero(); };
 };
 template<class Field>
 class DoNothingGuesser: public LinearFunction<Field> {
 public:
  using LinearFunction<Field>::operator();
  virtual void operator()(const Field &src, Field &guess) {  };
 };
 template<class Field>
 class SourceGuesser: public LinearFunction<Field> {
 public:
  using LinearFunction<Field>::operator();
  virtual void operator()(const Field &src, Field &guess) { guess = src; };
 };
@@ -57,6 +60,7 @@ private:
  const unsigned int       N;
 public:
  using LinearFunction<Field>::operator();
  DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval)
  : DeflatedGuesser(_evec, _eval, _evec.size())
@@ -87,6 +91,7 @@ private:
  const std::vector<RealD>       &eval_coarse;
 public:
  using LinearFunction<FineField>::operator();
  LocalCoherenceDeflatedGuesser(const std::vector<FineField>   &_subspace,
 				const std::vector<CoarseField> &_evec_coarse,
 				const std::vector<RealD>       &_eval_coarse)
@@ -108,7 +113,43 @@ public:
    blockPromote(guess_coarse,guess,subspace);
    guess.Checkerboard() = src.Checkerboard();
  };
-};
+
  void operator()(const std::vector<FineField> &src,std::vector<FineField> &guess) {
    int Nevec = (int)evec_coarse.size();
    int Nsrc = (int)src.size();
    // make temp variables
    std::vector<CoarseField> src_coarse(Nsrc,evec_coarse[0].Grid());
    std::vector<CoarseField> guess_coarse(Nsrc,evec_coarse[0].Grid());    
    //Preporcessing
    std::cout << GridLogMessage << "Start BlockProject for loop" << std::endl;
    for (int j=0;j<Nsrc;j++)
    {
    guess_coarse[j] = Zero();
    std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
    blockProject(src_coarse[j],src[j],subspace);
    }
    //deflation set up for eigen vector batchsize 1 and source batch size equal number of sources
    std::cout << GridLogMessage << "Start ProjectAccum for loop" << std::endl;
    for (int i=0;i<Nevec;i++)
    {
      std::cout << GridLogMessage << "ProjectAccum Nvec: " << i << std::endl;
      const CoarseField & tmp = evec_coarse[i];
      for (int j=0;j<Nsrc;j++)
      {
        axpy(guess_coarse[j],TensorRemove(innerProduct(tmp,src_coarse[j])) / eval_coarse[i],tmp,guess_coarse[j]);
      }
    }
    //postprocessing
    std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl;
    for (int j=0;j<Nsrc;j++)
    {
    std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
    blockPromote(guess_coarse[j],guess[j],subspace);
    guess[j].Checkerboard() = src[j].Checkerboard();
    }
  };
  };
@@ -44,6 +44,7 @@ public:
 				  int, MinRes);    // Must restart
 };
 //This class is the input parameter class for some testing programs
 struct LocalCoherenceLanczosParams : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
@@ -67,6 +68,7 @@ public:
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
  using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
@@ -97,6 +99,7 @@ public:
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
  using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
@@ -144,15 +147,23 @@ public:
  RealD                             _coarse_relax_tol;
  std::vector<FineField>        &_subspace;
  int _largestEvalIdxForReport; //The convergence of the LCL is based on the evals of the coarse grid operator, not those of the underlying fine grid operator
                                //As a result we do not know what the eval range of the fine operator is until the very end, making tuning the Cheby bounds very difficult
                                //To work around this issue, every restart we separately reconstruct the fine operator eval for the lowest and highest evec and print these
                                //out alongside the evals of the coarse operator. To do so we need to know the index of the largest eval (i.e. Nstop-1)
                                //NOTE: If largestEvalIdxForReport=-1 (default) then this is not performed
  ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField>   &Poly,
 					   OperatorFunction<FineField>   &smoother,
 					   LinearOperatorBase<FineField> &Linop,
 					   std::vector<FineField>        &subspace,
-					   RealD coarse_relax_tol=5.0e3) 
+					   RealD coarse_relax_tol=5.0e3,
 					   int largestEvalIdxForReport=-1) 
    : _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace),
-      _coarse_relax_tol(coarse_relax_tol)  
+      _coarse_relax_tol(coarse_relax_tol), _largestEvalIdxForReport(largestEvalIdxForReport)
  {    };
  //evalMaxApprox: approximation of largest eval of the fine Chebyshev operator (suitably wrapped by block projection)
  int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  {
    CoarseField v(B);
@@ -175,12 +186,26 @@ public:
 	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
 	     <<std::endl;
    if(_largestEvalIdxForReport != -1 && (j==0 || j==_largestEvalIdxForReport)){
      std::cout<<GridLogIRL << "Estimating true eval of fine grid operator for eval idx " << j << std::endl;
      RealD tmp_eval;
      ReconstructEval(j,eresid,B,tmp_eval,1.0); //don't use evalMaxApprox of coarse operator! (cf below)
    }
    int conv=0;
    if( (vv<eresid*eresid) ) conv = 1;
    return conv;
  }
  //This function is called at the end of the coarse grid Lanczos. It promotes the coarse eigenvector 'B' to the fine grid,
  //applies a smoother to the result then computes the computes the *fine grid* eigenvalue (output as 'eval').
  //evalMaxApprox should be the approximation of the largest eval of the fine Hermop. However when this function is called by IRL it actually passes the largest eval of the *Chebyshev* operator (as this is the max approx used for the TestConvergence above)
  //As the largest eval of the Chebyshev is typically several orders of magnitude larger this makes the convergence test pass even when it should not.
  //We therefore ignore evalMaxApprox here and use a value of 1.0 (note this value is already used by TestCoarse)
  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)  
  {
    evalMaxApprox = 1.0; //cf above
    GridBase *FineGrid = _subspace[0].Grid();    
    int checkerboard   = _subspace[0].Checkerboard();
    FineField fB(FineGrid);fB.Checkerboard() =checkerboard;
@@ -199,13 +224,13 @@ public:
    eval   = vnum/vden;
    fv -= eval*fB;
    RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);
    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
    std::cout.precision(13);
    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
 	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
-	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
+	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv << " target " << eresid*eresid
 	     <<std::endl;
    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
    if( (vv<eresid*eresid) ) return 1;
    return 0;
  }
@@ -283,6 +308,10 @@ public:
    evals_coarse.resize(0);
  };
  //The block inner product is the inner product on the fine grid locally summed over the blocks
  //to give a Lattice<Scalar> on the coarse grid. This function orthnormalizes the fine-grid subspace
  //vectors under the block inner product. This step must be performed after computing the fine grid
  //eigenvectors and before computing the coarse grid eigenvectors.    
  void Orthogonalise(void ) {
    CoarseScalar InnerProd(_CoarseGrid);
    std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
@@ -326,6 +355,8 @@ public:
    }
  }
  //While this method serves to check the coarse eigenvectors, it also recomputes the eigenvalues from the smoothed reconstructed eigenvectors
  //hence the smoother can be tuned after running the coarse Lanczos by using a different smoother here
  void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) 
  {
    assert(evals_fine.size() == nbasis);
@@ -374,25 +405,31 @@ public:
    evals_fine.resize(nbasis);
    subspace.resize(nbasis,_FineGrid);
  }
  //cheby_op: Parameters of the fine grid Chebyshev polynomial used for the Lanczos acceleration
  //cheby_smooth: Parameters of a separate Chebyshev polynomial used after the Lanczos has completed to smooth out high frequency noise in the reconstructed fine grid eigenvectors prior to computing the eigenvalue
  //relax: Reconstructed eigenvectors (post smoothing) are naturally not as precise as true eigenvectors. This factor acts as a multiplier on the stopping condition when determining whether the results satisfy the user provided stopping condition
  void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
 		  int Nstop, int Nk, int Nm,RealD resid, 
 		  RealD MaxIt, RealD betastp, int MinRes)
  {
-    Chebyshev<FineField>                          Cheby(cheby_op);
+    Chebyshev<FineField>                          Cheby(cheby_op); //Chebyshev of fine operator on fine grid
-    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace);
+    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace); //Fine operator on coarse grid with intermediate fine grid conversion
-    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace); //Chebyshev of fine operator on coarse grid with intermediate fine grid conversion
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
    //////////////////////////////////////////////////////////////////////////////////////////////////
-    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth);
+    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth); //lower order Chebyshev of fine operator on fine grid used to smooth regenerated eigenvectors
-    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax,Nstop-1); 
    evals_coarse.resize(Nm);
    evec_coarse.resize(Nm,_CoarseGrid);
    CoarseField src(_CoarseGrid);     src=1.0; 
    //Note the "tester" here is also responsible for generating the fine grid eigenvalues which are output into the "evals_coarse" array
    ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
    int Nconv=0;
    IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
@@ -403,6 +440,14 @@ public:
      std::cout << i << " Coarse eval = " << evals_coarse[i]  << std::endl;
    }
  }
  //Get the fine eigenvector 'i' by reconstruction
  void getFineEvecEval(FineField &evec, RealD &eval, const int i) const{
    blockPromote(evec_coarse[i],evec,subspace);  
    eval = evals_coarse[i];
  }
 };
 NAMESPACE_END(Grid);
@@ -30,6 +30,8 @@ template<class Field> class PowerMethod
      RealD vden = norm2(src_n); 
      RealD na = vnum/vden; 
      std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
      if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) { 
 	evalMaxApprox = na; 
 	std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
@@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
 template<class Field>
 class PrecGeneralisedConjugateResidual : public LinearFunction<Field> {
 public:                                                
-
+  using LinearFunction<Field>::operator();
  RealD   Tolerance;
  Integer MaxIterations;
  int verbose;
@@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
 template<class Field>
 class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
 public:                                                
-
+  using LinearFunction<Field>::operator();
  RealD   Tolerance;
  Integer MaxIterations;
  int verbose;
@@ -119,7 +119,8 @@ public:
  RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
    RealD cp;
-    ComplexD a, b, zAz;
+    ComplexD a, b;
    //    ComplexD zAz;
    RealD zAAz;
    ComplexD rq;
@@ -146,7 +147,7 @@ public:
    //////////////////////////////////
    MatTimer.Start();
    Linop.Op(psi,Az);
-    zAz = innerProduct(Az,psi);
+    //    zAz = innerProduct(Az,psi);
    zAAz= norm2(Az);
    MatTimer.Stop();
@@ -170,7 +171,7 @@ public:
    LinalgTimer.Start();
-    zAz = innerProduct(Az,psi);
+    //    zAz = innerProduct(Az,psi);
    zAAz= norm2(Az);
    //p[0],q[0],qq[0] 
@@ -212,7 +213,7 @@ public:
      MatTimer.Start();
      Linop.Op(z,Az);
      MatTimer.Stop();
-      zAz = innerProduct(Az,psi);
+      //      zAz = innerProduct(Az,psi);
      zAAz= norm2(Az);
      LinalgTimer.Start();
@@ -4,11 +4,14 @@ NAMESPACE_BEGIN(Grid);
 /*Allocation types, saying which pointer cache should be used*/
 #define Cpu      (0)
-#define CpuSmall (1)
+#define CpuHuge  (1)
-#define Acc      (2)
+#define CpuSmall (2)
-#define AccSmall (3)
+#define Acc      (3)
-#define Shared   (4)
+#define AccHuge  (4)
-#define SharedSmall (5)
+#define AccSmall (5)
 #define Shared   (6)
 #define SharedHuge  (7)
 #define SharedSmall (8)
 #undef GRID_MM_VERBOSE 
 uint64_t total_shared;
 uint64_t total_device;
@@ -35,12 +38,15 @@ void MemoryManager::PrintBytes(void)
 }
 uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
 uint64_t MemoryManager::HostCacheBytes()   { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; }
 //////////////////////////////////////////////////////////////////////
 // Data tables for recently freed pooiniter caches
 //////////////////////////////////////////////////////////////////////
 MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
 int MemoryManager::Victim[MemoryManager::NallocType];
-int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 };
+int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 };
 uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
 //////////////////////////////////////////////////////////////////////
 // Actual allocation and deallocation utils
@@ -159,7 +165,6 @@ void MemoryManager::Init(void)
  char * str;
  int Nc;
  int NcS;
  str= getenv("GRID_ALLOC_NCACHE_LARGE");
  if ( str ) {
@@ -171,6 +176,16 @@ void MemoryManager::Init(void)
    }
  }
  str= getenv("GRID_ALLOC_NCACHE_HUGE");
  if ( str ) {
    Nc = atoi(str);
    if ( (Nc>=0) && (Nc < NallocCacheMax)) {
      Ncache[CpuHuge]=Nc;
      Ncache[AccHuge]=Nc;
      Ncache[SharedHuge]=Nc;
    }
  }
  str= getenv("GRID_ALLOC_NCACHE_SMALL");
  if ( str ) {
    Nc = atoi(str);
@@ -191,7 +206,9 @@ void MemoryManager::InitMessage(void) {
  std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
 #ifdef ALLOCATION_CACHE
-  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
+  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host   allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<" HUGE "<<Ncache[CpuHuge]<<std::endl;
  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<" Huge "<<Ncache[AccHuge]<<std::endl;
  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<" Huge "<<Ncache[SharedHuge]<<std::endl;
 #endif
 #ifdef GRID_UVM
@@ -223,8 +240,11 @@ void MemoryManager::InitMessage(void) {
 void *MemoryManager::Insert(void *ptr,size_t bytes,int type) 
 {
 #ifdef ALLOCATION_CACHE
-  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
+  int cache;
-  int cache = type + small;
+  if      (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
  else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
  else                                     cache = type;
  return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);  
 #else
  return ptr;
@@ -233,11 +253,12 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
 void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes) 
 {
  assert(ncache>0);
 #ifdef GRID_OMP
  assert(omp_in_parallel()==0);
 #endif 
  if (ncache == 0) return ptr;
  void * ret = NULL;
  int v = -1;
@@ -272,8 +293,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
 void *MemoryManager::Lookup(size_t bytes,int type)
 {
 #ifdef ALLOCATION_CACHE
-  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
+  int cache;
-  int cache = type+small;
+  if      (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
  else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
  else                                     cache = type;
  return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
 #else
  return NULL;
@@ -282,7 +306,6 @@ void *MemoryManager::Lookup(size_t bytes,int type)
 void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) 
 {
  assert(ncache>0);
 #ifdef GRID_OMP
  assert(omp_in_parallel()==0);
 #endif 
@@ -35,6 +35,12 @@ NAMESPACE_BEGIN(Grid);
 // Move control to configure.ac and Config.h?
 #define GRID_ALLOC_SMALL_LIMIT (4096)
 #define GRID_ALLOC_HUGE_LIMIT  (2147483648)
 #define STRINGIFY(x) #x
 #define TOSTRING(x) STRINGIFY(x)
 #define FILE_LINE __FILE__ ":" TOSTRING(__LINE__)
 #define AUDIT(a) MemoryManager::Audit(FILE_LINE)
 /*Pinning pages is costly*/
 ////////////////////////////////////////////////////////////////////////////
@@ -65,6 +71,21 @@ enum ViewMode {
  CpuWriteDiscard = 0x10 // same for now
 };
 struct MemoryStatus {
  uint64_t     DeviceBytes;
  uint64_t     DeviceLRUBytes;
  uint64_t     DeviceMaxBytes;
  uint64_t     HostToDeviceBytes;
  uint64_t     DeviceToHostBytes;
  uint64_t     HostToDeviceXfer;
  uint64_t     DeviceToHostXfer;
  uint64_t     DeviceEvictions;
  uint64_t     DeviceDestroy;
  uint64_t     DeviceAllocCacheBytes;
  uint64_t     HostAllocCacheBytes;
 };
 class MemoryManager {
 private:
@@ -78,7 +99,7 @@ private:
  } AllocationCacheEntry;
  static const int NallocCacheMax=128; 
-  static const int NallocType=6;
+  static const int NallocType=9;
  static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
  static int Victim[NallocType];
  static int Ncache[NallocType];
@@ -92,8 +113,9 @@ private:
  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;
  static void PrintBytes(void);
 public:
  static void PrintBytes(void);
  static void Audit(std::string s);
  static void Init(void);
  static void InitMessage(void);
  static void *AcceleratorAllocate(size_t bytes);
@@ -113,6 +135,27 @@ private:
  static uint64_t     DeviceToHostBytes;
  static uint64_t     HostToDeviceXfer;
  static uint64_t     DeviceToHostXfer;
  static uint64_t     DeviceEvictions;
  static uint64_t     DeviceDestroy;
  static uint64_t     DeviceCacheBytes();
  static uint64_t     HostCacheBytes();
  static MemoryStatus GetFootprint(void) {
    MemoryStatus stat;
    stat.DeviceBytes       = DeviceBytes;
    stat.DeviceLRUBytes    = DeviceLRUBytes;
    stat.DeviceMaxBytes    = DeviceMaxBytes;
    stat.HostToDeviceBytes = HostToDeviceBytes;
    stat.DeviceToHostBytes = DeviceToHostBytes;
    stat.HostToDeviceXfer  = HostToDeviceXfer;
    stat.DeviceToHostXfer  = DeviceToHostXfer;
    stat.DeviceEvictions   = DeviceEvictions;
    stat.DeviceDestroy     = DeviceDestroy;
    stat.DeviceAllocCacheBytes = DeviceCacheBytes();
    stat.HostAllocCacheBytes   = HostCacheBytes();
    return stat;
  };
 private:
 #ifndef GRID_UVM
@@ -170,6 +213,8 @@ private:
 public:
  static void Print(void);
  static void PrintAll(void);
  static void PrintState( void* CpuPtr);
  static int   isOpen   (void* CpuPtr);
  static void  ViewClose(void* CpuPtr,ViewMode mode);
  static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
@@ -3,8 +3,13 @@
 #warning "Using explicit device memory copies"
 NAMESPACE_BEGIN(Grid);
-//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
+
-#define dprintf(...)
+#define MAXLINE 512
 static char print_buffer [ MAXLINE ];
 #define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
 #define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
 //#define dprintf(...) 
 ////////////////////////////////////////////////////////////
@@ -23,6 +28,8 @@ uint64_t  MemoryManager::HostToDeviceBytes;
 uint64_t  MemoryManager::DeviceToHostBytes;
 uint64_t  MemoryManager::HostToDeviceXfer;
 uint64_t  MemoryManager::DeviceToHostXfer;
 uint64_t  MemoryManager::DeviceEvictions;
 uint64_t  MemoryManager::DeviceDestroy;
 ////////////////////////////////////
 // Priority ordering for unlocked entries
@@ -104,15 +111,17 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
-   dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
+  mprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
  if(AccCache.AccPtr) {
    AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
    DeviceDestroy++;
    DeviceBytes   -=AccCache.bytes;
    LRUremove(AccCache);
-    dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
+    AccCache.AccPtr=(uint64_t) NULL;
    dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
  }
  uint64_t CpuPtr = AccCache.CpuPtr;
  EntryErase(CpuPtr);
@@ -121,26 +130,36 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
 void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
 {
  ///////////////////////////////////////////////////////////////////////////
-  // Make CPU consistent, remove from Accelerator, remove entry
+  // Make CPU consistent, remove from Accelerator, remove from LRU, LEAVE CPU only entry
-  // Cannot be locked. If allocated must be in LRU pool.
+  // Cannot be acclocked. If allocated must be in LRU pool.
  //
  // Nov 2022... Felix issue: Allocating two CpuPtrs, can have an entry in LRU-q with CPUlock.
  //                          and require to evict the AccPtr copy. Eviction was a mistake in CpuViewOpen
  //                          but there is a weakness where CpuLock entries are attempted for erase
  //                          Take these OUT LRU queue when CPU locked?
  //                          Cannot take out the table as cpuLock data is important.
  ///////////////////////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
-  dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
+  mprintf("MemoryManager: Evict cpu %lx acc %lx cpuLock %ld accLock %ld\n",
-  assert(AccCache.accLock==0);
+	  (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
-  assert(AccCache.cpuLock==0);
+	  (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock); 
  if (AccCache.accLock!=0) return;
  if (AccCache.cpuLock!=0) return;
  if(AccCache.state==AccDirty) {
    Flush(AccCache);
  }
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
  if(AccCache.AccPtr) {
    AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
    DeviceBytes   -=AccCache.bytes;
    LRUremove(AccCache);
-    dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
+    AccCache.AccPtr=(uint64_t)NULL;
    AccCache.state=CpuDirty; // CPU primary now
    DeviceBytes   -=AccCache.bytes;
    dprintf("MemoryManager: Free(%lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
  }
-  uint64_t CpuPtr = AccCache.CpuPtr;
+  //  uint64_t CpuPtr = AccCache.CpuPtr;
-  EntryErase(CpuPtr);
+  DeviceEvictions++;
  //  EntryErase(CpuPtr);
 }
 void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
 {
@@ -150,7 +169,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
  assert(AccCache.AccPtr!=(uint64_t)NULL);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
  acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
-  dprintf("MemoryManager: Flush  %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: Flush  %lx -> %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  DeviceToHostBytes+=AccCache.bytes;
  DeviceToHostXfer++;
  AccCache.state=Consistent;
@@ -165,7 +184,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
    AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
    DeviceBytes+=AccCache.bytes;
  }
-  dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: Clone %lx <- %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
  HostToDeviceBytes+=AccCache.bytes;
  HostToDeviceXfer++;
@@ -191,6 +210,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
 void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
 {
  if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
    dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr);
    AcceleratorViewClose((uint64_t)Ptr);
  } else if( (mode==CpuRead)||(mode==CpuWrite)){
    CpuViewClose((uint64_t)Ptr);
@@ -202,6 +222,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 {
  uint64_t CpuPtr = (uint64_t)_CpuPtr;
  if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
    dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
    return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
  } else if( (mode==CpuRead)||(mode==CpuWrite)){
    return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
@@ -212,13 +233,16 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 }
 void  MemoryManager::EvictVictims(uint64_t bytes)
 {
  assert(bytes<DeviceMaxBytes);
  while(bytes+DeviceLRUBytes > DeviceMaxBytes){
    if ( DeviceLRUBytes > 0){
      assert(LRU.size()>0);
-      uint64_t victim = LRU.back();
+      uint64_t victim = LRU.back(); // From the LRU
      auto AccCacheIterator = EntryLookup(victim);
      auto & AccCache = AccCacheIterator->second;
      Evict(AccCache);
    } else {
      return;
    }
  }
 }
@@ -241,11 +265,12 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
  assert(AccCache.cpuLock==0);  // Programming error
  if(AccCache.state!=Empty) {
-    dprintf("ViewOpen found entry %llx %llx : %lld %lld\n",
+    dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n",
 		    (uint64_t)AccCache.CpuPtr,
 		    (uint64_t)CpuPtr,
 		    (uint64_t)AccCache.bytes,
-		    (uint64_t)bytes);
+	            (uint64_t)bytes,
 		    (uint64_t)AccCache.accLock);
    assert(AccCache.CpuPtr == CpuPtr);
    assert(AccCache.bytes  ==bytes);
  }
@@ -280,6 +305,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
      AccCache.state  = Consistent; // Empty + AccRead => Consistent
    }
    AccCache.accLock= 1;
    dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock);
  } else if(AccCache.state==CpuDirty ){
    if(mode==AcceleratorWriteDiscard) {
      CpuDiscard(AccCache);
@@ -292,28 +318,30 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
      AccCache.state  = Consistent; // CpuDirty + AccRead => Consistent
    }
    AccCache.accLock++;
-    dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
+    dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock);
  } else if(AccCache.state==Consistent) {
    if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
      AccCache.state  = AccDirty;   // Consistent + AcceleratorWrite=> AccDirty
    else
      AccCache.state  = Consistent; // Consistent + AccRead => Consistent
    AccCache.accLock++;
-    dprintf("Consistent entry into device accLock %d\n",AccCache.accLock);
+    dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock);
  } else if(AccCache.state==AccDirty) {
    if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
      AccCache.state  = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
    else
      AccCache.state  = AccDirty; // AccDirty + AccRead => AccDirty
    AccCache.accLock++;
-    dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock);
+    dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock);
  } else {
    assert(0);
  }
-  // If view is opened on device remove from LRU
+  assert(AccCache.accLock>0);
  // If view is opened on device must remove from LRU
  if(AccCache.LRU_valid==1){
    // must possibly remove from LRU as now locked on GPU
    dprintf("AccCache entry removed from LRU \n");
    LRUremove(AccCache);
  }
@@ -334,10 +362,12 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
  assert(AccCache.accLock>0);
  AccCache.accLock--;
  // Move to LRU queue if not locked and close on device
  if(AccCache.accLock==0) {
    dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
    LRUinsert(AccCache);
  } else {
    dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
  }
 }
 void MemoryManager::CpuViewClose(uint64_t CpuPtr)
@@ -374,9 +404,10 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
  auto AccCacheIterator = EntryLookup(CpuPtr);
  auto & AccCache = AccCacheIterator->second;
-  if (!AccCache.AccPtr) {
+  // CPU doesn't need to free space
-     EvictVictims(bytes);
+  //  if (!AccCache.AccPtr) {
-  }
+  //    EvictVictims(bytes);
  //  }
  assert((mode==CpuRead)||(mode==CpuWrite));
  assert(AccCache.accLock==0);  // Programming error
@@ -430,20 +461,28 @@ void  MemoryManager::NotifyDeletion(void *_ptr)
 void  MemoryManager::Print(void)
 {
  PrintBytes();
-  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
+  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
-  std::cout << GridLogDebug << "Memory Manager                             " << std::endl;
+  std::cout << GridLogMessage << "Memory Manager                             " << std::endl;
-  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
+  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
-  std::cout << GridLogDebug << DeviceBytes   << " bytes allocated on device " << std::endl;
+  std::cout << GridLogMessage << DeviceBytes   << " bytes allocated on device " << std::endl;
-  std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
+  std::cout << GridLogMessage << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
-  std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device       " << std::endl;
+  std::cout << GridLogMessage << DeviceMaxBytes<< " bytes max on device       " << std::endl;
-  std::cout << GridLogDebug << HostToDeviceXfer << " transfers        to   device " << std::endl;
+  std::cout << GridLogMessage << HostToDeviceXfer << " transfers        to   device " << std::endl;
-  std::cout << GridLogDebug << DeviceToHostXfer << " transfers        from device " << std::endl;
+  std::cout << GridLogMessage << DeviceToHostXfer << " transfers        from device " << std::endl;
-  std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to   device " << std::endl;
+  std::cout << GridLogMessage << HostToDeviceBytes<< " bytes transfered to   device " << std::endl;
-  std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
+  std::cout << GridLogMessage << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
-  std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
+  std::cout << GridLogMessage << DeviceEvictions  << " Evictions from device " << std::endl;
-  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
+  std::cout << GridLogMessage << DeviceDestroy    << " Destroyed vectors on device " << std::endl;
-  std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
+  std::cout << GridLogMessage << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
-  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
+  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
 }
 void  MemoryManager::PrintAll(void)
 {
  Print();
  std::cout << GridLogMessage << std::endl;
  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
  std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
  for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
    auto &AccCache = it->second;
@@ -453,13 +492,13 @@ void  MemoryManager::Print(void)
    if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
    if ( AccCache.state==Consistent)str = std::string("Consistent");
-    std::cout << GridLogDebug << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
+    std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
 	      << "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
 	      << "\t" << AccCache.cpuLock
 	      << "\t" << AccCache.accLock
 	      << "\t" << AccCache.LRU_valid<<std::endl;
  }
-  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
+  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
 };
 int   MemoryManager::isOpen   (void* _CpuPtr) 
@@ -473,6 +512,89 @@ int   MemoryManager::isOpen   (void* _CpuPtr)
    return 0;
  }
 }
 void MemoryManager::Audit(std::string s)
 {
  uint64_t CpuBytes=0;
  uint64_t AccBytes=0;
  uint64_t LruBytes1=0;
  uint64_t LruBytes2=0;
  uint64_t LruCnt=0;
  std::cout << " Memory Manager::Audit() from "<<s<<std::endl;
  for(auto it=LRU.begin();it!=LRU.end();it++){
    uint64_t cpuPtr = *it;
    assert(EntryPresent(cpuPtr));
    auto AccCacheIterator = EntryLookup(cpuPtr);
    auto & AccCache = AccCacheIterator->second;
    LruBytes2+=AccCache.bytes;
    assert(AccCache.LRU_valid==1);
    assert(AccCache.LRU_entry==it);
  }
  std::cout << " Memory Manager::Audit() LRU queue matches table entries "<<std::endl;
  for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
    auto &AccCache = it->second;
    std::string str;
    if ( AccCache.state==Empty    ) str = std::string("Empty");
    if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
    if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
    if ( AccCache.state==Consistent)str = std::string("Consistent");
    CpuBytes+=AccCache.bytes;
    if( AccCache.AccPtr )    AccBytes+=AccCache.bytes;
    if( AccCache.LRU_valid ) LruBytes1+=AccCache.bytes;
    if( AccCache.LRU_valid ) LruCnt++;
    if ( AccCache.cpuLock || AccCache.accLock ) {
      assert(AccCache.LRU_valid==0);
      std::cout << GridLogError << s<< "\n\t 0x"<<std::hex<<AccCache.CpuPtr<<std::dec
 		<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
 		<< "\t cpuLock  " << AccCache.cpuLock
 		<< "\t accLock  " << AccCache.accLock
 		<< "\t LRUvalid " << AccCache.LRU_valid<<std::endl;
    }
    assert( AccCache.cpuLock== 0 ) ;
    assert( AccCache.accLock== 0 ) ;
  }
  std::cout << " Memory Manager::Audit() no locked table entries "<<std::endl;
  assert(LruBytes1==LruBytes2);
  assert(LruBytes1==DeviceLRUBytes);
  std::cout << " Memory Manager::Audit() evictable bytes matches sum over table "<<std::endl;
  assert(AccBytes==DeviceBytes);
  std::cout << " Memory Manager::Audit() device bytes matches sum over table "<<std::endl;
  assert(LruCnt == LRU.size());
  std::cout << " Memory Manager::Audit() LRU entry count matches "<<std::endl;
 }
 void MemoryManager::PrintState(void* _CpuPtr)
 {
  uint64_t CpuPtr = (uint64_t)_CpuPtr;
  if ( EntryPresent(CpuPtr) ){
    auto AccCacheIterator = EntryLookup(CpuPtr);
    auto & AccCache = AccCacheIterator->second;
    std::string str;
    if ( AccCache.state==Empty    ) str = std::string("Empty");
    if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
    if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
    if ( AccCache.state==Consistent)str = std::string("Consistent");
    if ( AccCache.state==EvictNext) str = std::string("EvictNext");
    std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
    std::cout << GridLogMessage << "\tx"<<std::hex<<AccCache.CpuPtr<<std::dec
    << "\tx"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
    << "\t" << AccCache.cpuLock
    << "\t" << AccCache.accLock
    << "\t" << AccCache.LRU_valid<<std::endl;
  } else {
    std::cout << GridLogMessage << "No Entry in AccCache table." << std::endl; 
  }
 }
 NAMESPACE_END(Grid);
@@ -12,11 +12,19 @@ uint64_t  MemoryManager::HostToDeviceBytes;
 uint64_t  MemoryManager::DeviceToHostBytes;
 uint64_t  MemoryManager::HostToDeviceXfer;
 uint64_t  MemoryManager::DeviceToHostXfer;
 uint64_t  MemoryManager::DeviceEvictions;
 uint64_t  MemoryManager::DeviceDestroy;
 void  MemoryManager::Audit(std::string s){};
 void  MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
 void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
 int   MemoryManager::isOpen   (void* CpuPtr) { return 0;}
 void  MemoryManager::PrintState(void* CpuPtr)
 {
 std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl;
 };
 void  MemoryManager::Print(void){};
 void  MemoryManager::PrintAll(void){};
 void  MemoryManager::NotifyDeletion(void *ptr){};
 NAMESPACE_END(Grid);
@@ -53,10 +53,11 @@ public:
  // Communicator should know nothing of the physics grid, only processor grid.
  ////////////////////////////////////////////
  int              _Nprocessors;     // How many in all
  Coordinate _processors;      // Which dimensions get relayed out over processors lanes.
  int              _processor;       // linear processor rank
  Coordinate _processor_coor;  // linear processor coordinate
  unsigned long    _ndimension;
  Coordinate _shm_processors;  // Which dimensions get relayed out over processors lanes.
  Coordinate _processors;      // Which dimensions get relayed out over processors lanes.
  Coordinate _processor_coor;  // linear processor coordinate
  static Grid_MPI_Comm      communicator_world;
  Grid_MPI_Comm             communicator;
  std::vector<Grid_MPI_Comm> communicator_halo;
@@ -97,6 +98,7 @@ public:
  int                      BossRank(void)          ;
  int                      ThisRank(void)          ;
  const Coordinate & ThisProcessorCoor(void) ;
  const Coordinate & ShmGrid(void)  { return _shm_processors; }  ;
  const Coordinate & ProcessorGrid(void)     ;
  int                ProcessorCount(void)    ;
@@ -105,6 +107,7 @@ public:
  ////////////////////////////////////////////////////////////////////////////////
  static int  RankWorld(void) ;
  static void BroadcastWorld(int root,void* data, int bytes);
  static void BarrierWorld(void);
  ////////////////////////////////////////////////////////////
  // Reduction
@@ -128,7 +131,7 @@ public:
  template<class obj> void GlobalSum(obj &o){
    typedef typename obj::scalar_type scalar_type;
    int words = sizeof(obj)/sizeof(scalar_type);
-    scalar_type * ptr = (scalar_type *)& o;
+    scalar_type * ptr = (scalar_type *)& o; // Safe alias 
    GlobalSumVector(ptr,words);
  }
@@ -142,17 +145,17 @@ public:
 		      int bytes);
  double StencilSendToRecvFrom(void *xmit,
-			       int xmit_to_rank,
+			       int xmit_to_rank,int do_xmit,
 			       void *recv,
-			       int recv_from_rank,
+			       int recv_from_rank,int do_recv,
 			       int bytes,int dir);
  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 				    void *xmit,
-				    int xmit_to_rank,
+				    int xmit_to_rank,int do_xmit,
 				    void *recv,
-				    int recv_from_rank,
+				    int recv_from_rank,int do_recv,
-				    int bytes,int dir);
+				    int xbytes,int rbytes,int dir);
  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
@@ -106,7 +106,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
  // Remap using the shared memory optimising routine
  // The remap creates a comm which must be freed
  ////////////////////////////////////////////////////
-  GlobalSharedMemory::OptimalCommunicator    (processors,optimal_comm);
+  GlobalSharedMemory::OptimalCommunicator    (processors,optimal_comm,_shm_processors);
  InitFromMPICommunicator(processors,optimal_comm);
  SetCommunicator(optimal_comm);
  ///////////////////////////////////////////////////
@@ -124,12 +124,13 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
  int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
  Coordinate parent_processor_coor(_ndimension,0);
  Coordinate parent_processors    (_ndimension,1);
-
+  Coordinate shm_processors       (_ndimension,1);
  // Can make 5d grid from 4d etc...
  int pad = _ndimension-parent_ndimension;
  for(int d=0;d<parent_ndimension;d++){
    parent_processor_coor[pad+d]=parent._processor_coor[d];
    parent_processors    [pad+d]=parent._processors[d];
    shm_processors       [pad+d]=parent._shm_processors[d];
  }
  //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -154,6 +155,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
    ccoor[d] = parent_processor_coor[d] % processors[d];
    scoor[d] = parent_processor_coor[d] / processors[d];
    ssize[d] = parent_processors[d]     / processors[d];
    if ( processors[d] < shm_processors[d] ) shm_processors[d] = processors[d]; // subnode splitting.
  }
  // rank within subcomm ; srank is rank of subcomm within blocks of subcomms
@@ -335,23 +337,23 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 }
 // Basic Halo comms primitive
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
-						     int dest,
+						     int dest, int dox,
 						     void *recv,
-						     int from,
+						     int from, int dor,
 						     int bytes,int dir)
 {
  std::vector<CommsRequest_t> list;
-  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
+  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
  StencilSendToRecvFromComplete(list,dir);
  return offbytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
-							 int dest,
+							 int dest,int dox,
 							 void *recv,
-							 int from,
+							 int from,int dor,
-							 int bytes,int dir)
+							 int xbytes,int rbytes,int dir)
 {
  int ncomm  =communicator_halo.size();
  int commdir=dir%ncomm;
@@ -370,30 +372,28 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  double off_node_bytes=0.0;
  int tag;
  if ( dor ) {
    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
      tag= dir+from*32;
-    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+      ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
      assert(ierr==0);
      list.push_back(rrq);
-    off_node_bytes+=bytes;
+      off_node_bytes+=rbytes;
    }
  }
  if (dox) {
    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
      tag= dir+_processor*32;
-    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+      ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
      assert(ierr==0);
      list.push_back(xrq);
-    off_node_bytes+=bytes;
+      off_node_bytes+=xbytes;
    } else {
    // TODO : make a OMP loop on CPU, call threaded bcopy
      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
      assert(shm!=NULL);
-    acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
+      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
    acceleratorCopySynchronise(); // MPI prob slower
    }
  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
    this->StencilSendToRecvFromComplete(list,dir);
  }
  return off_node_bytes;
@@ -435,6 +435,10 @@ int CartesianCommunicator::RankWorld(void){
  MPI_Comm_rank(communicator_world,&r);
  return r;
 }
 void CartesianCommunicator::BarrierWorld(void){
  int ierr = MPI_Barrier(communicator_world);
  assert(ierr==0);
 }
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
  int ierr= MPI_Bcast(data,
@@ -45,12 +45,14 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
 CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank) 
  : CartesianCommunicator(processors) 
 {
  _shm_processors = Coordinate(processors.size(),1);
  srank=0;
  SetCommunicator(communicator_world);
 }
 CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
 {
  _shm_processors = Coordinate(processors.size(),1);
  _processors = processors;
  _ndimension = processors.size();  assert(_ndimension>=1);
  _processor_coor.resize(_ndimension);
@@ -102,6 +104,7 @@ int  CartesianCommunicator::RankWorld(void){return 0;}
 void CartesianCommunicator::Barrier(void){}
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
 void CartesianCommunicator::BarrierWorld(void) { }
 int  CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) {  return 0;}
 void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){  coor = _processor_coor; }
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
@@ -111,21 +114,21 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
 }
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
-						     int xmit_to_rank,
+						     int xmit_to_rank,int dox,
 						     void *recv,
-						     int recv_from_rank,
+						     int recv_from_rank,int dor,
 						     int bytes, int dir)
 {
  return 2.0*bytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
-							 int xmit_to_rank,
+							 int xmit_to_rank,int dox,
 							 void *recv,
-							 int recv_from_rank,
+							 int recv_from_rank,int dor,
-							 int bytes, int dir)
+							 int xbytes,int rbytes, int dir)
 {
-  return 2.0*bytes;
+  return xbytes+rbytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
@@ -91,6 +91,59 @@ void *SharedMemory::ShmBufferSelf(void)
  //std::cerr << "ShmBufferSelf "<<ShmRank<<" "<<std::hex<< ShmCommBufs[ShmRank] <<std::dec<<std::endl;
  return ShmCommBufs[ShmRank];
 }
 static inline int divides(int a,int b)
 {
  return ( b == ( (b/a)*a ) );
 }
 void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
 {
  ////////////////////////////////////////////////////////////////
  // Allow user to configure through environment variable
  ////////////////////////////////////////////////////////////////
  char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
  if ( str ) {
    std::vector<int> IntShmDims;
    GridCmdOptionIntVector(std::string(str),IntShmDims);
    assert(IntShmDims.size() == WorldDims.size());
    long ShmSize = 1;
    for (int dim=0;dim<WorldDims.size();dim++) {
      ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
      assert(divides(ShmDims[dim],WorldDims[dim]));
    }
    assert(ShmSize == WorldShmSize);
    return;
  }
  ////////////////////////////////////////////////////////////////
  // Powers of 2,3,5 only in prime decomposition for now
  ////////////////////////////////////////////////////////////////
  int ndimension = WorldDims.size();
  ShmDims=Coordinate(ndimension,1);
  std::vector<int> primes({2,3,5});
  int dim = 0;
  int last_dim = ndimension - 1;
  int AutoShmSize = 1;
  while(AutoShmSize != WorldShmSize) {
    int p;
    for(p=0;p<primes.size();p++) {
      int prime=primes[p];
      if ( divides(prime,WorldDims[dim]/ShmDims[dim])
        && divides(prime,WorldShmSize/AutoShmSize)  ) {
  AutoShmSize*=prime;
  ShmDims[dim]*=prime;
  last_dim = dim;
  break;
      }
    }
    if (p == primes.size() && last_dim == dim) {
      std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
      exit(EXIT_FAILURE);
    }
    dim=(dim+1) %ndimension;
  }
 }
 NAMESPACE_END(Grid); 
@@ -93,9 +93,10 @@ public:
  // Create an optimal reordered communicator that makes MPI_Cart_create get it right
  //////////////////////////////////////////////////////////////////////////////////////
  static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
-  static void OptimalCommunicator            (const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
+  // Turns MPI_COMM_WORLD into right layout for Cartesian
-  static void OptimalCommunicatorHypercube   (const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
+  static void OptimalCommunicator            (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims); 
-  static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
+  static void OptimalCommunicatorHypercube   (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims); 
  static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims); 
  static void GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims);
  ///////////////////////////////////////////////////
  // Provide shared memory facilities off comm world
@@ -27,6 +27,8 @@ Author: Christoph Lehner <christoph@lhnr.de>
 *************************************************************************************/
 /*  END LEGAL */
 #define Mheader "SharedMemoryMpi: "
 #include <Grid/GridCore.h>
 #include <pwd.h>
@@ -36,12 +38,120 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #ifdef GRID_HIP
 #include <hip/hip_runtime_api.h>
 #endif
-#ifdef GRID_SYCl
+#ifdef GRID_SYCL
-
+#define GRID_SYCL_LEVEL_ZERO_IPC
 #include <syscall.h>
 #define SHM_SOCKETS 
 #endif
 #include <sys/socket.h>
 #include <sys/un.h>
 NAMESPACE_BEGIN(Grid); 
-#define header "SharedMemoryMpi: "
+
 #ifdef SHM_SOCKETS
 /*
 * Barbaric extra intranode communication route in case we need sockets to pass FDs
 * Forced by level_zero not being nicely designed
 */
 static int sock;
 static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d";
 static char sock_path[256];
 class UnixSockets {
 public:
  static void Open(int rank)
  {
    int errnum;
    sock = socket(AF_UNIX, SOCK_DGRAM, 0);  assert(sock>0);
    struct sockaddr_un sa_un = { 0 };
    sa_un.sun_family = AF_UNIX;
    snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,rank);
    unlink(sa_un.sun_path);
    if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) {
      perror("bind failure");
      exit(EXIT_FAILURE);
    }
  }
  static int RecvFileDescriptor(void)
  {
    int n;
    int fd;
    char buf[1];
    struct iovec iov;
    struct msghdr msg;
    struct cmsghdr *cmsg;
    char cms[CMSG_SPACE(sizeof(int))];
    iov.iov_base = buf;
    iov.iov_len = 1;
    memset(&msg, 0, sizeof msg);
    msg.msg_name = 0;
    msg.msg_namelen = 0;
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1;
    msg.msg_control = (caddr_t)cms;
    msg.msg_controllen = sizeof cms;
    if((n=recvmsg(sock, &msg, 0)) < 0) {
      perror("recvmsg failed");
      return -1;
    }
    if(n == 0){
      perror("recvmsg returned 0");
      return -1;
    }
    cmsg = CMSG_FIRSTHDR(&msg);
    memmove(&fd, CMSG_DATA(cmsg), sizeof(int));
    return fd;
  }
  static void SendFileDescriptor(int fildes,int xmit_to_rank)
  {
    struct msghdr msg;
    struct iovec iov;
    struct cmsghdr *cmsg = NULL;
    char ctrl[CMSG_SPACE(sizeof(int))];
    char data = ' ';
    memset(&msg, 0, sizeof(struct msghdr));
    memset(ctrl, 0, CMSG_SPACE(sizeof(int)));
    iov.iov_base = &data;
    iov.iov_len = sizeof(data);
    sprintf(sock_path,sock_path_fmt,xmit_to_rank);
    struct sockaddr_un sa_un = { 0 };
    sa_un.sun_family = AF_UNIX;
    snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,xmit_to_rank);
    msg.msg_name = (void *)&sa_un;
    msg.msg_namelen = sizeof(sa_un);
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1;
    msg.msg_controllen =  CMSG_SPACE(sizeof(int));
    msg.msg_control = ctrl;
    cmsg = CMSG_FIRSTHDR(&msg);
    cmsg->cmsg_level = SOL_SOCKET;
    cmsg->cmsg_type = SCM_RIGHTS;
    cmsg->cmsg_len = CMSG_LEN(sizeof(int));
    *((int *) CMSG_DATA(cmsg)) = fildes;
    sendmsg(sock, &msg, 0);
  };
 };
 #endif
 /*Construct from an MPI communicator*/
 void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
 {
@@ -64,8 +174,8 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
  MPI_Comm_size(WorldShmComm     ,&WorldShmSize);
  if ( WorldRank == 0) {
-    std::cout << header " World communicator of size " <<WorldSize << std::endl;  
+    std::cout << Mheader " World communicator of size " <<WorldSize << std::endl;  
-    std::cout << header " Node  communicator of size " <<WorldShmSize << std::endl;
+    std::cout << Mheader " Node  communicator of size " <<WorldShmSize << std::endl;
  }
  // WorldShmComm, WorldShmSize, WorldShmRank
@@ -152,7 +262,7 @@ int Log2Size(int TwoToPower,int MAXLOG2)
  }
  return log2size;
 }
-void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
+void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
 {
  //////////////////////////////////////////////////////////////////////////////
  // Look and see if it looks like an HPE 8600 based on hostname conventions
@@ -165,63 +275,11 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
  gethostname(name,namelen);
  int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
-  if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm);
+  if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM);
-  else                          OptimalCommunicatorSharedMemory(processors,optimal_comm);
+  else                          OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM);
 }
 static inline int divides(int a,int b)
 {
  return ( b == ( (b/a)*a ) );
 }
 void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
 {
  ////////////////////////////////////////////////////////////////
  // Allow user to configure through environment variable
  ////////////////////////////////////////////////////////////////
  char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
  if ( str ) {
    std::vector<int> IntShmDims;
    GridCmdOptionIntVector(std::string(str),IntShmDims);
    assert(IntShmDims.size() == WorldDims.size());
    long ShmSize = 1;
    for (int dim=0;dim<WorldDims.size();dim++) {
      ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
      assert(divides(ShmDims[dim],WorldDims[dim]));
    }
    assert(ShmSize == WorldShmSize);
    return;
  }
-  ////////////////////////////////////////////////////////////////
+void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
  // Powers of 2,3,5 only in prime decomposition for now
  ////////////////////////////////////////////////////////////////
  int ndimension = WorldDims.size();
  ShmDims=Coordinate(ndimension,1);
  std::vector<int> primes({2,3,5});
  int dim = 0;
  int last_dim = ndimension - 1;
  int AutoShmSize = 1;
  while(AutoShmSize != WorldShmSize) {
    int p;
    for(p=0;p<primes.size();p++) {
      int prime=primes[p];
      if ( divides(prime,WorldDims[dim]/ShmDims[dim])
        && divides(prime,WorldShmSize/AutoShmSize)  ) {
 	AutoShmSize*=prime;
 	ShmDims[dim]*=prime;
 	last_dim = dim;
 	break;
      }
    }
    if (p == primes.size() && last_dim == dim) {
      std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
      exit(EXIT_FAILURE);
    }
    dim=(dim+1) %ndimension;
  }
 }
 void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
 {
  ////////////////////////////////////////////////////////////////
  // Assert power of two shm_size.
@@ -294,6 +352,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
  Coordinate HyperCoor(ndimension);
  GetShmDims(WorldDims,ShmDims);
  SHM = ShmDims;
  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
@@ -341,7 +400,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
  int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
  assert(ierr==0);
 }
-void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
+void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
 {
  ////////////////////////////////////////////////////////////////
  // Identify subblock of ranks on node spreading across dims
@@ -353,6 +412,8 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
  Coordinate ShmCoor(ndimension);    Coordinate NodeCoor(ndimension);   Coordinate WorldCoor(ndimension);
  GetShmDims(WorldDims,ShmDims);
  SHM=ShmDims;
  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
  ////////////////////////////////////////////////////////////////
@@ -391,7 +452,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
 #ifdef GRID_MPI3_SHMGET
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
-  std::cout << header "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
+  std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
@@ -476,7 +537,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    exit(EXIT_FAILURE);  
  }
-  std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
+  std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
 	    << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
  SharedMemoryZero(ShmCommBuf,bytes);
@@ -519,16 +580,21 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    exit(EXIT_FAILURE);  
  }
  if ( WorldRank == 0 ){
-    std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
+    std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
-	      << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
+	      << "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
  }
  SharedMemoryZero(ShmCommBuf,bytes);
  std::cout<< "Setting up IPC"<<std::endl;
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Loop over ranks/gpu's on our node
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 #ifdef SHM_SOCKETS
  UnixSockets::Open(WorldShmRank);
 #endif
  for(int r=0;r<WorldShmSize;r++){
    MPI_Barrier(WorldShmComm);
 #ifndef GRID_MPI3_SHM_NONE
    //////////////////////////////////////////////////
    // If it is me, pass around the IPC access key
@@ -536,7 +602,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    void * thisBuf = ShmCommBuf;
    if(!Stencil_force_mpi) {
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
-    typedef struct { int fd; pid_t pid ; } clone_mem_t;
+    typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
    auto zeDevice    = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
    auto zeContext   = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
@@ -547,13 +613,21 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    if ( r==WorldShmRank ) { 
      auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle);
      if ( err != ZE_RESULT_SUCCESS ) {
-	std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
+	std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
 	exit(EXIT_FAILURE);
      } else {
 	std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
      }
      memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
      handle.pid = getpid();
      memcpy((void *)&handle.ze,(void *)&ihandle,sizeof(ihandle));
 #ifdef SHM_SOCKETS
      for(int rr=0;rr<WorldShmSize;rr++){
 	if(rr!=r){
 	  UnixSockets::SendFileDescriptor(handle.fd,rr);
 	}
      }
 #endif
    }
 #endif
 #ifdef GRID_CUDA
@@ -581,6 +655,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    // Share this IPC handle across the Shm Comm
    //////////////////////////////////////////////////
    { 
      MPI_Barrier(WorldShmComm);
      int ierr=MPI_Bcast(&handle,
 			 sizeof(handle),
 			 MPI_BYTE,
@@ -596,6 +671,10 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
    if ( r!=WorldShmRank ) {
      thisBuf = nullptr;
      int myfd;
 #ifdef SHM_SOCKETS
      myfd=UnixSockets::RecvFileDescriptor();
 #else
      std::cout<<"mapping seeking remote pid/fd "
 	       <<handle.pid<<"/"
 	       <<handle.fd<<std::endl;
@@ -603,16 +682,22 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
      int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
      std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
      //      int myfd  = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
-      int myfd  = syscall(438,pidfd,handle.fd,0);
+      myfd  = syscall(438,pidfd,handle.fd,0);
-
+      int err_t = errno;
-      std::cout<<"Using IpcHandle myfd "<<myfd<<"\n";
+      if (myfd < 0) {
-      
+        fprintf(stderr,"pidfd_getfd returned %d errno was %d\n", myfd,err_t); fflush(stderr);
 	perror("pidfd_getfd failed ");
 	assert(0);
      }
 #endif
      std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
      memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
      memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
      auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf);
      if ( err != ZE_RESULT_SUCCESS ) {
-	std::cout << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
+	std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
-	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; 
+	std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; 
 	exit(EXIT_FAILURE);
      } else {
 	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
@@ -647,6 +732,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #else
    WorldShmCommBufs[r] = ShmCommBuf;
 #endif
    MPI_Barrier(WorldShmComm);
  }
  _ShmAllocBytes=bytes;
@@ -658,7 +744,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef GRID_MPI3_SHMMMAP
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
-  std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
+  std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -695,7 +781,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    assert(((uint64_t)ptr&0x3F)==0);
    close(fd);
    WorldShmCommBufs[r] =ptr;
-    //    std::cout << header "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
+    //    std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
  }
  _ShmAlloc=1;
  _ShmAllocBytes  = bytes;
@@ -705,7 +791,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef GRID_MPI3_SHM_NONE
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
-  std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
+  std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -752,7 +838,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ////////////////////////////////////////////////////////////////////////////////////////////
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 { 
-  std::cout << header "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
+  std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0); 
  MPI_Barrier(WorldShmComm);
@@ -48,9 +48,10 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
  _ShmSetup=1;
 }
-void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
+void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
 {
  optimal_comm = WorldComm;
  SHM = Coordinate(processors.size(),1);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////
@@ -297,6 +297,30 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
  }
 }
 #if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
 template <typename T>
 T iDivUp(T a, T b) // Round a / b to nearest higher integer value
 { return (a % b != 0) ? (a / b + 1) : (a / b); }
 template <typename T>
 __global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride)
 {
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (idx >= e1*e2) return;
    int n, b, o;
    n = idx / e2;
    b = idx % e2;
    o = n*stride + b;
    vector[2*idx + 0] = lo + o;
    vector[2*idx + 1] = ro + o;
 }
 #endif
 //////////////////////////////////////////////////////
 // local to node block strided copies
 //////////////////////////////////////////////////////
@@ -321,12 +345,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
  int ent=0;
  if(cbmask == 0x3 ){
 #if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
    ent = e1*e2;
    dim3 blockSize(acceleratorThreads());
    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
    accelerator_barrier();
 #else
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int o =n*stride+b;
 	Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
      }
    }
 #endif
  } else { 
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
@@ -377,11 +409,19 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
  int ent=0;
  if ( cbmask == 0x3 ) {
 #if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
    ent = e1*e2;
    dim3 blockSize(acceleratorThreads());
    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
    accelerator_barrier();
 #else
    for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
      int o  =n*stride;
      Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
    }}
 #endif
  } else {
    for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
@@ -46,3 +46,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_unary.h>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
 #include <Grid/lattice/Lattice_crc.h>
@@ -63,7 +63,7 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate,
  typename std::remove_const<vobj>::type ret;
  typedef typename vobj::scalar_object scalar_object;
-  typedef typename vobj::scalar_type scalar_type;
+  //  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  const int Nsimd = vobj::vector_type::Nsimd();
@@ -36,6 +36,7 @@ NAMESPACE_BEGIN(Grid);
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("mult");
  ret.Checkerboard() = lhs.Checkerboard();
  autoView( ret_v , ret, AcceleratorWrite);
  autoView( lhs_v , lhs, AcceleratorRead);
@@ -53,6 +54,7 @@ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
 template<class obj1,class obj2,class obj3> inline
 void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("mac");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
@@ -70,6 +72,7 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
 template<class obj1,class obj2,class obj3> inline
 void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("sub");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
@@ -86,6 +89,7 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
 }
 template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("add");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
@@ -106,6 +110,7 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  GRID_TRACE("mult");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(lhs,ret);
  autoView( ret_v , ret, AcceleratorWrite);
@@ -119,6 +124,7 @@ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
 template<class obj1,class obj2,class obj3> inline
 void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  GRID_TRACE("mac");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,lhs);
  autoView( ret_v , ret, AcceleratorWrite);
@@ -133,6 +139,7 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
 template<class obj1,class obj2,class obj3> inline
 void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  GRID_TRACE("sub");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,lhs);
  autoView( ret_v , ret, AcceleratorWrite);
@@ -146,6 +153,7 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
 }
 template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  GRID_TRACE("add");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(lhs,ret);
  autoView( ret_v , ret, AcceleratorWrite);
@@ -163,6 +171,7 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("mult");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@@ -177,6 +186,7 @@ void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
 template<class obj1,class obj2,class obj3> inline
 void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("mac");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@@ -191,6 +201,7 @@ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
 template<class obj1,class obj2,class obj3> inline
 void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("sub");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@@ -204,6 +215,7 @@ void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
 }
 template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("add");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@@ -218,6 +230,7 @@ void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
 template<class sobj,class vobj> inline
 void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
  GRID_TRACE("axpy");
  ret.Checkerboard() = x.Checkerboard();
  conformable(ret,x);
  conformable(x,y);
@@ -231,6 +244,7 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
 }
 template<class sobj,class vobj> inline
 void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
  GRID_TRACE("axpby");
  ret.Checkerboard() = x.Checkerboard();
  conformable(ret,x);
  conformable(x,y);
@@ -246,11 +260,13 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
 template<class sobj,class vobj> inline
 RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
 {
  GRID_TRACE("axpy_norm");
    return axpy_norm_fast(ret,a,x,y);
 }
 template<class sobj,class vobj> inline
 RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
 {
  GRID_TRACE("axpby_norm");
    return axpby_norm_fast(ret,a,b,x,y);
 }
@@ -88,6 +88,13 @@ public:
    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
    accessor.ViewClose();
  }
  // Helper function to print the state of this object in the AccCache
  void PrintCacheState(void)
  {
    MemoryManager::PrintState(this->_odata);
  }
  /////////////////////////////////////////////////////////////////////////////////
  // Return a view object that may be dereferenced in site loops.
  // The view is trivially copy constructible and may be copied to an accelerator device
@@ -110,6 +117,7 @@ public:
  ////////////////////////////////////////////////////////////////////////////////
  template <typename Op, typename T1> inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
  {
    GRID_TRACE("ExpressionTemplateEval");
    GridBase *egrid(nullptr);
    GridFromExpression(egrid,expr);
    assert(egrid!=nullptr);
@@ -133,6 +141,7 @@ public:
  }
  template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
  {
    GRID_TRACE("ExpressionTemplateEval");
    GridBase *egrid(nullptr);
    GridFromExpression(egrid,expr);
    assert(egrid!=nullptr);
@@ -156,6 +165,7 @@ public:
  }
  template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
  {
    GRID_TRACE("ExpressionTemplateEval");
    GridBase *egrid(nullptr);
    GridFromExpression(egrid,expr);
    assert(egrid!=nullptr);
@@ -281,8 +291,8 @@ public:
    typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
    conformable(*this,r);
    this->checkerboard = r.Checkerboard();
    auto me =   View(AcceleratorWriteDiscard);
    auto him= r.View(AcceleratorRead);
    auto me =   View(AcceleratorWriteDiscard);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      coalescedWrite(me[ss],him(ss));
    });
@@ -296,8 +306,8 @@ public:
  inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
    this->checkerboard = r.Checkerboard();
    conformable(*this,r);
    auto me =   View(AcceleratorWriteDiscard);
    auto him= r.View(AcceleratorRead);
    auto me =   View(AcceleratorWriteDiscard);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      coalescedWrite(me[ss],him(ss));
    });
@@ -0,0 +1,55 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_crc.h
    Copyright (C) 2021
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1)
 {
  auto ff = localNorm2(f);
  if ( mu==-1 ) mu = f.Grid()->Nd()-1;
  typedef typename vobj::tensor_reduced normtype;
  typedef typename normtype::scalar_object scalar;
  std::vector<scalar> sff;
  sliceSum(ff,sff,mu);
  for(int t=0;t<sff.size();t++){
    std::cout << s<<" "<<t<<" "<<sff[t]<<std::endl;
  }
 }
 template<class vobj> uint32_t crc(Lattice<vobj> & buf)
 {
  autoView( buf_v , buf, CpuRead);
  return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
 }
 #define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
 NAMESPACE_END(Grid);
@@ -32,7 +32,6 @@ template<class vobj>
 static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
 {    
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@@ -82,7 +81,6 @@ template<class vobj>
 static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0) 
 {    
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@@ -130,7 +128,6 @@ template<class vobj>
 static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
 {
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  GridBase *FullGrid  = lhs.Grid();
@@ -96,9 +96,6 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
  GridBase *grid=l.Grid();
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nsimd = grid->Nsimd();
  assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
@@ -125,14 +122,17 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
 //////////////////////////////////////////////////////////
 // Peek a scalar object from the SIMD array
 //////////////////////////////////////////////////////////
 template<class vobj>
 typename vobj::scalar_object peekSite(const Lattice<vobj> &l,const Coordinate &site){
  typename vobj::scalar_object s;
  peekSite(s,l,site);
  return s;
 }        
 template<class vobj,class sobj>
 void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
  GridBase *grid=l.Grid();
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nsimd = grid->Nsimd();
  assert( l.Checkerboard() == l.Grid()->CheckerBoard(site));
@@ -173,11 +173,11 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
  idx= grid->iIndex(site);
  odx= grid->oIndex(site);
-  scalar_type * vp = (scalar_type *)&l[odx];
+  const vector_type *vp = (const vector_type *) &l[odx];
  scalar_type * pt = (scalar_type *)&s;
  for(int w=0;w<words;w++){
-    pt[w] = vp[idx+w*Nsimd];
+    pt[w] = getlane(vp[w],idx);
  }
  return;
@@ -210,10 +210,10 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
  idx= grid->iIndex(site);
  odx= grid->oIndex(site);
-  scalar_type * vp = (scalar_type *)&l[odx];
+  vector_type * vp = (vector_type *)&l[odx];
  scalar_type * pt = (scalar_type *)&s;
  for(int w=0;w<words;w++){
-    vp[idx+w*Nsimd] = pt[w];
+    putlane(vp[w],pt[w],idx);
  }
  return;
 };
@@ -28,6 +28,9 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #if defined(GRID_CUDA)||defined(GRID_HIP)
 #include <Grid/lattice/Lattice_reduction_gpu.h>
 #endif
 #if defined(GRID_SYCL)
 #include <Grid/lattice/Lattice_reduction_sycl.h>
 #endif
 NAMESPACE_BEGIN(Grid);
@@ -91,10 +94,7 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
  for(int i=0;i<nthread;i++){
    ssum = ssum+sumarray[i];
  } 
-  
+  return ssum;
  typedef typename vobj::scalar_object ssobj;
  ssobj ret = ssum;
  return ret;
 }
 /*
 Threaded max, don't use for now
@@ -127,7 +127,7 @@ inline Double max(const Double *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  return sum_gpu(arg,osites);
 #else
  return sum_cpu(arg,osites);
@@ -136,25 +136,61 @@ inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  return sumD_gpu(arg,osites);
 #else
  return sumD_cpu(arg,osites);
 #endif  
 }
 template<class vobj>
 inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
 {
 #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  return sumD_gpu_large(arg,osites);
 #else
  return sumD_cpu(arg,osites);
 #endif  
 }
 template<class vobj>
 inline typename vobj::scalar_object rankSum(const Lattice<vobj> &arg)
 {
  Integer osites = arg.Grid()->oSites();
 #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  autoView( arg_v, arg, AcceleratorRead);
  return sum_gpu(&arg_v[0],osites);
 #else
  autoView(arg_v, arg, CpuRead);
  return sum_cpu(&arg_v[0],osites);
 #endif  
 }
 template<class vobj>
 inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
+  auto ssum = rankSum(arg);
  arg.Grid()->GlobalSum(ssum);
  return ssum;
 }
 template<class vobj>
 inline typename vobj::scalar_object rankSumLarge(const Lattice<vobj> &arg)
 {
 #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  autoView( arg_v, arg, AcceleratorRead);
  Integer osites = arg.Grid()->oSites();
-  auto ssum= sum_gpu(&arg_v[0],osites);
+  return sum_gpu_large(&arg_v[0],osites);
 #else
  autoView(arg_v, arg, CpuRead);
  Integer osites = arg.Grid()->oSites();
-  auto ssum= sum_cpu(&arg_v[0],osites);
+  return sum_cpu(&arg_v[0],osites);
 #endif
 }
 template<class vobj>
 inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
 {
  auto ssum = rankSumLarge(arg);
  arg.Grid()->GlobalSum(ssum);
  return ssum;
 }
@@ -197,7 +233,6 @@ template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
 template<class vobj>
 inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
 {
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_typeD vector_type;
  ComplexD  nrm;
@@ -207,24 +242,40 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
  const uint64_t sites = grid->oSites();
  // Might make all code paths go this way.
 #if 0
  typedef decltype(innerProductD(vobj(),vobj())) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  {
    autoView( left_v , left, AcceleratorRead);
    autoView( right_v,right, AcceleratorRead);
    // This code could read coalesce
    // GPU - SIMT lane compliance...
    accelerator_for( ss, sites, nsimd,{
 	auto x_l = left_v(ss);
 	auto y_l = right_v(ss);
 	coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l));
    });
  }
 #else
  typedef decltype(innerProduct(vobj(),vobj())) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  {
    autoView( left_v , left, AcceleratorRead);
    autoView( right_v,right, AcceleratorRead);
    // GPU - SIMT lane compliance...
-    accelerator_for( ss, sites, 1,{
+    accelerator_for( ss, sites, nsimd,{
-	auto x_l = left_v[ss];
+	auto x_l = left_v(ss);
-	auto y_l = right_v[ss];
+	auto y_l = right_v(ss);
-	inner_tmp_v[ss]=innerProductD(x_l,y_l);
+	coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
    });
  }
-
+#endif
  // This is in single precision and fails some tests
-  auto anrm = sum(inner_tmp_v,sites);  
+  auto anrm = sumD(inner_tmp_v,sites);  
  nrm = anrm;
  return nrm;
 }
@@ -257,8 +308,7 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
  conformable(z,x);
  conformable(x,y);
-  typedef typename vobj::scalar_type scalar_type;
+  //  typedef typename vobj::vector_typeD vector_type;
  typedef typename vobj::vector_typeD vector_type;
  RealD  nrm;
  GridBase *grid = x.Grid();
@@ -270,17 +320,29 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
  autoView( x_v, x, AcceleratorRead);
  autoView( y_v, y, AcceleratorRead);
  autoView( z_v, z, AcceleratorWrite);
-
+#if 0
  typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
-  accelerator_for( ss, sites, 1,{
+  accelerator_for( ss, sites, nsimd,{
-      auto tmp = a*x_v[ss]+b*y_v[ss];
+      auto tmp = a*x_v(ss)+b*y_v(ss);
-      inner_tmp_v[ss]=innerProductD(tmp,tmp);
+      coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp));
-      z_v[ss]=tmp;
+      coalescedWrite(z_v[ss],tmp);
  });
  nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
 #else
  typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  accelerator_for( ss, sites, nsimd,{
      auto tmp = a*x_v(ss)+b*y_v(ss);
      coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
      coalescedWrite(z_v[ss],tmp);
  });
  nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
 #endif
  grid->GlobalSum(nrm);
  return nrm; 
 }
@@ -290,7 +352,6 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
 {
  conformable(left,right);
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_typeD vector_type;
  Vector<ComplexD> tmp(2);
@@ -434,6 +495,14 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  int words = fd*sizeof(sobj)/sizeof(scalar_type);
  grid->GlobalSumVector(ptr, words);
 }
 template<class vobj> inline
 std::vector<typename vobj::scalar_object> 
 sliceSum(const Lattice<vobj> &Data,int orthogdim)
 {
  std::vector<typename vobj::scalar_object> result;
  sliceSum(Data,result,orthogdim);
  return result;
 }
 template<class vobj>
 static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) 
@@ -539,6 +608,7 @@ template<class vobj>
 static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
 			    int orthogdim,RealD scale=1.0) 
 {
  // perhaps easier to just promote A to a field and use regular madd
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
@@ -569,8 +639,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
    for(int l=0;l<Nsimd;l++){
      grid->iCoorFromIindex(icoor,l);
      int ldx =r+icoor[orthogdim]*rd;
-      scalar_type *as =(scalar_type *)&av;
+      av.putlane(scalar_type(a[ldx])*zscale,l);
      as[l] = scalar_type(a[ldx])*zscale;
    }
    tensor_reduced at; at=av;
@@ -610,7 +679,6 @@ template<class vobj>
 static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
 {    
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@@ -664,7 +732,6 @@ template<class vobj>
 static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0) 
 {    
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@@ -718,7 +785,6 @@ template<class vobj>
 static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
 {
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  GridBase *FullGrid  = lhs.Grid();
@@ -23,7 +23,7 @@ unsigned int nextPow2(Iterator x) {
 }
 template <class Iterator>
-void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
+int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
  int device;
 #ifdef GRID_CUDA
@@ -37,14 +37,13 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
  Iterator sharedMemPerBlock   = gpu_props[device].sharedMemPerBlock;
  Iterator maxThreadsPerBlock  = gpu_props[device].maxThreadsPerBlock;
  Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;
-  
+  /*  
  std::cout << GridLogDebug << "GPU has:" << std::endl;
  std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl;
  std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl;
  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl;
  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << warpSize << std::endl;
  std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
-  
+  */  
  if (warpSize != WARP_SIZE) {
    std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
    exit(EXIT_FAILURE);
@@ -52,10 +51,14 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
  // let the number of threads in a block be a multiple of 2, starting from warpSize
  threads = warpSize;
  if ( threads*sizeofsobj > sharedMemPerBlock ) {
    std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
    return 0;
  }
  while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
  // keep all the streaming multiprocessors busy
  blocks = nextPow2(multiProcessorCount);
-  
+  return 1;
 }
 template <class sobj, class Iterator>
@@ -195,7 +198,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
 // Possibly promote to double and sum
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites) 
+inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites) 
 {
  typedef typename vobj::scalar_objectD sobj;
  typedef decltype(lat) Iterator;
@@ -204,17 +207,77 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
  Integer size = osites*nsimd;
  Integer numThreads, numBlocks;
-  getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
+  int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
-  Integer smemSize = numThreads * sizeof(sobj);
+  assert(ok);
  Integer smemSize = numThreads * sizeof(sobj);
  // Move out of UVM
  // Turns out I had messed up the synchronise after move to compute stream
  // as running this on the default stream fools the synchronise
 #undef UVM_BLOCK_BUFFER  
 #ifndef UVM_BLOCK_BUFFER  
  commVector<sobj> buffer(numBlocks);
  sobj *buffer_v = &buffer[0];
  sobj result;
  reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
  accelerator_barrier();
  acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
 #else
  Vector<sobj> buffer(numBlocks);
  sobj *buffer_v = &buffer[0];
-  
+  sobj result;
-  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
+  reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
  accelerator_barrier();
-  auto result = buffer_v[0];
+  result = *buffer_v;
 #endif
  return result;
 }
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
 {
  typedef typename vobj::vector_type  vector;
  typedef typename vobj::scalar_typeD scalarD;
  typedef typename vobj::scalar_objectD sobj;
  sobj ret;
  scalarD *ret_p = (scalarD *)&ret;
  const int words = sizeof(vobj)/sizeof(vector);
  Vector<vector> buffer(osites);
  vector *dat = (vector *)lat;
  vector *buf = &buffer[0];
  iScalar<vector> *tbuf =(iScalar<vector> *)  &buffer[0];
  for(int w=0;w<words;w++) {
    accelerator_for(ss,osites,1,{
 	buf[ss] = dat[ss*words+w];
      });
    ret_p[w] = sumD_gpu_small(tbuf,osites);
  }
  return ret;
 }
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
 {
  typedef typename vobj::scalar_objectD sobj;
  sobj ret;
  Integer nsimd= vobj::Nsimd();
  Integer size = osites*nsimd;
  Integer numThreads, numBlocks;
  int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
  if ( ok ) {
    ret = sumD_gpu_small(lat,osites);
  } else {
    ret = sumD_gpu_large(lat,osites);
  }
  return ret;
 }
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Return as same precision as input performing reduction in double precision though
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -227,6 +290,13 @@ inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
  return result;
 }
-
+template <class vobj>
 inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
 {
  typedef typename vobj::scalar_object sobj;
  sobj result;
  result = sumD_gpu_large(lat,osites);
  return result;
 }
 NAMESPACE_END(Grid);
@@ -0,0 +1,125 @@
 NAMESPACE_BEGIN(Grid);
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Possibly promote to double and sum
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites) 
 {
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_objectD sobjD;
  sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator);
  sobj identity; zeroit(identity);
  sobj ret ; 
  Integer nsimd= vobj::Nsimd();
  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
     auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>());
     cgh.parallel_for(cl::sycl::range<1>{osites},
 		      Reduction,
 		      [=] (cl::sycl::id<1> item, auto &sum) {
      auto osite   = item[0];
      sum +=Reduce(lat[osite]);
     });
   });
  theGridAccelerator->wait();
  ret = mysum[0];
  free(mysum,*theGridAccelerator);
  sobjD dret; convertType(dret,ret);
  return dret;
 }
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
 {
  return sumD_gpu_tensor(lat,osites);
 }
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
 {
  return sumD_gpu_large(lat,osites);
 }
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
 {
  return sumD_gpu_large(lat,osites);
 }
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Return as same precision as input performing reduction in double precision though
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <class vobj>
 inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites) 
 {
  typedef typename vobj::scalar_object sobj;
  sobj result;
  result = sumD_gpu(lat,osites);
  return result;
 }
 template <class vobj>
 inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
 {
  typedef typename vobj::scalar_object sobj;
  sobj result;
  result = sumD_gpu_large(lat,osites);
  return result;
 }
 NAMESPACE_END(Grid);
 /*
 template<class Double> Double svm_reduce(Double *vec,uint64_t L)
 {
  Double sumResult; zeroit(sumResult);
  Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator);
  Double identity;  zeroit(identity);
  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
     auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>());
     cgh.parallel_for(cl::sycl::range<1>{L},
 		      Reduction,
 		      [=] (cl::sycl::id<1> index, auto &sum) {
 	 sum +=vec[index];
     });
   });
  theGridAccelerator->wait();
  Double ret = d_sum[0];
  free(d_sum,*theGridAccelerator);
  std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl;
  return ret;
 }
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
 {
  typedef typename vobj::vector_type  vector;
  typedef typename vobj::scalar_type  scalar;
  typedef typename vobj::scalar_typeD scalarD;
  typedef typename vobj::scalar_objectD sobjD;
  sobjD ret;
  scalarD *ret_p = (scalarD *)&ret;
  const int nsimd = vobj::Nsimd();
  const int words = sizeof(vobj)/sizeof(vector);
  Vector<scalar> buffer(osites*nsimd);
  scalar *buf = &buffer[0];
  vector *dat = (vector *)lat;
  for(int w=0;w<words;w++) {
    accelerator_for(ss,osites,nsimd,{
 	int lane = acceleratorSIMTlane(nsimd);
 	buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
    });
    //Precision change at this point is to late to gain precision
    ret_p[w] = svm_reduce(buf,nsimd*osites);
  }
  return ret;
 }
 */
@@ -424,9 +424,33 @@ public:
    // MT implementation does not implement fast discard even though
    // in principle this is possible
    ////////////////////////////////////////////////
 #if 1
    thread_for( lidx, _grid->lSites(), {
 	int gidx;
 	int o_idx;
 	int i_idx;
 	int rank;
 	Coordinate pcoor;
 	Coordinate lcoor;
 	Coordinate gcoor;
 	_grid->LocalIndexToLocalCoor(lidx,lcoor);
 	pcoor=_grid->ThisProcessorCoor();
 	_grid->ProcessorCoorLocalCoorToGlobalCoor(pcoor,lcoor,gcoor);
 	_grid->GlobalCoorToGlobalIndex(gcoor,gidx);
 	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
 	assert(rank == _grid->ThisRank() );
 	int l_idx=generator_idx(o_idx,i_idx);
 	_generators[l_idx] = master_engine;
 	Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
    });
 #else
    // Everybody loops over global volume.
    thread_for( gidx, _grid->_gsites, {
 	// Where is it?
 	int rank;
 	int o_idx;
@@ -443,6 +467,7 @@ public:
 	  Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
 	}
    });
 #endif
 #else 
    ////////////////////////////////////////////////////////////////
    // Machine and thread decomposition dependent seeding is efficient
@@ -85,6 +85,76 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
  });
 }
 template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0)
 {
  half.Checkerboard() = cb;
  autoView(half_v, half, AcceleratorWrite);
  autoView(full_v, full, AcceleratorRead);
  Coordinate rdim_full             = full.Grid()->_rdimensions;
  Coordinate rdim_half             = half.Grid()->_rdimensions;
  unsigned long ndim_half          = half.Grid()->_ndimension;
  Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
  Coordinate ostride_half          = half.Grid()->_ostride;
  accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
    Coordinate coor;
    int cbos;
    int linear=0;
    Lexicographic::CoorFromIndex(coor,ss,rdim_full);
    assert(coor.size()==ndim_half);
    for(int d=0;d<ndim_half;d++){ 
      if(checker_dim_mask_half[d]) linear += coor[d];
    }
    cbos = (linear&0x1);
    if (cbos==cb) {
      int ssh=0;
      for(int d=0;d<ndim_half;d++) {
        if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
        else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
      }
      coalescedWrite(half_v[ssh],full_v(ss));
    }
  });
 }
 template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0)
 {
  int cb = half.Checkerboard();
  autoView(half_v , half, AcceleratorRead);
  autoView(full_v , full, AcceleratorWrite);
  Coordinate rdim_full             = full.Grid()->_rdimensions;
  Coordinate rdim_half             = half.Grid()->_rdimensions;
  unsigned long ndim_half          = half.Grid()->_ndimension;
  Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
  Coordinate ostride_half          = half.Grid()->_ostride;
  accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
    Coordinate coor;
    int cbos;
    int linear=0;
    Lexicographic::CoorFromIndex(coor,ss,rdim_full);
    assert(coor.size()==ndim_half);
    for(int d=0;d<ndim_half;d++){ 
      if(checker_dim_mask_half[d]) linear += coor[d];
    }
    cbos = (linear&0x1);
    if (cbos==cb) {
      int ssh=0;
      for(int d=0;d<ndim_half;d++){
        if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
        else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
      }
      coalescedWrite(full_v[ss],half_v(ssh));
    }
  });
 }
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Flexible Type Conversion for internal promotion to double as well as graceful
 // treatment of scalar-compatible types
@@ -124,11 +194,11 @@ accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
 #endif
 accelerator_inline void convertType(vComplexF & out, const vComplexD2 & in) {
-  out.v = Optimization::PrecisionChange::DtoS(in._internal[0].v,in._internal[1].v);
+  precisionChange(out,in);
 }
 accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) {
-  Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v);
+  precisionChange(out,in);
 }
 template<typename T1,typename T2>
@@ -218,7 +288,36 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
    blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed); 
  }
 }
 template<class vobj,class CComplex,int nbasis,class VLattice>
 inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
                               const std::vector<Lattice<vobj>> &fineData,
                               const VLattice &Basis)
 {
  int NBatch = fineData.size();
  assert(coarseData.size() == NBatch);
  GridBase * fine  = fineData[0].Grid();
  GridBase * coarse= coarseData[0].Grid();
  Lattice<iScalar<CComplex>> ip(coarse);
  std::vector<Lattice<vobj>> fineDataCopy = fineData;
  autoView(ip_, ip, AcceleratorWrite);
  for(int v=0;v<nbasis;v++) {
    for (int k=0; k<NBatch; k++) {
      autoView( coarseData_ , coarseData[k], AcceleratorWrite);
      blockInnerProductD(ip,Basis[v],fineDataCopy[k]); // ip = <basis|fine>
      accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
        convertType(coarseData_[sc](v),ip_[sc]);
      });
      // improve numerical stability of projection
      // |fine> = |fine> - <basis|fine> |basis>
      ip=-ip;
      blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]); 
    }
  }
 }
 template<class vobj,class vobj2,class CComplex>
  inline void blockZAXPY(Lattice<vobj> &fineZ,
@@ -520,6 +619,26 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 }
 #endif
 template<class vobj,class CComplex,int nbasis,class VLattice>
 inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
                               std::vector<Lattice<vobj>> &fineData,
                               const VLattice &Basis)
 {
  int NBatch = coarseData.size();
  assert(fineData.size() == NBatch);
  GridBase * fine   = fineData[0].Grid();
  GridBase * coarse = coarseData[0].Grid();
  for (int k=0; k<NBatch; k++)
    fineData[k]=Zero();
  for (int i=0;i<nbasis;i++) {
    for (int k=0; k<NBatch; k++) {
      Lattice<iScalar<CComplex>> ip = PeekIndex<0>(coarseData[k],i);
      blockZAXPY(fineData[k],ip,Basis[i],fineData[k]);
    }
  }
 }
 // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
 // Simd layouts need not match since we use peek/poke Local
 template<class vobj,class vvobj>
@@ -588,9 +707,9 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
  Coordinate ist = Tg->_istride;
  Coordinate ost = Tg->_ostride;
-  autoView( t_v , To, AcceleratorWrite);
+  autoView( t_v , To, CpuWrite);
-  autoView( f_v , From, AcceleratorRead);
+  autoView( f_v , From, CpuRead);
-  accelerator_for(idx,Fg->lSites(),1,{
+  thread_for(idx,Fg->lSites(),{
    sobj s;
    Coordinate Fcoor(nd);
    Coordinate Tcoor(nd);
@@ -603,15 +722,20 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
      Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d];
    }
    if (in_region) {
-      Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]);
+#if 0      
-      Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]);
+      Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]); // inner index from
-      Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]);
+      Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]); // inner index to
-      Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]);
+      Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]); // outer index from
      Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]); // outer index to
      scalar_type * fp = (scalar_type *)&f_v[odx_f];
      scalar_type * tp = (scalar_type *)&t_v[odx_t];
      for(int w=0;w<words;w++){
-	tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd];  // FIXME IF RRII layout, type pun no worke
+	tp[w].putlane(fp[w].getlane(idx_f),idx_t);
      }
 #else
    peekLocalSite(s,f_v,Fcoor);
    pokeLocalSite(s,t_v,Tcoor);
 #endif
    }
  });
 }
@@ -785,7 +909,7 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
 template<class vobj>
-void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
+void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
 {
  typedef typename vobj::scalar_object sobj;
@@ -1010,9 +1134,27 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
  });
 }
-//Convert a Lattice from one precision to another
+//Very fast precision change. Requires in/out objects to reside on same Grid (e.g. by using double2 for the double-precision field)
 template<class VobjOut, class VobjIn>
-void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
+void precisionChangeFast(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
 {
  typedef typename VobjOut::vector_type Vout;
  typedef typename VobjIn::vector_type Vin;
  const int N = sizeof(VobjOut)/sizeof(Vout);
  conformable(out.Grid(),in.Grid());
  out.Checkerboard() = in.Checkerboard();
  int nsimd = out.Grid()->Nsimd();
  autoView( out_v  , out, AcceleratorWrite);
  autoView(  in_v ,   in, AcceleratorRead);
  accelerator_for(idx,out.Grid()->oSites(),1,{
      Vout *vout = (Vout *)&out_v[idx];
      Vin  *vin  = (Vin  *)&in_v[idx];
      precisionChange(vout,vin,N);
  });
 }
 //Convert a Lattice from one precision to another (original, slow implementation)
 template<class VobjOut, class VobjIn>
 void precisionChangeOrig(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
 {
  assert(out.Grid()->Nd() == in.Grid()->Nd());
  for(int d=0;d<out.Grid()->Nd();d++){
@@ -1027,7 +1169,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
  int ndim = out.Grid()->Nd();
  int out_nsimd = out_grid->Nsimd();
-    
+  int in_nsimd = in_grid->Nsimd();
  std::vector<Coordinate > out_icoor(out_nsimd);
  for(int lane=0; lane < out_nsimd; lane++){
@@ -1058,6 +1200,128 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
  });
 }
 //The workspace for a precision change operation allowing for the reuse of the mapping to save time on subsequent calls
 class precisionChangeWorkspace{
  std::pair<Integer,Integer>* fmap_device; //device pointer
  //maintain grids for checking
  GridBase* _out_grid;
  GridBase* _in_grid;
 public:
  precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid): _out_grid(out_grid), _in_grid(in_grid){
    //Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
    assert(out_grid->Nd() == in_grid->Nd());
    for(int d=0;d<out_grid->Nd();d++){
      assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
    }
    int Nsimd_out = out_grid->Nsimd();
    std::vector<Coordinate> out_icorrs(out_grid->Nsimd()); //reuse these
    for(int lane=0; lane < out_grid->Nsimd(); lane++)
      out_grid->iCoorFromIindex(out_icorrs[lane], lane);
    std::vector<std::pair<Integer,Integer> > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd
    thread_for(out_oidx,out_grid->oSites(),{
 	Coordinate out_ocorr; 
 	out_grid->oCoorFromOindex(out_ocorr, out_oidx);
 	Coordinate lcorr; //the local coordinate (common to both in and out as full coordinate)
 	for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
 	  out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr);
 	  //int in_oidx = in_grid->oIndex(lcorr), in_lane = in_grid->iIndex(lcorr);
 	  //Note oIndex and OcorrFromOindex (and same for iIndex) are not inverse for checkerboarded lattice, the former coordinates being defined on the full lattice and the latter on the reduced lattice
 	  //Until this is fixed we need to circumvent the problem locally. Here I will use the coordinates defined on the reduced lattice for simplicity
 	  int in_oidx = 0, in_lane = 0;
 	  for(int d=0;d<in_grid->_ndimension;d++){
 	    in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] );
 	    in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] );
 	  }
 	  fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair<Integer,Integer>( in_oidx, in_lane );
 	}
      });
    //Copy the map to the device (if we had a way to tell if an accelerator is in use we could avoid this copy for CPU-only machines)
    size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair<Integer,Integer>);
    fmap_device = (std::pair<Integer,Integer>*)acceleratorAllocDevice(fmap_bytes);
    acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes); 
  }
  //Prevent moving or copying
  precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete;
  precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete;
  precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete;
  precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete;
  std::pair<Integer,Integer> const* getMap() const{ return fmap_device; }
  void checkGrids(GridBase* out, GridBase* in) const{
    conformable(out, _out_grid);
    conformable(in, _in_grid);
  }
  ~precisionChangeWorkspace(){
    acceleratorFreeDevice(fmap_device);
  }
 };
 //We would like to use precisionChangeFast when possible. However usage of this requires the Grids to be the same (runtime check)
 //*and* the precisionChange(VobjOut::vector_type, VobjIn, int) function to be defined for the types; this requires an extra compile-time check which we do using some SFINAE trickery
 template<class VobjOut, class VobjIn>
 auto _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, int dummy)->decltype( precisionChange( ((typename VobjOut::vector_type*)0), ((typename VobjIn::vector_type*)0), 1), int()){
  if(out.Grid() == in.Grid()){
    precisionChangeFast(out,in);
    return 1;
  }else{
    return 0;
  }
 }
 template<class VobjOut, class VobjIn>
 int _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, long dummy){ //note long here is intentional; it means the above is preferred if available
  return 0;
 }
 //Convert a lattice of one precision to another. Much faster than original implementation but requires a pregenerated workspace
 //which contains the mapping data.
 template<class VobjOut, class VobjIn>
 void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
  if(_precisionChangeFastWrap(out,in,0)) return;
  static_assert( std::is_same<typename VobjOut::scalar_typeD, typename VobjIn::scalar_typeD>::value == 1, "precisionChange: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
  out.Checkerboard() = in.Checkerboard();
  constexpr int Nsimd_out = VobjOut::Nsimd();
  workspace.checkGrids(out.Grid(),in.Grid());
  std::pair<Integer,Integer> const* fmap_device = workspace.getMap();
  //Do the copy/precision change
  autoView( out_v , out, AcceleratorWrite);
  autoView( in_v , in, AcceleratorRead);
  accelerator_for(out_oidx, out.Grid()->oSites(), 1,{
      std::pair<Integer,Integer> const* fmap_osite = fmap_device + out_oidx*Nsimd_out;
      for(int out_lane=0; out_lane < Nsimd_out; out_lane++){      
 	int in_oidx = fmap_osite[out_lane].first;
 	int in_lane = fmap_osite[out_lane].second;
 	copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane);
      }
    });
 }
 //Convert a Lattice from one precision to another. Much faster than original implementation but slower than precisionChangeFast
 //or precisionChange called with pregenerated workspace, as it needs to internally generate the workspace on the host and copy to device
 template<class VobjOut, class VobjIn>
 void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
  if(_precisionChangeFastWrap(out,in,0)) return;   
  precisionChangeWorkspace workspace(out.Grid(), in.Grid());
  precisionChange(out, in, workspace);
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Communicate between grids
 ////////////////////////////////////////////////////////////////////////////////
@@ -0,0 +1,136 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/PaddedCell.h
    Copyright (C) 2019
 Author: Peter Boyle pboyle@bnl.gov
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 class PaddedCell {
 public:
  GridCartesian * unpadded_grid;
  int dims;
  int depth;
  std::vector<GridCartesian *> grids;
  ~PaddedCell()
  {
    DeleteGrids();
  }
  PaddedCell(int _depth,GridCartesian *_grid)
  {
    unpadded_grid = _grid;
    depth=_depth;
    dims=_grid->Nd();
    AllocateGrids();
    Coordinate local     =unpadded_grid->LocalDimensions();
    for(int d=0;d<dims;d++){
      assert(local[d]>=depth);
    }
  }
  void DeleteGrids(void)
  {
    for(int d=0;d<grids.size();d++){
      delete grids[d];
    }
    grids.resize(0);
  };
  void AllocateGrids(void)
  {
    Coordinate local     =unpadded_grid->LocalDimensions();
    Coordinate simd      =unpadded_grid->_simd_layout;
    Coordinate processors=unpadded_grid->_processors;
    Coordinate plocal    =unpadded_grid->LocalDimensions();
    Coordinate global(dims);
    // expand up one dim at a time
    for(int d=0;d<dims;d++){
      plocal[d] += 2*depth; 
      for(int d=0;d<dims;d++){
 	global[d] = plocal[d]*processors[d];
      }
      grids.push_back(new GridCartesian(global,simd,processors));
    }
  };
  template<class vobj>
  inline Lattice<vobj> Extract(Lattice<vobj> &in)
  {
    Lattice<vobj> out(unpadded_grid);
    Coordinate local     =unpadded_grid->LocalDimensions();
    Coordinate fll(dims,depth); // depends on the MPI spread
    Coordinate tll(dims,0); // depends on the MPI spread
    localCopyRegion(in,out,fll,tll,local);
    return out;
  }
  template<class vobj>
  inline Lattice<vobj> Exchange(Lattice<vobj> &in)
  {
    GridBase *old_grid = in.Grid();
    int dims = old_grid->Nd();
    Lattice<vobj> tmp = in;
    for(int d=0;d<dims;d++){
      tmp = Expand(d,tmp); // rvalue && assignment
    }
    return tmp;
  }
  // expand up one dim at a time
  template<class vobj>
  inline Lattice<vobj> Expand(int dim,Lattice<vobj> &in)
  {
    GridBase *old_grid = in.Grid();
    GridCartesian *new_grid = grids[dim];//These are new grids
    Lattice<vobj>  padded(new_grid);
    Lattice<vobj> shifted(old_grid);    
    Coordinate local     =old_grid->LocalDimensions();
    Coordinate plocal    =new_grid->LocalDimensions();
    if(dim==0) conformable(old_grid,unpadded_grid);
    else       conformable(old_grid,grids[dim-1]);
    std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
    // Middle bit
    for(int x=0;x<local[dim];x++){
      InsertSliceLocal(in,padded,x,depth+x,dim);
    }
    // High bit
    shifted = Cshift(in,dim,depth);
    for(int x=0;x<depth;x++){
      InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
    }
    // Low bit
    shifted = Cshift(in,dim,-depth);
    for(int x=0;x<depth;x++){
      InsertSliceLocal(shifted,padded,x,x,dim);
    }
    return padded;
  }
 };
 NAMESPACE_END(Grid);
@@ -65,29 +65,40 @@ GridLogger GridLogSolver (1, "Solver", GridLogColours, "NORMAL");
 GridLogger GridLogError  (1, "Error" , GridLogColours, "RED");
 GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
 GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
 GridLogger GridLogMemory (1, "Memory", GridLogColours, "NORMAL");
 GridLogger GridLogTracing(1, "Tracing", GridLogColours, "NORMAL");
 GridLogger GridLogDebug  (1, "Debug", GridLogColours, "PURPLE");
 GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
 GridLogger GridLogDslash     (1, "Dslash", GridLogColours, "BLUE");
 GridLogger GridLogIterative  (1, "Iterative", GridLogColours, "BLUE");
 GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
 GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE");
 void GridLogConfigure(std::vector<std::string> &logstreams) {
-  GridLogError.Active(0);
+  GridLogError.Active(1);
  GridLogWarning.Active(0);
  GridLogMessage.Active(1); // at least the messages should be always on
  GridLogMemory.Active(0); 
  GridLogTracing.Active(0); 
  GridLogIterative.Active(0);
  GridLogDebug.Active(0);
  GridLogPerformance.Active(0);
  GridLogDslash.Active(0);
  GridLogIntegrator.Active(1);
  GridLogColours.Active(0);
  GridLogHMC.Active(1);
  for (int i = 0; i < logstreams.size(); i++) {
-    if (logstreams[i] == std::string("Error"))       GridLogError.Active(1);
+    if (logstreams[i] == std::string("Tracing"))     GridLogTracing.Active(1);
    if (logstreams[i] == std::string("Memory"))      GridLogMemory.Active(1);
    if (logstreams[i] == std::string("Warning"))     GridLogWarning.Active(1);
    if (logstreams[i] == std::string("NoMessage"))   GridLogMessage.Active(0);
    if (logstreams[i] == std::string("Iterative"))   GridLogIterative.Active(1);
    if (logstreams[i] == std::string("Debug"))       GridLogDebug.Active(1);
    if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
-    if (logstreams[i] == std::string("Integrator"))  GridLogIntegrator.Active(1);
+    if (logstreams[i] == std::string("Dslash"))      GridLogDslash.Active(1);
    if (logstreams[i] == std::string("NoIntegrator"))GridLogIntegrator.Active(0);
    if (logstreams[i] == std::string("NoHMC"))       GridLogHMC.Active(0);
    if (logstreams[i] == std::string("Colours"))     GridLogColours.Active(1);
  }
 }
@@ -138,7 +138,8 @@ public:
        stream << std::setw(log.topWidth);
      }
      stream << log.topName << log.background()<< " : ";
-      stream << log.colour() <<  std::left;
+      //      stream << log.colour() <<  std::left;
      stream <<  std::left;
      if (log.chanWidth > 0)
      {
        stream << std::setw(log.chanWidth);
@@ -153,9 +154,9 @@ public:
 	stream << log.evidence()
 	       << now	       << log.background() << " : " ;
      }
-      stream << log.colour();
+      //      stream << log.colour();
      stream <<  std::right;
      stream.flags(f);
      return stream;
    } else { 
      return devnull;
@@ -180,8 +181,12 @@ extern GridLogger GridLogWarning;
 extern GridLogger GridLogMessage;
 extern GridLogger GridLogDebug  ;
 extern GridLogger GridLogPerformance;
 extern GridLogger GridLogDslash;
 extern GridLogger GridLogIterative  ;
 extern GridLogger GridLogIntegrator  ;
 extern GridLogger GridLogHMC;
 extern GridLogger GridLogMemory;
 extern GridLogger GridLogTracing;
 extern Colours    GridLogColours;
 std::string demangle(const char* name) ;
@@ -31,6 +31,7 @@ directory
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <string>
 #include <map>
 #include <pwd.h>
@@ -576,6 +577,8 @@ class ScidacReader : public GridLimeReader {
    std::string rec_name(ILDG_BINARY_DATA);
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
      if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) )  ) {
  // in principle should do the line below, but that breaks backard compatibility with old data
  // skipPastObjectRecord(std::string(GRID_FIELD_NORM));
 	skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
 	return;
      }
@@ -652,7 +655,8 @@ class IldgWriter : public ScidacWriter {
    // Fill ILDG header data struct
    //////////////////////////////////////////////////////
    ildgFormat ildgfmt ;
-    ildgfmt.field     = std::string("su3gauge");
+    const std::string stNC = std::to_string( Nc ) ;
    ildgfmt.field          = std::string("su"+stNC+"gauge");
    if ( format == std::string("IEEE32BIG") ) { 
      ildgfmt.precision = 32;
@@ -869,7 +873,8 @@ class IldgReader : public GridLimeReader {
    } else { 
      assert(found_ildgFormat);
-      assert ( ildgFormat_.field == std::string("su3gauge") );
+      const std::string stNC = std::to_string( Nc ) ;
      assert ( ildgFormat_.field == std::string("su"+stNC+"gauge") );
      ///////////////////////////////////////////////////////////////////////////////////////
      // Populate our Grid metadata as best we can
@@ -877,7 +882,7 @@ class IldgReader : public GridLimeReader {
      std::ostringstream vers; vers << ildgFormat_.version;
      FieldMetaData_.hdr_version = vers.str();
-      FieldMetaData_.data_type = std::string("4D_SU3_GAUGE_3X3");
+      FieldMetaData_.data_type = std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC);
      FieldMetaData_.nd=4;
      FieldMetaData_.dimension.resize(4);
@@ -6,8 +6,8 @@
    Copyright (C) 2015
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -182,8 +182,8 @@ class GaugeStatistics
 public:
  void operator()(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
  {
-    header.link_trace=WilsonLoops<Impl>::linkTrace(data);
+    header.link_trace = WilsonLoops<Impl>::linkTrace(data);
-    header.plaquette =WilsonLoops<Impl>::avgPlaquette(data);
+    header.plaquette  = WilsonLoops<Impl>::avgPlaquette(data);
  }
 };
 typedef GaugeStatistics<PeriodicGimplD> PeriodicGaugeStatistics;
@@ -203,20 +203,24 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
 //////////////////////////////////////////////////////////////////////
 inline void reconstruct3(LorentzColourMatrix & cm)
 {
-  const int x=0;
+  assert( Nc < 4 && Nc > 1 ) ;
  const int y=1;
  const int z=2;
  for(int mu=0;mu<Nd;mu++){
    #if Nc == 2
      cm(mu)()(1,0) = -adj(cm(mu)()(0,y)) ;
      cm(mu)()(1,1) =  adj(cm(mu)()(0,x)) ;
    #else
      const int x=0 , y=1 , z=2 ; // a little disinenuous labelling
      cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
      cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
      cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
    #endif
  }
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Some data types for intermediate storage
 ////////////////////////////////////////////////////////////////////////////////
-template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;
+template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, Nc-1>, Nd >;
 typedef iLorentzColour2x3<Complex>  LorentzColour2x3;
 typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
@@ -278,7 +282,6 @@ struct GaugeSimpleMunger{
 template <class fobj, class sobj>
 struct GaugeSimpleUnmunger {
  void operator()(sobj &in, fobj &out) {
    for (int mu = 0; mu < Nd; mu++) {
      for (int i = 0; i < Nc; i++) {
@@ -317,8 +320,8 @@ template<class fobj,class sobj>
 struct Gauge3x2munger{
  void operator() (fobj &in,sobj &out){
    for(int mu=0;mu<Nd;mu++){
-      for(int i=0;i<2;i++){
+      for(int i=0;i<Nc-1;i++){
-	for(int j=0;j<3;j++){
+	for(int j=0;j<Nc;j++){
 	  out(mu)()(i,j) = in(mu)(i)(j);
 	}}
    }
@@ -330,8 +333,8 @@ template<class fobj,class sobj>
 struct Gauge3x2unmunger{
  void operator() (sobj &in,fobj &out){
    for(int mu=0;mu<Nd;mu++){
-      for(int i=0;i<2;i++){
+      for(int i=0;i<Nc-1;i++){
-	for(int j=0;j<3;j++){
+	for(int j=0;j<Nc;j++){
 	  out(mu)(i)(j) = in(mu)()(i,j);
 	}}
    }
@@ -9,6 +9,7 @@
    Author: Matt Spraggs <matthew.spraggs@gmail.com>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -30,6 +31,8 @@
 #ifndef GRID_NERSC_IO_H
 #define GRID_NERSC_IO_H
 #include <string>
 NAMESPACE_BEGIN(Grid);
 using namespace Grid;
@@ -39,9 +42,11 @@ using namespace Grid;
 ////////////////////////////////////////////////////////////////////////////////
 class NerscIO : public BinaryIO { 
 public:
  typedef Lattice<vLorentzColourMatrixD> GaugeField;
  // Enable/disable exiting if the plaquette in the header does not match the value computed (default true)
  static bool & exitOnReadPlaquetteMismatch(){ static bool v=true; return v; }
  static inline void truncate(std::string file){
    std::ofstream fout(file,std::ios::out);
  }
@@ -145,15 +150,17 @@ public:
    std::string format(header.floating_point);
-    int ieee32big = (format == std::string("IEEE32BIG"));
+    const int ieee32big = (format == std::string("IEEE32BIG"));
-    int ieee32    = (format == std::string("IEEE32"));
+    const int ieee32    = (format == std::string("IEEE32"));
-    int ieee64big = (format == std::string("IEEE64BIG"));
+    const int ieee64big = (format == std::string("IEEE64BIG"));
-    int ieee64    = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
+    const int ieee64    = (format == std::string("IEEE64") || \
 			   format == std::string("IEEE64LITTLE"));
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    // depending on datatype, set up munger;
    // munger is a function of <floating point, Real, data_type>
-    if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
+    const std::string stNC = std::to_string( Nc ) ;
    if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE") ) {
      if ( ieee32 || ieee32big ) {
 	BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F> 
 	  (Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
@@ -164,7 +171,7 @@ public:
 	  (Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
 	   nersc_csum,scidac_csuma,scidac_csumb);
      }
-    } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
+    } else if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC) ) {
      if ( ieee32 || ieee32big ) {
 	BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF>
 	  (Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
@@ -198,7 +205,7 @@ public:
      std::cerr << " nersc_csum  " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
      exit(0);
    }
-    assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
+    if(exitOnReadPlaquetteMismatch()) assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
    assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
    assert(nersc_csum == header.checksum );
@@ -209,27 +216,29 @@ public:
  template<class GaugeStats=PeriodicGaugeStatistics>
  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
-					std::string ens_label = std::string("DWF"))
+					std::string ens_label = std::string("DWF"),
 					std::string ens_id = std::string("UKQCD"),
 					unsigned int sequence_number = 1)
  {
-    writeConfiguration(Umu,file,0,1,ens_label);
+    writeConfiguration(Umu,file,0,1,ens_label,ens_id,sequence_number);
  }
  template<class GaugeStats=PeriodicGaugeStatistics>
  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
 					int two_row,
 					int bits32,
-					std::string ens_label = std::string("DWF"))
+					std::string ens_label = std::string("DWF"),
 					std::string ens_id = std::string("UKQCD"),
 					unsigned int sequence_number = 1)
  {
    typedef vLorentzColourMatrixD vobj;
    typedef typename vobj::scalar_object sobj;
    FieldMetaData header;
-    ///////////////////////////////////////////
+    header.sequence_number = sequence_number;
-    // Following should become arguments
+    header.ensemble_id     = ens_id;
    ///////////////////////////////////////////
    header.sequence_number = 1;
    header.ensemble_id     = std::string("UKQCD");
    header.ensemble_label  = ens_label;
    header.hdr_version     = "1.0" ;
    typedef LorentzColourMatrixD fobj3D;
    typedef LorentzColour2x3D    fobj2D;
@@ -243,10 +252,14 @@ public:
    uint64_t offset;
-    // Sod it -- always write 3x3 double
+    // Sod it -- always write NcxNc double
    header.floating_point  = std::string("IEEE64BIG");
-    header.data_type      = std::string("4D_SU3_GAUGE_3x3");
+    const std::string stNC = std::to_string( Nc ) ;
-    GaugeSimpleUnmunger<fobj3D,sobj> munge;
+    if( two_row ) {
      header.data_type = std::string("4D_SU" + stNC + "_GAUGE" );
    } else {
      header.data_type = std::string("4D_SU" + stNC + "_GAUGE_" + stNC + "x" + stNC );
    }
    if ( grid->IsBoss() ) { 
      truncate(file);
      offset = writeHeader(header,file);
@@ -254,8 +267,15 @@ public:
    grid->Broadcast(0,(void *)&offset,sizeof(offset));
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    if( two_row ) {
      Gauge3x2unmunger<fobj2D,sobj> munge;
      BinaryIO::writeLatticeObject<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point,
 						nersc_csum,scidac_csuma,scidac_csumb);
    } else {
      GaugeSimpleUnmunger<fobj3D,sobj> munge;
      BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
 						nersc_csum,scidac_csuma,scidac_csumb);
    }
    header.checksum = nersc_csum;
    if ( grid->IsBoss() ) { 
      writeHeader(header,file);
@@ -288,7 +308,6 @@ public:
    MachineCharacteristics(header);
    uint64_t offset;
 #ifdef RNG_RANLUX
    header.floating_point = std::string("UINT64");
    header.data_type      = std::string("RANLUX48");
@@ -27,10 +27,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /*  END LEGAL */
 #include <Grid/GridCore.h>
 #include <Grid/perfmon/PerfCount.h>
 #include <Grid/perfmon/Timer.h>
 #include <Grid/perfmon/PerfCount.h>
 NAMESPACE_BEGIN(Grid);
 GridTimePoint theProgramStart = GridClock::now();
 #define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
 #define RawConfig(A,B) (A<<8|B)
 const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = {
@@ -30,6 +30,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_PERFCOUNT_H
 #define GRID_PERFCOUNT_H
 #ifndef __SSC_START
 #define __SSC_START
 #define __SSC_STOP
 #endif
 #include <sys/time.h>
 #include <ctime>
 #include <chrono>
@@ -72,17 +78,9 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
 inline uint64_t cyclecount(void){ 
  return 0;
 }
 #define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx")
 #define __SSC_STOP  __SSC_MARK(0x110)
 #define __SSC_START __SSC_MARK(0x111)
 #else
 #define __SSC_MARK(mark) 
 #define __SSC_STOP  
 #define __SSC_START 
 /*
 * cycle counters arch dependent
 */
@@ -35,17 +35,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid)
-// Dress the output; use std::chrono
+//typedef  std::chrono::system_clock          GridClock;
-// C++11 time facilities better?
+typedef  std::chrono::high_resolution_clock   GridClock;
 inline double usecond(void) {
  struct timeval tv;
 #ifdef TIMERS_ON
  gettimeofday(&tv,NULL);
 #endif
  return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
 }
 typedef  std::chrono::system_clock          GridClock;
 typedef  std::chrono::time_point<GridClock> GridTimePoint;
 typedef  std::chrono::seconds               GridSecs;
@@ -53,6 +44,15 @@ typedef  std::chrono::milliseconds          GridMillisecs;
 typedef  std::chrono::microseconds          GridUsecs;
 typedef  std::chrono::microseconds          GridTime;
 extern GridTimePoint theProgramStart;
 // Dress the output; use std::chrono
 // C++11 time facilities better?
 inline double usecond(void) {
  auto usecs = std::chrono::duration_cast<GridUsecs>(GridClock::now()-theProgramStart); 
  return 1.0*usecs.count();
 }
 inline std::ostream& operator<< (std::ostream & stream, const GridSecs & time)
 {
  stream << time.count()<<" s";
@@ -0,0 +1,70 @@
 #pragma once
 NAMESPACE_BEGIN(Grid);
 #ifdef GRID_TRACING_NVTX
 #include <nvToolsExt.h>
 class GridTracer {
 public:
  GridTracer(const char* name) {
    nvtxRangePushA(name);
  }
  ~GridTracer() {
    nvtxRangePop();
  }
 };
 inline void tracePush(const char *name) { nvtxRangePushA(name); }
 inline void tracePop(const char *name) { nvtxRangePop(); }
 inline int  traceStart(const char *name) {  }
 inline void traceStop(int ID) {  }
 #endif
 #ifdef GRID_TRACING_ROCTX
 #include <roctracer/roctx.h>
 class GridTracer {
 public:
  GridTracer(const char* name) {
    roctxRangePushA(name);
    std::cout << "roctxRangePush "<<name<<std::endl;
  }
  ~GridTracer() {
    roctxRangePop();
    std::cout << "roctxRangePop "<<std::endl;
  }
 };
 inline void tracePush(const char *name) { roctxRangePushA(name); }
 inline void tracePop(const char *name) { roctxRangePop(); }
 inline int  traceStart(const char *name) { roctxRangeStart(name); }
 inline void traceStop(int ID) { roctxRangeStop(ID); }
 #endif
 #ifdef GRID_TRACING_TIMER
 class GridTracer {
 public:
  const char *name;
  double elapsed;
  GridTracer(const char* _name) {
    name = _name;
    elapsed=-usecond();
  }
  ~GridTracer() {
    elapsed+=usecond();
    std::cout << GridLogTracing << name << " took " <<elapsed<< " us" <<std::endl;
  }
 };
 inline void tracePush(const char *name) {  }
 inline void tracePop(const char *name) {  }
 inline int  traceStart(const char *name) { return 0; }
 inline void traceStop(int ID) {  }
 #endif
 #ifdef GRID_TRACING_NONE
 #define GRID_TRACE(name) 
 inline void tracePush(const char *name) {  }
 inline void tracePop(const char *name) {  }
 inline int  traceStart(const char *name) { return 0;  }
 inline void traceStop(int ID) {  }
 #else
 #define GRID_TRACE(name) GridTracer uniq_name_using_macros##__COUNTER__(name);
 #endif
 NAMESPACE_END(Grid);
@@ -16,8 +16,12 @@
 #ifdef __NVCC__
 #pragma push
 #ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
 #pragma nv_diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
 #else
 #pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
 #endif
 #endif
 #include "pugixml.h"
@@ -63,6 +63,7 @@ static constexpr int Ngp=2; // gparity index range
 #define ColourIndex  (2)
 #define SpinIndex    (1)
 #define LorentzIndex (0)
 #define GparityFlavourIndex (0)
 // Also should make these a named enum type
 static constexpr int DaggerNo=0;
@@ -87,6 +88,8 @@ template<typename T> struct isCoarsened {
 template <typename T> using IfCoarsened    = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
 template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;
 const int GparityFlavourTensorIndex = 3; //TensorLevel counts from the bottom!
 // ChrisK very keen to add extra space for Gparity doubling.
 //
 // Also add domain wall index, in a way where Wilson operator 
@@ -101,6 +104,7 @@ template<typename vtype> using iSpinMatrix                = iScalar<iMatrix<iSca
 template<typename vtype> using iColourMatrix              = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
 template<typename vtype> using iSpinColourMatrix          = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
 template<typename vtype> using iLorentzColourMatrix       = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
 template<typename vtype> using iLorentzComplex            = iVector<iScalar<iScalar<vtype> >, Nd > ;
 template<typename vtype> using iDoubleStoredColourMatrix  = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
 template<typename vtype> using iSpinVector                = iScalar<iVector<iScalar<vtype>, Ns> >;
 template<typename vtype> using iColourVector              = iScalar<iScalar<iVector<vtype, Nc> > >;
@@ -110,8 +114,10 @@ template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVec
    template<typename vtype> using iSpinColourSpinColourMatrix  = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;
 template<typename vtype> using iGparityFlavourVector                = iVector<iScalar<iScalar<vtype> >, Ngp>;
 template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
 template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
 template<typename vtype> using iGparityFlavourMatrix = iMatrix<iScalar<iScalar<vtype> >, Ngp>;
 // Spin matrix
 typedef iSpinMatrix<Complex  >          SpinMatrix;
@@ -121,6 +127,7 @@ typedef iSpinMatrix<ComplexD >          SpinMatrixD;
 typedef iSpinMatrix<vComplex >          vSpinMatrix;
 typedef iSpinMatrix<vComplexF>          vSpinMatrixF;
 typedef iSpinMatrix<vComplexD>          vSpinMatrixD;
 typedef iSpinMatrix<vComplexD2>         vSpinMatrixD2;
 // Colour Matrix
 typedef iColourMatrix<Complex  >        ColourMatrix;
@@ -130,6 +137,7 @@ typedef iColourMatrix<ComplexD >        ColourMatrixD;
 typedef iColourMatrix<vComplex >        vColourMatrix;
 typedef iColourMatrix<vComplexF>        vColourMatrixF;
 typedef iColourMatrix<vComplexD>        vColourMatrixD;
 typedef iColourMatrix<vComplexD2>       vColourMatrixD2;
 // SpinColour matrix
 typedef iSpinColourMatrix<Complex  >    SpinColourMatrix;
@@ -139,6 +147,7 @@ typedef iSpinColourMatrix<ComplexD >    SpinColourMatrixD;
 typedef iSpinColourMatrix<vComplex >    vSpinColourMatrix;
 typedef iSpinColourMatrix<vComplexF>    vSpinColourMatrixF;
 typedef iSpinColourMatrix<vComplexD>    vSpinColourMatrixD;
 typedef iSpinColourMatrix<vComplexD2>   vSpinColourMatrixD2;
 // SpinColourSpinColour matrix
 typedef iSpinColourSpinColourMatrix<Complex  >    SpinColourSpinColourMatrix;
@@ -148,6 +157,7 @@ typedef iSpinColourSpinColourMatrix<ComplexD >    SpinColourSpinColourMatrixD;
 typedef iSpinColourSpinColourMatrix<vComplex >    vSpinColourSpinColourMatrix;
 typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
 typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;
 typedef iSpinColourSpinColourMatrix<vComplexD2>   vSpinColourSpinColourMatrixD2;
 // SpinColourSpinColour matrix
 typedef iSpinColourSpinColourMatrix<Complex  >    SpinColourSpinColourMatrix;
@@ -157,6 +167,7 @@ typedef iSpinColourSpinColourMatrix<ComplexD >    SpinColourSpinColourMatrixD;
 typedef iSpinColourSpinColourMatrix<vComplex >    vSpinColourSpinColourMatrix;
 typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
 typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;
 typedef iSpinColourSpinColourMatrix<vComplexD2>   vSpinColourSpinColourMatrixD2;
 // LorentzColour
 typedef iLorentzColourMatrix<Complex  > LorentzColourMatrix;
@@ -166,6 +177,16 @@ typedef iLorentzColourMatrix<ComplexD > LorentzColourMatrixD;
 typedef iLorentzColourMatrix<vComplex >  vLorentzColourMatrix;
 typedef iLorentzColourMatrix<vComplexF>  vLorentzColourMatrixF;
 typedef iLorentzColourMatrix<vComplexD>  vLorentzColourMatrixD;
 typedef iLorentzColourMatrix<vComplexD2> vLorentzColourMatrixD2;
 // LorentzComplex
 typedef iLorentzComplex<Complex  > LorentzComplex;
 typedef iLorentzComplex<ComplexF > LorentzComplexF;
 typedef iLorentzComplex<ComplexD > LorentzComplexD;
 typedef iLorentzComplex<vComplex > vLorentzComplex;
 typedef iLorentzComplex<vComplexF> vLorentzComplexF;
 typedef iLorentzComplex<vComplexD> vLorentzComplexD;
 // DoubleStored gauge field
 typedef iDoubleStoredColourMatrix<Complex  > DoubleStoredColourMatrix;
@@ -175,6 +196,18 @@ typedef iDoubleStoredColourMatrix<ComplexD > DoubleStoredColourMatrixD;
 typedef iDoubleStoredColourMatrix<vComplex >  vDoubleStoredColourMatrix;
 typedef iDoubleStoredColourMatrix<vComplexF>  vDoubleStoredColourMatrixF;
 typedef iDoubleStoredColourMatrix<vComplexD>  vDoubleStoredColourMatrixD;
 typedef iDoubleStoredColourMatrix<vComplexD2> vDoubleStoredColourMatrixD2;
 //G-parity flavour matrix
 typedef iGparityFlavourMatrix<Complex> GparityFlavourMatrix;
 typedef iGparityFlavourMatrix<ComplexF> GparityFlavourMatrixF;
 typedef iGparityFlavourMatrix<ComplexD> GparityFlavourMatrixD;
 typedef iGparityFlavourMatrix<vComplex>   vGparityFlavourMatrix;
 typedef iGparityFlavourMatrix<vComplexF>  vGparityFlavourMatrixF;
 typedef iGparityFlavourMatrix<vComplexD>  vGparityFlavourMatrixD;
 typedef iGparityFlavourMatrix<vComplexD2> vGparityFlavourMatrixD2;
 // Spin vector
 typedef iSpinVector<Complex >           SpinVector;
@@ -184,6 +217,7 @@ typedef iSpinVector<ComplexD>           SpinVectorD;
 typedef iSpinVector<vComplex >           vSpinVector;
 typedef iSpinVector<vComplexF>           vSpinVectorF;
 typedef iSpinVector<vComplexD>           vSpinVectorD;
 typedef iSpinVector<vComplexD2>          vSpinVectorD2;
 // Colour vector
 typedef iColourVector<Complex >         ColourVector;
@@ -193,6 +227,7 @@ typedef iColourVector<ComplexD>         ColourVectorD;
 typedef iColourVector<vComplex >         vColourVector;
 typedef iColourVector<vComplexF>         vColourVectorF;
 typedef iColourVector<vComplexD>         vColourVectorD;
 typedef iColourVector<vComplexD2>        vColourVectorD2;
 // SpinColourVector
 typedef iSpinColourVector<Complex >     SpinColourVector;
@@ -202,6 +237,7 @@ typedef iSpinColourVector<ComplexD>     SpinColourVectorD;
 typedef iSpinColourVector<vComplex >     vSpinColourVector;
 typedef iSpinColourVector<vComplexF>     vSpinColourVectorF;
 typedef iSpinColourVector<vComplexD>     vSpinColourVectorD;
 typedef iSpinColourVector<vComplexD2>    vSpinColourVectorD2;
 // HalfSpin vector
 typedef iHalfSpinVector<Complex >       HalfSpinVector;
@@ -211,6 +247,7 @@ typedef iHalfSpinVector<ComplexD>       HalfSpinVectorD;
 typedef iHalfSpinVector<vComplex >       vHalfSpinVector;
 typedef iHalfSpinVector<vComplexF>       vHalfSpinVectorF;
 typedef iHalfSpinVector<vComplexD>       vHalfSpinVectorD;
 typedef iHalfSpinVector<vComplexD2>      vHalfSpinVectorD2;
 // HalfSpinColour vector
 typedef iHalfSpinColourVector<Complex > HalfSpinColourVector;
@@ -220,6 +257,17 @@ typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
 typedef iHalfSpinColourVector<vComplex >  vHalfSpinColourVector;
 typedef iHalfSpinColourVector<vComplexF>  vHalfSpinColourVectorF;
 typedef iHalfSpinColourVector<vComplexD>  vHalfSpinColourVectorD;
 typedef iHalfSpinColourVector<vComplexD2> vHalfSpinColourVectorD2;
 //G-parity flavour vector
 typedef iGparityFlavourVector<Complex >         GparityFlavourVector;
 typedef iGparityFlavourVector<ComplexF>         GparityFlavourVectorF;
 typedef iGparityFlavourVector<ComplexD>         GparityFlavourVectorD;
 typedef iGparityFlavourVector<vComplex >         vGparityFlavourVector;
 typedef iGparityFlavourVector<vComplexF>         vGparityFlavourVectorF;
 typedef iGparityFlavourVector<vComplexD>         vGparityFlavourVectorD;
 typedef iGparityFlavourVector<vComplexD2>        vGparityFlavourVectorD2;
 // singlets
 typedef iSinglet<Complex >         TComplex;     // FIXME This is painful. Tensor singlet complex type.
@@ -229,6 +277,7 @@ typedef iSinglet<ComplexD>         TComplexD;    // FIXME This is painful. Tenso
 typedef iSinglet<vComplex >        vTComplex ;   // what if we don't know the tensor structure
 typedef iSinglet<vComplexF>        vTComplexF;   // what if we don't know the tensor structure
 typedef iSinglet<vComplexD>        vTComplexD;   // what if we don't know the tensor structure
 typedef iSinglet<vComplexD2>       vTComplexD2;   // what if we don't know the tensor structure
 typedef iSinglet<Real >            TReal;        // Shouldn't need these; can I make it work without?
 typedef iSinglet<RealF>            TRealF;       // Shouldn't need these; can I make it work without?
@@ -246,47 +295,62 @@ typedef iSinglet<Integer >         TInteger;
 typedef Lattice<vColourMatrix>          LatticeColourMatrix;
 typedef Lattice<vColourMatrixF>         LatticeColourMatrixF;
 typedef Lattice<vColourMatrixD>         LatticeColourMatrixD;
 typedef Lattice<vColourMatrixD2>        LatticeColourMatrixD2;
 typedef Lattice<vSpinMatrix>            LatticeSpinMatrix;
 typedef Lattice<vSpinMatrixF>           LatticeSpinMatrixF;
 typedef Lattice<vSpinMatrixD>           LatticeSpinMatrixD;
 typedef Lattice<vSpinMatrixD2>          LatticeSpinMatrixD2;
 typedef Lattice<vSpinColourMatrix>      LatticeSpinColourMatrix;
 typedef Lattice<vSpinColourMatrixF>     LatticeSpinColourMatrixF;
 typedef Lattice<vSpinColourMatrixD>     LatticeSpinColourMatrixD;
 typedef Lattice<vSpinColourMatrixD2>    LatticeSpinColourMatrixD2;
 typedef Lattice<vSpinColourSpinColourMatrix>      LatticeSpinColourSpinColourMatrix;
 typedef Lattice<vSpinColourSpinColourMatrixF>     LatticeSpinColourSpinColourMatrixF;
 typedef Lattice<vSpinColourSpinColourMatrixD>     LatticeSpinColourSpinColourMatrixD;
 typedef Lattice<vSpinColourSpinColourMatrixD2>    LatticeSpinColourSpinColourMatrixD2;
 typedef Lattice<vLorentzColourMatrix>   LatticeLorentzColourMatrix;
 typedef Lattice<vLorentzColourMatrixF>  LatticeLorentzColourMatrixF;
 typedef Lattice<vLorentzColourMatrixD>  LatticeLorentzColourMatrixD;
 typedef Lattice<vLorentzColourMatrixD2> LatticeLorentzColourMatrixD2;
 typedef Lattice<vLorentzComplex>  LatticeLorentzComplex;
 typedef Lattice<vLorentzComplexF> LatticeLorentzComplexF;
 typedef Lattice<vLorentzComplexD> LatticeLorentzComplexD;
 // DoubleStored gauge field
 typedef Lattice<vDoubleStoredColourMatrix>   LatticeDoubleStoredColourMatrix;
 typedef Lattice<vDoubleStoredColourMatrixF>  LatticeDoubleStoredColourMatrixF;
 typedef Lattice<vDoubleStoredColourMatrixD>  LatticeDoubleStoredColourMatrixD;
 typedef Lattice<vDoubleStoredColourMatrixD2> LatticeDoubleStoredColourMatrixD2;
 typedef Lattice<vSpinVector>            LatticeSpinVector;
 typedef Lattice<vSpinVectorF>           LatticeSpinVectorF;
 typedef Lattice<vSpinVectorD>           LatticeSpinVectorD;
 typedef Lattice<vSpinVectorD2>          LatticeSpinVectorD2;
 typedef Lattice<vColourVector>          LatticeColourVector;
 typedef Lattice<vColourVectorF>         LatticeColourVectorF;
 typedef Lattice<vColourVectorD>         LatticeColourVectorD;
 typedef Lattice<vColourVectorD2>        LatticeColourVectorD2;
 typedef Lattice<vSpinColourVector>      LatticeSpinColourVector;
 typedef Lattice<vSpinColourVectorF>     LatticeSpinColourVectorF;
 typedef Lattice<vSpinColourVectorD>     LatticeSpinColourVectorD;
 typedef Lattice<vSpinColourVectorD2>    LatticeSpinColourVectorD2;
 typedef Lattice<vHalfSpinVector>        LatticeHalfSpinVector;
 typedef Lattice<vHalfSpinVectorF>       LatticeHalfSpinVectorF;
 typedef Lattice<vHalfSpinVectorD>       LatticeHalfSpinVectorD;
 typedef Lattice<vHalfSpinVectorD2>      LatticeHalfSpinVectorD2;
 typedef Lattice<vHalfSpinColourVector>   LatticeHalfSpinColourVector;
 typedef Lattice<vHalfSpinColourVectorF>  LatticeHalfSpinColourVectorF;
 typedef Lattice<vHalfSpinColourVectorD>  LatticeHalfSpinColourVectorD;
 typedef Lattice<vHalfSpinColourVectorD2> LatticeHalfSpinColourVectorD2;
 typedef Lattice<vTReal>            LatticeReal;
 typedef Lattice<vTRealF>           LatticeRealF;
@@ -295,6 +359,7 @@ typedef Lattice<vTRealD>           LatticeRealD;
 typedef Lattice<vTComplex>         LatticeComplex;
 typedef Lattice<vTComplexF>        LatticeComplexF;
 typedef Lattice<vTComplexD>        LatticeComplexD;
 typedef Lattice<vTComplexD2>       LatticeComplexD2;
 typedef Lattice<vTInteger>         LatticeInteger; // Predicates for "where"
@@ -304,35 +369,40 @@ typedef Lattice<vTInteger>         LatticeInteger; // Predicates for "where"
 ///////////////////////////////////////////
 typedef LatticeHalfSpinColourVector   LatticeHalfFermion;
 typedef LatticeHalfSpinColourVectorF  LatticeHalfFermionF;
-typedef LatticeHalfSpinColourVectorF LatticeHalfFermionD;
+typedef LatticeHalfSpinColourVectorD  LatticeHalfFermionD;
 typedef LatticeHalfSpinColourVectorD2 LatticeHalfFermionD2;
 typedef LatticeSpinColourVector      LatticeFermion;
 typedef LatticeSpinColourVectorF     LatticeFermionF;
 typedef LatticeSpinColourVectorD     LatticeFermionD;
 typedef LatticeSpinColourVectorD2    LatticeFermionD2;
 typedef LatticeSpinColourMatrix                LatticePropagator;
 typedef LatticeSpinColourMatrixF               LatticePropagatorF;
 typedef LatticeSpinColourMatrixD               LatticePropagatorD;
 typedef LatticeSpinColourMatrixD2              LatticePropagatorD2;
 typedef LatticeLorentzColourMatrix             LatticeGaugeField;
 typedef LatticeLorentzColourMatrixF            LatticeGaugeFieldF;
 typedef LatticeLorentzColourMatrixD            LatticeGaugeFieldD;
 typedef LatticeLorentzColourMatrixD2           LatticeGaugeFieldD2;
 typedef LatticeDoubleStoredColourMatrix        LatticeDoubledGaugeField;
 typedef LatticeDoubleStoredColourMatrixF       LatticeDoubledGaugeFieldF;
 typedef LatticeDoubleStoredColourMatrixD       LatticeDoubledGaugeFieldD;
 typedef LatticeDoubleStoredColourMatrixD2      LatticeDoubledGaugeFieldD2;
 template<class GF> using LorentzScalar = Lattice<iScalar<typename GF::vector_object::element> >;
 // Uhgg... typing this hurt  ;)
 // (my keyboard got burning hot when I typed this, must be the anti-Fermion)
 typedef Lattice<vColourVector>          LatticeStaggeredFermion;    
 typedef Lattice<vColourVectorF>         LatticeStaggeredFermionF;    
 typedef Lattice<vColourVectorD>         LatticeStaggeredFermionD;    
 typedef Lattice<vColourVectorD2>        LatticeStaggeredFermionD2;    
 typedef Lattice<vColourMatrix>          LatticeStaggeredPropagator; 
 typedef Lattice<vColourMatrixF>         LatticeStaggeredPropagatorF; 
 typedef Lattice<vColourMatrixD>         LatticeStaggeredPropagatorD; 
 typedef Lattice<vColourMatrixD2>        LatticeStaggeredPropagatorD2; 
 //////////////////////////////////////////////////////////////////////////////
 // Peek and Poke named after physics attributes
@@ -451,9 +521,20 @@ template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<Lorentz
 // Fermion <-> propagator assignements
 //////////////////////////////////////////////
 //template <class Prop, class Ferm>
 #define FAST_FERM_TO_PROP
 template <class Fimpl>
 void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
 {
 #ifdef FAST_FERM_TO_PROP
  autoView(p_v,p,CpuWrite);
  autoView(f_v,f,CpuRead);
  thread_for(idx,p_v.oSites(),{
      for(int ss = 0; ss < Ns; ++ss) {
      for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
 	p_v[idx]()(ss,s)(cc,c) = f_v[idx]()(ss)(cc); // Propagator sink index is LEFT, suitable for left mult by gauge link (e.g.)
      }}
    });
 #else
  for(int j = 0; j < Ns; ++j)
    {
      auto pjs = peekSpin(p, j, s);
@@ -465,12 +546,23 @@ void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::Fermio
 	}
      pokeSpin(p, pjs, j, s);
    }
 #endif
 }
 //template <class Prop, class Ferm>
 template <class Fimpl>
 void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
 {
 #ifdef FAST_FERM_TO_PROP
  autoView(p_v,p,CpuRead);
  autoView(f_v,f,CpuWrite);
  thread_for(idx,p_v.oSites(),{
      for(int ss = 0; ss < Ns; ++ss) {
      for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
 	f_v[idx]()(ss)(cc) = p_v[idx]()(ss,s)(cc,c); // LEFT index is copied across for s,c right index
      }}
    });
 #else
  for(int j = 0; j < Ns; ++j)
    {
      auto pjs = peekSpin(p, j, s);
@@ -482,6 +574,7 @@ void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::Propagato
 	}
      pokeSpin(f, fj, j);
    }
 #endif
 }
 //////////////////////////////////////////////
@@ -34,16 +34,96 @@ directory
 NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////
 // Smart configuration base class
 ///////////////////////////////////
 template< class Field >
 class ConfigurationBase
 {
 public:
  ConfigurationBase() {}
  virtual ~ConfigurationBase() {}
  virtual void set_Field(Field& U) =0;
  virtual void smeared_force(Field&) const = 0;
  virtual Field& get_SmearedU() =0;
  virtual Field &get_U(bool smeared = false) = 0;
 };
 template <class GaugeField >
 class Action 
 {
 public:
  bool is_smeared = false;
  RealD deriv_norm_sum;
  RealD deriv_max_sum;
  RealD Fdt_norm_sum;
  RealD Fdt_max_sum;
  int   deriv_num;
  RealD deriv_us;
  RealD S_us;
  RealD refresh_us;
  void  reset_timer(void)        {
    deriv_us = S_us = refresh_us = 0.0;
    deriv_norm_sum = deriv_max_sum=0.0;
    Fdt_max_sum =  Fdt_norm_sum = 0.0;
    deriv_num=0;
  }
  void  deriv_log(RealD nrm, RealD max,RealD Fdt_nrm,RealD Fdt_max) {
    if ( max > deriv_max_sum ) {
      deriv_max_sum=max;
    }
    deriv_norm_sum+=nrm;
    if ( Fdt_max > Fdt_max_sum ) {
      Fdt_max_sum=Fdt_max;
    }
    Fdt_norm_sum+=Fdt_nrm; deriv_num++;
  }
  RealD deriv_max_average(void)       { return deriv_max_sum; };
  RealD deriv_norm_average(void)      { return deriv_norm_sum/deriv_num; };
  RealD Fdt_max_average(void)         { return Fdt_max_sum; };
  RealD Fdt_norm_average(void)        { return Fdt_norm_sum/deriv_num; };
  RealD deriv_timer(void)        { return deriv_us; };
  RealD S_timer(void)            { return S_us; };
  RealD refresh_timer(void)      { return refresh_us; };
  void deriv_timer_start(void)   { deriv_us-=usecond(); }
  void deriv_timer_stop(void)    { deriv_us+=usecond(); }
  void refresh_timer_start(void) { refresh_us-=usecond(); }
  void refresh_timer_stop(void)  { refresh_us+=usecond(); }
  void S_timer_start(void)       { S_us-=usecond(); }
  void S_timer_stop(void)        { S_us+=usecond(); }
  /////////////////////////////
  // Heatbath?
  /////////////////////////////
  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
  virtual RealD S(const GaugeField& U) = 0;                             // evaluate the action
  virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ;  // if the refresh computes the action, can cache it. Alternately refreshAndAction() ?
  virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0;        // evaluate the action derivative
  /////////////////////////////////////////////////////////////
  // virtual smeared interface through configuration container
  /////////////////////////////////////////////////////////////
  virtual void refresh(ConfigurationBase<GaugeField> & U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
  {
    refresh(U.get_U(is_smeared),sRNG,pRNG);
  }
  virtual RealD S(ConfigurationBase<GaugeField>& U)
  {
    return S(U.get_U(is_smeared));
  }
  virtual RealD Sinitial(ConfigurationBase<GaugeField>& U) 
  {
    return Sinitial(U.get_U(is_smeared));
  }
  virtual void deriv(ConfigurationBase<GaugeField>& U, GaugeField& dSdU)
  {
    deriv(U.get_U(is_smeared),dSdU); 
    if ( is_smeared ) {
      U.smeared_force(dSdU);
    }
  }
  ///////////////////////////////
  // Logging
  ///////////////////////////////
  virtual std::string action_name()    = 0;                             // return the action name
  virtual std::string LogParameters()  = 0;                             // prints action parameters
  virtual ~Action(){}
@@ -30,6 +30,8 @@ directory
 #ifndef QCD_ACTION_CORE
 #define QCD_ACTION_CORE
 #include <Grid/qcd/action/gauge/GaugeImplementations.h>
 #include <Grid/qcd/action/ActionBase.h>
 NAMESPACE_CHECK(ActionBase);
 #include <Grid/qcd/action/ActionSet.h>
@@ -37,6 +39,10 @@ NAMESPACE_CHECK(ActionSet);
 #include <Grid/qcd/action/ActionParams.h>
 NAMESPACE_CHECK(ActionParams);
 #include <Grid/qcd/action/filters/MomentumFilter.h>
 #include <Grid/qcd/action/filters/DirichletFilter.h>
 #include <Grid/qcd/action/filters/DDHMCFilter.h>
 ////////////////////////////////////////////
 // Gauge Actions
 ////////////////////////////////////////////
@@ -34,27 +34,45 @@ directory
 NAMESPACE_BEGIN(Grid);
-// These can move into a params header and be given MacroMagic serialisation
+
 struct GparityWilsonImplParams {
  Coordinate twists;
-  GparityWilsonImplParams() : twists(Nd, 0) {};
+                     //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
  Coordinate dirichlet; // Blocksize of dirichlet BCs
  int  partialDirichlet;
  GparityWilsonImplParams() : twists(Nd, 0) {
    dirichlet.resize(0);
    partialDirichlet=0;
  };
 };
 struct WilsonImplParams {
  bool overlapCommsCompute;
  Coordinate dirichlet; // Blocksize of dirichlet BCs
  int  partialDirichlet;
  AcceleratorVector<Real,Nd> twist_n_2pi_L;
  AcceleratorVector<Complex,Nd> boundary_phases;
  WilsonImplParams()  {
    dirichlet.resize(0);
    partialDirichlet=0;
    boundary_phases.resize(Nd, 1.0);
      twist_n_2pi_L.resize(Nd, 0.0);
  };
  WilsonImplParams(const AcceleratorVector<Complex,Nd> phi) : boundary_phases(phi), overlapCommsCompute(false) {
    twist_n_2pi_L.resize(Nd, 0.0);
    partialDirichlet=0;
    dirichlet.resize(0);
  }
 };
 struct StaggeredImplParams {
-  StaggeredImplParams()  {};
+  Coordinate dirichlet; // Blocksize of dirichlet BCs
  int  partialDirichlet;
  StaggeredImplParams()
  {
    partialDirichlet=0;
    dirichlet.resize(0);
  };
 };
  struct OneFlavourRationalParams : Serializable {
@@ -63,9 +81,11 @@ struct StaggeredImplParams {
 				    RealD, hi, 
 				    int,   MaxIter, 
 				    RealD, tolerance, 
 				    RealD, mdtolerance, 
 				    int,   degree, 
 				    int,   precision,
-				    int,   BoundsCheckFreq);
+				    int,   BoundsCheckFreq,
 				    RealD, BoundsCheckTol);
  // MaxIter and tolerance, vectors??
@@ -76,16 +96,62 @@ struct StaggeredImplParams {
 				RealD tol      = 1.0e-8, 
                           	int _degree    = 10,
 				int _precision = 64,
-				int _BoundsCheckFreq=20)
+				int _BoundsCheckFreq=20,
 				RealD mdtol    = 1.0e-6,
 				double _BoundsCheckTol=1e-6)
      : lo(_lo),
 	hi(_hi),
 	MaxIter(_maxit),
 	tolerance(tol),
        mdtolerance(mdtol),
 	degree(_degree),
        precision(_precision),
        BoundsCheckFreq(_BoundsCheckFreq),
        BoundsCheckTol(_BoundsCheckTol){};
  };
  /*Action parameters for the generalized rational action
    The approximation is for (M^dag M)^{1/inv_pow}
    where inv_pow is the denominator of the fractional power.
    Default inv_pow=2 for square root, making this equivalent to 
    the OneFlavourRational action
  */
    struct RationalActionParams : Serializable {
    GRID_SERIALIZABLE_CLASS_MEMBERS(RationalActionParams, 
 				    int, inv_pow, 
 				    RealD, lo, //low eigenvalue bound of rational approx
 				    RealD, hi, //high eigenvalue bound of rational approx
 				    int,   MaxIter,  //maximum iterations in msCG
 				    RealD, action_tolerance,  //msCG tolerance in action evaluation
 				    int,   action_degree, //rational approx tolerance in action evaluation
 				    RealD, md_tolerance,  //msCG tolerance in MD integration
 				    int,   md_degree, //rational approx tolerance in MD integration
 				    int,   precision, //precision of floating point arithmetic
 				    int,   BoundsCheckFreq); //frequency the approximation is tested (with Metropolis degree/tolerance); 0 disables the check
  // constructor 
  RationalActionParams(int _inv_pow = 2,
 		       RealD _lo      = 0.0, 
 		       RealD _hi      = 1.0, 
 		       int _maxit     = 1000,
 		       RealD _action_tolerance      = 1.0e-8, 
 		       int _action_degree    = 10,
 		       RealD _md_tolerance      = 1.0e-8, 
 		       int _md_degree    = 10,
 		       int _precision = 64,
 		       int _BoundsCheckFreq=20)
    : inv_pow(_inv_pow), 
      lo(_lo),
      hi(_hi),
      MaxIter(_maxit),
      action_tolerance(_action_tolerance),
      action_degree(_action_degree),
      md_tolerance(_md_tolerance),
      md_degree(_md_degree),
      precision(_precision),
      BoundsCheckFreq(_BoundsCheckFreq){};
  };
 NAMESPACE_END(Grid);
 #endif
@@ -68,9 +68,17 @@ public:
  ///////////////////////////////////////////////////////////////
  // Support for MADWF tricks
  ///////////////////////////////////////////////////////////////
-  RealD Mass(void) { return mass; };
+  RealD Mass(void) { return (mass_plus + mass_minus) / 2.0; };
  RealD MassPlus(void) { return mass_plus; };
  RealD MassMinus(void) { return mass_minus; };
  void  SetMass(RealD _mass) { 
-    mass=_mass; 
+    mass_plus=mass_minus=_mass; 
    SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs
  } ;
  void  SetMass(RealD _mass_plus, RealD _mass_minus) { 
    mass_plus=_mass_plus;
    mass_minus=_mass_minus;
    SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs
  } ;
  void  P(const FermionField &psi, FermionField &chi);
@@ -108,7 +116,7 @@ public:
  void   MeooeDag5D    (const FermionField &in, FermionField &out);
  //    protected:
-  RealD mass;
+  RealD mass_plus, mass_minus;
  // Save arguments to SetCoefficientsInternal
  Vector<Coeff_t> _gamma;
@@ -175,16 +183,6 @@ public:
 		  GridRedBlackCartesian &FourDimRedBlackGrid,
 		  RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
  void CayleyReport(void);
  void CayleyZeroCounters(void);
  double M5Dflops;
  double M5Dcalls;
  double M5Dtime;
  double MooeeInvFlops;
  double MooeeInvCalls;
  double MooeeInvTime;
 protected:
  virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
@@ -0,0 +1,334 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/WilsonCloverFermionImplementation.h
    Copyright (C) 2017 - 2022
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    Author: Mattia Bruno <mattia.bruno@cern.ch>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 #include <Grid/Grid.h>
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
 ////////////////////////////////////////////
 // Standard Clover
 //   (4+m0) + csw * clover_term
 // Exp Clover
 //   (4+m0) * exp(csw/(4+m0) clover_term)
 //   = (4+m0) + csw * clover_term + ...
 ////////////////////////////////////////////
 NAMESPACE_BEGIN(Grid);
 //////////////////////////////////
 // Generic Standard Clover
 //////////////////////////////////
 template<class Impl>
 class CloverHelpers: public WilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  typedef WilsonCloverHelpers<Impl> Helpers;
  static void Instantiate(CloverField& CloverTerm, CloverField& CloverTermInv, RealD csw_t, RealD diag_mass) {
    GridBase *grid = CloverTerm.Grid();
    CloverTerm += diag_mass;
    int lvol = grid->lSites();
    int DimRep = Impl::Dimension;
    {
      autoView(CTv,CloverTerm,CpuRead);
      autoView(CTIv,CloverTermInv,CpuWrite);
      thread_for(site, lvol, {
        Coordinate lcoor;
        grid->LocalIndexToLocalCoor(site, lcoor);
        Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
        Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
        typename SiteClover::scalar_object Qx = Zero(), Qxinv = Zero();
        peekLocalSite(Qx, CTv, lcoor);
        for (int j = 0; j < Ns; j++)
          for (int k = 0; k < Ns; k++)
            for (int a = 0; a < DimRep; a++)
              for (int b = 0; b < DimRep; b++){
                auto zz =  Qx()(j, k)(a, b);
                EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
              }
        EigenInvCloverOp = EigenCloverOp.inverse();
        for (int j = 0; j < Ns; j++)
          for (int k = 0; k < Ns; k++)
            for (int a = 0; a < DimRep; a++)
              for (int b = 0; b < DimRep; b++)
                Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
               pokeLocalSite(Qxinv, CTIv, lcoor);
      });
    }
  }
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    return Helpers::Cmunu(U, lambda, mu, nu);
  }
 };
 //////////////////////////////////
 // Generic Exp Clover
 //////////////////////////////////
 template<class Impl>
 class ExpCloverHelpers: public WilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
  typedef WilsonCloverHelpers<Impl> Helpers;
  // Can this be avoided?
  static void IdentityTimesC(const CloverField& in, RealD c) {
    int DimRep = Impl::Dimension;
    autoView(in_v, in, AcceleratorWrite);
    accelerator_for(ss, in.Grid()->oSites(), 1, {
      for (int sa=0; sa<Ns; sa++)
        for (int ca=0; ca<DimRep; ca++)
          in_v[ss]()(sa,sa)(ca,ca) = c;
    });
  }
  static int getNMAX(RealD prec, RealD R) {
    /* compute stop condition for exponential */
    int NMAX=1;
    RealD cond=R*R/2.;
    while (cond*std::exp(R)>prec) {
      NMAX++;
      cond*=R/(double)(NMAX+1);
    }
    return NMAX;
  }
  static int getNMAX(Lattice<iImplClover<vComplexD2>> &t, RealD R) {return getNMAX(1e-12,R);}
  static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
  static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
  static void Instantiate(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
    GridBase* grid = Clover.Grid();
    CloverField ExpClover(grid);
    int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);
    Clover *= (1.0/diag_mass);
    // Taylor expansion, slow but generic
    // Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
    // qN = cN
    // qn = cn + qn+1 X
    std::vector<RealD> cn(NMAX+1);
    cn[0] = 1.0;
    for (int i=1; i<=NMAX; i++)
      cn[i] = cn[i-1] / RealD(i);
    ExpClover = Zero();
    IdentityTimesC(ExpClover, cn[NMAX]);
    for (int i=NMAX-1; i>=0; i--)
      ExpClover = ExpClover * Clover + cn[i];
    // prepare inverse
    CloverInv = (-1.0)*Clover;
    Clover = ExpClover * diag_mass;
    ExpClover = Zero();
    IdentityTimesC(ExpClover, cn[NMAX]);
    for (int i=NMAX-1; i>=0; i--)
      ExpClover = ExpClover * CloverInv + cn[i];
    CloverInv = ExpClover * (1.0/diag_mass);
  }
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    assert(0);
    return lambda;
  }
 };
 //////////////////////////////////
 // Compact Standard Clover
 //////////////////////////////////
 template<class Impl>
 class CompactCloverHelpers: public CompactWilsonCloverHelpers<Impl>,
                            public WilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);
  typedef WilsonCloverHelpers<Impl> Helpers;
  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
  static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
    Clover += diag_mass;
  }
  static void InvertClover(CloverField& InvClover,
                            const CloverDiagonalField& diagonal,
                            const CloverTriangleField& triangle,
                            CloverDiagonalField&       diagonalInv,
                            CloverTriangleField&       triangleInv,
                            bool fixedBoundaries) {
    CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
  }
  // TODO: implement Cmunu for better performances with compact layout, but don't do it
  // here, but rather in WilsonCloverHelpers.h -> CompactWilsonCloverHelpers
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    return Helpers::Cmunu(U, lambda, mu, nu);
  }
 };
 //////////////////////////////////
 // Compact Exp Clover
 //////////////////////////////////
 template<class Impl>
 class CompactExpCloverHelpers: public CompactWilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);
  template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
  // Can this be avoided?
  static void IdentityTimesC(const CloverField& in, RealD c) {
    int DimRep = Impl::Dimension;
    autoView(in_v, in, AcceleratorWrite);
    accelerator_for(ss, in.Grid()->oSites(), 1, {
      for (int sa=0; sa<Ns; sa++)
        for (int ca=0; ca<DimRep; ca++)
          in_v[ss]()(sa,sa)(ca,ca) = c;
    });
  }
  static int getNMAX(RealD prec, RealD R) {
    /* compute stop condition for exponential */
    int NMAX=1;
    RealD cond=R*R/2.;
    while (cond*std::exp(R)>prec) {
      NMAX++;
      cond*=R/(double)(NMAX+1);
    }
    return NMAX;
  }
  static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
  static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
  static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
    GridBase* grid = Clover.Grid();
    CloverField ExpClover(grid);
    int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);
    Clover *= (1.0/diag_mass);
    // Taylor expansion, slow but generic
    // Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
    // qN = cN
    // qn = cn + qn+1 X
    std::vector<RealD> cn(NMAX+1);
    cn[0] = 1.0;
    for (int i=1; i<=NMAX; i++)
      cn[i] = cn[i-1] / RealD(i);
    ExpClover = Zero();
    IdentityTimesC(ExpClover, cn[NMAX]);
    for (int i=NMAX-1; i>=0; i--)
      ExpClover = ExpClover * Clover + cn[i];
    // prepare inverse
    CloverInv = (-1.0)*Clover;
    Clover = ExpClover * diag_mass;
    ExpClover = Zero();
    IdentityTimesC(ExpClover, cn[NMAX]);
    for (int i=NMAX-1; i>=0; i--)
      ExpClover = ExpClover * CloverInv + cn[i];
    CloverInv = ExpClover * (1.0/diag_mass);
  }
  static void InvertClover(CloverField& InvClover,
                            const CloverDiagonalField& diagonal,
                            const CloverTriangleField& triangle,
                            CloverDiagonalField&       diagonalInv,
                            CloverTriangleField&       triangleInv,
                            bool fixedBoundaries) {
    if (fixedBoundaries)
    {
      CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
    }
    else
    {
      CompactHelpers::ConvertLayout(InvClover, diagonalInv, triangleInv);
    }
  }
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    assert(0);
    return lambda;
  }
 };
 NAMESPACE_END(Grid);
@@ -0,0 +1,241 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion.h
    Copyright (C) 2020 - 2022
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    Author: Nils Meyer <nils.meyer@ur.de>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 #include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
 // see Grid/qcd/action/fermion/WilsonCloverFermion.h for description
 //
 // Modifications done here:
 //
 // Original: clover term = 12x12 matrix per site
 //
 // But: Only two diagonal 6x6 hermitian blocks are non-zero (also true for original, verified by running)
 // Sufficient to store/transfer only the real parts of the diagonal and one triangular part
 // 2 * (6 + 15 * 2) = 72 real or 36 complex words to be stored/transfered
 //
 // Here: Above but diagonal as complex numbers, i.e., need to store/transfer
 // 2 * (6 * 2 + 15 * 2) = 84 real or 42 complex words
 //
 // Words per site and improvement compared to original (combined with the input and output spinors):
 //
 // - Original: 2*12 + 12*12 = 168 words -> 1.00 x less
 // - Minimal:  2*12 + 36    =  60 words -> 2.80 x less
 // - Here:     2*12 + 42    =  66 words -> 2.55 x less
 //
 // These improvements directly translate to wall-clock time
 //
 // Data layout:
 //
 // - diagonal and triangle part as separate lattice fields,
 //   this was faster than as 1 combined field on all tested machines
 // - diagonal: as expected
 // - triangle: store upper right triangle in row major order
 // - graphical:
 //        0  1  2  3  4
 //           5  6  7  8
 //              9 10 11 = upper right triangle indices
 //                12 13
 //                   14
 //     0
 //        1
 //           2
 //              3       = diagonal indices
 //                 4
 //                    5
 //     0
 //     1  5
 //     2  6  9          = lower left triangle indices
 //     3  7 10 12
 //     4  8 11 13 14
 //
 // Impact on total memory consumption:
 // - Original: (2 * 1 + 8 * 1/2) 12x12 matrices = 6 12x12 matrices = 864 complex words per site
 // - Here:     (2 * 1 + 4 * 1/2) diagonal parts = 4 diagonal parts =  24 complex words per site
 //           + (2 * 1 + 4 * 1/2) triangle parts = 4 triangle parts =  60 complex words per site
 //                                                                 =  84 complex words per site
 template<class Impl, class CloverHelpers>
 class CompactWilsonCloverFermion : public WilsonFermion<Impl>,
                                   public WilsonCloverHelpers<Impl>,
                                   public CompactWilsonCloverHelpers<Impl> {
  /////////////////////////////////////////////
  // Sizes
  /////////////////////////////////////////////
 public:
  INHERIT_COMPACT_CLOVER_SIZES(Impl);
  /////////////////////////////////////////////
  // Type definitions
  /////////////////////////////////////////////
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);
  typedef WilsonFermion<Impl>              WilsonBase;
  typedef WilsonCloverHelpers<Impl>        Helpers;
  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
  /////////////////////////////////////////////
  // Constructors
  /////////////////////////////////////////////
 public:
  CompactWilsonCloverFermion(GaugeField& _Umu,
 			    GridCartesian& Fgrid,
 			    GridRedBlackCartesian& Hgrid,
 			    const RealD _mass,
 			    const RealD _csw_r = 0.0,
 			    const RealD _csw_t = 0.0,
 			    const RealD _cF = 1.0,
 			    const WilsonAnisotropyCoefficients& clover_anisotropy = WilsonAnisotropyCoefficients(),
 			    const ImplParams& impl_p = ImplParams());
  /////////////////////////////////////////////
  // Member functions (implementing interface)
  /////////////////////////////////////////////
 public:
  virtual void Instantiatable() {};
  int          ConstEE()     override { return 0; };
  int          isTrivialEE() override { return 0; };
  void Dhop(const FermionField& in, FermionField& out, int dag) override;
  void DhopOE(const FermionField& in, FermionField& out, int dag) override;
  void DhopEO(const FermionField& in, FermionField& out, int dag) override;
  void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
  void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;
  void M(const FermionField& in, FermionField& out) override;
  void Mdag(const FermionField& in, FermionField& out) override;
  void Meooe(const FermionField& in, FermionField& out) override;
  void MeooeDag(const FermionField& in, FermionField& out) override;
  void Mooee(const FermionField& in, FermionField& out) override;
  void MooeeDag(const FermionField& in, FermionField& out) override;
  void MooeeInv(const FermionField& in, FermionField& out) override;
  void MooeeInvDag(const FermionField& in, FermionField& out) override;
  void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
  void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;
  void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
  void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
  void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
  /////////////////////////////////////////////
  // Member functions (internals)
  /////////////////////////////////////////////
  void MooeeInternal(const FermionField&        in,
                     FermionField&              out,
                     const CloverDiagonalField& diagonal,
                     const CloverTriangleField& triangle);
  /////////////////////////////////////////////
  // Helpers
  /////////////////////////////////////////////
  void ImportGauge(const GaugeField& _Umu) override;
  /////////////////////////////////////////////
  // Helpers
  /////////////////////////////////////////////
 private:
  template<class Field>
  const MaskField* getCorrectMaskField(const Field &in) const {
    if(in.Grid()->_isCheckerBoarded) {
      if(in.Checkerboard() == Odd) {
        return &this->BoundaryMaskOdd;
      } else {
        return &this->BoundaryMaskEven;
      }
    } else {
      return &this->BoundaryMask;
    }
  }
  template<class Field>
  void ApplyBoundaryMask(Field& f) {
    const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
    assert(m != nullptr);
    CompactHelpers::ApplyBoundaryMask(f, *m);
  }
  /////////////////////////////////////////////
  // Member Data
  /////////////////////////////////////////////
 public:
  RealD csw_r;
  RealD csw_t;
  RealD cF;
  bool fixedBoundaries;
  CloverDiagonalField Diagonal,    DiagonalEven,    DiagonalOdd;
  CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
  CloverTriangleField Triangle,    TriangleEven,    TriangleOdd;
  CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
  FermionField Tmp;
  MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
 };
 NAMESPACE_END(Grid);
@@ -0,0 +1,291 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/DWFSlow.h
 Copyright (C) 2022
 Author: Peter Boyle <pboyle@bnl.gov>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 template <class Impl>
 class DWFSlowFermion : public FermionOperator<Impl>
 {
 public:
  INHERIT_IMPL_TYPES(Impl);
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
  ///////////////////////////////////////////////////////////////
  GridBase *GaugeGrid(void) { return _grid4; }
  GridBase *GaugeRedBlackGrid(void) { return _cbgrid4; }
  GridBase *FermionGrid(void) { return _grid; }
  GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }
  //////////////////////////////////////////////////////////////////
  // override multiply; cut number routines if pass dagger argument
  // and also make interface more uniformly consistent
  //////////////////////////////////////////////////////////////////
  virtual void  M(const FermionField &in, FermionField &out)
  {
    FermionField tmp(_grid);
    out = (5.0 - M5) * in;
    Dhop(in,tmp,DaggerNo);
    out = out + tmp;
  }
  virtual void  Mdag(const FermionField &in, FermionField &out)
  {
    FermionField tmp(_grid);
    out = (5.0 - M5) * in;
    Dhop(in,tmp,DaggerYes);
    out = out + tmp;
  };
  /////////////////////////////////////////////////////////
  // half checkerboard operations 5D redblack so just site identiy
  /////////////////////////////////////////////////////////
  void Meooe(const FermionField &in, FermionField &out)
  {
    if ( in.Checkerboard() == Odd ) {
      this->DhopEO(in,out,DaggerNo);
    } else {
      this->DhopOE(in,out,DaggerNo);
    }
  }
  void MeooeDag(const FermionField &in, FermionField &out)
  {
    if ( in.Checkerboard() == Odd ) {
      this->DhopEO(in,out,DaggerYes);
    } else {
      this->DhopOE(in,out,DaggerYes);
    }
  };
  // allow override for twisted mass and clover
  virtual void Mooee(const FermionField &in, FermionField &out)
  {
    out = (5.0 - M5) * in;
  }
  virtual void MooeeDag(const FermionField &in, FermionField &out)
  {
    out = (5.0 - M5) * in;
  }
  virtual void MooeeInv(const FermionField &in, FermionField &out)
  {
    out = (1.0/(5.0 - M5)) * in;
  };
  virtual void MooeeInvDag(const FermionField &in, FermionField &out)
  {
    out = (1.0/(5.0 - M5)) * in;
  };
  virtual void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _mass,std::vector<double> twist) {} ;
  ////////////////////////
  // Derivative interface
  ////////////////////////
  // Interface calls an internal routine
  void DhopDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)  { assert(0);};
  void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){ assert(0);};
  void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){ assert(0);};
  ///////////////////////////////////////////////////////////////
  // non-hermitian hopping term; half cb or both
  ///////////////////////////////////////////////////////////////
  void Dhop(const FermionField &in, FermionField &out, int dag)
  {
    FermionField tmp(in.Grid());
    Dhop5(in,out,MassField,MassField,dag );
    for(int mu=0;mu<4;mu++){
      DhopDirU(in,Umu[mu],Umu[mu],tmp,mu,dag );    out = out + tmp;
    }
  };
  void DhopOE(const FermionField &in, FermionField &out, int dag)
  {
    FermionField tmp(in.Grid());
    assert(in.Checkerboard()==Even);
    Dhop5(in,out,MassFieldOdd,MassFieldEven,dag);
    for(int mu=0;mu<4;mu++){
      DhopDirU(in,UmuOdd[mu],UmuEven[mu],tmp,mu,dag );    out = out + tmp;
    }
  };
  void DhopEO(const FermionField &in, FermionField &out, int dag)
  {
    FermionField tmp(in.Grid());
    assert(in.Checkerboard()==Odd);
    Dhop5(in,out, MassFieldEven,MassFieldOdd ,dag );  
    for(int mu=0;mu<4;mu++){
      DhopDirU(in,UmuEven[mu],UmuOdd[mu],tmp,mu,dag );    out = out + tmp;
    }
  };
  ///////////////////////////////////////////////////////////////
  // Multigrid assistance; force term uses too
  ///////////////////////////////////////////////////////////////
  void Mdir(const FermionField &in, FermionField &out, int dir, int disp){ assert(0);};
  void MdirAll(const FermionField &in, std::vector<FermionField> &out)   { assert(0);};
  void DhopDir(const FermionField &in, FermionField &out, int dir, int disp) { assert(0);};
  void DhopDirAll(const FermionField &in, std::vector<FermionField> &out)    { assert(0);};
  void DhopDirCalc(const FermionField &in, FermionField &out, int dirdisp,int gamma, int dag) { assert(0);};
  void DhopDirU(const FermionField &in, const GaugeLinkField &U5e, const GaugeLinkField &U5o, FermionField &out, int mu, int dag)
  {
    RealD     sgn= 1.0;
    if (dag ) sgn=-1.0;
    Gamma::Algebra Gmu [] = {
 			 Gamma::Algebra::GammaX,
 			 Gamma::Algebra::GammaY,
 			 Gamma::Algebra::GammaZ,
 			 Gamma::Algebra::GammaT
    };
    //    mass is  1,1,1,1,-m has to multiply the round the world term
    FermionField tmp (in.Grid());
    tmp = U5e * Cshift(in,mu+1,1);
    out = tmp - Gamma(Gmu[mu])*tmp*sgn;
    tmp = Cshift(adj(U5o)*in,mu+1,-1);
    out = out + tmp + Gamma(Gmu[mu])*tmp*sgn;
    out = -0.5*out;
  };
  void Dhop5(const FermionField &in, FermionField &out, ComplexField &massE, ComplexField &massO, int dag)
  {
    // Mass term.... must multiple the round world with mass = 1,1,1,1, -m
    RealD     sgn= 1.0;
    if (dag ) sgn=-1.0;
    Gamma G5(Gamma::Algebra::Gamma5);
    FermionField tmp (in.Grid());
    tmp = massE*Cshift(in,0,1);
    out = tmp - G5*tmp*sgn;
    tmp = Cshift(massO*in,0,-1);
    out = out + tmp + G5*tmp*sgn;
    out = -0.5*out;
  };
  // Constructor
  DWFSlowFermion(GaugeField &_Umu, GridCartesian &Fgrid,
 		 GridRedBlackCartesian &Hgrid, RealD _mass, RealD _M5)
    :
    _grid(&Fgrid),
    _cbgrid(&Hgrid),
    _grid4(_Umu.Grid()),
    Umu(Nd,&Fgrid),
    UmuEven(Nd,&Hgrid),
    UmuOdd(Nd,&Hgrid),
    MassField(&Fgrid),
    MassFieldEven(&Hgrid),
    MassFieldOdd(&Hgrid),
    M5(_M5),
    mass(_mass),
    _tmp(&Hgrid)
    {
      Ls=Fgrid._fdimensions[0];
      ImportGauge(_Umu);
      typedef typename FermionField::scalar_type scalar;
      Lattice<iScalar<vInteger> > coor(&Fgrid);
      LatticeCoordinate(coor, 0); // Scoor
      ComplexField one(&Fgrid);
      MassField =scalar(-mass);
      one       =scalar(1.0);
      MassField =where(coor==Integer(Ls-1),MassField,one);
      for(int mu=0;mu<Nd;mu++){
 	pickCheckerboard(Even,UmuEven[mu],Umu[mu]);
 	pickCheckerboard(Odd ,UmuOdd[mu],Umu[mu]);
      }
      pickCheckerboard(Even,MassFieldEven,MassField);
      pickCheckerboard(Odd ,MassFieldOdd,MassField);
    }
  // DoubleStore impl dependent
  void ImportGauge(const GaugeField &_Umu4)
  {
    GaugeLinkField U4(_grid4);
    for(int mu=0;mu<Nd;mu++){
      U4 = PeekIndex<LorentzIndex>(_Umu4, mu);
      for(int s=0;s<this->Ls;s++){
 	InsertSlice(U4,Umu[mu],s,0);
      }
    }
  }
  ///////////////////////////////////////////////////////////////
  // Data members require to support the functionality
  ///////////////////////////////////////////////////////////////
 public:
  virtual RealD Mass(void) { return mass; }
  virtual int   isTrivialEE(void) { return 1; };
  RealD mass;
  RealD M5;
  int Ls;
  GridBase *_grid4;
  GridBase *_grid;
  GridBase *_cbgrid4;
  GridBase *_cbgrid;
  // Copy of the gauge field , with even and odd subsets
  std::vector<GaugeLinkField> Umu;
  std::vector<GaugeLinkField> UmuEven;
  std::vector<GaugeLinkField> UmuOdd;
  ComplexField MassField;
  ComplexField MassFieldEven;
  ComplexField MassFieldOdd;
  ///////////////////////////////////////////////////////////////
  // Conserved current utilities
  ///////////////////////////////////////////////////////////////
  void ContractConservedCurrent(PropagatorField &q_in_1,
                                PropagatorField &q_in_2,
                                PropagatorField &q_out,
                                PropagatorField &phys_src,
                                Current curr_type,
                                unsigned int mu){}
  void SeqConservedCurrent(PropagatorField &q_in,
                           PropagatorField &q_out,
                           PropagatorField &phys_src,
                           Current curr_type,
                           unsigned int mu,
                           unsigned int tmin,
 			   unsigned int tmax,
 			   ComplexField &lattice_cmplx){}
 };
 typedef DWFSlowFermion<WilsonImplF> DWFSlowFermionF;
 typedef DWFSlowFermion<WilsonImplD> DWFSlowFermionD;
 NAMESPACE_END(Grid);
@@ -47,12 +47,14 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 ////////////////////////////////////////////
 // Fermion operators / actions
 ////////////////////////////////////////////
 #include <Grid/qcd/action/fermion/DWFSlow.h>       // Slow DWF
 #include <Grid/qcd/action/fermion/WilsonFermion.h>       // 4d wilson like
 NAMESPACE_CHECK(Wilson);
 #include <Grid/qcd/action/fermion/WilsonTMFermion.h>       // 4d wilson like
 NAMESPACE_CHECK(WilsonTM);
 #include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
 NAMESPACE_CHECK(WilsonClover);
 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
 NAMESPACE_CHECK(Wilson5D);
@@ -111,183 +113,151 @@ NAMESPACE_CHECK(DWFutils);
 // Cayley 5d
 NAMESPACE_BEGIN(Grid);
-typedef WilsonFermion<WilsonImplR> WilsonFermionR;
+typedef WilsonFermion<WilsonImplD2> WilsonFermionD2;
 typedef WilsonFermion<WilsonImplF> WilsonFermionF;
 typedef WilsonFermion<WilsonImplD> WilsonFermionD;
 //typedef WilsonFermion<WilsonImplRL> WilsonFermionRL;
 //typedef WilsonFermion<WilsonImplFH> WilsonFermionFH;
 //typedef WilsonFermion<WilsonImplDF> WilsonFermionDF;
 typedef WilsonFermion<WilsonAdjImplR> WilsonAdjFermionR;
 typedef WilsonFermion<WilsonAdjImplF> WilsonAdjFermionF;
 typedef WilsonFermion<WilsonAdjImplD> WilsonAdjFermionD;
 typedef WilsonFermion<WilsonTwoIndexSymmetricImplR> WilsonTwoIndexSymmetricFermionR;
 typedef WilsonFermion<WilsonTwoIndexSymmetricImplF> WilsonTwoIndexSymmetricFermionF;
 typedef WilsonFermion<WilsonTwoIndexSymmetricImplD> WilsonTwoIndexSymmetricFermionD;
 typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonTwoIndexAntiSymmetricFermionR;
 typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonTwoIndexAntiSymmetricFermionF;
 typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonTwoIndexAntiSymmetricFermionD;
 // Twisted mass fermion
-typedef WilsonTMFermion<WilsonImplR> WilsonTMFermionR;
+typedef WilsonTMFermion<WilsonImplD2> WilsonTMFermionD2;
 typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
 typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
 // Clover fermions
-typedef WilsonCloverFermion<WilsonImplR> WilsonCloverFermionR;
+template <typename WImpl> using WilsonClover = WilsonCloverFermion<WImpl, CloverHelpers<WImpl>>;
-typedef WilsonCloverFermion<WilsonImplF> WilsonCloverFermionF;
+template <typename WImpl> using WilsonExpClover = WilsonCloverFermion<WImpl, ExpCloverHelpers<WImpl>>;
 typedef WilsonCloverFermion<WilsonImplD> WilsonCloverFermionD;
-typedef WilsonCloverFermion<WilsonAdjImplR> WilsonCloverAdjFermionR;
+typedef WilsonClover<WilsonImplD2> WilsonCloverFermionD2;
-typedef WilsonCloverFermion<WilsonAdjImplF> WilsonCloverAdjFermionF;
+typedef WilsonClover<WilsonImplF> WilsonCloverFermionF;
-typedef WilsonCloverFermion<WilsonAdjImplD> WilsonCloverAdjFermionD;
+typedef WilsonClover<WilsonImplD> WilsonCloverFermionD;
-typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
+typedef WilsonExpClover<WilsonImplD2> WilsonExpCloverFermionD2;
-typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
+typedef WilsonExpClover<WilsonImplF> WilsonExpCloverFermionF;
-typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
+typedef WilsonExpClover<WilsonImplD> WilsonExpCloverFermionD;
-typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
+typedef WilsonClover<WilsonAdjImplF> WilsonCloverAdjFermionF;
-typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
+typedef WilsonClover<WilsonAdjImplD> WilsonCloverAdjFermionD;
-typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
+
 typedef WilsonClover<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
 typedef WilsonClover<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
 typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
 typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
 // Compact Clover fermions
 template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
 template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
 typedef CompactWilsonClover<WilsonImplD2> CompactWilsonCloverFermionD2;
 typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
 typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
 typedef CompactWilsonExpClover<WilsonImplD2> CompactWilsonExpCloverFermionD2;
 typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
 typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
 typedef CompactWilsonClover<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
 typedef CompactWilsonClover<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
 typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
 typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
 typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
 typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
 // Domain Wall fermions
 typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
 typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
 typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
 typedef DomainWallFermion<WilsonImplD2> DomainWallFermionD2;
-//typedef DomainWallFermion<WilsonImplRL> DomainWallFermionRL;
+typedef DomainWallEOFAFermion<WilsonImplD2> DomainWallEOFAFermionD2;
 //typedef DomainWallFermion<WilsonImplFH> DomainWallFermionFH;
 //typedef DomainWallFermion<WilsonImplDF> DomainWallFermionDF;
 typedef DomainWallEOFAFermion<WilsonImplR> DomainWallEOFAFermionR;
 typedef DomainWallEOFAFermion<WilsonImplF> DomainWallEOFAFermionF;
 typedef DomainWallEOFAFermion<WilsonImplD> DomainWallEOFAFermionD;
-//typedef DomainWallEOFAFermion<WilsonImplRL> DomainWallEOFAFermionRL;
+typedef MobiusFermion<WilsonImplD2> MobiusFermionD2;
 //typedef DomainWallEOFAFermion<WilsonImplFH> DomainWallEOFAFermionFH;
 //typedef DomainWallEOFAFermion<WilsonImplDF> DomainWallEOFAFermionDF;
 typedef MobiusFermion<WilsonImplR> MobiusFermionR;
 typedef MobiusFermion<WilsonImplF> MobiusFermionF;
 typedef MobiusFermion<WilsonImplD> MobiusFermionD;
-//typedef MobiusFermion<WilsonImplRL> MobiusFermionRL;
+typedef MobiusEOFAFermion<WilsonImplD2> MobiusEOFAFermionD2;
 //typedef MobiusFermion<WilsonImplFH> MobiusFermionFH;
 //typedef MobiusFermion<WilsonImplDF> MobiusFermionDF;
 typedef MobiusEOFAFermion<WilsonImplR> MobiusEOFAFermionR;
 typedef MobiusEOFAFermion<WilsonImplF> MobiusEOFAFermionF;
 typedef MobiusEOFAFermion<WilsonImplD> MobiusEOFAFermionD;
-//typedef MobiusEOFAFermion<WilsonImplRL> MobiusEOFAFermionRL;
+typedef ZMobiusFermion<ZWilsonImplD2> ZMobiusFermionD2;
 //typedef MobiusEOFAFermion<WilsonImplFH> MobiusEOFAFermionFH;
 //typedef MobiusEOFAFermion<WilsonImplDF> MobiusEOFAFermionDF;
 typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
 typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
 typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
-//typedef ZMobiusFermion<ZWilsonImplRL> ZMobiusFermionRL;
+typedef ScaledShamirFermion<WilsonImplD2> ScaledShamirFermionD2;
 //typedef ZMobiusFermion<ZWilsonImplFH> ZMobiusFermionFH;
 //typedef ZMobiusFermion<ZWilsonImplDF> ZMobiusFermionDF;
 // Ls vectorised
 typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
 typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
 typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
-typedef MobiusZolotarevFermion<WilsonImplR> MobiusZolotarevFermionR;
+typedef MobiusZolotarevFermion<WilsonImplD2> MobiusZolotarevFermionD2;
 typedef MobiusZolotarevFermion<WilsonImplF> MobiusZolotarevFermionF;
 typedef MobiusZolotarevFermion<WilsonImplD> MobiusZolotarevFermionD;
-typedef ShamirZolotarevFermion<WilsonImplR> ShamirZolotarevFermionR;
+typedef ShamirZolotarevFermion<WilsonImplD2> ShamirZolotarevFermionD2;
 typedef ShamirZolotarevFermion<WilsonImplF> ShamirZolotarevFermionF;
 typedef ShamirZolotarevFermion<WilsonImplD> ShamirZolotarevFermionD;
-typedef OverlapWilsonCayleyTanhFermion<WilsonImplR> OverlapWilsonCayleyTanhFermionR;
+typedef OverlapWilsonCayleyTanhFermion<WilsonImplD2> OverlapWilsonCayleyTanhFermionD2;
 typedef OverlapWilsonCayleyTanhFermion<WilsonImplF> OverlapWilsonCayleyTanhFermionF;
 typedef OverlapWilsonCayleyTanhFermion<WilsonImplD> OverlapWilsonCayleyTanhFermionD;
-typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplR> OverlapWilsonCayleyZolotarevFermionR;
+typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplD2> OverlapWilsonCayleyZolotarevFermionD2;
 typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplF> OverlapWilsonCayleyZolotarevFermionF;
 typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplD> OverlapWilsonCayleyZolotarevFermionD;
 // Continued fraction
-typedef OverlapWilsonContFracTanhFermion<WilsonImplR> OverlapWilsonContFracTanhFermionR;
+typedef OverlapWilsonContFracTanhFermion<WilsonImplD2> OverlapWilsonContFracTanhFermionD2;
 typedef OverlapWilsonContFracTanhFermion<WilsonImplF> OverlapWilsonContFracTanhFermionF;
 typedef OverlapWilsonContFracTanhFermion<WilsonImplD> OverlapWilsonContFracTanhFermionD;
-typedef OverlapWilsonContFracZolotarevFermion<WilsonImplR> OverlapWilsonContFracZolotarevFermionR;
+typedef OverlapWilsonContFracZolotarevFermion<WilsonImplD2> OverlapWilsonContFracZolotarevFermionD2;
 typedef OverlapWilsonContFracZolotarevFermion<WilsonImplF> OverlapWilsonContFracZolotarevFermionF;
 typedef OverlapWilsonContFracZolotarevFermion<WilsonImplD> OverlapWilsonContFracZolotarevFermionD;
 // Partial fraction
-typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplR> OverlapWilsonPartialFractionTanhFermionR;
+typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplD2> OverlapWilsonPartialFractionTanhFermionD2;
 typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplF> OverlapWilsonPartialFractionTanhFermionF;
 typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplD> OverlapWilsonPartialFractionTanhFermionD;
-typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplR> OverlapWilsonPartialFractionZolotarevFermionR;
+typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplD2> OverlapWilsonPartialFractionZolotarevFermionD2;
 typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplF> OverlapWilsonPartialFractionZolotarevFermionF;
 typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplD> OverlapWilsonPartialFractionZolotarevFermionD;
 // Gparity cases; partial list until tested
 typedef WilsonFermion<GparityWilsonImplR>     GparityWilsonFermionR;
 typedef WilsonFermion<GparityWilsonImplF>     GparityWilsonFermionF;
 typedef WilsonFermion<GparityWilsonImplD>     GparityWilsonFermionD;
 //typedef WilsonFermion<GparityWilsonImplRL>     GparityWilsonFermionRL;
 //typedef WilsonFermion<GparityWilsonImplFH>     GparityWilsonFermionFH;
 //typedef WilsonFermion<GparityWilsonImplDF>     GparityWilsonFermionDF;
 typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
 typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
 typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;
-//typedef DomainWallFermion<GparityWilsonImplRL> GparityDomainWallFermionRL;
+typedef DomainWallEOFAFermion<GparityWilsonImplR> GparityDomainWallEOFAFermionD2;
 //typedef DomainWallFermion<GparityWilsonImplFH> GparityDomainWallFermionFH;
 //typedef DomainWallFermion<GparityWilsonImplDF> GparityDomainWallFermionDF;
 typedef DomainWallEOFAFermion<GparityWilsonImplR> GparityDomainWallEOFAFermionR;
 typedef DomainWallEOFAFermion<GparityWilsonImplF> GparityDomainWallEOFAFermionF;
 typedef DomainWallEOFAFermion<GparityWilsonImplD> GparityDomainWallEOFAFermionD;
-//typedef DomainWallEOFAFermion<GparityWilsonImplRL> GparityDomainWallEOFAFermionRL;
+typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionD2;
 //typedef DomainWallEOFAFermion<GparityWilsonImplFH> GparityDomainWallEOFAFermionFH;
 //typedef DomainWallEOFAFermion<GparityWilsonImplDF> GparityDomainWallEOFAFermionDF;
 typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
 typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
 typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;
-//typedef WilsonTMFermion<GparityWilsonImplRL> GparityWilsonTMFermionRL;
+typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionD2;
 //typedef WilsonTMFermion<GparityWilsonImplFH> GparityWilsonTMFermionFH;
 //typedef WilsonTMFermion<GparityWilsonImplDF> GparityWilsonTMFermionDF;
 typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionR;
 typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
 typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
-//typedef MobiusFermion<GparityWilsonImplRL> GparityMobiusFermionRL;
+typedef MobiusEOFAFermion<GparityWilsonImplR> GparityMobiusEOFAFermionD2;
 //typedef MobiusFermion<GparityWilsonImplFH> GparityMobiusFermionFH;
 //typedef MobiusFermion<GparityWilsonImplDF> GparityMobiusFermionDF;
 typedef MobiusEOFAFermion<GparityWilsonImplR> GparityMobiusEOFAFermionR;
 typedef MobiusEOFAFermion<GparityWilsonImplF> GparityMobiusEOFAFermionF;
 typedef MobiusEOFAFermion<GparityWilsonImplD> GparityMobiusEOFAFermionD;
 //typedef MobiusEOFAFermion<GparityWilsonImplRL> GparityMobiusEOFAFermionRL;
 //typedef MobiusEOFAFermion<GparityWilsonImplFH> GparityMobiusEOFAFermionFH;
 //typedef MobiusEOFAFermion<GparityWilsonImplDF> GparityMobiusEOFAFermionDF;
 typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
 typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
 typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;
 typedef NaiveStaggeredFermion<StaggeredImplR> NaiveStaggeredFermionR;
 typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
 typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
 typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
 typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
 typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
@@ -49,6 +49,8 @@ public:
  virtual FermionField &tmp(void) = 0;
  virtual void DirichletBlock(const Coordinate & _Block) { assert(0); };
  GridBase * Grid(void)   { return FermionGrid(); };   // this is all the linalg routines need to know
  GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };
@@ -30,6 +30,18 @@ directory
 NAMESPACE_BEGIN(Grid);
 /*
  Policy implementation for G-parity boundary conditions
  Rather than treating the gauge field as a flavored field, the Grid implementation of G-parity treats the gauge field as a regular
  field with complex conjugate boundary conditions. In order to ensure the second flavor interacts with the conjugate links and the first
  with the regular links we overload the functionality of doubleStore, whose purpose is to store the gauge field and the barrel-shifted gauge field
  to avoid communicating links when applying the Dirac operator, such that the double-stored field contains also a flavor index which maps to
  either the link or the conjugate link. This flavored field is then used by multLink to apply the correct link to a spinor.
  Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
  mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
 */
 template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
 class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
 public:
@@ -113,7 +125,7 @@ public:
    || ((distance== 1)&&(icoor[direction]==1))
    || ((distance==-1)&&(icoor[direction]==0));
-    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world
+    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu] && mmu < Nd-1; //only if we are going around the world in a spatial direction
    //Apply the links
    int f_upper = permute_lane ? 1 : 0;
@@ -139,10 +151,10 @@ public:
    assert((distance == 1) || (distance == -1));  // nearest neighbour stencil hard code
    assert((sl == 1) || (sl == 2));
-    if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
+    //If this site is an global boundary site, perform the G-parity flavor twist
-
+    if ( mmu < Nd-1 && SE->_around_the_world && St.parameters.twists[mmu] ) {
      if ( sl == 2 ) {
-       
+	//Only do the twist for lanes on the edge of the physical node
 	ExtractBuffer<sobj> vals(Nsimd);
 	extract(chi,vals);
@@ -197,6 +209,19 @@ public:
    reg = memory;
  }
  //Poke 'poke_f0' onto flavor 0 and 'poke_f1' onto flavor 1 in direction mu of the doubled gauge field Uds
  inline void pokeGparityDoubledGaugeField(DoubledGaugeField &Uds, const GaugeLinkField &poke_f0, const GaugeLinkField &poke_f1, const int mu){
    autoView(poke_f0_v, poke_f0, CpuRead);
    autoView(poke_f1_v, poke_f1, CpuRead);
    autoView(Uds_v, Uds, CpuWrite);
    thread_foreach(ss,poke_f0_v,{
 	Uds_v[ss](0)(mu) = poke_f0_v[ss]();
 	Uds_v[ss](1)(mu) = poke_f1_v[ss]();
      });
  }
  inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
  {
    conformable(Uds.Grid(),GaugeGrid);
@@ -208,13 +233,18 @@ public:
    Lattice<iScalar<vInteger> > coor(GaugeGrid);
-    for(int mu=0;mu<Nd;mu++){
+    //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. 
    //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs        
    for(int mu=0;mu<Nd-1;mu++){
      if( Params.twists[mu] ){
 	LatticeCoordinate(coor,mu);
      }
      U     = PeekIndex<LorentzIndex>(Umu,mu);
      Uconj = conjugate(U);
      // Implement the isospin rotation sign on the boundary between f=1 and f=0
      // This phase could come from a simple bc 1,1,-1,1 ..
      int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
      if ( Params.twists[mu] ) { 
@@ -260,6 +290,38 @@ public:
        });
      }
    }
    { //periodic / antiperiodic temporal BCs
      int mu = Nd-1;
      int L   = GaugeGrid->GlobalDimensions()[mu];
      int Lmu = L - 1;
      LatticeCoordinate(coor, mu);
      U = PeekIndex<LorentzIndex>(Umu, mu); //Get t-directed links
      GaugeLinkField *Upoke = &U;
      if(Params.twists[mu]){ //antiperiodic
 	Utmp =  where(coor == Lmu, -U, U);
 	Upoke = &Utmp;
      }
      Uconj = conjugate(*Upoke); //second flavor interacts with conjugate links      
      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu);
      //Get the barrel-shifted field
      Utmp = adj(Cshift(U, mu, -1)); //is a forward shift!
      Upoke = &Utmp;
      if(Params.twists[mu]){
 	U = where(coor == 0, -Utmp, Utmp);  //boundary phase
 	Upoke = &U;
      }
      Uconj = conjugate(*Upoke);
      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4);
    }
  }
  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
@@ -300,27 +362,47 @@ public:
  }
  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
    int Ls=Btilde.Grid()->_fdimensions[0];
    int Ls = Btilde.Grid()->_fdimensions[0];
    GaugeLinkField tmp(mat.Grid());
    tmp = Zero();
    {
-      autoView( tmp_v , tmp, CpuWrite);
+      GridBase *GaugeGrid = mat.Grid();
-      autoView( Atilde_v , Atilde, CpuRead);
+      Lattice<iScalar<vInteger> > coor(GaugeGrid);
-      autoView( Btilde_v , Btilde, CpuRead);
+
-      thread_for(ss,tmp.Grid()->oSites(),{
+      if( Params.twists[mu] ){
-	  for (int s = 0; s < Ls; s++) {
+	LatticeCoordinate(coor,mu);
 	    int sF = s + Ls * ss;
 	    auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
 	    tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
      }
      autoView( mat_v , mat, AcceleratorWrite);
      autoView( Btilde_v , Btilde, AcceleratorRead);
      autoView( Atilde_v , Atilde, AcceleratorRead);
      accelerator_for(sss,mat.Grid()->oSites(), FermionField::vector_type::Nsimd(),{	  
  	  int sU=sss;
  	  typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
  	  ColorMatrixType sum;
  	  zeroit(sum);
  	  for(int s=0;s<Ls;s++){
  	    int sF = s+Ls*sU;
  	    for(int spn=0;spn<Ns;spn++){ //sum over spin
 	      //Flavor 0
  	      auto bb = coalescedRead(Btilde_v[sF](0)(spn) ); //color vector
  	      auto aa = coalescedRead(Atilde_v[sF](0)(spn) );
  	      sum = sum + outerProduct(bb,aa);
  	      //Flavor 1
  	      bb = coalescedRead(Btilde_v[sF](1)(spn) );
  	      aa = coalescedRead(Atilde_v[sF](1)(spn) );
  	      sum = sum + conjugate(outerProduct(bb,aa));
  	    }
  	  }	    
  	  coalescedWrite(mat_v[sU](mu)(), sum);
  	});
    }
    PokeIndex<LorentzIndex>(mat, tmp, mu);
    return;
  }
 };
 typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffReal> GparityWilsonImplR;  // Real.. whichever prec
@@ -47,18 +47,6 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }
  ////////////////////////////////////////
  // Performance monitoring
  ////////////////////////////////////////
  void Report(void);
  void ZeroCounters(void);
  double DhopTotalTime;
  double DhopCalls;
  double DhopCommTime;
  double DhopComputeTime;
  double DhopComputeTime2;
  double DhopFaceTime;
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
  ///////////////////////////////////////////////////////////////
@@ -52,18 +52,6 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }
  ////////////////////////////////////////
  // Performance monitoring
  ////////////////////////////////////////
  void Report(void);
  void ZeroCounters(void);
  double DhopTotalTime;
  double DhopCalls;
  double DhopCommTime;
  double DhopComputeTime;
  double DhopComputeTime2;
  double DhopFaceTime;
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
  ///////////////////////////////////////////////////////////////
@@ -47,18 +47,6 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }
  ////////////////////////////////////////
  // Performance monitoring
  ////////////////////////////////////////
  void Report(void);
  void ZeroCounters(void);
  double DhopTotalTime;
  double DhopCalls;
  double DhopCommTime;
  double DhopComputeTime;
  double DhopComputeTime2;
  double DhopFaceTime;
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
  ///////////////////////////////////////////////////////////////
@@ -4,10 +4,11 @@
    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.h
-    Copyright (C) 2017
+    Copyright (C) 2017 - 2022
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: David Preti <>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -29,7 +30,9 @@
 #pragma once
-#include <Grid/Grid.h>
+#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
@@ -49,19 +52,16 @@ NAMESPACE_BEGIN(Grid);
 // csw_r = csw_t to recover the isotropic version
 //////////////////////////////////////////////////////////////////
-template <class Impl>
+template<class Impl, class CloverHelpers>
-class WilsonCloverFermion : public WilsonFermion<Impl>
+class WilsonCloverFermion : public WilsonFermion<Impl>,
                            public WilsonCloverHelpers<Impl>
 {
 public:
  // Types definitions
  INHERIT_IMPL_TYPES(Impl);
-  template <typename vtype>
+  INHERIT_CLOVER_TYPES(Impl);
  using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
  typedef iImplClover<Simd> SiteCloverType;
  typedef Lattice<SiteCloverType> CloverFieldType;
 public:
  typedef WilsonFermion<Impl>       WilsonBase;
  typedef WilsonCloverHelpers<Impl> Helpers;
  virtual int    ConstEE(void)     { return 0; };
  virtual void Instantiatable(void){};
@@ -72,42 +72,7 @@ public:
                      const RealD _csw_r = 0.0,
                      const RealD _csw_t = 0.0,
                      const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(),
-                      const ImplParams &impl_p = ImplParams()) : WilsonFermion<Impl>(_Umu,
+                      const ImplParams &impl_p = ImplParams());
                                                                                     Fgrid,
                                                                                     Hgrid,
                                                                                     _mass, impl_p, clover_anisotropy),
                                                                 CloverTerm(&Fgrid),
                                                                 CloverTermInv(&Fgrid),
                                                                 CloverTermEven(&Hgrid),
                                                                 CloverTermOdd(&Hgrid),
                                                                 CloverTermInvEven(&Hgrid),
                                                                 CloverTermInvOdd(&Hgrid),
                                                                 CloverTermDagEven(&Hgrid),
                                                                 CloverTermDagOdd(&Hgrid),
                                                                 CloverTermInvDagEven(&Hgrid),
                                                                 CloverTermInvDagOdd(&Hgrid)
  {
    assert(Nd == 4); // require 4 dimensions
    if (clover_anisotropy.isAnisotropic)
    {
      csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0;
      diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
    }
    else
    {
      csw_r = _csw_r * 0.5;
      diag_mass = 4.0 + _mass;
    }
    csw_t = _csw_t * 0.5;
    if (csw_r == 0)
      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
    if (csw_t == 0)
      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
    ImportGauge(_Umu);
  }
  virtual void M(const FermionField &in, FermionField &out);
  virtual void Mdag(const FermionField &in, FermionField &out);
@@ -124,250 +89,21 @@ public:
  void ImportGauge(const GaugeField &_Umu);
  // Derivative parts unpreconditioned pseudofermions
-  void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
+  void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag);
  {
    conformable(X.Grid(), Y.Grid());
    conformable(X.Grid(), force.Grid());
    GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
    GaugeField clover_force(force.Grid());
    PropagatorField Lambda(force.Grid());
-    // Guido: Here we are hitting some performance issues:
+public:
    // need to extract the components of the DoubledGaugeField
    // for each call
    // Possible solution
    // Create a vector object to store them? (cons: wasting space)
    std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
    Impl::extractLinkField(U, this->Umu);
    force = Zero();
    // Derivative of the Wilson hopping term
    this->DhopDeriv(force, X, Y, dag);
    ///////////////////////////////////////////////////////////
    // Clover term derivative
    ///////////////////////////////////////////////////////////
    Impl::outerProductImpl(Lambda, X, Y);
    //std::cout << "Lambda:" << Lambda << std::endl;
    Gamma::Algebra sigma[] = {
        Gamma::Algebra::SigmaXY,
        Gamma::Algebra::SigmaXZ,
        Gamma::Algebra::SigmaXT,
        Gamma::Algebra::MinusSigmaXY,
        Gamma::Algebra::SigmaYZ,
        Gamma::Algebra::SigmaYT,
        Gamma::Algebra::MinusSigmaXZ,
        Gamma::Algebra::MinusSigmaYZ,
        Gamma::Algebra::SigmaZT,
        Gamma::Algebra::MinusSigmaXT,
        Gamma::Algebra::MinusSigmaYT,
        Gamma::Algebra::MinusSigmaZT};
    /*
      sigma_{\mu \nu}=
      | 0         sigma[0]  sigma[1]  sigma[2] |
      | sigma[3]    0       sigma[4]  sigma[5] |
      | sigma[6]  sigma[7]     0      sigma[8] |
      | sigma[9]  sigma[10] sigma[11]   0      |
    */
    int count = 0;
    clover_force = Zero();
    for (int mu = 0; mu < 4; mu++)
    {
      force_mu = Zero();
      for (int nu = 0; nu < 4; nu++)
      {
        if (mu == nu)
        continue;
        RealD factor;
        if (nu == 4 || mu == 4)
        {
          factor = 2.0 * csw_t;
        }
        else
        {
          factor = 2.0 * csw_r;
        }
        PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
        Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
        force_mu -= factor*Cmunu(U, lambda, mu, nu);                   // checked
        count++;
      }
      pokeLorentz(clover_force, U[mu] * force_mu, mu);
    }
    //clover_force *= csw;
    force += clover_force;
  }
  // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
  GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
  {
    conformable(lambda.Grid(), U[0].Grid());
    GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());
    // insertion in upper staple
    // please check redundancy of shift operations
    // C1+
    tmp = lambda * U[nu];
    out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
    // C2+
    tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
    // C3+
    tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
    // C4+
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
    // insertion in lower staple
    // C1-
    out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
    // C2-
    tmp = adj(lambda) * U[nu];
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
    // C3-
    tmp = lambda * U[nu];
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
    // C4-
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
    return out;
  }
 protected:
  // here fixing the 4 dimensions, make it more general?
  RealD csw_r;                                               // Clover coefficient - spatial
  RealD csw_t;                                               // Clover coefficient - temporal
  RealD diag_mass;                                           // Mass term
-  CloverFieldType CloverTerm, CloverTermInv;                 // Clover term
+  CloverField CloverTerm, CloverTermInv;                     // Clover term
-  CloverFieldType CloverTermEven, CloverTermOdd;             // Clover term EO
+  CloverField CloverTermEven, CloverTermOdd;                 // Clover term EO
-  CloverFieldType CloverTermInvEven, CloverTermInvOdd;       // Clover term Inv EO
+  CloverField CloverTermInvEven, CloverTermInvOdd;           // Clover term Inv EO
-  CloverFieldType CloverTermDagEven, CloverTermDagOdd;       // Clover term Dag EO
+  CloverField CloverTermDagEven, CloverTermDagOdd;           // Clover term Dag EO
-  CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
+  CloverField CloverTermInvDagEven, CloverTermInvDagOdd;     // Clover term Inv Dag EO
 public:
  // eventually these can be compressed into 6x6 blocks instead of the 12x12
  // using the DeGrand-Rossi basis for the gamma matrices
  CloverFieldType fillCloverYZ(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView(T_v,T,AcceleratorWrite);
    autoView(F_v,F,AcceleratorRead);
    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
      T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
      T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
    });
    return T;
  }
  CloverFieldType fillCloverXZ(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView(T_v, T,AcceleratorWrite);
    autoView(F_v, F,AcceleratorRead);
    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = -F_v[i]()();
      T_v[i]()(1, 0) = F_v[i]()();
      T_v[i]()(2, 3) = -F_v[i]()();
      T_v[i]()(3, 2) = F_v[i]()();
    });
    return T;
  }
  CloverFieldType fillCloverXY(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView(T_v,T,AcceleratorWrite);
    autoView(F_v,F,AcceleratorRead);
    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
      T_v[i]()(1, 1) = timesI(F_v[i]()());
      T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 3) = timesI(F_v[i]()());
    });
    return T;
  }
  CloverFieldType fillCloverXT(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView( T_v , T, AcceleratorWrite);
    autoView( F_v , F, AcceleratorRead);
    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = timesI(F_v[i]()());
      T_v[i]()(1, 0) = timesI(F_v[i]()());
      T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
    });
    return T;
  }
  CloverFieldType fillCloverYT(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView( T_v ,T,AcceleratorWrite);
    autoView( F_v ,F,AcceleratorRead);
    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = -(F_v[i]()());
      T_v[i]()(1, 0) = (F_v[i]()());
      T_v[i]()(2, 3) = (F_v[i]()());
      T_v[i]()(3, 2) = -(F_v[i]()());
    });
    return T;
  }
  CloverFieldType fillCloverZT(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView( T_v , T,AcceleratorWrite);
    autoView( F_v , F,AcceleratorRead);
    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 0) = timesI(F_v[i]()());
      T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
      T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 3) = timesI(F_v[i]()());
    });
    return T;
  }
 };
 NAMESPACE_END(Grid);
@@ -0,0 +1,763 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/WilsonCloverHelpers.h
    Copyright (C) 2021 - 2022
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 // Helper routines that implement common clover functionality
 NAMESPACE_BEGIN(Grid);
 template<class Impl> class WilsonCloverHelpers {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
  {
    conformable(lambda.Grid(), U[0].Grid());
    GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());
    // insertion in upper staple
    // please check redundancy of shift operations
    // C1+
    tmp = lambda * U[nu];
    out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
    // C2+
    tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
    // C3+
    tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
    // C4+
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
    // insertion in lower staple
    // C1-
    out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
    // C2-
    tmp = adj(lambda) * U[nu];
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
    // C3-
    tmp = lambda * U[nu];
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
    // C4-
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
    return out;
  }
  static CloverField fillCloverYZ(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView(T_v,T,AcceleratorWrite);
    autoView(F_v,F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 1), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 0), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 3), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 2), coalescedRead(timesMinusI(F_v[i]()())));
    });
    return T;
  }
  static CloverField fillCloverXZ(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView(T_v, T,AcceleratorWrite);
    autoView(F_v, F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 1), coalescedRead(-F_v[i]()()));
      coalescedWrite(T_v[i]()(1, 0), coalescedRead(F_v[i]()()));
      coalescedWrite(T_v[i]()(2, 3), coalescedRead(-F_v[i]()()));
      coalescedWrite(T_v[i]()(3, 2), coalescedRead(F_v[i]()()));
    });
    return T;
  }
  static CloverField fillCloverXY(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView(T_v,T,AcceleratorWrite);
    autoView(F_v,F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 0), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 1), coalescedRead(timesI(F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 2), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 3), coalescedRead(timesI(F_v[i]()())));
    });
    return T;
  }
  static CloverField fillCloverXT(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView( T_v , T, AcceleratorWrite);
    autoView( F_v , F, AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 1), coalescedRead(timesI(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 0), coalescedRead(timesI(F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 3), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 2), coalescedRead(timesMinusI(F_v[i]()())));
    });
    return T;
  }
  static CloverField fillCloverYT(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView( T_v ,T,AcceleratorWrite);
    autoView( F_v ,F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 1), coalescedRead(-(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 0), coalescedRead((F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 3), coalescedRead((F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 2), coalescedRead(-(F_v[i]()())));
    });
    return T;
  }
  static CloverField fillCloverZT(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView( T_v , T,AcceleratorWrite);
    autoView( F_v , F,AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 0), coalescedRead(timesI(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 1), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 2), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 3), coalescedRead(timesI(F_v[i]()())));
    });
    return T;
  }
  template<class _Spinor>
  static accelerator_inline void multClover(_Spinor& phi, const SiteClover& C, const _Spinor& chi) {
    auto CC = coalescedRead(C);
    mult(&phi, &CC, &chi);
  }
  template<class _SpinorField>
  inline void multCloverField(_SpinorField& out, const CloverField& C, const _SpinorField& phi) {
    const int Nsimd = SiteSpinor::Nsimd();
    autoView(out_v, out, AcceleratorWrite);
    autoView(phi_v, phi, AcceleratorRead);
    autoView(C_v,   C,   AcceleratorRead);
    typedef decltype(coalescedRead(out_v[0])) calcSpinor;
    accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
      calcSpinor tmp;
      multClover(tmp,C_v[sss],phi_v(sss));
      coalescedWrite(out_v[sss],tmp);
    });
  }
 };
 ////////////////////////////////////////////////////////
 template<class Impl> class CompactWilsonCloverHelpers {
 public:
  INHERIT_COMPACT_CLOVER_SIZES(Impl);
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);
  #if 0
  static accelerator_inline typename SiteCloverTriangle::vector_type triangle_elem(const SiteCloverTriangle& triangle, int block, int i, int j) {
    assert(i != j);
    if(i < j) {
      return triangle()(block)(triangle_index(i, j));
    } else { // i > j
      return conjugate(triangle()(block)(triangle_index(i, j)));
    }
  }
  #else
  template<typename vobj>
  static accelerator_inline vobj triangle_elem(const iImplCloverTriangle<vobj>& triangle, int block, int i, int j) {
    assert(i != j);
    if(i < j) {
      return triangle()(block)(triangle_index(i, j));
    } else { // i > j
      return conjugate(triangle()(block)(triangle_index(i, j)));
    }
  }
  #endif
  static accelerator_inline int triangle_index(int i, int j) {
    if(i == j)
      return 0;
    else if(i < j)
      return Nred * (Nred - 1) / 2 - (Nred - i) * (Nred - i - 1) / 2 + j - i - 1;
    else // i > j
      return Nred * (Nred - 1) / 2 - (Nred - j) * (Nred - j - 1) / 2 + i - j - 1;
  }
  static void MooeeKernel_gpu(int                        Nsite,
                              int                        Ls,
                              const FermionField&        in,
                              FermionField&              out,
                              const CloverDiagonalField& diagonal,
                              const CloverTriangleField& triangle) {
    autoView(diagonal_v, diagonal, AcceleratorRead);
    autoView(triangle_v, triangle, AcceleratorRead);
    autoView(in_v,       in,       AcceleratorRead);
    autoView(out_v,      out,      AcceleratorWrite);
    typedef decltype(coalescedRead(out_v[0])) CalcSpinor;
    const uint64_t NN = Nsite * Ls;
    accelerator_for(ss, NN, Simd::Nsimd(), {
      int sF = ss;
      int sU = ss/Ls;
      CalcSpinor res;
      CalcSpinor in_t = in_v(sF);
      auto diagonal_t = diagonal_v(sU);
      auto triangle_t = triangle_v(sU);
      for(int block=0; block<Nhs; block++) {
        int s_start = block*Nhs;
        for(int i=0; i<Nred; i++) {
          int si = s_start + i/Nc, ci = i%Nc;
          res()(si)(ci) = diagonal_t()(block)(i) * in_t()(si)(ci);
          for(int j=0; j<Nred; j++) {
            if (j == i) continue;
            int sj = s_start + j/Nc, cj = j%Nc;
            res()(si)(ci) = res()(si)(ci) + triangle_elem(triangle_t, block, i, j) * in_t()(sj)(cj);
          };
        };
      };
      coalescedWrite(out_v[sF], res);
    });
  }
  static void MooeeKernel_cpu(int                        Nsite,
                              int                        Ls,
                              const FermionField&        in,
                              FermionField&              out,
                              const CloverDiagonalField& diagonal,
                              const CloverTriangleField& triangle) {
    autoView(diagonal_v, diagonal, CpuRead);
    autoView(triangle_v, triangle, CpuRead);
    autoView(in_v,       in,       CpuRead);
    autoView(out_v,      out,      CpuWrite);
    typedef SiteSpinor CalcSpinor;
 #if defined(A64FX) || defined(A64FXFIXEDSIZE)
 #define PREFETCH_CLOVER(BASE) {                                     \
    uint64_t base;                                                  \
    int pf_dist_L1 = 1;                                             \
    int pf_dist_L2 = -5; /* -> penalty -> disable */                \
                                                                    \
    if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) {           \
      base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0);               \
      svprfd(svptrue_b64(), (int64_t*)(base +    0), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  256), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  512), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  768), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL1STRM); \
    }                                                               \
                                                                    \
    if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) {           \
      base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0);               \
      svprfd(svptrue_b64(), (int64_t*)(base +    0), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  256), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  512), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  768), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL2STRM); \
    }                                                               \
  }
 // TODO: Implement/generalize this for other architectures
 // I played around a bit on KNL (see below) but didn't bring anything
 // #elif defined(AVX512)
 // #define PREFETCH_CLOVER(BASE) {                              \
 //     uint64_t base;                                           \
 //     int pf_dist_L1 = 1;                                      \
 //     int pf_dist_L2 = +4;                                     \
 //                                                              \
 //     if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) {    \
 //       base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0);        \
 //       _mm_prefetch((const char*)(base +    0), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +   64), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +  128), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +  192), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +  256), _MM_HINT_T0); \
 //       _mm_prefetch((const char*)(base +  320), _MM_HINT_T0); \
 //     }                                                        \
 //                                                              \
 //     if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) {    \
 //       base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0);        \
 //       _mm_prefetch((const char*)(base +    0), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +   64), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +  128), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +  192), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +  256), _MM_HINT_T1); \
 //       _mm_prefetch((const char*)(base +  320), _MM_HINT_T1); \
 //     }                                                        \
 //   }
 #else
 #define PREFETCH_CLOVER(BASE)
 #endif
    const uint64_t NN = Nsite * Ls;
    thread_for(ss, NN, {
      int sF = ss;
      int sU = ss/Ls;
      CalcSpinor res;
      CalcSpinor in_t = in_v[sF];
      auto diag_t     = diagonal_v[sU]; // "diag" instead of "diagonal" here to make code below easier to read
      auto triangle_t = triangle_v[sU];
      // upper half
      PREFETCH_CLOVER(0);
      auto in_cc_0_0 = conjugate(in_t()(0)(0)); // Nils: reduces number
      auto in_cc_0_1 = conjugate(in_t()(0)(1)); // of conjugates from
      auto in_cc_0_2 = conjugate(in_t()(0)(2)); // 30 to 20
      auto in_cc_1_0 = conjugate(in_t()(1)(0));
      auto in_cc_1_1 = conjugate(in_t()(1)(1));
      res()(0)(0) =               diag_t()(0)( 0) * in_t()(0)(0)
                  +           triangle_t()(0)( 0) * in_t()(0)(1)
                  +           triangle_t()(0)( 1) * in_t()(0)(2)
                  +           triangle_t()(0)( 2) * in_t()(1)(0)
                  +           triangle_t()(0)( 3) * in_t()(1)(1)
                  +           triangle_t()(0)( 4) * in_t()(1)(2);
      res()(0)(1) =           triangle_t()(0)( 0) * in_cc_0_0;
      res()(0)(1) =               diag_t()(0)( 1) * in_t()(0)(1)
                  +           triangle_t()(0)( 5) * in_t()(0)(2)
                  +           triangle_t()(0)( 6) * in_t()(1)(0)
                  +           triangle_t()(0)( 7) * in_t()(1)(1)
                  +           triangle_t()(0)( 8) * in_t()(1)(2)
                  + conjugate(       res()(0)( 1));
      res()(0)(2) =           triangle_t()(0)( 1) * in_cc_0_0
                  +           triangle_t()(0)( 5) * in_cc_0_1;
      res()(0)(2) =               diag_t()(0)( 2) * in_t()(0)(2)
                  +           triangle_t()(0)( 9) * in_t()(1)(0)
                  +           triangle_t()(0)(10) * in_t()(1)(1)
                  +           triangle_t()(0)(11) * in_t()(1)(2)
                  + conjugate(       res()(0)( 2));
      res()(1)(0) =           triangle_t()(0)( 2) * in_cc_0_0
                  +           triangle_t()(0)( 6) * in_cc_0_1
                  +           triangle_t()(0)( 9) * in_cc_0_2;
      res()(1)(0) =               diag_t()(0)( 3) * in_t()(1)(0)
                  +           triangle_t()(0)(12) * in_t()(1)(1)
                  +           triangle_t()(0)(13) * in_t()(1)(2)
                  + conjugate(       res()(1)( 0));
      res()(1)(1) =           triangle_t()(0)( 3) * in_cc_0_0
                  +           triangle_t()(0)( 7) * in_cc_0_1
                  +           triangle_t()(0)(10) * in_cc_0_2
                  +           triangle_t()(0)(12) * in_cc_1_0;
      res()(1)(1) =               diag_t()(0)( 4) * in_t()(1)(1)
                  +           triangle_t()(0)(14) * in_t()(1)(2)
                  + conjugate(       res()(1)( 1));
      res()(1)(2) =           triangle_t()(0)( 4) * in_cc_0_0
                  +           triangle_t()(0)( 8) * in_cc_0_1
                  +           triangle_t()(0)(11) * in_cc_0_2
                  +           triangle_t()(0)(13) * in_cc_1_0
                  +           triangle_t()(0)(14) * in_cc_1_1;
      res()(1)(2) =               diag_t()(0)( 5) * in_t()(1)(2)
                  + conjugate(       res()(1)( 2));
      vstream(out_v[sF]()(0)(0), res()(0)(0));
      vstream(out_v[sF]()(0)(1), res()(0)(1));
      vstream(out_v[sF]()(0)(2), res()(0)(2));
      vstream(out_v[sF]()(1)(0), res()(1)(0));
      vstream(out_v[sF]()(1)(1), res()(1)(1));
      vstream(out_v[sF]()(1)(2), res()(1)(2));
      // lower half
      PREFETCH_CLOVER(1);
      auto in_cc_2_0 = conjugate(in_t()(2)(0));
      auto in_cc_2_1 = conjugate(in_t()(2)(1));
      auto in_cc_2_2 = conjugate(in_t()(2)(2));
      auto in_cc_3_0 = conjugate(in_t()(3)(0));
      auto in_cc_3_1 = conjugate(in_t()(3)(1));
      res()(2)(0) =               diag_t()(1)( 0) * in_t()(2)(0)
                  +           triangle_t()(1)( 0) * in_t()(2)(1)
                  +           triangle_t()(1)( 1) * in_t()(2)(2)
                  +           triangle_t()(1)( 2) * in_t()(3)(0)
                  +           triangle_t()(1)( 3) * in_t()(3)(1)
                  +           triangle_t()(1)( 4) * in_t()(3)(2);
      res()(2)(1) =           triangle_t()(1)( 0) * in_cc_2_0;
      res()(2)(1) =               diag_t()(1)( 1) * in_t()(2)(1)
                  +           triangle_t()(1)( 5) * in_t()(2)(2)
                  +           triangle_t()(1)( 6) * in_t()(3)(0)
                  +           triangle_t()(1)( 7) * in_t()(3)(1)
                  +           triangle_t()(1)( 8) * in_t()(3)(2)
                  + conjugate(       res()(2)( 1));
      res()(2)(2) =           triangle_t()(1)( 1) * in_cc_2_0
                  +           triangle_t()(1)( 5) * in_cc_2_1;
      res()(2)(2) =               diag_t()(1)( 2) * in_t()(2)(2)
                  +           triangle_t()(1)( 9) * in_t()(3)(0)
                  +           triangle_t()(1)(10) * in_t()(3)(1)
                  +           triangle_t()(1)(11) * in_t()(3)(2)
                  + conjugate(       res()(2)( 2));
      res()(3)(0) =           triangle_t()(1)( 2) * in_cc_2_0
                  +           triangle_t()(1)( 6) * in_cc_2_1
                  +           triangle_t()(1)( 9) * in_cc_2_2;
      res()(3)(0) =               diag_t()(1)( 3) * in_t()(3)(0)
                  +           triangle_t()(1)(12) * in_t()(3)(1)
                  +           triangle_t()(1)(13) * in_t()(3)(2)
                  + conjugate(       res()(3)( 0));
      res()(3)(1) =           triangle_t()(1)( 3) * in_cc_2_0
                  +           triangle_t()(1)( 7) * in_cc_2_1
                  +           triangle_t()(1)(10) * in_cc_2_2
                  +           triangle_t()(1)(12) * in_cc_3_0;
      res()(3)(1) =               diag_t()(1)( 4) * in_t()(3)(1)
                  +           triangle_t()(1)(14) * in_t()(3)(2)
                  + conjugate(       res()(3)( 1));
      res()(3)(2) =           triangle_t()(1)( 4) * in_cc_2_0
                  +           triangle_t()(1)( 8) * in_cc_2_1
                  +           triangle_t()(1)(11) * in_cc_2_2
                  +           triangle_t()(1)(13) * in_cc_3_0
                  +           triangle_t()(1)(14) * in_cc_3_1;
      res()(3)(2) =               diag_t()(1)( 5) * in_t()(3)(2)
                  + conjugate(       res()(3)( 2));
      vstream(out_v[sF]()(2)(0), res()(2)(0));
      vstream(out_v[sF]()(2)(1), res()(2)(1));
      vstream(out_v[sF]()(2)(2), res()(2)(2));
      vstream(out_v[sF]()(3)(0), res()(3)(0));
      vstream(out_v[sF]()(3)(1), res()(3)(1));
      vstream(out_v[sF]()(3)(2), res()(3)(2));
    });
  }
  static void MooeeKernel(int                        Nsite,
                          int                        Ls,
                          const FermionField&        in,
                          FermionField&              out,
                          const CloverDiagonalField& diagonal,
                          const CloverTriangleField& triangle) {
 #if defined(GRID_CUDA) || defined(GRID_HIP)
    MooeeKernel_gpu(Nsite, Ls, in, out, diagonal, triangle);
 #else
    MooeeKernel_cpu(Nsite, Ls, in, out, diagonal, triangle);
 #endif
  }
  static void Invert(const CloverDiagonalField& diagonal,
                     const CloverTriangleField& triangle,
                     CloverDiagonalField&       diagonalInv,
                     CloverTriangleField&       triangleInv) {
    conformable(diagonal, diagonalInv);
    conformable(triangle, triangleInv);
    conformable(diagonal, triangle);
    diagonalInv.Checkerboard() = diagonal.Checkerboard();
    triangleInv.Checkerboard() = triangle.Checkerboard();
    GridBase* grid = diagonal.Grid();
    long lsites = grid->lSites();
    typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
    typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
    autoView(diagonal_v,  diagonal,  CpuRead);
    autoView(triangle_v,  triangle,  CpuRead);
    autoView(diagonalInv_v, diagonalInv, CpuWrite);
    autoView(triangleInv_v, triangleInv, CpuWrite);
    thread_for(site, lsites, { // NOTE: Not on GPU because of Eigen & (peek/poke)LocalSite
      Eigen::MatrixXcd clover_inv_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
      Eigen::MatrixXcd clover_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
      scalar_object_diagonal diagonal_tmp     = Zero();
      scalar_object_diagonal diagonal_inv_tmp = Zero();
      scalar_object_triangle triangle_tmp     = Zero();
      scalar_object_triangle triangle_inv_tmp = Zero();
      Coordinate lcoor;
      grid->LocalIndexToLocalCoor(site, lcoor);
      peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
      peekLocalSite(triangle_tmp, triangle_v, lcoor);
      // TODO: can we save time here by inverting the two 6x6 hermitian matrices separately?
      for (long s_row=0;s_row<Ns;s_row++) {
        for (long s_col=0;s_col<Ns;s_col++) {
          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
          int block       = s_row / Nhs;
          int s_row_block = s_row % Nhs;
          int s_col_block = s_col % Nhs;
          for (long c_row=0;c_row<Nc;c_row++) {
            for (long c_col=0;c_col<Nc;c_col++) {
              int i = s_row_block * Nc + c_row;
              int j = s_col_block * Nc + c_col;
              if(i == j)
                clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
              else
                clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(triangle_elem(triangle_tmp, block, i, j)));
            }
          }
        }
      }
      clover_inv_eigen = clover_eigen.inverse();
      for (long s_row=0;s_row<Ns;s_row++) {
        for (long s_col=0;s_col<Ns;s_col++) {
          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
          int block       = s_row / Nhs;
          int s_row_block = s_row % Nhs;
          int s_col_block = s_col % Nhs;
          for (long c_row=0;c_row<Nc;c_row++) {
            for (long c_col=0;c_col<Nc;c_col++) {
              int i = s_row_block * Nc + c_row;
              int j = s_col_block * Nc + c_col;
              if(i == j)
                diagonal_inv_tmp()(block)(i) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
              else if(i < j)
                triangle_inv_tmp()(block)(triangle_index(i, j)) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
              else
                continue;
            }
          }
        }
      }
      pokeLocalSite(diagonal_inv_tmp, diagonalInv_v, lcoor);
      pokeLocalSite(triangle_inv_tmp, triangleInv_v, lcoor);
    });
  }
  static void ConvertLayout(const CloverField&   full,
                            CloverDiagonalField& diagonal,
                            CloverTriangleField& triangle) {
    conformable(full, diagonal);
    conformable(full, triangle);
    diagonal.Checkerboard() = full.Checkerboard();
    triangle.Checkerboard() = full.Checkerboard();
    autoView(full_v,     full,     AcceleratorRead);
    autoView(diagonal_v, diagonal, AcceleratorWrite);
    autoView(triangle_v, triangle, AcceleratorWrite);
    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
    accelerator_for(ss, full.Grid()->oSites(), 1, {
      for(int s_row = 0; s_row < Ns; s_row++) {
        for(int s_col = 0; s_col < Ns; s_col++) {
          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
          int block       = s_row / Nhs;
          int s_row_block = s_row % Nhs;
          int s_col_block = s_col % Nhs;
          for(int c_row = 0; c_row < Nc; c_row++) {
            for(int c_col = 0; c_col < Nc; c_col++) {
              int i = s_row_block * Nc + c_row;
              int j = s_col_block * Nc + c_col;
              if(i == j)
                diagonal_v[ss]()(block)(i) = full_v[ss]()(s_row, s_col)(c_row, c_col);
              else if(i < j)
                triangle_v[ss]()(block)(triangle_index(i, j)) = full_v[ss]()(s_row, s_col)(c_row, c_col);
              else
                continue;
            }
          }
        }
      }
    });
  }
  static void ConvertLayout(const CloverDiagonalField& diagonal,
                            const CloverTriangleField& triangle,
                            CloverField&               full) {
    conformable(full, diagonal);
    conformable(full, triangle);
    full.Checkerboard() = diagonal.Checkerboard();
    full = Zero();
    autoView(diagonal_v, diagonal, AcceleratorRead);
    autoView(triangle_v, triangle, AcceleratorRead);
    autoView(full_v,     full,     AcceleratorWrite);
    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
    accelerator_for(ss, full.Grid()->oSites(), 1, {
      for(int s_row = 0; s_row < Ns; s_row++) {
        for(int s_col = 0; s_col < Ns; s_col++) {
          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
          int block       = s_row / Nhs;
          int s_row_block = s_row % Nhs;
          int s_col_block = s_col % Nhs;
          for(int c_row = 0; c_row < Nc; c_row++) {
            for(int c_col = 0; c_col < Nc; c_col++) {
              int i = s_row_block * Nc + c_row;
              int j = s_col_block * Nc + c_col;
              if(i == j)
                full_v[ss]()(s_row, s_col)(c_row, c_col) = diagonal_v[ss]()(block)(i);
              else
                full_v[ss]()(s_row, s_col)(c_row, c_col) = triangle_elem(triangle_v[ss], block, i, j);
            }
          }
        }
      }
    });
  }
  static void ModifyBoundaries(CloverDiagonalField& diagonal, CloverTriangleField& triangle, RealD csw_t, RealD cF, RealD diag_mass) {
    // Checks/grid
    double t0 = usecond();
    conformable(diagonal, triangle);
    GridBase* grid = diagonal.Grid();
    // Determine the boundary coordinates/sites
    double t1 = usecond();
    int t_dir = Nd - 1;
    Lattice<iScalar<vInteger>> t_coor(grid);
    LatticeCoordinate(t_coor, t_dir);
    int T = grid->GlobalDimensions()[t_dir];
    // Set off-diagonal parts at boundary to zero -- OK
    double t2 = usecond();
    CloverTriangleField zeroTriangle(grid);
    zeroTriangle.Checkerboard() = triangle.Checkerboard();
    zeroTriangle = Zero();
    triangle = where(t_coor == 0,   zeroTriangle, triangle);
    triangle = where(t_coor == T-1, zeroTriangle, triangle);
    // Set diagonal to unity (scaled correctly) -- OK
    double t3 = usecond();
    CloverDiagonalField tmp(grid);
    tmp.Checkerboard() = diagonal.Checkerboard();
    tmp                = -1.0 * csw_t + diag_mass;
    diagonal           = where(t_coor == 0,   tmp, diagonal);
    diagonal           = where(t_coor == T-1, tmp, diagonal);
    // Correct values next to boundary
    double t4 = usecond();
    if(cF != 1.0) {
      tmp = cF - 1.0;
      tmp += diagonal;
      diagonal = where(t_coor == 1,   tmp, diagonal);
      diagonal = where(t_coor == T-2, tmp, diagonal);
    }
    // Report timings
    double t5 = usecond();
 #if 0
    std::cout << GridLogMessage << "CompactWilsonCloverHelpers::ModifyBoundaries timings:"
              << " checks = "          << (t1 - t0) / 1e6
              << ", coordinate = "     << (t2 - t1) / 1e6
              << ", off-diag zero = "  << (t3 - t2) / 1e6
              << ", diagonal unity = " << (t4 - t3) / 1e6
              << ", near-boundary = "  << (t5 - t4) / 1e6
              << ", total = "          << (t5 - t0) / 1e6
              << std::endl;
 #endif
  }
  template<class Field, class Mask>
  static strong_inline void ApplyBoundaryMask(Field& f, const Mask& m) {
    conformable(f, m);
    auto grid  = f.Grid();
    const uint32_t Nsite = grid->oSites();
    const uint32_t Nsimd = grid->Nsimd();
    autoView(f_v, f, AcceleratorWrite);
    autoView(m_v, m, AcceleratorRead);
    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
    accelerator_for(ss, Nsite, Nsimd, {
      coalescedWrite(f_v[ss], m_v(ss) * f_v(ss));
    });
  }
  template<class MaskField>
  static void SetupMasks(MaskField& full, MaskField& even, MaskField& odd) {
    assert(even.Grid()->_isCheckerBoarded && even.Checkerboard() == Even);
    assert(odd.Grid()->_isCheckerBoarded  && odd.Checkerboard()  == Odd);
    assert(!full.Grid()->_isCheckerBoarded);
    GridBase* grid = full.Grid();
    int t_dir = Nd-1;
    Lattice<iScalar<vInteger>> t_coor(grid);
    LatticeCoordinate(t_coor, t_dir);
    int T = grid->GlobalDimensions()[t_dir];
    MaskField zeroMask(grid); zeroMask = Zero();
    full = 1.0;
    full = where(t_coor == 0,   zeroMask, full);
    full = where(t_coor == T-1, zeroMask, full);
    pickCheckerboard(Even, even, full);
    pickCheckerboard(Odd,  odd,  full);
  }
 };
 NAMESPACE_END(Grid);
@@ -0,0 +1,90 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/WilsonCloverTypes.h
    Copyright (C) 2021 - 2022
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 template<class Impl>
 class WilsonCloverTypes {
 public:
  INHERIT_IMPL_TYPES(Impl);
  template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
  typedef iImplClover<Simd> SiteClover;
  typedef Lattice<SiteClover> CloverField;
 };
 template<class Impl>
 class CompactWilsonCloverTypes {
 public:
  INHERIT_IMPL_TYPES(Impl);
  static constexpr int Nred      = Nc * Nhs;        // 6
  static constexpr int Nblock    = Nhs;             // 2
  static constexpr int Ndiagonal = Nred;            // 6
  static constexpr int Ntriangle = (Nred - 1) * Nc; // 15
  template<typename vtype> using iImplCloverDiagonal = iScalar<iVector<iVector<vtype, Ndiagonal>, Nblock>>;
  template<typename vtype> using iImplCloverTriangle = iScalar<iVector<iVector<vtype, Ntriangle>, Nblock>>;
  typedef iImplCloverDiagonal<Simd> SiteCloverDiagonal;
  typedef iImplCloverTriangle<Simd> SiteCloverTriangle;
  typedef iSinglet<Simd>            SiteMask;
  typedef Lattice<SiteCloverDiagonal> CloverDiagonalField;
  typedef Lattice<SiteCloverTriangle> CloverTriangleField;
  typedef Lattice<SiteMask>           MaskField;
 };
 #define INHERIT_CLOVER_TYPES(Impl)                                 \
  typedef typename WilsonCloverTypes<Impl>::SiteClover SiteClover; \
  typedef typename WilsonCloverTypes<Impl>::CloverField CloverField;
 #define INHERIT_COMPACT_CLOVER_TYPES(Impl) \
  typedef typename CompactWilsonCloverTypes<Impl>::SiteCloverDiagonal  SiteCloverDiagonal; \
  typedef typename CompactWilsonCloverTypes<Impl>::SiteCloverTriangle  SiteCloverTriangle; \
  typedef typename CompactWilsonCloverTypes<Impl>::SiteMask            SiteMask; \
  typedef typename CompactWilsonCloverTypes<Impl>::CloverDiagonalField CloverDiagonalField; \
  typedef typename CompactWilsonCloverTypes<Impl>::CloverTriangleField CloverTriangleField; \
  typedef typename CompactWilsonCloverTypes<Impl>::MaskField           MaskField; \
  /* ugly duplication but needed inside functionality classes */ \
  template<typename vtype> using iImplCloverDiagonal = \
    iScalar<iVector<iVector<vtype, CompactWilsonCloverTypes<Impl>::Ndiagonal>, CompactWilsonCloverTypes<Impl>::Nblock>>; \
  template<typename vtype> using iImplCloverTriangle = \
    iScalar<iVector<iVector<vtype, CompactWilsonCloverTypes<Impl>::Ntriangle>, CompactWilsonCloverTypes<Impl>::Nblock>>;
 #define INHERIT_COMPACT_CLOVER_SIZES(Impl)                                    \
  static constexpr int Nred      = CompactWilsonCloverTypes<Impl>::Nred;      \
  static constexpr int Nblock    = CompactWilsonCloverTypes<Impl>::Nblock;    \
  static constexpr int Ndiagonal = CompactWilsonCloverTypes<Impl>::Ndiagonal; \
  static constexpr int Ntriangle = CompactWilsonCloverTypes<Impl>::Ntriangle;
 NAMESPACE_END(Grid);
@@ -32,17 +32,218 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////////////////////////////
 // Wilson compressor will need FaceGather policies for:
 // Periodic, Dirichlet, and partial Dirichlet for DWF
 ///////////////////////////////////////////////////////////////
 const int dwf_compressor_depth=2;
 #define DWF_COMPRESS
 class FaceGatherPartialDWF
 {
 public:
 #ifdef DWF_COMPRESS
  static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
 #else
  static int PartialCompressionFactor(GridBase *grid) { return 1;}
 #endif
  template<class vobj,class cobj,class compressor>
  static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
 				   const Lattice<vobj> &rhs,
 				   cobj *buffer,
 				   compressor &compress,
 				   int off,int so,int partial)
  {
    //DWF only hack: If a direction that is OFF node we use Partial Dirichlet
    //  Shrinks local and remote comms buffers
    GridBase *Grid = rhs.Grid();
    int Ls = Grid->_rdimensions[0];
 #ifdef DWF_COMPRESS
    int depth=dwf_compressor_depth;
 #else 
    int depth=Ls/2;
 #endif
    std::pair<int,int> *table_v = & table[0];
    auto rhs_v = rhs.View(AcceleratorRead);
    int vol=table.size()/Ls;
    accelerator_forNB( idx,table.size(), vobj::Nsimd(), {
 	Integer i=idx/Ls;
 	Integer s=idx%Ls;
 	Integer sc=depth+s-(Ls-depth);
 	if(s<depth)     compress.Compress(buffer[off+i+s*vol],rhs_v[so+table_v[idx].second]);
 	if(s>=Ls-depth) compress.Compress(buffer[off+i+sc*vol],rhs_v[so+table_v[idx].second]);
    });
    rhs_v.ViewClose();
  }
  template<class decompressor,class Decompression>
  static void DecompressFace(decompressor decompress,Decompression &dd)
  {
    auto Ls = dd.dims[0];
 #ifdef DWF_COMPRESS
    int depth=dwf_compressor_depth;
 #else
    int depth=Ls/2;
 #endif    
    // Just pass in the Grid
    auto kp = dd.kernel_p;
    auto mp = dd.mpi_p;
    int size= dd.buffer_size;
    int vol= size/Ls;
    accelerator_forNB(o,size,1,{
 	int idx=o/Ls;
 	int   s=o%Ls;
 	if ( s < depth ) {
 	  int oo=s*vol+idx;
 	  kp[o]=mp[oo];
 	} else if ( s >= Ls-depth ) {
 	  int sc = depth + s - (Ls-depth);
 	  int oo=sc*vol+idx; 
 	  kp[o]=mp[oo];
 	} else {
 	  kp[o] = Zero();//fill rest with zero if partial dirichlet
 	}
    });
  }
  ////////////////////////////////////////////////////////////////////////////////////////////
  // Need to gather *interior portions* for ALL s-slices in simd directions
  // Do the gather as need to treat SIMD lanes differently, and insert zeroes on receive side
  // Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
  ////////////////////////////////////////////////////////////////////////////////////////////
  template<class vobj,class cobj,class compressor>
  static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
 				    std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
 				    compressor &compress,int type,int partial)
  {
    GridBase *Grid = rhs.Grid();
    int Ls = Grid->_rdimensions[0];
 #ifdef DWF_COMPRESS
    int depth=dwf_compressor_depth;
 #else
    int depth = Ls/2;
 #endif
    // insertion of zeroes...
    assert( (table.size()&0x1)==0);
    int num=table.size()/2;
    int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
    auto rhs_v = rhs.View(AcceleratorRead);
    auto p0=&pointers[0][0];
    auto p1=&pointers[1][0];
    auto tp=&table[0];
    int nnum=num/Ls;
    accelerator_forNB(j, num, vobj::Nsimd(), {
 	//  Reorders both local and remote comms buffers
 	//  
 	int s  = j % Ls;
 	int sp1 = (s+depth)%Ls;  // peri incremented s slice
 	int hxyz= j/Ls;
 	int xyz0= hxyz*2; // xyzt part of coor
 	int xyz1= hxyz*2+1;
 	int jj= hxyz + sp1*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
 	int kk0= xyz0*Ls + s ; // s=0 goes to s=1
 	int kk1= xyz1*Ls + s ; // s=Ls-1 -> s=0
 	compress.CompressExchange(p0[jj],p1[jj],
 				  rhs_v[so+tp[kk0 ].second], // Same s, consecutive xyz sites
 				  rhs_v[so+tp[kk1 ].second], 
 				  type);
    });
    rhs_v.ViewClose();
  }
  // Merge routine is for SIMD faces
  template<class decompressor,class Merger>
  static void MergeFace(decompressor decompress,Merger &mm)
  {
    auto Ls = mm.dims[0];
 #ifdef DWF_COMPRESS
    int depth=dwf_compressor_depth;
 #else
    int depth = Ls/2;
 #endif
    int  num= mm.buffer_size/2; // relate vol and Ls to buffer size
    auto mp = &mm.mpointer[0];
    auto vp0= &mm.vpointers[0][0]; // First arg is exchange first
    auto vp1= &mm.vpointers[1][0];
    auto type= mm.type;
    int nnum = num/Ls;
    accelerator_forNB(o,num,Merger::Nsimd,{
 	int  s=o%Ls;
 	int hxyz=o/Ls; // xyzt related component
 	int xyz0=hxyz*2;
 	int xyz1=hxyz*2+1;
 	int sp = (s+depth)%Ls; 
 	int jj= hxyz + sp*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
 	int oo0= s+xyz0*Ls;
 	int oo1= s+xyz1*Ls;
 	// same ss0, ss1 pair goes to new layout
 	decompress.Exchange(mp[oo0],mp[oo1],vp0[jj],vp1[jj],type);
      });
  }
 };
 class FaceGatherDWFMixedBCs
 {
 public:
 #ifdef DWF_COMPRESS
  static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
 #else 
  static int PartialCompressionFactor(GridBase *grid) {return 1;}
 #endif
  template<class vobj,class cobj,class compressor>
  static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
 					 const Lattice<vobj> &rhs,
 					 cobj *buffer,
 					 compressor &compress,
 					 int off,int so,int partial)
  {
    //    std::cout << " face gather simple DWF partial "<<partial <<std::endl;
    if(partial) FaceGatherPartialDWF::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
    else        FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
  }
  template<class vobj,class cobj,class compressor>
  static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
 				    std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
 				    compressor &compress,int type,int partial)
  {
    //    std::cout << " face gather exch DWF partial "<<partial <<std::endl;
    if(partial) FaceGatherPartialDWF::Gather_plane_exchange(table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
    else        FaceGatherSimple::Gather_plane_exchange    (table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
  }
  template<class decompressor,class Merger>
  static void MergeFace(decompressor decompress,Merger &mm)
  {
    int partial = mm.partial;
    //    std::cout << " merge DWF partial "<<partial <<std::endl;
    if ( partial ) FaceGatherPartialDWF::MergeFace(decompress,mm);
    else           FaceGatherSimple::MergeFace(decompress,mm);
  }
  template<class decompressor,class Decompression>
  static void DecompressFace(decompressor decompress,Decompression &dd)
  {
    int partial = dd.partial;
    //    std::cout << " decompress DWF partial "<<partial <<std::endl;
    if ( partial ) FaceGatherPartialDWF::DecompressFace(decompress,dd);
    else           FaceGatherSimple::DecompressFace(decompress,dd);
  }
 };
 /////////////////////////////////////////////////////////////////////////////////////////////
-// optimised versions supporting half precision too
+// optimised versions supporting half precision too??? Deprecate
 /////////////////////////////////////////////////////////////////////////////////////////////
 template<class _HCspinor,class _Hspinor,class _Spinor, class projector,typename SFINAE = void >
 class WilsonCompressorTemplate;
 //Could make FaceGather a template param, but then behaviour is runtime not compile time
 template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
-class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
+class WilsonCompressorTemplate  : public FaceGatherDWFMixedBCs
-				typename std::enable_if<std::is_same<_HCspinor,_Hspinor>::value>::type >
+//  : public FaceGatherSimple
 {
 public:
@@ -79,18 +280,19 @@ public:
  /*****************************************************/
  /* Exchange includes precision change if mpi data is not same */
  /*****************************************************/
-  accelerator_inline void Exchange(SiteHalfSpinor *mp,
+  accelerator_inline void Exchange(SiteHalfSpinor &mp0,
-				   const SiteHalfSpinor * __restrict__ vp0,
+				   SiteHalfSpinor &mp1,
-				   const SiteHalfSpinor * __restrict__ vp1,
+				   const SiteHalfSpinor & vp0,
-				   Integer type,Integer o) const {
+				   const SiteHalfSpinor & vp1,
 				   Integer type) const {
 #ifdef GRID_SIMT
-    exchangeSIMT(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type);
+    exchangeSIMT(mp0,mp1,vp0,vp1,type);
 #else
    SiteHalfSpinor tmp1;
    SiteHalfSpinor tmp2;
-    exchange(tmp1,tmp2,vp0[o],vp1[o],type);
+    exchange(tmp1,tmp2,vp0,vp1,type);
-    vstream(mp[2*o  ],tmp1);
+    vstream(mp0,tmp1);
-    vstream(mp[2*o+1],tmp2);
+    vstream(mp1,tmp2);
 #endif
  }
@@ -98,153 +300,61 @@ public:
  /*****************************************************/
  /* Have a decompression step if mpi data is not same */
  /*****************************************************/
-  accelerator_inline void Decompress(SiteHalfSpinor * __restrict__ out,
+  accelerator_inline void Decompress(SiteHalfSpinor &out,
-				     SiteHalfSpinor * __restrict__ in, Integer o) const {    
+				     SiteHalfSpinor &in) const {    
-    assert(0);
+    out = in;
  }
  /*****************************************************/
  /* Compress Exchange                                 */
  /*****************************************************/
-  accelerator_inline void CompressExchange(SiteHalfSpinor * __restrict__ out0,
+  accelerator_inline void CompressExchange(SiteHalfSpinor &out0,
-					   SiteHalfSpinor * __restrict__ out1,
+					   SiteHalfSpinor &out1,
-					   const SiteSpinor * __restrict__ in,
+					   const SiteSpinor &in0,
-					   Integer j,Integer k, Integer m,Integer type) const
+					   const SiteSpinor &in1,
 					   Integer type) const
  {
 #ifdef GRID_SIMT
    typedef SiteSpinor vobj;
    typedef SiteHalfSpinor hvobj;
-    typedef decltype(coalescedRead(*in))    sobj;
+    typedef decltype(coalescedRead(in0))    sobj;
-    typedef decltype(coalescedRead(*out0)) hsobj;
+    typedef decltype(coalescedRead(out0)) hsobj;
-    unsigned int Nsimd = vobj::Nsimd();
+    constexpr unsigned int Nsimd = vobj::Nsimd();
    unsigned int mask = Nsimd >> (type + 1);
    int lane = acceleratorSIMTlane(Nsimd);
    int j0 = lane &(~mask); // inner coor zero
    int j1 = lane |(mask) ; // inner coor one
-    const vobj *vp0 = &in[k];
+    const vobj *vp0 = &in0;
-    const vobj *vp1 = &in[m];
+    const vobj *vp1 = &in1;
    const vobj *vp = (lane&mask) ? vp1:vp0;
    auto sa = coalescedRead(*vp,j0);
    auto sb = coalescedRead(*vp,j1);
    hsobj psa, psb;
    projector::Proj(psa,sa,mu,dag);
    projector::Proj(psb,sb,mu,dag);
-    coalescedWrite(out0[j],psa);
+    coalescedWrite(out0,psa);
-    coalescedWrite(out1[j],psb);
+    coalescedWrite(out1,psb);
 #else
    SiteHalfSpinor temp1, temp2;
    SiteHalfSpinor temp3, temp4;
-    projector::Proj(temp1,in[k],mu,dag);
+    projector::Proj(temp1,in0,mu,dag);
-    projector::Proj(temp2,in[m],mu,dag);
+    projector::Proj(temp2,in1,mu,dag);
    exchange(temp3,temp4,temp1,temp2,type);
-    vstream(out0[j],temp3);
+    vstream(out0,temp3);
-    vstream(out1[j],temp4);
+    vstream(out1,temp4);
 #endif
  }
  /*****************************************************/
  /* Pass the info to the stencil */
  /*****************************************************/
-  accelerator_inline bool DecompressionStep(void) const { return false; }
+  accelerator_inline bool DecompressionStep(void) const {
    return false;
  }
 };
 #if 0
 template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
 class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
 				typename std::enable_if<!std::is_same<_HCspinor,_Hspinor>::value>::type >
 {
 public:
  int mu,dag;  
  void Point(int p) { mu=p; };
  WilsonCompressorTemplate(int _dag=0){
    dag = _dag;
  }
  typedef _Spinor         SiteSpinor;
  typedef _Hspinor     SiteHalfSpinor;
  typedef _HCspinor SiteHalfCommSpinor;
  typedef typename SiteHalfCommSpinor::vector_type vComplexLow;
  typedef typename SiteHalfSpinor::vector_type     vComplexHigh;
  constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);
  accelerator_inline int CommDatumSize(void) const {
    return sizeof(SiteHalfCommSpinor);
  }
  /*****************************************************/
  /* Compress includes precision change if mpi data is not same */
  /*****************************************************/
  accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const {
    SiteHalfSpinor hsp;
    SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf;
    projector::Proj(hsp,in,mu,dag);
    precisionChange((vComplexLow *)&hbuf[o],(vComplexHigh *)&hsp,Nw);
  }
  accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const {
 #ifdef GRID_SIMT
    typedef decltype(coalescedRead(buf)) sobj;
    sobj sp;
    auto sin = coalescedRead(in);
    projector::Proj(sp,sin,mu,dag);
    coalescedWrite(buf,sp);
 #else
    projector::Proj(buf,in,mu,dag);
 #endif
  }
  /*****************************************************/
  /* Exchange includes precision change if mpi data is not same */
  /*****************************************************/
  accelerator_inline void Exchange(SiteHalfSpinor *mp,
                       SiteHalfSpinor *vp0,
                       SiteHalfSpinor *vp1,
 		       Integer type,Integer o) const {
    SiteHalfSpinor vt0,vt1;
    SiteHalfCommSpinor *vpp0 = (SiteHalfCommSpinor *)vp0;
    SiteHalfCommSpinor *vpp1 = (SiteHalfCommSpinor *)vp1;
    precisionChange((vComplexHigh *)&vt0,(vComplexLow *)&vpp0[o],Nw);
    precisionChange((vComplexHigh *)&vt1,(vComplexLow *)&vpp1[o],Nw);
    exchange(mp[2*o],mp[2*o+1],vt0,vt1,type);
  }
  /*****************************************************/
  /* Have a decompression step if mpi data is not same */
  /*****************************************************/
  accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o) const {
    SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in;
    precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw);
  }
  /*****************************************************/
  /* Compress Exchange                                 */
  /*****************************************************/
  accelerator_inline void CompressExchange(SiteHalfSpinor *out0,
 			       SiteHalfSpinor *out1,
 			       const SiteSpinor *in,
 			       Integer j,Integer k, Integer m,Integer type) const {
    SiteHalfSpinor temp1, temp2,temp3,temp4;
    SiteHalfCommSpinor *hout0 = (SiteHalfCommSpinor *)out0;
    SiteHalfCommSpinor *hout1 = (SiteHalfCommSpinor *)out1;
    projector::Proj(temp1,in[k],mu,dag);
    projector::Proj(temp2,in[m],mu,dag);
    exchange(temp3,temp4,temp1,temp2,type);
    precisionChange((vComplexLow *)&hout0[j],(vComplexHigh *)&temp3,Nw);
    precisionChange((vComplexLow *)&hout1[j],(vComplexHigh *)&temp4,Nw);
  }
  /*****************************************************/
  /* Pass the info to the stencil */
  /*****************************************************/
  accelerator_inline bool DecompressionStep(void) const { return true; }
 };
 #endif
 #define DECLARE_PROJ(Projector,Compressor,spProj)			\
  class Projector {							\
  public:								\
@@ -294,11 +404,7 @@ public:
  typedef typename Base::View_type View_type;
  typedef typename Base::StencilVector StencilVector;
-  void ZeroCountersi(void)  {  }
+  //  Vector<int> surface_list;
  void Reporti(int calls)  {  }
  std::vector<int> surface_list;
  WilsonStencil(GridBase *grid,
 		int npoints,
 		int checkerboard,
@@ -306,11 +412,11 @@ public:
 		const std::vector<int> &distances,Parameters p)  
    : CartesianStencil<vobj,cobj,Parameters> (grid,npoints,checkerboard,directions,distances,p) 
  { 
-    ZeroCountersi();
+    //    surface_list.resize(0);
    surface_list.resize(0);
    this->same_node.resize(npoints);
  };
  /*
  void BuildSurfaceList(int Ls,int vol4){
    // find same node for SHM
@@ -331,6 +437,7 @@ public:
      }
    }
  }
  */
  template < class compressor>
  void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) 
@@ -377,24 +484,26 @@ public:
    int dag = compress.dag;
    int face_idx=0;
 #define vet_same_node(a,b) \
      { auto tmp = b;  }
    if ( dag ) { 
-      assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
+      vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XpCompress,Xp,face_idx));
-      assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
+      vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YpCompress,Yp,face_idx));
-      assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
+      vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
-      assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx));
+      vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TpCompress,Tp,face_idx));
-      assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx));
+      vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XmCompress,Xm,face_idx));
-      assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx));
+      vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YmCompress,Ym,face_idx));
-      assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
+      vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
-      assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx));
+      vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TmCompress,Tm,face_idx));
    } else {
-      assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx));
+      vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XmCompress,Xp,face_idx));
-      assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx));
+      vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YmCompress,Yp,face_idx));
-      assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
+      vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
-      assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx));
+      vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TmCompress,Tp,face_idx));
-      assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx));
+      vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XpCompress,Xm,face_idx));
-      assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx));
+      vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YpCompress,Ym,face_idx));
-      assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
+      vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
-      assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx));
+      vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TpCompress,Tm,face_idx));
    }
    this->face_table_computed=1;
    assert(this->u_comm_offset==this->_unified_buffer_size);
@@ -74,20 +74,6 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }
  void Report(void);
  void ZeroCounters(void);
  double DhopCalls;
  double DhopCommTime;
  double DhopComputeTime;
  double DhopComputeTime2;
  double DhopFaceTime;
  double DhopTotalTime;
  double DerivCalls;
  double DerivCommTime;
  double DerivComputeTime;
  double DerivDhopComputeTime;
  //////////////////////////////////////////////////////////////////
  // override multiply; cut number routines if pass dagger argument
  // and also make interface more uniformly consistent
@@ -75,19 +75,8 @@ public:
  FermionField _tmp;
  FermionField &tmp(void) { return _tmp; }
-  void Report(void);
+  int Dirichlet;
-  void ZeroCounters(void);
+  Coordinate Block; 
  double DhopCalls;
  double DhopCommTime;
  double DhopComputeTime;
  double DhopComputeTime2;
  double DhopFaceTime;
  double DhopTotalTime;
  double DerivCalls;
  double DerivCommTime;
  double DerivComputeTime;
  double DerivDhopComputeTime;
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
@@ -174,6 +163,9 @@ public:
 		  GridRedBlackCartesian &FourDimRedBlackGrid,
 		  double _M5,const ImplParams &p= ImplParams());
  virtual void DirichletBlock(const Coordinate & block)
  {
  }
  // Constructors
  /*
    WilsonFermion5D(int simd, 
@@ -242,18 +242,12 @@ public:
 typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffReal > WilsonImplR;  // Real.. whichever prec
 typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffReal > WilsonImplF;  // Float
 typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffReal > WilsonImplD;  // Double
-
+typedef WilsonImpl<vComplexD2, FundamentalRepresentation, CoeffReal > WilsonImplD2;  // Double
 //typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffRealHalfComms > WilsonImplRL;  // Real.. whichever prec
 //typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplFH;  // Float
 //typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplDF;  // Double
 typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffComplex > ZWilsonImplR; // Real.. whichever prec
 typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplex > ZWilsonImplF; // Float
 typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplex > ZWilsonImplD; // Double
-
+typedef WilsonImpl<vComplexD2, FundamentalRepresentation, CoeffComplex > ZWilsonImplD2; // Double
 //typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplRL; // Real.. whichever prec
 //typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplFH; // Float
 //typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplDF; // Double
 typedef WilsonImpl<vComplex,  AdjointRepresentation, CoeffReal > WilsonAdjImplR;   // Real.. whichever prec
 typedef WilsonImpl<vComplexF, AdjointRepresentation, CoeffReal > WilsonAdjImplF;  // Float
@@ -52,13 +52,6 @@ public:
  typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;   
 public:
 #ifdef GRID_SYCL
 #define SYCL_HACK
 #endif  
 #ifdef SYCL_HACK
  static void HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor  *buf,
 			       int ss,int sU,const SiteSpinor *in, SiteSpinor *out);
 #endif
  static void DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 			 int Ls, int Nsite, const FermionField &in, FermionField &out,
@@ -47,7 +47,7 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
 			FiveDimRedBlackGrid,
 			FourDimGrid,
 			FourDimRedBlackGrid,_M5,p),
-  mass(_mass)
+  mass_plus(_mass), mass_minus(_mass)
 { 
 }
@@ -152,65 +152,13 @@ void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi
  }
 }
 template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
 {
  this->Report();
  Coordinate latt = GridDefaultLatt();          
  RealD volume = this->Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  RealD NP     = this->_FourDimGrid->_Nprocessors;
  if ( M5Dcalls > 0 ) {
    std::cout << GridLogMessage << "#### M5D calls report " << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls     : " << M5Dcalls   << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls       : " << M5Dtime / M5Dcalls << " us" << std::endl;
    // Flops = 10.0*(Nc*Ns) *Ls*vol
    RealD mflops = 10.0*(Nc*Ns)*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
    // Bytes = sizeof(Real) * (Nc*Ns*Nreim) * Ls * vol * (read+write) (/2 for red black counting)
    // read = 2 ( psi[ss+s+1] and psi[ss+s-1] count as 1 )
    // write = 1
    RealD Gbytes = sizeof(Real) * (Nc*Ns*2) * volume * 3 /2. * 1.e-9;
    std::cout << GridLogMessage << "Average bandwidth (GB/s)                 : " << Gbytes/M5Dtime*M5Dcalls*1.e6 << std::endl;
  }
  if ( MooeeInvCalls > 0 ) {
    std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls     : " << MooeeInvCalls   << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls            : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
 #ifdef GRID_CUDA
    RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
 #else
    // Flops = MADD * Ls *Ls *4dvol * spin/colour/complex
    RealD mflops = 2.0*24*this->Ls*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
 #endif
  }
 }
 template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void)
 {
  this->ZeroCounters();
  M5Dflops=0;
  M5Dcalls=0;
  M5Dtime=0;
  MooeeInvFlops=0;
  MooeeInvCalls=0;
  MooeeInvTime=0;
 }
 template<class Impl>  
 void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
  Vector<Coeff_t> diag (Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass;
+  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
-  Vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass;
+  Vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass_plus;
  M5D(psi,chi,chi,lower,diag,upper);
 }
 template<class Impl>
@@ -220,8 +168,8 @@ void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &D
  Vector<Coeff_t> diag = bs;
  Vector<Coeff_t> upper= cs;
  Vector<Coeff_t> lower= cs; 
-  upper[Ls-1]=-mass*upper[Ls-1];
+  upper[Ls-1]=-mass_minus*upper[Ls-1];
-  lower[0]   =-mass*lower[0];
+  lower[0]   =-mass_plus*lower[0];
  M5D(psi,psi,Din,lower,diag,upper);
 }
 // FIXME Redunant with the above routine; check this and eliminate
@@ -235,8 +183,8 @@ template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &
    upper[i]=-ceo[i];
    lower[i]=-ceo[i];
  }
-  upper[Ls-1]=-mass*upper[Ls-1];
+  upper[Ls-1]=-mass_minus*upper[Ls-1];
-  lower[0]   =-mass*lower[0];
+  lower[0]   =-mass_plus*lower[0];
  M5D(psi,psi,chi,lower,diag,upper);
 }
 template<class Impl>
@@ -250,8 +198,8 @@ void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &
    upper[i]=-cee[i];
    lower[i]=-cee[i];
  }
-  upper[Ls-1]=-mass*upper[Ls-1];
+  upper[Ls-1]=-mass_minus*upper[Ls-1];
-  lower[0]   =-mass*lower[0];
+  lower[0]   =-mass_plus*lower[0];
  M5D(psi,psi,chi,lower,diag,upper);
 }
 template<class Impl>
@@ -266,9 +214,9 @@ void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &
    // Assemble the 5d matrix
    if ( s==0 ) {
      upper[s] = -cee[s+1] ;
-      lower[s] = mass*cee[Ls-1];
+      lower[s] = mass_minus*cee[Ls-1];
    } else if ( s==(Ls-1)) { 
-      upper[s] = mass*cee[0];
+      upper[s] = mass_plus*cee[0];
      lower[s] = -cee[s-1];
    } else {
      upper[s]=-cee[s+1];
@@ -291,8 +239,8 @@ void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
  Vector<Coeff_t> diag(Ls,1.0);
  Vector<Coeff_t> upper(Ls,-1.0);
  Vector<Coeff_t> lower(Ls,-1.0);
-  upper[Ls-1]=-mass*upper[Ls-1];
+  upper[Ls-1]=-mass_plus*upper[Ls-1];
-  lower[0]   =-mass*lower[0];
+  lower[0]   =-mass_minus*lower[0];
  M5Ddag(psi,chi,chi,lower,diag,upper);
 }
@@ -307,9 +255,9 @@ void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField
  for (int s=0;s<Ls;s++){
    if ( s== 0 ) {
      upper[s] = cs[s+1];
-      lower[s] =-mass*cs[Ls-1];
+      lower[s] =-mass_minus*cs[Ls-1];
    } else if ( s==(Ls-1) ) { 
-      upper[s] =-mass*cs[0];
+      upper[s] =-mass_plus*cs[0];
      lower[s] = cs[s-1];
    } else { 
      upper[s] = cs[s+1];
@@ -552,7 +500,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
      lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
-      leem[i]=mass*cee[Ls-1]/bee[0];
+      leem[i]=mass_minus*cee[Ls-1]/bee[0];
      for(int j=0;j<i;j++) {
 	assert(bee[j+1]!=Coeff_t(0.0));
 	leem[i]*= aee[j]/bee[j+1];
@@ -560,7 +508,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
      uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
-      ueem[i]=mass;
+      ueem[i]=mass_plus;
      for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
      ueem[i]*= aee[0]/bee[0];
@@ -573,7 +521,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
  }
  { 
-    Coeff_t delta_d=mass*cee[Ls-1];
+    Coeff_t delta_d=mass_minus*cee[Ls-1];
    for(int j=0;j<Ls-1;j++) {
      assert(bee[j] != Coeff_t(0.0));
      delta_d *= cee[j]/bee[j];
@@ -642,7 +590,10 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
 						      Current curr_type,
 						      unsigned int mu)
 {
-#if (!defined(GRID_HIP))
+
  assert(mass_plus == mass_minus);
  RealD mass = mass_plus;
  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
@@ -761,7 +712,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
    else          q_out +=     C;
  }
-#endif
+
 }
 template <class Impl>
@@ -777,6 +728,8 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  assert(mu>=0);
  assert(mu<Nd);
  assert(mass_plus == mass_minus);
  RealD mass = mass_plus;
 #if 0
  int tshift = (mu == Nd-1) ? 1 : 0;
@@ -826,8 +779,8 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  }
 #endif
 #if (!defined(GRID_HIP))
  int tshift = (mu == Nd-1) ? 1 : 0;
  unsigned int LLt    = GridDefaultLatt()[Tp];
  ////////////////////////////////////////////////
  // GENERAL CAYLEY CASE
  ////////////////////////////////////////////////
@@ -880,7 +833,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  }
  std::vector<RealD> G_s(Ls,1.0);
-  RealD sign = 1; // sign flip for vector/tadpole
+  RealD sign = 1.0; // sign flip for vector/tadpole
  if ( curr_type == Current::Axial ) {
    for(int s=0;s<Ls/2;s++){
      G_s[s] = -1.0;
@@ -890,7 +843,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
    auto b=this->_b;
    auto c=this->_c;
    if ( b == 1 && c == 0 ) {
-      sign = -1;    
+      sign = -1.0;    
    }
    else {
      std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl;
@@ -934,12 +887,17 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
    tmp    = Cshift(tmp,mu,-1);
    Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
    tmp = -G_s[s]*( Utmp + gmu*Utmp );
-    tmp    = where((lcoor>=tmin+tshift),tmp,zz); // Mask the time 
+    // Mask the time
    if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
      unsigned int t0 = 0;
      tmp    = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
    } else {
      tmp    = where((lcoor>=tmin+tshift),tmp,zz);
    }
    L_Q   += where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
    InsertSlice(L_Q, q_out, s , 0);
  }
 #endif
 }
 #undef Pp
 #undef Pm
@@ -947,88 +905,6 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 #undef TopRowWithSource
 #if 0
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv,
 						 Vector<iSinglet<Simd> > & Matp,
 						 Vector<iSinglet<Simd> > & Matm)
 {
  int Ls=this->Ls;
  GridBase *grid = this->FermionRedBlackGrid();
  int LLs = grid->_rdimensions[0];
  if ( LLs == Ls ) {
    return; // Not vectorised in 5th direction
  }
  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
  for(int s=0;s<Ls;s++){
    Pplus(s,s) = bee[s];
    Pminus(s,s)= bee[s];
  }
  for(int s=0;s<Ls-1;s++){
    Pminus(s,s+1) = -cee[s];
  }
  for(int s=0;s<Ls-1;s++){
    Pplus(s+1,s) = -cee[s+1];
  }
  Pplus (0,Ls-1) = mass*cee[0];
  Pminus(Ls-1,0) = mass*cee[Ls-1];
  Eigen::MatrixXcd PplusMat ;
  Eigen::MatrixXcd PminusMat;
  if ( inv ) {
    PplusMat =Pplus.inverse();
    PminusMat=Pminus.inverse();
  } else { 
    PplusMat =Pplus;
    PminusMat=Pminus;
  }
  if(dag){
    PplusMat.adjointInPlace();
    PminusMat.adjointInPlace();
  }
  typedef typename SiteHalfSpinor::scalar_type scalar_type;
  const int Nsimd=Simd::Nsimd();
  Matp.resize(Ls*LLs);
  Matm.resize(Ls*LLs);
  for(int s2=0;s2<Ls;s2++){
    for(int s1=0;s1<LLs;s1++){
      int istride = LLs;
      int ostride = 1;
      Simd Vp;
      Simd Vm;
      scalar_type *sp = (scalar_type *)&Vp;
      scalar_type *sm = (scalar_type *)&Vm;
      for(int l=0;l<Nsimd;l++){
 	if ( switcheroo<Coeff_t>::iscomplex() ) {
 	  sp[l] = PplusMat (l*istride+s1*ostride,s2);
 	  sm[l] = PminusMat(l*istride+s1*ostride,s2);
 	} else { 
 	  // if real
 	  scalar_type tmp;
 	  tmp = PplusMat (l*istride+s1*ostride,s2);
 	  sp[l] = scalar_type(tmp.real(),tmp.real());
 	  tmp = PminusMat(l*istride+s1*ostride,s2);
 	  sm[l] = scalar_type(tmp.real(),tmp.real());
 	}
      }
      Matp[LLs*s2+s1] = Vp;
      Matm[LLs*s2+s1] = Vm;
    }}
 }
 #endif
 NAMESPACE_END(Grid);
@@ -63,23 +63,18 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
  // 10 = 3 complex mult + 2 complex add
  // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
-  M5Dcalls++;
+  uint64_t nloop = grid->oSites();
  M5Dtime-=usecond();
  uint64_t nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
-    uint64_t ss= sss*Ls;
+    uint64_t s = sss%Ls;
    uint64_t ss= sss-s;
    typedef decltype(coalescedRead(psi[0])) spinor;
    spinor tmp1, tmp2;
    for(int s=0;s<Ls;s++){
    uint64_t idx_u = ss+((s+1)%Ls);
    uint64_t idx_l = ss+((s+Ls-1)%Ls);
    spProj5m(tmp1,psi(idx_u));
    spProj5p(tmp2,psi(idx_l));
    coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
    }
  });
  M5Dtime+=usecond();
 }
 template<class Impl>  
@@ -105,23 +100,18 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
  int Ls=this->Ls;
  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  M5Dcalls++;
+  uint64_t nloop = grid->oSites();
  M5Dtime-=usecond();
  uint64_t nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
-    uint64_t ss=sss*Ls;
+    uint64_t s = sss%Ls;
    uint64_t ss= sss-s;
    typedef decltype(coalescedRead(psi[0])) spinor;
    spinor tmp1,tmp2;
    for(int s=0;s<Ls;s++){
    uint64_t idx_u = ss+((s+1)%Ls);
    uint64_t idx_l = ss+((s+Ls-1)%Ls);
    spProj5p(tmp1,psi(idx_u));
    spProj5m(tmp2,psi(idx_l));
    coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
    }
  });
  M5Dtime+=usecond();
 }
 template<class Impl>
@@ -142,8 +132,6 @@ CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi
  auto pleem = & leem[0];
  auto pueem = & ueem[0];
  MooeeInvCalls++;
  MooeeInvTime-=usecond();
  uint64_t nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
@@ -181,8 +169,6 @@ CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi
    }
  });
  MooeeInvTime+=usecond();
 }
 template<class Impl>
@@ -204,10 +190,6 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
  assert(psi.Checkerboard() == psi.Checkerboard());
  MooeeInvCalls++;
  MooeeInvTime-=usecond();
  uint64_t nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
@@ -244,7 +226,6 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
      coalescedWrite(chi[ss+s],res);
    }
  });
  MooeeInvTime+=usecond();
 }
@@ -94,10 +94,6 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
      d_p[ss] = diag[s];
    }}
  M5Dcalls++;
  M5Dtime-=usecond();
  assert(Nc==3);
  thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
@@ -198,7 +194,6 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
    }
 #endif
  });
  M5Dtime+=usecond();
 }
 template<class Impl>  
@@ -242,8 +237,6 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
      d_p[ss] = diag[s];
    }}
  M5Dcalls++;
  M5Dtime-=usecond();
  thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
 #if 0
    alignas(64) SiteHalfSpinor hp;
@@ -339,7 +332,6 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
    }
 #endif
  });
  M5Dtime+=usecond();
 }
@@ -813,9 +805,6 @@ CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,
  }
  assert(_Matp->size()==Ls*LLs);
  MooeeInvCalls++;
  MooeeInvTime-=usecond();
  if ( switcheroo<Coeff_t>::iscomplex() ) {
    thread_loop( (auto site=0;site<vol;site++),{
      MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm);
@@ -825,7 +814,7 @@ CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,
      MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm);
    });
  }
-  MooeeInvTime+=usecond();
+
 }
 NAMESPACE_END(Grid);
@@ -0,0 +1,377 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermionImplementation.h
    Copyright (C) 2017 - 2022
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
 NAMESPACE_BEGIN(Grid);
 template<class Impl, class CloverHelpers>
 CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(GaugeField& _Umu,
                                                                            GridCartesian& Fgrid,
                                                                            GridRedBlackCartesian& Hgrid,
                                                                            const RealD _mass,
                                                                            const RealD _csw_r,
                                                                            const RealD _csw_t,
                                                                            const RealD _cF,
                                                                            const WilsonAnisotropyCoefficients& clover_anisotropy,
                                                                            const ImplParams& impl_p)
  : WilsonBase(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
  , csw_r(_csw_r)
  , csw_t(_csw_t)
  , cF(_cF)
  , fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0)
  , Diagonal(&Fgrid),        Triangle(&Fgrid)
  , DiagonalEven(&Hgrid),    TriangleEven(&Hgrid)
  , DiagonalOdd(&Hgrid),     TriangleOdd(&Hgrid)
  , DiagonalInv(&Fgrid),     TriangleInv(&Fgrid)
  , DiagonalInvEven(&Hgrid), TriangleInvEven(&Hgrid)
  , DiagonalInvOdd(&Hgrid),  TriangleInvOdd(&Hgrid)
  , Tmp(&Fgrid)
  , BoundaryMask(&Fgrid)
  , BoundaryMaskEven(&Hgrid), BoundaryMaskOdd(&Hgrid)
 {
  assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);
  csw_r *= 0.5;
  csw_t *= 0.5;
  if (clover_anisotropy.isAnisotropic)
    csw_r /= clover_anisotropy.xi_0;
  ImportGauge(_Umu);
  if (fixedBoundaries) {
    this->BoundaryMaskEven.Checkerboard() = Even;
    this->BoundaryMaskOdd.Checkerboard() = Odd;
    CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
  }
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::Dhop(in, out, dag);
  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::DhopOE(in, out, dag);
  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::DhopEO(in, out, dag);
  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
  WilsonBase::DhopDir(in, out, dir, disp);
  if(this->fixedBoundaries) ApplyBoundaryMask(out);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
  WilsonBase::DhopDirAll(in, out);
  if(this->fixedBoundaries) {
    for(auto& o : out) ApplyBoundaryMask(o);
  }
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField& in, FermionField& out) {
  out.Checkerboard() = in.Checkerboard();
  WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
  Mooee(in, Tmp);
  axpy(out, 1.0, out, Tmp);
  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField& in, FermionField& out) {
  out.Checkerboard() = in.Checkerboard();
  WilsonBase::Dhop(in, out, DaggerYes);  // call base to save applying bc
  MooeeDag(in, Tmp);
  axpy(out, 1.0, out, Tmp);
  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
  WilsonBase::Meooe(in, out);
  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
  WilsonBase::MeooeDag(in, out);
  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField& in, FermionField& out) {
  if(in.Grid()->_isCheckerBoarded) {
    if(in.Checkerboard() == Odd) {
      MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
    } else {
      MooeeInternal(in, out, DiagonalEven, TriangleEven);
    }
  } else {
    MooeeInternal(in, out, Diagonal, Triangle);
  }
  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeDag(const FermionField& in, FermionField& out) {
  Mooee(in, out); // blocks are hermitian
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionField& in, FermionField& out) {
  if(in.Grid()->_isCheckerBoarded) {
    if(in.Checkerboard() == Odd) {
      MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
    } else {
      MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven);
    }
  } else {
    MooeeInternal(in, out, DiagonalInv, TriangleInv);
  }
  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInvDag(const FermionField& in, FermionField& out) {
  MooeeInv(in, out); // blocks are hermitian
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
  DhopDir(in, out, dir, disp);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
  DhopDirAll(in, out);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
  assert(!fixedBoundaries); // TODO check for changes required for open bc
  // NOTE: code copied from original clover term
  conformable(X.Grid(), Y.Grid());
  conformable(X.Grid(), force.Grid());
  GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
  GaugeField clover_force(force.Grid());
  PropagatorField Lambda(force.Grid());
  // Guido: Here we are hitting some performance issues:
  // need to extract the components of the DoubledGaugeField
  // for each call
  // Possible solution
  // Create a vector object to store them? (cons: wasting space)
  std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
  Impl::extractLinkField(U, this->Umu);
  force = Zero();
  // Derivative of the Wilson hopping term
  this->DhopDeriv(force, X, Y, dag);
  ///////////////////////////////////////////////////////////
  // Clover term derivative
  ///////////////////////////////////////////////////////////
  Impl::outerProductImpl(Lambda, X, Y);
  //std::cout << "Lambda:" << Lambda << std::endl;
  Gamma::Algebra sigma[] = {
      Gamma::Algebra::SigmaXY,
      Gamma::Algebra::SigmaXZ,
      Gamma::Algebra::SigmaXT,
      Gamma::Algebra::MinusSigmaXY,
      Gamma::Algebra::SigmaYZ,
      Gamma::Algebra::SigmaYT,
      Gamma::Algebra::MinusSigmaXZ,
      Gamma::Algebra::MinusSigmaYZ,
      Gamma::Algebra::SigmaZT,
      Gamma::Algebra::MinusSigmaXT,
      Gamma::Algebra::MinusSigmaYT,
      Gamma::Algebra::MinusSigmaZT};
  /*
    sigma_{\mu \nu}=
    | 0         sigma[0]  sigma[1]  sigma[2] |
    | sigma[3]    0       sigma[4]  sigma[5] |
    | sigma[6]  sigma[7]     0      sigma[8] |
    | sigma[9]  sigma[10] sigma[11]   0      |
  */
  int count = 0;
  clover_force = Zero();
  for (int mu = 0; mu < 4; mu++)
  {
    force_mu = Zero();
    for (int nu = 0; nu < 4; nu++)
    {
      if (mu == nu)
        continue;
      RealD factor;
      if (nu == 4 || mu == 4)
      {
        factor = 2.0 * csw_t;
      }
      else
      {
        factor = 2.0 * csw_r;
      }
      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
      force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu);   // checked
      count++;
    }
    pokeLorentz(clover_force, U[mu] * force_mu, mu);
  }
  //clover_force *= csw;
  force += clover_force;
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
  assert(0);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
  assert(0);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField&        in,
                    FermionField&              out,
                    const CloverDiagonalField& diagonal,
                    const CloverTriangleField& triangle) {
  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
  out.Checkerboard() = in.Checkerboard();
  conformable(in, out);
  conformable(in, diagonal);
  conformable(in, triangle);
  CompactHelpers::MooeeKernel(diagonal.oSites(), 1, in, out, diagonal, triangle);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField& _Umu) {
  // NOTE: parts copied from original implementation
  // Import gauge into base class
  double t0 = usecond();
  WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that
  // Initialize temporary variables
  double t1 = usecond();
  conformable(_Umu.Grid(), this->GaugeGrid());
  GridBase* grid = _Umu.Grid();
  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
  CloverField TmpOriginal(grid);
  CloverField TmpInverse(grid);
  // Compute the field strength terms mu>nu
  double t2 = usecond();
  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
  WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
  WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
  // Compute the Clover Operator acting on Colour and Spin
  // multiply here by the clover coefficients for the anisotropy
  double t3 = usecond();
  TmpOriginal  = Helpers::fillCloverYZ(Bx) * csw_r;
  TmpOriginal += Helpers::fillCloverXZ(By) * csw_r;
  TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r;
  TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
  TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
  TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
  // Instantiate the clover term
  // - In case of the standard clover the mass term is added
  // - In case of the exponential clover the clover term is exponentiated
  double t4 = usecond();
  CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, this->diag_mass);
  // Convert the data layout of the clover term
  double t5 = usecond();
  CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
  // Modify the clover term at the temporal boundaries in case of open boundary conditions
  double t6 = usecond();
  if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
  // Invert the Clover term
  // In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved
  // in TmpInverse can be used. In all other cases the clover term has to be explictly inverted.
  // TODO: For now this inversion is explictly done on the CPU
  double t7 = usecond();
  CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries);
  // Fill the remaining clover fields
  double t8 = usecond();
  pickCheckerboard(Even, DiagonalEven,    Diagonal);
  pickCheckerboard(Even, TriangleEven,    Triangle);
  pickCheckerboard(Odd,  DiagonalOdd,     Diagonal);
  pickCheckerboard(Odd,  TriangleOdd,     Triangle);
  pickCheckerboard(Even, DiagonalInvEven, DiagonalInv);
  pickCheckerboard(Even, TriangleInvEven, TriangleInv);
  pickCheckerboard(Odd,  DiagonalInvOdd,  DiagonalInv);
  pickCheckerboard(Odd,  TriangleInvOdd,  TriangleInv);
  // Report timings
  double t9 = usecond();
  std::cout << GridLogDebug << "CompactWilsonCloverFermion::ImportGauge timings:" << std::endl;
  std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
  std::cout << GridLogDebug << "allocations =                " << (t2 - t1) / 1e6 << std::endl;
  std::cout << GridLogDebug << "field strength =             " << (t3 - t2) / 1e6 << std::endl;
  std::cout << GridLogDebug << "fill clover =                " << (t4 - t3) / 1e6 << std::endl;
  std::cout << GridLogDebug << "instantiate clover =         " << (t5 - t4) / 1e6 << std::endl;
  std::cout << GridLogDebug << "convert layout =             " << (t6 - t5) / 1e6 << std::endl;
  std::cout << GridLogDebug << "modify boundaries =          " << (t7 - t6) / 1e6 << std::endl;
  std::cout << GridLogDebug << "invert clover =              " << (t8 - t7) / 1e6 << std::endl;
  std::cout << GridLogDebug << "pick cbs =                   " << (t9 - t8) / 1e6 << std::endl;
  std::cout << GridLogDebug << "total =                      " << (t9 - t0) / 1e6 << std::endl;
 }
 NAMESPACE_END(Grid);
@@ -54,8 +54,6 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
  auto pupper = &upper[0];
  auto plower = &lower[0];
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  this->M5Dcalls++;
  this->M5Dtime -= usecond();
  auto nloop=grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@@ -71,7 +69,6 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
    }
  });
  this->M5Dtime += usecond();
 }
 template<class Impl>
@@ -91,8 +88,6 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
  auto plower = &lower[0];
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  this->M5Dcalls++;
  this->M5Dtime -= usecond();
  auto nloop=grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@@ -108,7 +103,6 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
    }
  });
  this->M5Dtime += usecond();
 }
 template<class Impl>
@@ -127,8 +121,6 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
  auto pleem = & this->leem[0];
  auto pueem = & this->ueem[0];
  this->MooeeInvCalls++;
  this->MooeeInvTime -= usecond();
  uint64_t nloop=grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
@@ -164,7 +156,6 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
      coalescedWrite(chi[ss+s],res);
    }
  });
  this->MooeeInvTime += usecond();
 }
 template<class Impl>
@@ -185,8 +176,6 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
  assert(psi.Checkerboard() == psi.Checkerboard());
  this->MooeeInvCalls++;
  this->MooeeInvTime -= usecond();
  auto nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
@@ -223,7 +212,6 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
    }
  });
  this->MooeeInvTime += usecond();
 }
 NAMESPACE_END(Grid);
@@ -298,45 +298,33 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
  int LLs = in.Grid()->_rdimensions[0];
  int len =  U.Grid()->oSites();
  DhopFaceTime-=usecond();
  st.Prepare();
  st.HaloGather(in,compressor);
  DhopFaceTime+=usecond();
  DhopCommTime -=usecond();
  std::vector<std::vector<CommsRequest_t> > requests;
  st.CommunicateBegin(requests);
  //  st.HaloExchangeOptGather(in,compressor); // Wilson compressor
  DhopFaceTime-=usecond();
  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
  DhopFaceTime+=usecond();
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Remove explicit thread mapping introduced for OPA reasons.
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  DhopComputeTime-=usecond();
  {
    int interior=1;
    int exterior=0;
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
  DhopComputeTime+=usecond();
  DhopFaceTime-=usecond();
  st.CommsMerge(compressor);
  DhopFaceTime+=usecond();
  st.CommunicateComplete(requests);
  DhopCommTime +=usecond();
  DhopComputeTime2-=usecond();
  {
    int interior=0;
    int exterior=1;
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
  DhopComputeTime2+=usecond();
 }
 template<class Impl>
@@ -347,22 +335,14 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
  Compressor compressor;
  int LLs = in.Grid()->_rdimensions[0];
 //double t1=usecond();
  DhopTotalTime -= usecond();
  DhopCommTime -= usecond();
  st.HaloExchange(in,compressor);
  DhopCommTime += usecond();
  DhopComputeTime -= usecond();
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
  {
    int interior=1;
    int exterior=1;
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
  DhopComputeTime += usecond();
  DhopTotalTime   += usecond();
 }
 /*CHANGE END*/
@@ -371,7 +351,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls+=1;
  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
  conformable(in.Grid(),out.Grid()); // drops the cb check
@@ -383,7 +362,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls+=1;
  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
  conformable(in.Grid(),out.Grid()); // drops the cb check
@@ -395,7 +373,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls+=2;
  conformable(in.Grid(),FermionGrid()); // verifies full grid
  conformable(in.Grid(),out.Grid());
@@ -404,58 +381,6 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
  DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
 }
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::Report(void) 
 {
  Coordinate latt = GridDefaultLatt();          
  RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  RealD NP = _FourDimGrid->_Nprocessors;
  RealD NN = _FourDimGrid->NodeCount();
  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls   : " 
 	    << DhopCalls   << std::endl;
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime   /Calls       : " 
 	    << DhopTotalTime   / DhopCalls << " us" << std::endl;
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime    /Calls       : " 
 	    << DhopCommTime    / DhopCalls << " us" << std::endl;
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls        : " 
 	    << DhopComputeTime / DhopCalls << " us" << std::endl;
  // Average the compute time
  _FourDimGrid->GlobalSum(DhopComputeTime);
  DhopComputeTime/=NP;
  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil"    <<std::endl;  Stencil.Report();
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven"<<std::endl;  StencilEven.Report();
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
 }
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void) 
 {
  DhopCalls       = 0;
  DhopTotalTime    = 0;
  DhopCommTime    = 0;
  DhopComputeTime = 0;
  DhopFaceTime    = 0;
  Stencil.ZeroCounters();
  StencilEven.ZeroCounters();
  StencilOdd.ZeroCounters();
 }
 /////////////////////////////////////////////////////////////////////////
 // Implement the general interface. Here we use SAME mass on all slices
 /////////////////////////////////////////////////////////////////////////
@@ -334,7 +334,6 @@ void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionF
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) 
 {
  DhopCalls+=2;
  conformable(in.Grid(), _grid);  // verifies full grid
  conformable(in.Grid(), out.Grid());
@@ -346,7 +345,6 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) 
 {
  DhopCalls+=1;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check
@@ -359,7 +357,6 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) 
 {
  DhopCalls+=1;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check
@@ -418,47 +415,33 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
  Compressor compressor; 
  int len =  U.Grid()->oSites();
  DhopTotalTime   -= usecond();
  DhopFaceTime    -= usecond();
  st.Prepare();
  st.HaloGather(in,compressor);
  DhopFaceTime    += usecond();
  DhopCommTime -=usecond();
  std::vector<std::vector<CommsRequest_t> > requests;
  st.CommunicateBegin(requests);
  DhopFaceTime-=usecond();
  st.CommsMergeSHM(compressor);
  DhopFaceTime+= usecond();
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Removed explicit thread comms
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  DhopComputeTime    -= usecond();
  {
    int interior=1;
    int exterior=0;
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
  DhopComputeTime    += usecond();
  st.CommunicateComplete(requests);
  DhopCommTime +=usecond();
  // First to enter, last to leave timing
  DhopFaceTime    -= usecond();
  st.CommsMerge(compressor);
  DhopFaceTime    -= usecond();
  DhopComputeTime2    -= usecond();
  {
    int interior=0;
    int exterior=1;
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
  DhopComputeTime2    += usecond();
 }
@@ -471,78 +454,16 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
 {
  assert((dag == DaggerNo) || (dag == DaggerYes));
  DhopTotalTime   -= usecond();
  DhopCommTime    -= usecond();
  Compressor compressor;
  st.HaloExchange(in, compressor);
  DhopCommTime    += usecond();
  DhopComputeTime -= usecond();
  {
    int interior=1;
    int exterior=1;
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
  }
  DhopComputeTime += usecond();
  DhopTotalTime   += usecond();
 };
  ////////////////////////////////////////////////////////////////
  // Reporting
  ////////////////////////////////////////////////////////////////
 template<class Impl>
 void ImprovedStaggeredFermion<Impl>::Report(void) 
 {
  Coordinate latt = _grid->GlobalDimensions();
  RealD volume = 1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  RealD NP = _grid->_Nprocessors;
  RealD NN = _grid->NodeCount();
  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
  std::cout << GridLogMessage << "ImprovedStaggeredFermion Number of DhopEO Calls   : " 
 	    << DhopCalls   << std::endl;
  std::cout << GridLogMessage << "ImprovedStaggeredFermion TotalTime   /Calls       : " 
 	    << DhopTotalTime   / DhopCalls << " us" << std::endl;
  std::cout << GridLogMessage << "ImprovedStaggeredFermion CommTime    /Calls       : " 
 	    << DhopCommTime    / DhopCalls << " us" << std::endl;
  std::cout << GridLogMessage << "ImprovedStaggeredFermion ComputeTime/Calls        : " 
 	    << DhopComputeTime / DhopCalls << " us" << std::endl;
  // Average the compute time
  _grid->GlobalSum(DhopComputeTime);
  DhopComputeTime/=NP;
  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
  std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil"    <<std::endl;  Stencil.Report();
  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilEven"<<std::endl;  StencilEven.Report();
  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilOdd" <<std::endl;  StencilOdd.Report();
 }
 template<class Impl>
 void ImprovedStaggeredFermion<Impl>::ZeroCounters(void) 
 {
  DhopCalls       = 0;
  DhopTotalTime   = 0;
  DhopCommTime    = 0;
  DhopComputeTime = 0;
  DhopFaceTime    = 0;
  Stencil.ZeroCounters();
  StencilEven.ZeroCounters();
  StencilOdd.ZeroCounters();
 }
 //////////////////////////////////////////////////////// 
 // Conserved current - not yet implemented.
 ////////////////////////////////////////////////////////
@@ -55,9 +55,6 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
  auto plower = &lower[0];
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  this->M5Dcalls++;
  this->M5Dtime -= usecond();
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss = sss*Ls;
@@ -73,7 +70,6 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
    }
  });
  this->M5Dtime += usecond();
 }
 template<class Impl>
@@ -99,9 +95,6 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
  auto pshift_coeffs = &shift_coeffs[0];
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  this->M5Dcalls++;
  this->M5Dtime -= usecond();
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss = sss*Ls;
@@ -122,7 +115,6 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
    }
  });
  this->M5Dtime += usecond();
 }
 template<class Impl>
@@ -143,9 +135,6 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
  auto plower = &lower[0];
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  this->M5Dcalls++;
  this->M5Dtime -= usecond();
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(), {
    uint64_t ss = sss*Ls;
@@ -161,8 +150,6 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
      coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
    }
  });
  this->M5Dtime += usecond();
 }
 template<class Impl>
@@ -186,9 +173,6 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
  auto pshift_coeffs = &shift_coeffs[0];
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  this->M5Dcalls++;
  this->M5Dtime -= usecond();
  auto pm = this->pm;
  int nloop = grid->oSites()/Ls;
@@ -217,7 +201,6 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
    }
  });
  this->M5Dtime += usecond();
 }
 template<class Impl>
@@ -237,9 +220,6 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
  if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }
  this->MooeeInvCalls++;
  this->MooeeInvTime -= usecond();
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
@@ -277,7 +257,6 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
    }
  });
  this->MooeeInvTime += usecond();
 }
 template<class Impl>
@@ -297,8 +276,6 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
  auto pueem= & this->ueem[0];
  auto pMooeeInv_shift_lc   = &MooeeInv_shift_lc[0];
  auto pMooeeInv_shift_norm = &MooeeInv_shift_norm[0];
  this->MooeeInvCalls++;
  this->MooeeInvTime -= usecond();
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
@@ -343,7 +320,6 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
      }
  });
  this->MooeeInvTime += usecond();
 }
 template<class Impl>
@@ -363,9 +339,6 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
  auto pleem= & this->leem[0];
  auto pueem= & this->ueem[0];
  this->MooeeInvCalls++;
  this->MooeeInvTime -= usecond();
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
@@ -402,7 +375,6 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
      coalescedWrite(chi[ss+s],res);
    }
  });
  this->MooeeInvTime += usecond();
 }
 template<class Impl>
@@ -423,9 +395,6 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
  auto pMooeeInvDag_shift_lc   = &MooeeInvDag_shift_lc[0];
  auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];
  this->MooeeInvCalls++;
  this->MooeeInvTime -= usecond();
  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
      uint64_t ss=sss*Ls;
@@ -469,7 +438,6 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
      }
  });
  this->MooeeInvTime += usecond();
 }
 NAMESPACE_END(Grid);
@@ -263,7 +263,6 @@ void NaiveStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionFiel
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) 
 {
  DhopCalls+=2;
  conformable(in.Grid(), _grid);  // verifies full grid
  conformable(in.Grid(), out.Grid());
@@ -275,7 +274,6 @@ void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) 
 {
  DhopCalls+=1;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check
@@ -288,7 +286,6 @@ void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &o
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) 
 {
  DhopCalls+=1;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check
@@ -345,47 +342,33 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, L
  Compressor compressor; 
  int len =  U.Grid()->oSites();
  DhopTotalTime   -= usecond();
  DhopFaceTime    -= usecond();
  st.Prepare();
  st.HaloGather(in,compressor);
  DhopFaceTime    += usecond();
  DhopCommTime -=usecond();
  std::vector<std::vector<CommsRequest_t> > requests;
  st.CommunicateBegin(requests);
  DhopFaceTime-=usecond();
  st.CommsMergeSHM(compressor);
  DhopFaceTime+= usecond();
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Removed explicit thread comms
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  DhopComputeTime    -= usecond();
  {
    int interior=1;
    int exterior=0;
    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
  }
  DhopComputeTime    += usecond();
  st.CommunicateComplete(requests);
  DhopCommTime +=usecond();
  // First to enter, last to leave timing
  DhopFaceTime    -= usecond();
  st.CommsMerge(compressor);
  DhopFaceTime    -= usecond();
  DhopComputeTime2    -= usecond();
  {
    int interior=0;
    int exterior=1;
    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
  }
  DhopComputeTime2    += usecond();
 }
 template <class Impl>
@@ -396,78 +379,16 @@ void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Lebes
 {
  assert((dag == DaggerNo) || (dag == DaggerYes));
  DhopTotalTime   -= usecond();
  DhopCommTime    -= usecond();
  Compressor compressor;
  st.HaloExchange(in, compressor);
  DhopCommTime    += usecond();
  DhopComputeTime -= usecond();
  {
    int interior=1;
    int exterior=1;
    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
  }
  DhopComputeTime += usecond();
  DhopTotalTime   += usecond();
 };
  ////////////////////////////////////////////////////////////////
  // Reporting
  ////////////////////////////////////////////////////////////////
 template<class Impl>
 void NaiveStaggeredFermion<Impl>::Report(void) 
 {
  Coordinate latt = _grid->GlobalDimensions();
  RealD volume = 1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  RealD NP = _grid->_Nprocessors;
  RealD NN = _grid->NodeCount();
  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
  std::cout << GridLogMessage << "NaiveStaggeredFermion Number of DhopEO Calls   : " 
 	    << DhopCalls   << std::endl;
  std::cout << GridLogMessage << "NaiveStaggeredFermion TotalTime   /Calls       : " 
 	    << DhopTotalTime   / DhopCalls << " us" << std::endl;
  std::cout << GridLogMessage << "NaiveStaggeredFermion CommTime    /Calls       : " 
 	    << DhopCommTime    / DhopCalls << " us" << std::endl;
  std::cout << GridLogMessage << "NaiveStaggeredFermion ComputeTime/Calls        : " 
 	    << DhopComputeTime / DhopCalls << " us" << std::endl;
  // Average the compute time
  _grid->GlobalSum(DhopComputeTime);
  DhopComputeTime/=NP;
  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
  std::cout << GridLogMessage << "NaiveStaggeredFermion Stencil"    <<std::endl;  Stencil.Report();
  std::cout << GridLogMessage << "NaiveStaggeredFermion StencilEven"<<std::endl;  StencilEven.Report();
  std::cout << GridLogMessage << "NaiveStaggeredFermion StencilOdd" <<std::endl;  StencilOdd.Report();
 }
 template<class Impl>
 void NaiveStaggeredFermion<Impl>::ZeroCounters(void) 
 {
  DhopCalls       = 0;
  DhopTotalTime   = 0;
  DhopCommTime    = 0;
  DhopComputeTime = 0;
  DhopFaceTime    = 0;
  Stencil.ZeroCounters();
  StencilEven.ZeroCounters();
  StencilOdd.ZeroCounters();
 }
 //////////////////////////////////////////////////////// 
 // Conserved current - not yet implemented.
 ////////////////////////////////////////////////////////
@@ -2,12 +2,13 @@
    Grid physics library, www.github.com/paboyle/Grid
-    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
+    Source file: ./lib/qcd/action/fermion/WilsonCloverFermionImplementation.h
-    Copyright (C) 2017
+    Copyright (C) 2017 - 2022
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -33,9 +34,48 @@
 NAMESPACE_BEGIN(Grid);
 template<class Impl, class CloverHelpers>
 WilsonCloverFermion<Impl, CloverHelpers>::WilsonCloverFermion(GaugeField&                         _Umu,
                                               GridCartesian&                      Fgrid,
                                               GridRedBlackCartesian&              Hgrid,
                                               const RealD                         _mass,
                                               const RealD                         _csw_r,
                                               const RealD                         _csw_t,
                                               const WilsonAnisotropyCoefficients& clover_anisotropy,
                                               const ImplParams&                   impl_p)
  : WilsonFermion<Impl>(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
  , CloverTerm(&Fgrid)
  , CloverTermInv(&Fgrid)
  , CloverTermEven(&Hgrid)
  , CloverTermOdd(&Hgrid)
  , CloverTermInvEven(&Hgrid)
  , CloverTermInvOdd(&Hgrid)
  , CloverTermDagEven(&Hgrid)
  , CloverTermDagOdd(&Hgrid)
  , CloverTermInvDagEven(&Hgrid)
  , CloverTermInvDagOdd(&Hgrid) {
  assert(Nd == 4); // require 4 dimensions
  if(clover_anisotropy.isAnisotropic) {
    csw_r     = _csw_r * 0.5 / clover_anisotropy.xi_0;
    diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
  } else {
    csw_r     = _csw_r * 0.5;
    diag_mass = 4.0 + _mass;
  }
  csw_t = _csw_t * 0.5;
  if(csw_r == 0)
    std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
  if(csw_t == 0)
    std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
  ImportGauge(_Umu);
 }
 // *NOT* EO
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField &in, FermionField &out)
 {
  FermionField temp(out.Grid());
@@ -49,8 +89,8 @@ void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
  out += temp;
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField &in, FermionField &out)
 {
  FermionField temp(out.Grid());
@@ -64,13 +104,16 @@ void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
  out += temp;
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
+void WilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField &_Umu)
 {
  double t0 = usecond();
  WilsonFermion<Impl>::ImportGauge(_Umu);
  double t1 = usecond();
  GridBase *grid = _Umu.Grid();
  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
  double t2 = usecond();
  // Compute the field strength terms mu>nu
  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
@@ -79,52 +122,20 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
  double t3 = usecond();
  // Compute the Clover Operator acting on Colour and Spin
  // multiply here by the clover coefficients for the anisotropy
-  CloverTerm  = fillCloverYZ(Bx) * csw_r;
+  CloverTerm  = Helpers::fillCloverYZ(Bx) * csw_r;
-  CloverTerm += fillCloverXZ(By) * csw_r;
+  CloverTerm += Helpers::fillCloverXZ(By) * csw_r;
-  CloverTerm += fillCloverXY(Bz) * csw_r;
+  CloverTerm += Helpers::fillCloverXY(Bz) * csw_r;
-  CloverTerm += fillCloverXT(Ex) * csw_t;
+  CloverTerm += Helpers::fillCloverXT(Ex) * csw_t;
-  CloverTerm += fillCloverYT(Ey) * csw_t;
+  CloverTerm += Helpers::fillCloverYT(Ey) * csw_t;
-  CloverTerm += fillCloverZT(Ez) * csw_t;
+  CloverTerm += Helpers::fillCloverZT(Ez) * csw_t;
  CloverTerm += diag_mass;
-  int lvol = _Umu.Grid()->lSites();
+  double t4 = usecond();
-  int DimRep = Impl::Dimension;
+  CloverHelpers::Instantiate(CloverTerm, CloverTermInv, csw_t, this->diag_mass);
  {
    autoView(CTv,CloverTerm,CpuRead);
    autoView(CTIv,CloverTermInv,CpuWrite);
    thread_for(site, lvol, {
      Coordinate lcoor;
      grid->LocalIndexToLocalCoor(site, lcoor);
      Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
      Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
      typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
      peekLocalSite(Qx, CTv, lcoor);
      //if (csw!=0){
      for (int j = 0; j < Ns; j++)
 	for (int k = 0; k < Ns; k++)
 	  for (int a = 0; a < DimRep; a++)
 	    for (int b = 0; b < DimRep; b++){
 	      auto zz =  Qx()(j, k)(a, b);
 	      EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
 	    }
      //   if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
      EigenInvCloverOp = EigenCloverOp.inverse();
      //std::cout << EigenInvCloverOp << std::endl;
      for (int j = 0; j < Ns; j++)
 	for (int k = 0; k < Ns; k++)
 	  for (int a = 0; a < DimRep; a++)
 	    for (int b = 0; b < DimRep; b++)
 	      Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
      //    if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
      //  }
      pokeLocalSite(Qxinv, CTIv, lcoor);
    });
  }
  double t5 = usecond();
  // Separate the even and odd parts
  pickCheckerboard(Even, CloverTermEven, CloverTerm);
  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
@@ -137,37 +148,47 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
  double t6 = usecond();
  std::cout << GridLogDebug << "WilsonCloverFermion::ImportGauge timings:" << std::endl;
  std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
  std::cout << GridLogDebug << "allocations =                " << (t2 - t1) / 1e6 << std::endl;
  std::cout << GridLogDebug << "field strength =             " << (t3 - t2) / 1e6 << std::endl;
  std::cout << GridLogDebug << "fill clover =                " << (t4 - t3) / 1e6 << std::endl;
  std::cout << GridLogDebug << "instantiation =              " << (t5 - t4) / 1e6 << std::endl;
  std::cout << GridLogDebug << "pick cbs =                   " << (t6 - t5) / 1e6 << std::endl;
  std::cout << GridLogDebug << "total =                      " << (t6 - t0) / 1e6 << std::endl;
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerNo, InverseNo);
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl, CloverHelpers>::MooeeDag(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerYes, InverseNo);
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerNo, InverseYes);
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInvDag(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerYes, InverseYes);
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
+void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
 {
  out.Checkerboard() = in.Checkerboard();
-  CloverFieldType *Clover;
+  CloverField *Clover;
  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
  if (dag)
@@ -182,12 +203,12 @@ void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionFie
      {
        Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
      }
-      out = *Clover * in;
+      Helpers::multCloverField(out, *Clover, in);
    }
    else
    {
      Clover = (inv) ? &CloverTermInv : &CloverTerm;
-      out = adj(*Clover) * in;
+      Helpers::multCloverField(out, *Clover, in); // don't bother with adj, hermitian anyway
    }
  }
  else
@@ -205,29 +226,109 @@ void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionFie
        //  std::cout << "Calling clover term Even" << std::endl;
        Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
      }
-      out = *Clover * in;
+      Helpers::multCloverField(out, *Clover, in);
      //  std::cout << GridLogMessage << "*Clover.Checkerboard() "  << (*Clover).Checkerboard() << std::endl;
    }
    else
    {
      Clover = (inv) ? &CloverTermInv : &CloverTerm;
-      out = *Clover * in;
+      Helpers::multCloverField(out, *Clover, in);
    }
  }
 } // MooeeInternal
 // Derivative parts unpreconditioned pseudofermions
 template<class Impl, class CloverHelpers>
 void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
 {
  conformable(X.Grid(), Y.Grid());
  conformable(X.Grid(), force.Grid());
  GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
  GaugeField clover_force(force.Grid());
  PropagatorField Lambda(force.Grid());
  // Guido: Here we are hitting some performance issues:
  // need to extract the components of the DoubledGaugeField
  // for each call
  // Possible solution
  // Create a vector object to store them? (cons: wasting space)
  std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
  Impl::extractLinkField(U, this->Umu);
  force = Zero();
  // Derivative of the Wilson hopping term
  this->DhopDeriv(force, X, Y, dag);
  ///////////////////////////////////////////////////////////
  // Clover term derivative
  ///////////////////////////////////////////////////////////
  Impl::outerProductImpl(Lambda, X, Y);
  //std::cout << "Lambda:" << Lambda << std::endl;
  Gamma::Algebra sigma[] = {
      Gamma::Algebra::SigmaXY,
      Gamma::Algebra::SigmaXZ,
      Gamma::Algebra::SigmaXT,
      Gamma::Algebra::MinusSigmaXY,
      Gamma::Algebra::SigmaYZ,
      Gamma::Algebra::SigmaYT,
      Gamma::Algebra::MinusSigmaXZ,
      Gamma::Algebra::MinusSigmaYZ,
      Gamma::Algebra::SigmaZT,
      Gamma::Algebra::MinusSigmaXT,
      Gamma::Algebra::MinusSigmaYT,
      Gamma::Algebra::MinusSigmaZT};
  /*
    sigma_{\mu \nu}=
    | 0         sigma[0]  sigma[1]  sigma[2] |
    | sigma[3]    0       sigma[4]  sigma[5] |
    | sigma[6]  sigma[7]     0      sigma[8] |
    | sigma[9]  sigma[10] sigma[11]   0      |
  */
  int count = 0;
  clover_force = Zero();
  for (int mu = 0; mu < 4; mu++)
  {
    force_mu = Zero();
    for (int nu = 0; nu < 4; nu++)
    {
      if (mu == nu)
      continue;
      RealD factor;
      if (nu == 4 || mu == 4)
      {
        factor = 2.0 * csw_t;
      }
      else
      {
        factor = 2.0 * csw_r;
      }
      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
      force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu);                   // checked
      count++;
    }
    pokeLorentz(clover_force, U[mu] * force_mu, mu);
  }
  //clover_force *= csw;
  force += clover_force;
 }
 // Derivative parts
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
+void WilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
 {
  assert(0);
 }
 // Derivative parts
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
+void WilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
 {
  assert(0); // not implemented yet
 }
@@ -60,8 +60,13 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  UmuOdd (_FourDimRedBlackGrid),
  Lebesgue(_FourDimGrid),
  LebesgueEvenOdd(_FourDimRedBlackGrid),
-  _tmp(&FiveDimRedBlackGrid)
+  _tmp(&FiveDimRedBlackGrid),
  Dirichlet(0)
 {
  Stencil.lo     = &Lebesgue;
  StencilEven.lo = &LebesgueEvenOdd;
  StencilOdd.lo  = &LebesgueEvenOdd;
  // some assertions
  assert(FiveDimGrid._ndimension==5);
  assert(FourDimGrid._ndimension==4);
@@ -91,6 +96,19 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
    assert(FourDimRedBlackGrid._simd_layout[d]  ==FourDimGrid._simd_layout[d]);
  }
  if ( p.dirichlet.size() == Nd+1) {
    Coordinate block = p.dirichlet;
    if ( block[0] || block[1] || block[2] || block[3] || block[4] ){
      Dirichlet = 1;
      std::cout << GridLogMessage << " WilsonFermion: non-trivial Dirichlet condition "<< block << std::endl;
      std::cout << GridLogMessage << " WilsonFermion: partial Dirichlet "<< p.partialDirichlet << std::endl;
      Block = block;
    }
  } else {
    Coordinate block(Nd+1,0);
    Block = block;
  }
  if (Impl::LsVectorised) { 
    int nsimd = Simd::Nsimd();
@@ -125,99 +143,38 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  StencilEven.BuildSurfaceList(LLs,vol4);
   StencilOdd.BuildSurfaceList(LLs,vol4);
   //  std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
   //                       <<" " << StencilEven.surface_list.size()<<std::endl;
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::Report(void)
 {
  RealD NP     = _FourDimGrid->_Nprocessors;
  RealD NN     = _FourDimGrid->NodeCount();
  RealD volume = Ls;  
  Coordinate latt = _FourDimGrid->GlobalDimensions();
  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  if ( DhopCalls > 0 ) {
    std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D Number of DhopEO Calls   : " << DhopCalls   << std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D TotalTime   /Calls        : " << DhopTotalTime   / DhopCalls << " us" << std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D CommTime    /Calls        : " << DhopCommTime    / DhopCalls << " us" << std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D FaceTime    /Calls        : " << DhopFaceTime    / DhopCalls << " us" << std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D ComputeTime1/Calls        : " << DhopComputeTime / DhopCalls << " us" << std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D ComputeTime2/Calls        : " << DhopComputeTime2/ DhopCalls << " us" << std::endl;
    // Average the compute time
    _FourDimGrid->GlobalSum(DhopComputeTime);
    DhopComputeTime/=NP;
    RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
    RealD Fullmflops = 1344*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
   }
  if ( DerivCalls > 0 ) {
    std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D Number of Deriv Calls    : " <<DerivCalls <<std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls           : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls        : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D Dhop ComputeTime/Calls   : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
    RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NP << std::endl;
    RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NP << std::endl;  }
  if (DerivCalls > 0 || DhopCalls > 0){
    std::cout << GridLogMessage << "WilsonFermion5D Stencil"    <<std::endl;  Stencil.Report();
    std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl;  StencilEven.Report();
    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
  }
  if ( DhopCalls > 0){
    std::cout << GridLogMessage << "WilsonFermion5D Stencil     Reporti()"    <<std::endl;  Stencil.Reporti(DhopCalls);
    std::cout << GridLogMessage << "WilsonFermion5D StencilEven Reporti()"<<std::endl;  StencilEven.Reporti(DhopCalls);
    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd  Reporti()" <<std::endl;  StencilOdd.Reporti(DhopCalls);
  }
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::ZeroCounters(void) {
  DhopCalls       = 0;
  DhopCommTime    = 0;
  DhopComputeTime = 0;
  DhopComputeTime2= 0;
  DhopFaceTime    = 0;
  DhopTotalTime   = 0;
  DerivCalls       = 0;
  DerivCommTime    = 0;
  DerivComputeTime = 0;
  DerivDhopComputeTime = 0;
  Stencil.ZeroCounters();
  StencilEven.ZeroCounters();
  StencilOdd.ZeroCounters();
  Stencil.ZeroCountersi();
  StencilEven.ZeroCountersi();
  StencilOdd.ZeroCountersi();
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
 {
  GaugeField HUmu(_Umu.Grid());
  HUmu = _Umu*(-0.5);
  if ( Dirichlet ) {
    if ( this->Params.partialDirichlet ) {
      std::cout << GridLogMessage << " partialDirichlet BCs " <<Block<<std::endl;
    } else {
      std::cout << GridLogMessage << " FULL Dirichlet BCs " <<Block<<std::endl;
    }
    std:: cout << GridLogMessage << "Checking block size multiple of rank boundaries for Dirichlet"<<std::endl;
    for(int d=0;d<Nd;d++) {
      int GaugeBlock = Block[d+1];
      int ldim=GaugeGrid()->LocalDimensions()[d];
      if (GaugeBlock) assert( (GaugeBlock%ldim)==0);
    }
    if (!this->Params.partialDirichlet) {
      std::cout << GridLogMessage << " Dirichlet filtering gauge field BCs block " <<Block<<std::endl;
      Coordinate GaugeBlock(Nd);
      for(int d=0;d<Nd;d++) GaugeBlock[d] = Block[d+1];
      DirichletFilter<GaugeField> Filter(GaugeBlock);
      Filter.applyFilter(HUmu);
    } else {
      std::cout << GridLogMessage << " Dirichlet "<< Dirichlet << " NOT filtered gauge field" <<std::endl;
    }
  }
  Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
  pickCheckerboard(Even,UmuEven,Umu);
  pickCheckerboard(Odd ,UmuOdd,Umu);
@@ -259,7 +216,6 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
 					  const FermionField &B,
 					  int dag)
 {
  DerivCalls++;
  assert((dag==DaggerNo) ||(dag==DaggerYes));
  conformable(st.Grid(),A.Grid());
@@ -270,15 +226,12 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
  FermionField Btilde(B.Grid());
  FermionField Atilde(B.Grid());
  DerivCommTime-=usecond();
  st.HaloExchange(B,compressor);
  DerivCommTime+=usecond();
  Atilde=A;
  int LLs = B.Grid()->_rdimensions[0];
  DerivComputeTime-=usecond();
  for (int mu = 0; mu < Nd; mu++) {
    ////////////////////////////////////////////////////////////////////////
    // Flip gamma if dag
@@ -290,8 +243,6 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
    // Call the single hop
    ////////////////////////
    DerivDhopComputeTime -= usecond();
    int Usites = U.Grid()->oSites();
    Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, Usites, B, Btilde, mu,gamma);
@@ -299,10 +250,8 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
    ////////////////////////////
    // spin trace outer product
    ////////////////////////////
    DerivDhopComputeTime += usecond();
    Impl::InsertForce5D(mat, Btilde, Atilde, mu);
  }
  DerivComputeTime += usecond();
 }
 template<class Impl>
@@ -360,12 +309,10 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
                                         DoubledGaugeField & U,
                                         const FermionField &in, FermionField &out,int dag)
 {
  DhopTotalTime-=usecond();
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
  else 
    DhopInternalSerialComms(st,lo,U,in,out,dag);
  DhopTotalTime+=usecond();
 }
@@ -374,6 +321,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 							DoubledGaugeField & U,
 							const FermionField &in, FermionField &out,int dag)
 {
  GRID_TRACE("DhopInternalOverlappedComms");
  Compressor compressor(dag);
  int LLs = in.Grid()->_rdimensions[0];
@@ -382,53 +330,57 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
  /////////////////////////////
  // Start comms  // Gather intranode and extra node differentiated??
  /////////////////////////////
-  DhopFaceTime-=usecond();
+  {
-  st.HaloExchangeOptGather(in,compressor);
+    GRID_TRACE("Gather");
-  DhopFaceTime+=usecond();
+    st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
  }
  DhopCommTime -=usecond();
  std::vector<std::vector<CommsRequest_t> > requests;
  auto id=traceStart("Communicate overlapped");
  st.CommunicateBegin(requests);
  /////////////////////////////
  // Overlap with comms
  /////////////////////////////
-  DhopFaceTime-=usecond();
+  {
    GRID_TRACE("MergeSHM");
    st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
-  DhopFaceTime+=usecond();
+  }
  /////////////////////////////
  // do the compute interior
  /////////////////////////////
  int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
  DhopComputeTime-=usecond();
  if (dag == DaggerYes) {
    GRID_TRACE("DhopDagInterior");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
  } else {
    GRID_TRACE("DhopInterior");
    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
  }
  DhopComputeTime+=usecond();
  /////////////////////////////
  // Complete comms
  /////////////////////////////
  st.CommunicateComplete(requests);
-  DhopCommTime   +=usecond();
+  traceStop(id);
  /////////////////////////////
  // do the compute exterior
  /////////////////////////////
-  DhopFaceTime-=usecond();
+  {
    GRID_TRACE("Merge");
    st.CommsMerge(compressor);
-  DhopFaceTime+=usecond();
+  }
  DhopComputeTime2-=usecond();
  if (dag == DaggerYes) {
    GRID_TRACE("DhopDagExterior");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
  } else {
    GRID_TRACE("DhopExterior");
    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
  }
  DhopComputeTime2+=usecond();
 }
@@ -438,29 +390,30 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
 						    const FermionField &in, 
 						    FermionField &out,int dag)
 {
  GRID_TRACE("DhopInternalSerialComms");
  Compressor compressor(dag);
  int LLs = in.Grid()->_rdimensions[0];
-  DhopCommTime-=usecond();
+  {
    GRID_TRACE("HaloExchange");
    st.HaloExchangeOpt(in,compressor);
-  DhopCommTime+=usecond();
+  }
  DhopComputeTime-=usecond();
  int Opt = WilsonKernelsStatic::Opt;
  if (dag == DaggerYes) {
    GRID_TRACE("DhopDag");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
  } else {
    GRID_TRACE("Dhop");
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
  }
  DhopComputeTime+=usecond();
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls++;
  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
  conformable(in.Grid(),out.Grid()); // drops the cb check
@@ -472,7 +425,6 @@ void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls++;
  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
  conformable(in.Grid(),out.Grid()); // drops the cb check
@@ -484,7 +436,6 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int
 template<class Impl>
 void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls+=2;
  conformable(in.Grid(),FermionGrid()); // verifies full grid
  conformable(in.Grid(),out.Grid());
@@ -539,12 +490,17 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
  LatComplex    sk(_grid);  sk = Zero();
  LatComplex    sk2(_grid); sk2= Zero();
  LatComplex    W(_grid); W= Zero();
  LatComplex    a(_grid); a= Zero();
  LatComplex    one  (_grid); one = ScalComplex(1.0,0.0);
  LatComplex 	cosha(_grid);
  LatComplex 	kmu(_grid);
  LatComplex 	Wea(_grid);
  LatComplex 	Wema(_grid);
  LatComplex 	ea(_grid);
  LatComplex 	ema(_grid);
  LatComplex 	eaLs(_grid);
  LatComplex 	emaLs(_grid);
  LatComplex 	ea2Ls(_grid);
  LatComplex 	ema2Ls(_grid);
  LatComplex 	sinha(_grid);
  LatComplex 	sinhaLs(_grid);
  LatComplex 	coshaLs(_grid);
@@ -579,39 +535,29 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
  ////////////////////////////////////////////
  cosha = (one + W*W + sk) / (abs(W)*2.0);
-  // FIXME Need a Lattice acosh
+  ea = (cosha + sqrt(cosha*cosha-one));
  ema= (cosha - sqrt(cosha*cosha-one));
  eaLs = pow(ea,Ls);
  emaLs= pow(ema,Ls);
  ea2Ls = pow(ea,2.0*Ls);
  ema2Ls= pow(ema,2.0*Ls);
  Wea= abs(W) * ea;
  Wema= abs(W) * ema;
  //  a=log(ea);
-  {
+  sinha = 0.5*(ea - ema);
-    autoView(cosha_v,cosha,CpuRead);
+  sinhaLs = 0.5*(eaLs-emaLs);
-    autoView(a_v,a,CpuWrite);
+  coshaLs = 0.5*(eaLs+emaLs);
    for(int idx=0;idx<_grid->lSites();idx++){
      Coordinate lcoor(Nd);
      Tcomplex cc;
      //    RealD sgn;
      _grid->LocalIndexToLocalCoor(idx,lcoor);
      peekLocalSite(cc,cosha_v,lcoor);
      assert((double)real(cc)>=1.0);
      assert(fabs((double)imag(cc))<=1.0e-15);
      cc = ScalComplex(::acosh(real(cc)),0.0);
      pokeLocalSite(cc,a_v,lcoor);
    }
  }
  Wea = ( exp( a) * abs(W)  );
  Wema= ( exp(-a) * abs(W)  );
  sinha = 0.5*(exp( a) - exp(-a));
  sinhaLs = 0.5*(exp( a*Ls) - exp(-a*Ls));
  coshaLs = 0.5*(exp( a*Ls) + exp(-a*Ls));
  A = one / (abs(W) * sinha * 2.0) * one / (sinhaLs * 2.0);
-  F = exp( a*Ls) * (one - Wea + (Wema - one) * mass*mass);
+  F = eaLs * (one - Wea + (Wema - one) * mass*mass);
-  F = F + exp(-a*Ls) * (Wema - one + (one - Wea) * mass*mass);
+  F = F + emaLs * (Wema - one + (one - Wea) * mass*mass);
  F = F - abs(W) * sinha * 4.0 * mass;
-  Bpp =  (A/F) * (exp(-a*Ls*2.0) - one) * (one - Wema) * (one - mass*mass * one);
+  Bpp =  (A/F) * (ema2Ls - one) * (one - Wema) * (one - mass*mass * one);
-  Bmm =  (A/F) * (one - exp(a*Ls*2.0)) * (one - Wea) * (one - mass*mass * one);
+  Bmm =  (A/F) * (one - ea2Ls)  * (one - Wea) * (one - mass*mass * one);
-  App =  (A/F) * (exp(-a*Ls*2.0) - one) * exp(-a) * (exp(-a) - abs(W)) * (one - mass*mass * one);
+  App =  (A/F) * (ema2Ls - one) * ema * (ema - abs(W)) * (one - mass*mass * one);
-  Amm =  (A/F) * (one - exp(a*Ls*2.0)) * exp(a) * (exp(a) - abs(W)) * (one - mass*mass * one);
+  Amm =  (A/F) * (one - ea2Ls)  * ea  * (ea  - abs(W)) * (one - mass*mass * one);
  ABpm = (A/F) * abs(W) * sinha * 2.0  * (one + mass * coshaLs * 2.0 + mass*mass * one);
  //P+ source, P- source
@@ -634,29 +580,29 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
      buf1_4d = Zero();
      ExtractSlice(buf1_4d, PRsource, (tt-1), 0);
      //G(s,t)
-      bufR_4d = bufR_4d + A * exp(a*Ls) * exp(-a*f) * signW * buf1_4d + A * exp(-a*Ls) * exp(a*f) * signW * buf1_4d;
+      bufR_4d = bufR_4d + A * eaLs * pow(ema,f) * signW * buf1_4d + A * emaLs * pow(ea,f) * signW * buf1_4d;
      //A++*exp(a(s+t))
-      bufR_4d = bufR_4d + App * exp(a*ss) * exp(a*tt) * signW * buf1_4d ;
+      bufR_4d = bufR_4d + App * pow(ea,ss) * pow(ea,tt) * signW * buf1_4d ;
      //A+-*exp(a(s-t))
-      bufR_4d = bufR_4d + ABpm * exp(a*ss) * exp(-a*tt) * signW * buf1_4d ;
+      bufR_4d = bufR_4d + ABpm * pow(ea,ss) * pow(ema,tt) * signW * buf1_4d ;
      //A-+*exp(a(-s+t))
-      bufR_4d = bufR_4d + ABpm * exp(-a*ss) * exp(a*tt) * signW * buf1_4d ;
+      bufR_4d = bufR_4d + ABpm * pow(ema,ss) * pow(ea,tt) * signW * buf1_4d ;
      //A--*exp(a(-s-t))
-      bufR_4d = bufR_4d + Amm * exp(-a*ss) * exp(-a*tt) * signW * buf1_4d ;
+      bufR_4d = bufR_4d + Amm * pow(ema,ss) * pow(ema,tt) * signW * buf1_4d ;
      //GL
      buf2_4d = Zero();
      ExtractSlice(buf2_4d, PLsource, (tt-1), 0);
      //G(s,t)
-      bufL_4d = bufL_4d + A * exp(a*Ls) * exp(-a*f) * signW * buf2_4d + A * exp(-a*Ls) * exp(a*f) * signW * buf2_4d;
+      bufL_4d = bufL_4d + A * eaLs * pow(ema,f) * signW * buf2_4d + A * emaLs * pow(ea,f) * signW * buf2_4d;
      //B++*exp(a(s+t))
-      bufL_4d = bufL_4d + Bpp * exp(a*ss) * exp(a*tt) * signW * buf2_4d ;
+      bufL_4d = bufL_4d + Bpp * pow(ea,ss) * pow(ea,tt) * signW * buf2_4d ;
      //B+-*exp(a(s-t))
-      bufL_4d = bufL_4d + ABpm * exp(a*ss) * exp(-a*tt) * signW * buf2_4d ;
+      bufL_4d = bufL_4d + ABpm * pow(ea,ss) * pow(ema,tt) * signW * buf2_4d ;
      //B-+*exp(a(-s+t))
-      bufL_4d = bufL_4d + ABpm * exp(-a*ss) * exp(a*tt) * signW * buf2_4d ;
+      bufL_4d = bufL_4d + ABpm * pow(ema,ss) * pow(ea,tt) * signW * buf2_4d ;
      //B--*exp(a(-s-t))
-      bufL_4d = bufL_4d + Bmm * exp(-a*ss) * exp(-a*tt) * signW * buf2_4d ;
+      bufL_4d = bufL_4d + Bmm * pow(ema,ss) * pow(ema,tt) * signW * buf2_4d ;
    }
    InsertSlice(bufR_4d, GR, (ss-1), 0);
    InsertSlice(bufL_4d, GL, (ss-1), 0);
@@ -775,28 +721,12 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
  W = one - M5 + sk2;
  ////////////////////////////////////////////
-  // Cosh alpha -> alpha
+  // Cosh alpha -> exp(+/- alpha)
  ////////////////////////////////////////////
  cosha =  (one + W*W + sk) / (abs(W)*2.0);
-  // FIXME Need a Lattice acosh
+  Wea = abs(W)*(cosha + sqrt(cosha*cosha-one));
-  {
+  Wema= abs(W)*(cosha - sqrt(cosha*cosha-one));
  autoView(cosha_v,cosha,CpuRead);
  autoView(a_v,a,CpuWrite);
  for(int idx=0;idx<_grid->lSites();idx++){
    Coordinate lcoor(Nd);
    Tcomplex cc;
    //    RealD sgn;
    _grid->LocalIndexToLocalCoor(idx,lcoor);
    peekLocalSite(cc,cosha_v,lcoor);
    assert((double)real(cc)>=1.0);
    assert(fabs((double)imag(cc))<=1.0e-15);
    cc = ScalComplex(::acosh(real(cc)),0.0);
    pokeLocalSite(cc,a_v,lcoor);
  }}
  Wea = ( exp( a) * abs(W)  );
  Wema= ( exp(-a) * abs(W)  );
  num   = num + ( one - Wema ) * mass * in;
  denom= ( Wea - one ) + mass*mass * (one - Wema); 
@@ -4,12 +4,13 @@ Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonFermion.cc
-Copyright (C) 2015
+Copyright (C) 2022
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Fabian Joswig <fabian.joswig@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -59,6 +60,9 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
      _tmp(&Hgrid),
      anisotropyCoeff(anis)
 {
  Stencil.lo     = &Lebesgue;
  StencilEven.lo = &LebesgueEvenOdd;
  StencilOdd.lo  = &LebesgueEvenOdd;
  // Allocate the required comms buffer
  ImportGauge(_Umu);
  if  (anisotropyCoeff.isAnisotropic){
@@ -75,91 +79,6 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
  StencilOdd.BuildSurfaceList(1,vol4);
 }
 template<class Impl>
 void WilsonFermion<Impl>::Report(void)
 {
  RealD NP = _grid->_Nprocessors;
  RealD NN = _grid->NodeCount();
  RealD volume = 1;
  Coordinate latt = _grid->GlobalDimensions();
  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  if ( DhopCalls > 0 ) {
    std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
    std::cout << GridLogMessage << "WilsonFermion Number of DhopEO Calls   : " << DhopCalls   << std::endl;
    std::cout << GridLogMessage << "WilsonFermion TotalTime   /Calls        : " << DhopTotalTime   / DhopCalls << " us" << std::endl;
    std::cout << GridLogMessage << "WilsonFermion CommTime    /Calls        : " << DhopCommTime    / DhopCalls << " us" << std::endl;
    std::cout << GridLogMessage << "WilsonFermion FaceTime    /Calls        : " << DhopFaceTime    / DhopCalls << " us" << std::endl;
    std::cout << GridLogMessage << "WilsonFermion ComputeTime1/Calls        : " << DhopComputeTime / DhopCalls << " us" << std::endl;
    std::cout << GridLogMessage << "WilsonFermion ComputeTime2/Calls        : " << DhopComputeTime2/ DhopCalls << " us" << std::endl;
    // Average the compute time
    _grid->GlobalSum(DhopComputeTime);
    DhopComputeTime/=NP;
    RealD mflops = 1320*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
    RealD Fullmflops = 1320*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
   }
  if ( DerivCalls > 0 ) {
    std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
    std::cout << GridLogMessage << "WilsonFermion Number of Deriv Calls    : " <<DerivCalls <<std::endl;
    std::cout << GridLogMessage << "WilsonFermion CommTime/Calls           : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
    std::cout << GridLogMessage << "WilsonFermion ComputeTime/Calls        : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
    std::cout << GridLogMessage << "WilsonFermion Dhop ComputeTime/Calls   : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
    // how to count flops here?
    RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
    std::cout << GridLogMessage << "Average mflops/s per call               ? : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node      ? : " << mflops/NP << std::endl;
    // how to count flops here?
    RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call (full)        ? : " << Fullmflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node (full) ? : " << Fullmflops/NP << std::endl;  }
  if (DerivCalls > 0 || DhopCalls > 0){
    std::cout << GridLogMessage << "WilsonFermion Stencil"    <<std::endl;  Stencil.Report();
    std::cout << GridLogMessage << "WilsonFermion StencilEven"<<std::endl;  StencilEven.Report();
    std::cout << GridLogMessage << "WilsonFermion StencilOdd" <<std::endl;  StencilOdd.Report();
  }
  if ( DhopCalls > 0){
    std::cout << GridLogMessage << "WilsonFermion Stencil     Reporti()"    <<std::endl;  Stencil.Reporti(DhopCalls);
    std::cout << GridLogMessage << "WilsonFermion StencilEven Reporti()"<<std::endl;  StencilEven.Reporti(DhopCalls);
    std::cout << GridLogMessage << "WilsonFermion StencilOdd  Reporti()" <<std::endl;  StencilOdd.Reporti(DhopCalls);
  }
 }
 template<class Impl>
 void WilsonFermion<Impl>::ZeroCounters(void) {
  DhopCalls       = 0; // ok
  DhopCommTime    = 0;
  DhopComputeTime = 0;
  DhopComputeTime2= 0;
  DhopFaceTime    = 0;
  DhopTotalTime   = 0;
  DerivCalls       = 0; // ok
  DerivCommTime    = 0;
  DerivComputeTime = 0;
  DerivDhopComputeTime = 0;
  Stencil.ZeroCounters();
  StencilEven.ZeroCounters();
  StencilOdd.ZeroCounters();
  Stencil.ZeroCountersi();
  StencilEven.ZeroCountersi();
  StencilOdd.ZeroCountersi();
 }
 template <class Impl>
 void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
 {
@@ -319,7 +238,6 @@ template <class Impl>
 void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
                                        GaugeField &mat, const FermionField &A,
                                        const FermionField &B, int dag) {
  DerivCalls++;
  assert((dag == DaggerNo) || (dag == DaggerYes));
  Compressor compressor(dag);
@@ -328,11 +246,8 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
  FermionField Atilde(B.Grid());
  Atilde = A;
  DerivCommTime-=usecond();
  st.HaloExchange(B, compressor);
  DerivCommTime+=usecond();
  DerivComputeTime-=usecond();
  for (int mu = 0; mu < Nd; mu++) {
    ////////////////////////////////////////////////////////////////////////
    // Flip gamma (1+g)<->(1-g) if dag
@@ -340,7 +255,6 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
    int gamma = mu;
    if (!dag) gamma += Nd;
    DerivDhopComputeTime -= usecond();
    int Ls=1;
    Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma);
@@ -348,9 +262,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
    // spin trace outer product
    //////////////////////////////////////////////////
    Impl::InsertForce4D(mat, Btilde, Atilde, mu);
    DerivDhopComputeTime += usecond();
  }
  DerivComputeTime += usecond();
 }
 template <class Impl>
@@ -397,7 +309,6 @@ void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, co
 template <class Impl>
 void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
 {
  DhopCalls+=2;
  conformable(in.Grid(), _grid);  // verifies full grid
  conformable(in.Grid(), out.Grid());
@@ -409,7 +320,6 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
 template <class Impl>
 void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
 {
  DhopCalls++;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check
@@ -422,7 +332,6 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
 template <class Impl>
 void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls++;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check
@@ -487,14 +396,12 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
                                       const FermionField &in,
                                       FermionField &out, int dag)
 {
  DhopTotalTime-=usecond();
 #ifdef GRID_OMP
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
  else
 #endif
    DhopInternalSerial(st,lo,U,in,out,dag);
  DhopTotalTime+=usecond();
 }
 template <class Impl>
@@ -503,6 +410,7 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
 						      const FermionField &in,
 						      FermionField &out, int dag)
 {
  GRID_TRACE("DhopOverlapped");
  assert((dag == DaggerNo) || (dag == DaggerYes));
  Compressor compressor(dag);
@@ -513,53 +421,55 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
  /////////////////////////////
  std::vector<std::vector<CommsRequest_t> > requests;
  st.Prepare();
-  DhopFaceTime-=usecond();
+  {
    GRID_TRACE("Gather");
    st.HaloGather(in,compressor);
-  DhopFaceTime+=usecond();
+  }
-  DhopCommTime -=usecond();
+  tracePush("Communication");
  st.CommunicateBegin(requests);
  /////////////////////////////
  // Overlap with comms
  /////////////////////////////
-  DhopFaceTime-=usecond();
+  {
    GRID_TRACE("MergeSHM");
    st.CommsMergeSHM(compressor);
-  DhopFaceTime+=usecond();
+  }
  /////////////////////////////
  // do the compute interior
  /////////////////////////////
  int Opt = WilsonKernelsStatic::Opt;
  DhopComputeTime-=usecond();
  if (dag == DaggerYes) {
    GRID_TRACE("DhopDagInterior");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
  } else {
    GRID_TRACE("DhopInterior");
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
  }
  DhopComputeTime+=usecond();
  /////////////////////////////
  // Complete comms
  /////////////////////////////
  st.CommunicateComplete(requests);
-  DhopCommTime   +=usecond();
+  tracePop("Communication");
-  DhopFaceTime-=usecond();
+  {
    GRID_TRACE("Merge");
    st.CommsMerge(compressor);
-  DhopFaceTime+=usecond();
+  }
  /////////////////////////////
  // do the compute exterior
  /////////////////////////////
  DhopComputeTime2-=usecond();
  if (dag == DaggerYes) {
    GRID_TRACE("DhopDagExterior");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
  } else {
    GRID_TRACE("DhopExterior");
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
  }
  DhopComputeTime2+=usecond();
 };
@@ -569,20 +479,22 @@ void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
                                       const FermionField &in,
                                       FermionField &out, int dag)
 {
  GRID_TRACE("DhopSerial");
  assert((dag == DaggerNo) || (dag == DaggerYes));
  Compressor compressor(dag);
-  DhopCommTime-=usecond();
+  {
    GRID_TRACE("HaloExchange");
    st.HaloExchange(in, compressor);
-  DhopCommTime+=usecond();
+  }
  DhopComputeTime-=usecond();
  int Opt = WilsonKernelsStatic::Opt;
  if (dag == DaggerYes) {
    GRID_TRACE("DhopDag");
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
  } else {
    GRID_TRACE("Dhop");
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
  }
  DhopComputeTime+=usecond();
 };
 /*Change ends */
@@ -599,11 +511,47 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
                                                   Current curr_type,
                                                   unsigned int mu)
 {
  if(curr_type != Current::Vector)
  {
    std::cout << GridLogError << "Only the conserved vector current is implemented so far." << std::endl;
    exit(1);
  }
  Gamma g5(Gamma::Algebra::Gamma5);
  conformable(_grid, q_in_1.Grid());
  conformable(_grid, q_in_2.Grid());
  conformable(_grid, q_out.Grid());
-  assert(0);
+  auto UGrid= this->GaugeGrid();
  PropagatorField tmp_shifted(UGrid);
  PropagatorField g5Lg5(UGrid);
  PropagatorField R(UGrid);
  PropagatorField gmuR(UGrid);
    Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT,
  };
  Gamma gmu=Gamma(Gmu[mu]);
  g5Lg5=g5*q_in_1*g5;
  tmp_shifted=Cshift(q_in_2,mu,1);
  Impl::multLinkField(R,this->Umu,tmp_shifted,mu);
  gmuR=gmu*R;
  q_out=adj(g5Lg5)*R;
  q_out-=adj(g5Lg5)*gmuR;
  tmp_shifted=Cshift(q_in_1,mu,1);
  Impl::multLinkField(g5Lg5,this->Umu,tmp_shifted,mu);
  g5Lg5=g5*g5Lg5*g5;
  R=q_in_2;
  gmuR=gmu*R;
  q_out-=adj(g5Lg5)*R;
  q_out-=adj(g5Lg5)*gmuR;
 }
@@ -617,9 +565,51 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
                                              unsigned int tmax,
 					      ComplexField &lattice_cmplx)
 {
  if(curr_type != Current::Vector)
  {
    std::cout << GridLogError << "Only the conserved vector current is implemented so far." << std::endl;
    exit(1);
  }
  int tshift = (mu == Nd-1) ? 1 : 0;
  unsigned int LLt    = GridDefaultLatt()[Tp];
  conformable(_grid, q_in.Grid());
  conformable(_grid, q_out.Grid());
-  assert(0);
+  auto UGrid= this->GaugeGrid();
  PropagatorField tmp(UGrid);
  PropagatorField Utmp(UGrid);
  PropagatorField L(UGrid);
  PropagatorField zz (UGrid);
  zz=Zero();
  LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
    Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT,
  };
  Gamma gmu=Gamma(Gmu[mu]);
  tmp = Cshift(q_in,mu,1);
  Impl::multLinkField(Utmp,this->Umu,tmp,mu);
  tmp = ( Utmp*lattice_cmplx - gmu*Utmp*lattice_cmplx ); // Forward hop
  tmp = where((lcoor>=tmin),tmp,zz); // Mask the time
  q_out = where((lcoor<=tmax),tmp,zz); // Position of current complicated
  tmp = q_in *lattice_cmplx;
  tmp = Cshift(tmp,mu,-1);
  Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
  tmp = -( Utmp + gmu*Utmp );
  // Mask the time
  if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
    unsigned int t0 = 0;
    tmp = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
  } else {
    tmp = where((lcoor>=tmin+tshift),tmp,zz);
  }
  q_out+= where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
 }
 NAMESPACE_END(Grid);
@@ -77,23 +77,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define REGISTER
 #ifdef GRID_SIMT
-#define LOAD_CHIMU(ptype)		\
+#define LOAD_CHIMU(Ptype)		\
  {const SiteSpinor & ref (in[offset]);	\
-    Chimu_00=coalescedReadPermute<ptype>(ref()(0)(0),perm,lane);	\
+    Chimu_00=coalescedReadPermute<Ptype>(ref()(0)(0),perm,lane);	\
-    Chimu_01=coalescedReadPermute<ptype>(ref()(0)(1),perm,lane);		\
+    Chimu_01=coalescedReadPermute<Ptype>(ref()(0)(1),perm,lane);		\
-    Chimu_02=coalescedReadPermute<ptype>(ref()(0)(2),perm,lane);		\
+    Chimu_02=coalescedReadPermute<Ptype>(ref()(0)(2),perm,lane);		\
-    Chimu_10=coalescedReadPermute<ptype>(ref()(1)(0),perm,lane);		\
+    Chimu_10=coalescedReadPermute<Ptype>(ref()(1)(0),perm,lane);		\
-    Chimu_11=coalescedReadPermute<ptype>(ref()(1)(1),perm,lane);		\
+    Chimu_11=coalescedReadPermute<Ptype>(ref()(1)(1),perm,lane);		\
-    Chimu_12=coalescedReadPermute<ptype>(ref()(1)(2),perm,lane);		\
+    Chimu_12=coalescedReadPermute<Ptype>(ref()(1)(2),perm,lane);		\
-    Chimu_20=coalescedReadPermute<ptype>(ref()(2)(0),perm,lane);		\
+    Chimu_20=coalescedReadPermute<Ptype>(ref()(2)(0),perm,lane);		\
-    Chimu_21=coalescedReadPermute<ptype>(ref()(2)(1),perm,lane);		\
+    Chimu_21=coalescedReadPermute<Ptype>(ref()(2)(1),perm,lane);		\
-    Chimu_22=coalescedReadPermute<ptype>(ref()(2)(2),perm,lane);		\
+    Chimu_22=coalescedReadPermute<Ptype>(ref()(2)(2),perm,lane);		\
-    Chimu_30=coalescedReadPermute<ptype>(ref()(3)(0),perm,lane);		\
+    Chimu_30=coalescedReadPermute<Ptype>(ref()(3)(0),perm,lane);		\
-    Chimu_31=coalescedReadPermute<ptype>(ref()(3)(1),perm,lane);		\
+    Chimu_31=coalescedReadPermute<Ptype>(ref()(3)(1),perm,lane);		\
-    Chimu_32=coalescedReadPermute<ptype>(ref()(3)(2),perm,lane);	}
+    Chimu_32=coalescedReadPermute<Ptype>(ref()(3)(2),perm,lane);	}
 #define PERMUTE_DIR(dir) ;
 #else
-#define LOAD_CHIMU(ptype)		\
+#define LOAD_CHIMU(Ptype)		\
  {const SiteSpinor & ref (in[offset]);	\
    Chimu_00=ref()(0)(0);\
    Chimu_01=ref()(0)(1);\
@@ -110,10 +110,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define PERMUTE_DIR(dir)			\
  permute##dir(Chi_00,Chi_00);			\
-      permute##dir(Chi_01,Chi_01);\
+  permute##dir(Chi_01,Chi_01);			\
-      permute##dir(Chi_02,Chi_02);\
+  permute##dir(Chi_02,Chi_02);			\
  permute##dir(Chi_10,Chi_10);			\
-      permute##dir(Chi_11,Chi_11);\
+  permute##dir(Chi_11,Chi_11);			\
  permute##dir(Chi_12,Chi_12);
 #endif
@@ -371,10 +371,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  result_32-= UChi_12;
 #define HAND_STENCIL_LEGB(PROJ,PERM,DIR,RECON)	\
  {int ptype;					\
   SE=st.GetEntry(ptype,DIR,ss);		\
-  offset = SE->_offset;				\
+   auto offset = SE->_offset;			\
-  local  = SE->_is_local;			\
+   auto local  = SE->_is_local;			\
-  perm   = SE->_permute;			\
+   auto perm   = SE->_permute;			\
   if ( local ) {				\
     LOAD_CHIMU(PERM);				\
     PROJ;					\
@@ -386,14 +387,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   }						\
   acceleratorSynchronise();			\
   MULT_2SPIN(DIR);				\
-  RECON;					
+   RECON;					}
 #define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)		\
-  SE=&st_p[DIR+8*ss];				\
+  { SE=&st_p[DIR+8*ss];						\
-  ptype=st_perm[DIR];				\
+  auto ptype=st_perm[DIR];					\
-  offset = SE->_offset;				\
+  auto offset = SE->_offset;					\
-  local  = SE->_is_local;			\
+  auto local  = SE->_is_local;					\
-  perm   = SE->_permute;			\
+  auto perm   = SE->_permute;					\
  if ( local ) {						\
    LOAD_CHIMU(PERM);						\
    PROJ;							\
@@ -405,24 +406,25 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  }								\
  acceleratorSynchronise();					\
  MULT_2SPIN(DIR);						\
-  RECON;					
+  RECON;					}
 #define HAND_STENCIL_LEGA(PROJ,PERM,DIR,RECON)				\
-  SE=&st_p[DIR+8*ss];							\
+  { SE=&st_p[DIR+8*ss];							\
-  ptype=st_perm[DIR];							\
+    auto ptype=st_perm[DIR];						\
    /*SE=st.GetEntry(ptype,DIR,ss);*/					\
-  offset = SE->_offset;				\
+    auto offset = SE->_offset;						\
-  perm   = SE->_permute;			\
+    auto perm   = SE->_permute;						\
    LOAD_CHIMU(PERM);							\
    PROJ;								\
    MULT_2SPIN(DIR);							\
-  RECON;					
+    RECON;					}
 #define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON)	\
  { int ptype;						\
  SE=st.GetEntry(ptype,DIR,ss);				\
-  offset = SE->_offset;				\
+  auto offset = SE->_offset;					\
-  local  = SE->_is_local;			\
+  auto local  = SE->_is_local;					\
-  perm   = SE->_permute;			\
+  auto perm   = SE->_permute;					\
  if ( local ) {						\
    LOAD_CHIMU(PERM);						\
    PROJ;							\
@@ -437,18 +439,19 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    MULT_2SPIN(DIR);						\
    RECON;							\
  }								\
-  acceleratorSynchronise();			
+  acceleratorSynchronise();			}
 #define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON)	\
  { int ptype;						\
  SE=st.GetEntry(ptype,DIR,ss);				\
-  offset = SE->_offset;				\
+  auto offset = SE->_offset;				\
  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {		\
    LOAD_CHI;						\
    MULT_2SPIN(DIR);					\
    RECON;						\
    nmu++;						\
  }							\
-  acceleratorSynchronise();			
+  acceleratorSynchronise();			}
 #define HAND_RESULT(ss)					\
  {							\
@@ -563,7 +566,6 @@ WilsonKernels<Impl>::HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p,
  HAND_DECLARATIONS(Simt);
  int offset,local,perm, ptype;
  StencilEntry *SE;
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
@@ -593,9 +595,7 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
  HAND_DECLARATIONS(Simt);
  int offset,local,perm, ptype;
  StencilEntry *SE;
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
@@ -623,8 +623,6 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
  HAND_DECLARATIONS(Simt);
  StencilEntry *SE;
  int offset,local,perm, ptype;
  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
@@ -640,8 +638,8 @@ template<class Impl>  accelerator_inline void
 WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
+  //  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_perm = st._permute_type;					
 // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@@ -652,7 +650,6 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si
  HAND_DECLARATIONS(Simt);
  int offset,local,perm, ptype;
  StencilEntry *SE;
  ZERO_RESULT;
  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
@@ -670,8 +667,8 @@ template<class Impl> accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
+  //  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_perm = st._permute_type;					
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
@@ -682,7 +679,6 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
  HAND_DECLARATIONS(Simt);
  StencilEntry *SE;
  int offset,local,perm, ptype;
  ZERO_RESULT;
  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
@@ -699,8 +695,8 @@ template<class Impl>  accelerator_inline void
 WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
+  //  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_perm = st._permute_type;					
 // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@@ -711,7 +707,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si
  HAND_DECLARATIONS(Simt);
-  int offset, ptype;
+  //  int offset, ptype;
  StencilEntry *SE;
  int nmu=0;
  ZERO_RESULT;
@@ -730,8 +726,8 @@ template<class Impl>  accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
+  //  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_perm = st._permute_type;					
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
@@ -742,7 +738,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldVi
  HAND_DECLARATIONS(Simt);
  StencilEntry *SE;
-  int offset, ptype;
+  //  int offset, ptype;
  int nmu=0;
  ZERO_RESULT;
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
@@ -73,11 +73,6 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
    int perm= SE->_permute;					\
    auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);	\
    spProj(chi,tmp);							\
  } else if ( st.same_node[Dir] ) {				\
    chi = coalescedRead(buf[SE->_offset],lane);			\
  }								\
  acceleratorSynchronise();						\
  if (SE->_is_local || st.same_node[Dir] ) {			\
    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);			\
    Recon(result, Uchi);						\
  }									\
@@ -85,7 +80,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
 #define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon)		\
  SE = st.GetEntry(ptype, Dir, sF);				\
-  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
+  if (!SE->_is_local ) {		\
    auto chi = coalescedRead(buf[SE->_offset],lane);		\
    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
    Recon(result, Uchi);					\
@@ -416,19 +411,6 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
 #undef LoopBody
 }
 #define KERNEL_CALL_TMP(A) \
  const uint64_t    NN = Nsite*Ls;					\
  auto U_p = & U_v[0];							\
  auto in_p = & in_v[0];						\
  auto out_p = & out_v[0];						\
  auto st_p = st_v._entries_p;						\
  auto st_perm = st_v._permute_type;					\
  accelerator_forNB( ss, NN, Simd::Nsimd(), {				\
      int sF = ss;							\
      int sU = ss/Ls;							\
      WilsonKernels<Impl>::A(st_perm,st_p,U_p,buf,sF,sU,in_p,out_p);	\
    });									\
  accelerator_barrier();
 #define KERNEL_CALLNB(A)						\
  const uint64_t    NN = Nsite*Ls;					\
@@ -440,12 +422,35 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
 #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
 #define KERNEL_CALL_EXT(A)						\
  const uint64_t    NN = Nsite*Ls;					\
  const uint64_t    sz = st.surface_list.size();			\
  auto ptr = &st.surface_list[0];					\
  accelerator_forNB( ss, sz, Simd::Nsimd(), {				\
      int sF = ptr[ss];							\
      int sU = sF/Ls;							\
      WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\
    });									\
  accelerator_barrier();
 #define ASM_CALL(A)							\
-  thread_for( ss, Nsite, {						\
+  thread_for( sss, Nsite, {						\
    int ss = st.lo->Reorder(sss);					\
    int sU = ss;							\
    int sF = ss*Ls;							\
    WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v);		\
  });
 #define ASM_CALL_SLICE(A)						\
  auto grid = in.Grid() ;						\
  int nt = grid->LocalDimensions()[4];					\
  int nxyz = Nsite/nt ;							\
  for(int t=0;t<nt;t++){						\
  thread_for( sss, nxyz, {						\
    int ss = t*nxyz+sss;						\
    int sU = ss;							\
    int sF = ss*Ls;							\
    WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v);		\
    });}
 template <class Impl>
 void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
@@ -459,11 +464,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
   if( interior && exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
 #ifdef SYCL_HACK     
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_TMP(HandDhopSiteSycl);    return; }
 #else
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
 #endif     
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSite);    return;}
 #endif
@@ -474,8 +475,10 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);    return;}
 #endif
   } else if( exterior ) {
-     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteExt); return;}
+     // dependent on result of merge
-     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt);    return;}
+     acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt);    return;}
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteExt);    return;}
 #endif
@@ -499,14 +502,16 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDag);     return;}
 #endif
   } else if( interior ) {
-     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
+     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALLNB(GenericDhopSiteDagInt); return;}
-     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt);    return;}
+     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteDagInt);    return;}
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagInt);     return;}
 #endif
   } else if( exterior ) {
-     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
+     // Dependent on result of merge
-     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt);    return;}
+     acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL_EXT(GenericDhopSiteDagExt); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteDagExt);    return;}
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagExt);     return;}
 #endif
@@ -0,0 +1,44 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/ qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master
    Copyright (C) 2017 - 2022
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    Author: Mattia Bruno <mattia.bruno@cern.ch>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
 #include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class CompactWilsonCloverFermion<IMPLEMENTATION, CompactCloverHelpers<IMPLEMENTATION>>; 
 template class CompactWilsonCloverFermion<IMPLEMENTATION, CompactExpCloverHelpers<IMPLEMENTATION>>; 
 NAMESPACE_END(Grid);
@@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Show More
+++ b/Show More