Merge branch 'feature/dirichlet' into feature/dirichlet-gparity

2025-09-19 01:31:04 +01:00 · 2022-07-01 12:12:50 -04:00
parent 4df1e0987f 588c2f3cb1
commit 1f903d9296
91 changed files with 3457 additions and 1258 deletions
--- a/Grid/DisableWarnings.h
+++ b/Grid/DisableWarnings.h
@@ -44,14 +44,22 @@ directory
 #ifdef __NVCC__
 //disables nvcc specific warning in json.hpp
 #pragma clang diagnostic ignored "-Wdeprecated-register"
 #if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
 //disables nvcc specific warning in json.hpp
 #pragma nv_diag_suppress unsigned_compare_with_zero
 #pragma nv_diag_suppress cast_to_qualified_type
 //disables nvcc specific warning in many files
 #pragma nv_diag_suppress esa_on_defaulted_function_ignored
 #pragma nv_diag_suppress extra_semicolon
 #else
 //disables nvcc specific warning in json.hpp
 #pragma diag_suppress unsigned_compare_with_zero
 #pragma diag_suppress cast_to_qualified_type
 //disables nvcc specific warning in many files
 #pragma diag_suppress esa_on_defaulted_function_ignored
 #pragma diag_suppress extra_semicolon
-
+#endif
 //Eigen only
 #endif
 // Disable vectorisation in Eigen on the Power8/9 and PowerPC
--- a/Grid/Grid_Eigen_Dense.h
+++ b/Grid/Grid_Eigen_Dense.h
@@ -14,7 +14,11 @@
 /* NVCC save and restore compile environment*/
 #ifdef __NVCC__
 #pragma push
 #if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
 #pragma nv_diag_suppress code_is_unreachable
 #else
 #pragma diag_suppress code_is_unreachable
 #endif
 #pragma push_macro("__CUDA_ARCH__")
 #pragma push_macro("__NVCC__")
 #pragma push_macro("__CUDACC__")
--- a/Grid/algorithms/iterative/ConjugateGradient.h
+++ b/Grid/algorithms/iterative/ConjugateGradient.h
@@ -120,6 +120,9 @@ public:
    SolverTimer.Start();
    int k;
    for (k = 1; k <= MaxIterations; k++) {
      GridStopWatch IterationTimer;
      IterationTimer.Start();
      c = cp;
      MatrixTimer.Start();
@@ -152,8 +155,14 @@ public:
      LinearCombTimer.Stop();
      LinalgTimer.Stop();
-      std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
+      IterationTimer.Stop();
      if ( (k % 500) == 0 ) {
 	std::cout << GridLogMessage << "ConjugateGradient: Iteration " << k
                << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
      } else { 
 	std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
 		  << " residual " << sqrt(cp/ssq) << " target " << Tolerance << " took " << IterationTimer.Elapsed() << std::endl;
      }
      // Stopping condition
      if (cp <= rsq) {
@@ -170,13 +179,13 @@ public:
 		  << "\tTrue residual " << true_residual
 		  << "\tTarget " << Tolerance << std::endl;
-        std::cout << GridLogIterative << "Time breakdown "<<std::endl;
+        std::cout << GridLogMessage << "Time breakdown "<<std::endl;
-	std::cout << GridLogIterative << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
+	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
-	std::cout << GridLogIterative << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
+	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
-	std::cout << GridLogIterative << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
+	std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
-	std::cout << GridLogIterative << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
+	std::cout << GridLogMessage << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
-	std::cout << GridLogIterative << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
+	std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
-	std::cout << GridLogIterative << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
+	std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
@@ -44,7 +44,7 @@ public:
  using OperatorFunction<Field>::operator();
-  RealD   Tolerance;
+  //  RealD   Tolerance;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
@@ -325,7 +325,7 @@ public:
      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tAXPY     " << AXPYTimer.Elapsed()     <<std::endl;
-      std::cout << GridLogMessage << "\tMarix    " << MatrixTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tMatrix   " << MatrixTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
      IterationsToComplete = k;	
--- a/Grid/algorithms/iterative/Deflation.h
+++ b/Grid/algorithms/iterative/Deflation.h
@@ -113,6 +113,42 @@ public:
    blockPromote(guess_coarse,guess,subspace);
    guess.Checkerboard() = src.Checkerboard();
  };
  void operator()(const std::vector<FineField> &src,std::vector<FineField> &guess) {
    int Nevec = (int)evec_coarse.size();
    int Nsrc = (int)src.size();
    // make temp variables
    std::vector<CoarseField> src_coarse(Nsrc,evec_coarse[0].Grid());
    std::vector<CoarseField> guess_coarse(Nsrc,evec_coarse[0].Grid());    
    //Preporcessing
    std::cout << GridLogMessage << "Start BlockProject for loop" << std::endl;
    for (int j=0;j<Nsrc;j++)
    {
    guess_coarse[j] = Zero();
    std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
    blockProject(src_coarse[j],src[j],subspace);
    }
    //deflation set up for eigen vector batchsize 1 and source batch size equal number of sources
    std::cout << GridLogMessage << "Start ProjectAccum for loop" << std::endl;
    for (int i=0;i<Nevec;i++)
    {
      std::cout << GridLogMessage << "ProjectAccum Nvec: " << i << std::endl;
      const CoarseField & tmp = evec_coarse[i];
      for (int j=0;j<Nsrc;j++)
      {
        axpy(guess_coarse[j],TensorRemove(innerProduct(tmp,src_coarse[j])) / eval_coarse[i],tmp,guess_coarse[j]);
      }
    }
    //postprocessing
    std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl;
    for (int j=0;j<Nsrc;j++)
    {
    std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
    blockPromote(guess_coarse[j],guess[j],subspace);
    guess[j].Checkerboard() = src[j].Checkerboard();
    }
  };
  };
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -372,7 +372,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  double off_node_bytes=0.0;
  int tag;
-  if ( dox ) {
+  if ( dor ) {
    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
      tag= dir+from*32;
      ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
@@ -382,7 +382,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
    }
  }
-  if (dor) {
+  if (dox) {
    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
      tag= dir+_processor*32;
      ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
@@ -390,16 +390,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
      list.push_back(xrq);
      off_node_bytes+=bytes;
    } else {
    // TODO : make a OMP loop on CPU, call threaded bcopy
      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
      assert(shm!=NULL);
      //    std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl;
      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
    }
  }
  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
    this->StencilSendToRecvFromComplete(list,dir);
    list.resize(0);
  }
  return off_node_bytes;
--- a/Grid/lattice/Lattice_peekpoke.h
+++ b/Grid/lattice/Lattice_peekpoke.h
@@ -125,6 +125,12 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
 //////////////////////////////////////////////////////////
 // Peek a scalar object from the SIMD array
 //////////////////////////////////////////////////////////
 template<class vobj>
 typename vobj::scalar_object peekSite(const Lattice<vobj> &l,const Coordinate &site){
  typename vobj::scalar_object s;
  peekSite(s,l,site);
  return s;
 }        
 template<class vobj,class sobj>
 void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -232,6 +232,7 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
  const uint64_t sites = grid->oSites();
  // Might make all code paths go this way.
 #if 0
  typedef decltype(innerProductD(vobj(),vobj())) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
@@ -241,15 +242,31 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
    autoView( right_v,right, AcceleratorRead);
    // GPU - SIMT lane compliance...
-    accelerator_for( ss, sites, 1,{
+    accelerator_for( ss, sites, nsimd,{
-	auto x_l = left_v[ss];
+	auto x_l = left_v(ss);
-	auto y_l = right_v[ss];
+	auto y_l = right_v(ss);
-	inner_tmp_v[ss]=innerProductD(x_l,y_l);
+	coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l));
    });
  }
 #else
  typedef decltype(innerProduct(vobj(),vobj())) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  {
    autoView( left_v , left, AcceleratorRead);
    autoView( right_v,right, AcceleratorRead);
    // GPU - SIMT lane compliance...
    accelerator_for( ss, sites, nsimd,{
 	auto x_l = left_v(ss);
 	auto y_l = right_v(ss);
 	coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
    });
  }
 #endif
  // This is in single precision and fails some tests
-  auto anrm = sum(inner_tmp_v,sites);  
+  auto anrm = sumD(inner_tmp_v,sites);  
  nrm = anrm;
  return nrm;
 }
@@ -283,7 +300,7 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
  conformable(x,y);
  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_typeD vector_type;
+  //  typedef typename vobj::vector_typeD vector_type;
  RealD  nrm;
  GridBase *grid = x.Grid();
@@ -295,17 +312,29 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
  autoView( x_v, x, AcceleratorRead);
  autoView( y_v, y, AcceleratorRead);
  autoView( z_v, z, AcceleratorWrite);
-
+#if 0
  typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
-  accelerator_for( ss, sites, 1,{
+  accelerator_for( ss, sites, nsimd,{
-      auto tmp = a*x_v[ss]+b*y_v[ss];
+      auto tmp = a*x_v(ss)+b*y_v(ss);
-      inner_tmp_v[ss]=innerProductD(tmp,tmp);
+      coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp));
-      z_v[ss]=tmp;
+      coalescedWrite(z_v[ss],tmp);
  });
  nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
 #else
  typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  accelerator_for( ss, sites, nsimd,{
      auto tmp = a*x_v(ss)+b*y_v(ss);
      coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
      coalescedWrite(z_v[ss],tmp);
  });
  nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
 #endif
  grid->GlobalSum(nrm);
  return nrm; 
 }
--- a/Grid/lattice/Lattice_rng.h
+++ b/Grid/lattice/Lattice_rng.h
@@ -424,9 +424,32 @@ public:
    // MT implementation does not implement fast discard even though
    // in principle this is possible
    ////////////////////////////////////////////////
 #if 1
    thread_for( lidx, _grid->lSites(), {
 	int gidx;
 	int o_idx;
 	int i_idx;
 	int rank;
 	Coordinate pcoor;
 	Coordinate lcoor;
 	Coordinate gcoor;
 	_grid->LocalIndexToLocalCoor(lidx,lcoor);
 	pcoor=_grid->ThisProcessorCoor();
 	_grid->ProcessorCoorLocalCoorToGlobalCoor(pcoor,lcoor,gcoor);
 	_grid->GlobalCoorToGlobalIndex(gcoor,gidx);
 	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
 	assert(rank == _grid->ThisRank() );
 	int l_idx=generator_idx(o_idx,i_idx);
 	_generators[l_idx] = master_engine;
 	Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
    });
 #else
    // Everybody loops over global volume.
    thread_for( gidx, _grid->_gsites, {
 	// Where is it?
 	int rank;
 	int o_idx;
@@ -443,6 +466,7 @@ public:
 	  Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
 	}
    });
 #endif
 #else 
    ////////////////////////////////////////////////////////////////
    // Machine and thread decomposition dependent seeding is efficient
--- a/Grid/log/Log.cc
+++ b/Grid/log/Log.cc
@@ -65,6 +65,7 @@ GridLogger GridLogSolver (1, "Solver", GridLogColours, "NORMAL");
 GridLogger GridLogError  (1, "Error" , GridLogColours, "RED");
 GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
 GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
 GridLogger GridLogMemory (1, "Memory", GridLogColours, "NORMAL");
 GridLogger GridLogDebug  (1, "Debug", GridLogColours, "PURPLE");
 GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
 GridLogger GridLogIterative  (1, "Iterative", GridLogColours, "BLUE");
@@ -72,9 +73,10 @@ GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
 GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE");
 void GridLogConfigure(std::vector<std::string> &logstreams) {
-  GridLogError.Active(0);
+  GridLogError.Active(1);
  GridLogWarning.Active(0);
  GridLogMessage.Active(1); // at least the messages should be always on
  GridLogMemory.Active(0); // at least the messages should be always on
  GridLogIterative.Active(0);
  GridLogDebug.Active(0);
  GridLogPerformance.Active(0);
@@ -83,7 +85,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
  GridLogHMC.Active(1);
  for (int i = 0; i < logstreams.size(); i++) {
-    if (logstreams[i] == std::string("Error"))       GridLogError.Active(1);
+    if (logstreams[i] == std::string("Memory"))      GridLogMemory.Active(1);
    if (logstreams[i] == std::string("Warning"))     GridLogWarning.Active(1);
    if (logstreams[i] == std::string("NoMessage"))   GridLogMessage.Active(0);
    if (logstreams[i] == std::string("Iterative"))   GridLogIterative.Active(1);
--- a/Grid/log/Log.h
+++ b/Grid/log/Log.h
@@ -183,6 +183,7 @@ extern GridLogger GridLogPerformance;
 extern GridLogger GridLogIterative  ;
 extern GridLogger GridLogIntegrator  ;
 extern GridLogger GridLogHMC;
 extern GridLogger GridLogMemory;
 extern Colours    GridLogColours;
 std::string demangle(const char* name) ;
--- a/Grid/parallelIO/IldgIO.h
+++ b/Grid/parallelIO/IldgIO.h
@@ -31,6 +31,7 @@ directory
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <string>
 #include <map>
 #include <pwd.h>
@@ -654,7 +655,8 @@ class IldgWriter : public ScidacWriter {
    // Fill ILDG header data struct
    //////////////////////////////////////////////////////
    ildgFormat ildgfmt ;
-    ildgfmt.field     = std::string("su3gauge");
+    const std::string stNC = std::to_string( Nc ) ;
    ildgfmt.field          = std::string("su"+stNC+"gauge");
    if ( format == std::string("IEEE32BIG") ) { 
      ildgfmt.precision = 32;
@@ -871,7 +873,8 @@ class IldgReader : public GridLimeReader {
    } else { 
      assert(found_ildgFormat);
-      assert ( ildgFormat_.field == std::string("su3gauge") );
+      const std::string stNC = std::to_string( Nc ) ;
      assert ( ildgFormat_.field == std::string("su"+stNC+"gauge") );
      ///////////////////////////////////////////////////////////////////////////////////////
      // Populate our Grid metadata as best we can
@@ -879,7 +882,7 @@ class IldgReader : public GridLimeReader {
      std::ostringstream vers; vers << ildgFormat_.version;
      FieldMetaData_.hdr_version = vers.str();
-      FieldMetaData_.data_type = std::string("4D_SU3_GAUGE_3X3");
+      FieldMetaData_.data_type = std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC);
      FieldMetaData_.nd=4;
      FieldMetaData_.dimension.resize(4);
--- a/Grid/parallelIO/MetaData.h
+++ b/Grid/parallelIO/MetaData.h
@@ -6,8 +6,8 @@
    Copyright (C) 2015
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -203,20 +203,24 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
 //////////////////////////////////////////////////////////////////////
 inline void reconstruct3(LorentzColourMatrix & cm)
 {
-  const int x=0;
+  assert( Nc < 4 && Nc > 1 ) ;
  const int y=1;
  const int z=2;
  for(int mu=0;mu<Nd;mu++){
    #if Nc == 2
      cm(mu)()(1,0) = -adj(cm(mu)()(0,y)) ;
      cm(mu)()(1,1) =  adj(cm(mu)()(0,x)) ;
    #else
      const int x=0 , y=1 , z=2 ; // a little disinenuous labelling
      cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
      cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
      cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
    #endif
  }
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Some data types for intermediate storage
 ////////////////////////////////////////////////////////////////////////////////
-template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;
+template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, Nc-1>, Nd >;
 typedef iLorentzColour2x3<Complex>  LorentzColour2x3;
 typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
@@ -278,7 +282,6 @@ struct GaugeSimpleMunger{
 template <class fobj, class sobj>
 struct GaugeSimpleUnmunger {
  void operator()(sobj &in, fobj &out) {
    for (int mu = 0; mu < Nd; mu++) {
      for (int i = 0; i < Nc; i++) {
@@ -317,8 +320,8 @@ template<class fobj,class sobj>
 struct Gauge3x2munger{
  void operator() (fobj &in,sobj &out){
    for(int mu=0;mu<Nd;mu++){
-      for(int i=0;i<2;i++){
+      for(int i=0;i<Nc-1;i++){
-	for(int j=0;j<3;j++){
+	for(int j=0;j<Nc;j++){
 	  out(mu)()(i,j) = in(mu)(i)(j);
 	}}
    }
@@ -330,8 +333,8 @@ template<class fobj,class sobj>
 struct Gauge3x2unmunger{
  void operator() (sobj &in,fobj &out){
    for(int mu=0;mu<Nd;mu++){
-      for(int i=0;i<2;i++){
+      for(int i=0;i<Nc-1;i++){
-	for(int j=0;j<3;j++){
+	for(int j=0;j<Nc;j++){
 	  out(mu)(i)(j) = in(mu)()(i,j);
 	}}
    }
--- a/Grid/parallelIO/NerscIO.h
+++ b/Grid/parallelIO/NerscIO.h
@@ -9,6 +9,7 @@
    Author: Matt Spraggs <matthew.spraggs@gmail.com>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -30,6 +31,8 @@
 #ifndef GRID_NERSC_IO_H
 #define GRID_NERSC_IO_H
 #include <string>
 NAMESPACE_BEGIN(Grid);
 using namespace Grid;
@@ -147,15 +150,17 @@ public:
    std::string format(header.floating_point);
-    int ieee32big = (format == std::string("IEEE32BIG"));
+    const int ieee32big = (format == std::string("IEEE32BIG"));
-    int ieee32    = (format == std::string("IEEE32"));
+    const int ieee32    = (format == std::string("IEEE32"));
-    int ieee64big = (format == std::string("IEEE64BIG"));
+    const int ieee64big = (format == std::string("IEEE64BIG"));
-    int ieee64    = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
+    const int ieee64    = (format == std::string("IEEE64") || \
 			   format == std::string("IEEE64LITTLE"));
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    // depending on datatype, set up munger;
    // munger is a function of <floating point, Real, data_type>
-    if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
+    const std::string stNC = std::to_string( Nc ) ;
    if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE") ) {
      if ( ieee32 || ieee32big ) {
 	BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F> 
 	  (Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
@@ -166,7 +171,7 @@ public:
 	  (Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
 	   nersc_csum,scidac_csuma,scidac_csumb);
      }
-    } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
+    } else if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC) ) {
      if ( ieee32 || ieee32big ) {
 	BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF>
 	  (Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
@@ -211,27 +216,29 @@ public:
  template<class GaugeStats=PeriodicGaugeStatistics>
  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
-					std::string ens_label = std::string("DWF"))
+					std::string ens_label = std::string("DWF"),
 					std::string ens_id = std::string("UKQCD"),
 					unsigned int sequence_number = 1)
  {
-    writeConfiguration(Umu,file,0,1,ens_label);
+    writeConfiguration(Umu,file,0,1,ens_label,ens_id,sequence_number);
  }
  template<class GaugeStats=PeriodicGaugeStatistics>
  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
 					int two_row,
 					int bits32,
-					std::string ens_label = std::string("DWF"))
+					std::string ens_label = std::string("DWF"),
 					std::string ens_id = std::string("UKQCD"),
 					unsigned int sequence_number = 1)
  {
    typedef vLorentzColourMatrixD vobj;
    typedef typename vobj::scalar_object sobj;
    FieldMetaData header;
-    ///////////////////////////////////////////
+    header.sequence_number = sequence_number;
-    // Following should become arguments
+    header.ensemble_id     = ens_id;
    ///////////////////////////////////////////
    header.sequence_number = 1;
    header.ensemble_id     = std::string("UKQCD");
    header.ensemble_label  = ens_label;
    header.hdr_version     = "1.0" ;
    typedef LorentzColourMatrixD fobj3D;
    typedef LorentzColour2x3D    fobj2D;
@@ -245,10 +252,14 @@ public:
    uint64_t offset;
-    // Sod it -- always write 3x3 double
+    // Sod it -- always write NcxNc double
    header.floating_point  = std::string("IEEE64BIG");
-    header.data_type      = std::string("4D_SU3_GAUGE_3x3");
+    const std::string stNC = std::to_string( Nc ) ;
-    GaugeSimpleUnmunger<fobj3D,sobj> munge;
+    if( two_row ) {
      header.data_type = std::string("4D_SU" + stNC + "_GAUGE" );
    } else {
      header.data_type = std::string("4D_SU" + stNC + "_GAUGE_" + stNC + "x" + stNC );
    }
    if ( grid->IsBoss() ) { 
      truncate(file);
      offset = writeHeader(header,file);
@@ -256,8 +267,15 @@ public:
    grid->Broadcast(0,(void *)&offset,sizeof(offset));
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    if( two_row ) {
      Gauge3x2unmunger<fobj2D,sobj> munge;
      BinaryIO::writeLatticeObject<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point,
 						nersc_csum,scidac_csuma,scidac_csumb);
    } else {
      GaugeSimpleUnmunger<fobj3D,sobj> munge;
      BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
 						nersc_csum,scidac_csuma,scidac_csumb);
    }
    header.checksum = nersc_csum;
    if ( grid->IsBoss() ) { 
      writeHeader(header,file);
@@ -290,7 +308,6 @@ public:
    MachineCharacteristics(header);
    uint64_t offset;
 #ifdef RNG_RANLUX
    header.floating_point = std::string("UINT64");
    header.data_type      = std::string("RANLUX48");
--- a/Grid/perfmon/PerfCount.h
+++ b/Grid/perfmon/PerfCount.h
@@ -72,17 +72,9 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
 inline uint64_t cyclecount(void){ 
  return 0;
 }
 #define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx")
 #define __SSC_STOP  __SSC_MARK(0x110)
 #define __SSC_START __SSC_MARK(0x111)
 #else
 #define __SSC_MARK(mark) 
 #define __SSC_STOP  
 #define __SSC_START 
 /*
 * cycle counters arch dependent
 */
--- a/Grid/perfmon/Timer.h
+++ b/Grid/perfmon/Timer.h
@@ -39,9 +39,9 @@ NAMESPACE_BEGIN(Grid)
 // C++11 time facilities better?
 inline double usecond(void) {
  struct timeval tv;
-#ifdef TIMERS_ON
+  tv.tv_sec = 0;
  tv.tv_usec = 0;
  gettimeofday(&tv,NULL);
 #endif
  return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
 }
--- a/Grid/pugixml/pugixml.cc
+++ b/Grid/pugixml/pugixml.cc
@@ -16,8 +16,12 @@
 #ifdef __NVCC__
 #pragma push
 #if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
 #pragma nv_diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
 #else
 #pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
 #endif
 #endif
 #include "pugixml.h"
--- a/Grid/qcd/action/ActionParams.h
+++ b/Grid/qcd/action/ActionParams.h
@@ -38,24 +38,32 @@ NAMESPACE_BEGIN(Grid);
 struct GparityWilsonImplParams {
  Coordinate twists;
                     //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
-  GparityWilsonImplParams() : twists(Nd, 0) {};
+  Coordinate dirichlet; // Blocksize of dirichlet BCs
  GparityWilsonImplParams() : twists(Nd, 0), dirichlet(Nd, 0) {};
 };
 struct WilsonImplParams {
  bool overlapCommsCompute;
  Coordinate dirichlet; // Blocksize of dirichlet BCs
  AcceleratorVector<Real,Nd> twist_n_2pi_L;
  AcceleratorVector<Complex,Nd> boundary_phases;
  WilsonImplParams()  {
    dirichlet.resize(Nd,0);
    boundary_phases.resize(Nd, 1.0);
      twist_n_2pi_L.resize(Nd, 0.0);
  };
  WilsonImplParams(const AcceleratorVector<Complex,Nd> phi) : boundary_phases(phi), overlapCommsCompute(false) {
    twist_n_2pi_L.resize(Nd, 0.0);
    dirichlet.resize(Nd,0);
  }
 };
 struct StaggeredImplParams {
-  StaggeredImplParams()  {};
+  Coordinate dirichlet; // Blocksize of dirichlet BCs
  StaggeredImplParams()
  {
    dirichlet.resize(Nd,0);
  };
 };
  struct OneFlavourRationalParams : Serializable {
--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@@ -68,9 +68,17 @@ public:
  ///////////////////////////////////////////////////////////////
  // Support for MADWF tricks
  ///////////////////////////////////////////////////////////////
-  virtual RealD Mass(void) { return mass; };
+  RealD Mass(void) { return (mass_plus + mass_minus) / 2.0; };
  RealD MassPlus(void) { return mass_plus; };
  RealD MassMinus(void) { return mass_minus; };
  void  SetMass(RealD _mass) { 
-    mass=_mass; 
+    mass_plus=mass_minus=_mass; 
    SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs
  } ;
  void  SetMass(RealD _mass_plus, RealD _mass_minus) { 
    mass_plus=_mass_plus;
    mass_minus=_mass_minus;
    SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs
  } ;
  void  P(const FermionField &psi, FermionField &chi);
@@ -108,7 +116,7 @@ public:
  void   MeooeDag5D    (const FermionField &in, FermionField &out);
  //    protected:
-  RealD mass;
+  RealD mass_plus, mass_minus;
  // Save arguments to SetCoefficientsInternal
  Vector<Coeff_t> _gamma;
--- a/Grid/qcd/action/fermion/CloverHelpers.h
+++ b/Grid/qcd/action/fermion/CloverHelpers.h
@@ -0,0 +1,435 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/WilsonCloverFermionImplementation.h
    Copyright (C) 2017 - 2022
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    Author: Mattia Bruno <mattia.bruno@cern.ch>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 #include <Grid/Grid.h>
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
 ////////////////////////////////////////////
 // Standard Clover
 //   (4+m0) + csw * clover_term
 // Exp Clover
 //   (4+m0) * exp(csw/(4+m0) clover_term)
 //   = (4+m0) + csw * clover_term + ...
 ////////////////////////////////////////////
 NAMESPACE_BEGIN(Grid);
 //////////////////////////////////
 // Generic Standard Clover
 //////////////////////////////////
 template<class Impl>
 class CloverHelpers: public WilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  typedef WilsonCloverHelpers<Impl> Helpers;
  static void Instantiate(CloverField& CloverTerm, CloverField& CloverTermInv, RealD csw_t, RealD diag_mass) {
    GridBase *grid = CloverTerm.Grid();
    CloverTerm += diag_mass;
    int lvol = grid->lSites();
    int DimRep = Impl::Dimension;
    {
      autoView(CTv,CloverTerm,CpuRead);
      autoView(CTIv,CloverTermInv,CpuWrite);
      thread_for(site, lvol, {
        Coordinate lcoor;
        grid->LocalIndexToLocalCoor(site, lcoor);
        Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
        Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
        typename SiteClover::scalar_object Qx = Zero(), Qxinv = Zero();
        peekLocalSite(Qx, CTv, lcoor);
        for (int j = 0; j < Ns; j++)
          for (int k = 0; k < Ns; k++)
            for (int a = 0; a < DimRep; a++)
              for (int b = 0; b < DimRep; b++){
                auto zz =  Qx()(j, k)(a, b);
                EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
              }
        EigenInvCloverOp = EigenCloverOp.inverse();
        for (int j = 0; j < Ns; j++)
          for (int k = 0; k < Ns; k++)
            for (int a = 0; a < DimRep; a++)
              for (int b = 0; b < DimRep; b++)
                Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
               pokeLocalSite(Qxinv, CTIv, lcoor);
      });
    }
  }
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    return Helpers::Cmunu(U, lambda, mu, nu);
  }
 };
 //////////////////////////////////
 // Generic Exp Clover
 //////////////////////////////////
 template<class Impl>
 class ExpCloverHelpers: public WilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
  typedef WilsonCloverHelpers<Impl> Helpers;
  // Can this be avoided?
  static void IdentityTimesC(const CloverField& in, RealD c) {
    int DimRep = Impl::Dimension;
    autoView(in_v, in, AcceleratorWrite);
    accelerator_for(ss, in.Grid()->oSites(), 1, {
      for (int sa=0; sa<Ns; sa++)
        for (int ca=0; ca<DimRep; ca++)
          in_v[ss]()(sa,sa)(ca,ca) = c;
    });
  }
  static int getNMAX(RealD prec, RealD R) {
    /* compute stop condition for exponential */
    int NMAX=1;
    RealD cond=R*R/2.;
    while (cond*std::exp(R)>prec) {
      NMAX++;
      cond*=R/(double)(NMAX+1);
    }
    return NMAX;
  }
  static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
  static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
  static void Instantiate(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
    GridBase* grid = Clover.Grid();
    CloverField ExpClover(grid);
    int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);
    Clover *= (1.0/diag_mass);
    // Taylor expansion, slow but generic
    // Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
    // qN = cN
    // qn = cn + qn+1 X
    std::vector<RealD> cn(NMAX+1);
    cn[0] = 1.0;
    for (int i=1; i<=NMAX; i++)
      cn[i] = cn[i-1] / RealD(i);
    ExpClover = Zero();
    IdentityTimesC(ExpClover, cn[NMAX]);
    for (int i=NMAX-1; i>=0; i--)
      ExpClover = ExpClover * Clover + cn[i];
    // prepare inverse
    CloverInv = (-1.0)*Clover;
    Clover = ExpClover * diag_mass;
    ExpClover = Zero();
    IdentityTimesC(ExpClover, cn[NMAX]);
    for (int i=NMAX-1; i>=0; i--)
      ExpClover = ExpClover * CloverInv + cn[i];
    CloverInv = ExpClover * (1.0/diag_mass);
  }
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    assert(0);
    return lambda;
  }
 };
 //////////////////////////////////
 // Compact Standard Clover
 //////////////////////////////////
 template<class Impl>
 class CompactCloverHelpers: public CompactWilsonCloverHelpers<Impl>,
                            public WilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);
  typedef WilsonCloverHelpers<Impl> Helpers;
  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
  static void MassTerm(CloverField& Clover, RealD diag_mass) {
    Clover += diag_mass;
  }
  static void Exponentiate_Clover(CloverDiagonalField& Diagonal,
                          CloverTriangleField& Triangle,
                          RealD csw_t, RealD diag_mass) {
    // Do nothing
  }
  // TODO: implement Cmunu for better performances with compact layout, but don't do it
  // here, but rather in WilsonCloverHelpers.h -> CompactWilsonCloverHelpers
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    return Helpers::Cmunu(U, lambda, mu, nu);
  }
 };
 //////////////////////////////////
 // Compact Exp Clover
 //////////////////////////////////
 template<class Impl>
 class CompactExpCloverHelpers: public CompactWilsonCloverHelpers<Impl> {
 public:
  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);
  template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
  static void MassTerm(CloverField& Clover, RealD diag_mass) {
    // do nothing!
    // mass term is multiplied to exp(Clover) below
  }
  static int getNMAX(RealD prec, RealD R) {
    /* compute stop condition for exponential */
    int NMAX=1;
    RealD cond=R*R/2.;
    while (cond*std::exp(R)>prec) {
      NMAX++;
      cond*=R/(double)(NMAX+1);
    }
    return NMAX;
  }
  static int getNMAX(Lattice<iImplCloverDiagonal<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
  static int getNMAX(Lattice<iImplCloverDiagonal<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
  static void ExponentiateHermitean6by6(const iMatrix<ComplexD,6> &arg, const RealD& alpha, const std::vector<RealD>& cN, const int Niter, iMatrix<ComplexD,6>& dest){
  	  typedef iMatrix<ComplexD,6> mat;
  	  RealD qn[6];
  	  RealD qnold[6];
  	  RealD p[5];
  	  RealD trA2, trA3, trA4;
  	  mat A2, A3, A4, A5;
  	  A2 = alpha * alpha * arg * arg;
  	  A3 = alpha * arg * A2;
  	  A4 = A2 * A2;
  	  A5 = A2 * A3;
  	  trA2 = toReal( trace(A2) );
  	  trA3 = toReal( trace(A3) );
  	  trA4 = toReal( trace(A4));
  	  p[0] = toReal( trace(A3 * A3)) / 6.0 - 0.125 * trA4 * trA2 - trA3 * trA3 / 18.0 + trA2 * trA2 * trA2/ 48.0;
  	  p[1] = toReal( trace(A5)) / 5.0 - trA3 * trA2 / 6.0;
  	  p[2] = toReal( trace(A4)) / 4.0 - 0.125 * trA2 * trA2;
  	  p[3] = trA3 / 3.0;
  	  p[4] = 0.5 * trA2;
  	  qnold[0] = cN[Niter];
  	  qnold[1] = 0.0;
  	  qnold[2] = 0.0;
  	  qnold[3] = 0.0;
  	  qnold[4] = 0.0;
  	  qnold[5] = 0.0;
  	  for(int i = Niter-1; i >= 0; i--)
  	  {
  	   qn[0] = p[0] * qnold[5] + cN[i];
  	   qn[1] = p[1] * qnold[5] + qnold[0];
  	   qn[2] = p[2] * qnold[5] + qnold[1];
  	   qn[3] = p[3] * qnold[5] + qnold[2];
  	   qn[4] = p[4] * qnold[5] + qnold[3];
  	   qn[5] = qnold[4];
  	   qnold[0] = qn[0];
  	   qnold[1] = qn[1];
  	   qnold[2] = qn[2];
  	   qnold[3] = qn[3];
  	   qnold[4] = qn[4];
  	   qnold[5] = qn[5];
  	  }
  	  mat unit(1.0);
  	  dest = (qn[0] * unit + qn[1] * alpha * arg + qn[2] * A2 + qn[3] * A3 + qn[4] * A4 + qn[5] * A5);
    }
  static void Exponentiate_Clover(CloverDiagonalField& Diagonal, CloverTriangleField& Triangle, RealD csw_t, RealD diag_mass) {
    GridBase* grid = Diagonal.Grid();
    int NMAX = getNMAX(Diagonal, 3.*csw_t/diag_mass);
    //
    // Implementation completely in Daniel's layout
    //
    // Taylor expansion with Cayley-Hamilton recursion
    // underlying Horner scheme as above
    std::vector<RealD> cn(NMAX+1);
    cn[0] = 1.0;
    for (int i=1; i<=NMAX; i++){
      cn[i] = cn[i-1] / RealD(i);
    }
      // Taken over from Daniel's implementation
      conformable(Diagonal, Triangle);
      long lsites = grid->lSites();
    {
      typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
      typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
      typedef iMatrix<ComplexD,6> mat;
      autoView(diagonal_v,  Diagonal,  CpuRead);
      autoView(triangle_v,  Triangle,  CpuRead);
      autoView(diagonalExp_v, Diagonal, CpuWrite);
      autoView(triangleExp_v, Triangle, CpuWrite);
      thread_for(site, lsites, { // NOTE: Not on GPU because of (peek/poke)LocalSite
    	  mat srcCloverOpUL(0.0); // upper left block
    	  mat srcCloverOpLR(0.0); // lower right block
    	  mat ExpCloverOp;
        scalar_object_diagonal diagonal_tmp     = Zero();
        scalar_object_diagonal diagonal_exp_tmp = Zero();
        scalar_object_triangle triangle_tmp     = Zero();
        scalar_object_triangle triangle_exp_tmp = Zero();
        Coordinate lcoor;
        grid->LocalIndexToLocalCoor(site, lcoor);
        peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
        peekLocalSite(triangle_tmp, triangle_v, lcoor);
        int block;
        block = 0;
        for(int i = 0; i < 6; i++){
        	for(int j = 0; j < 6; j++){
        		if (i == j){
        			srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
        		}
        		else{
        			srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
        		}
        	}
        }
        block = 1;
        for(int i = 0; i < 6; i++){
          	for(int j = 0; j < 6; j++){
           		if (i == j){
           			srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
           		}
           		else{
           			srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
           		}
            }
        }
        // exp(Clover)
        ExponentiateHermitean6by6(srcCloverOpUL,1.0/diag_mass,cn,NMAX,ExpCloverOp);
        block = 0;
        for(int i = 0; i < 6; i++){
        	for(int j = 0; j < 6; j++){
            	if (i == j){
            		diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
            	}
            	else if(i < j){
            		triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
            	}
           	}
        }
        ExponentiateHermitean6by6(srcCloverOpLR,1.0/diag_mass,cn,NMAX,ExpCloverOp);
        block = 1;
        for(int i = 0; i < 6; i++){
        	for(int j = 0; j < 6; j++){
              	if (i == j){
              		diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
               	}
               	else if(i < j){
               		triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
               	}
            }
        }
        pokeLocalSite(diagonal_exp_tmp, diagonalExp_v, lcoor);
        pokeLocalSite(triangle_exp_tmp, triangleExp_v, lcoor);
      });
    }
    Diagonal *= diag_mass;
    Triangle *= diag_mass;
  }
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    assert(0);
    return lambda;
  }
 };
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h
@@ -31,6 +31,7 @@
 #include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
@@ -85,7 +86,7 @@ NAMESPACE_BEGIN(Grid);
 //           + (2 * 1 + 4 * 1/2) triangle parts = 4 triangle parts =  60 complex words per site
 //                                                                 =  84 complex words per site
-template<class Impl>
+template<class Impl, class CloverHelpers>
 class CompactWilsonCloverFermion : public WilsonFermion<Impl>,
                                   public WilsonCloverHelpers<Impl>,
                                   public CompactWilsonCloverHelpers<Impl> {
--- a/Grid/qcd/action/fermion/Fermion.h
+++ b/Grid/qcd/action/fermion/Fermion.h
@@ -138,38 +138,52 @@ typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
 typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
 // Clover fermions
-typedef WilsonCloverFermion<WilsonImplR> WilsonCloverFermionR;
+template <typename WImpl> using WilsonClover = WilsonCloverFermion<WImpl, CloverHelpers<WImpl>>;
-typedef WilsonCloverFermion<WilsonImplF> WilsonCloverFermionF;
+template <typename WImpl> using WilsonExpClover = WilsonCloverFermion<WImpl, ExpCloverHelpers<WImpl>>;
 typedef WilsonCloverFermion<WilsonImplD> WilsonCloverFermionD;
-typedef WilsonCloverFermion<WilsonAdjImplR> WilsonCloverAdjFermionR;
+typedef WilsonClover<WilsonImplR> WilsonCloverFermionR;
-typedef WilsonCloverFermion<WilsonAdjImplF> WilsonCloverAdjFermionF;
+typedef WilsonClover<WilsonImplF> WilsonCloverFermionF;
-typedef WilsonCloverFermion<WilsonAdjImplD> WilsonCloverAdjFermionD;
+typedef WilsonClover<WilsonImplD> WilsonCloverFermionD;
-typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
+typedef WilsonExpClover<WilsonImplR> WilsonExpCloverFermionR;
-typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
+typedef WilsonExpClover<WilsonImplF> WilsonExpCloverFermionF;
-typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
+typedef WilsonExpClover<WilsonImplD> WilsonExpCloverFermionD;
-typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
+typedef WilsonClover<WilsonAdjImplR> WilsonCloverAdjFermionR;
-typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
+typedef WilsonClover<WilsonAdjImplF> WilsonCloverAdjFermionF;
-typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
+typedef WilsonClover<WilsonAdjImplD> WilsonCloverAdjFermionD;
 typedef WilsonClover<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
 typedef WilsonClover<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
 typedef WilsonClover<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
 typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
 typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
 typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
 // Compact Clover fermions
-typedef CompactWilsonCloverFermion<WilsonImplR> CompactWilsonCloverFermionR;
+template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
-typedef CompactWilsonCloverFermion<WilsonImplF> CompactWilsonCloverFermionF;
+template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
 typedef CompactWilsonCloverFermion<WilsonImplD> CompactWilsonCloverFermionD;
-typedef CompactWilsonCloverFermion<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
+typedef CompactWilsonClover<WilsonImplR> CompactWilsonCloverFermionR;
-typedef CompactWilsonCloverFermion<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
+typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
-typedef CompactWilsonCloverFermion<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
+typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
-typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
+typedef CompactWilsonExpClover<WilsonImplR> CompactWilsonExpCloverFermionR;
-typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
+typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
-typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
+typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
-typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
+typedef CompactWilsonClover<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
-typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
+typedef CompactWilsonClover<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
-typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
+typedef CompactWilsonClover<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
 typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
 typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
 typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
 typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
 typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
 typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
 // Domain Wall fermions
 typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
--- a/Grid/qcd/action/fermion/FermionOperator.h
+++ b/Grid/qcd/action/fermion/FermionOperator.h
@@ -49,7 +49,7 @@ public:
  virtual FermionField &tmp(void) = 0;
-  virtual void DirichletBlock(Coordinate & _Block) { assert(0); };
+  virtual void DirichletBlock(const Coordinate & _Block) { assert(0); };
  GridBase * Grid(void)   { return FermionGrid(); };   // this is all the linalg routines need to know
  GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };
--- a/Grid/qcd/action/fermion/WilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h
@@ -32,6 +32,7 @@
 #include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
 #include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
@@ -51,7 +52,7 @@ NAMESPACE_BEGIN(Grid);
 // csw_r = csw_t to recover the isotropic version
 //////////////////////////////////////////////////////////////////
-template <class Impl>
+template<class Impl, class CloverHelpers>
 class WilsonCloverFermion : public WilsonFermion<Impl>,
                            public WilsonCloverHelpers<Impl>
 {
--- a/Grid/qcd/action/fermion/WilsonCloverHelpers.h
+++ b/Grid/qcd/action/fermion/WilsonCloverHelpers.h
@@ -209,6 +209,8 @@ public:
 };
 ////////////////////////////////////////////////////////
 template<class Impl> class CompactWilsonCloverHelpers {
 public:
--- a/Grid/qcd/action/fermion/WilsonCloverTypes.h
+++ b/Grid/qcd/action/fermion/WilsonCloverTypes.h
@@ -47,8 +47,6 @@ class CompactWilsonCloverTypes {
 public:
  INHERIT_IMPL_TYPES(Impl);
  static_assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3, "Wrong dimensions");
  static constexpr int Nred      = Nc * Nhs;        // 6
  static constexpr int Nblock    = Nhs;             // 2
  static constexpr int Ndiagonal = Nred;            // 6
--- a/Grid/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@@ -297,7 +297,7 @@ public:
  void ZeroCountersi(void)  {  }
  void Reporti(int calls)  {  }
-  std::vector<int> surface_list;
+  //  Vector<int> surface_list;
  WilsonStencil(GridBase *grid,
 		int npoints,
@@ -307,10 +307,11 @@ public:
    : CartesianStencil<vobj,cobj,Parameters> (grid,npoints,checkerboard,directions,distances,p) 
  { 
    ZeroCountersi();
-    surface_list.resize(0);
+    //    surface_list.resize(0);
    this->same_node.resize(npoints);
  };
  /*
  void BuildSurfaceList(int Ls,int vol4){
    // find same node for SHM
@@ -331,6 +332,7 @@ public:
      }
    }
  }
  */
  template < class compressor>
  void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) 
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@@ -178,16 +178,8 @@ public:
 		  GridRedBlackCartesian &FourDimRedBlackGrid,
 		  double _M5,const ImplParams &p= ImplParams());
-  virtual void DirichletBlock(Coordinate & block)
+  virtual void DirichletBlock(const Coordinate & block)
  {
    assert(block.size()==Nd+1);
    if ( block[0] || block[1] || block[2] || block[3] || block[4] ){
      Dirichlet = 1;
      Block = block;
      Stencil.DirichletBlock(block); 
      StencilEven.DirichletBlock(block); 
      StencilOdd.DirichletBlock(block);
    }
  }
  // Constructors
  /*
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -47,7 +47,7 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
 			FiveDimRedBlackGrid,
 			FourDimGrid,
 			FourDimRedBlackGrid,_M5,p),
-  mass(_mass)
+  mass_plus(_mass), mass_minus(_mass)
 { 
 }
@@ -209,8 +209,8 @@ void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
  Vector<Coeff_t> diag (Ls,1.0);
-  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass;
+  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
-  Vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass;
+  Vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass_plus;
  M5D(psi,chi,chi,lower,diag,upper);
 }
 template<class Impl>
@@ -220,8 +220,8 @@ void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &D
  Vector<Coeff_t> diag = bs;
  Vector<Coeff_t> upper= cs;
  Vector<Coeff_t> lower= cs; 
-  upper[Ls-1]=-mass*upper[Ls-1];
+  upper[Ls-1]=-mass_minus*upper[Ls-1];
-  lower[0]   =-mass*lower[0];
+  lower[0]   =-mass_plus*lower[0];
  M5D(psi,psi,Din,lower,diag,upper);
 }
 // FIXME Redunant with the above routine; check this and eliminate
@@ -235,8 +235,8 @@ template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &
    upper[i]=-ceo[i];
    lower[i]=-ceo[i];
  }
-  upper[Ls-1]=-mass*upper[Ls-1];
+  upper[Ls-1]=-mass_minus*upper[Ls-1];
-  lower[0]   =-mass*lower[0];
+  lower[0]   =-mass_plus*lower[0];
  M5D(psi,psi,chi,lower,diag,upper);
 }
 template<class Impl>
@@ -250,8 +250,8 @@ void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &
    upper[i]=-cee[i];
    lower[i]=-cee[i];
  }
-  upper[Ls-1]=-mass*upper[Ls-1];
+  upper[Ls-1]=-mass_minus*upper[Ls-1];
-  lower[0]   =-mass*lower[0];
+  lower[0]   =-mass_plus*lower[0];
  M5D(psi,psi,chi,lower,diag,upper);
 }
 template<class Impl>
@@ -266,9 +266,9 @@ void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &
    // Assemble the 5d matrix
    if ( s==0 ) {
      upper[s] = -cee[s+1] ;
-      lower[s] = mass*cee[Ls-1];
+      lower[s] = mass_minus*cee[Ls-1];
    } else if ( s==(Ls-1)) { 
-      upper[s] = mass*cee[0];
+      upper[s] = mass_plus*cee[0];
      lower[s] = -cee[s-1];
    } else {
      upper[s]=-cee[s+1];
@@ -291,8 +291,8 @@ void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
  Vector<Coeff_t> diag(Ls,1.0);
  Vector<Coeff_t> upper(Ls,-1.0);
  Vector<Coeff_t> lower(Ls,-1.0);
-  upper[Ls-1]=-mass*upper[Ls-1];
+  upper[Ls-1]=-mass_plus*upper[Ls-1];
-  lower[0]   =-mass*lower[0];
+  lower[0]   =-mass_minus*lower[0];
  M5Ddag(psi,chi,chi,lower,diag,upper);
 }
@@ -307,9 +307,9 @@ void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField
  for (int s=0;s<Ls;s++){
    if ( s== 0 ) {
      upper[s] = cs[s+1];
-      lower[s] =-mass*cs[Ls-1];
+      lower[s] =-mass_minus*cs[Ls-1];
    } else if ( s==(Ls-1) ) { 
-      upper[s] =-mass*cs[0];
+      upper[s] =-mass_plus*cs[0];
      lower[s] = cs[s-1];
    } else { 
      upper[s] = cs[s+1];
@@ -552,7 +552,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
      lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
-      leem[i]=mass*cee[Ls-1]/bee[0];
+      leem[i]=mass_minus*cee[Ls-1]/bee[0];
      for(int j=0;j<i;j++) {
 	assert(bee[j+1]!=Coeff_t(0.0));
 	leem[i]*= aee[j]/bee[j+1];
@@ -560,7 +560,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
      uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
-      ueem[i]=mass;
+      ueem[i]=mass_plus;
      for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
      ueem[i]*= aee[0]/bee[0];
@@ -573,7 +573,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
  }
  { 
-    Coeff_t delta_d=mass*cee[Ls-1];
+    Coeff_t delta_d=mass_minus*cee[Ls-1];
    for(int j=0;j<Ls-1;j++) {
      assert(bee[j] != Coeff_t(0.0));
      delta_d *= cee[j]/bee[j];
@@ -642,6 +642,10 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
 						      Current curr_type,
 						      unsigned int mu)
 {
  assert(mass_plus == mass_minus);
  RealD mass = mass_plus;
 #if (!defined(GRID_HIP))
  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
@@ -777,6 +781,8 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  assert(mu>=0);
  assert(mu<Nd);
  assert(mass_plus == mass_minus);
  RealD mass = mass_plus;
 #if 0
  int tshift = (mu == Nd-1) ? 1 : 0;
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
@@ -66,18 +66,17 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
  M5Dcalls++;
  M5Dtime-=usecond();
-  uint64_t nloop = grid->oSites()/Ls;
+  uint64_t nloop = grid->oSites();
  accelerator_for(sss,nloop,Simd::Nsimd(),{
-    uint64_t ss= sss*Ls;
+    uint64_t s = sss%Ls;
    uint64_t ss= sss-s;
    typedef decltype(coalescedRead(psi[0])) spinor;
    spinor tmp1, tmp2;
    for(int s=0;s<Ls;s++){
    uint64_t idx_u = ss+((s+1)%Ls);
    uint64_t idx_l = ss+((s+Ls-1)%Ls);
    spProj5m(tmp1,psi(idx_u));
    spProj5p(tmp2,psi(idx_l));
    coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
    }
  });
  M5Dtime+=usecond();
 }
@@ -108,18 +107,17 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
  M5Dcalls++;
  M5Dtime-=usecond();
-  uint64_t nloop = grid->oSites()/Ls;
+  uint64_t nloop = grid->oSites();
  accelerator_for(sss,nloop,Simd::Nsimd(),{
-    uint64_t ss=sss*Ls;
+    uint64_t s = sss%Ls;
    uint64_t ss= sss-s;
    typedef decltype(coalescedRead(psi[0])) spinor;
    spinor tmp1,tmp2;
    for(int s=0;s<Ls;s++){
    uint64_t idx_u = ss+((s+1)%Ls);
    uint64_t idx_l = ss+((s+Ls-1)%Ls);
    spProj5p(tmp1,psi(idx_u));
    spProj5m(tmp2,psi(idx_l));
    coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
    }
  });
  M5Dtime+=usecond();
 }
--- a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
@@ -32,9 +32,10 @@
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
 NAMESPACE_BEGIN(Grid);
-template<class Impl>
+template<class Impl, class CloverHelpers>
-CompactWilsonCloverFermion<Impl>::CompactWilsonCloverFermion(GaugeField& _Umu,
+CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(GaugeField& _Umu,
                                                                            GridCartesian& Fgrid,
                                                                            GridRedBlackCartesian& Hgrid,
                                                                            const RealD _mass,
@@ -58,50 +59,55 @@ CompactWilsonCloverFermion<Impl>::CompactWilsonCloverFermion(GaugeField& _Umu,
  , BoundaryMask(&Fgrid)
  , BoundaryMaskEven(&Hgrid), BoundaryMaskOdd(&Hgrid)
 {
  assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);
  csw_r *= 0.5;
  csw_t *= 0.5;
  if (clover_anisotropy.isAnisotropic)
    csw_r /= clover_anisotropy.xi_0;
  ImportGauge(_Umu);
-  if (open_boundaries)
+  if (open_boundaries) {
    this->BoundaryMaskEven.Checkerboard() = Even;
    this->BoundaryMaskOdd.Checkerboard() = Odd;
    CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
  }
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::Dhop(const FermionField& in, FermionField& out, int dag) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::Dhop(in, out, dag);
  if(open_boundaries) ApplyBoundaryMask(out);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::DhopOE(const FermionField& in, FermionField& out, int dag) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::DhopOE(in, out, dag);
  if(open_boundaries) ApplyBoundaryMask(out);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::DhopEO(const FermionField& in, FermionField& out, int dag) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::DhopEO(in, out, dag);
  if(open_boundaries) ApplyBoundaryMask(out);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
  WilsonBase::DhopDir(in, out, dir, disp);
  if(this->open_boundaries) ApplyBoundaryMask(out);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
  WilsonBase::DhopDirAll(in, out);
  if(this->open_boundaries) {
    for(auto& o : out) ApplyBoundaryMask(o);
  }
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::M(const FermionField& in, FermionField& out) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField& in, FermionField& out) {
  out.Checkerboard() = in.Checkerboard();
  WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
  Mooee(in, Tmp);
@@ -109,8 +115,8 @@ void CompactWilsonCloverFermion<Impl>::M(const FermionField& in, FermionField& o
  if(open_boundaries) ApplyBoundaryMask(out);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::Mdag(const FermionField& in, FermionField& out) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField& in, FermionField& out) {
  out.Checkerboard() = in.Checkerboard();
  WilsonBase::Dhop(in, out, DaggerYes);  // call base to save applying bc
  MooeeDag(in, Tmp);
@@ -118,20 +124,20 @@ void CompactWilsonCloverFermion<Impl>::Mdag(const FermionField& in, FermionField
  if(open_boundaries) ApplyBoundaryMask(out);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::Meooe(const FermionField& in, FermionField& out) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
  WilsonBase::Meooe(in, out);
  if(open_boundaries) ApplyBoundaryMask(out);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::MeooeDag(const FermionField& in, FermionField& out) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
  WilsonBase::MeooeDag(in, out);
  if(open_boundaries) ApplyBoundaryMask(out);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::Mooee(const FermionField& in, FermionField& out) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField& in, FermionField& out) {
  if(in.Grid()->_isCheckerBoarded) {
    if(in.Checkerboard() == Odd) {
      MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
@@ -144,13 +150,13 @@ void CompactWilsonCloverFermion<Impl>::Mooee(const FermionField& in, FermionFiel
  if(open_boundaries) ApplyBoundaryMask(out);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::MooeeDag(const FermionField& in, FermionField& out) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeDag(const FermionField& in, FermionField& out) {
  Mooee(in, out); // blocks are hermitian
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::MooeeInv(const FermionField& in, FermionField& out) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionField& in, FermionField& out) {
  if(in.Grid()->_isCheckerBoarded) {
    if(in.Checkerboard() == Odd) {
      MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
@@ -163,23 +169,23 @@ void CompactWilsonCloverFermion<Impl>::MooeeInv(const FermionField& in, FermionF
  if(open_boundaries) ApplyBoundaryMask(out);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::MooeeInvDag(const FermionField& in, FermionField& out) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInvDag(const FermionField& in, FermionField& out) {
  MooeeInv(in, out); // blocks are hermitian
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
  DhopDir(in, out, dir, disp);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
  DhopDirAll(in, out);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
  assert(!open_boundaries); // TODO check for changes required for open bc
  // NOTE: code copied from original clover term
@@ -251,7 +257,7 @@ void CompactWilsonCloverFermion<Impl>::MDeriv(GaugeField& force, const FermionFi
      }
      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
-      force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu);   // checked
+      force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu);   // checked
      count++;
    }
@@ -261,18 +267,18 @@ void CompactWilsonCloverFermion<Impl>::MDeriv(GaugeField& force, const FermionFi
  force += clover_force;
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
  assert(0);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
  assert(0);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::MooeeInternal(const FermionField&        in,
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField&        in,
                    FermionField&              out,
                    const CloverDiagonalField& diagonal,
                    const CloverTriangleField& triangle) {
@@ -285,8 +291,8 @@ void CompactWilsonCloverFermion<Impl>::MooeeInternal(const FermionField&
  CompactHelpers::MooeeKernel(diagonal.oSites(), 1, in, out, diagonal, triangle);
 }
-template<class Impl>
+template<class Impl, class CloverHelpers>
-void CompactWilsonCloverFermion<Impl>::ImportGauge(const GaugeField& _Umu) {
+void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField& _Umu) {
  // NOTE: parts copied from original implementation
  // Import gauge into base class
@@ -318,22 +324,27 @@ void CompactWilsonCloverFermion<Impl>::ImportGauge(const GaugeField& _Umu) {
  TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
  TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
  TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
-  TmpOriginal += this->diag_mass;
+  // Handle mass term based on clover policy
  CloverHelpers::MassTerm(TmpOriginal, this->diag_mass);
  // Convert the data layout of the clover term
  double t4 = usecond();
  CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
-  // Possible modify the boundary values
+  // Exponentiate the clover (nothing happens in case of the standard clover)
  double t5 = usecond();
  CloverHelpers::Exponentiate_Clover(Diagonal, Triangle, csw_t, this->diag_mass);
  // Possible modify the boundary values
  double t6 = usecond();
  if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
-  // Invert the clover term in the improved layout
+  // Invert the Clover term (explicit inversion needed for the improvement in case of open boundary conditions)
-  double t6 = usecond();
+  double t7 = usecond();
  CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv);
  // Fill the remaining clover fields
-  double t7 = usecond();
+  double t8 = usecond();
  pickCheckerboard(Even, DiagonalEven,    Diagonal);
  pickCheckerboard(Even, TriangleEven,    Triangle);
  pickCheckerboard(Odd,  DiagonalOdd,     Diagonal);
@@ -344,20 +355,19 @@ void CompactWilsonCloverFermion<Impl>::ImportGauge(const GaugeField& _Umu) {
  pickCheckerboard(Odd,  TriangleInvOdd,  TriangleInv);
  // Report timings
-  double t8 = usecond();
+  double t9 = usecond();
-#if 0
+
-  std::cout << GridLogMessage << "CompactWilsonCloverFermion::ImportGauge timings:"
+  std::cout << GridLogDebug << "CompactWilsonCloverFermion::ImportGauge timings:" << std::endl;
-            << " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6
+  std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
-            << ", allocations = "               << (t2 - t1) / 1e6
+  std::cout << GridLogDebug << "allocations =                " << (t2 - t1) / 1e6 << std::endl;
-            << ", field strength = "            << (t3 - t2) / 1e6
+  std::cout << GridLogDebug << "field strength =             " << (t3 - t2) / 1e6 << std::endl;
-            << ", fill clover = "               << (t4 - t3) / 1e6
+  std::cout << GridLogDebug << "fill clover =                " << (t4 - t3) / 1e6 << std::endl;
-            << ", convert = "                   << (t5 - t4) / 1e6
+  std::cout << GridLogDebug << "convert =                    " << (t5 - t4) / 1e6 << std::endl;
-            << ", boundaries = "                << (t6 - t5) / 1e6
+  std::cout << GridLogDebug << "exponentiation =             " << (t6 - t5) / 1e6 << std::endl;
-            << ", inversions = "                << (t7 - t6) / 1e6
+  std::cout << GridLogDebug << "boundaries =                 " << (t7 - t6) / 1e6 << std::endl;
-            << ", pick cbs = "                  << (t8 - t7) / 1e6
+  std::cout << GridLogDebug << "inversions =                 " << (t8 - t7) / 1e6 << std::endl;
-            << ", total = "                     << (t8 - t0) / 1e6
+  std::cout << GridLogDebug << "pick cbs =                   " << (t9 - t8) / 1e6 << std::endl;
-            << std::endl;
+  std::cout << GridLogDebug << "total =                      " << (t9 - t0) / 1e6 << std::endl;
 #endif
 }
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
@@ -34,8 +34,8 @@
 NAMESPACE_BEGIN(Grid);
-template<class Impl>
+template<class Impl, class CloverHelpers>
-WilsonCloverFermion<Impl>::WilsonCloverFermion(GaugeField&                         _Umu,
+WilsonCloverFermion<Impl, CloverHelpers>::WilsonCloverFermion(GaugeField&                         _Umu,
                                               GridCartesian&                      Fgrid,
                                               GridRedBlackCartesian&              Hgrid,
                                               const RealD                         _mass,
@@ -74,8 +74,8 @@ WilsonCloverFermion<Impl>::WilsonCloverFermion(GaugeField&
 }
 // *NOT* EO
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField &in, FermionField &out)
 {
  FermionField temp(out.Grid());
@@ -89,8 +89,8 @@ void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
  out += temp;
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField &in, FermionField &out)
 {
  FermionField temp(out.Grid());
@@ -104,8 +104,8 @@ void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
  out += temp;
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
+void WilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField &_Umu)
 {
  double t0 = usecond();
  WilsonFermion<Impl>::ImportGauge(_Umu);
@@ -131,47 +131,11 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
  CloverTerm += Helpers::fillCloverXT(Ex) * csw_t;
  CloverTerm += Helpers::fillCloverYT(Ey) * csw_t;
  CloverTerm += Helpers::fillCloverZT(Ez) * csw_t;
  CloverTerm += diag_mass;
  double t4 = usecond();
-  int lvol = _Umu.Grid()->lSites();
+  CloverHelpers::Instantiate(CloverTerm, CloverTermInv, csw_t, this->diag_mass);
  int DimRep = Impl::Dimension;
  double t5 = usecond();
  {
    autoView(CTv,CloverTerm,CpuRead);
    autoView(CTIv,CloverTermInv,CpuWrite);
    thread_for(site, lvol, {
      Coordinate lcoor;
      grid->LocalIndexToLocalCoor(site, lcoor);
      Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
      Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
      typename SiteClover::scalar_object Qx = Zero(), Qxinv = Zero();
      peekLocalSite(Qx, CTv, lcoor);
      //if (csw!=0){
      for (int j = 0; j < Ns; j++)
 	for (int k = 0; k < Ns; k++)
 	  for (int a = 0; a < DimRep; a++)
 	    for (int b = 0; b < DimRep; b++){
 	      auto zz =  Qx()(j, k)(a, b);
 	      EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
 	    }
      //   if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
      EigenInvCloverOp = EigenCloverOp.inverse();
      //std::cout << EigenInvCloverOp << std::endl;
      for (int j = 0; j < Ns; j++)
 	for (int k = 0; k < Ns; k++)
 	  for (int a = 0; a < DimRep; a++)
 	    for (int b = 0; b < DimRep; b++)
 	      Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
      //    if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
      //  }
      pokeLocalSite(Qxinv, CTIv, lcoor);
    });
  }
  double t6 = usecond();
  // Separate the even and odd parts
  pickCheckerboard(Even, CloverTermEven, CloverTerm);
  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
@@ -184,48 +148,44 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
-  double t7 = usecond();
+  double t6 = usecond();
-#if 0
+  std::cout << GridLogDebug << "WilsonCloverFermion::ImportGauge timings:" << std::endl;
-  std::cout << GridLogMessage << "WilsonCloverFermion::ImportGauge timings:"
+  std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
-            << " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6
+  std::cout << GridLogDebug << "allocations =                " << (t2 - t1) / 1e6 << std::endl;
-            << ", allocations = "               << (t2 - t1) / 1e6
+  std::cout << GridLogDebug << "field strength =             " << (t3 - t2) / 1e6 << std::endl;
-            << ", field strength = "            << (t3 - t2) / 1e6
+  std::cout << GridLogDebug << "fill clover =                " << (t4 - t3) / 1e6 << std::endl;
-            << ", fill clover = "               << (t4 - t3) / 1e6
+  std::cout << GridLogDebug << "instantiation =              " << (t5 - t4) / 1e6 << std::endl;
-            << ", misc = "                      << (t5 - t4) / 1e6
+  std::cout << GridLogDebug << "pick cbs =                   " << (t6 - t5) / 1e6 << std::endl;
-            << ", inversions = "                << (t6 - t5) / 1e6
+  std::cout << GridLogDebug << "total =                      " << (t6 - t0) / 1e6 << std::endl;
            << ", pick cbs = "                  << (t7 - t6) / 1e6
            << ", total = "                     << (t7 - t0) / 1e6
            << std::endl;
 #endif
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerNo, InverseNo);
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl, CloverHelpers>::MooeeDag(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerYes, InverseNo);
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerNo, InverseYes);
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInvDag(const FermionField &in, FermionField &out)
 {
  this->MooeeInternal(in, out, DaggerYes, InverseYes);
 }
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
+void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
 {
  out.Checkerboard() = in.Checkerboard();
  CloverField *Clover;
@@ -278,8 +238,8 @@ void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionFie
 } // MooeeInternal
 // Derivative parts unpreconditioned pseudofermions
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
+void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
 {
  conformable(X.Grid(), Y.Grid());
  conformable(X.Grid(), force.Grid());
@@ -349,7 +309,7 @@ void WilsonCloverFermion<Impl>::MDeriv(GaugeField &force, const FermionField &X,
      }
      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
-      force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu);                   // checked
+      force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu);                   // checked
      count++;
    }
@@ -360,15 +320,15 @@ void WilsonCloverFermion<Impl>::MDeriv(GaugeField &force, const FermionField &X,
 }
 // Derivative parts
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
+void WilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
 {
  assert(0);
 }
 // Derivative parts
-template <class Impl>
+template<class Impl, class CloverHelpers>
-void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
+void WilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
 {
  assert(0); // not implemented yet
 }
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@@ -92,6 +92,19 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
    assert(FourDimRedBlackGrid._simd_layout[d]  ==FourDimGrid._simd_layout[d]);
  }
  if ( p.dirichlet.size() == Nd+1) {
    Coordinate block = p.dirichlet;
    if ( block[0] || block[1] || block[2] || block[3] || block[4] ){
      Dirichlet = 1;
      Block = block;
    }
  } else {
    Coordinate block(Nd+1,0);
    Block = block;
  }
  ZeroCounters();
  if (Impl::LsVectorised) { 
    int nsimd = Simd::Nsimd();
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@@ -4,12 +4,13 @@ Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonFermion.cc
-Copyright (C) 2015
+Copyright (C) 2022
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Fabian Joswig <fabian.joswig@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -599,11 +600,47 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
                                                   Current curr_type,
                                                   unsigned int mu)
 {
  if(curr_type != Current::Vector)
  {
    std::cout << GridLogError << "Only the conserved vector current is implemented so far." << std::endl;
    exit(1);
  }
  Gamma g5(Gamma::Algebra::Gamma5);
  conformable(_grid, q_in_1.Grid());
  conformable(_grid, q_in_2.Grid());
  conformable(_grid, q_out.Grid());
-  assert(0);
+  auto UGrid= this->GaugeGrid();
  PropagatorField tmp_shifted(UGrid);
  PropagatorField g5Lg5(UGrid);
  PropagatorField R(UGrid);
  PropagatorField gmuR(UGrid);
    Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT,
  };
  Gamma gmu=Gamma(Gmu[mu]);
  g5Lg5=g5*q_in_1*g5;
  tmp_shifted=Cshift(q_in_2,mu,1);
  Impl::multLinkField(R,this->Umu,tmp_shifted,mu);
  gmuR=gmu*R;
  q_out=adj(g5Lg5)*R;
  q_out-=adj(g5Lg5)*gmuR;
  tmp_shifted=Cshift(q_in_1,mu,1);
  Impl::multLinkField(g5Lg5,this->Umu,tmp_shifted,mu);
  g5Lg5=g5*g5Lg5*g5;
  R=q_in_2;
  gmuR=gmu*R;
  q_out-=adj(g5Lg5)*R;
  q_out-=adj(g5Lg5)*gmuR;
 }
@@ -617,9 +654,51 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
                                              unsigned int tmax,
 					      ComplexField &lattice_cmplx)
 {
  if(curr_type != Current::Vector)
  {
    std::cout << GridLogError << "Only the conserved vector current is implemented so far." << std::endl;
    exit(1);
  }
  int tshift = (mu == Nd-1) ? 1 : 0;
  unsigned int LLt    = GridDefaultLatt()[Tp];
  conformable(_grid, q_in.Grid());
  conformable(_grid, q_out.Grid());
-  assert(0);
+  auto UGrid= this->GaugeGrid();
  PropagatorField tmp(UGrid);
  PropagatorField Utmp(UGrid);
  PropagatorField L(UGrid);
  PropagatorField zz (UGrid);
  zz=Zero();
  LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
    Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT,
  };
  Gamma gmu=Gamma(Gmu[mu]);
  tmp = Cshift(q_in,mu,1);
  Impl::multLinkField(Utmp,this->Umu,tmp,mu);
  tmp = ( Utmp*lattice_cmplx - gmu*Utmp*lattice_cmplx ); // Forward hop
  tmp = where((lcoor>=tmin),tmp,zz); // Mask the time
  q_out = where((lcoor<=tmax),tmp,zz); // Position of current complicated
  tmp = q_in *lattice_cmplx;
  tmp = Cshift(tmp,mu,-1);
  Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
  tmp = -( Utmp + gmu*Utmp );
  // Mask the time
  if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
    unsigned int t0 = 0;
    tmp = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
  } else {
    tmp = where((lcoor>=tmin+tshift),tmp,zz);
  }
  q_out+= where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
 }
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -440,6 +440,17 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
 #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
 #define KERNEL_CALL_EXT(A)						\
  const uint64_t    NN = Nsite*Ls;					\
  const uint64_t    sz = st.surface_list.size();			\
  auto ptr = &st.surface_list[0];					\
  accelerator_forNB( ss, sz, Simd::Nsimd(), {				\
      int sF = ptr[ss];							\
      int sU = ss/Ls;							\
      WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\
    });									\
  accelerator_barrier();
 #define ASM_CALL(A)							\
  thread_for( ss, Nsite, {						\
    int sU = ss;							\
--- a/Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master
+++ b/Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master
@@ -9,6 +9,7 @@
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>
    Author: Mattia Bruno <mattia.bruno@cern.ch>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -32,10 +33,12 @@
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
 #include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
-template class CompactWilsonCloverFermion<IMPLEMENTATION>; 
+template class CompactWilsonCloverFermion<IMPLEMENTATION, CompactCloverHelpers<IMPLEMENTATION>>; 
 template class CompactWilsonCloverFermion<IMPLEMENTATION, CompactExpCloverHelpers<IMPLEMENTATION>>; 
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
@@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
@@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
@@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
@@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonCloverFermionInstantiation.cc.master
+++ b/Grid/qcd/action/fermion/instantiation/WilsonCloverFermionInstantiation.cc.master
@@ -8,6 +8,7 @@
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Mattia Bruno <mattia.bruno@cern.ch>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -31,10 +32,12 @@
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/WilsonCloverFermion.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h>
 #include <Grid/qcd/action/fermion/CloverHelpers.h>
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
-template class WilsonCloverFermion<IMPLEMENTATION>; 
+template class WilsonCloverFermion<IMPLEMENTATION, CloverHelpers<IMPLEMENTATION>>; 
 template class WilsonCloverFermion<IMPLEMENTATION, ExpCloverHelpers<IMPLEMENTATION>>; 
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
@@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
@@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
@@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
@@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
@@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
@@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
@@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
@@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
@@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
@@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
@@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
@@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
@@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
@@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
@@ -1,51 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015, 2020
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #ifndef AVX512
 #ifndef QPX
 #ifndef A64FX
 #ifndef A64FXFIXEDSIZE
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #endif
 #endif
 #endif
 #endif
 NAMESPACE_BEGIN(Grid);
 #include "impl.h"
 template class WilsonKernels<IMPLEMENTATION>;
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
@@ -0,0 +1 @@
 ../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh
+++ b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh
@@ -18,6 +18,10 @@ WILSON_IMPL_LIST=" \
 	   GparityWilsonImplF \
 	   GparityWilsonImplD "
 COMPACT_WILSON_IMPL_LIST=" \
 	   WilsonImplF \
 	   WilsonImplD "
 DWF_IMPL_LIST=" \
 	   WilsonImplF \
 	   WilsonImplD \
@@ -40,7 +44,7 @@ EOF
 done
-CC_LIST="WilsonCloverFermionInstantiation CompactWilsonCloverFermionInstantiation WilsonFermionInstantiation WilsonKernelsInstantiation WilsonTMFermionInstantiation"
+CC_LIST="WilsonCloverFermionInstantiation WilsonFermionInstantiation WilsonKernelsInstantiation WilsonTMFermionInstantiation"
 for impl in $WILSON_IMPL_LIST
 do
@@ -50,6 +54,16 @@ do
 done
 done
 CC_LIST="CompactWilsonCloverFermionInstantiation"
 for impl in $COMPACT_WILSON_IMPL_LIST
 do
 for f in $CC_LIST
 do
  ln -f -s ../$f.cc.master $impl/$f$impl.cc
 done
 done
 CC_LIST=" \
  CayleyFermion5DInstantiation \
  ContinuedFractionFermion5DInstantiation \
--- a/Grid/qcd/action/filters/DirichletFilter.h
+++ b/Grid/qcd/action/filters/DirichletFilter.h
@@ -53,9 +53,9 @@ struct DirichletFilter: public MomentumFilterBase<MomentaField>
    LatticeInteger coor(grid); 
    LatCM zz(grid); zz = Zero();
    for(int mu=0;mu<Nd;mu++) {
-      if ( (Block[mu]) && (Block[mu] < grid->GlobalDimensions()[mu] ) ) {
+      if ( (Block[mu]) && (Block[mu] <= grid->GlobalDimensions()[mu] ) ) {
 	// If costly could provide Grid earlier and precompute masks
-	std::cout << " Dirichlet in mu="<<mu<<std::endl;
+	std::cout << GridLogMessage << " Dirichlet in mu="<<mu<<std::endl;
 	LatticeCoordinate(coor,mu);
 	auto P_mu = PeekIndex<LorentzIndex>(P, mu);
 	P_mu = where(mod(coor,Block[mu])==Integer(Block[mu]-1),zz,P_mu);
--- a/Grid/qcd/hmc/integrators/Integrator.h
+++ b/Grid/qcd/hmc/integrators/Integrator.h
@@ -145,7 +145,7 @@ protected:
      MomFilter->applyFilter(force);
      std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<< std::endl;
-      //      DumpSliceNorm("force ",force,Nd-1);
+      DumpSliceNorm("force ",force,Nd-1);
      Real force_abs   = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x]) 
      Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;    
--- a/Grid/qcd/utils/GaugeFix.h
+++ b/Grid/qcd/utils/GaugeFix.h
@@ -72,14 +72,13 @@ public:
  //Fix the gauge field Umu
  //0 < alpha < 1 is related to the step size, cf https://arxiv.org/pdf/1405.5812.pdf
-  static void SteepestDescentGaugeFix(GaugeLorentz &Umu, Real alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {
+  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1,bool err_on_no_converge=true) {
    GridBase *grid = Umu.Grid();
    GaugeMat xform(grid);
-    SteepestDescentGaugeFix(Umu,xform,alpha,maxiter,Omega_tol,Phi_tol,Fourier,orthog);
+    SteepestDescentGaugeFix(Umu,xform,alpha,maxiter,Omega_tol,Phi_tol,Fourier,orthog,err_on_no_converge);
  }
-
+  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1,bool err_on_no_converge=true) {
  //Fix the gauge field Umu and also return the gauge transformation from the original gauge field, xform
  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform, Real alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {
    GridBase *grid = Umu.Grid();
@@ -141,6 +140,8 @@ public:
      }
    }
    std::cout << GridLogError << "Gauge fixing did not converge in " << maxiter << " iterations." << std::endl;
    if (err_on_no_converge)
      assert(0 && "Gauge fixing did not converge within the specified number of iterations");
  };
  static Real SteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform, Real alpha, GaugeMat & dmuAmu,int orthog) {
--- a/Grid/qcd/utils/WilsonLoops.h
+++ b/Grid/qcd/utils/WilsonLoops.h
@@ -215,7 +215,7 @@ public:
    double vol = Umu.Grid()->gSites();
-    return p.real() / vol / 4.0 / 3.0;
+    return p.real() / vol / (4.0 * Nc ) ;
  };
  //////////////////////////////////////////////////
--- a/Grid/stencil/SimpleCompressor.h
+++ b/Grid/stencil/SimpleCompressor.h
@@ -52,6 +52,11 @@ public:
    return arg;
  }
 };
 class SimpleStencilParams{
 public:
  Coordinate dirichlet;
  SimpleStencilParams() {};
 };
 NAMESPACE_END(Grid);
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -131,7 +131,6 @@ class CartesianStencilAccelerator {
  int           _checkerboard;
  int           _npoints; // Move to template param?
  int           _osites;
  int           _dirichlet;
  StencilVector _directions;
  StencilVector _distances;
  StencilVector _comms_send;
@@ -503,7 +502,6 @@ public:
  }
  void AddCopy(void *from,void * to, Integer bytes)
  {
    //    std::cout << "Adding CopyReceiveBuffer "<<std::hex<<from<<" "<<to<<std::dec<<" "<<bytes<<std::endl;
    CopyReceiveBuffer obj;
    obj.from_p = from;
    obj.to_p = to;
@@ -517,7 +515,7 @@ public:
      cobj *from=(cobj *)CopyReceiveBuffers[i].from_p;
      cobj *to  =(cobj *)CopyReceiveBuffers[i].to_p;
      Integer words = CopyReceiveBuffers[i].bytes/sizeof(cobj);
-      //    std::cout << "CopyReceiveBuffer "<<std::hex<<from<<" "<<to<<std::dec<<" "<<words*sizeof(cobj)<<std::endl;
+
      accelerator_forNB(j, words, cobj::Nsimd(), {
 	  coalescedWrite(to[j] ,coalescedRead(from [j]));
      });
@@ -543,13 +541,12 @@ public:
 	   &&(CachedTransfers[i].lane       ==lane)
 	   &&(CachedTransfers[i].cb         ==cb)
 	     ){
-	//	std::cout << "Found duplicate plane dir "<<direction<<" plane "<< OrthogPlane<< " simd "<<lane << " relproc "<<DestProc<< " bytes "<<bytes <<std::endl;
+
 	AddCopy(CachedTransfers[i].recv_buf,recv_buf,bytes);
 	return 1;
      }
    }
    //    std::cout << "No duplicate plane dir "<<direction<<" plane "<< OrthogPlane<< " simd "<<lane << " relproc "<<DestProc<<"  bytes "<<bytes<<std::endl;
    CachedTransfers.push_back(obj);
    return 0;
  }
@@ -643,23 +640,23 @@ public:
 	}
      }
      if(local == 0) {
-	surface_list.push_back(site);
+	for(int s=0;s<Ls;s++){
 	  surface_list.push_back(site*Ls+s);
 	}
      }
    }
  }
  /// Introduce a block structure and switch off comms on boundaries
  void DirichletBlock(const Coordinate &dirichlet_block)
  {
    this->_dirichlet = 1;
    for(int ii=0;ii<this->_npoints;ii++){
      int dimension    = this->_directions[ii];
      int displacement = this->_distances[ii];
      int shift = displacement;
      int gd = _grid->_gdimensions[dimension];
      int fd = _grid->_fdimensions[dimension];
      int pd = _grid->_processors [dimension];
      int ld = gd/pd;
      int pc = _grid->_processor_coor[dimension];
      int ld = fd/pd;
      ///////////////////////////////////////////
      // Figure out dirichlet send and receive
      // on this leg of stencil.
@@ -668,25 +665,25 @@ public:
      int block = dirichlet_block[dimension];
      this->_comms_send[ii] = comm_dim;
      this->_comms_recv[ii] = comm_dim;
-      if ( block ) {
+      if ( block && comm_dim ) {
 	assert(abs(displacement) < ld );
-      
+	// Quiesce communication across block boundaries
 	if( displacement > 0 ) {
 	  // High side, low side
 	  // | <--B--->|
 	  // |    |    |
 	  //           noR
 	  // noS
-	  if ( (ld*(pc+1) ) % block == 0 ) this->_comms_recv[ii] = 0;
+	  if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_recv[ii] = 0;
-	  if ( ( ld*pc ) % block == 0    ) this->_comms_send[ii] = 0;
+	  if ( ( (ld*pc     ) % block ) == 0 ) this->_comms_send[ii] = 0;
 	} else {
 	  // High side, low side
 	  // | <--B--->|
 	  // |    |    |
 	  //           noS
 	  // noR
-	  if ( (ld*(pc+1) ) % block == 0 ) this->_comms_send[ii] = 0;
+	  if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
-	  if ( ( ld*pc ) % block    == 0 ) this->_comms_recv[ii] = 0;
+	  if ( ( (ld*pc     ) % block ) == 0 ) this->_comms_recv[ii] = 0;
 	}
      }
    }
@@ -698,7 +695,6 @@ public:
 		   const std::vector<int> &distances,
 		   Parameters p)
  {
    this->_dirichlet = 0;
    face_table_computed=0;
    _grid    = grid;
    this->parameters=p;
@@ -715,6 +711,8 @@ public:
    this->_comms_recv.resize(npoints); 
    this->same_node.resize(npoints);
    if ( p.dirichlet.size() ) DirichletBlock(p.dirichlet); // comms send/recv set up
    _unified_buffer_size=0;
    surface_list.resize(0);
@@ -734,7 +732,7 @@ public:
      int gd = _grid->_gdimensions[dimension];
      int fd = _grid->_fdimensions[dimension];
      int pd = _grid->_processors [dimension];
-      int ld = gd/pd;
+      //      int ld = gd/pd;
      int rd = _grid->_rdimensions[dimension];
      int pc = _grid->_processor_coor[dimension];
      this->_permute_type[point]=_grid->PermuteType(dimension);
@@ -746,9 +744,6 @@ public:
      int splice_dim      = _grid->_simd_layout[dimension]>1 && (comm_dim);
      int rotate_dim      = _grid->_simd_layout[dimension]>2;
      this->_comms_send[ii] = comm_dim;
      this->_comms_recv[ii] = comm_dim;
      assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported
      int sshift[2];
@@ -878,12 +873,14 @@ public:
    for(int x=0;x<rd;x++){
      int permute_type=grid->PermuteType(dimension);
      int permute_slice;
      int sx        =  (x+sshift)%rd;
      int offnode = 0;
      if ( simd_layout > 1 ) {
 	permute_slice=1;
 	for(int i=0;i<Nsimd;i++){
 	  int inner_bit = (Nsimd>>(permute_type+1));
@@ -900,6 +897,7 @@ public:
      } else {
 	int comm_proc = ((x+sshift)/rd)%pd;
 	offnode = (comm_proc!= 0);
 	permute_slice=0;
      }
      int wraparound=0;
@@ -911,25 +909,29 @@ public:
      }
      // Wrap locally dirichlet support case OR node local
-      if ( (offnode==0) || (comms_recv==0)  ) {
+      if ( offnode==0 ) {
-	int permute_slice=0;
+	permute_slice=0;
 	CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
      } else {
 	if ( comms_recv ) {
 	  ScatterPlane(point,dimension,x,cbmask,_unified_buffer_size,wraparound); // permute/extract/merge is done in comms phase
 	} else { 
 	  CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
 	}
      }
      if ( offnode ) {
 	int words = buffer_size;
 	if (cbmask != 0x3) words=words>>1;
 	//	int rank           = grid->_processor;
 	//	int recv_from_rank;
 	//	int xmit_to_rank;
 	int unified_buffer_offset = _unified_buffer_size;
 	_unified_buffer_size    += words;
 	ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
      }
    }
  }
@@ -1061,71 +1063,75 @@ public:
      if (comm_proc) {
 	int words = buffer_size;
 	if (cbmask != 0x3) words=words>>1;
 	int bytes =  words * compress.CommDatumSize();
 	int so  = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
 	int comm_off = u_comm_offset;
 	int recv_from_rank;
 	int xmit_to_rank;
 	cobj *recv_buf;
 	cobj *send_buf;
 	_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
 	assert (xmit_to_rank   != _grid->ThisRank());
 	assert (recv_from_rank != _grid->ThisRank());
 	if( comms_send ) {
 	  if ( !face_table_computed ) {
 	    face_table.resize(face_idx+1);
 	    std::vector<std::pair<int,int> >  face_table_host ;
-	  Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table_host);
+	    Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host);
 	    face_table[face_idx].resize(face_table_host.size());
 	    acceleratorCopyToDevice(&face_table_host[0],
 				    &face_table[face_idx][0],
 				    face_table[face_idx].size()*sizeof(face_table_host[0]));
 	  }
 	//      	int rank           = _grid->_processor;
 	int recv_from_rank;
 	int xmit_to_rank;
 	_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
 	assert (xmit_to_rank   != _grid->ThisRank());
 	assert (recv_from_rank != _grid->ThisRank());
 	cobj *recv_buf;
 	  if ( compress.DecompressionStep() ) {
 	    recv_buf=u_simd_recv_buf[0];
 	  } else {
 	    recv_buf=this->u_recv_buf_p;
 	  }
 	cobj *send_buf;
 	  send_buf = this->u_send_buf_p; // Gather locally, must send
 	  ////////////////////////////////////////////////////////
 	  // Gather locally
 	  ////////////////////////////////////////////////////////
 	  assert(send_buf!=NULL);
 	if ( comms_send ) 
 	  Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so);
 	face_idx++;
-	int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[u_comm_offset],0,bytes,cbmask);
+	  Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,comm_off,so);
 	}
 	int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,bytes,cbmask);
 	if ( (!duplicate) ) { // Force comms for now
 	  ///////////////////////////////////////////////////////////
 	  // Build a list of things to do after we synchronise GPUs
 	  // Start comms now???
 	  ///////////////////////////////////////////////////////////
-	  AddPacket((void *)&send_buf[u_comm_offset],
+	  AddPacket((void *)&send_buf[comm_off],
-		    (void *)&recv_buf[u_comm_offset],
+		    (void *)&recv_buf[comm_off],
 		    xmit_to_rank, comms_send,
 		    recv_from_rank, comms_recv,
 		    bytes);
 	}
-	if ( compress.DecompressionStep()  ) {
+	if ( compress.DecompressionStep()  && comms_recv ) {
-	  AddDecompress(&this->u_recv_buf_p[u_comm_offset],
+	  AddDecompress(&this->u_recv_buf_p[comm_off],
-			&recv_buf[u_comm_offset],
+			&recv_buf[comm_off],
 			words,Decompressions);
 	}
 	u_comm_offset+=words;
 	face_idx++;
      }
    }
    return 0;
@@ -1155,7 +1161,6 @@ public:
    int permute_type=_grid->PermuteType(dimension);
    //    std::cout << "SimdNew permute type "<<permute_type<<std::endl;
    ///////////////////////////////////////////////
    // Simd direction uses an extract/merge pair
@@ -1189,8 +1194,9 @@ public:
      if ( any_offnode ) {
 	int comm_off = u_comm_offset;
 	for(int i=0;i<maxl;i++){
-	  spointers[i] = (cobj *) &u_simd_send_buf[i][u_comm_offset];
+	  spointers[i] = (cobj *) &u_simd_send_buf[i][comm_off];
 	}
 	int sx   = (x+sshift)%rd;
@@ -1199,14 +1205,14 @@ public:
 	  face_table.resize(face_idx+1);
 	  std::vector<std::pair<int,int> >  face_table_host ;
-	  Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table_host);
+	  Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host);
 	  face_table[face_idx].resize(face_table_host.size());
 	  acceleratorCopyToDevice(&face_table_host[0],
 				  &face_table[face_idx][0],
 				  face_table[face_idx].size()*sizeof(face_table_host[0]));
 	}
-	//	if ( comms_send )
+	if ( comms_send || comms_recv )
 	  Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type);
 	face_idx++;
@@ -1226,8 +1232,8 @@ public:
 	  int nbr_plane = nbr_ic;
 	  assert (sx == nbr_ox);
-	  auto rp = &u_simd_recv_buf[i        ][u_comm_offset];
+	  auto rp = &u_simd_recv_buf[i        ][comm_off];
-	  auto sp = &u_simd_send_buf[nbr_plane][u_comm_offset];
+	  auto sp = &u_simd_send_buf[nbr_plane][comm_off];
 	  if(nbr_proc){
@@ -1253,9 +1259,12 @@ public:
 	  }
 	}
-	AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers);
+	if ( comms_recv ) {
 	  AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
 	}
 	u_comm_offset     +=buffer_size;
      }
    }
    return 0;
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -16,6 +16,7 @@ void     acceleratorThreads(uint32_t t) {accelerator_threads = t;};
 #ifdef GRID_CUDA
 cudaDeviceProp *gpu_props;
 cudaStream_t copyStream;
 cudaStream_t cpuStream;
 void acceleratorInit(void)
 {
  int nDevices = 1;
@@ -98,6 +99,7 @@ void acceleratorInit(void)
  cudaSetDevice(device);
  cudaStreamCreate(&copyStream);
  cudaStreamCreate(&cpuStream);
  const int len=64;
  char busid[len];
  if( rank == world_rank ) { 
@@ -112,6 +114,7 @@ void acceleratorInit(void)
 #ifdef GRID_HIP
 hipDeviceProp_t *gpu_props;
 hipStream_t copyStream;
 hipStream_t cpuStream;
 void acceleratorInit(void)
 {
  int nDevices = 1;
@@ -180,6 +183,7 @@ void acceleratorInit(void)
 #endif
  hipSetDevice(device);
  hipStreamCreate(&copyStream);
  hipStreamCreate(&cpuStream);
  const int len=64;
  char busid[len];
  if( rank == world_rank ) { 
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -107,6 +107,7 @@ void     acceleratorInit(void);
 extern int acceleratorAbortOnGpuError;
 extern cudaStream_t copyStream;
 extern cudaStream_t cpuStream;
 accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #ifdef GRID_SIMT
@@ -134,7 +135,7 @@ inline void cuda_mem(void)
    };									\
    dim3 cu_threads(nsimd,acceleratorThreads(),1);			\
    dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\
-    LambdaApply<<<cu_blocks,cu_threads>>>(num1,num2,nsimd,lambda);	\
+    LambdaApply<<<cu_blocks,cu_threads,0,cpuStream>>>(num1,num2,nsimd,lambda);	\
  }
 #define accelerator_for6dNB(iter1, num1,				\
@@ -153,7 +154,7 @@ inline void cuda_mem(void)
    };									\
    dim3 cu_blocks (num1,num2,num3);					\
    dim3 cu_threads(num4,num5,num6);					\
-    Lambda6Apply<<<cu_blocks,cu_threads>>>(num1,num2,num3,num4,num5,num6,lambda); \
+    Lambda6Apply<<<cu_blocks,cu_threads,0,cpuStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
  }
 template<typename lambda>  __global__
@@ -189,7 +190,7 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
 #define accelerator_barrier(dummy)					\
  {									\
-    cudaDeviceSynchronize();						\
+    cudaStreamSynchronize(cpuStream);					\
    cudaError err = cudaGetLastError();					\
    if ( cudaSuccess != err ) {						\
      printf("accelerator_barrier(): Cuda error %s \n",			\
@@ -339,6 +340,7 @@ NAMESPACE_BEGIN(Grid);
 #define accelerator_inline __host__ __device__ inline
 extern hipStream_t copyStream;
 extern hipStream_t cpuStream;
 /*These routines define mapping from thread grid to loop & vector lane indexing */
 accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #ifdef GRID_SIMT
@@ -360,11 +362,11 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
    dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
    if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \
      hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads,		\
-            0,0,						\
+			 0,cpuStream,					\
 			 num1,num2,nsimd, lambda);			\
    } else { \
      hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads,		\
-            0,0,						\
+			 0,cpuStream,					\
 			 num1,num2,nsimd, lambda);			\
    } \
  }
@@ -398,7 +400,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
 #define accelerator_barrier(dummy)				\
  {								\
-    hipDeviceSynchronize();					\
+    hipStreamSynchronize(cpuStream);					\
    auto err = hipGetLastError();				\
    if ( err != hipSuccess ) {					\
      printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
--- a/HMC/Mobius2p1f_DD_RHMC_96I.cc
+++ b/HMC/Mobius2p1f_DD_RHMC_96I.cc
@@ -0,0 +1,419 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./tests/Test_hmc_EODWFRatio.cc
 Copyright (C) 2015-2016
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 int main(int argc, char **argv) {
  using namespace Grid;
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
   // Typedefs to simplify notation
  typedef WilsonImplR FermionImplPolicy;
  typedef MobiusFermionR FermionAction;
  typedef typename FermionAction::FermionField FermionField;
  typedef Grid::XmlReader       Serialiser;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
  //  MD.name    = std::string("Leap Frog");
  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
  //  MD.name    = std::string("Force Gradient");
  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps =  6;
  MD.trajL   = 1.0;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = 1077;
  HMCparams.Trajectories     = 1;
  HMCparams.NoMetropolisUntil=  0;
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
  //  HMCparams.StartingType     =std::string("ColdStart");
  HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_DDHMC_lat";
  CPparams.rng_prefix    = "ckpoint_DDHMC_rng";
  CPparams.saveInterval  = 1;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  // Construct observables
  // here there is too much indirection
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  const int Ls      = 12;
  RealD M5  = 1.8;
  RealD b   = 1.5;
  RealD c   = 0.5;
  //  Real beta         = 2.31;
  //  Real light_mass   = 5.4e-4;
  Real beta         = 2.13;
  Real light_mass   = 7.8e-4;
  Real strange_mass = 0.02132;
  Real pv_mass      = 1.0;
  //  std::vector<Real> hasenbusch({ light_mass, 3.8e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass });
  std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass });
  // FIXME:
  // Same in MC and MD
  // Need to mix precision too
  OneFlavourRationalParams SFRp; // Strange
  SFRp.lo       = 4.0e-3;
  SFRp.hi       = 90.0;
  SFRp.MaxIter  = 60000;
  SFRp.tolerance= 1.0e-8;
  SFRp.mdtolerance= 1.0e-4;
  SFRp.degree   = 12;
  SFRp.precision= 50;
  SFRp.BoundsCheckFreq=0;
  OneFlavourRationalParams OFRp; // Up/down
  OFRp.lo       = 2.0e-5;
  OFRp.hi       = 90.0;
  OFRp.MaxIter  = 60000;
  OFRp.tolerance= 1.0e-7;
  OFRp.mdtolerance= 1.0e-4;
  //  OFRp.degree   = 20; converges
  //  OFRp.degree   = 16;
  OFRp.degree   = 12;
  OFRp.precision= 80;
  OFRp.BoundsCheckFreq=0;
  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
  ////////////////////////////////////////////////////////////////
  // Domain decomposed
  ////////////////////////////////////////////////////////////////
  Coordinate latt4  = GridPtr->GlobalDimensions();
  Coordinate mpi    = GridPtr->ProcessorGrid();
  Coordinate shm;
  GlobalSharedMemory::GetShmDims(mpi,shm);
  Coordinate CommDim(Nd);
  for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
  Coordinate NonDirichlet(Nd+1,0);
  Coordinate Dirichlet(Nd+1,0);
  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
  Coordinate Block4(Nd);
  //  Block4[0] = Dirichlet[1];
  //  Block4[1] = Dirichlet[2];
  //  Block4[2] = Dirichlet[3];
  Block4[0] = 0;
  Block4[1] = 0;
  Block4[2] = 0;
  Block4[3] = Dirichlet[4];
  int Width=3;
  TheHMC.Resources.SetMomentumFilter(new DDHMCFilter<WilsonImplR::Field>(Block4,Width));
  //////////////////////////
  // Fermion Grid
  //////////////////////////
  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
  IwasakiGaugeActionR GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeField U(GridPtr);
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file
  TheHMC.initializeGaugeFieldAndRNGs(U);
  // These lines are unecessary if BC are all periodic
  std::vector<Complex> boundary = {1,1,1,-1};
  FermionAction::ImplParams Params(boundary);
  Params.dirichlet=NonDirichlet;
  FermionAction::ImplParams ParamsDir(boundary);
  ParamsDir.dirichlet=Dirichlet;
  //  double StoppingCondition = 1e-14;
  //  double MDStoppingCondition = 1e-9;
  double StoppingCondition = 1e-8;
  double MDStoppingCondition = 1e-6;
  double MaxCGIterations = 300000;
  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
  ConjugateGradient<FermionField>  MDCG(MDStoppingCondition,MaxCGIterations);
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1);
  ActionLevel<HMCWrapper::Field> Level2(4);
  ActionLevel<HMCWrapper::Field> Level3(8);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params);
  FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass,  M5,b,c, Params);
  FermionAction StrangeOpDir (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, ParamsDir);
  FermionAction StrangePauliVillarsOpDir(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass,  M5,b,c, ParamsDir);
  OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionBdy(StrangeOpDir,StrangeOp,SFRp);
  OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionLocal(StrangePauliVillarsOpDir,StrangeOpDir,SFRp);
  OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionPVBdy(StrangePauliVillarsOp,StrangePauliVillarsOpDir,SFRp);
  Level1.push_back(&StrangePseudoFermionBdy);
  Level2.push_back(&StrangePseudoFermionLocal);
  Level1.push_back(&StrangePseudoFermionPVBdy);
  ////////////////////////////////////
  // up down action
  ////////////////////////////////////
  std::vector<Real> light_den;
  std::vector<Real> light_num;
  std::vector<int> dirichlet_den;
  std::vector<int> dirichlet_num;
  int n_hasenbusch = hasenbusch.size();
  light_den.push_back(light_mass);  dirichlet_den.push_back(0);
  for(int h=0;h<n_hasenbusch;h++){
    light_den.push_back(hasenbusch[h]); dirichlet_den.push_back(1);
  }
  for(int h=0;h<n_hasenbusch;h++){
    light_num.push_back(hasenbusch[h]); dirichlet_num.push_back(1);
  }
  light_num.push_back(pv_mass);  dirichlet_num.push_back(0);
  std::vector<FermionAction *> Numerators;
  std::vector<FermionAction *> Denominators;
  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
  std::vector<OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
  for(int h=0;h<n_hasenbusch+1;h++){
    std::cout << GridLogMessage
 	      << " 2f quotient Action ";
    std::cout << "det D("<<light_den[h]<<")";
    if ( dirichlet_den[h] ) std::cout << "^dirichlet    ";
    std::cout << "/ det D("<<light_num[h]<<")";
    if ( dirichlet_num[h] ) std::cout << "^dirichlet    ";
    std::cout << std::endl;
    FermionAction::ImplParams ParamsNum(boundary);
    FermionAction::ImplParams ParamsDen(boundary);
    if ( dirichlet_num[h]==1) ParamsNum.dirichlet = Dirichlet;
    else                      ParamsNum.dirichlet = NonDirichlet;
    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
    if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
    else                      ParamsDen.dirichlet = NonDirichlet;
    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
    if(h!=0) {
      Quotients.push_back   (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],MDCG,CG));
    } else {
      Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
      Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
    }
  }
  int nquo=Quotients.size();
  Level1.push_back(Bdys[0]);
  Level1.push_back(Bdys[1]);
  for(int h=0;h<nquo-1;h++){
    Level2.push_back(Quotients[h]);
  }
  Level2.push_back(Quotients[nquo-1]);
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level3.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  TheHMC.TheAction.push_back(Level3);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  /////////////////////////////////////////////////////////////
  if(1){
    // TODO:
    // i)  Break high bound, how rapidly does it break? Tune this test.
    // ii) Break low bound, how rapidly?
    // iii) Run lanczos
    // iv)  Have CG return spectral range estimate
    FermionField vec(StrangeOp.FermionRedBlackGrid());
    FermionField res(StrangeOp.FermionRedBlackGrid());
    vec = 1; // Fill with any old junk
    std::cout << "Bounds check on strange operator mass "<< StrangeOp.Mass()<<std::endl;
    SchurDifferentiableOperator<FermionImplPolicy> SdagS(StrangeOp);
    HighBoundCheck(SdagS,vec,SFRp.hi);
    ChebyBoundsCheck(SdagS,vec,SFRp.lo,SFRp.hi);
    std::cout << "Strange inversion"<<std::endl;
    res=Zero();
    //    MDCG(SdagS,vec,res);
    std::cout << "Bounds check on light quark operator mass "<< Denominators[0]->Mass() <<std::endl;
    SchurDifferentiableOperator<FermionImplPolicy> UdagU(*Denominators[0]);
    HighBoundCheck(UdagU,vec,OFRp.hi);
    ChebyBoundsCheck(UdagU,vec,OFRp.lo,OFRp.hi);
    std::cout << "light inversion"<<std::endl;
    res=Zero();
    //    MDCG(UdagU,vec,res);
    std::cout << "Bounds check on strange dirichlet operator mass "<< StrangeOpDir.Mass()<<std::endl;
    SchurDifferentiableOperator<FermionImplPolicy> SddagSd(StrangeOpDir);
    HighBoundCheck(SddagSd,vec,OFRp.hi);
    ChebyBoundsCheck(SddagSd,vec,OFRp.lo,OFRp.hi);
    std::cout << "strange dirichlet inversion"<<std::endl;
    res=Zero();
    //    MDCG(SddagSd,vec,res);
    std::cout << "Bounds check on light dirichlet operator mass "<< Numerators[0]->Mass()<<std::endl;
    SchurDifferentiableOperator<FermionImplPolicy> UddagUd(*Numerators[0]);
    HighBoundCheck(UddagUd,vec,OFRp.hi);
    ChebyBoundsCheck(UddagUd,vec,OFRp.lo,OFRp.hi);
    std::cout << "light dirichlet inversion"<<std::endl;
    res=Zero();
    //MDCG(UddagUd,vec,res);
    auto grid4= GridPtr;
    auto rbgrid4= GridRBPtr;
    auto rbgrid = StrangeOp.FermionRedBlackGrid();
    auto grid = StrangeOp.FermionGrid();
    if(1){
    const int Nstop = 5;
    const int Nk = 20;
    const int Np = 20;
    const int Nm = Nk+Np;
    const int MaxIt= 10000;
    int Nconv;
    RealD resid = 1.0e-5;
    if(0)
    {
      int order = 501;
      RealD bound = 5.0e-4;
      std::cout << GridLogMessage << " Lanczos for dirichlet bound " << bound<<" order "<< order<<std::endl;
      Chebyshev<FermionField> Cheby(bound,90.,order);
      FunctionHermOp<FermionField> OpCheby(Cheby,UddagUd);
      PlainHermOp<FermionField> Op     (UddagUd);
      ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby,Op,Nstop,Nk,Nm,resid,MaxIt);
      std::vector<RealD>          eval(Nm);
      std::vector<FermionField> evec(Nm,rbgrid);
      FermionField    src(rbgrid);src = 1.0;
      IRL.calc(eval,evec,src,Nconv);
      FermionField tmp(rbgrid);
      FermionField ftmp(grid);
      FermionField ftmp4(grid4);
      for(int ev=0;ev<evec.size();ev++){
 	Gamma GT(Gamma::Algebra::GammaT);
 	std::cout << " evec " << ev << std::endl;
 	tmp = evec[ev] + GT*evec[ev];
 	DumpSliceNorm(" 1+gammaT ",tmp,Nd);
 	tmp = evec[ev] - GT*evec[ev];
 	DumpSliceNorm(" 1-gammaT ",tmp,Nd);
      }
      for(int e=0;e<10;e++){
 	std::cout << " Dirichlet evec "<<e<<std::endl;
 	tmp = evec[e];
 	for(int s=0;s<Ls;s++){
 	  ftmp=Zero();
 	  setCheckerboard(ftmp,tmp);
 	  ExtractSlice(ftmp4,ftmp,s,0);
 	  std::cout << "s-slice "<<s<< " evec[0] " << std::endl;
 	  DumpSliceNorm(" s-slice ",ftmp4,Nd-1);
 	}
      }
    }
    if(1)
    {
      int order = 2001;
      RealD bound = 6.0e-5;
      std::cout << GridLogMessage << " Lanczos for full operator  bound " << bound<<" order "<< order<<std::endl;
      Chebyshev<FermionField> Cheby(bound,90.,order);
      FunctionHermOp<FermionField> OpCheby(Cheby,UdagU);
      PlainHermOp<FermionField> Op     (UdagU);
      ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby,Op,Nstop,Nk,Nm,resid,MaxIt);
      std::vector<RealD>          eval(Nm);
      std::vector<FermionField> evec(Nm,rbgrid);
      FermionField    src(rbgrid); src = 1.0;
      IRL.calc(eval,evec,src,Nconv);
      FermionField tmp(rbgrid);
      FermionField ftmp(grid);
      FermionField ftmp4(grid4);
      for(int e=0;e<evec.size();e++){
 	std::cout << " Full evec "<<e<<std::endl;
 	tmp = evec[e];
 	for(int s=0;s<Ls;s++){
 	  ftmp=Zero();
 	  setCheckerboard(ftmp,tmp);
 	  ExtractSlice(ftmp4,ftmp,s,0);
 	  std::cout << "s-slice "<<s<< " evec[0] " << std::endl;
 	  DumpSliceNorm(" s-slice ",ftmp4,Nd-1);
 	}
      }
    }
    Grid_finalize();
    std::cout << " All done "<<std::endl;
    exit(EXIT_SUCCESS);
    }
  }
  TheHMC.Run();  // no smearing
  Grid_finalize();
 } // main
--- a/HMC/Mobius2p1f_DD_RHMC_96I_mixed.cc
+++ b/HMC/Mobius2p1f_DD_RHMC_96I_mixed.cc
@@ -0,0 +1,444 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./tests/Test_hmc_EODWFRatio.cc
 Copyright (C) 2015-2016
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 NAMESPACE_BEGIN(Grid);
 template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
  class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
  public:
    typedef typename FermionOperatorD::FermionField FieldD;
    typedef typename FermionOperatorF::FermionField FieldF;
    using OperatorFunction<FieldD>::operator();
    RealD   Tolerance;
    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
    Integer MaxOuterIterations;
    GridBase* SinglePrecGrid4; //Grid for single-precision fields
    GridBase* SinglePrecGrid5; //Grid for single-precision fields
    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
    FermionOperatorF &FermOpF;
    FermionOperatorD &FermOpD;;
    SchurOperatorF &LinOpF;
    SchurOperatorD &LinOpD;
    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
    MixedPrecisionConjugateGradientOperatorFunction(RealD tol, 
 						    Integer maxinnerit, 
 						    Integer maxouterit, 
 						    GridBase* _sp_grid4, 
 						    GridBase* _sp_grid5, 
 						    FermionOperatorF &_FermOpF,
 						    FermionOperatorD &_FermOpD,
 						    SchurOperatorF   &_LinOpF,
 						    SchurOperatorD   &_LinOpD): 
      LinOpF(_LinOpF),
      LinOpD(_LinOpD),
      FermOpF(_FermOpF),
      FermOpD(_FermOpD),
      Tolerance(tol), 
      InnerTolerance(tol), 
      MaxInnerIterations(maxinnerit), 
      MaxOuterIterations(maxouterit), 
      SinglePrecGrid4(_sp_grid4),
      SinglePrecGrid5(_sp_grid5),
      OuterLoopNormMult(100.) 
    { 
      /* Debugging instances of objects; references are stored
      std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpF " <<std::hex<< &LinOpF<<std::dec <<std::endl;
      std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpD " <<std::hex<< &LinOpD<<std::dec <<std::endl;
      std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpF " <<std::hex<< &FermOpF<<std::dec <<std::endl;
      std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpD " <<std::hex<< &FermOpD<<std::dec <<std::endl;
      */
    };
    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
      //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpU " <<std::hex<< &(SchurOpU->_Mat)<<std::dec <<std::endl;
      //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpD " <<std::hex<< &(LinOpD._Mat) <<std::dec <<std::endl;
      // Assumption made in code to extract gauge field
      // We could avoid storing LinopD reference alltogether ?
      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
      ////////////////////////////////////////////////////////////////////////////////////
      // Must snarf a single precision copy of the gauge field in Linop_d argument
      ////////////////////////////////////////////////////////////////////////////////////
      typedef typename FermionOperatorF::GaugeField GaugeFieldF;
      typedef typename FermionOperatorF::GaugeLinkField GaugeLinkFieldF;
      typedef typename FermionOperatorD::GaugeField GaugeFieldD;
      typedef typename FermionOperatorD::GaugeLinkField GaugeLinkFieldD;
      GridBase * GridPtrF = SinglePrecGrid4;
      GridBase * GridPtrD = FermOpD.Umu.Grid();
      GaugeFieldF     U_f  (GridPtrF);
      GaugeLinkFieldF Umu_f(GridPtrF);
      //      std::cout << " Dim gauge field "<<GridPtrF->Nd()<<std::endl; // 4d
      //      std::cout << " Dim gauge field "<<GridPtrD->Nd()<<std::endl; // 4d
      ////////////////////////////////////////////////////////////////////////////////////
      // Moving this to a Clone method of fermion operator would allow to duplicate the 
      // physics parameters and decrease gauge field copies
      ////////////////////////////////////////////////////////////////////////////////////
      GaugeLinkFieldD Umu_d(GridPtrD);
      for(int mu=0;mu<Nd*2;mu++){ 
 	Umu_d = PeekIndex<LorentzIndex>(FermOpD.Umu, mu);
 	precisionChange(Umu_f,Umu_d);
 	PokeIndex<LorentzIndex>(FermOpF.Umu, Umu_f, mu);
      }
      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
      ////////////////////////////////////////////////////////////////////////////////////
      // Make a mixed precision conjugate gradient
      ////////////////////////////////////////////////////////////////////////////////////
      MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
      std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
      MPCG(src,psi);
    }
  };
 NAMESPACE_END(Grid);
 int main(int argc, char **argv) {
  using namespace Grid;
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
   // Typedefs to simplify notation
  typedef WilsonImplR FermionImplPolicy;
  typedef WilsonImplF FermionImplPolicyF;
  typedef MobiusFermionR FermionAction;
  typedef MobiusFermionF FermionActionF;
  typedef typename FermionAction::FermionField FermionField;
  typedef typename FermionActionF::FermionField FermionFieldF;
  typedef Grid::XmlReader       Serialiser;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
  //  MD.name    = std::string("Leap Frog");
  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
  //  MD.name    = std::string("Force Gradient");
  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps =  4;
  MD.trajL   = 1.0;
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = 1077;
  HMCparams.Trajectories     = 1;
  HMCparams.NoMetropolisUntil=  0;
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
  //  HMCparams.StartingType     =std::string("ColdStart");
  HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_DDHMC_lat";
  CPparams.rng_prefix    = "ckpoint_DDHMC_rng";
  CPparams.saveInterval  = 1;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  // Construct observables
  // here there is too much indirection
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  const int Ls      = 12;
  RealD M5  = 1.8;
  RealD b   = 1.5;
  RealD c   = 0.5;
  Real beta         = 2.31;
  //  Real light_mass   = 5.4e-4;
  Real light_mass   = 7.8e-4;
  Real strange_mass = 0.02132;
  Real pv_mass      = 1.0;
  std::vector<Real> hasenbusch({ light_mass, 3.8e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass });
  // FIXME:
  // Same in MC and MD
  // Need to mix precision too
  OneFlavourRationalParams SFRp; // Strange
  SFRp.lo       = 4.0e-3;
  SFRp.hi       = 90.0;
  SFRp.MaxIter  = 60000;
  SFRp.tolerance= 1.0e-8;
  SFRp.mdtolerance= 1.0e-6;
  SFRp.degree   = 12;
  SFRp.precision= 50;
  SFRp.BoundsCheckFreq=0;
  OneFlavourRationalParams OFRp; // Up/down
  OFRp.lo       = 2.0e-5;
  OFRp.hi       = 90.0;
  OFRp.MaxIter  = 60000;
  OFRp.tolerance= 1.0e-8;
  OFRp.mdtolerance= 1.0e-6;
  //  OFRp.degree   = 20; converges
  //  OFRp.degree   = 16;
  OFRp.degree   = 12;
  OFRp.precision= 80;
  OFRp.BoundsCheckFreq=0;
  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
  typedef SchurDiagMooeeOperator<FermionActionF,FermionFieldF> LinearOperatorF;
  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
  typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusFermionD,MobiusFermionF,LinearOperatorD,LinearOperatorF> MxPCG;
  ////////////////////////////////////////////////////////////////
  // Domain decomposed
  ////////////////////////////////////////////////////////////////
  Coordinate latt4  = GridPtr->GlobalDimensions();
  Coordinate mpi    = GridPtr->ProcessorGrid();
  Coordinate shm;
  GlobalSharedMemory::GetShmDims(mpi,shm);
  Coordinate CommDim(Nd);
  for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
  Coordinate NonDirichlet(Nd+1,0);
  Coordinate Dirichlet(Nd+1,0);
  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
  Coordinate Block4(Nd);
  Block4[0] = Dirichlet[1];
  Block4[1] = Dirichlet[2];
  Block4[2] = Dirichlet[3];
  Block4[3] = Dirichlet[4];
  int Width=3;
  TheHMC.Resources.SetMomentumFilter(new DDHMCFilter<WilsonImplR::Field>(Block4,Width));
  //////////////////////////
  // Fermion Grids
  //////////////////////////
  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
  Coordinate simdF = GridDefaultSimd(Nd,vComplexF::Nsimd());
  auto GridPtrF   = SpaceTimeGrid::makeFourDimGrid(latt4,simdF,mpi);
  auto GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF);
  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF);
  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF);
  IwasakiGaugeActionR GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeField U(GridPtr);
  LatticeGaugeFieldF UF(GridPtrF);
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file
  TheHMC.initializeGaugeFieldAndRNGs(U);
  // These lines are unecessary if BC are all periodic
  std::vector<Complex> boundary = {1,1,1,-1};
  FermionAction::ImplParams Params(boundary);
  Params.dirichlet=NonDirichlet;
  FermionAction::ImplParams ParamsDir(boundary);
  ParamsDir.dirichlet=Dirichlet;
  //  double StoppingCondition = 1e-14;
  //  double MDStoppingCondition = 1e-9;
  double StoppingCondition = 1e-10;
  double MDStoppingCondition = 1e-7;
  double MDStoppingConditionLoose = 1e-6;
  double MaxCGIterations = 300000;
  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
  ConjugateGradient<FermionField>  MDCG(MDStoppingCondition,MaxCGIterations);
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  ActionLevel<HMCWrapper::Field> Level1(1);
  ActionLevel<HMCWrapper::Field> Level2(4);
  ActionLevel<HMCWrapper::Field> Level3(8);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params);
  FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass,  M5,b,c, Params);
  FermionAction StrangeOpDir (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, ParamsDir);
  FermionAction StrangePauliVillarsOpDir(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass,  M5,b,c, ParamsDir);
  OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionBdy(StrangeOpDir,StrangeOp,SFRp);
  OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionLocal(StrangePauliVillarsOpDir,StrangeOpDir,SFRp);
  OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionPVBdy(StrangePauliVillarsOp,StrangePauliVillarsOpDir,SFRp);
  Level1.push_back(&StrangePseudoFermionBdy);
  Level2.push_back(&StrangePseudoFermionLocal);
  Level1.push_back(&StrangePseudoFermionPVBdy);
  ////////////////////////////////////
  // up down action
  ////////////////////////////////////
  std::vector<Real> light_den;
  std::vector<Real> light_num;
  std::vector<int> dirichlet_den;
  std::vector<int> dirichlet_num;
  int n_hasenbusch = hasenbusch.size();
  light_den.push_back(light_mass);  dirichlet_den.push_back(0);
  for(int h=0;h<n_hasenbusch;h++){
    light_den.push_back(hasenbusch[h]); dirichlet_den.push_back(1);
  }
  for(int h=0;h<n_hasenbusch;h++){
    light_num.push_back(hasenbusch[h]); dirichlet_num.push_back(1);
  }
  light_num.push_back(pv_mass);  dirichlet_num.push_back(0);
  std::vector<FermionAction *> Numerators;
  std::vector<FermionAction *> Denominators;
  std::vector<FermionActionF *> DenominatorsF;
  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
  std::vector<OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
  std::vector<MxPCG *> ActionMPCG;
  std::vector<MxPCG *> MPCG;
  typedef SchurDiagMooeeOperator<FermionActionF,FermionFieldF> LinearOperatorF;
  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
  std::vector<LinearOperatorD *> LinOpD;
  std::vector<LinearOperatorF *> LinOpF; 
  for(int h=0;h<n_hasenbusch+1;h++){
    std::cout << GridLogMessage
 	      << " 2f quotient Action ";
    std::cout << "det D("<<light_den[h]<<")";
    if ( dirichlet_den[h] ) std::cout << "^dirichlet    ";
    std::cout << "/ det D("<<light_num[h]<<")";
    if ( dirichlet_num[h] ) std::cout << "^dirichlet    ";
    std::cout << std::endl;
    FermionAction::ImplParams ParamsNum(boundary);
    FermionAction::ImplParams ParamsDen(boundary);
    FermionActionF::ImplParams ParamsDenF(boundary);
    if ( dirichlet_num[h]==1) ParamsNum.dirichlet = Dirichlet;
    else                      ParamsNum.dirichlet = NonDirichlet;
    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
    if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
    else                      ParamsDen.dirichlet = NonDirichlet;
    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
    ParamsDenF.dirichlet = ParamsDen.dirichlet;
    DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF));
    LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
    LinOpF.push_back(new LinearOperatorF(*DenominatorsF[h]));
    double conv  = MDStoppingCondition;
    if (h<3) conv= MDStoppingConditionLoose; // Relax on first two hasenbusch factors
    const int MX_inner = 5000;
    MPCG.push_back(new MxPCG(conv,
 			     MX_inner,
 			     MaxCGIterations,
 			     GridPtrF,
 			     FrbGridF,
 			     *DenominatorsF[h],*Denominators[h],
 			     *LinOpF[h], *LinOpD[h]) );
    ActionMPCG.push_back(new MxPCG(StoppingCondition,
 				   MX_inner,
 				   MaxCGIterations,
 				   GridPtrF,
 				   FrbGridF,
 				   *DenominatorsF[h],*Denominators[h],
 				   *LinOpF[h], *LinOpD[h]) );
    if(h!=0) {
      //      Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],MDCG,CG));
      Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],CG));
    } else {
      Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
      Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
    }
  }
  int nquo=Quotients.size();
  Level1.push_back(Bdys[0]);
  Level1.push_back(Bdys[1]);
  for(int h=0;h<nquo-1;h++){
    Level2.push_back(Quotients[h]);
  }
  Level2.push_back(Quotients[nquo-1]);
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level3.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  TheHMC.TheAction.push_back(Level3);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  /////////////////////////////////////////////////////////////
  TheHMC.Run();  // no smearing
  Grid_finalize();
 } // main
--- a/HMC/RNGstate.cc
+++ b/HMC/RNGstate.cc
@@ -0,0 +1,53 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file:
 Copyright (C) 2015-2016
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 int main(int argc, char **argv)
 {
  using namespace Grid;
  Grid_init(&argc, &argv);
  Coordinate latt4  = GridDefaultLatt();
  Coordinate mpi    = GridDefaultMpi();
  Coordinate simd   = GridDefaultSimd(Nd,vComplexD::Nsimd());
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4,simd,mpi);
  GridSerialRNG   sRNG;         sRNG.SeedUniqueString(std::string("The Serial RNG"));
  GridParallelRNG pRNG(UGrid);  pRNG.SeedUniqueString(std::string("The 4D RNG"));
  std::string rngfile("ckpoint_rng.0");
  NerscIO::writeRNGState(sRNG, pRNG, rngfile);
  Grid_finalize();
 }
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -191,9 +191,7 @@ int main (int argc, char ** argv)
    std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
    double t0=usecond();
    for(int i=0;i<ncall;i++){
      __SSC_START;
      Dw.Dhop(src,result,0);
      __SSC_STOP;
    }
    double t1=usecond();
    FGrid->Barrier();
--- a/benchmarks/Benchmark_dwf_fp32.cc
+++ b/benchmarks/Benchmark_dwf_fp32.cc
@@ -249,8 +249,9 @@ void Benchmark(int Ls, Coordinate Dirichlet)
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
-  DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  DomainWallFermionF::ImplParams p;
-  Dw.DirichletBlock(Dirichlet);
+  p.dirichlet=Dirichlet;
  DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
  Dw.ImportGauge(Umu);
  int ncall =300;
@@ -261,9 +262,7 @@ void Benchmark(int Ls, Coordinate Dirichlet)
    std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
    double t0=usecond();
    for(int i=0;i<ncall;i++){
      __SSC_START;
      Dw.Dhop(src,result,0);
      __SSC_STOP;
    }
    double t1=usecond();
    FGrid->Barrier();
--- a/benchmarks/Benchmark_mooee.cc
+++ b/benchmarks/Benchmark_mooee.cc
@@ -81,8 +81,8 @@ int main (int argc, char ** argv)
    Vector<Coeff_t> diag = Dw.bs;
    Vector<Coeff_t> upper= Dw.cs;
    Vector<Coeff_t> lower= Dw.cs;
-    upper[Ls-1]=-Dw.mass*upper[Ls-1];
+    upper[Ls-1]=-Dw.mass_minus*upper[Ls-1];
-    lower[0]   =-Dw.mass*lower[0];
+    lower[0]   =-Dw.mass_plus*lower[0];
    LatticeFermion r_eo(FGrid);
    LatticeFermion src_e (FrbGrid);
--- a/benchmarks/Benchmark_wilson_sweep.cc
+++ b/benchmarks/Benchmark_wilson_sweep.cc
@@ -44,6 +44,13 @@ void bench_wilson (
 		   double const     volume,
 		   int const           dag );
 void bench_wilson_eo (
       LatticeFermion &    src,
       LatticeFermion & result,
       WilsonFermionR &     Dw,
       double const     volume,
       int const           dag );
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
@@ -110,8 +117,8 @@ int main (int argc, char ** argv)
 	  bench_wilson(src,result,Dw,volume,DaggerYes);
 	  std::cout << "\t";
    // EO
-	  bench_wilson(src,result,Dw,volume,DaggerNo);
+    bench_wilson_eo(src_o,result_e,Dw,volume,DaggerNo);
-	  bench_wilson(src,result,Dw,volume,DaggerYes);
+    bench_wilson_eo(src_o,result_e,Dw,volume,DaggerYes);
 	  std::cout << std::endl;
 	}
    }
--- a/configure.ac
+++ b/configure.ac
@@ -159,7 +159,7 @@ case ${ac_ZMOBIUS} in
 esac
 ############### Nc
 AC_ARG_ENABLE([Nc],
-    [AC_HELP_STRING([--enable-Nc=2|3|4], [enable number of colours])],
+    [AC_HELP_STRING([--enable-Nc=2|3|4|5], [enable number of colours])],
    [ac_Nc=${enable_Nc}], [ac_Nc=3])
 case ${ac_Nc} in
@@ -394,11 +394,10 @@ case ${CXXTEST} in
    fi
    ;;
  hipcc)
 #    CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
    CXXFLAGS="$CXXFLAGS -fno-strict-aliasing"
    CXXLD=${CXX}
    if test $ac_openmp = yes; then
-       CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp"
+       CXXFLAGS="$CXXFLAGS -fopenmp"
    fi
    ;;
  dpcpp)
@@ -557,16 +556,19 @@ esac
 AC_ARG_ENABLE([setdevice],[AC_HELP_STRING([--enable-setdevice | --disable-setdevice],
              [Set GPU to rank in node with cudaSetDevice or similar])],[ac_SETDEVICE=${enable_SETDEVICE}],[ac_SETDEVICE=no])
 case ${ac_SETDEVICE} in
-    yes);;
+    yes)
-    no)
+	echo ENABLE SET DEVICE
    ;;
    *)
     AC_DEFINE([GRID_DEFAULT_GPU],[1],[GRID_DEFAULT_GPU] )
   	echo DISABLE SET DEVICE
    ;;
 esac
 #########################################################
 ######################  Shared memory intranode #########
 #########################################################
-AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|shmget|hugetlbfs|shmnone|nvlink|no],
+AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|shmget|hugetlbfs|shmnone|nvlink|no|none],
              [Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=no])
 case ${ac_SHM} in
@@ -586,7 +588,7 @@ case ${ac_SHM} in
     AC_DEFINE([GRID_MPI3_SHMGET],[1],[GRID_MPI3_SHMGET] )
     ;;
-     shmnone | no)
+     shmnone | no | none)
     AC_DEFINE([GRID_MPI3_SHM_NONE],[1],[GRID_MPI3_SHM_NONE] )
     ;;
--- a/examples/Example_Laplacian.cc
+++ b/examples/Example_Laplacian.cc
@@ -93,14 +93,14 @@ template<class Field> class FreeLaplacianStencil : public SparseMatrixBase<Field
 {
 public:
  typedef typename Field::vector_object siteObject;
-  typedef CartesianStencil<siteObject, siteObject, int> StencilImpl;
+  typedef CartesianStencil<siteObject, siteObject, SimpleStencilParams> StencilImpl;
  GridBase *grid;
  StencilImpl Stencil;
  SimpleCompressor<siteObject> Compressor;
  FreeLaplacianStencil(GridBase *_grid)
-    : Stencil    (_grid,6,Even,directions,displacements,0), grid(_grid)
+    : Stencil    (_grid,6,Even,directions,displacements,SimpleStencilParams()), grid(_grid)
  {  };
  virtual GridBase *Grid(void) { return grid; };
@@ -168,7 +168,8 @@ public:
  typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
  typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
-  typedef CartesianStencil<siteObject, siteObject, int> StencilImpl;
+  typedef CartesianStencil<siteObject, siteObject,SimpleStencilParams> StencilImpl;
  SimpleStencilParams p;
  GridBase *grid;
  StencilImpl Stencil;
@@ -177,7 +178,7 @@ public:
  CovariantLaplacianStencil(GaugeField &Umu)
    :
      grid(Umu.Grid()),
-      Stencil    (grid,6,Even,directions,displacements,0),
+      Stencil    (grid,6,Even,directions,displacements,p),
      Uds(grid)
  {
    for (int mu = 0; mu < Nd; mu++) {
--- a/systems/Crusher/dwf4.slurm
+++ b/systems/Crusher/dwf4.slurm
@@ -7,21 +7,19 @@
 #SBATCH -o DWF.%J
 #SBATCH -e DWF.%J
 #SBATCH -N 1
-#SBATCH -n 4
+#SBATCH -n 2
-#SBATCH --exclusive
+#SBATCH --gpu-bind=map_gpu:0,1
 DIR=.
-module list
+source setup.sh
 export MPICH_OFI_NIC_POLICY=GPU
 export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
 export MPICH_GPU_SUPPORT_ENABLED=1
-#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
+export OMP_NUM_THREADS=16
 export MPICH_SMP_SINGLE_COPY_MODE=NONE
 #export MPICH_SMP_SINGLE_COPY_MODE=CMA
 export OMP_NUM_THREADS=4
 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
 PARAMS=" --accelerator-threads 8 --grid 32.32.64.64 --mpi 1.1.2.2 --comms-overlap --shm 2048 --shm-mpi 0"
-srun --gpus-per-task 1 -n4 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS
+srun --gpus-per-task 1 -N1 -n2  ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 16.16.32.64 --shm-mpi 1 --shm 2048 --comms-sequential --accelerator-threads 8
--- a/systems/Crusher/dwf8.slurm
+++ b/systems/Crusher/dwf8.slurm
@@ -6,10 +6,10 @@
 #SBATCH -J DWF
 #SBATCH -o DWF.%J
 #SBATCH -e DWF.%J
-#SBATCH -N 8
+#SBATCH -N 1
-#SBATCH -n 64
+#SBATCH -n 8
-#SBATCH --exclusive
+##SBATCH --gpu-bind=map_gpu:0,1,2,3,7,6,5,4
-#SBATCH --gpu-bind=map_gpu:0,1,2,3,7,6,5,4
+#SBATCH --gpu-bind=map_gpu:0,1,2,3,6,7,4,5
 DIR=.
 source setup.sh
@@ -17,25 +17,12 @@ source setup.sh
 export MPICH_OFI_NIC_POLICY=GPU
 export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
 export MPICH_GPU_SUPPORT_ENABLED=1
-export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
+#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
 #export MPICH_SMP_SINGLE_COPY_MODE=CMA
 #export MPICH_SMP_SINGLE_COPY_MODE=NONE
-export OMP_NUM_THREADS=1
+export OMP_NUM_THREADS=16
 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
-for vol in 64.64.64.256 64.64.64.128 32.32.32.256  32.32.32.128
+srun --gpus-per-task 1 -N1 -n8  ./benchmarks/Benchmark_comms_host_device --mpi 2.2.2.1 --shm-mpi 1 --shm 2048 --comms-sequential --accelerator-threads 8
 do
 PARAMS=" --accelerator-threads 8 --grid $vol --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 1"
 echo $PARAMS
 srun --gpus-per-task 1 -N8 -n64  ./benchmarks/Benchmark_dwf_fp32 $PARAMS > dwf.${vol}.8node.shm-mpi1
 done
 PARAMS=" --accelerator-threads 8 --grid 64.64.64.32 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 1"
 echo $PARAMS
 srun --gpus-per-task 1 -N8 -n64  ./benchmarks/Benchmark_ITT $PARAMS > itt.8node
 PARAMS=" --accelerator-threads 8 --grid 64.64.64.32 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 0"
 echo $PARAMS
 srun --gpus-per-task 1 -N8 -n64  ./benchmarks/Benchmark_ITT $PARAMS > itt.8node_shm0
--- a/systems/Crusher/sourceme.sh
+++ b/systems/Crusher/sourceme.sh
@@ -1,6 +1,6 @@
 module load PrgEnv-gnu
 module load rocm/5.1.0
-module load cray-mpich/8.1.15
+module load cray-mpich/8.1.16
 module load gmp
 #module load cray-fftw
 module load craype-accel-amd-gfx90a
--- a/systems/Perlmutter/config-command
+++ b/systems/Perlmutter/config-command
@@ -1,9 +1,14 @@
 DIR=`pwd`
 PREFIX=$DIR/../Prequisites/install/
 ../../configure \
    --enable-comms=mpi \
    --enable-simd=GPU \
    --enable-shm=nvlink \
    --enable-gen-simd-width=64 \
    --enable-accelerator=cuda \
    --enable-setdevice \
    --disable-accelerator-cshift \
    --with-gmp=$PREFIX \
    --disable-fermion-reps \
    --disable-unified \
    --disable-gparity \
--- a/systems/Perlmutter/dwf4.slurm
+++ b/systems/Perlmutter/dwf4.slurm
@@ -1,24 +1,27 @@
 #!/bin/bash
-#SBATCH -A mp13
+#SBATCH -A m3886_g
 #SBATCH -C gpu
-#SBATCH -q regular
+#SBATCH -q debug
 #SBATCH -t 0:20:00
 #SBATCH -n 16
 #SBATCH --ntasks-per-node=4
 #SBATCH -c 32
-#SBATCH --exclusive
+#SBATCH -N 1
 #SBATCH -n 4
 #SBATCH --ntasks-per-node=4
 #SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=map_gpu:0,1,2,3
+#SBATCH --exclusive
 #SBATCH --gpu-bind=none
 export SLURM_CPU_BIND="cores"
 export MPICH_RDMA_ENABLED_CUDA=1
 export MPICH_GPU_SUPPORT_ENABLED=1
-srun ./benchmarks/Benchmark_comms_host_device --mpi 2.2.2.2  --accelerator-threads 8 > comms.4node
+export MPICH_RDMA_ENABLED_CUDA=1
 export MPICH_GPU_IPC_ENABLED=1
 export MPICH_GPU_EAGER_REGISTER_HOST_MEM=0
 export MPICH_GPU_NO_ASYNC_MEMCPY=0
 #export MPICH_SMP_SINGLE_COPY_MODE=CMA
-OPT="--comms-overlap --comms-concurrent --shm-mpi 0"
+OPT="--comms-sequential --shm-mpi 1"
-srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 64.64.64.64 --accelerator-threads 8 --shm 2048 $OPT > dwf.64.64.64.64.4node.opt0
+VOL=64.64.64.64
-srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 48.48.48.48 --accelerator-threads 8 --shm 2048 $OPT > dwf.48.48.48.48.4node.opt0
+srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.1.1 --grid $VOL --accelerator-threads 8 --shm 2048 $OPT
 #srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.1.1.4 --grid $VOL --accelerator-threads 8 --shm 2048 $OPT
 #srun ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.8 --grid $VOL --accelerator-threads 8 --shm 2048 $OPT
 OPT="--comms-overlap --comms-concurrent --shm-mpi 1"
 srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 64.64.64.64 --accelerator-threads 8 --shm 2048 $OPT > dwf.64.64.64.64.4node.opt1
 srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 48.48.48.48 --accelerator-threads 8 --shm 2048 $OPT > dwf.48.48.48.48.4node.opt1
--- a/systems/Perlmutter/sourceme.sh
+++ b/systems/Perlmutter/sourceme.sh
@@ -1,4 +1,4 @@
 export CRAY_ACCEL_TARGET=nvidia80
-module load PrgEnv-gnu cpe-cuda cuda
+module load PrgEnv-gnu cpe-cuda cudatoolkit/11.4
--- a/systems/Summit/config-command
+++ b/systems/Summit/config-command
@@ -2,11 +2,12 @@
 	      --enable-simd=GPU \
 	      --enable-gen-simd-width=32 \
 	      --enable-unified=no \
-	       --enable-shm=nvlink \
+	       --enable-shm=no \
 	       --disable-gparity \
-	       --enable-setdevice \
+	       --disable-setdevice \
 	       --disable-fermion-reps \
 	       --enable-accelerator=cuda \
 	       --enable-accelerator-cshift \
 	       --prefix /ccs/home/paboyle/prefix \
 	       CXX=nvcc \
 	       LDFLAGS=-L/ccs/home/paboyle/prefix/lib/ \
--- a/systems/Summit/dwf16.lsf
+++ b/systems/Summit/dwf16.lsf
@@ -1,25 +1,39 @@
 #!/bin/bash
 #BSUB -P LGT104
-#BSUB -W 2:00
+#BSUB -W 0:20
 #BSUB -nnodes 16
 #BSUB -J DWF
 export OMP_NUM_THREADS=6
 export PAMI_IBV_ADAPTER_AFFINITY=1
 export PAMI_ENABLE_STRIPING=1
 export OPT="--comms-concurrent --comms-overlap "
-APP="./benchmarks/Benchmark_comms_host_device  --mpi 4.4.4.3 "
+DIR=.
-jsrun --nrs 16 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > comms.16node.log
+source sourceme.sh
-APP="./benchmarks/Benchmark_dwf_fp32 --grid 96.96.96.72 --mpi 4.4.4.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
+echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
 jsrun --nrs 16 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > dwf.16node.24.log
-APP="./benchmarks/Benchmark_dwf_fp32 --grid 128.128.128.96 --mpi 4.4.4.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
+VOLS=( 32.32.32.16 32.32.32.64 64.32.32.64 64.32.64.64 64.64.64.64 64.64.64.128  64.64.64.256  64.64.64.512 128.64.64.64.512)
-jsrun --nrs 16 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > dwf.16node.32.log
+MPI=( 1.1.1.1      1.1.1.4     2.1.1.4         2.1.2.4     2.2.2.4      2.2.2.8      2.2.2.16      2.2.2.32 4.4.2.32 )
 RANKS=(     1            4           8              16          32          64            128           256 1024)
 NODES=(     1            1           2               4           8           16            32            64  128)
 INTS=(      0            1           2               3           4            5             6             7    8)
 for i in 5
 do
    vol=${VOLS[$i]} 
    nodes=${NODES[$i]} 
    mpi=${MPI[$i]} 
    ranks=${RANKS[$i]} 
    JSRUN="jsrun --nrs $nodes -a4 -g4 -c42 -dpacked -b packed:10 --latency_priority gpu-cpu --smpiargs=-gpu"
    PARAMS=" --accelerator-threads 8 --grid $vol --mpi $mpi --comms-sequential --shm 2048 --shm-mpi 0"
    $JSRUN ./benchmarks/Benchmark_dwf_fp32 $PARAMS > run.v${vol}.n${nodes}.m${mpi}.seq.ker
    PARAMS=" --accelerator-threads 8 --grid $vol --mpi $mpi --comms-overlap --shm 2048 --shm-mpi 0"
    $JSRUN ./benchmarks/Benchmark_dwf_fp32 $PARAMS > run.v${vol}.n${nodes}.m${mpi}.over.ker
 done
--- a/tests/IO/Test_nersc_io.cc
+++ b/tests/IO/Test_nersc_io.cc
@@ -147,7 +147,7 @@ int main (int argc, char ** argv)
  Complex p  = TensorRemove(Tp);
  std::cout<<GridLogMessage << "calculated plaquettes " <<p*PlaqScale<<std::endl;
-  Complex LinkTraceScale(1.0/vol/4.0/3.0);
+  Complex LinkTraceScale(1.0/vol/4.0/(Real)Nc);
  TComplex Tl = sum(LinkTrace);
  Complex l  = TensorRemove(Tl);
  std::cout<<GridLogMessage << "calculated link trace " <<l*LinkTraceScale<<std::endl;
@@ -157,8 +157,10 @@ int main (int argc, char ** argv)
  Complex ll= TensorRemove(TcP);
  std::cout<<GridLogMessage << "coarsened plaquettes sum to " <<ll*PlaqScale<<std::endl;
-  std::string clone2x3("./ckpoint_clone2x3.4000");
+  const string stNc   = to_string( Nc   ) ;
-  std::string clone3x3("./ckpoint_clone3x3.4000");
+  const string stNcM1 = to_string( Nc-1 ) ;
  std::string clone2x3("./ckpoint_clone"+stNcM1+"x"+stNc+".4000");
  std::string clone3x3("./ckpoint_clone"+stNc+"x"+stNc+".4000");
  NerscIO::writeConfiguration(Umu,clone3x3,0,precision32);
  NerscIO::writeConfiguration(Umu,clone2x3,1,precision32);
--- a/tests/Test_dwf_mixedcg_prec.cc
+++ b/tests/Test_dwf_mixedcg_prec.cc
@@ -46,7 +46,7 @@ int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
-  const int Ls=8;
+  const int Ls=12;
  std::cout << GridLogMessage << "::::: NB: to enable a quick bit reproducibility check use the --checksums flag. " << std::endl;
@@ -94,13 +94,32 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl;
  MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
  double t1,t2,flops;
  int iters;
  for(int i=0;i<100;i++){
    result_o = Zero();
    t1=usecond();
    mCG(src_o,result_o);
-
+    t2=usecond();
    iters = mCG.TotalInnerIterations; //Number of inner CG iterations
    flops = 1320.0*2*FGrid->gSites()*iters;
    std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
    std::cout << " SinglePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
  }
  std::cout << GridLogMessage << "::::::::::::: Starting regular CG" << std::endl;
  ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
  for(int i=0;i<100;i++){
    result_o_2 = Zero();
    t1=usecond();
    CG(HermOpEO,src_o,result_o_2);
    t2=usecond();
    iters = CG.IterationsToComplete;
    flops = 1320.0*2*FGrid->gSites()*iters;
    std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
    std::cout << " DoublePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
  }
-  MemoryManager::Print();
+  //  MemoryManager::Print();
  LatticeFermionD diff_o(FrbGrid);
  RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);
--- a/tests/Test_stencil.cc
+++ b/tests/Test_stencil.cc
@@ -31,7 +31,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 using namespace std;
 using namespace Grid;
 ;
 int main(int argc, char ** argv) {
  Grid_init(&argc, &argv);
@@ -80,7 +79,8 @@ int main(int argc, char ** argv) {
    Foo=lex;
  }
-  typedef CartesianStencil<vobj,vobj,int> Stencil;
+  typedef CartesianStencil<vobj,vobj,SimpleStencilParams> Stencil;
  SimpleStencilParams p;
    for(int dir=0;dir<4;dir++){
      for(int disp=0;disp<Fine._fdimensions[dir];disp++){
@@ -90,7 +90,7 @@ int main(int argc, char ** argv) {
 	std::vector<int> directions(npoint,dir);
 	std::vector<int> displacements(npoint,disp);
-	Stencil myStencil(&Fine,npoint,0,directions,displacements,0);
+	Stencil myStencil(&Fine,npoint,0,directions,displacements,p);
 	Coordinate ocoor(4);
 	for(int o=0;o<Fine.oSites();o++){
 	  Fine.oCoorFromOindex(ocoor,o);
@@ -183,8 +183,8 @@ int main(int argc, char ** argv) {
 	std::vector<int> directions(npoint,dir);
 	std::vector<int> displacements(npoint,disp);
-	Stencil EStencil(&rbFine,npoint,Even,directions,displacements,0);
+	Stencil EStencil(&rbFine,npoint,Even,directions,displacements,p);
-	Stencil OStencil(&rbFine,npoint,Odd,directions,displacements,0);
+	Stencil OStencil(&rbFine,npoint,Odd,directions,displacements,p);
 	Coordinate ocoor(4);
 	for(int o=0;o<Fine.oSites();o++){
--- a/tests/core/Test_compact_wilson_clover_speedup.cc
+++ b/tests/core/Test_compact_wilson_clover_speedup.cc
@@ -117,8 +117,8 @@ void runBenchmark(int* argc, char*** argv) {
  // type definitions
  typedef WilsonImpl<vCoeff_t, FundamentalRepresentation, CoeffReal> WImpl;
-  typedef WilsonCloverFermion<WImpl> WilsonCloverOperator;
+  typedef WilsonCloverFermion<WImpl, CloverHelpers<WImpl>> WilsonCloverOperator;
-  typedef CompactWilsonCloverFermion<WImpl> CompactWilsonCloverOperator;
+  typedef CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>> CompactWilsonCloverOperator;
  typedef typename WilsonCloverOperator::FermionField Fermion;
  typedef typename WilsonCloverOperator::GaugeField Gauge;
--- a/tests/core/Test_lie_generators.cc
+++ b/tests/core/Test_lie_generators.cc
@@ -9,6 +9,7 @@ Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
 Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -60,6 +61,10 @@ int main(int argc, char** argv) {
            << std::endl;
  SU2::printGenerators();
  std::cout << "Dimension of adjoint representation: "<< SU2Adjoint::Dimension << std::endl;
  // guard as this code fails to compile for Nc != 3
 #if (Nc == 3)
  SU2Adjoint::printGenerators();
  SU2::testGenerators();
  SU2Adjoint::testGenerators();
@@ -111,8 +116,6 @@ int main(int argc, char** argv) {
  // AdjointRepresentation has the predefined number of colours Nc
  //  Representations<FundamentalRepresentation, AdjointRepresentation, TwoIndexSymmetricRepresentation> RepresentationTypes(grid);  
  LatticeGaugeField U(grid), V(grid);
  SU3::HotConfiguration<LatticeGaugeField>(gridRNG, U);
  SU3::HotConfiguration<LatticeGaugeField>(gridRNG, V);
@@ -181,7 +184,6 @@ int main(int argc, char** argv) {
  std::cout << GridLogMessage << "orthogonality check 2: " << norm2(Ucheck)
 	    << std::endl;
  // Construct the fundamental matrix in the group
  SU3::LatticeMatrix Af(grid);
  SU3::FundamentalLieAlgebraMatrix(h_adj,Af);
@@ -198,7 +200,6 @@ int main(int argc, char** argv) {
  std::cout << GridLogMessage << "unitarity check 2: " << norm2(UnitCheck)
 	    << std::endl;
  // Tranform to the adjoint representation
  U = Zero(); // fill this with only one direction
  pokeLorentz(U,Ufund,0); // the representation transf acts on full gauge fields
@@ -210,11 +211,7 @@ int main(int argc, char** argv) {
  typename AdjointRep<Nc>::LatticeMatrix Diff_check_mat = Ur0 - Uadj;
  std::cout << GridLogMessage << "Projections structure check group difference : " << norm2(Diff_check_mat) << std::endl;
  // TwoIndexRep tests
  std::cout << GridLogMessage << "*********************************************"
 	    << std::endl;
  std::cout << GridLogMessage << "*********************************************"
@@ -253,8 +250,6 @@ int main(int argc, char** argv) {
  std::cout << GridLogMessage << "Test of Two Index anti-Symmetric Generators: "<< SU2TwoIndexAntiSymm::Dimension << std::endl;
  SU2TwoIndexAntiSymm::testGenerators();
  std::cout << GridLogMessage << "*********************************************"
 	    << std::endl;
  std::cout << GridLogMessage << "Test for the Two Index Symmetric projectors"
@@ -357,7 +352,6 @@ int main(int argc, char** argv) {
  SU3::LatticeAlgebraVector h_diff_sym = h_sym - h_sym2;
  std::cout << GridLogMessage << "Projections structure check vector difference (Two Index Symmetric): " << norm2(h_diff_sym) << std::endl;
  // Exponentiate
  typename TwoIndexRep< Nc, Symmetric>::LatticeMatrix U2iS(grid);
  U2iS  = expMat(Ar_sym, 1.0, 16);
@@ -376,8 +370,6 @@ int main(int argc, char** argv) {
  std::cout << GridLogMessage << "orthogonality check 2: " << norm2(Ucheck2iS)
 	    << std::endl;
  // Construct the fundamental matrix in the group
  SU3::LatticeMatrix Af_sym(grid);
  SU3::FundamentalLieAlgebraMatrix(h_sym,Af_sym);
@@ -403,9 +395,6 @@ int main(int argc, char** argv) {
  typename TwoIndexRep< Nc, Symmetric>::LatticeMatrix Diff_check_mat2 = Ur02 - U2iS;
  std::cout << GridLogMessage << "Projections structure check group difference (Two Index Symmetric): " << norm2(Diff_check_mat2) << std::endl;
  if (TwoIndexRep<Nc, AntiSymmetric >::Dimension != 1){
    std::cout << GridLogMessage << "*********************************************"
@@ -519,14 +508,7 @@ int main(int argc, char** argv) {
 	      << std::endl;
  }
-
+#endif
  Grid_finalize();
 }
--- a/tests/core/Test_reunitarise.cc
+++ b/tests/core/Test_reunitarise.cc
@@ -122,13 +122,14 @@ int main (int argc, char ** argv)
  std::cout << "Determinant defect before projection " <<norm2(detU)<<std::endl;
  tmp = U*adj(U) - ident;
  std::cout << "Unitarity check before projection    " << norm2(tmp)<<std::endl; 
-  
+#if (Nc == 3)
  ProjectSU3(U);
  detU= Determinant(U) ;
  detU= detU -1.0;
  std::cout << "Determinant ProjectSU3 defect " <<norm2(detU)<<std::endl;
  tmp = U*adj(U) - ident;
  std::cout << "Unitarity check after projection    " << norm2(tmp)<<std::endl; 
 #endif
  ProjectSUn(UU);
  detUU= Determinant(UU);
--- a/tests/core/Test_wilson_clover.cc
+++ b/tests/core/Test_wilson_clover.cc
@@ -2,11 +2,12 @@
    Grid physics library, www.github.com/paboyle/Grid 
-    Source file: ./benchmarks/Benchmark_wilson.cc
+    Source file: ./tests/core/Test_wilson_clover.cc
    Copyright (C) 2015
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
            Fabian Joswig <fabian.joswig@ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -67,8 +68,6 @@ int main(int argc, char **argv)
  tmp = Zero();
  FermionField err(&Grid);
  err = Zero();
  FermionField err2(&Grid);
  err2 = Zero();
  FermionField phi(&Grid);
  random(pRNG, phi);
  FermionField chi(&Grid);
@@ -77,6 +76,8 @@ int main(int argc, char **argv)
  SU<Nc>::HotConfiguration(pRNG, Umu);
  std::vector<LatticeColourMatrix> U(4, &Grid);
  double tolerance = 1e-4;
  double volume = 1;
  for (int mu = 0; mu < Nd; mu++)
  {
@@ -88,7 +89,7 @@ int main(int argc, char **argv)
  RealD csw_t = 1.0;
  WilsonCloverFermionR Dwc(Umu, Grid, RBGrid, mass, csw_r, csw_t, anis, params);
-  //Dwc.ImportGauge(Umu); // not necessary, included in the constructor
+  CompactWilsonCloverFermionR Dwc_compact(Umu, Grid, RBGrid, mass, csw_r, csw_t, 1.0, anis, params);
  std::cout << GridLogMessage << "==========================================================" << std::endl;
  std::cout << GridLogMessage << "= Testing that Deo + Doe = Dunprec " << std::endl;
@@ -112,7 +113,24 @@ int main(int argc, char **argv)
  setCheckerboard(r_eo, r_e);
  err = ref - r_eo;
-  std::cout << GridLogMessage << "EO norm diff   " << norm2(err) << " " << norm2(ref) << " " << norm2(r_eo) << std::endl;
+  std::cout << GridLogMessage << "EO norm diff\t" << norm2(err) << " (" << norm2(ref) << " - " << norm2(r_eo) << ")" << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  Dwc_compact.Meooe(src_e, r_o);
  std::cout << GridLogMessage << "Applied Meo" << std::endl;
  Dwc_compact.Meooe(src_o, r_e);
  std::cout << GridLogMessage << "Applied Moe" << std::endl;
  Dwc_compact.Dhop(src, ref, DaggerNo);
  setCheckerboard(r_eo, r_o);
  setCheckerboard(r_eo, r_e);
  err = ref - r_eo;
  std::cout << GridLogMessage << "EO norm diff compact\t" << norm2(err) << " (" << norm2(ref) << " - " << norm2(r_eo) << ")" << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  std::cout << GridLogMessage << "==============================================================" << std::endl;
  std::cout << GridLogMessage << "= Test Ddagger is the dagger of D by requiring                " << std::endl;
@@ -152,6 +170,22 @@ int main(int argc, char **argv)
  std::cout << GridLogMessage << "pDce - conj(cDpo) " << pDce - conj(cDpo) << std::endl;
  std::cout << GridLogMessage << "pDco - conj(cDpe) " << pDco - conj(cDpe) << std::endl;
  Dwc_compact.Meooe(chi_e, dchi_o);
  Dwc_compact.Meooe(chi_o, dchi_e);
  Dwc_compact.MeooeDag(phi_e, dphi_o);
  Dwc_compact.MeooeDag(phi_o, dphi_e);
  pDce = innerProduct(phi_e, dchi_e);
  pDco = innerProduct(phi_o, dchi_o);
  cDpe = innerProduct(chi_e, dphi_e);
  cDpo = innerProduct(chi_o, dphi_o);
  std::cout << GridLogMessage << "e compact " << pDce << " " << cDpe << std::endl;
  std::cout << GridLogMessage << "o compact " << pDco << " " << cDpo << std::endl;
  std::cout << GridLogMessage << "pDce - conj(cDpo) compact " << pDce - conj(cDpo) << std::endl;
  std::cout << GridLogMessage << "pDco - conj(cDpe) compact " << pDco - conj(cDpe) << std::endl;
  std::cout << GridLogMessage << "==============================================================" << std::endl;
  std::cout << GridLogMessage << "= Test MeeInv Mee = 1   (if csw!=0)                           " << std::endl;
  std::cout << GridLogMessage << "==============================================================" << std::endl;
@@ -170,6 +204,20 @@ int main(int argc, char **argv)
  err = phi - chi;
  std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  Dwc_compact.Mooee(chi_e, src_e);
  Dwc_compact.MooeeInv(src_e, phi_e);
  Dwc_compact.Mooee(chi_o, src_o);
  Dwc_compact.MooeeInv(src_o, phi_o);
  setCheckerboard(phi, phi_e);
  setCheckerboard(phi, phi_o);
  err = phi - chi;
  std::cout << GridLogMessage << "norm diff compact " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  std::cout << GridLogMessage << "==============================================================" << std::endl;
  std::cout << GridLogMessage << "= Test MeeDag MeeInvDag = 1    (if csw!=0)                    " << std::endl;
@@ -189,6 +237,20 @@ int main(int argc, char **argv)
  err = phi - chi;
  std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  Dwc_compact.MooeeDag(chi_e, src_e);
  Dwc_compact.MooeeInvDag(src_e, phi_e);
  Dwc_compact.MooeeDag(chi_o, src_o);
  Dwc_compact.MooeeInvDag(src_o, phi_o);
  setCheckerboard(phi, phi_e);
  setCheckerboard(phi, phi_o);
  err = phi - chi;
  std::cout << GridLogMessage << "norm diff compact " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  std::cout << GridLogMessage << "==============================================================" << std::endl;
  std::cout << GridLogMessage << "= Test MeeInv MeeDag = 1      (if csw!=0)                     " << std::endl;
@@ -208,6 +270,20 @@ int main(int argc, char **argv)
  err = phi - chi;
  std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  Dwc_compact.MooeeDag(chi_e, src_e);
  Dwc_compact.MooeeInv(src_e, phi_e);
  Dwc_compact.MooeeDag(chi_o, src_o);
  Dwc_compact.MooeeInv(src_o, phi_o);
  setCheckerboard(phi, phi_e);
  setCheckerboard(phi, phi_o);
  err = phi - chi;
  std::cout << GridLogMessage << "norm diff compact " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  std::cout << GridLogMessage << "================================================================" << std::endl;
  std::cout << GridLogMessage << "= Testing gauge covariance Clover term with EO preconditioning  " << std::endl;
@@ -249,7 +325,7 @@ int main(int argc, char **argv)
  /////////////////
  WilsonCloverFermionR Dwc_prime(U_prime, Grid, RBGrid, mass, csw_r, csw_t, anis, params);
-  Dwc_prime.ImportGauge(U_prime);
+  CompactWilsonCloverFermionR Dwc_compact_prime(U_prime, Grid, RBGrid, mass, csw_r, csw_t, 1.0, anis, params);
  tmp = Omega * src;
  pickCheckerboard(Even, src_e, tmp);
@@ -263,6 +339,36 @@ int main(int argc, char **argv)
  err = chi - adj(Omega) * phi;
  std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  chi = Zero();
  phi = Zero();
  tmp = Zero();
  pickCheckerboard(Even, chi_e, chi);
  pickCheckerboard(Odd, chi_o, chi);
  pickCheckerboard(Even, phi_e, phi);
  pickCheckerboard(Odd, phi_o, phi);
  Dwc_compact.Mooee(src_e, chi_e);
  Dwc_compact.Mooee(src_o, chi_o);
  setCheckerboard(chi, chi_e);
  setCheckerboard(chi, chi_o);
  setCheckerboard(src, src_e);
  setCheckerboard(src, src_o);
  tmp = Omega * src;
  pickCheckerboard(Even, src_e, tmp);
  pickCheckerboard(Odd, src_o, tmp);
  Dwc_compact_prime.Mooee(src_e, phi_e);
  Dwc_compact_prime.Mooee(src_o, phi_o);
  setCheckerboard(phi, phi_e);
  setCheckerboard(phi, phi_o);
  err = chi - adj(Omega) * phi;
  std::cout << GridLogMessage << "norm diff compact " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  std::cout << GridLogMessage << "=================================================================" << std::endl;
  std::cout << GridLogMessage << "= Testing gauge covariance Clover term w/o EO preconditioning  " << std::endl;
@@ -272,7 +378,6 @@ int main(int argc, char **argv)
  phi = Zero();
  WilsonFermionR Dw(Umu, Grid, RBGrid, mass, params);
  Dw.ImportGauge(Umu);
  Dw.M(src, result);
  Dwc.M(src, chi);
@@ -280,13 +385,24 @@ int main(int argc, char **argv)
  Dwc_prime.M(Omega * src, phi);
  WilsonFermionR Dw_prime(U_prime, Grid, RBGrid, mass, params);
  Dw_prime.ImportGauge(U_prime);
  Dw_prime.M(Omega * src, result2);
-  err = chi - adj(Omega) * phi;
+  err = result - adj(Omega) * result2;
  err2 = result - adj(Omega) * result2;
  std::cout << GridLogMessage << "norm diff Wilson              " << norm2(err) << std::endl;
-  std::cout << GridLogMessage << "norm diff WilsonClover  " << norm2(err2) << std::endl;
+  assert(fabs(norm2(err)) < tolerance);
  err = chi - adj(Omega) * phi;
  std::cout << GridLogMessage << "norm diff WilsonClover        " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  chi = Zero();
  phi = Zero();
  Dwc_compact.M(src, chi);
  Dwc_compact_prime.M(Omega * src, phi);
  err = chi - adj(Omega) * phi;
  std::cout << GridLogMessage << "norm diff CompactWilsonClover " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  std::cout << GridLogMessage << "==========================================================" << std::endl;
  std::cout << GridLogMessage << "= Testing Mooee(csw=0) Clover to reproduce Mooee Wilson   " << std::endl;
@@ -296,7 +412,6 @@ int main(int argc, char **argv)
  phi = Zero();
  err = Zero();
  WilsonCloverFermionR Dwc_csw0(Umu, Grid, RBGrid, mass, 0.0, 0.0, anis, params); //  <-- Notice: csw=0
  Dwc_csw0.ImportGauge(Umu);
  pickCheckerboard(Even, phi_e, phi);
  pickCheckerboard(Odd, phi_o, phi);
@@ -317,6 +432,33 @@ int main(int argc, char **argv)
  err = chi - phi;
  std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  chi = Zero();
  phi = Zero();
  err = Zero();
  CompactWilsonCloverFermionR Dwc_compact_csw0(Umu, Grid, RBGrid, mass, 0.0, 0.0, 1.0, anis, params); //  <-- Notice: csw=0
  pickCheckerboard(Even, phi_e, phi);
  pickCheckerboard(Odd, phi_o, phi);
  pickCheckerboard(Even, chi_e, chi);
  pickCheckerboard(Odd, chi_o, chi);
  Dw.Mooee(src_e, chi_e);
  Dw.Mooee(src_o, chi_o);
  Dwc_compact_csw0.Mooee(src_e, phi_e);
  Dwc_compact_csw0.Mooee(src_o, phi_o);
  setCheckerboard(chi, chi_e);
  setCheckerboard(chi, chi_o);
  setCheckerboard(phi, phi_e);
  setCheckerboard(phi, phi_o);
  setCheckerboard(src, src_e);
  setCheckerboard(src, src_o);
  err = chi - phi;
  std::cout << GridLogMessage << "norm diff compact " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  std::cout << GridLogMessage << "==========================================================" << std::endl;
  std::cout << GridLogMessage << "= Testing EO operator is equal to the unprec              " << std::endl;
@@ -351,6 +493,38 @@ int main(int argc, char **argv)
  std::cout << GridLogMessage << "ref (unpreconditioned operator) diff         : " << norm2(ref) << std::endl;
  std::cout << GridLogMessage << "phi (EO decomposition)          diff         : " << norm2(phi) << std::endl;
  std::cout << GridLogMessage << "norm diff                                    : " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  chi = Zero();
  phi = Zero();
  err = Zero();
  pickCheckerboard(Even, phi_e, phi);
  pickCheckerboard(Odd, phi_o, phi);
  pickCheckerboard(Even, chi_e, chi);
  pickCheckerboard(Odd, chi_o, chi);
  // M phi = (Mooee src_e + Meooe src_o , Meooe src_e + Mooee src_o)
  Dwc_compact.M(src, ref); // Reference result from the unpreconditioned operator
  // EO matrix
  Dwc_compact.Mooee(src_e, chi_e); 
  Dwc_compact.Mooee(src_o, chi_o);
  Dwc_compact.Meooe(src_o, phi_e);
  Dwc_compact.Meooe(src_e, phi_o);
  phi_o += chi_o;
  phi_e += chi_e;
  setCheckerboard(phi, phi_e);
  setCheckerboard(phi, phi_o);
  err = ref - phi;
  std::cout << GridLogMessage << "ref (unpreconditioned operator) diff compact : " << norm2(ref) << std::endl;
  std::cout << GridLogMessage << "phi (EO decomposition)          diff compact : " << norm2(phi) << std::endl;
  std::cout << GridLogMessage << "norm diff compact                            : " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  Grid_finalize();
 }
--- a/tests/core/Test_wilson_conserved_current.cc
+++ b/tests/core/Test_wilson_conserved_current.cc
@@ -0,0 +1,253 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./tests/Test_cayley_cg.cc
 Copyright (C) 2022
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Fabian Joswig <fabian.joswig@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
 template<class What>
 void  TestConserved(What & Dw,
 		    LatticeGaugeField &Umu,
 		    GridCartesian         * UGrid,	       GridRedBlackCartesian * UrbGrid,
 		    GridParallelRNG *RNG4);
  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT,
    Gamma::Algebra::Gamma5
  };
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
 								   GridDefaultSimd(Nd,vComplex::Nsimd()),
 								   GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  std::vector<int> seeds5({5,6,7,8});
  GridParallelRNG          RNG4(UGrid);
  std::vector<int> seeds4({1,2,3,4}); RNG4.SeedFixedIntegers(seeds4);
  LatticeGaugeField Umu(UGrid);
  if( argc > 1 && argv[1][0] != '-' )
  {
    std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl;
    FieldMetaData header;
    NerscIO::readConfiguration(Umu, header, argv[1]);
  }
  else
  {
    std::cout<<GridLogMessage <<"Using hot configuration"<<std::endl;
    SU<Nc>::HotConfiguration(RNG4,Umu);
  }
  typename WilsonCloverFermionR::ImplParams params;
  WilsonAnisotropyCoefficients anis;
  RealD mass = 0.1;
  RealD csw_r = 1.0;
  RealD csw_t = 1.0;
  std::cout<<GridLogMessage <<"=================================="<<std::endl;
  std::cout<<GridLogMessage <<"WilsonFermion test"<<std::endl;
  std::cout<<GridLogMessage <<"=================================="<<std::endl;
  WilsonFermionR Dw(Umu,*UGrid,*UrbGrid,mass,params);
  TestConserved<WilsonFermionR>(Dw,Umu,UGrid,UrbGrid,&RNG4);
  std::cout<<GridLogMessage <<"=================================="<<std::endl;
  std::cout<<GridLogMessage <<"WilsonCloverFermion test"<<std::endl;
  std::cout<<GridLogMessage <<"=================================="<<std::endl;
  WilsonCloverFermionR Dwc(Umu, *UGrid, *UrbGrid, mass, csw_r, csw_t, anis, params);
  TestConserved<WilsonCloverFermionR>(Dwc,Umu,UGrid,UrbGrid,&RNG4);
  std::cout<<GridLogMessage <<"=================================="<<std::endl;
  std::cout<<GridLogMessage <<"CompactWilsonCloverFermion test"<<std::endl;
  std::cout<<GridLogMessage <<"=================================="<<std::endl;
  CompactWilsonCloverFermionR Dwcc(Umu, *UGrid, *UrbGrid, mass, csw_r, csw_t, 1.0, anis, params);
  TestConserved<CompactWilsonCloverFermionR>(Dwcc,Umu,UGrid,UrbGrid,&RNG4);
  std::cout<<GridLogMessage <<"=================================="<<std::endl;
  std::cout<<GridLogMessage <<"WilsonExpCloverFermion test"<<std::endl;
  std::cout<<GridLogMessage <<"=================================="<<std::endl;
  WilsonExpCloverFermionR Dewc(Umu, *UGrid, *UrbGrid, mass, csw_r, csw_t, anis, params);
  TestConserved<WilsonExpCloverFermionR>(Dewc,Umu,UGrid,UrbGrid,&RNG4);
  std::cout<<GridLogMessage <<"=================================="<<std::endl;
  std::cout<<GridLogMessage <<"CompactWilsonExpCloverFermion test"<<std::endl;
  std::cout<<GridLogMessage <<"=================================="<<std::endl;
  CompactWilsonExpCloverFermionR Dewcc(Umu, *UGrid, *UrbGrid, mass, csw_r, csw_t, 1.0, anis, params);
  TestConserved<CompactWilsonExpCloverFermionR>(Dewcc,Umu,UGrid,UrbGrid,&RNG4);
  Grid_finalize();
 }
 template<class Action>
 void  TestConserved(Action & Dw,
 		    LatticeGaugeField &Umu,
 		    GridCartesian         * UGrid,	       GridRedBlackCartesian * UrbGrid,
 		    GridParallelRNG *RNG4)
 {
  LatticePropagator phys_src(UGrid);
  LatticePropagator seqsrc(UGrid);
  LatticePropagator prop4(UGrid);
  LatticePropagator Vector_mu(UGrid);
  LatticeComplex    SV (UGrid);
  LatticeComplex    VV (UGrid);
  LatticePropagator seqprop(UGrid);
  SpinColourMatrix kronecker; kronecker=1.0;
  Coordinate coor({0,0,0,0});
  phys_src=Zero();
  pokeSite(kronecker,phys_src,coor);
  ConjugateGradient<LatticeFermion> CG(1.0e-16,100000);
  SchurRedBlackDiagTwoSolve<LatticeFermion> schur(CG);
  ZeroGuesser<LatticeFermion> zpg;
  for(int s=0;s<Nd;s++){
    for(int c=0;c<Nc;c++){
      LatticeFermion src4  (UGrid);
      PropToFerm<Action>(src4,phys_src,s,c);
      LatticeFermion result4(UGrid); result4=Zero();
      schur(Dw,src4,result4,zpg);
      std::cout<<GridLogMessage<<"spin "<<s<<" color "<<c<<" norm2(sourc4d) "<<norm2(src4)
               <<" norm2(result4d) "<<norm2(result4)<<std::endl;
      FermToProp<Action>(prop4,result4,s,c);
    }
  }
  auto curr = Current::Vector;
  const int mu_J=0;
  const int t_J=0;
  LatticeComplex    ph (UGrid); ph=1.0;
  Dw.SeqConservedCurrent(prop4,
 			   seqsrc,
 			   phys_src,
 			   curr,
 			   mu_J,
 			   t_J,
 			   t_J,// whole lattice
 			   ph);
  for(int s=0;s<Nd;s++){
    for(int c=0;c<Nc;c++){
      LatticeFermion src4  (UGrid);
      PropToFerm<Action>(src4,seqsrc,s,c);
      LatticeFermion result4(UGrid); result4=Zero();
      schur(Dw,src4,result4,zpg);
      FermToProp<Action>(seqprop,result4,s,c);
    }
  }
  Gamma g5(Gamma::Algebra::Gamma5);
  Gamma gT(Gamma::Algebra::GammaT);
  std::vector<TComplex> sumSV;
  std::vector<TComplex> sumVV;
  Dw.ContractConservedCurrent(prop4,prop4,Vector_mu,phys_src,Current::Vector,Tdir);
  SV       = trace(Vector_mu);        // Scalar-Vector conserved current
  VV       = trace(gT*Vector_mu);     // (local) Vector-Vector conserved current
  // Spatial sum
  sliceSum(SV,sumSV,Tdir);
  sliceSum(VV,sumVV,Tdir);
  const int Nt{static_cast<int>(sumSV.size())};
  std::cout<<GridLogMessage<<"Vector Ward identity by timeslice (~ 0)"<<std::endl;
  for(int t=0;t<Nt;t++){
    std::cout<<GridLogMessage <<" t "<<t<<" SV "<<real(TensorRemove(sumSV[t]))<<" VV "<<real(TensorRemove(sumVV[t]))<<std::endl;
    assert(abs(real(TensorRemove(sumSV[t]))) < 1e-10);
    assert(abs(real(TensorRemove(sumVV[t]))) < 1e-2);
  }
  ///////////////////////////////
  // 3pt vs 2pt check
  ///////////////////////////////
  {
    Gamma::Algebra        gA = Gamma::Algebra::Identity;
    Gamma                 g(gA);
    LatticePropagator cur(UGrid);
    LatticePropagator tmp(UGrid);
    LatticeComplex c(UGrid);
    SpinColourMatrix qSite;
    peekSite(qSite, seqprop, coor);
    Complex               test_S, test_V, check_S, check_V;
    std::vector<TComplex> check_buf;
    test_S = trace(qSite*g);
    test_V = trace(qSite*g*Gamma::gmu[mu_J]);
    Dw.ContractConservedCurrent(prop4,prop4,cur,phys_src,curr,mu_J);
    c = trace(cur*g);
    sliceSum(c, check_buf, Tp);
    check_S = TensorRemove(check_buf[t_J]);
    auto gmu=Gamma::gmu[mu_J];
    c = trace(cur*g*gmu);
    sliceSum(c, check_buf, Tp);
    check_V = TensorRemove(check_buf[t_J]);
    std::cout<<GridLogMessage << std::setprecision(14)<<"Test S  = " << abs(test_S)   << std::endl;
    std::cout<<GridLogMessage << "Test V  = " << abs(test_V) << std::endl;
    std::cout<<GridLogMessage << "Check S = " << abs(check_S) << std::endl;
    std::cout<<GridLogMessage << "Check V = " << abs(check_V) << std::endl;
    // Check difference = 0
    check_S = check_S - test_S;
    check_V = check_V - test_V;
    std::cout<<GridLogMessage << "Consistency check for sequential conserved " <<std::endl;
    std::cout<<GridLogMessage << "Diff S  = " << abs(check_S) << std::endl;
    assert(abs(check_S) < 1e-8);
    std::cout<<GridLogMessage << "Diff V  = " << abs(check_V) << std::endl;
    assert(abs(check_V) < 1e-8);
  }
 }
--- a/tests/core/Test_wilson_exp_clover.cc
+++ b/tests/core/Test_wilson_exp_clover.cc
@@ -0,0 +1,530 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/core/Test_wilson_exp_clover.cc
    Copyright (C) 2022
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
            Fabian Joswig <fabian.joswig@ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
 int main(int argc, char **argv)
 {
  Grid_init(&argc, &argv);
  auto latt_size = GridDefaultLatt();
  auto simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd());
  auto mpi_layout = GridDefaultMpi();
  GridCartesian Grid(latt_size, simd_layout, mpi_layout);
  GridRedBlackCartesian RBGrid(&Grid);
  int threads = GridThread::GetThreads();
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
  std::cout << GridLogMessage << "Grid floating point word size is REALF" << sizeof(RealF) << std::endl;
  std::cout << GridLogMessage << "Grid floating point word size is REALD" << sizeof(RealD) << std::endl;
  std::cout << GridLogMessage << "Grid floating point word size is REAL" << sizeof(Real) << std::endl;
  std::vector<int> seeds({1, 2, 3, 4});
  GridParallelRNG pRNG(&Grid);
  pRNG.SeedFixedIntegers(seeds);
  //  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
  typedef typename WilsonExpCloverFermionR::FermionField FermionField;
  typename WilsonExpCloverFermionR::ImplParams params;
  WilsonAnisotropyCoefficients anis;
  FermionField src(&Grid);
  random(pRNG, src);
  FermionField result(&Grid);
  result = Zero();
  FermionField result2(&Grid);
  result2 = Zero();
  FermionField ref(&Grid);
  ref = Zero();
  FermionField tmp(&Grid);
  tmp = Zero();
  FermionField err(&Grid);
  err = Zero();
  FermionField phi(&Grid);
  random(pRNG, phi);
  FermionField chi(&Grid);
  random(pRNG, chi);
  LatticeGaugeField Umu(&Grid);
  SU<Nc>::HotConfiguration(pRNG, Umu);
  std::vector<LatticeColourMatrix> U(4, &Grid);
  double tolerance = 1e-4;
  double volume = 1;
  for (int mu = 0; mu < Nd; mu++)
  {
    volume = volume * latt_size[mu];
  }
  RealD mass = 0.1;
  RealD csw_r = 1.0;
  RealD csw_t = 1.0;
  WilsonExpCloverFermionR Dwc(Umu, Grid, RBGrid, mass, csw_r, csw_t, anis, params);
  CompactWilsonExpCloverFermionR Dwc_compact(Umu, Grid, RBGrid, mass, csw_r, csw_t, 1.0, anis, params);
  std::cout << GridLogMessage << "==========================================================" << std::endl;
  std::cout << GridLogMessage << "= Testing that Deo + Doe = Dunprec " << std::endl;
  std::cout << GridLogMessage << "==========================================================" << std::endl;
  FermionField src_e(&RBGrid);
  FermionField src_o(&RBGrid);
  FermionField r_e(&RBGrid);
  FermionField r_o(&RBGrid);
  FermionField r_eo(&Grid);
  pickCheckerboard(Even, src_e, src);
  pickCheckerboard(Odd, src_o, src);
  Dwc.Meooe(src_e, r_o);
  std::cout << GridLogMessage << "Applied Meo" << std::endl;
  Dwc.Meooe(src_o, r_e);
  std::cout << GridLogMessage << "Applied Moe" << std::endl;
  Dwc.Dhop(src, ref, DaggerNo);
  setCheckerboard(r_eo, r_o);
  setCheckerboard(r_eo, r_e);
  err = ref - r_eo;
  std::cout << GridLogMessage << "EO norm diff\t" << norm2(err) << " (" << norm2(ref) << " - " << norm2(r_eo) << ")" << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  Dwc_compact.Meooe(src_e, r_o);
  std::cout << GridLogMessage << "Applied Meo" << std::endl;
  Dwc_compact.Meooe(src_o, r_e);
  std::cout << GridLogMessage << "Applied Moe" << std::endl;
  Dwc_compact.Dhop(src, ref, DaggerNo);
  setCheckerboard(r_eo, r_o);
  setCheckerboard(r_eo, r_e);
  err = ref - r_eo;
  std::cout << GridLogMessage << "EO norm diff compact\t" << norm2(err) << " (" << norm2(ref) << " - " << norm2(r_eo) << ")" << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  std::cout << GridLogMessage << "==============================================================" << std::endl;
  std::cout << GridLogMessage << "= Test Ddagger is the dagger of D by requiring                " << std::endl;
  std::cout << GridLogMessage << "=  < phi | Deo | chi > * = < chi | Deo^dag| phi>  " << std::endl;
  std::cout << GridLogMessage << "==============================================================" << std::endl;
  FermionField chi_e(&RBGrid);
  FermionField chi_o(&RBGrid);
  FermionField dchi_e(&RBGrid);
  FermionField dchi_o(&RBGrid);
  FermionField phi_e(&RBGrid);
  FermionField phi_o(&RBGrid);
  FermionField dphi_e(&RBGrid);
  FermionField dphi_o(&RBGrid);
  pickCheckerboard(Even, chi_e, chi);
  pickCheckerboard(Odd, chi_o, chi);
  pickCheckerboard(Even, phi_e, phi);
  pickCheckerboard(Odd, phi_o, phi);
  Dwc.Meooe(chi_e, dchi_o);
  Dwc.Meooe(chi_o, dchi_e);
  Dwc.MeooeDag(phi_e, dphi_o);
  Dwc.MeooeDag(phi_o, dphi_e);
  ComplexD pDce = innerProduct(phi_e, dchi_e);
  ComplexD pDco = innerProduct(phi_o, dchi_o);
  ComplexD cDpe = innerProduct(chi_e, dphi_e);
  ComplexD cDpo = innerProduct(chi_o, dphi_o);
  std::cout << GridLogMessage << "e " << pDce << " " << cDpe << std::endl;
  std::cout << GridLogMessage << "o " << pDco << " " << cDpo << std::endl;
  std::cout << GridLogMessage << "pDce - conj(cDpo) " << pDce - conj(cDpo) << std::endl;
  std::cout << GridLogMessage << "pDco - conj(cDpe) " << pDco - conj(cDpe) << std::endl;
  Dwc_compact.Meooe(chi_e, dchi_o);
  Dwc_compact.Meooe(chi_o, dchi_e);
  Dwc_compact.MeooeDag(phi_e, dphi_o);
  Dwc_compact.MeooeDag(phi_o, dphi_e);
  pDce = innerProduct(phi_e, dchi_e);
  pDco = innerProduct(phi_o, dchi_o);
  cDpe = innerProduct(chi_e, dphi_e);
  cDpo = innerProduct(chi_o, dphi_o);
  std::cout << GridLogMessage << "e compact " << pDce << " " << cDpe << std::endl;
  std::cout << GridLogMessage << "o compact " << pDco << " " << cDpo << std::endl;
  std::cout << GridLogMessage << "pDce - conj(cDpo) compact " << pDce - conj(cDpo) << std::endl;
  std::cout << GridLogMessage << "pDco - conj(cDpe) compact " << pDco - conj(cDpe) << std::endl;
  std::cout << GridLogMessage << "==============================================================" << std::endl;
  std::cout << GridLogMessage << "= Test MeeInv Mee = 1   (if csw!=0)                           " << std::endl;
  std::cout << GridLogMessage << "==============================================================" << std::endl;
  pickCheckerboard(Even, chi_e, chi);
  pickCheckerboard(Odd, chi_o, chi);
  Dwc.Mooee(chi_e, src_e);
  Dwc.MooeeInv(src_e, phi_e);
  Dwc.Mooee(chi_o, src_o);
  Dwc.MooeeInv(src_o, phi_o);
  setCheckerboard(phi, phi_e);
  setCheckerboard(phi, phi_o);
  err = phi - chi;
  std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  Dwc_compact.Mooee(chi_e, src_e);
  Dwc_compact.MooeeInv(src_e, phi_e);
  Dwc_compact.Mooee(chi_o, src_o);
  Dwc_compact.MooeeInv(src_o, phi_o);
  setCheckerboard(phi, phi_e);
  setCheckerboard(phi, phi_o);
  err = phi - chi;
  std::cout << GridLogMessage << "norm diff compact " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  std::cout << GridLogMessage << "==============================================================" << std::endl;
  std::cout << GridLogMessage << "= Test MeeDag MeeInvDag = 1    (if csw!=0)                    " << std::endl;
  std::cout << GridLogMessage << "==============================================================" << std::endl;
  pickCheckerboard(Even, chi_e, chi);
  pickCheckerboard(Odd, chi_o, chi);
  Dwc.MooeeDag(chi_e, src_e);
  Dwc.MooeeInvDag(src_e, phi_e);
  Dwc.MooeeDag(chi_o, src_o);
  Dwc.MooeeInvDag(src_o, phi_o);
  setCheckerboard(phi, phi_e);
  setCheckerboard(phi, phi_o);
  err = phi - chi;
  std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  Dwc_compact.MooeeDag(chi_e, src_e);
  Dwc_compact.MooeeInvDag(src_e, phi_e);
  Dwc_compact.MooeeDag(chi_o, src_o);
  Dwc_compact.MooeeInvDag(src_o, phi_o);
  setCheckerboard(phi, phi_e);
  setCheckerboard(phi, phi_o);
  err = phi - chi;
  std::cout << GridLogMessage << "norm diff compact " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  std::cout << GridLogMessage << "==============================================================" << std::endl;
  std::cout << GridLogMessage << "= Test MeeInv MeeDag = 1      (if csw!=0)                     " << std::endl;
  std::cout << GridLogMessage << "==============================================================" << std::endl;
  pickCheckerboard(Even, chi_e, chi);
  pickCheckerboard(Odd, chi_o, chi);
  Dwc.MooeeDag(chi_e, src_e);
  Dwc.MooeeInv(src_e, phi_e);
  Dwc.MooeeDag(chi_o, src_o);
  Dwc.MooeeInv(src_o, phi_o);
  setCheckerboard(phi, phi_e);
  setCheckerboard(phi, phi_o);
  err = phi - chi;
  std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  Dwc_compact.MooeeDag(chi_e, src_e);
  Dwc_compact.MooeeInv(src_e, phi_e);
  Dwc_compact.MooeeDag(chi_o, src_o);
  Dwc_compact.MooeeInv(src_o, phi_o);
  setCheckerboard(phi, phi_e);
  setCheckerboard(phi, phi_o);
  err = phi - chi;
  std::cout << GridLogMessage << "norm diff compact " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  std::cout << GridLogMessage << "================================================================" << std::endl;
  std::cout << GridLogMessage << "= Testing gauge covariance Clover term with EO preconditioning  " << std::endl;
  std::cout << GridLogMessage << "================================================================" << std::endl;
  chi = Zero();
  phi = Zero();
  tmp = Zero();
  pickCheckerboard(Even, chi_e, chi);
  pickCheckerboard(Odd, chi_o, chi);
  pickCheckerboard(Even, phi_e, phi);
  pickCheckerboard(Odd, phi_o, phi);
  Dwc.Mooee(src_e, chi_e);
  Dwc.Mooee(src_o, chi_o);
  setCheckerboard(chi, chi_e);
  setCheckerboard(chi, chi_o);
  setCheckerboard(src, src_e);
  setCheckerboard(src, src_o);
  ////////////////////// Gauge Transformation
  std::vector<int> seeds2({5, 6, 7, 8});
  GridParallelRNG pRNG2(&Grid);
  pRNG2.SeedFixedIntegers(seeds2);
  LatticeColourMatrix Omega(&Grid);
  LatticeColourMatrix ShiftedOmega(&Grid);
  LatticeGaugeField U_prime(&Grid);
  U_prime = Zero();
  LatticeColourMatrix U_prime_mu(&Grid);
  U_prime_mu = Zero();
  SU<Nc>::LieRandomize(pRNG2, Omega, 1.0);
  for (int mu = 0; mu < Nd; mu++)
  {
    U[mu] = peekLorentz(Umu, mu);
    ShiftedOmega = Cshift(Omega, mu, 1);
    U_prime_mu = Omega * U[mu] * adj(ShiftedOmega);
    pokeLorentz(U_prime, U_prime_mu, mu);
  }
  /////////////////
  WilsonExpCloverFermionR Dwc_prime(U_prime, Grid, RBGrid, mass, csw_r, csw_t, anis, params);
  CompactWilsonExpCloverFermionR Dwc_compact_prime(U_prime, Grid, RBGrid, mass, csw_r, csw_t, 1.0, anis, params);
  tmp = Omega * src;
  pickCheckerboard(Even, src_e, tmp);
  pickCheckerboard(Odd, src_o, tmp);
  Dwc_prime.Mooee(src_e, phi_e);
  Dwc_prime.Mooee(src_o, phi_o);
  setCheckerboard(phi, phi_e);
  setCheckerboard(phi, phi_o);
  err = chi - adj(Omega) * phi;
  std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  chi = Zero();
  phi = Zero();
  tmp = Zero();
  pickCheckerboard(Even, chi_e, chi);
  pickCheckerboard(Odd, chi_o, chi);
  pickCheckerboard(Even, phi_e, phi);
  pickCheckerboard(Odd, phi_o, phi);
  Dwc_compact.Mooee(src_e, chi_e);
  Dwc_compact.Mooee(src_o, chi_o);
  setCheckerboard(chi, chi_e);
  setCheckerboard(chi, chi_o);
  setCheckerboard(src, src_e);
  setCheckerboard(src, src_o);
  tmp = Omega * src;
  pickCheckerboard(Even, src_e, tmp);
  pickCheckerboard(Odd, src_o, tmp);
  Dwc_compact_prime.Mooee(src_e, phi_e);
  Dwc_compact_prime.Mooee(src_o, phi_o);
  setCheckerboard(phi, phi_e);
  setCheckerboard(phi, phi_o);
  err = chi - adj(Omega) * phi;
  std::cout << GridLogMessage << "norm diff compact " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  std::cout << GridLogMessage << "=================================================================" << std::endl;
  std::cout << GridLogMessage << "= Testing gauge covariance Clover term w/o EO preconditioning  " << std::endl;
  std::cout << GridLogMessage << "================================================================" << std::endl;
  chi = Zero();
  phi = Zero();
  WilsonFermionR Dw(Umu, Grid, RBGrid, mass, params);
  Dw.M(src, result);
  Dwc.M(src, chi);
  Dwc_prime.M(Omega * src, phi);
  WilsonFermionR Dw_prime(U_prime, Grid, RBGrid, mass, params);
  Dw_prime.M(Omega * src, result2);
  err = result - adj(Omega) * result2;
  std::cout << GridLogMessage << "norm diff Wilson                 " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  err = chi - adj(Omega) * phi;
  std::cout << GridLogMessage << "norm diff WilsonExpClover        " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  chi = Zero();
  phi = Zero();
  Dwc_compact.M(src, chi);
  Dwc_compact_prime.M(Omega * src, phi);
  err = chi - adj(Omega) * phi;
  std::cout << GridLogMessage << "norm diff CompactWilsonExpClover " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  std::cout << GridLogMessage << "==========================================================" << std::endl;
  std::cout << GridLogMessage << "= Testing Mooee(csw=0) Clover to reproduce Mooee Wilson   " << std::endl;
  std::cout << GridLogMessage << "==========================================================" << std::endl;
  chi = Zero();
  phi = Zero();
  err = Zero();
  WilsonExpCloverFermionR Dwc_csw0(Umu, Grid, RBGrid, mass, 0.0, 0.0, anis, params); //  <-- Notice: csw=0
  pickCheckerboard(Even, phi_e, phi);
  pickCheckerboard(Odd, phi_o, phi);
  pickCheckerboard(Even, chi_e, chi);
  pickCheckerboard(Odd, chi_o, chi);
  Dw.Mooee(src_e, chi_e);
  Dw.Mooee(src_o, chi_o);
  Dwc_csw0.Mooee(src_e, phi_e);
  Dwc_csw0.Mooee(src_o, phi_o);
  setCheckerboard(chi, chi_e);
  setCheckerboard(chi, chi_o);
  setCheckerboard(phi, phi_e);
  setCheckerboard(phi, phi_o);
  setCheckerboard(src, src_e);
  setCheckerboard(src, src_o);
  err = chi - phi;
  std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  chi = Zero();
  phi = Zero();
  err = Zero();
  CompactWilsonExpCloverFermionR Dwc_compact_csw0(Umu, Grid, RBGrid, mass, 0.0, 0.0, 1.0, anis, params); //  <-- Notice: csw=0
  pickCheckerboard(Even, phi_e, phi);
  pickCheckerboard(Odd, phi_o, phi);
  pickCheckerboard(Even, chi_e, chi);
  pickCheckerboard(Odd, chi_o, chi);
  Dw.Mooee(src_e, chi_e);
  Dw.Mooee(src_o, chi_o);
  Dwc_compact_csw0.Mooee(src_e, phi_e);
  Dwc_compact_csw0.Mooee(src_o, phi_o);
  setCheckerboard(chi, chi_e);
  setCheckerboard(chi, chi_o);
  setCheckerboard(phi, phi_e);
  setCheckerboard(phi, phi_o);
  setCheckerboard(src, src_e);
  setCheckerboard(src, src_o);
  err = chi - phi;
  std::cout << GridLogMessage << "norm diff compact " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  std::cout << GridLogMessage << "==========================================================" << std::endl;
  std::cout << GridLogMessage << "= Testing EO operator is equal to the unprec              " << std::endl;
  std::cout << GridLogMessage << "==========================================================" << std::endl;
  chi = Zero();
  phi = Zero();
  err = Zero();
  pickCheckerboard(Even, phi_e, phi);
  pickCheckerboard(Odd, phi_o, phi);
  pickCheckerboard(Even, chi_e, chi);
  pickCheckerboard(Odd, chi_o, chi);
  // M phi = (Mooee src_e + Meooe src_o , Meooe src_e + Mooee src_o)
  Dwc.M(src, ref); // Reference result from the unpreconditioned operator
  // EO matrix
  Dwc.Mooee(src_e, chi_e); 
  Dwc.Mooee(src_o, chi_o);
  Dwc.Meooe(src_o, phi_e);
  Dwc.Meooe(src_e, phi_o);
  phi_o += chi_o;
  phi_e += chi_e;
  setCheckerboard(phi, phi_e);
  setCheckerboard(phi, phi_o);
  err = ref - phi;
  std::cout << GridLogMessage << "ref (unpreconditioned operator) diff         : " << norm2(ref) << std::endl;
  std::cout << GridLogMessage << "phi (EO decomposition)          diff         : " << norm2(phi) << std::endl;
  std::cout << GridLogMessage << "norm diff                                    : " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  chi = Zero();
  phi = Zero();
  err = Zero();
  pickCheckerboard(Even, phi_e, phi);
  pickCheckerboard(Odd, phi_o, phi);
  pickCheckerboard(Even, chi_e, chi);
  pickCheckerboard(Odd, chi_o, chi);
  // M phi = (Mooee src_e + Meooe src_o , Meooe src_e + Mooee src_o)
  Dwc_compact.M(src, ref); // Reference result from the unpreconditioned operator
  // EO matrix
  Dwc_compact.Mooee(src_e, chi_e); 
  Dwc_compact.Mooee(src_o, chi_o);
  Dwc_compact.Meooe(src_o, phi_e);
  Dwc_compact.Meooe(src_e, phi_o);
  phi_o += chi_o;
  phi_e += chi_e;
  setCheckerboard(phi, phi_e);
  setCheckerboard(phi, phi_o);
  err = ref - phi;
  std::cout << GridLogMessage << "ref (unpreconditioned operator) diff compact : " << norm2(ref) << std::endl;
  std::cout << GridLogMessage << "phi (EO decomposition)          diff compact : " << norm2(phi) << std::endl;
  std::cout << GridLogMessage << "norm diff compact                            : " << norm2(err) << std::endl;
  assert(fabs(norm2(err)) < tolerance);
  Grid_finalize();
 }
--- a/tests/hmc/Test_hmc_ScalarActionNxN.cc
+++ b/tests/hmc/Test_hmc_ScalarActionNxN.cc
@@ -132,8 +132,8 @@ int main(int argc, char **argv) {
  // Checkpointer definition
  CheckpointerParameters CPparams(Reader);
-  //TheHMC.Resources.LoadBinaryCheckpointer(CPparams);
+  TheHMC.Resources.LoadBinaryCheckpointer(CPparams);
-  TheHMC.Resources.LoadScidacCheckpointer(CPparams, SPar);
+  //TheHMC.Resources.LoadScidacCheckpointer(CPparams, SPar); this breaks for compilation without lime
  RNGModuleParameters RNGpar(Reader);
  TheHMC.Resources.SetRNGSeeds(RNGpar);
--- a/tests/hmc/Test_hmc_WG_Production.cc
+++ b/tests/hmc/Test_hmc_WG_Production.cc
@@ -74,10 +74,10 @@ int main(int argc, char **argv) {
  // Checkpointer definition
  CheckpointerParameters CPparams(Reader);
-  //TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
-  // Store metadata in the Scidac checkpointer
+  // Store metadata in the Scidac checkpointer - obviously breaks without LIME
-  TheHMC.Resources.LoadScidacCheckpointer(CPparams, WilsonPar);
+  //TheHMC.Resources.LoadScidacCheckpointer(CPparams, WilsonPar);
  RNGModuleParameters RNGpar(Reader);
  TheHMC.Resources.SetRNGSeeds(RNGpar);
--- a/tests/lanczos/Test_compressed_lanczos.cc
+++ b/tests/lanczos/Test_compressed_lanczos.cc
@@ -37,6 +37,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 using namespace std;
 using namespace Grid;
 #ifdef HAVE_LIME
 template<class Fobj,class CComplex,int nbasis>
 class LocalCoherenceLanczosScidac : public LocalCoherenceLanczos<Fobj,CComplex,nbasis>
 { 
@@ -249,3 +251,11 @@ int main (int argc, char ** argv) {
  Grid_finalize();
 }
 #else
 int main( void )
 {
  return 0 ;
 }
 #endif // HAVE_LIME_H
--- a/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc
+++ b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc
@@ -36,7 +36,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 using namespace std;
 using namespace Grid;
- ;
+
 #ifdef HAVE_LIME
 template<class Fobj,class CComplex,int nbasis>
 class LocalCoherenceLanczosScidac : public LocalCoherenceLanczos<Fobj,CComplex,nbasis>
@@ -250,3 +251,11 @@ int main (int argc, char ** argv) {
  Grid_finalize();
 }
 #else
 int main( void )
 {
  return 0 ;
 }
 #endif
--- a/tests/solver/Test_coarse_even_odd.cc
+++ b/tests/solver/Test_coarse_even_odd.cc
@@ -93,8 +93,16 @@ int main(int argc, char** argv) {
  //                    Setup of Dirac Matrix and Operator                   //
  /////////////////////////////////////////////////////////////////////////////
-  LatticeGaugeField Umu(Grid_f); SU3::HotConfiguration(pRNG_f, Umu);
+  LatticeGaugeField Umu(Grid_f);
-
+#if (Nc==2)
  SU2::HotConfiguration(pRNG_f, Umu);
 #elif (defined Nc==3)
  SU3::HotConfiguration(pRNG_f, Umu);
 #elif (defined Nc==4)
  SU4::HotConfiguration(pRNG_f, Umu);
 #elif (defined Nc==5)
  SU5::HotConfiguration(pRNG_f, Umu);
 #endif
  RealD checkTolerance = (getPrecision<LatticeFermion>::value == 1) ? 1e-7 : 1e-15;
  RealD mass = -0.30;
--- a/tests/solver/Test_dwf_mrhs_cg.cc
+++ b/tests/solver/Test_dwf_mrhs_cg.cc
@@ -34,6 +34,7 @@ using namespace Grid;
 int main (int argc, char ** argv)
 {
 #ifdef HAVE_LIME 
  typedef typename DomainWallFermionR::FermionField FermionField; 
  typedef typename DomainWallFermionR::ComplexField ComplexField; 
  typename DomainWallFermionR::ImplParams params; 
@@ -237,4 +238,5 @@ int main (int argc, char ** argv)
  }
  Grid_finalize();
 #endif // HAVE_LIME
 }
--- a/tests/solver/Test_wilsonclover_cg_prec.cc
+++ b/tests/solver/Test_wilsonclover_cg_prec.cc
@@ -71,7 +71,12 @@ int main (int argc, char ** argv)
  RealD mass = -0.1;
  RealD csw_r = 1.0;
  RealD csw_t = 1.0;
  RealD cF = 1.0;
  WilsonCloverFermionR Dw(Umu, Grid, RBGrid, mass, csw_r, csw_t);
  CompactWilsonCloverFermionR Dw_compact(Umu, Grid, RBGrid, mass, csw_r, csw_t, 0.0);
  WilsonExpCloverFermionR Dwe(Umu, Grid, RBGrid, mass, csw_r, csw_t);
  CompactWilsonExpCloverFermionR Dwe_compact(Umu, Grid, RBGrid, mass, csw_r, csw_t, 0.0);
  //  HermitianOperator<WilsonFermion,LatticeFermion> HermOp(Dw);
  //  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
@@ -80,12 +85,28 @@ int main (int argc, char ** argv)
  LatticeFermion    src_o(&RBGrid);
  LatticeFermion result_o(&RBGrid);
  pickCheckerboard(Odd,src_o,src);
  result_o=Zero();
  SchurDiagMooeeOperator<WilsonCloverFermionR,LatticeFermion> HermOpEO(Dw);
  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
  std::cout << GridLogMessage << "Testing Wilson Clover" << std::endl;
  SchurDiagMooeeOperator<WilsonCloverFermionR,LatticeFermion> HermOpEO(Dw);
  result_o=Zero();
  CG(HermOpEO,src_o,result_o);
  std::cout << GridLogMessage << "Testing Compact Wilson Clover" << std::endl;
  SchurDiagMooeeOperator<CompactWilsonCloverFermionR,LatticeFermion> HermOpEO_compact(Dw_compact);
  result_o=Zero();
  CG(HermOpEO_compact,src_o,result_o);
  std::cout << GridLogMessage << "Testing Wilson Exp Clover" << std::endl;
  SchurDiagMooeeOperator<WilsonExpCloverFermionR,LatticeFermion> HermOpEO_exp(Dwe);
  result_o=Zero();
  CG(HermOpEO_exp,src_o,result_o);
  std::cout << GridLogMessage << "Testing Compact Wilson Exp Clover" << std::endl;
  SchurDiagMooeeOperator<CompactWilsonExpCloverFermionR,LatticeFermion> HermOpEO_exp_compact(Dwe_compact);
  result_o=Zero();
  CG(HermOpEO_exp_compact,src_o,result_o);
  Grid_finalize();
 }
--- a/tests/solver/Test_wilsonclover_cg_schur.cc
+++ b/tests/solver/Test_wilsonclover_cg_schur.cc
@@ -60,18 +60,36 @@ int main (int argc, char ** argv)
  LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu);
  LatticeFermion    src(&Grid); random(pRNG,src);
-  LatticeFermion result(&Grid); result=Zero();
+  LatticeFermion result(&Grid);
  LatticeFermion resid(&Grid);
  RealD mass = -0.1;
  RealD csw_r = 1.0;
  RealD csw_t = 1.0;
  WilsonCloverFermionR Dw(Umu, Grid, RBGrid, mass, csw_r, csw_t);
  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
  SchurRedBlackDiagMooeeSolve<LatticeFermion> SchurSolver(CG);
  RealD mass = -0.1;
  RealD csw_r = 1.0;
  RealD csw_t = 1.0;
  RealD cF = 1.0;
  std::cout << GridLogMessage << "Testing Wilson Clover" << std::endl;
  WilsonCloverFermionR Dw(Umu, Grid, RBGrid, mass, csw_r, csw_t);
  result=Zero();
  SchurSolver(Dw,src,result);
  std::cout << GridLogMessage << "Testing Compact Wilson Clover" << std::endl;
  CompactWilsonCloverFermionR Dw_compact(Umu, Grid, RBGrid, mass, csw_r, csw_t, 0.0);
  result=Zero();
  SchurSolver(Dw_compact,src,result);
  std::cout << GridLogMessage << "Testing Wilson Exp Clover" << std::endl;
  WilsonExpCloverFermionR Dwe(Umu, Grid, RBGrid, mass, csw_r, csw_t);
  result=Zero();
  SchurSolver(Dwe,src,result);
  std::cout << GridLogMessage << "Testing Compact Wilson Exp Clover" << std::endl;
  CompactWilsonExpCloverFermionR Dwe_compact(Umu, Grid, RBGrid, mass, csw_r, csw_t, 0.0);
  result=Zero();
  SchurSolver(Dwe_compact,src,result);
  Grid_finalize();
 }
--- a/Show More
+++ b/Show More
`@@ -1,4 +1,4 @@`

	`export CRAY_ACCEL_TARGET=nvidia80`	`export CRAY_ACCEL_TARGET=nvidia80`

	`module load PrgEnv-gnu cpe-cuda cuda`	`module load PrgEnv-gnu cpe-cuda cudatoolkit/11.4`