mirror of https://github.com/paboyle/Grid.git synced 2026-05-03 08:54:12 +01:00

Compare commits


124 Commits

Author SHA1 Message Date
Peter Boyle 8cc3c522c3 Merge pull request #409 from giltirn/feature/dirichlet-gparity-stage
Import round 5
2022-08-31 18:22:50 -04:00
Christopher Kelly 33e4a0caee Imported changes from feature/gparity_HMC branch:
Rework of WilsonFlow class:
	Fixed a logic error in the smear method where the step index was initialized to 1 rather than 0, so the logged value of tau was too large by epsilon.
	Previously smear_adaptive maintained the current value of tau as a class member while smear computed it separately; both methods now maintain the current value internally and it is updated by the evolve_step routines. Both evolve methods are now const.
	smear_adaptive now also maintains the current value of epsilon internally, allowing it to be a const method and allowing the same class instance to be reused without being reset.
	Replaced the fixed evaluation of the plaquette energy density and plaquette topological charge during smearing with a flexible general strategy in which the user can add arbitrary measurements as functional objects, evaluated at an arbitrary frequency (see the sketch after this entry).
	By default the same plaquette-based measurements are performed, but additional example functions are provided in which the smearing is run with different choices of measurement, returned as an array for further processing.
	Added a method to compute the energy density using the Cloverleaf approach, which has smaller discretization errors.
Added a new tensor utility operation, copyLane, which copies a single SIMD lane between two instances of the same tensor type but potentially different precisions.
To LocalCoherenceLanczos, added the option to compute the high/low eval of the fine operator on every restart, to aid in tuning the Chebyshev.
Added Test_field_array_io, which demonstrates and tests a single-file write of an arbitrary array of fields.
Added Test_evec_compression, which generates evecs using Lanczos and attempts to compress them using the local coherence technique.
Added Test_compressed_lanczos_gparity, which demonstrates the local coherence Lanczos for G-parity BCs.
Added HMC main programs for the 40ID and 48ID G-parity lattices.
2022-07-01 14:12:12 -04:00
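The measurement-hook strategy described in the entry above admits a short usage sketch. This is illustrative only: the method name addMeasurement, the constructor argument order, and the callback signature are assumptions about the reworked interface rather than quotations of it; WilsonFlow, smear, and WilsonLoops::avgPlaquette are the only names taken from the surrounding text.

// Hypothetical sketch of registering a custom measurement with the reworked WilsonFlow.
// addMeasurement, the constructor arguments and the callback signature are assumed, not quoted.
WilsonFlow<PeriodicGimplR> flow(epsilon, Nstep, meas_interval);
std::vector<RealD> plaq_history;
flow.addMeasurement(meas_interval,
  [&](int step, RealD tau, const LatticeGaugeField &V) {
    // record any observable of the flowed field V at flow time tau
    plaq_history.push_back(WilsonLoops<PeriodicGimplR>::avgPlaquette(V));
  });
flow.smear(Vflow, U);   // Vflow: output flowed field, U: input gauge field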
Peter Boyle 1f903d9296 Merge branch 'feature/dirichlet' into feature/dirichlet-gparity 2022-07-01 12:12:50 -04:00
Peter Boyle 4df1e0987f Merge branch 'feature/dirichlet-gparity' of https://github.com/paboyle/Grid into feature/dirichlet-gparity 2022-07-01 09:55:43 -04:00
Peter Boyle 588c2f3cb1 Faster axpy_norm and innerProduct 2022-07-01 09:44:58 -04:00
Peter Boyle bd99fd608c Introduce a non-default stream for compute operations 2022-07-01 09:42:53 -04:00
Peter Boyle 57b442d0de Log memory operations 2022-07-01 09:42:17 -04:00
Peter Boyle 751a4562d7 Timing improvement 2022-07-01 09:41:43 -04:00
Peter Boyle ca66301dee Remove debug 2022-06-30 14:53:12 -04:00
Peter Boyle 808bb59206 Mixed prec DD-RHMC 2022-06-30 13:50:09 -04:00
Peter Boyle 4b7f51d19d Create a new RNG file 2022-06-30 13:49:50 -04:00
Peter Boyle d03152fac4 New file under debug 2022-06-30 13:49:35 -04:00
Peter Boyle 137f190258 Dirichlet implementation 2022-06-30 13:45:07 -04:00
Peter Boyle 53d01312b3 Rough flop counting, need to add M5D, M5Ddag, MooeeInv flops 2022-06-30 13:44:09 -04:00
Peter Boyle 220050822a Speed up M5D and M5Ddag 2022-06-30 13:43:27 -04:00
Peter Boyle 87ad76d81b Initialise timeval 2022-06-30 13:42:46 -04:00
Peter Boyle 4ac1094856 Updated config commands 2022-06-27 12:16:24 -04:00
Peter Boyle d44a57b0af Allow frequency=0 to disable 2022-06-27 12:15:55 -04:00
Peter Boyle dc000d10ee Spelling correction 2022-06-27 12:14:57 -04:00
Peter Boyle 3685f391cf More verbose CG 2022-06-27 12:11:08 -04:00
Peter Boyle efd7338a00 Allow Dirichlet at the round-the-world link 2022-06-27 12:10:27 -04:00
Peter Boyle e1e7b1e224 RNG fix 2022-06-27 12:09:52 -04:00
Peter Boyle 8208a6214f Merge branch 'feature/dirichlet-gparity' into feature/dirichlet 2022-06-15 19:23:48 -04:00
Peter Boyle 3d8146b596 Merge branch 'feature/dirichlet-gparity' of https://github.com/paboyle/Grid into feature/dirichlet-gparity 2022-06-15 19:20:27 -04:00
Peter Boyle 31efa5c4da Script updates for current summit 2022-06-15 19:19:44 -04:00
Peter Boyle d10d30dda8 Script update 2022-06-15 19:18:58 -04:00
Peter Boyle 0e9666bc92 Test update 2022-06-15 19:18:42 -04:00
Peter Boyle 6efd80f104 Printing 2022-06-15 18:23:46 -04:00
Peter Boyle fdef7a1a8c Dirichlet fix 2022-06-15 00:05:20 -04:00
Peter Boyle 501bb117bf Const correct 2022-06-15 00:04:09 -04:00
Peter Boyle 05ca7dc252 Const correctness 2022-06-14 23:41:05 -04:00
Peter Boyle e9648a1635 Useful periodic print. The CG convergence bound is remarkably accurate on the low eigenvalue in numerical tests
2022-06-14 23:40:04 -04:00
Peter Boyle 57bd0a0a22 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-06-01 19:29:38 -04:00
Peter Boyle b49db84b08 Slurm updates 2022-06-01 19:27:42 -04:00
Peter Boyle 583f7c52f3 SSC mark 2022-06-01 19:27:29 -04:00
Peter Boyle 58a86c9164 SSC mark removal 2022-06-01 19:27:06 -04:00
Peter Boyle a25b32847f Crusher patch 2022-06-01 19:26:37 -04:00
Peter Boyle 6f1a2e132b SSC mark causing problems 2022-06-01 19:26:06 -04:00
Peter Boyle b1ede7b46d Faster RNG init 2022-06-01 19:25:42 -04:00
Peter Boyle e762c940c2 Reduce the loop over exterior for GPU to indirection table 2022-06-01 14:29:25 -07:00
Peter Boyle 6a1a198144 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-05-29 11:08:09 -04:00
Peter Boyle 34faa39f4f Clean up Dirichlet. Big oops fix 2022-05-28 17:18:08 -07:00
Peter Boyle 5ddea3829d Extra easier signature for peek 2022-05-28 15:52:39 -07:00
Peter Boyle 7eb29cf529 MPI fix 2022-05-28 15:51:34 -07:00
Peter Boyle f729b9b889 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-05-25 14:16:09 -04:00
Peter Boyle 4f997c5f04 Remove extra face kernels in Dirichlet 2022-05-25 11:15:25 -07:00
Peter Boyle 60f4cb0ffd Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-05-25 12:38:10 -04:00
Peter Boyle 136d843ce7 Crusher updates 2022-05-25 12:36:09 -04:00
Peter Boyle 18028f4309 Merge branch 'develop' into feature/dirichlet 2022-05-24 18:26:18 -07:00
Peter Boyle 5164016740 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2022-05-24 18:25:57 -07:00
Peter Boyle d83beaa890 Update perlmutter 2022-05-24 18:25:00 -07:00
Peter Boyle f9f05e995b Update perlmutter 2022-05-24 18:24:38 -07:00
Peter Boyle e651b9e7ab Clean up stencil with better intranode Dirichlet / DDHMC support.
14TF/s on a Perlmutter node
2022-05-24 18:23:39 -07:00
Peter Boyle 47b4e91473 Verbose change 2022-05-24 18:19:18 -07:00
Peter Boyle 3f31afa4fc Clean up verbose 2022-05-24 18:18:51 -07:00
Peter Boyle af3b065add Merge pull request #403 from fjosw/fix/cuda_11_5_warnings
Fixed nvcc 11.5+ warnings
2022-05-24 11:10:02 -04:00
fjosw 7937ac2bab fix: conditional pragmas according to new NVCC_DIAG_PRAGMA_SUPPORT standard in pugixml/pugixml.cc 2022-05-24 15:31:03 +01:00
fjosw e909aeedf0 fix: conditional pragmas according to new NVCC_DIAG_PRAGMA_SUPPORT standard in Grid_Eigen_Dense.h 2022-05-24 15:29:42 +01:00
fjosw bab8aa8eb0 fix: conditional pragmas according to new NVCC_DIAG_PRAGMA_SUPPORT
standard in DisableWarnings.h
2022-05-24 15:27:40 +01:00
Peter Boyle 38b22f05be Merge pull request #402 from fjosw/fix/clover_warnings
fixed clover warnings
2022-05-24 10:05:27 -04:00
fjosw 617c5362c1 fix: fixed warning: missing return statement at end of non-void function
in CloverHelpers
2022-05-24 11:37:33 +01:00
Peter Boyle 083b58e66d Merge pull request #401 from JPRichings/LocalCoheranceDeflation
Local coherence batch deflation
2022-05-20 11:44:22 -04:00
Peter Boyle 633427a2df Merge pull request #400 from JPRichings/wilson_sweep
bench wilson sweep fix
2022-05-20 11:43:40 -04:00
JPRichings 2031d6910a Merge branch 'paboyle:develop' into wilson_sweep 2022-05-20 16:20:23 +01:00
Peter Boyle f82ce67624 Dirichlet improved 2022-05-19 19:17:11 -07:00
Peter Boyle b52e8ef65a Dirichlet changes 2022-05-19 16:45:41 -07:00
Peter Boyle 2594e3c230 Dirichlet option 2022-05-19 16:45:19 -07:00
Peter Boyle 8cedb45af2 Dirichlet BCs 2022-05-19 16:45:02 -07:00
Peter Boyle aa008cbe99 Updated for new Dirichlet interface 2022-05-19 16:44:39 -07:00
JPRichings 79e34b3eb4 Local Coherence batch deflation 2022-05-19 14:53:17 +01:00
JPRichings 4f3d581ab4 Merge branch 'paboyle:develop' into LocalCoheranceDeflation 2022-05-19 14:46:17 +01:00
Peter Boyle 6fb6ca5b6b Merge branch 'develop' into feature/dirichlet 2022-05-17 09:09:00 -07:00
Peter Boyle b8ee19691c Updated config for PM 2022-05-17 09:08:12 -07:00
Peter Boyle d16427b837 Merge pull request #399 from fjosw/fix/Nc_neq_3
fix: assert for dimensions of compact Wilson clover moved to constructor
2022-05-17 09:03:42 -04:00
JPRichings 4b1997e2f3 wilson sweep test 2022-05-16 15:58:33 +01:00
JPRichings 8939d5dc73 bugfix: eo operator called in correct location 2022-05-16 00:28:28 +01:00
JPRichings b051e00de0 Additional Local Coherence Deflation operator() 2022-05-16 00:25:13 +01:00
fjosw 8aa75b492f Merge branch 'develop' into fix/Nc_neq_3 2022-05-10 14:22:03 +01:00
Peter Boyle 0274f40686 Merge pull request #389 from mbruno46/mbruno-eclover
Feature/expClover
2022-05-10 09:18:19 -04:00
Peter Boyle 77aa147ce5 Merge branch 'develop' into mbruno-eclover 2022-05-10 09:16:53 -04:00
fjosw 32facbd02a fix: assert for dimensions of compact Wilson clover moved to
constructor.
2022-05-10 10:53:22 +01:00
Peter Boyle 4de50ab146 Merge pull request #396 from fjosw/fix/readd_config.h
fix: readded Config.h and Version.h to HFILEs in Grid/Makefile.am
2022-05-09 08:26:48 -04:00
fjosw 8b12a61097 fix: readded Config.h and Version.h to HFILEs in Grid/Makefile.am 2022-05-09 11:53:22 +01:00
Peter Boyle 79ea027c0b Merge pull request #377 from RJHudspith/develop
NERSC and ILDG for non-SU(3) configuration checkpoints
2022-05-03 08:55:48 -04:00
Peter Boyle 62339d437f Merge pull request #387 from lehner/feature/gpt
Parity mass terms for domain wall fermions to enable 4d eofa
2022-05-03 08:52:18 -04:00
Peter Boyle 698e745276 Merge pull request #390 from fjosw/feature/conserved_current_wilson
Conserved current for wilson fermions
2022-05-03 08:51:10 -04:00
Peter Boyle 9a6e2c315d Merge pull request #394 from fjosw/fix/gauge_fix_ErrorOnNoConverge
SteepestDescentGaugeFix now exits when the algorithm does not converge.
2022-05-03 08:49:26 -04:00
fjosw e61fed87db SteepestDescentGaugeFix now exits when the algorithm does not converge.
This behaviour can be altered by setting err_on_no_converge to false.
2022-04-20 15:41:55 +01:00
fjosw b8bc560b51 Test_wilson_conserved_current implemented, all 5d references removed. 2022-04-05 17:33:45 +01:00
fjosw 6bc2483d57 Merge branch 'feature/eclover' into feature/conserved_current_wilson 2022-04-05 15:26:49 +01:00
fjosw 82aecbf4cf Test_wilson_conserved_current added 2022-04-05 15:26:39 +01:00
Mattia Bruno ee23a76aa0 Merge pull request #2 from fjosw/feature/eclover
Feature/eclover
2022-04-05 13:30:13 +02:00
fjosw d7191e5a02 SeqConservedCurrent implemented for Wilson fermions 2022-04-05 11:48:56 +01:00
fjosw c8a824425b Error message added if a conserved current other than the vector current is requested for
Wilson-type fermions.
2022-04-05 10:58:22 +01:00
fjosw f23626a6b8 End scope by additional block in CloverHelpers.h 2022-04-02 16:08:15 +01:00
fjosw 6577a03d16 Explicitly closed views in Exponentiate_Clover 2022-04-01 18:39:12 +01:00
fjosw 427c8695fe Change signs and prefactors for conserved current to mimic the 5d
version.
2022-04-01 16:20:21 +01:00
fjosw 9e82c468ab Multiplication of diagonal mass in exponentiate fixed for gpus 2022-04-01 15:54:43 +01:00
fjosw 603fd96747 Missing link multiplication added. 2022-04-01 10:58:56 +01:00
fjosw fe993c0836 /=2 replaced by *=0.5 2022-03-31 17:08:17 +01:00
fjosw cdf31d52c1 GaugeGrid and typo fixed 2022-03-31 17:04:35 +01:00
fjosw 0542eaf1da First version of conserved current contraction for Wilson type quarks 2022-03-31 17:02:09 +01:00
Christoph Lehner 317bdcf158 nerscio parametrization 2022-03-24 13:10:47 +01:00
Mattia Bruno 9ca2c98882 Merge branch 'develop' of https://github.com/paboyle/Grid into mbruno-eclover 2022-03-22 15:31:37 +01:00
Mattia Bruno 53ae01a34a Merge pull request #1 from fjosw/feature/eclover
Feature/eclover
2022-03-15 15:23:35 +01:00
Christoph Lehner 76c294a7ba open bc fix 2022-03-08 13:55:16 +01:00
fjosw 0c0c2b1e20 Unnecessary arguments of CloverHelpers::Exponentiate_Clover removed. 2022-03-08 09:44:51 +00:00
Christoph Lehner e2fc3a0f04 Merge pull request #28 from paboyle/develop
Sync with Upstream
2022-03-08 09:58:51 +01:00
fjosw 451e7972fd Reintroduced explicit inversion of the Clover term in case of the
CompactExpClover because of the open boundary O(a) improvement. Changed
the timing output to GridLogDebug
2022-03-07 17:43:33 +00:00
fjosw 56c089d347 Removed leftover comments 2022-03-07 16:40:20 +00:00
fjosw acf740e44d Merge pull request #1 from FelixPGZiegler/feature/eclover
Feature/eclover
2022-03-07 16:25:11 +00:00
fziegler 182f513404 Merge remote-tracking branch 'fjosw/feature/eclover' into feature/eclover 2022-03-07 15:22:04 +00:00
fziegler d5b2323a57 included Cayley-Hamilton exponentiation for the compact Wilson exp clover, bug fix for inverse of exp clover 2022-03-07 14:44:24 +00:00
FelixPGZiegler bad18d4417 Merge branch 'paboyle:develop' into feature/eclover 2022-03-07 13:54:10 +00:00
fjosw 438caab25f generate_instantiations.sh now correctly produces instantiations for CompactClover variant, redundant instantiations removed. 2022-02-27 18:27:18 +00:00
fjosw 239e2c1ee6 tests: wilson clover cg tests now include compact variant as well as
exponential wilson clover operators
2022-02-27 18:26:34 +00:00
fjosw 013dc2ef33 tests: core tests for wilson clover and wilson exp clover including
compact version extended/added
2022-02-27 18:13:47 +00:00
Christoph Lehner 9616811c3d Merge branch 'feature/gpt' of https://github.com/lehner/Grid into feature/gpt 2022-02-24 22:03:05 +01:00
Christoph Lehner 8a3002c03b separate left and right masses for CayleyFermion5D 2022-02-24 22:02:56 +01:00
Mattia Bruno 71034f828e attempt to fix broken WilsonExpClover; Compact version still broken, will be replaced by F.Joswig 2022-02-23 01:02:27 +01:00
Mattia Bruno 11437930c5 cleaned up definitions of wilsonclover fermions 2022-02-22 10:45:16 +01:00
Mattia Bruno 3d44aa9cb9 cleaned up cloverhelpers; fixed test compact_clover which runs 2022-02-22 01:10:19 +01:00
Mattia Bruno 2851870d70 expClover support via helpers template class 2022-02-22 00:05:43 +01:00
RJHudspith 0bd83cdbda Fixes for Nc!=3 Nersc IO, Gauge and Gauge_NCxNC compatible with GLU. Trace normalisation changed in places removing explicit threes. Guards against non-su3 tests and tests failing when LIME is not compiled. 2021-11-28 21:51:03 +01:00
104 changed files with 6743 additions and 1359 deletions
+11 -3
View File
@@ -44,14 +44,22 @@ directory
#ifdef __NVCC__
//disables nvcc specific warning in json.hpp
#pragma clang diagnostic ignored "-Wdeprecated-register"
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
//disables nvcc specific warning in json.hpp
#pragma nv_diag_suppress unsigned_compare_with_zero
#pragma nv_diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma nv_diag_suppress esa_on_defaulted_function_ignored
#pragma nv_diag_suppress extra_semicolon
#else
//disables nvcc specific warning in json.hpp
#pragma diag_suppress unsigned_compare_with_zero
#pragma diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma diag_suppress esa_on_defaulted_function_ignored
#pragma diag_suppress extra_semicolon
//Eigen only
#endif
#endif
// Disable vectorisation in Eigen on the Power8/9 and PowerPC
+4
View File
@@ -14,7 +14,11 @@
/* NVCC save and restore compile environment*/
#ifdef __NVCC__
#pragma push
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
#pragma nv_diag_suppress code_is_unreachable
#else
#pragma diag_suppress code_is_unreachable
#endif
#pragma push_macro("__CUDA_ARCH__")
#pragma push_macro("__NVCC__")
#pragma push_macro("__CUDACC__")
+17 -8
View File
@@ -120,6 +120,9 @@ public:
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
GridStopWatch IterationTimer;
IterationTimer.Start();
c = cp;
MatrixTimer.Start();
@@ -152,8 +155,14 @@ public:
LinearCombTimer.Stop();
LinalgTimer.Stop();
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
IterationTimer.Stop();
if ( (k % 500) == 0 ) {
std::cout << GridLogMessage << "ConjugateGradient: Iteration " << k
<< " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
} else {
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
<< " residual " << sqrt(cp/ssq) << " target " << Tolerance << " took " << IterationTimer.Elapsed() << std::endl;
}
// Stopping condition
if (cp <= rsq) {
@@ -170,13 +179,13 @@ public:
<< "\tTrue residual " << true_residual
<< "\tTarget " << Tolerance << std::endl;
std::cout << GridLogIterative << "Time breakdown "<<std::endl;
std::cout << GridLogIterative << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "Time breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
@@ -44,7 +44,7 @@ public:
using OperatorFunction<Field>::operator();
RealD Tolerance;
// RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
@@ -324,8 +324,8 @@ public:
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMarix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tShift " << ShiftTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
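The per-iteration timing added above follows Grid's stopwatch idiom; a minimal sketch of that pattern, using only names that appear in the hunk (the timed body is a placeholder):

// Minimal sketch of the GridStopWatch idiom used for the per-iteration timing above.
GridStopWatch IterationTimer;
IterationTimer.Start();
// ... body of one CG iteration (placeholder) ...
IterationTimer.Stop();
std::cout << GridLogMessage << "Iteration took " << IterationTimer.Elapsed() << std::endl;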
+37 -1
View File
@@ -113,7 +113,43 @@ public:
blockPromote(guess_coarse,guess,subspace);
guess.Checkerboard() = src.Checkerboard();
};
};
void operator()(const std::vector<FineField> &src,std::vector<FineField> &guess) {
int Nevec = (int)evec_coarse.size();
int Nsrc = (int)src.size();
// make temp variables
std::vector<CoarseField> src_coarse(Nsrc,evec_coarse[0].Grid());
std::vector<CoarseField> guess_coarse(Nsrc,evec_coarse[0].Grid());
//Preporcessing
std::cout << GridLogMessage << "Start BlockProject for loop" << std::endl;
for (int j=0;j<Nsrc;j++)
{
guess_coarse[j] = Zero();
std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
blockProject(src_coarse[j],src[j],subspace);
}
//deflation set up for eigen vector batchsize 1 and source batch size equal number of sources
std::cout << GridLogMessage << "Start ProjectAccum for loop" << std::endl;
for (int i=0;i<Nevec;i++)
{
std::cout << GridLogMessage << "ProjectAccum Nvec: " << i << std::endl;
const CoarseField & tmp = evec_coarse[i];
for (int j=0;j<Nsrc;j++)
{
axpy(guess_coarse[j],TensorRemove(innerProduct(tmp,src_coarse[j])) / eval_coarse[i],tmp,guess_coarse[j]);
}
}
//postprocessing
std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl;
for (int j=0;j<Nsrc;j++)
{
std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
blockPromote(guess_coarse[j],guess[j],subspace);
guess[j].Checkerboard() = src[j].Checkerboard();
}
};
};
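Written out, the batched operator() above builds, for each source, the standard coarse-grid deflation estimate, with blockProject/blockPromote acting as the restriction $P$ and prolongation $P^{\dagger}$ onto the subspace:

$$ \mathrm{guess}_j \;=\; P^{\dagger} \sum_{i=0}^{N_{\mathrm{evec}}-1} v_i\,
   \frac{\langle v_i,\; P\,\mathrm{src}_j\rangle}{\lambda_i},
   \qquad j = 0,\dots,N_{\mathrm{src}}-1, $$

where $v_i$, $\lambda_i$ are the stored coarse eigenvectors and eigenvalues. The middle loop accumulates the sum with axpy and batches over sources, so each coarse eigenvector is traversed only once per batch.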
@@ -146,14 +146,21 @@ public:
LinearOperatorBase<FineField> &_Linop;
RealD _coarse_relax_tol;
std::vector<FineField> &_subspace;
int _largestEvalIdxForReport; //The convergence of the LCL is based on the evals of the coarse grid operator, not those of the underlying fine grid operator
//As a result we do not know what the eval range of the fine operator is until the very end, making tuning the Cheby bounds very difficult
//To work around this issue, every restart we separately reconstruct the fine operator eval for the lowest and highest evec and print these
//out alongside the evals of the coarse operator. To do so we need to know the index of the largest eval (i.e. Nstop-1)
//NOTE: If largestEvalIdxForReport=-1 (default) then this is not performed
ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField> &Poly,
OperatorFunction<FineField> &smoother,
LinearOperatorBase<FineField> &Linop,
std::vector<FineField> &subspace,
RealD coarse_relax_tol=5.0e3)
RealD coarse_relax_tol=5.0e3,
int largestEvalIdxForReport=-1)
: _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace),
_coarse_relax_tol(coarse_relax_tol)
_coarse_relax_tol(coarse_relax_tol), _largestEvalIdxForReport(largestEvalIdxForReport)
{ };
//evalMaxApprox: approximation of largest eval of the fine Chebyshev operator (suitably wrapped by block projection)
@@ -179,6 +186,12 @@ public:
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<std::endl;
if(_largestEvalIdxForReport != -1 && (j==0 || j==_largestEvalIdxForReport)){
std::cout<<GridLogIRL << "Estimating true eval of fine grid operator for eval idx " << j << std::endl;
RealD tmp_eval;
ReconstructEval(j,eresid,B,tmp_eval,1.0); //don't use evalMaxApprox of coarse operator! (cf below)
}
int conv=0;
if( (vv<eresid*eresid) ) conv = 1;
return conv;
@@ -409,7 +422,7 @@ public:
//////////////////////////////////////////////////////////////////////////////////////////////////
Chebyshev<FineField> ChebySmooth(cheby_smooth); //lower order Chebyshev of fine operator on fine grid used to smooth regenerated eigenvectors
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax,Nstop-1);
evals_coarse.resize(Nm);
evec_coarse.resize(Nm,_CoarseGrid);
+3 -4
View File
@@ -372,7 +372,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
double off_node_bytes=0.0;
int tag;
if ( dox ) {
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
@@ -382,7 +382,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
}
}
if (dor) {
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
@@ -390,16 +390,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
list.push_back(xrq);
off_node_bytes+=bytes;
} else {
// TODO : make a OMP loop on CPU, call threaded bcopy
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
// std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl;
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
}
}
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
this->StencilSendToRecvFromComplete(list,dir);
list.resize(0);
}
return off_node_bytes;
+6
View File
@@ -125,6 +125,12 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
//////////////////////////////////////////////////////////
// Peek a scalar object from the SIMD array
//////////////////////////////////////////////////////////
template<class vobj>
typename vobj::scalar_object peekSite(const Lattice<vobj> &l,const Coordinate &site){
typename vobj::scalar_object s;
peekSite(s,l,site);
return s;
}
template<class vobj,class sobj>
void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
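The overload added above returns the scalar object by value instead of through an out-parameter. An equivalent pair of calls, with Umu standing in for any Lattice field:

// New convenience form added above:
Coordinate site({0, 1, 2, 3});
auto s = peekSite(Umu, site);            // returns vobj::scalar_object by value
// Pre-existing form that it wraps:
// typename vobj::scalar_object s2; peekSite(s2, Umu, site);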
+40 -11
View File
@@ -232,6 +232,7 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
const uint64_t sites = grid->oSites();
// Might make all code paths go this way.
#if 0
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
@@ -241,15 +242,31 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
autoView( right_v,right, AcceleratorRead);
// GPU - SIMT lane compliance...
accelerator_for( ss, sites, 1,{
auto x_l = left_v[ss];
auto y_l = right_v[ss];
inner_tmp_v[ss]=innerProductD(x_l,y_l);
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss);
auto y_l = right_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l));
});
}
#else
typedef decltype(innerProduct(vobj(),vobj())) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
{
autoView( left_v , left, AcceleratorRead);
autoView( right_v,right, AcceleratorRead);
// GPU - SIMT lane compliance...
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss);
auto y_l = right_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
});
}
#endif
// This is in single precision and fails some tests
auto anrm = sum(inner_tmp_v,sites);
auto anrm = sumD(inner_tmp_v,sites);
nrm = anrm;
return nrm;
}
@@ -283,7 +300,7 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
conformable(x,y);
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
// typedef typename vobj::vector_typeD vector_type;
RealD nrm;
GridBase *grid = x.Grid();
@@ -295,17 +312,29 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
autoView( x_v, x, AcceleratorRead);
autoView( y_v, y, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite);
#if 0
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, 1,{
auto tmp = a*x_v[ss]+b*y_v[ss];
inner_tmp_v[ss]=innerProductD(tmp,tmp);
z_v[ss]=tmp;
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp));
coalescedWrite(z_v[ss],tmp);
});
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
#else
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
coalescedWrite(z_v[ss],tmp);
});
nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
#endif
grid->GlobalSum(nrm);
return nrm;
}
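Both kernels above move from one thread per site calling innerProductD to one thread per SIMT lane with coalesced accesses, accumulating in the working precision and promoting only the final reduction to double via sumD. A condensed sketch of that pattern, using only names from the hunk above and not intended as a further change to the file:

// Condensed sketch of the coalesced per-lane reduction pattern adopted above.
template<class vobj>
inline ComplexD sketchRankInnerProduct(const Lattice<vobj> &left, const Lattice<vobj> &right)
{
  typedef decltype(innerProduct(vobj(), vobj())) inner_t;
  GridBase *grid = left.Grid();
  const uint64_t sites = grid->oSites();
  const int nsimd = grid->Nsimd();
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  {
    autoView(left_v,  left,  AcceleratorRead);
    autoView(right_v, right, AcceleratorRead);
    accelerator_for(ss, sites, nsimd, {
      // one thread per SIMT lane: coalesced reads, per-lane write
      coalescedWrite(inner_tmp_v[ss], innerProduct(left_v(ss), right_v(ss)));
    });
  }
  ComplexD nrm;
  nrm = sumD(inner_tmp_v, sites);   // final reduction carried out in double precision
  return nrm;                       // rank-local, as in rankInnerProduct above
}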
+24
View File
@@ -424,9 +424,32 @@ public:
// MT implementation does not implement fast discard even though
// in principle this is possible
////////////////////////////////////////////////
#if 1
thread_for( lidx, _grid->lSites(), {
int gidx;
int o_idx;
int i_idx;
int rank;
Coordinate pcoor;
Coordinate lcoor;
Coordinate gcoor;
_grid->LocalIndexToLocalCoor(lidx,lcoor);
pcoor=_grid->ThisProcessorCoor();
_grid->ProcessorCoorLocalCoorToGlobalCoor(pcoor,lcoor,gcoor);
_grid->GlobalCoorToGlobalIndex(gcoor,gidx);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
assert(rank == _grid->ThisRank() );
int l_idx=generator_idx(o_idx,i_idx);
_generators[l_idx] = master_engine;
Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
});
#else
// Everybody loops over global volume.
thread_for( gidx, _grid->_gsites, {
// Where is it?
int rank;
int o_idx;
@@ -443,6 +466,7 @@ public:
Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
}
});
#endif
#else
////////////////////////////////////////////////////////////////
// Machine and thread decomposition dependent seeding is efficient
+4 -2
View File
@@ -65,6 +65,7 @@ GridLogger GridLogSolver (1, "Solver", GridLogColours, "NORMAL");
GridLogger GridLogError (1, "Error" , GridLogColours, "RED");
GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
GridLogger GridLogMemory (1, "Memory", GridLogColours, "NORMAL");
GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE");
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE");
@@ -72,9 +73,10 @@ GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE");
void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogError.Active(0);
GridLogError.Active(1);
GridLogWarning.Active(0);
GridLogMessage.Active(1); // at least the messages should be always on
GridLogMemory.Active(0); // at least the messages should be always on
GridLogIterative.Active(0);
GridLogDebug.Active(0);
GridLogPerformance.Active(0);
@@ -83,7 +85,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogHMC.Active(1);
for (int i = 0; i < logstreams.size(); i++) {
if (logstreams[i] == std::string("Error")) GridLogError.Active(1);
if (logstreams[i] == std::string("Memory")) GridLogMemory.Active(1);
if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1);
if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0);
if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
+1
View File
@@ -183,6 +183,7 @@ extern GridLogger GridLogPerformance;
extern GridLogger GridLogIterative ;
extern GridLogger GridLogIntegrator ;
extern GridLogger GridLogHMC;
extern GridLogger GridLogMemory;
extern Colours GridLogColours;
std::string demangle(const char* name) ;
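The new Memory channel above behaves like the other GridLogger streams: it is constructed but masked off in GridLogConfigure unless requested by name in logstreams. A minimal use; the command-line spelling for populating logstreams is an assumption here:

// Writing to the new channel; silent unless "Memory" is among the configured logstreams.
std::cout << GridLogMemory << "stencil buffer allocate " << bytes << " bytes" << std::endl;
// Enabling it by name, e.g. via Grid's --log option (assumed spelling): --log Error,Warning,Memory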
+6 -3
View File
@@ -31,6 +31,7 @@ directory
#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>
#include <map>
#include <pwd.h>
@@ -654,7 +655,8 @@ class IldgWriter : public ScidacWriter {
// Fill ILDG header data struct
//////////////////////////////////////////////////////
ildgFormat ildgfmt ;
ildgfmt.field = std::string("su3gauge");
const std::string stNC = std::to_string( Nc ) ;
ildgfmt.field = std::string("su"+stNC+"gauge");
if ( format == std::string("IEEE32BIG") ) {
ildgfmt.precision = 32;
@@ -871,7 +873,8 @@ class IldgReader : public GridLimeReader {
} else {
assert(found_ildgFormat);
assert ( ildgFormat_.field == std::string("su3gauge") );
const std::string stNC = std::to_string( Nc ) ;
assert ( ildgFormat_.field == std::string("su"+stNC+"gauge") );
///////////////////////////////////////////////////////////////////////////////////////
// Populate our Grid metadata as best we can
@@ -879,7 +882,7 @@ class IldgReader : public GridLimeReader {
std::ostringstream vers; vers << ildgFormat_.version;
FieldMetaData_.hdr_version = vers.str();
FieldMetaData_.data_type = std::string("4D_SU3_GAUGE_3X3");
FieldMetaData_.data_type = std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC);
FieldMetaData_.nd=4;
FieldMetaData_.dimension.resize(4);
+18 -15
View File
@@ -6,8 +6,8 @@
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -182,8 +182,8 @@ class GaugeStatistics
public:
void operator()(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
{
header.link_trace=WilsonLoops<Impl>::linkTrace(data);
header.plaquette =WilsonLoops<Impl>::avgPlaquette(data);
header.link_trace = WilsonLoops<Impl>::linkTrace(data);
header.plaquette = WilsonLoops<Impl>::avgPlaquette(data);
}
};
typedef GaugeStatistics<PeriodicGimplD> PeriodicGaugeStatistics;
@@ -203,20 +203,24 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
//////////////////////////////////////////////////////////////////////
inline void reconstruct3(LorentzColourMatrix & cm)
{
const int x=0;
const int y=1;
const int z=2;
assert( Nc < 4 && Nc > 1 ) ;
for(int mu=0;mu<Nd;mu++){
cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
#if Nc == 2
cm(mu)()(1,0) = -adj(cm(mu)()(0,y)) ;
cm(mu)()(1,1) = adj(cm(mu)()(0,x)) ;
#else
const int x=0 , y=1 , z=2 ; // a little disinenuous labelling
cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
#endif
}
}
////////////////////////////////////////////////////////////////////////////////
// Some data types for intermediate storage
////////////////////////////////////////////////////////////////////////////////
template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;
template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, Nc-1>, Nd >;
typedef iLorentzColour2x3<Complex> LorentzColour2x3;
typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
@@ -278,7 +282,6 @@ struct GaugeSimpleMunger{
template <class fobj, class sobj>
struct GaugeSimpleUnmunger {
void operator()(sobj &in, fobj &out) {
for (int mu = 0; mu < Nd; mu++) {
for (int i = 0; i < Nc; i++) {
@@ -317,8 +320,8 @@ template<class fobj,class sobj>
struct Gauge3x2munger{
void operator() (fobj &in,sobj &out){
for(int mu=0;mu<Nd;mu++){
for(int i=0;i<2;i++){
for(int j=0;j<3;j++){
for(int i=0;i<Nc-1;i++){
for(int j=0;j<Nc;j++){
out(mu)()(i,j) = in(mu)(i)(j);
}}
}
@@ -330,8 +333,8 @@ template<class fobj,class sobj>
struct Gauge3x2unmunger{
void operator() (sobj &in,fobj &out){
for(int mu=0;mu<Nd;mu++){
for(int i=0;i<2;i++){
for(int j=0;j<3;j++){
for(int i=0;i<Nc-1;i++){
for(int j=0;j<Nc;j++){
out(mu)(i)(j) = in(mu)()(i,j);
}}
}
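The reconstruct3 routine and the (Nc-1) x Nc mungers above implement the usual two-row compression of a special unitary link: only the first Nc-1 rows are stored on disk and the last row is rebuilt from unitarity. Explicitly, matching the two branches of reconstruct3,

$$ N_c = 3:\quad u_{3k} = \bigl(\epsilon_{kmn}\, u_{1m}\, u_{2n}\bigr)^{*}
   = \bigl(u_1 \times u_2\bigr)^{*}_{k}, \qquad
   N_c = 2:\quad u_{2} = \bigl(-u_{12}^{*},\; u_{11}^{*}\bigr), $$

where $u_1, u_2$ are the stored rows.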
+40 -23
View File
@@ -9,6 +9,7 @@
Author: Matt Spraggs <matthew.spraggs@gmail.com>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -30,6 +31,8 @@
#ifndef GRID_NERSC_IO_H
#define GRID_NERSC_IO_H
#include <string>
NAMESPACE_BEGIN(Grid);
using namespace Grid;
@@ -147,15 +150,17 @@ public:
std::string format(header.floating_point);
int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
const int ieee32big = (format == std::string("IEEE32BIG"));
const int ieee32 = (format == std::string("IEEE32"));
const int ieee64big = (format == std::string("IEEE64BIG"));
const int ieee64 = (format == std::string("IEEE64") || \
format == std::string("IEEE64LITTLE"));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
// depending on datatype, set up munger;
// munger is a function of <floating point, Real, data_type>
if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
const std::string stNC = std::to_string( Nc ) ;
if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE") ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F>
(Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
@@ -166,7 +171,7 @@ public:
(Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
} else if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC) ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
@@ -211,27 +216,29 @@ public:
template<class GaugeStats=PeriodicGaugeStatistics>
static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
std::string file,
std::string ens_label = std::string("DWF"))
std::string ens_label = std::string("DWF"),
std::string ens_id = std::string("UKQCD"),
unsigned int sequence_number = 1)
{
writeConfiguration(Umu,file,0,1,ens_label);
writeConfiguration(Umu,file,0,1,ens_label,ens_id,sequence_number);
}
template<class GaugeStats=PeriodicGaugeStatistics>
static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
std::string file,
int two_row,
int bits32,
std::string ens_label = std::string("DWF"))
std::string ens_label = std::string("DWF"),
std::string ens_id = std::string("UKQCD"),
unsigned int sequence_number = 1)
{
typedef vLorentzColourMatrixD vobj;
typedef typename vobj::scalar_object sobj;
FieldMetaData header;
///////////////////////////////////////////
// Following should become arguments
///////////////////////////////////////////
header.sequence_number = 1;
header.ensemble_id = std::string("UKQCD");
header.sequence_number = sequence_number;
header.ensemble_id = ens_id;
header.ensemble_label = ens_label;
header.hdr_version = "1.0" ;
typedef LorentzColourMatrixD fobj3D;
typedef LorentzColour2x3D fobj2D;
@@ -245,10 +252,14 @@ public:
uint64_t offset;
// Sod it -- always write 3x3 double
header.floating_point = std::string("IEEE64BIG");
header.data_type = std::string("4D_SU3_GAUGE_3x3");
GaugeSimpleUnmunger<fobj3D,sobj> munge;
// Sod it -- always write NcxNc double
header.floating_point = std::string("IEEE64BIG");
const std::string stNC = std::to_string( Nc ) ;
if( two_row ) {
header.data_type = std::string("4D_SU" + stNC + "_GAUGE" );
} else {
header.data_type = std::string("4D_SU" + stNC + "_GAUGE_" + stNC + "x" + stNC );
}
if ( grid->IsBoss() ) {
truncate(file);
offset = writeHeader(header,file);
@@ -256,8 +267,15 @@ public:
grid->Broadcast(0,(void *)&offset,sizeof(offset));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb);
if( two_row ) {
Gauge3x2unmunger<fobj2D,sobj> munge;
BinaryIO::writeLatticeObject<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb);
} else {
GaugeSimpleUnmunger<fobj3D,sobj> munge;
BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb);
}
header.checksum = nersc_csum;
if ( grid->IsBoss() ) {
writeHeader(header,file);
@@ -289,8 +307,7 @@ public:
header.plaquette=0.0;
MachineCharacteristics(header);
uint64_t offset;
uint64_t offset;
#ifdef RNG_RANLUX
header.floating_point = std::string("UINT64");
header.data_type = std::string("RANLUX48");
@@ -330,7 +347,7 @@ public:
GridBase *grid = parallel.Grid();
uint64_t offset = readHeader(file,grid,header);
uint64_t offset = readHeader(file,grid,header);
FieldMetaData clone(header);
-8
View File
@@ -72,17 +72,9 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
inline uint64_t cyclecount(void){
return 0;
}
#define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx")
#define __SSC_STOP __SSC_MARK(0x110)
#define __SSC_START __SSC_MARK(0x111)
#else
#define __SSC_MARK(mark)
#define __SSC_STOP
#define __SSC_START
/*
* cycle counters arch dependent
*/
+2 -2
View File
@@ -39,9 +39,9 @@ NAMESPACE_BEGIN(Grid)
// C++11 time facilities better?
inline double usecond(void) {
struct timeval tv;
#ifdef TIMERS_ON
tv.tv_sec = 0;
tv.tv_usec = 0;
gettimeofday(&tv,NULL);
#endif
return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
}
+4
View File
@@ -16,8 +16,12 @@
#ifdef __NVCC__
#pragma push
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
#pragma nv_diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
#else
#pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
#endif
#endif
#include "pugixml.h"
+10 -2
View File
@@ -38,24 +38,32 @@ NAMESPACE_BEGIN(Grid);
struct GparityWilsonImplParams {
Coordinate twists;
//mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
GparityWilsonImplParams() : twists(Nd, 0) {};
Coordinate dirichlet; // Blocksize of dirichlet BCs
GparityWilsonImplParams() : twists(Nd, 0), dirichlet(Nd, 0) {};
};
struct WilsonImplParams {
bool overlapCommsCompute;
Coordinate dirichlet; // Blocksize of dirichlet BCs
AcceleratorVector<Real,Nd> twist_n_2pi_L;
AcceleratorVector<Complex,Nd> boundary_phases;
WilsonImplParams() {
dirichlet.resize(Nd,0);
boundary_phases.resize(Nd, 1.0);
twist_n_2pi_L.resize(Nd, 0.0);
};
WilsonImplParams(const AcceleratorVector<Complex,Nd> phi) : boundary_phases(phi), overlapCommsCompute(false) {
twist_n_2pi_L.resize(Nd, 0.0);
dirichlet.resize(Nd,0);
}
};
struct StaggeredImplParams {
StaggeredImplParams() {};
Coordinate dirichlet; // Blocksize of dirichlet BCs
StaggeredImplParams()
{
dirichlet.resize(Nd,0);
};
};
struct OneFlavourRationalParams : Serializable {
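The new dirichlet member added to each Impl parameter struct above is a per-direction Dirichlet block size, with the default of zero presumably leaving that direction untouched. Setting it is a one-liner; the values below are arbitrary examples:

// Arbitrary example: Dirichlet blocking of 8 in the t-direction only.
WilsonImplParams params;
params.dirichlet = Coordinate({0, 0, 0, 8});   // one entry per dimension, 0 = off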
+11 -3
View File
@@ -68,9 +68,17 @@ public:
///////////////////////////////////////////////////////////////
// Support for MADWF tricks
///////////////////////////////////////////////////////////////
virtual RealD Mass(void) { return mass; };
RealD Mass(void) { return (mass_plus + mass_minus) / 2.0; };
RealD MassPlus(void) { return mass_plus; };
RealD MassMinus(void) { return mass_minus; };
void SetMass(RealD _mass) {
mass=_mass;
mass_plus=mass_minus=_mass;
SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c); // Reset coeffs
} ;
void SetMass(RealD _mass_plus, RealD _mass_minus) {
mass_plus=_mass_plus;
mass_minus=_mass_minus;
SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c); // Reset coeffs
} ;
void P(const FermionField &psi, FermionField &chi);
@@ -108,7 +116,7 @@ public:
void MeooeDag5D (const FermionField &in, FermionField &out);
// protected:
RealD mass;
RealD mass_plus, mass_minus;
// Save arguments to SetCoefficientsInternal
Vector<Coeff_t> _gamma;
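A short usage note for the split-mass interface above. The action type and its constructor arguments are placeholders; only SetMass, Mass, MassPlus and MassMinus are taken from the hunk:

// Placeholder action construction; only the SetMass/Mass calls reflect the new interface.
MobiusFermionD Ddwf(U, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, b, c);
Ddwf.SetMass(m_plus, m_minus);          // independent masses (the "parity mass terms" of PR #387)
RealD mbar = Ddwf.Mass();               // now returns (m_plus + m_minus) / 2
RealD mp   = Ddwf.MassPlus();
RealD mm   = Ddwf.MassMinus();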
+435
View File
@@ -0,0 +1,435 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonCloverFermionImplementation.h
Copyright (C) 2017 - 2022
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Mattia Bruno <mattia.bruno@cern.ch>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/Grid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
////////////////////////////////////////////
// Standard Clover
// (4+m0) + csw * clover_term
// Exp Clover
// (4+m0) * exp(csw/(4+m0) clover_term)
// = (4+m0) + csw * clover_term + ...
////////////////////////////////////////////
NAMESPACE_BEGIN(Grid);
//////////////////////////////////
// Generic Standard Clover
//////////////////////////////////
template<class Impl>
class CloverHelpers: public WilsonCloverHelpers<Impl> {
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
typedef WilsonCloverHelpers<Impl> Helpers;
static void Instantiate(CloverField& CloverTerm, CloverField& CloverTermInv, RealD csw_t, RealD diag_mass) {
GridBase *grid = CloverTerm.Grid();
CloverTerm += diag_mass;
int lvol = grid->lSites();
int DimRep = Impl::Dimension;
{
autoView(CTv,CloverTerm,CpuRead);
autoView(CTIv,CloverTermInv,CpuWrite);
thread_for(site, lvol, {
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
typename SiteClover::scalar_object Qx = Zero(), Qxinv = Zero();
peekLocalSite(Qx, CTv, lcoor);
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++){
auto zz = Qx()(j, k)(a, b);
EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
}
EigenInvCloverOp = EigenCloverOp.inverse();
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++)
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
pokeLocalSite(Qxinv, CTIv, lcoor);
});
}
}
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
return Helpers::Cmunu(U, lambda, mu, nu);
}
};
//////////////////////////////////
// Generic Exp Clover
//////////////////////////////////
template<class Impl>
class ExpCloverHelpers: public WilsonCloverHelpers<Impl> {
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
typedef WilsonCloverHelpers<Impl> Helpers;
// Can this be avoided?
static void IdentityTimesC(const CloverField& in, RealD c) {
int DimRep = Impl::Dimension;
autoView(in_v, in, AcceleratorWrite);
accelerator_for(ss, in.Grid()->oSites(), 1, {
for (int sa=0; sa<Ns; sa++)
for (int ca=0; ca<DimRep; ca++)
in_v[ss]()(sa,sa)(ca,ca) = c;
});
}
static int getNMAX(RealD prec, RealD R) {
/* compute stop condition for exponential */
int NMAX=1;
RealD cond=R*R/2.;
while (cond*std::exp(R)>prec) {
NMAX++;
cond*=R/(double)(NMAX+1);
}
return NMAX;
}
static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
static void Instantiate(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
GridBase* grid = Clover.Grid();
CloverField ExpClover(grid);
int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);
Clover *= (1.0/diag_mass);
// Taylor expansion, slow but generic
// Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
// qN = cN
// qn = cn + qn+1 X
std::vector<RealD> cn(NMAX+1);
cn[0] = 1.0;
for (int i=1; i<=NMAX; i++)
cn[i] = cn[i-1] / RealD(i);
ExpClover = Zero();
IdentityTimesC(ExpClover, cn[NMAX]);
for (int i=NMAX-1; i>=0; i--)
ExpClover = ExpClover * Clover + cn[i];
// prepare inverse
CloverInv = (-1.0)*Clover;
Clover = ExpClover * diag_mass;
ExpClover = Zero();
IdentityTimesC(ExpClover, cn[NMAX]);
for (int i=NMAX-1; i>=0; i--)
ExpClover = ExpClover * CloverInv + cn[i];
CloverInv = ExpClover * (1.0/diag_mass);
}
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
assert(0);
return lambda;
}
};
//////////////////////////////////
// Compact Standard Clover
//////////////////////////////////
template<class Impl>
class CompactCloverHelpers: public CompactWilsonCloverHelpers<Impl>,
public WilsonCloverHelpers<Impl> {
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
INHERIT_COMPACT_CLOVER_TYPES(Impl);
typedef WilsonCloverHelpers<Impl> Helpers;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
static void MassTerm(CloverField& Clover, RealD diag_mass) {
Clover += diag_mass;
}
static void Exponentiate_Clover(CloverDiagonalField& Diagonal,
CloverTriangleField& Triangle,
RealD csw_t, RealD diag_mass) {
// Do nothing
}
// TODO: implement Cmunu for better performances with compact layout, but don't do it
// here, but rather in WilsonCloverHelpers.h -> CompactWilsonCloverHelpers
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
return Helpers::Cmunu(U, lambda, mu, nu);
}
};
//////////////////////////////////
// Compact Exp Clover
//////////////////////////////////
template<class Impl>
class CompactExpCloverHelpers: public CompactWilsonCloverHelpers<Impl> {
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
INHERIT_COMPACT_CLOVER_TYPES(Impl);
template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
static void MassTerm(CloverField& Clover, RealD diag_mass) {
// do nothing!
// mass term is multiplied to exp(Clover) below
}
static int getNMAX(RealD prec, RealD R) {
/* compute stop condition for exponential */
int NMAX=1;
RealD cond=R*R/2.;
while (cond*std::exp(R)>prec) {
NMAX++;
cond*=R/(double)(NMAX+1);
}
return NMAX;
}
static int getNMAX(Lattice<iImplCloverDiagonal<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
static int getNMAX(Lattice<iImplCloverDiagonal<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
static void ExponentiateHermitean6by6(const iMatrix<ComplexD,6> &arg, const RealD& alpha, const std::vector<RealD>& cN, const int Niter, iMatrix<ComplexD,6>& dest){
typedef iMatrix<ComplexD,6> mat;
RealD qn[6];
RealD qnold[6];
RealD p[5];
RealD trA2, trA3, trA4;
mat A2, A3, A4, A5;
A2 = alpha * alpha * arg * arg;
A3 = alpha * arg * A2;
A4 = A2 * A2;
A5 = A2 * A3;
trA2 = toReal( trace(A2) );
trA3 = toReal( trace(A3) );
trA4 = toReal( trace(A4));
p[0] = toReal( trace(A3 * A3)) / 6.0 - 0.125 * trA4 * trA2 - trA3 * trA3 / 18.0 + trA2 * trA2 * trA2/ 48.0;
p[1] = toReal( trace(A5)) / 5.0 - trA3 * trA2 / 6.0;
p[2] = toReal( trace(A4)) / 4.0 - 0.125 * trA2 * trA2;
p[3] = trA3 / 3.0;
p[4] = 0.5 * trA2;
qnold[0] = cN[Niter];
qnold[1] = 0.0;
qnold[2] = 0.0;
qnold[3] = 0.0;
qnold[4] = 0.0;
qnold[5] = 0.0;
for(int i = Niter-1; i >= 0; i--)
{
qn[0] = p[0] * qnold[5] + cN[i];
qn[1] = p[1] * qnold[5] + qnold[0];
qn[2] = p[2] * qnold[5] + qnold[1];
qn[3] = p[3] * qnold[5] + qnold[2];
qn[4] = p[4] * qnold[5] + qnold[3];
qn[5] = qnold[4];
qnold[0] = qn[0];
qnold[1] = qn[1];
qnold[2] = qn[2];
qnold[3] = qn[3];
qnold[4] = qn[4];
qnold[5] = qn[5];
}
mat unit(1.0);
dest = (qn[0] * unit + qn[1] * alpha * arg + qn[2] * A2 + qn[3] * A3 + qn[4] * A4 + qn[5] * A5);
}
static void Exponentiate_Clover(CloverDiagonalField& Diagonal, CloverTriangleField& Triangle, RealD csw_t, RealD diag_mass) {
GridBase* grid = Diagonal.Grid();
int NMAX = getNMAX(Diagonal, 3.*csw_t/diag_mass);
//
// Implementation completely in Daniel's layout
//
// Taylor expansion with Cayley-Hamilton recursion
// underlying Horner scheme as above
std::vector<RealD> cn(NMAX+1);
cn[0] = 1.0;
for (int i=1; i<=NMAX; i++){
cn[i] = cn[i-1] / RealD(i);
}
// Taken over from Daniel's implementation
conformable(Diagonal, Triangle);
long lsites = grid->lSites();
{
typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
typedef iMatrix<ComplexD,6> mat;
autoView(diagonal_v, Diagonal, CpuRead);
autoView(triangle_v, Triangle, CpuRead);
autoView(diagonalExp_v, Diagonal, CpuWrite);
autoView(triangleExp_v, Triangle, CpuWrite);
thread_for(site, lsites, { // NOTE: Not on GPU because of (peek/poke)LocalSite
mat srcCloverOpUL(0.0); // upper left block
mat srcCloverOpLR(0.0); // lower right block
mat ExpCloverOp;
scalar_object_diagonal diagonal_tmp = Zero();
scalar_object_diagonal diagonal_exp_tmp = Zero();
scalar_object_triangle triangle_tmp = Zero();
scalar_object_triangle triangle_exp_tmp = Zero();
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
peekLocalSite(triangle_tmp, triangle_v, lcoor);
int block;
block = 0;
for(int i = 0; i < 6; i++){
for(int j = 0; j < 6; j++){
if (i == j){
srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
}
else{
srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
}
}
}
block = 1;
for(int i = 0; i < 6; i++){
for(int j = 0; j < 6; j++){
if (i == j){
srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
}
else{
srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
}
}
}
// exp(Clover)
ExponentiateHermitean6by6(srcCloverOpUL,1.0/diag_mass,cn,NMAX,ExpCloverOp);
block = 0;
for(int i = 0; i < 6; i++){
for(int j = 0; j < 6; j++){
if (i == j){
diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
}
else if(i < j){
triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
}
}
}
ExponentiateHermitean6by6(srcCloverOpLR,1.0/diag_mass,cn,NMAX,ExpCloverOp);
block = 1;
for(int i = 0; i < 6; i++){
for(int j = 0; j < 6; j++){
if (i == j){
diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
}
else if(i < j){
triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
}
}
}
pokeLocalSite(diagonal_exp_tmp, diagonalExp_v, lcoor);
pokeLocalSite(triangle_exp_tmp, triangleExp_v, lcoor);
});
}
Diagonal *= diag_mass;
Triangle *= diag_mass;
}
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
assert(0);
return lambda;
}
};
NAMESPACE_END(Grid);
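Two pieces of the new CloverHelpers.h above are worth spelling out. First, getNMAX maintains cond = R^{N+1}/(N+1)! as it increments N, so the loop returns the smallest truncation order N whose Lagrange remainder bound falls below the requested precision:

$$ \Bigl|\, e^{x} - \sum_{n=0}^{N} \frac{x^{n}}{n!} \,\Bigr|
   \;\le\; \frac{R^{N+1}}{(N+1)!}\, e^{R} \;\le\; \mathrm{prec},
   \qquad |x| \le R . $$

Second, ExponentiateHermitean6by6 is a Cayley-Hamilton accelerated Horner evaluation: for the 6x6 clover blocks (traceless, as the rescaled clover term is) the characteristic polynomial gives

$$ A^{6} \;=\; p_4 A^{4} + p_3 A^{3} + p_2 A^{2} + p_1 A + p_0\,\mathbb{1}, $$

with the $p_k$ computed from the traces of powers of $A$, so every higher power generated by the Horner recursion folds back into the six accumulators qn[0..5] and the precomputed powers A^2 ... A^5, and A^6 is never formed explicitly.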
@@ -31,6 +31,7 @@
#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
@@ -85,7 +86,7 @@ NAMESPACE_BEGIN(Grid);
// + (2 * 1 + 4 * 1/2) triangle parts = 4 triangle parts = 60 complex words per site
// = 84 complex words per site
template<class Impl>
template<class Impl, class CloverHelpers>
class CompactWilsonCloverFermion : public WilsonFermion<Impl>,
public WilsonCloverHelpers<Impl>,
public CompactWilsonCloverHelpers<Impl> {
+38 -24
View File
@@ -138,38 +138,52 @@ typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
// Clover fermions
typedef WilsonCloverFermion<WilsonImplR> WilsonCloverFermionR;
typedef WilsonCloverFermion<WilsonImplF> WilsonCloverFermionF;
typedef WilsonCloverFermion<WilsonImplD> WilsonCloverFermionD;
template <typename WImpl> using WilsonClover = WilsonCloverFermion<WImpl, CloverHelpers<WImpl>>;
template <typename WImpl> using WilsonExpClover = WilsonCloverFermion<WImpl, ExpCloverHelpers<WImpl>>;
typedef WilsonCloverFermion<WilsonAdjImplR> WilsonCloverAdjFermionR;
typedef WilsonCloverFermion<WilsonAdjImplF> WilsonCloverAdjFermionF;
typedef WilsonCloverFermion<WilsonAdjImplD> WilsonCloverAdjFermionD;
typedef WilsonClover<WilsonImplR> WilsonCloverFermionR;
typedef WilsonClover<WilsonImplF> WilsonCloverFermionF;
typedef WilsonClover<WilsonImplD> WilsonCloverFermionD;
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
typedef WilsonExpClover<WilsonImplR> WilsonExpCloverFermionR;
typedef WilsonExpClover<WilsonImplF> WilsonExpCloverFermionF;
typedef WilsonExpClover<WilsonImplD> WilsonExpCloverFermionD;
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
typedef WilsonClover<WilsonAdjImplR> WilsonCloverAdjFermionR;
typedef WilsonClover<WilsonAdjImplF> WilsonCloverAdjFermionF;
typedef WilsonClover<WilsonAdjImplD> WilsonCloverAdjFermionD;
typedef WilsonClover<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
typedef WilsonClover<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
typedef WilsonClover<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
// Compact Clover fermions
typedef CompactWilsonCloverFermion<WilsonImplR> CompactWilsonCloverFermionR;
typedef CompactWilsonCloverFermion<WilsonImplF> CompactWilsonCloverFermionF;
typedef CompactWilsonCloverFermion<WilsonImplD> CompactWilsonCloverFermionD;
template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
typedef CompactWilsonCloverFermion<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
typedef CompactWilsonCloverFermion<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
typedef CompactWilsonCloverFermion<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
typedef CompactWilsonClover<WilsonImplR> CompactWilsonCloverFermionR;
typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
typedef CompactWilsonExpClover<WilsonImplR> CompactWilsonExpCloverFermionR;
typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
typedef CompactWilsonClover<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
typedef CompactWilsonClover<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
typedef CompactWilsonClover<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
// Domain Wall fermions
typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
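The aliases above select the standard or exponential clover purely through the second (CloverHelpers) template parameter, so every implementation keeps a single fermion class. A stripped-down sketch of that policy-template pattern in plain C++ (illustrative stand-in types, not Grid's actual classes):

#include <iostream>

// Stand-ins for the implementation and clover-policy classes.
template<class Impl, class CloverHelpers>
struct CloverFermionSketch {
  void instantiate() { CloverHelpers::describe(); }
};

template<class Impl> struct StdHelpersSketch { static void describe() { std::cout << "standard clover\n"; } };
template<class Impl> struct ExpHelpersSketch { static void describe() { std::cout << "exponential clover\n"; } };

// Same shape as the WilsonClover / WilsonExpClover aliases above.
template<class Impl> using CloverSketch    = CloverFermionSketch<Impl, StdHelpersSketch<Impl>>;
template<class Impl> using ExpCloverSketch = CloverFermionSketch<Impl, ExpHelpersSketch<Impl>>;

struct ImplTag {}; // stand-in for WilsonImplD etc.

int main() {
  CloverSketch<ImplTag>    a; a.instantiate();   // prints "standard clover"
  ExpCloverSketch<ImplTag> b; b.instantiate();   // prints "exponential clover"
  return 0;
}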
@@ -49,7 +49,7 @@ public:
virtual FermionField &tmp(void) = 0;
virtual void DirichletBlock(Coordinate & _Block) { assert(0); };
virtual void DirichletBlock(const Coordinate & _Block) { assert(0); };
GridBase * Grid(void) { return FermionGrid(); }; // this is all the linalg routines need to know
GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };
@@ -32,6 +32,7 @@
#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
@@ -51,7 +52,7 @@ NAMESPACE_BEGIN(Grid);
// csw_r = csw_t to recover the isotropic version
//////////////////////////////////////////////////////////////////
template <class Impl>
template<class Impl, class CloverHelpers>
class WilsonCloverFermion : public WilsonFermion<Impl>,
public WilsonCloverHelpers<Impl>
{
@@ -209,6 +209,8 @@ public:
};
////////////////////////////////////////////////////////
template<class Impl> class CompactWilsonCloverHelpers {
public:
@@ -47,8 +47,6 @@ class CompactWilsonCloverTypes {
public:
INHERIT_IMPL_TYPES(Impl);
static_assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3, "Wrong dimensions");
static constexpr int Nred = Nc * Nhs; // 6
static constexpr int Nblock = Nhs; // 2
static constexpr int Ndiagonal = Nred; // 6
@@ -297,7 +297,7 @@ public:
void ZeroCountersi(void) { }
void Reporti(int calls) { }
std::vector<int> surface_list;
// Vector<int> surface_list;
WilsonStencil(GridBase *grid,
int npoints,
@@ -307,10 +307,11 @@ public:
: CartesianStencil<vobj,cobj,Parameters> (grid,npoints,checkerboard,directions,distances,p)
{
ZeroCountersi();
surface_list.resize(0);
// surface_list.resize(0);
this->same_node.resize(npoints);
};
/*
void BuildSurfaceList(int Ls,int vol4){
// find same node for SHM
@@ -331,7 +332,8 @@ public:
}
}
}
*/
template < class compressor>
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
{
@@ -178,16 +178,8 @@ public:
GridRedBlackCartesian &FourDimRedBlackGrid,
double _M5,const ImplParams &p= ImplParams());
virtual void DirichletBlock(Coordinate & block)
virtual void DirichletBlock(const Coordinate & block)
{
assert(block.size()==Nd+1);
if ( block[0] || block[1] || block[2] || block[3] || block[4] ){
Dirichlet = 1;
Block = block;
Stencil.DirichletBlock(block);
StencilEven.DirichletBlock(block);
StencilOdd.DirichletBlock(block);
}
}
// Constructors
/*
@@ -47,7 +47,7 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
FiveDimRedBlackGrid,
FourDimGrid,
FourDimRedBlackGrid,_M5,p),
mass(_mass)
mass_plus(_mass), mass_minus(_mass)
{
}
@@ -209,8 +209,8 @@ void CayleyFermion5D<Impl>::M5D (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
Vector<Coeff_t> diag (Ls,1.0);
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass;
Vector<Coeff_t> lower(Ls,-1.0); lower[0] =mass;
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
Vector<Coeff_t> lower(Ls,-1.0); lower[0] =mass_plus;
M5D(psi,chi,chi,lower,diag,upper);
}
template<class Impl>
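The M5D coefficients above describe an Ls x Ls structure with unit diagonal, -1 on the off-diagonals, and the quark mass entering only the two corner couplings; with the split masses the s=Ls-1 corner carries mass_minus and the s=0 corner carries mass_plus. A standalone sketch of how those vectors are filled, with values taken from the hunk above and illustrative parameters:

#include <vector>
#include <cstdio>

int main() {
  const int Ls = 8;                                   // illustrative extent
  const double mass_plus = 0.01, mass_minus = 0.01;   // illustrative masses
  std::vector<double> diag (Ls,  1.0);
  std::vector<double> upper(Ls, -1.0); upper[Ls-1] = mass_minus; // corner coupling s=Ls-1 -> s=0
  std::vector<double> lower(Ls, -1.0); lower[0]    = mass_plus;  // corner coupling s=0 -> s=Ls-1
  for (int s = 0; s < Ls; s++)
    std::printf("s=%d diag=% .3f upper=% .3f lower=% .3f\n", s, diag[s], upper[s], lower[s]);
  return 0;
}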
@@ -220,8 +220,8 @@ void CayleyFermion5D<Impl>::Meooe5D (const FermionField &psi, FermionField &D
Vector<Coeff_t> diag = bs;
Vector<Coeff_t> upper= cs;
Vector<Coeff_t> lower= cs;
upper[Ls-1]=-mass*upper[Ls-1];
lower[0] =-mass*lower[0];
upper[Ls-1]=-mass_minus*upper[Ls-1];
lower[0] =-mass_plus*lower[0];
M5D(psi,psi,Din,lower,diag,upper);
}
// FIXME Redundant with the above routine; check this and eliminate
@@ -235,8 +235,8 @@ template<class Impl> void CayleyFermion5D<Impl>::Meo5D (const FermionField &
upper[i]=-ceo[i];
lower[i]=-ceo[i];
}
upper[Ls-1]=-mass*upper[Ls-1];
lower[0] =-mass*lower[0];
upper[Ls-1]=-mass_minus*upper[Ls-1];
lower[0] =-mass_plus*lower[0];
M5D(psi,psi,chi,lower,diag,upper);
}
template<class Impl>
@@ -250,8 +250,8 @@ void CayleyFermion5D<Impl>::Mooee (const FermionField &psi, FermionField &
upper[i]=-cee[i];
lower[i]=-cee[i];
}
upper[Ls-1]=-mass*upper[Ls-1];
lower[0] =-mass*lower[0];
upper[Ls-1]=-mass_minus*upper[Ls-1];
lower[0] =-mass_plus*lower[0];
M5D(psi,psi,chi,lower,diag,upper);
}
template<class Impl>
@@ -266,9 +266,9 @@ void CayleyFermion5D<Impl>::MooeeDag (const FermionField &psi, FermionField &
// Assemble the 5d matrix
if ( s==0 ) {
upper[s] = -cee[s+1] ;
lower[s] = mass*cee[Ls-1];
lower[s] = mass_minus*cee[Ls-1];
} else if ( s==(Ls-1)) {
upper[s] = mass*cee[0];
upper[s] = mass_plus*cee[0];
lower[s] = -cee[s-1];
} else {
upper[s]=-cee[s+1];
@@ -291,8 +291,8 @@ void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
Vector<Coeff_t> diag(Ls,1.0);
Vector<Coeff_t> upper(Ls,-1.0);
Vector<Coeff_t> lower(Ls,-1.0);
upper[Ls-1]=-mass*upper[Ls-1];
lower[0] =-mass*lower[0];
upper[Ls-1]=-mass_plus*upper[Ls-1];
lower[0] =-mass_minus*lower[0];
M5Ddag(psi,chi,chi,lower,diag,upper);
}
@@ -307,9 +307,9 @@ void CayleyFermion5D<Impl>::MeooeDag5D (const FermionField &psi, FermionField
for (int s=0;s<Ls;s++){
if ( s== 0 ) {
upper[s] = cs[s+1];
lower[s] =-mass*cs[Ls-1];
lower[s] =-mass_minus*cs[Ls-1];
} else if ( s==(Ls-1) ) {
upper[s] =-mass*cs[0];
upper[s] =-mass_plus*cs[0];
lower[s] = cs[s-1];
} else {
upper[s] = cs[s+1];
@@ -552,7 +552,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
leem[i]=mass*cee[Ls-1]/bee[0];
leem[i]=mass_minus*cee[Ls-1]/bee[0];
for(int j=0;j<i;j++) {
assert(bee[j+1]!=Coeff_t(0.0));
leem[i]*= aee[j]/bee[j+1];
@@ -560,7 +560,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
uee[i] =-aee[i]/bee[i]; // up-diag entry on the ith row
ueem[i]=mass;
ueem[i]=mass_plus;
for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
ueem[i]*= aee[0]/bee[0];
@@ -573,7 +573,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
}
{
Coeff_t delta_d=mass*cee[Ls-1];
Coeff_t delta_d=mass_minus*cee[Ls-1];
for(int j=0;j<Ls-1;j++) {
assert(bee[j] != Coeff_t(0.0));
delta_d *= cee[j]/bee[j];
@@ -642,6 +642,10 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
Current curr_type,
unsigned int mu)
{
assert(mass_plus == mass_minus);
RealD mass = mass_plus;
#if (!defined(GRID_HIP))
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
@@ -777,6 +781,8 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
assert(mu>=0);
assert(mu<Nd);
assert(mass_plus == mass_minus);
RealD mass = mass_plus;
#if 0
int tshift = (mu == Nd-1) ? 1 : 0;
@@ -66,18 +66,17 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
M5Dcalls++;
M5Dtime-=usecond();
uint64_t nloop = grid->oSites()/Ls;
uint64_t nloop = grid->oSites();
accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss= sss*Ls;
uint64_t s = sss%Ls;
uint64_t ss= sss-s;
typedef decltype(coalescedRead(psi[0])) spinor;
spinor tmp1, tmp2;
for(int s=0;s<Ls;s++){
uint64_t idx_u = ss+((s+1)%Ls);
uint64_t idx_l = ss+((s+Ls-1)%Ls);
spProj5m(tmp1,psi(idx_u));
spProj5p(tmp2,psi(idx_l));
coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
}
uint64_t idx_u = ss+((s+1)%Ls);
uint64_t idx_l = ss+((s+Ls-1)%Ls);
spProj5m(tmp1,psi(idx_u));
spProj5p(tmp2,psi(idx_l));
coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
});
M5Dtime+=usecond();
}
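The rewritten kernel above flattens the earlier "one thread per 4d site with an inner loop over s" structure into one thread per (site, s) pair: the thread index sss is decomposed as s = sss % Ls and ss = sss - s, and the fifth-dimension neighbours are the cyclic indices ss + (s±1 mod Ls). A small standalone check of that decomposition (plain C++, not the accelerator kernel):

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Ls = 8, nsites4d = 16;      // illustrative sizes
  const uint64_t nloop = nsites4d * Ls;      // grid->oSites() in the hunk above
  for (uint64_t sss = 0; sss < nloop; sss++) {
    uint64_t s  = sss % Ls;                    // position in the 5th dimension
    uint64_t ss = sss - s;                     // base index of this 4d site's Ls-block
    uint64_t idx_u = ss + ((s + 1) % Ls);      // neighbour s+1 (cyclic)
    uint64_t idx_l = ss + ((s + Ls - 1) % Ls); // neighbour s-1 (cyclic)
    assert(ss % Ls == 0);
    assert(idx_u / Ls == sss / Ls && idx_l / Ls == sss / Ls); // neighbours stay in the same block
  }
  return 0;
}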
@@ -108,18 +107,17 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
M5Dcalls++;
M5Dtime-=usecond();
uint64_t nloop = grid->oSites()/Ls;
uint64_t nloop = grid->oSites();
accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls;
uint64_t s = sss%Ls;
uint64_t ss= sss-s;
typedef decltype(coalescedRead(psi[0])) spinor;
spinor tmp1,tmp2;
for(int s=0;s<Ls;s++){
uint64_t idx_u = ss+((s+1)%Ls);
uint64_t idx_l = ss+((s+Ls-1)%Ls);
spProj5p(tmp1,psi(idx_u));
spProj5m(tmp2,psi(idx_l));
coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
}
uint64_t idx_u = ss+((s+1)%Ls);
uint64_t idx_l = ss+((s+Ls-1)%Ls);
spProj5p(tmp1,psi(idx_u));
spProj5m(tmp2,psi(idx_l));
coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
});
M5Dtime+=usecond();
}
@@ -32,17 +32,18 @@
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
NAMESPACE_BEGIN(Grid);
template<class Impl>
CompactWilsonCloverFermion<Impl>::CompactWilsonCloverFermion(GaugeField& _Umu,
GridCartesian& Fgrid,
GridRedBlackCartesian& Hgrid,
const RealD _mass,
const RealD _csw_r,
const RealD _csw_t,
const RealD _cF,
const WilsonAnisotropyCoefficients& clover_anisotropy,
const ImplParams& impl_p)
template<class Impl, class CloverHelpers>
CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(GaugeField& _Umu,
GridCartesian& Fgrid,
GridRedBlackCartesian& Hgrid,
const RealD _mass,
const RealD _csw_r,
const RealD _csw_t,
const RealD _cF,
const WilsonAnisotropyCoefficients& clover_anisotropy,
const ImplParams& impl_p)
: WilsonBase(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
, csw_r(_csw_r)
, csw_t(_csw_t)
@@ -58,50 +59,55 @@ CompactWilsonCloverFermion<Impl>::CompactWilsonCloverFermion(GaugeField& _Umu,
, BoundaryMask(&Fgrid)
, BoundaryMaskEven(&Hgrid), BoundaryMaskOdd(&Hgrid)
{
assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);
csw_r *= 0.5;
csw_t *= 0.5;
if (clover_anisotropy.isAnisotropic)
csw_r /= clover_anisotropy.xi_0;
ImportGauge(_Umu);
if (open_boundaries)
if (open_boundaries) {
this->BoundaryMaskEven.Checkerboard() = Even;
this->BoundaryMaskOdd.Checkerboard() = Odd;
CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
}
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::Dhop(const FermionField& in, FermionField& out, int dag) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
WilsonBase::Dhop(in, out, dag);
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::DhopOE(const FermionField& in, FermionField& out, int dag) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
WilsonBase::DhopOE(in, out, dag);
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::DhopEO(const FermionField& in, FermionField& out, int dag) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
WilsonBase::DhopEO(in, out, dag);
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
WilsonBase::DhopDir(in, out, dir, disp);
if(this->open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
WilsonBase::DhopDirAll(in, out);
if(this->open_boundaries) {
for(auto& o : out) ApplyBoundaryMask(o);
}
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::M(const FermionField& in, FermionField& out) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField& in, FermionField& out) {
out.Checkerboard() = in.Checkerboard();
WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
Mooee(in, Tmp);
@@ -109,8 +115,8 @@ void CompactWilsonCloverFermion<Impl>::M(const FermionField& in, FermionField& o
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::Mdag(const FermionField& in, FermionField& out) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField& in, FermionField& out) {
out.Checkerboard() = in.Checkerboard();
WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc
MooeeDag(in, Tmp);
@@ -118,20 +124,20 @@ void CompactWilsonCloverFermion<Impl>::Mdag(const FermionField& in, FermionField
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::Meooe(const FermionField& in, FermionField& out) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
WilsonBase::Meooe(in, out);
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MeooeDag(const FermionField& in, FermionField& out) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
WilsonBase::MeooeDag(in, out);
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::Mooee(const FermionField& in, FermionField& out) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField& in, FermionField& out) {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
@@ -144,13 +150,13 @@ void CompactWilsonCloverFermion<Impl>::Mooee(const FermionField& in, FermionFiel
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MooeeDag(const FermionField& in, FermionField& out) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeDag(const FermionField& in, FermionField& out) {
Mooee(in, out); // blocks are hermitian
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MooeeInv(const FermionField& in, FermionField& out) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionField& in, FermionField& out) {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
@@ -163,23 +169,23 @@ void CompactWilsonCloverFermion<Impl>::MooeeInv(const FermionField& in, FermionF
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MooeeInvDag(const FermionField& in, FermionField& out) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInvDag(const FermionField& in, FermionField& out) {
MooeeInv(in, out); // blocks are hermitian
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
DhopDir(in, out, dir, disp);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
DhopDirAll(in, out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
assert(!open_boundaries); // TODO check for changes required for open bc
// NOTE: code copied from original clover term
@@ -251,7 +257,7 @@ void CompactWilsonCloverFermion<Impl>::MDeriv(GaugeField& force, const FermionFi
}
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu); // checked
force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
count++;
}
@@ -261,18 +267,18 @@ void CompactWilsonCloverFermion<Impl>::MDeriv(GaugeField& force, const FermionFi
force += clover_force;
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
assert(0);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
assert(0);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MooeeInternal(const FermionField& in,
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle) {
@@ -285,8 +291,8 @@ void CompactWilsonCloverFermion<Impl>::MooeeInternal(const FermionField&
CompactHelpers::MooeeKernel(diagonal.oSites(), 1, in, out, diagonal, triangle);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::ImportGauge(const GaugeField& _Umu) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField& _Umu) {
// NOTE: parts copied from original implementation
// Import gauge into base class
@@ -318,22 +324,27 @@ void CompactWilsonCloverFermion<Impl>::ImportGauge(const GaugeField& _Umu) {
TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
TmpOriginal += this->diag_mass;
// Handle mass term based on clover policy
CloverHelpers::MassTerm(TmpOriginal, this->diag_mass);
// Convert the data layout of the clover term
double t4 = usecond();
CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
// Possibly modify the boundary values
// Exponentiate the clover (nothing happens in case of the standard clover)
double t5 = usecond();
CloverHelpers::Exponentiate_Clover(Diagonal, Triangle, csw_t, this->diag_mass);
// Possibly modify the boundary values
double t6 = usecond();
if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
// Invert the clover term in the improved layout
double t6 = usecond();
// Invert the Clover term (explicit inversion needed for the improvement in case of open boundary conditions)
double t7 = usecond();
CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv);
// Fill the remaining clover fields
double t7 = usecond();
double t8 = usecond();
pickCheckerboard(Even, DiagonalEven, Diagonal);
pickCheckerboard(Even, TriangleEven, Triangle);
pickCheckerboard(Odd, DiagonalOdd, Diagonal);
@@ -344,20 +355,19 @@ void CompactWilsonCloverFermion<Impl>::ImportGauge(const GaugeField& _Umu) {
pickCheckerboard(Odd, TriangleInvOdd, TriangleInv);
// Report timings
double t8 = usecond();
#if 0
std::cout << GridLogMessage << "CompactWilsonCloverFermion::ImportGauge timings:"
<< " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6
<< ", allocations = " << (t2 - t1) / 1e6
<< ", field strength = " << (t3 - t2) / 1e6
<< ", fill clover = " << (t4 - t3) / 1e6
<< ", convert = " << (t5 - t4) / 1e6
<< ", boundaries = " << (t6 - t5) / 1e6
<< ", inversions = " << (t7 - t6) / 1e6
<< ", pick cbs = " << (t8 - t7) / 1e6
<< ", total = " << (t8 - t0) / 1e6
<< std::endl;
#endif
double t9 = usecond();
std::cout << GridLogDebug << "CompactWilsonCloverFermion::ImportGauge timings:" << std::endl;
std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl;
std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl;
std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl;
std::cout << GridLogDebug << "convert = " << (t5 - t4) / 1e6 << std::endl;
std::cout << GridLogDebug << "exponentiation = " << (t6 - t5) / 1e6 << std::endl;
std::cout << GridLogDebug << "boundaries = " << (t7 - t6) / 1e6 << std::endl;
std::cout << GridLogDebug << "inversions = " << (t8 - t7) / 1e6 << std::endl;
std::cout << GridLogDebug << "pick cbs = " << (t9 - t8) / 1e6 << std::endl;
std::cout << GridLogDebug << "total = " << (t9 - t0) / 1e6 << std::endl;
}
NAMESPACE_END(Grid);
@@ -34,8 +34,8 @@
NAMESPACE_BEGIN(Grid);
template<class Impl>
WilsonCloverFermion<Impl>::WilsonCloverFermion(GaugeField& _Umu,
template<class Impl, class CloverHelpers>
WilsonCloverFermion<Impl, CloverHelpers>::WilsonCloverFermion(GaugeField& _Umu,
GridCartesian& Fgrid,
GridRedBlackCartesian& Hgrid,
const RealD _mass,
@@ -74,8 +74,8 @@ WilsonCloverFermion<Impl>::WilsonCloverFermion(GaugeField&
}
// *NOT* EO
template <class Impl>
void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField &in, FermionField &out)
{
FermionField temp(out.Grid());
@@ -89,8 +89,8 @@ void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
out += temp;
}
template <class Impl>
void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField &in, FermionField &out)
{
FermionField temp(out.Grid());
@@ -104,8 +104,8 @@ void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
out += temp;
}
template <class Impl>
void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField &_Umu)
{
double t0 = usecond();
WilsonFermion<Impl>::ImportGauge(_Umu);
@@ -131,47 +131,11 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
CloverTerm += Helpers::fillCloverXT(Ex) * csw_t;
CloverTerm += Helpers::fillCloverYT(Ey) * csw_t;
CloverTerm += Helpers::fillCloverZT(Ez) * csw_t;
CloverTerm += diag_mass;
double t4 = usecond();
int lvol = _Umu.Grid()->lSites();
int DimRep = Impl::Dimension;
CloverHelpers::Instantiate(CloverTerm, CloverTermInv, csw_t, this->diag_mass);
double t5 = usecond();
{
autoView(CTv,CloverTerm,CpuRead);
autoView(CTIv,CloverTermInv,CpuWrite);
thread_for(site, lvol, {
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
typename SiteClover::scalar_object Qx = Zero(), Qxinv = Zero();
peekLocalSite(Qx, CTv, lcoor);
//if (csw!=0){
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++){
auto zz = Qx()(j, k)(a, b);
EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
}
// if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
EigenInvCloverOp = EigenCloverOp.inverse();
//std::cout << EigenInvCloverOp << std::endl;
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++)
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
// }
pokeLocalSite(Qxinv, CTIv, lcoor);
});
}
double t6 = usecond();
// Separate the even and odd parts
pickCheckerboard(Even, CloverTermEven, CloverTerm);
pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
@@ -184,48 +148,44 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
double t7 = usecond();
double t6 = usecond();
#if 0
std::cout << GridLogMessage << "WilsonCloverFermion::ImportGauge timings:"
<< " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6
<< ", allocations = " << (t2 - t1) / 1e6
<< ", field strength = " << (t3 - t2) / 1e6
<< ", fill clover = " << (t4 - t3) / 1e6
<< ", misc = " << (t5 - t4) / 1e6
<< ", inversions = " << (t6 - t5) / 1e6
<< ", pick cbs = " << (t7 - t6) / 1e6
<< ", total = " << (t7 - t0) / 1e6
<< std::endl;
#endif
std::cout << GridLogDebug << "WilsonCloverFermion::ImportGauge timings:" << std::endl;
std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl;
std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl;
std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl;
std::cout << GridLogDebug << "instantiation = " << (t5 - t4) / 1e6 << std::endl;
std::cout << GridLogDebug << "pick cbs = " << (t6 - t5) / 1e6 << std::endl;
std::cout << GridLogDebug << "total = " << (t6 - t0) / 1e6 << std::endl;
}
template <class Impl>
void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerNo, InverseNo);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MooeeDag(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerYes, InverseNo);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerNo, InverseYes);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInvDag(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerYes, InverseYes);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
{
out.Checkerboard() = in.Checkerboard();
CloverField *Clover;
@@ -278,8 +238,8 @@ void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionFie
} // MooeeInternal
// Derivative parts unpreconditioned pseudofermions
template <class Impl>
void WilsonCloverFermion<Impl>::MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
{
conformable(X.Grid(), Y.Grid());
conformable(X.Grid(), force.Grid());
@@ -349,7 +309,7 @@ void WilsonCloverFermion<Impl>::MDeriv(GaugeField &force, const FermionField &X,
}
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu); // checked
force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
count++;
}
@@ -360,15 +320,15 @@ void WilsonCloverFermion<Impl>::MDeriv(GaugeField &force, const FermionField &X,
}
// Derivative parts
template <class Impl>
void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
{
assert(0);
}
// Derivative parts
template <class Impl>
void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
{
assert(0); // not implemented yet
}
@@ -92,6 +92,19 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
assert(FourDimRedBlackGrid._simd_layout[d] ==FourDimGrid._simd_layout[d]);
}
if ( p.dirichlet.size() == Nd+1) {
Coordinate block = p.dirichlet;
if ( block[0] || block[1] || block[2] || block[3] || block[4] ){
Dirichlet = 1;
Block = block;
}
} else {
Coordinate block(Nd+1,0);
Block = block;
}
ZeroCounters();
if (Impl::LsVectorised) {
int nsimd = Simd::Nsimd();
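The constructor hunk above enables Dirichlet boundaries only when the implementation parameters carry a block coordinate of size Nd+1 with at least one non-zero entry; otherwise the block defaults to all zeros. A minimal sketch of that decision logic, with std::vector<int> standing in for the Coordinate type:

#include <vector>

constexpr int Nd = 4;

// Mirrors the size and non-zero checks in the WilsonFermion5D constructor hunk above.
bool dirichlet_enabled(const std::vector<int>& dirichlet_param) {
  if (dirichlet_param.size() != Nd + 1) return false;
  for (int d = 0; d < Nd + 1; d++)
    if (dirichlet_param[d]) return true;
  return false;
}

int main() {
  std::vector<int> none;                           // wrong size -> no Dirichlet block
  std::vector<int> block = {0, 16, 16, 16, 16};    // illustrative block sizes
  bool a = dirichlet_enabled(none);                // false
  bool b = dirichlet_enabled(block);               // true
  return (!a && b) ? 0 : 1;
}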
@@ -4,12 +4,13 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonFermion.cc
Copyright (C) 2015
Copyright (C) 2022
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Fabian Joswig <fabian.joswig@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -599,11 +600,47 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
Current curr_type,
unsigned int mu)
{
if(curr_type != Current::Vector)
{
std::cout << GridLogError << "Only the conserved vector current is implemented so far." << std::endl;
exit(1);
}
Gamma g5(Gamma::Algebra::Gamma5);
conformable(_grid, q_in_1.Grid());
conformable(_grid, q_in_2.Grid());
conformable(_grid, q_out.Grid());
assert(0);
auto UGrid= this->GaugeGrid();
PropagatorField tmp_shifted(UGrid);
PropagatorField g5Lg5(UGrid);
PropagatorField R(UGrid);
PropagatorField gmuR(UGrid);
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT,
};
Gamma gmu=Gamma(Gmu[mu]);
g5Lg5=g5*q_in_1*g5;
tmp_shifted=Cshift(q_in_2,mu,1);
Impl::multLinkField(R,this->Umu,tmp_shifted,mu);
gmuR=gmu*R;
q_out=adj(g5Lg5)*R;
q_out-=adj(g5Lg5)*gmuR;
tmp_shifted=Cshift(q_in_1,mu,1);
Impl::multLinkField(g5Lg5,this->Umu,tmp_shifted,mu);
g5Lg5=g5*g5Lg5*g5;
R=q_in_2;
gmuR=gmu*R;
q_out-=adj(g5Lg5)*R;
q_out-=adj(g5Lg5)*gmuR;
}
@@ -617,9 +654,51 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
unsigned int tmax,
ComplexField &lattice_cmplx)
{
if(curr_type != Current::Vector)
{
std::cout << GridLogError << "Only the conserved vector current is implemented so far." << std::endl;
exit(1);
}
int tshift = (mu == Nd-1) ? 1 : 0;
unsigned int LLt = GridDefaultLatt()[Tp];
conformable(_grid, q_in.Grid());
conformable(_grid, q_out.Grid());
assert(0);
auto UGrid= this->GaugeGrid();
PropagatorField tmp(UGrid);
PropagatorField Utmp(UGrid);
PropagatorField L(UGrid);
PropagatorField zz (UGrid);
zz=Zero();
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT,
};
Gamma gmu=Gamma(Gmu[mu]);
tmp = Cshift(q_in,mu,1);
Impl::multLinkField(Utmp,this->Umu,tmp,mu);
tmp = ( Utmp*lattice_cmplx - gmu*Utmp*lattice_cmplx ); // Forward hop
tmp = where((lcoor>=tmin),tmp,zz); // Mask the time
q_out = where((lcoor<=tmax),tmp,zz); // Position of current complicated
tmp = q_in *lattice_cmplx;
tmp = Cshift(tmp,mu,-1);
Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
tmp = -( Utmp + gmu*Utmp );
// Mask the time
if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
unsigned int t0 = 0;
tmp = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
} else {
tmp = where((lcoor>=tmin+tshift),tmp,zz);
}
q_out+= where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
}
NAMESPACE_END(Grid);
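The backward-hop term above is masked onto the time window [tmin+tshift, tmax+tshift], and when tmax+tshift runs past the last timeslice the mask is extended to include timeslice 0 (the "quick fix" branch in the hunk). A standalone sketch of that mask on a 1d array of timeslices (illustrative parameters):

#include <vector>
#include <cstdio>

int main() {
  const unsigned LLt = 8;                   // global time extent
  const unsigned tmin = 5, tmax = LLt - 1;  // illustrative window
  const int tshift = 1;                     // temporal current shifts by one slice

  std::vector<int> mask(LLt, 0);
  for (unsigned t = 0; t < LLt; t++) {
    bool in;
    if (tmax == LLt - 1 && tshift == 1)     // window wraps: also keep t = 0
      in = (t == 0) || (t >= tmin + tshift);
    else
      in = (t >= tmin + tshift) && (t <= tmax + tshift);
    mask[t] = in ? 1 : 0;
  }
  for (unsigned t = 0; t < LLt; t++) std::printf("t=%u mask=%d\n", t, mask[t]);
  return 0;
}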
@@ -440,6 +440,17 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
#define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
#define KERNEL_CALL_EXT(A) \
const uint64_t NN = Nsite*Ls; \
const uint64_t sz = st.surface_list.size(); \
auto ptr = &st.surface_list[0]; \
accelerator_forNB( ss, sz, Simd::Nsimd(), { \
int sF = ptr[ss]; \
int sU = ss/Ls; \
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
}); \
accelerator_barrier();
#define ASM_CALL(A) \
thread_for( ss, Nsite, { \
int sU = ss; \
@@ -9,6 +9,7 @@
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Mattia Bruno <mattia.bruno@cern.ch>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -32,10 +33,12 @@
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
#include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class CompactWilsonCloverFermion<IMPLEMENTATION>;
template class CompactWilsonCloverFermion<IMPLEMENTATION, CompactCloverHelpers<IMPLEMENTATION>>;
template class CompactWilsonCloverFermion<IMPLEMENTATION, CompactExpCloverHelpers<IMPLEMENTATION>>;
NAMESPACE_END(Grid);
@@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);
@@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master
@@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);
@@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master
@@ -8,7 +8,8 @@
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Mattia Bruno <mattia.bruno@cern.ch>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
@@ -31,10 +32,12 @@
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h>
#include <Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonCloverFermion<IMPLEMENTATION>;
template class WilsonCloverFermion<IMPLEMENTATION, CloverHelpers<IMPLEMENTATION>>;
template class WilsonCloverFermion<IMPLEMENTATION, ExpCloverHelpers<IMPLEMENTATION>>;
NAMESPACE_END(Grid);
@@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);
@@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master
@@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);
@@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master
@@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);
@@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master
@@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);
@@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master
@@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);
@@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master
@@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);
@@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master
@@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);
@@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master
@@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);
@@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master
@@ -18,6 +18,10 @@ WILSON_IMPL_LIST=" \
GparityWilsonImplF \
GparityWilsonImplD "
COMPACT_WILSON_IMPL_LIST=" \
WilsonImplF \
WilsonImplD "
DWF_IMPL_LIST=" \
WilsonImplF \
WilsonImplD \
@@ -40,13 +44,23 @@ EOF
done
CC_LIST="WilsonCloverFermionInstantiation CompactWilsonCloverFermionInstantiation WilsonFermionInstantiation WilsonKernelsInstantiation WilsonTMFermionInstantiation"
CC_LIST="WilsonCloverFermionInstantiation WilsonFermionInstantiation WilsonKernelsInstantiation WilsonTMFermionInstantiation"
for impl in $WILSON_IMPL_LIST
do
for f in $CC_LIST
do
ln -f -s ../$f.cc.master $impl/$f$impl.cc
ln -f -s ../$f.cc.master $impl/$f$impl.cc
done
done
CC_LIST="CompactWilsonCloverFermionInstantiation"
for impl in $COMPACT_WILSON_IMPL_LIST
do
for f in $CC_LIST
do
ln -f -s ../$f.cc.master $impl/$f$impl.cc
done
done
@@ -63,14 +77,14 @@ for impl in $DWF_IMPL_LIST $GDWF_IMPL_LIST
do
for f in $CC_LIST
do
ln -f -s ../$f.cc.master $impl/$f$impl.cc
ln -f -s ../$f.cc.master $impl/$f$impl.cc
done
done
# overwrite the .cc file in Gparity directories
for impl in $GDWF_IMPL_LIST
do
ln -f -s ../WilsonKernelsInstantiationGparity.cc.master $impl/WilsonKernelsInstantiation$impl.cc
ln -f -s ../WilsonKernelsInstantiationGparity.cc.master $impl/WilsonKernelsInstantiation$impl.cc
done
@@ -84,7 +98,7 @@ for impl in $STAG_IMPL_LIST
do
for f in $CC_LIST
do
ln -f -s ../$f.cc.master $impl/$f$impl.cc
ln -f -s ../$f.cc.master $impl/$f$impl.cc
done
done
+2 -2
@@ -53,9 +53,9 @@ struct DirichletFilter: public MomentumFilterBase<MomentaField>
LatticeInteger coor(grid);
LatCM zz(grid); zz = Zero();
for(int mu=0;mu<Nd;mu++) {
if ( (Block[mu]) && (Block[mu] < grid->GlobalDimensions()[mu] ) ) {
if ( (Block[mu]) && (Block[mu] <= grid->GlobalDimensions()[mu] ) ) {
// If costly could provide Grid earlier and precompute masks
std::cout << " Dirichlet in mu="<<mu<<std::endl;
std::cout << GridLogMessage << " Dirichlet in mu="<<mu<<std::endl;
LatticeCoordinate(coor,mu);
auto P_mu = PeekIndex<LorentzIndex>(P, mu);
P_mu = where(mod(coor,Block[mu])==Integer(Block[mu]-1),zz,P_mu);
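A minimal usage sketch of the filter above, assuming DirichletFilter is constructed from the block Coordinate (the construction and field names here are illustrative, not taken from this changeset):
// Hypothetical usage sketch
Coordinate Block({0,0,0,8});                       // Dirichlet blocking in the time direction only
DirichletFilter<LatticeGaugeField> filter(Block);  // constructor assumed to take the block Coordinate
filter.applyFilter(P);                             // zeroes P_mu on slices with coor % 8 == 7, so links crossing a block boundary never evolve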
@@ -75,16 +75,14 @@ NAMESPACE_BEGIN(Grid);
remez.generateApprox(param.degree,1,2);
PowerHalf.Init(remez,param.tolerance,false);
PowerNegHalf.Init(remez,param.tolerance,true);
MDPowerNegHalf.Init(remez,param.mdtolerance,true);
// MdagM^(+- 1/4)
std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
remez.generateApprox(param.degree,1,4);
PowerQuarter.Init(remez,param.tolerance,false);
PowerNegQuarter.Init(remez,param.tolerance,true);
// Derive solves different tol
MDPowerQuarter.Init(remez,param.mdtolerance,false);
MDPowerNegHalf.Init(remez,param.mdtolerance,true);
PowerNegQuarter.Init(remez,param.tolerance,true);
};
virtual std::string action_name(){return "OneFlavourRatioRationalPseudoFermionAction";}
+1 -1
@@ -145,7 +145,7 @@ protected:
MomFilter->applyFilter(force);
std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<< std::endl;
// DumpSliceNorm("force ",force,Nd-1);
DumpSliceNorm("force ",force,Nd-1);
Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm. nb. norm2(latt) = \sum_x norm2(latt[x])
Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;
+1 -1
@@ -99,7 +99,7 @@ public:
// using wilson flow by default here
WilsonFlow<PeriodicGimplR> WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval);
WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau);
Real T0 = WF.energyDensityPlaquette(Usmear);
Real T0 = WF.energyDensityPlaquette(Pars.Smearing.maxTau, Usmear);
std::cout << GridLogMessage << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
<< "T0 : [ " << traj << " ] "<< T0 << std::endl;
}
+147 -53
@@ -7,6 +7,7 @@ Source file: ./lib/qcd/modules/plaquette.h
Copyright (C) 2017
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Christopher Kelly <ckelly@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -33,28 +34,44 @@ NAMESPACE_BEGIN(Grid);
template <class Gimpl>
class WilsonFlow: public Smear<Gimpl>{
public:
//Store generic measurements to take during smearing process using std::function
typedef std::function<void(int, RealD, const typename Gimpl::GaugeField &)> FunctionType; //int: step, RealD: flow time, GaugeField : the gauge field
private:
unsigned int Nstep;
unsigned int measure_interval;
mutable RealD epsilon, taus;
RealD epsilon; //for regular smearing this is the time step, for adaptive it is the initial time step
std::vector< std::pair<int, FunctionType> > functions; //The int maps to the measurement frequency
mutable WilsonGaugeAction<Gimpl> SG;
void evolve_step(typename Gimpl::GaugeField&) const;
void evolve_step_adaptive(typename Gimpl::GaugeField&, RealD);
RealD tau(unsigned int t)const {return epsilon*(t+1.0); }
//Evolve the gauge field by 1 step and update tau
void evolve_step(typename Gimpl::GaugeField &U, RealD &tau) const;
//Evolve the gauge field by 1 step and update tau and the current time step eps
void evolve_step_adaptive(typename Gimpl::GaugeField&U, RealD &tau, RealD &eps, RealD maxTau) const;
public:
INHERIT_GIMPL_TYPES(Gimpl)
void resetActions(){ functions.clear(); }
void addMeasurement(int meas_interval, FunctionType meas){ functions.push_back({meas_interval, meas}); }
//Set the class to perform the default measurements:
//the plaquette energy density every step
//the plaquette topological charge every 'topq_meas_interval' steps
//and output to stdout
void setDefaultMeasurements(int topq_meas_interval = 1);
explicit WilsonFlow(unsigned int Nstep, RealD epsilon, unsigned int interval = 1):
Nstep(Nstep),
epsilon(epsilon),
measure_interval(interval),
SG(WilsonGaugeAction<Gimpl>(3.0)) {
// WilsonGaugeAction with beta 3.0
assert(epsilon > 0.0);
LogMessage();
setDefaultMeasurements(interval);
}
void LogMessage() {
@@ -73,9 +90,29 @@ public:
// undefined for WilsonFlow
}
void smear_adaptive(GaugeField&, const GaugeField&, RealD maxTau);
RealD energyDensityPlaquette(unsigned int step, const GaugeField& U) const;
RealD energyDensityPlaquette(const GaugeField& U) const;
void smear_adaptive(GaugeField&, const GaugeField&, RealD maxTau) const;
//Compute t^2 <E(t)> for time t from the plaquette
static RealD energyDensityPlaquette(const RealD t, const GaugeField& U);
//Compute t^2 <E(t)> for time t from the 1x1 cloverleaf form
//t is the Wilson flow time
static RealD energyDensityCloverleaf(const RealD t, const GaugeField& U);
//Evolve the gauge field by Nstep steps of epsilon and return the energy density computed every interval steps
//The smeared field is output as V
std::vector<RealD> flowMeasureEnergyDensityPlaquette(GaugeField &V, const GaugeField& U, int measure_interval = 1);
//Version that does not return the smeared field
std::vector<RealD> flowMeasureEnergyDensityPlaquette(const GaugeField& U, int measure_interval = 1);
//Evolve the gauge field by Nstep steps of epsilon and return the Cloverleaf energy density computed every interval steps
//The smeared field is output as V
std::vector<RealD> flowMeasureEnergyDensityCloverleaf(GaugeField &V, const GaugeField& U, int measure_interval = 1);
//Version that does not return the smeared field
std::vector<RealD> flowMeasureEnergyDensityCloverleaf(const GaugeField& U, int measure_interval = 1);
};
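A short usage sketch of the measurement hooks declared above; the gauge-field name Umu and the driver code are illustrative, not taken from this changeset:
// Hypothetical driver code
WilsonFlow<PeriodicGimplR> WF(200, 0.01, 10);      // Nstep, epsilon, default measurement interval
WF.resetActions();                                  // drop the default plaquette / topological-charge hooks
WF.addMeasurement(5, [](int step, RealD t, const LatticeGaugeField &U){
  std::cout << GridLogMessage << "tau=" << t << " Q="
            << WilsonLoops<PeriodicGimplR>::TopologicalCharge(U) << std::endl;
});
LatticeGaugeField Vsmeared(Umu.Grid());
WF.smear(Vsmeared, Umu);                            // the hook fires every 5 steps with the current flow time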
@@ -83,7 +120,7 @@ public:
// Implementations
////////////////////////////////////////////////////////////////////////////////
template <class Gimpl>
void WilsonFlow<Gimpl>::evolve_step(typename Gimpl::GaugeField &U) const{
void WilsonFlow<Gimpl>::evolve_step(typename Gimpl::GaugeField &U, RealD &tau) const{
GaugeField Z(U.Grid());
GaugeField tmp(U.Grid());
SG.deriv(U, Z);
@@ -99,12 +136,13 @@ void WilsonFlow<Gimpl>::evolve_step(typename Gimpl::GaugeField &U) const{
SG.deriv(U, tmp); Z += tmp; // 4/3*(17/36*Z0 -8/9*Z1) +Z2
Z *= 3.0/4.0; // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2
Gimpl::update_field(Z, U, -2.0*epsilon); // V(t+e) = exp(ep*Z)*W2
tau += epsilon;
}
template <class Gimpl>
void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, RealD maxTau) {
if (maxTau - taus < epsilon){
epsilon = maxTau-taus;
void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, RealD &tau, RealD &eps, RealD maxTau) const{
if (maxTau - tau < eps){
eps = maxTau-tau;
}
//std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
GaugeField Z(U.Grid());
@@ -114,95 +152,151 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
SG.deriv(U, Z);
Zprime = -Z;
Z *= 0.25; // Z0 = 1/4 * F(U)
Gimpl::update_field(Z, U, -2.0*epsilon); // U = W1 = exp(ep*Z0)*W0
Gimpl::update_field(Z, U, -2.0*eps); // U = W1 = exp(ep*Z0)*W0
Z *= -17.0/8.0;
SG.deriv(U, tmp); Z += tmp; // -17/32*Z0 +Z1
Zprime += 2.0*tmp;
Z *= 8.0/9.0; // Z = -17/36*Z0 +8/9*Z1
Gimpl::update_field(Z, U, -2.0*epsilon); // U_= W2 = exp(ep*Z)*W1
Gimpl::update_field(Z, U, -2.0*eps); // U_= W2 = exp(ep*Z)*W1
Z *= -4.0/3.0;
SG.deriv(U, tmp); Z += tmp; // 4/3*(17/36*Z0 -8/9*Z1) +Z2
Z *= 3.0/4.0; // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2
Gimpl::update_field(Z, U, -2.0*epsilon); // V(t+e) = exp(ep*Z)*W2
Gimpl::update_field(Z, U, -2.0*eps); // V(t+e) = exp(ep*Z)*W2
// Ramos
Gimpl::update_field(Zprime, Uprime, -2.0*epsilon); // V'(t+e) = exp(ep*Z')*W0
Gimpl::update_field(Zprime, Uprime, -2.0*eps); // V'(t+e) = exp(ep*Z')*W0
// Compute distance as norm^2 of the difference
GaugeField diffU = U - Uprime;
RealD diff = norm2(diffU);
// adjust integration step
taus += epsilon;
tau += eps;
//std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.);
eps = eps*0.95*std::pow(1e-4/diff,1./3.);
//std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
}
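// Worked example of the step-size controller above (numbers illustrative): with the hard-coded
// target distance 1e-4, a measured diff of 1e-5 rescales eps by 0.95*(1e-4/1e-5)^(1/3) ~= 2.05,
// while diff = 1e-3 shrinks it by 0.95*(0.1)^(1/3) ~= 0.44, so the step grows when the two
// integrator estimates agree closely and shrinks when they do not.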
template <class Gimpl>
RealD WilsonFlow<Gimpl>::energyDensityPlaquette(unsigned int step, const GaugeField& U) const {
RealD td = tau(step);
return 2.0 * td * td * SG.S(U)/U.Grid()->gSites();
RealD WilsonFlow<Gimpl>::energyDensityPlaquette(const RealD t, const GaugeField& U){
static WilsonGaugeAction<Gimpl> SG(3.0);
return 2.0 * t * t * SG.S(U)/U.Grid()->gSites();
}
//Compute t^2 <E(t)> for time t from the 1x1 cloverleaf form
template <class Gimpl>
RealD WilsonFlow<Gimpl>::energyDensityCloverleaf(const RealD t, const GaugeField& U){
typedef typename Gimpl::GaugeLinkField GaugeMat;
typedef typename Gimpl::GaugeField GaugeLorentz;
assert(Nd == 4);
//E = 1/2 tr( F_munu F_munu )
//However as F_numu = -F_munu, only need to sum the trace of the squares of the following 6 field strengths:
//F_01 F_02 F_03 F_12 F_13 F_23
GaugeMat F(U.Grid());
LatticeComplexD R(U.Grid());
R = Zero();
for(int mu=0;mu<3;mu++){
for(int nu=mu+1;nu<4;nu++){
WilsonLoops<Gimpl>::FieldStrength(F, U, mu, nu);
R = R + trace(F*F);
}
}
ComplexD out = sum(R);
out = t*t*out / RealD(U.Grid()->gSites());
return -real(out); //minus sign necessary for +ve energy
}
template <class Gimpl>
std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityPlaquette(GaugeField &V, const GaugeField& U, int measure_interval){
std::vector<RealD> out;
resetActions();
addMeasurement(measure_interval, [&out](int step, RealD t, const typename Gimpl::GaugeField &U){
std::cout << GridLogMessage << "[WilsonFlow] Computing plaquette energy density for step " << step << std::endl;
out.push_back( energyDensityPlaquette(t,U) );
});
smear(V,U);
return out;
}
template <class Gimpl>
RealD WilsonFlow<Gimpl>::energyDensityPlaquette(const GaugeField& U) const {
return 2.0 * taus * taus * SG.S(U)/U.Grid()->gSites();
std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityPlaquette(const GaugeField& U, int measure_interval){
GaugeField V(U);
return flowMeasureEnergyDensityPlaquette(V,U, measure_interval);
}
template <class Gimpl>
std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityCloverleaf(GaugeField &V, const GaugeField& U, int measure_interval){
std::vector<RealD> out;
resetActions();
addMeasurement(measure_interval, [&out](int step, RealD t, const typename Gimpl::GaugeField &U){
std::cout << GridLogMessage << "[WilsonFlow] Computing Cloverleaf energy density for step " << step << std::endl;
out.push_back( energyDensityCloverleaf(t,U) );
});
smear(V,U);
return out;
}
template <class Gimpl>
std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityCloverleaf(const GaugeField& U, int measure_interval){
GaugeField V(U);
return flowMeasureEnergyDensityCloverleaf(V,U, measure_interval);
}
//#define WF_TIMING
template <class Gimpl>
void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const {
void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const{
out = in;
for (unsigned int step = 1; step <= Nstep; step++) {
RealD taus = 0.;
for (unsigned int step = 1; step <= Nstep; step++) { //step indicates the number of smearing steps applied at the time of measurement
auto start = std::chrono::high_resolution_clock::now();
evolve_step(out);
evolve_step(out, taus);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
#ifdef WF_TIMING
std::cout << "Time to evolve " << diff.count() << " s\n";
#endif
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
<< step << " " << tau(step) << " "
<< energyDensityPlaquette(step,out) << std::endl;
if( step % measure_interval == 0){
std::cout << GridLogMessage << "[WilsonFlow] Top. charge : "
<< step << " "
<< WilsonLoops<PeriodicGimplR>::TopologicalCharge(out) << std::endl;
}
//Perform measurements
for(auto const &meas : functions)
if( step % meas.first == 0 ) meas.second(step,taus,out);
}
}
template <class Gimpl>
void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, RealD maxTau){
void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, RealD maxTau) const{
out = in;
taus = epsilon;
RealD taus = 0.;
RealD eps = epsilon;
unsigned int step = 0;
do{
step++;
//std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
evolve_step_adaptive(out, maxTau);
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
<< step << " " << taus << " "
<< energyDensityPlaquette(out) << std::endl;
if( step % measure_interval == 0){
std::cout << GridLogMessage << "[WilsonFlow] Top. charge : "
<< step << " "
<< WilsonLoops<PeriodicGimplR>::TopologicalCharge(out) << std::endl;
}
evolve_step_adaptive(out, taus, eps, maxTau);
//Perform measurements
for(auto const &meas : functions)
if( step % meas.first == 0 ) meas.second(step,taus,out);
} while (taus < maxTau);
}
template <class Gimpl>
void WilsonFlow<Gimpl>::setDefaultMeasurements(int topq_meas_interval){
addMeasurement(1, [](int step, RealD t, const typename Gimpl::GaugeField &U){
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " << step << " " << t << " " << energyDensityPlaquette(t,U) << std::endl;
});
addMeasurement(topq_meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
std::cout << GridLogMessage << "[WilsonFlow] Top. charge : " << step << " " << WilsonLoops<Gimpl>::TopologicalCharge(U) << std::endl;
});
}
NAMESPACE_END(Grid);
+6 -5
@@ -72,14 +72,13 @@ public:
//Fix the gauge field Umu
//0 < alpha < 1 is related to the step size, cf https://arxiv.org/pdf/1405.5812.pdf
static void SteepestDescentGaugeFix(GaugeLorentz &Umu, Real alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {
static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1,bool err_on_no_converge=true) {
GridBase *grid = Umu.Grid();
GaugeMat xform(grid);
SteepestDescentGaugeFix(Umu,xform,alpha,maxiter,Omega_tol,Phi_tol,Fourier,orthog);
SteepestDescentGaugeFix(Umu,xform,alpha,maxiter,Omega_tol,Phi_tol,Fourier,orthog,err_on_no_converge);
}
static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1,bool err_on_no_converge=true) {
//Fix the gauge field Umu and also return the gauge transformation from the original gauge field, xform
static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform, Real alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {
GridBase *grid = Umu.Grid();
@@ -141,7 +140,9 @@ public:
}
}
assert(0 && "Gauge fixing did not converge within the specified number of iterations");
std::cout << GridLogError << "Gauge fixing did not converge in " << maxiter << " iterations." << std::endl;
if (err_on_no_converge)
assert(0 && "Gauge fixing did not converge within the specified number of iterations");
};
static Real SteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform, Real alpha, GaugeMat & dmuAmu,int orthog) {
GridBase *grid = U[0].Grid();
+1 -1
@@ -215,7 +215,7 @@ public:
double vol = Umu.Grid()->gSites();
return p.real() / vol / 4.0 / 3.0;
return p.real() / vol / (4.0 * Nc ) ;
};
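// Quick check of the change above: for Nc = 3 the new divisor (4.0 * Nc) equals the old
// hard-coded 4.0 * 3.0, so SU(3) results are unchanged; the expression now holds for general Nc.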
//////////////////////////////////////////////////
+5
@@ -52,6 +52,11 @@ public:
return arg;
}
};
class SimpleStencilParams{
public:
Coordinate dirichlet;
SimpleStencilParams() {};
};
NAMESPACE_END(Grid);
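A minimal sketch of how the new parameter struct is meant to be filled; the stencil construction itself is not shown in this diff, so the last step is described only in a comment:
// Hypothetical usage sketch
SimpleStencilParams p;
p.dirichlet = Coordinate({0,0,0,8});  // switch off comms across time-blocks of 8; an empty Coordinate keeps ordinary comms
// pass p as the Parameters argument of the stencil constructor, which invokes
// DirichletBlock(p.dirichlet) whenever p.dirichlet.size() is non-zero (see the stencil changes below)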
+85 -76
@@ -131,7 +131,6 @@ class CartesianStencilAccelerator {
int _checkerboard;
int _npoints; // Move to template param?
int _osites;
int _dirichlet;
StencilVector _directions;
StencilVector _distances;
StencilVector _comms_send;
@@ -503,7 +502,6 @@ public:
}
void AddCopy(void *from,void * to, Integer bytes)
{
// std::cout << "Adding CopyReceiveBuffer "<<std::hex<<from<<" "<<to<<std::dec<<" "<<bytes<<std::endl;
CopyReceiveBuffer obj;
obj.from_p = from;
obj.to_p = to;
@@ -517,7 +515,7 @@ public:
cobj *from=(cobj *)CopyReceiveBuffers[i].from_p;
cobj *to =(cobj *)CopyReceiveBuffers[i].to_p;
Integer words = CopyReceiveBuffers[i].bytes/sizeof(cobj);
// std::cout << "CopyReceiveBuffer "<<std::hex<<from<<" "<<to<<std::dec<<" "<<words*sizeof(cobj)<<std::endl;
accelerator_forNB(j, words, cobj::Nsimd(), {
coalescedWrite(to[j] ,coalescedRead(from [j]));
});
@@ -543,13 +541,12 @@ public:
&&(CachedTransfers[i].lane ==lane)
&&(CachedTransfers[i].cb ==cb)
){
// std::cout << "Found duplicate plane dir "<<direction<<" plane "<< OrthogPlane<< " simd "<<lane << " relproc "<<DestProc<< " bytes "<<bytes <<std::endl;
AddCopy(CachedTransfers[i].recv_buf,recv_buf,bytes);
return 1;
}
}
// std::cout << "No duplicate plane dir "<<direction<<" plane "<< OrthogPlane<< " simd "<<lane << " relproc "<<DestProc<<" bytes "<<bytes<<std::endl;
CachedTransfers.push_back(obj);
return 0;
}
@@ -643,23 +640,23 @@ public:
}
}
if(local == 0) {
surface_list.push_back(site);
for(int s=0;s<Ls;s++){
surface_list.push_back(site*Ls+s);
}
}
}
}
/// Introduce a block structure and switch off comms on boundaries
void DirichletBlock(const Coordinate &dirichlet_block)
{
this->_dirichlet = 1;
for(int ii=0;ii<this->_npoints;ii++){
int dimension = this->_directions[ii];
int displacement = this->_distances[ii];
int shift = displacement;
int gd = _grid->_gdimensions[dimension];
int fd = _grid->_fdimensions[dimension];
int pd = _grid->_processors [dimension];
int ld = gd/pd;
int pc = _grid->_processor_coor[dimension];
int ld = fd/pd;
///////////////////////////////////////////
// Figure out dirichlet send and receive
// on this leg of stencil.
@@ -668,25 +665,25 @@ public:
int block = dirichlet_block[dimension];
this->_comms_send[ii] = comm_dim;
this->_comms_recv[ii] = comm_dim;
if ( block ) {
if ( block && comm_dim ) {
assert(abs(displacement) < ld );
// Quiesce communication across block boundaries
if( displacement > 0 ) {
// High side, low side
// | <--B--->|
// | | |
// noR
// noS
if ( (ld*(pc+1) ) % block == 0 ) this->_comms_recv[ii] = 0;
if ( ( ld*pc ) % block == 0 ) this->_comms_send[ii] = 0;
if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_recv[ii] = 0;
if ( ( (ld*pc ) % block ) == 0 ) this->_comms_send[ii] = 0;
} else {
// High side, low side
// | <--B--->|
// | | |
// noS
// noR
if ( (ld*(pc+1) ) % block == 0 ) this->_comms_send[ii] = 0;
if ( ( ld*pc ) % block == 0 ) this->_comms_recv[ii] = 0;
if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
if ( ( (ld*pc ) % block ) == 0 ) this->_comms_recv[ii] = 0;
}
}
}
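Worked example of the quiescing logic above (numbers illustrative): with fd = 16 sites in a direction split over pd = 2 ranks (so ld = 8) and dirichlet_block = 8, every rank edge coincides with a block boundary, so (ld*pc) % block and (ld*(pc+1)) % block both vanish on every rank and both _comms_send and _comms_recv are switched off for that leg; with block = 16 only the rank whose edge actually touches the single block boundary has its send or receive quiesced.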
@@ -698,7 +695,6 @@ public:
const std::vector<int> &distances,
Parameters p)
{
this->_dirichlet = 0;
face_table_computed=0;
_grid = grid;
this->parameters=p;
@@ -715,6 +711,8 @@ public:
this->_comms_recv.resize(npoints);
this->same_node.resize(npoints);
if ( p.dirichlet.size() ) DirichletBlock(p.dirichlet); // comms send/recv set up
_unified_buffer_size=0;
surface_list.resize(0);
@@ -734,7 +732,7 @@ public:
int gd = _grid->_gdimensions[dimension];
int fd = _grid->_fdimensions[dimension];
int pd = _grid->_processors [dimension];
int ld = gd/pd;
// int ld = gd/pd;
int rd = _grid->_rdimensions[dimension];
int pc = _grid->_processor_coor[dimension];
this->_permute_type[point]=_grid->PermuteType(dimension);
@@ -746,9 +744,6 @@ public:
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
int rotate_dim = _grid->_simd_layout[dimension]>2;
this->_comms_send[ii] = comm_dim;
this->_comms_recv[ii] = comm_dim;
assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported
int sshift[2];
@@ -878,12 +873,14 @@ public:
for(int x=0;x<rd;x++){
int permute_type=grid->PermuteType(dimension);
int permute_slice;
int sx = (x+sshift)%rd;
int offnode = 0;
if ( simd_layout > 1 ) {
permute_slice=1;
for(int i=0;i<Nsimd;i++){
int inner_bit = (Nsimd>>(permute_type+1));
@@ -900,6 +897,7 @@ public:
} else {
int comm_proc = ((x+sshift)/rd)%pd;
offnode = (comm_proc!= 0);
permute_slice=0;
}
int wraparound=0;
@@ -911,25 +909,29 @@ public:
}
// Wrap locally dirichlet support case OR node local
if ( (offnode==0) || (comms_recv==0) ) {
if ( offnode==0 ) {
int permute_slice=0;
permute_slice=0;
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
} else {
if ( comms_recv ) {
ScatterPlane(point,dimension,x,cbmask,_unified_buffer_size,wraparound); // permute/extract/merge is done in comms phase
} else {
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
}
}
if ( offnode ) {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
// int rank = grid->_processor;
// int recv_from_rank;
// int xmit_to_rank;
int unified_buffer_offset = _unified_buffer_size;
_unified_buffer_size += words;
ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
}
}
}
@@ -1060,8 +1062,6 @@ public:
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc) {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
@@ -1069,64 +1069,70 @@ public:
int bytes = words * compress.CommDatumSize();
int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
if ( !face_table_computed ) {
face_table.resize(face_idx+1);
std::vector<std::pair<int,int> > face_table_host ;
Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table_host);
face_table[face_idx].resize(face_table_host.size());
acceleratorCopyToDevice(&face_table_host[0],
&face_table[face_idx][0],
face_table[face_idx].size()*sizeof(face_table_host[0]));
}
int comm_off = u_comm_offset;
// int rank = _grid->_processor;
int recv_from_rank;
int xmit_to_rank;
cobj *recv_buf;
cobj *send_buf;
_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
assert (xmit_to_rank != _grid->ThisRank());
assert (recv_from_rank != _grid->ThisRank());
cobj *recv_buf;
if ( compress.DecompressionStep() ) {
recv_buf=u_simd_recv_buf[0];
} else {
recv_buf=this->u_recv_buf_p;
if( comms_send ) {
if ( !face_table_computed ) {
face_table.resize(face_idx+1);
std::vector<std::pair<int,int> > face_table_host ;
Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host);
face_table[face_idx].resize(face_table_host.size());
acceleratorCopyToDevice(&face_table_host[0],
&face_table[face_idx][0],
face_table[face_idx].size()*sizeof(face_table_host[0]));
}
if ( compress.DecompressionStep() ) {
recv_buf=u_simd_recv_buf[0];
} else {
recv_buf=this->u_recv_buf_p;
}
send_buf = this->u_send_buf_p; // Gather locally, must send
////////////////////////////////////////////////////////
// Gather locally
////////////////////////////////////////////////////////
assert(send_buf!=NULL);
Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,comm_off,so);
}
cobj *send_buf;
send_buf = this->u_send_buf_p; // Gather locally, must send
////////////////////////////////////////////////////////
// Gather locally
////////////////////////////////////////////////////////
assert(send_buf!=NULL);
if ( comms_send )
Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so);
face_idx++;
int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[u_comm_offset],0,bytes,cbmask);
int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,bytes,cbmask);
if ( (!duplicate) ) { // Force comms for now
///////////////////////////////////////////////////////////
// Build a list of things to do after we synchronise GPUs
// Start comms now???
///////////////////////////////////////////////////////////
AddPacket((void *)&send_buf[u_comm_offset],
(void *)&recv_buf[u_comm_offset],
AddPacket((void *)&send_buf[comm_off],
(void *)&recv_buf[comm_off],
xmit_to_rank, comms_send,
recv_from_rank, comms_recv,
bytes);
}
if ( compress.DecompressionStep() ) {
AddDecompress(&this->u_recv_buf_p[u_comm_offset],
&recv_buf[u_comm_offset],
if ( compress.DecompressionStep() && comms_recv ) {
AddDecompress(&this->u_recv_buf_p[comm_off],
&recv_buf[comm_off],
words,Decompressions);
}
u_comm_offset+=words;
}
face_idx++;
}
}
return 0;
}
@@ -1155,7 +1161,6 @@ public:
int permute_type=_grid->PermuteType(dimension);
// std::cout << "SimdNew permute type "<<permute_type<<std::endl;
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
@@ -1189,8 +1194,9 @@ public:
if ( any_offnode ) {
int comm_off = u_comm_offset;
for(int i=0;i<maxl;i++){
spointers[i] = (cobj *) &u_simd_send_buf[i][u_comm_offset];
spointers[i] = (cobj *) &u_simd_send_buf[i][comm_off];
}
int sx = (x+sshift)%rd;
@@ -1199,15 +1205,15 @@ public:
face_table.resize(face_idx+1);
std::vector<std::pair<int,int> > face_table_host ;
Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table_host);
Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host);
face_table[face_idx].resize(face_table_host.size());
acceleratorCopyToDevice(&face_table_host[0],
&face_table[face_idx][0],
face_table[face_idx].size()*sizeof(face_table_host[0]));
}
// if ( comms_send )
Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type);
if ( comms_send || comms_recv )
Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type);
face_idx++;
//spointers[0] -- low
@@ -1226,8 +1232,8 @@ public:
int nbr_plane = nbr_ic;
assert (sx == nbr_ox);
auto rp = &u_simd_recv_buf[i ][u_comm_offset];
auto sp = &u_simd_send_buf[nbr_plane][u_comm_offset];
auto rp = &u_simd_recv_buf[i ][comm_off];
auto sp = &u_simd_send_buf[nbr_plane][comm_off];
if(nbr_proc){
@@ -1253,9 +1259,12 @@ public:
}
}
AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers);
if ( comms_recv ) {
AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
}
u_comm_offset +=buffer_size;
}
}
return 0;
+41
@@ -208,5 +208,46 @@ void merge(vobj &vec,const ExtractPointerArray<sobj> &extracted, int offset)
}
//////////////////////////////////////////////////////////////////////////////////
//Copy a single lane of a SIMD tensor type from one object to another
//Output object must be of the same tensor type but may be of a different precision (i.e. it can have a different root data type)
///////////////////////////////////////////////////////////////////////////////////
template<class vobjOut, class vobjIn>
accelerator_inline
void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __restrict__ vecIn, int lane_in)
{
static_assert( std::is_same<typename vobjOut::DoublePrecision, typename vobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
typedef typename vobjOut::vector_type ovector_type;
typedef typename vobjIn::vector_type ivector_type;
constexpr int owords=sizeof(vobjOut)/sizeof(ovector_type);
constexpr int iwords=sizeof(vobjIn)/sizeof(ivector_type);
static_assert( owords == iwords, "copyLane: Expected number of vector words in input and output objects to be equal" );
typedef typename vobjOut::scalar_type oscalar_type;
typedef typename vobjIn::scalar_type iscalar_type;
typedef typename ExtractTypeMap<oscalar_type>::extract_type oextract_type;
typedef typename ExtractTypeMap<iscalar_type>::extract_type iextract_type;
typedef oextract_type * opointer;
typedef iextract_type * ipointer;
constexpr int oNsimd=ovector_type::Nsimd();
constexpr int iNsimd=ivector_type::Nsimd();
iscalar_type itmp;
oscalar_type otmp;
opointer __restrict__ op = (opointer)&vecOut;
ipointer __restrict__ ip = (ipointer)&vecIn;
for(int w=0;w<owords;w++){
memcpy( (char*)&itmp, (char*)(ip + lane_in + iNsimd*w), sizeof(iscalar_type) );
otmp = itmp; //potential precision change
memcpy( (char*)(op + lane_out + oNsimd*w), (char*)&otmp, sizeof(oscalar_type) );
}
}
NAMESPACE_END(Grid);
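A minimal usage sketch of copyLane, assuming the standard Grid SIMD spinor typedefs; the types and lane indices are illustrative:
// Hypothetical usage sketch
vSpinColourVectorD src;   // double-precision SIMD tensor
vSpinColourVectorF dst;   // same tensor structure, single precision
// ... fill src ...
copyLane(dst, 2, src, 0); // copy SIMD lane 0 of src into lane 2 of dst, converting precision word by word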
+15 -15
@@ -6,9 +6,17 @@ uint32_t accelerator_threads=2;
uint32_t acceleratorThreads(void) {return accelerator_threads;};
void acceleratorThreads(uint32_t t) {accelerator_threads = t;};
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
#define ENV_LOCAL_RANK_SLURM "SLURM_LOCALID"
#define ENV_RANK_SLURM "SLURM_PROCID"
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
#ifdef GRID_CUDA
cudaDeviceProp *gpu_props;
cudaStream_t copyStream;
cudaStream_t cpuStream;
void acceleratorInit(void)
{
int nDevices = 1;
@@ -17,12 +25,6 @@ void acceleratorInit(void)
char * localRankStr = NULL;
int rank = 0, world_rank=0;
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
#define ENV_LOCAL_RANK_SLURM "SLURM_LOCALID"
#define ENV_RANK_SLURM "SLURM_PROCID"
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_SLURM )) != NULL) { world_rank = atoi(localRankStr);}
@@ -97,6 +99,7 @@ void acceleratorInit(void)
cudaSetDevice(device);
cudaStreamCreate(&copyStream);
cudaStreamCreate(&cpuStream);
const int len=64;
char busid[len];
if( rank == world_rank ) {
@@ -111,6 +114,7 @@ void acceleratorInit(void)
#ifdef GRID_HIP
hipDeviceProp_t *gpu_props;
hipStream_t copyStream;
hipStream_t cpuStream;
void acceleratorInit(void)
{
int nDevices = 1;
@@ -119,10 +123,6 @@ void acceleratorInit(void)
char * localRankStr = NULL;
int rank = 0, world_rank=0;
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
// We extract the local rank initialization using an environment variable
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
{
@@ -134,8 +134,10 @@ void acceleratorInit(void)
}
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_SLURM )) != NULL) { world_rank = atoi(localRankStr);}
printf("world_rank %d has %d devices\n",world_rank,nDevices);
if ( world_rank == 0 )
printf("world_rank %d has %d devices\n",world_rank,nDevices);
size_t totalDeviceMem=0;
for (int i = 0; i < nDevices; i++) {
@@ -181,6 +183,7 @@ void acceleratorInit(void)
#endif
hipSetDevice(device);
hipStreamCreate(&copyStream);
hipStreamCreate(&cpuStream);
const int len=64;
char busid[len];
if( rank == world_rank ) {
@@ -208,10 +211,7 @@ void acceleratorInit(void)
char * localRankStr = NULL;
int rank = 0, world_rank=0;
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
// We extract the local rank initialization using an environment variable
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
{
+10 -8
@@ -107,6 +107,7 @@ void acceleratorInit(void);
extern int acceleratorAbortOnGpuError;
extern cudaStream_t copyStream;
extern cudaStream_t cpuStream;
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
@@ -134,7 +135,7 @@ inline void cuda_mem(void)
}; \
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
LambdaApply<<<cu_blocks,cu_threads>>>(num1,num2,nsimd,lambda); \
LambdaApply<<<cu_blocks,cu_threads,0,cpuStream>>>(num1,num2,nsimd,lambda); \
}
#define accelerator_for6dNB(iter1, num1, \
@@ -153,7 +154,7 @@ inline void cuda_mem(void)
}; \
dim3 cu_blocks (num1,num2,num3); \
dim3 cu_threads(num4,num5,num6); \
Lambda6Apply<<<cu_blocks,cu_threads>>>(num1,num2,num3,num4,num5,num6,lambda); \
Lambda6Apply<<<cu_blocks,cu_threads,0,cpuStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
}
template<typename lambda> __global__
@@ -189,7 +190,7 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
#define accelerator_barrier(dummy) \
{ \
cudaDeviceSynchronize(); \
cudaStreamSynchronize(cpuStream); \
cudaError err = cudaGetLastError(); \
if ( cudaSuccess != err ) { \
printf("accelerator_barrier(): Cuda error %s \n", \
@@ -339,6 +340,7 @@ NAMESPACE_BEGIN(Grid);
#define accelerator_inline __host__ __device__ inline
extern hipStream_t copyStream;
extern hipStream_t cpuStream;
/*These routines define mapping from thread grid to loop & vector lane indexing */
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
@@ -360,12 +362,12 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \
hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads, \
0,0, \
num1,num2,nsimd, lambda); \
0,cpuStream, \
num1,num2,nsimd, lambda); \
} else { \
hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \
0,0, \
num1,num2,nsimd, lambda); \
0,cpuStream, \
num1,num2,nsimd, lambda); \
} \
}
@@ -398,7 +400,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
#define accelerator_barrier(dummy) \
{ \
hipDeviceSynchronize(); \
hipStreamSynchronize(cpuStream); \
auto err = hipGetLastError(); \
if ( err != hipSuccess ) { \
printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
+918
@@ -0,0 +1,918 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./HMC/Mobius2p1fIDSDRGparityEOFA.cc
Copyright (C) 2015-2016
Author: Christopher Kelly <ckelly@bnl.gov>
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace Grid;
//Production binary for the 40ID G-parity ensemble
struct RatQuoParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters,
double, bnd_lo,
double, bnd_hi,
Integer, action_degree,
double, action_tolerance,
Integer, md_degree,
double, md_tolerance,
Integer, reliable_update_freq,
Integer, bnd_check_freq);
RatQuoParameters() {
bnd_lo = 1e-2;
bnd_hi = 30;
action_degree = 10;
action_tolerance = 1e-10;
md_degree = 10;
md_tolerance = 1e-8;
bnd_check_freq = 20;
reliable_update_freq = 50;
}
void Export(RationalActionParams &into) const{
into.lo = bnd_lo;
into.hi = bnd_hi;
into.action_degree = action_degree;
into.action_tolerance = action_tolerance;
into.md_degree = md_degree;
into.md_tolerance = md_tolerance;
into.BoundsCheckFreq = bnd_check_freq;
}
};
struct EOFAparameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(EOFAparameters,
OneFlavourRationalParams, rat_params,
double, action_tolerance,
double, action_mixcg_inner_tolerance,
double, md_tolerance,
double, md_mixcg_inner_tolerance);
EOFAparameters() {
action_mixcg_inner_tolerance = 1e-8;
action_tolerance = 1e-10;
md_tolerance = 1e-8;
md_mixcg_inner_tolerance = 1e-8;
rat_params.lo = 1.0;
rat_params.hi = 25.0;
rat_params.MaxIter = 50000;
rat_params.tolerance= 1.0e-9;
rat_params.degree = 14;
rat_params.precision= 50;
}
};
struct EvolParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters,
Integer, StartTrajectory,
Integer, Trajectories,
Integer, SaveInterval,
Integer, Steps,
RealD, TrajectoryLength,
bool, MetropolisTest,
std::string, StartingType,
std::vector<Integer>, GparityDirs,
std::vector<EOFAparameters>, eofa_l,
RatQuoParameters, rat_quo_s,
RatQuoParameters, rat_quo_DSDR);
EvolParameters() {
//For initial thermalization; afterwards user should switch Metropolis on and use StartingType=CheckpointStart
MetropolisTest = false;
StartTrajectory = 0;
Trajectories = 50;
SaveInterval = 5;
StartingType = "ColdStart";
GparityDirs.resize(3, 1); //1 for G-parity, 0 for periodic
Steps = 5;
TrajectoryLength = 1.0;
}
};
bool fileExists(const std::string &fn){
std::ifstream f(fn);
return f.good();
}
struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
double, alpha,
double, beta,
double, mu,
int, ord,
int, n_stop,
int, n_want,
int, n_use,
double, tolerance);
LanczosParameters() {
alpha = 35;
beta = 5;
mu = 0;
ord = 100;
n_stop = 10;
n_want = 10;
n_use = 15;
tolerance = 1e-6;
}
};
template<typename FermionActionD, typename FermionFieldD>
void computeEigenvalues(std::string param_file,
GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt, //expect lattice to have been initialized to something
FermionActionD &action, GridParallelRNG &rng){
LanczosParameters params;
if(fileExists(param_file)){
std::cout << GridLogMessage << " Reading " << param_file << std::endl;
Grid::XmlReader rd(param_file);
read(rd, "LanczosParameters", params);
}else if(!GlobalSharedMemory::WorldRank){
std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
Grid::XmlWriter wr(param_file + ".templ");
write(wr, "LanczosParameters", params);
}
FermionFieldD gauss_o(rbGrid);
FermionFieldD gauss(Grid);
gaussian(rng, gauss);
pickCheckerboard(Odd, gauss_o, gauss);
action.ImportGauge(latt);
SchurDiagMooeeOperator<FermionActionD, FermionFieldD> hermop(action);
PlainHermOp<FermionFieldD> hermop_wrap(hermop);
//ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
assert(params.mu == 0.0);
Chebyshev<FermionFieldD> Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1);
FunctionHermOp<FermionFieldD> Cheb_wrap(Cheb, hermop);
std::cout << "IRL: alpha=" << params.alpha << " beta=" << params.beta << " mu=" << params.mu << " ord=" << params.ord << std::endl;
ImplicitlyRestartedLanczos<FermionFieldD> IRL(Cheb_wrap, hermop_wrap, params.n_stop, params.n_want, params.n_use, params.tolerance, 50000);
std::vector<RealD> eval(params.n_use);
std::vector<FermionFieldD> evec(params.n_use, rbGrid);
int Nconv;
IRL.calc(eval, evec, gauss_o, Nconv);
std::cout << "Eigenvalues:" << std::endl;
for(int i=0;i<params.n_want;i++){
std::cout << i << " " << eval[i] << std::endl;
}
}
//Check the quality of the RHMC approx
//action_or_md toggles checking the action (0), MD (1) or both (2) setups
template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt, //expect lattice to have been initialized to something
FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
int inv_pow, const std::string &quark_descr, int action_or_md){
assert(action_or_md == 0 || action_or_md == 1 || action_or_md == 2);
FermionFieldD gauss_o(rbGrid);
FermionFieldD gauss(Grid);
gaussian(rng, gauss);
pickCheckerboard(Odd, gauss_o, gauss);
numOp.ImportGauge(latt);
denOp.ImportGauge(latt);
typedef typename FermionActionD::Impl_t FermionImplPolicyD;
SchurDifferentiableOperator<FermionImplPolicyD> MdagM(numOp);
SchurDifferentiableOperator<FermionImplPolicyD> VdagV(denOp);
PowerMethod<FermionFieldD> power_method;
RealD lambda_max;
std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " numerator" << std::endl;
lambda_max = power_method(MdagM,gauss_o);
std::cout << GridLogMessage << "Got lambda_max "<<lambda_max<<std::endl;
std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " denominator" << std::endl;
lambda_max = power_method(VdagV,gauss_o);
std::cout << GridLogMessage << "Got lambda_max "<<lambda_max<<std::endl;
if(action_or_md == 0 || action_or_md == 2){
std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
InversePowerBoundsCheck(inv_pow, 50000, 1e16, MdagM,gauss_o, rhmc.ApproxNegPowerAction); //use large tolerance to prevent exit on fail; we are trying to tune here!
std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
InversePowerBoundsCheck(2*inv_pow, 50000, 1e16, MdagM,gauss_o, rhmc.ApproxNegHalfPowerAction);
std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
InversePowerBoundsCheck(inv_pow, 50000, 1e16, VdagV,gauss_o, rhmc.ApproxNegPowerAction);
std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
InversePowerBoundsCheck(2*inv_pow, 50000, 1e16, VdagV,gauss_o, rhmc.ApproxNegHalfPowerAction);
std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
}
std::cout << "-------------------------------------------------------------------------------" << std::endl;
if(action_or_md == 1 || action_or_md == 2){
std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
InversePowerBoundsCheck(inv_pow, 50000, 1e16, MdagM,gauss_o, rhmc.ApproxNegPowerMD);
std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
InversePowerBoundsCheck(2*inv_pow, 50000, 1e16, MdagM,gauss_o, rhmc.ApproxNegHalfPowerMD);
std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
InversePowerBoundsCheck(inv_pow, 50000, 1e16, VdagV,gauss_o, rhmc.ApproxNegPowerMD);
std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
InversePowerBoundsCheck(2*inv_pow, 50000, 1e16, VdagV,gauss_o, rhmc.ApproxNegHalfPowerMD);
std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
}
}
template<typename FermionImplPolicy>
void checkEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
GridCartesian* FGrid, GridParallelRNG &rng, const LatticeGaugeFieldD &latt){
std::cout << GridLogMessage << "Starting EOFA action/bounds check" << std::endl;
typename FermionImplPolicy::FermionField eta(FGrid);
RealD scale = std::sqrt(0.5);
gaussian(rng,eta); eta = eta * scale;
//Use the inbuilt check
EOFA.refresh(latt, eta);
EOFA.S(latt);
std::cout << GridLogMessage << "Finished EOFA upper action/bounds check" << std::endl;
}
template<typename FermionImplPolicy>
class EOFAlinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA;
LatticeGaugeFieldD &U;
public:
EOFAlinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA, LatticeGaugeFieldD &U): EOFA(EOFA), U(U){}
typedef typename FermionImplPolicy::FermionField Field;
void OpDiag (const Field &in, Field &out){ assert(0); }
void OpDir (const Field &in, Field &out,int dir,int disp){ assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); }
void Op (const Field &in, Field &out){ assert(0); }
void AdjOp (const Field &in, Field &out){ assert(0); }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
void HermOp(const Field &in, Field &out){ EOFA.Meofa(U, in, out); }
};
template<typename FermionImplPolicy>
void upperBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
std::cout << GridLogMessage << "Starting EOFA upper bound compute" << std::endl;
EOFAlinop<FermionImplPolicy> linop(EOFA, latt);
typename FermionImplPolicy::FermionField eta(FGrid);
gaussian(rng,eta);
PowerMethod<typename FermionImplPolicy::FermionField> power_method;
auto lambda_max = power_method(linop,eta);
std::cout << GridLogMessage << "Upper bound of EOFA operator " << lambda_max << std::endl;
}
//Applications of M^{-1} cost the same as M for EOFA!
template<typename FermionImplPolicy>
class EOFAinvLinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA;
LatticeGaugeFieldD &U;
public:
EOFAinvLinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA, LatticeGaugeFieldD &U): EOFA(EOFA), U(U){}
typedef typename FermionImplPolicy::FermionField Field;
void OpDiag (const Field &in, Field &out){ assert(0); }
void OpDir (const Field &in, Field &out,int dir,int disp){ assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); }
void Op (const Field &in, Field &out){ assert(0); }
void AdjOp (const Field &in, Field &out){ assert(0); }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
void HermOp(const Field &in, Field &out){ EOFA.MeofaInv(U, in, out); }
};
template<typename FermionImplPolicy>
void lowerBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
std::cout << GridLogMessage << "Starting EOFA lower bound compute using power method on M^{-1}. Inverse of highest eigenvalue is the lowest eigenvalue of M" << std::endl;
EOFAinvLinop<FermionImplPolicy> linop(EOFA, latt);
typename FermionImplPolicy::FermionField eta(FGrid);
gaussian(rng,eta);
PowerMethod<typename FermionImplPolicy::FermionField> power_method;
auto lambda_max = power_method(linop,eta);
std::cout << GridLogMessage << "Lower bound of EOFA operator " << 1./lambda_max << std::endl;
}
NAMESPACE_BEGIN(Grid);
template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class SchurOperatorF>
class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
public:
typedef typename FermionOperatorD::FermionField FieldD;
typedef typename FermionOperatorF::FermionField FieldF;
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;
Integer MaxOuterIterations;
GridBase* SinglePrecGrid4; //Grid for single-precision fields
GridBase* SinglePrecGrid5; //Grid for single-precision fields
RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
FermionOperatorF &FermOpF;
FermionOperatorD &FermOpD;
SchurOperatorF &LinOpF;
SchurOperatorD &LinOpD;
Integer TotalInnerIterations; //Number of inner CG iterations
Integer TotalOuterIterations; //Number of restarts
Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
MixedPrecisionConjugateGradientOperatorFunction(RealD tol,
Integer maxinnerit,
Integer maxouterit,
GridBase* _sp_grid4,
GridBase* _sp_grid5,
FermionOperatorF &_FermOpF,
FermionOperatorD &_FermOpD,
SchurOperatorF &_LinOpF,
SchurOperatorD &_LinOpD):
LinOpF(_LinOpF),
LinOpD(_LinOpD),
FermOpF(_FermOpF),
FermOpD(_FermOpD),
Tolerance(tol),
InnerTolerance(tol),
MaxInnerIterations(maxinnerit),
MaxOuterIterations(maxouterit),
SinglePrecGrid4(_sp_grid4),
SinglePrecGrid5(_sp_grid5),
OuterLoopNormMult(100.)
{
};
void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
precisionChange(FermOpF.Umu, FermOpD.Umu);
pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
////////////////////////////////////////////////////////////////////////////////////
// Make a mixed precision conjugate gradient
////////////////////////////////////////////////////////////////////////////////////
MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
MPCG.InnerTolerance = InnerTolerance;
std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
MPCG(src,psi);
}
};
template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class SchurOperatorF>
class MixedPrecisionReliableUpdateConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
public:
typedef typename FermionOperatorD::FermionField FieldD;
typedef typename FermionOperatorF::FermionField FieldF;
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
Integer MaxIterations;
RealD Delta; //reliable update parameter
GridBase* SinglePrecGrid4; //Grid for single-precision fields
GridBase* SinglePrecGrid5; //Grid for single-precision fields
FermionOperatorF &FermOpF;
FermionOperatorD &FermOpD;
SchurOperatorF &LinOpF;
SchurOperatorD &LinOpD;
MixedPrecisionReliableUpdateConjugateGradientOperatorFunction(RealD tol,
RealD delta,
Integer maxit,
GridBase* _sp_grid4,
GridBase* _sp_grid5,
FermionOperatorF &_FermOpF,
FermionOperatorD &_FermOpD,
SchurOperatorF &_LinOpF,
SchurOperatorD &_LinOpD):
LinOpF(_LinOpF),
LinOpD(_LinOpD),
FermOpF(_FermOpF),
FermOpD(_FermOpD),
Tolerance(tol),
Delta(delta),
MaxIterations(maxit),
SinglePrecGrid4(_sp_grid4),
SinglePrecGrid5(_sp_grid5)
{
};
void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
std::cout << GridLogMessage << " Mixed precision reliable CG update wrapper operator() "<<std::endl;
SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
precisionChange(FermOpF.Umu, FermOpD.Umu);
pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
////////////////////////////////////////////////////////////////////////////////////
// Make a mixed precision conjugate gradient
////////////////////////////////////////////////////////////////////////////////////
ConjugateGradientReliableUpdate<FieldD,FieldF> MPCG(Tolerance,MaxIterations,Delta,SinglePrecGrid5,LinOpF,LinOpD);
std::cout << GridLogMessage << "Calling mixed precision reliable update Conjugate Gradient" <<std::endl;
MPCG(src,psi);
}
};
NAMESPACE_END(Grid);
int main(int argc, char **argv) {
Grid_init(&argc, &argv);
int threads = GridThread::GetThreads();
// here make a routine to print all the relevant information on the run
std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
std::string param_file = "params.xml";
bool file_load_check = false;
std::string serial_seeds = "1 2 3 4 5";
std::string parallel_seeds = "6 7 8 9 10";
int i=1;
while(i < argc){
std::string sarg(argv[i]);
if(sarg == "--param_file"){
assert(i!=argc-1);
param_file = argv[i+1];
i+=2;
}else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
file_load_check = true;
i++;
}else if(sarg == "--set_seeds"){ //set the rng seeds. Expects two vector args, e.g. --set_seeds 1.2.3.4 5.6.7.8
assert(i < argc-2);
std::vector<int> tmp;
GridCmdOptionIntVector(argv[i+1],tmp);
{
std::stringstream ss;
for(int j=0;j<tmp.size()-1;j++) ss << tmp[j] << " ";
ss << tmp.back();
serial_seeds = ss.str();
}
GridCmdOptionIntVector(argv[i+2],tmp);
{
std::stringstream ss;
for(int j=0;j<tmp.size()-1;j++) ss << tmp[j] << " ";
ss << tmp.back();
parallel_seeds = ss.str();
}
i+=3;
std::cout << GridLogMessage << "Set serial seeds to " << serial_seeds << std::endl;
std::cout << GridLogMessage << "Set parallel seeds to " << parallel_seeds << std::endl;
}else{
i++;
}
}
//Read the user parameters
EvolParameters user_params;
if(fileExists(param_file)){
std::cout << GridLogMessage << " Reading " << param_file << std::endl;
Grid::XmlReader rd(param_file);
read(rd, "Params", user_params);
}else if(!GlobalSharedMemory::WorldRank){
std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
{
Grid::XmlWriter wr(param_file + ".templ");
write(wr, "Params", user_params);
}
std::cout << GridLogMessage << " Done" << std::endl;
Grid_finalize();
return 0;
}
//Check the parameters
if(user_params.GparityDirs.size() != Nd-1){
std::cerr << "Error in input parameters: expect GparityDirs to have size = " << Nd-1 << std::endl;
exit(1);
}
for(int i=0;i<Nd-1;i++)
if(user_params.GparityDirs[i] != 0 && user_params.GparityDirs[i] != 1){
std::cerr << "Error in input parameters: expect GparityDirs values to be 0 (periodic) or 1 (G-parity)" << std::endl;
exit(1);
}
typedef GparityMobiusEOFAFermionD EOFAactionD;
typedef GparityMobiusFermionD FermionActionD;
typedef typename FermionActionD::Impl_t FermionImplPolicyD;
typedef typename FermionActionD::FermionField FermionFieldD;
typedef GparityMobiusEOFAFermionF EOFAactionF;
typedef GparityMobiusFermionF FermionActionF;
typedef typename FermionActionF::Impl_t FermionImplPolicyF;
typedef typename FermionActionF::FermionField FermionFieldF;
typedef GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD,FermionImplPolicyF> MixedPrecRHMC;
typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicyD> DoublePrecRHMC;
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
IntegratorParameters MD;
typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
MD.name = std::string("MinimumNorm2");
// typedef ConjugateHMCRunnerD<ForceGradient> HMCWrapper;
// MD.name = std::string("ForceGradient");
MD.MDsteps = user_params.Steps;
MD.trajL = user_params.TrajectoryLength;
typedef HMCWrapper::ImplPolicy GaugeImplPolicy;
HMCparameters HMCparams;
HMCparams.StartTrajectory = user_params.StartTrajectory;
HMCparams.Trajectories = user_params.Trajectories;
HMCparams.NoMetropolisUntil= 0;
HMCparams.StartingType = user_params.StartingType;
HMCparams.MetropolisTest = user_params.MetropolisTest;
HMCparams.MD = MD;
HMCWrapper TheHMC(HMCparams);
// Grid from the command line arguments --grid and --mpi
TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
CheckpointerParameters CPparams;
CPparams.config_prefix = "ckpoint_lat";
CPparams.rng_prefix = "ckpoint_rng";
CPparams.saveInterval = user_params.SaveInterval;
CPparams.format = "IEEE64BIG";
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
//Note that checkpointing saves the RNG state so that this initialization is required only for the very first configuration
RNGModuleParameters RNGpar;
RNGpar.serial_seeds = serial_seeds;
RNGpar.parallel_seeds = parallel_seeds;
TheHMC.Resources.SetRNGSeeds(RNGpar);
typedef PlaquetteMod<GaugeImplPolicy> PlaqObs;
TheHMC.Resources.AddObservable<PlaqObs>();
//////////////////////////////////////////////
//aiming for ainv=1.723 GeV
// estimate columns below: mine | Bob's
//Estimated a(ml+mres) [40ID] = 0.001305 0.00131
// a(mh+mres) [40ID] = 0.035910 0.03529
//Estimate Ls=12, b+c=2 mres~0.0011
//1/24/2022 initial mres measurement gives mres=0.001, adjusted light quark mass to 0.0003 from 0.0001
const int Ls = 12;
Real beta = 1.848;
Real light_mass = 0.0003;
Real strange_mass = 0.0342;
Real pv_mass = 1.0;
RealD M5 = 1.8;
RealD mobius_scale = 2.; //b+c
RealD mob_bmc = 1.0;
RealD mob_b = (mobius_scale + mob_bmc)/2.;
RealD mob_c = (mobius_scale - mob_bmc)/2.;
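//Derived Mobius coefficients: b = (2+1)/2 = 1.5 and c = (2-1)/2 = 0.5, i.e. b+c = mobius_scale = 2 and b-c = 1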
std::cout << GridLogMessage
<< "Ensemble parameters:" << std::endl
<< "Ls=" << Ls << std::endl
<< "beta=" << beta << std::endl
<< "light_mass=" << light_mass << std::endl
<< "strange_mass=" << strange_mass << std::endl
<< "mobius_scale=" << mobius_scale << std::endl;
//Setup the Grids
auto UGridD = TheHMC.Resources.GetCartesian();
auto UrbGridD = TheHMC.Resources.GetRBCartesian();
auto FGridD = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);
auto FrbGridD = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);
GridCartesian* UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
GridRedBlackCartesian* UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
auto FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
auto FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
ConjugateIwasakiGaugeActionD GaugeAction(beta);
// temporarily need a gauge field
LatticeGaugeFieldD Ud(UGridD);
LatticeGaugeFieldF Uf(UGridF);
//Setup the BCs
FermionActionD::ImplParams Params;
for(int i=0;i<Nd-1;i++) Params.twists[i] = user_params.GparityDirs[i]; //G-parity directions
Params.twists[Nd-1] = 1; //APBC in time direction
std::vector<int> dirs4(Nd);
for(int i=0;i<Nd-1;i++) dirs4[i] = user_params.GparityDirs[i];
dirs4[Nd-1] = 0; //periodic gauge BC in time
GaugeImplPolicy::setDirections(dirs4); //gauge BC
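//Net effect of the BC setup: the fermions carry the G-parity twist in the directions flagged in GparityDirs
//and are antiperiodic in time, while the gauge links use the conjugate BC in the same spatial directions
//and remain periodic in time.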
//Run optional gauge field checksum checker and exit
if(file_load_check){
TheHMC.initializeGaugeFieldAndRNGs(Ud);
std::cout << GridLogMessage << " Done" << std::endl;
Grid_finalize();
return 0;
}
////////////////////////////////////
// Collect actions
////////////////////////////////////
ActionLevel<HMCWrapper::Field> Level1(1); //light quark + strange quark
ActionLevel<HMCWrapper::Field> Level2(4); //DSDR
ActionLevel<HMCWrapper::Field> Level3(2); //gauge
/////////////////////////////////////////////////////////////
// Light EOFA action
// have to be careful with the parameters, cf. Test_dwf_gpforce_eofa.cc
/////////////////////////////////////////////////////////////
typedef SchurDiagMooeeOperator<EOFAactionD,FermionFieldD> EOFAschuropD;
typedef SchurDiagMooeeOperator<EOFAactionF,FermionFieldF> EOFAschuropF;
typedef ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction<FermionImplPolicyD, FermionImplPolicyF> EOFAmixPrecPFaction;
typedef MixedPrecisionConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_mxCG;
typedef MixedPrecisionReliableUpdateConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_relupCG;
std::vector<RealD> eofa_light_masses = { light_mass , 0.004, 0.016, 0.064, 0.256 };
std::vector<RealD> eofa_pv_masses = { 0.004 , 0.016, 0.064, 0.256, 1.0 };
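//Hasenbusch chain: det D(light_mass)/det D(1.0) is split into five EOFA ratios det D(m_i)/det D(m'_i)
//with (m_i, m'_i) pairs taken from the two lists above, so the intermediate masses 0.004 ... 0.256
//cancel telescopically between neighbouring factors.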
const int n_light_hsb = 5; //const: used as the array dimension below
assert(user_params.eofa_l.size() == n_light_hsb);
EOFAmixPrecPFaction* EOFA_pfactions[n_light_hsb];
for(int i=0;i<n_light_hsb;i++){
RealD iml = eofa_light_masses[i];
RealD ipv = eofa_pv_masses[i];
EOFAactionD* LopD = new EOFAactionD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, iml, iml, ipv, 0.0, -1, M5, mob_b, mob_c, Params);
EOFAactionF* LopF = new EOFAactionF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, iml, iml, ipv, 0.0, -1, M5, mob_b, mob_c, Params);
EOFAactionD* RopD = new EOFAactionD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, ipv, iml, ipv, -1.0, 1, M5, mob_b, mob_c, Params);
EOFAactionF* RopF = new EOFAactionF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, ipv, iml, ipv, -1.0, 1, M5, mob_b, mob_c, Params);
EOFAschuropD* linopL_D = new EOFAschuropD(*LopD);
EOFAschuropD* linopR_D = new EOFAschuropD(*RopD);
EOFAschuropF* linopL_F = new EOFAschuropF(*LopF);
EOFAschuropF* linopR_F = new EOFAschuropF(*RopF);
#if 1
//Note reusing user_params.eofa_l.action(|md)_mixcg_inner_tolerance as Delta for now
EOFA_relupCG* ActionMCG_L = new EOFA_relupCG(user_params.eofa_l[i].action_tolerance, user_params.eofa_l[i].action_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
EOFA_relupCG* ActionMCG_R = new EOFA_relupCG(user_params.eofa_l[i].action_tolerance, user_params.eofa_l[i].action_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
EOFA_relupCG* DerivMCG_L = new EOFA_relupCG(user_params.eofa_l[i].md_tolerance, user_params.eofa_l[i].md_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
EOFA_relupCG* DerivMCG_R = new EOFA_relupCG(user_params.eofa_l[i].md_tolerance, user_params.eofa_l[i].md_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
#else
EOFA_mxCG* ActionMCG_L = new EOFA_mxCG(user_params.eofa_l[i].action_tolerance, 50000, 1000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
ActionMCG_L->InnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance;
EOFA_mxCG* ActionMCG_R = new EOFA_mxCG(user_params.eofa_l[i].action_tolerance, 50000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
ActionMCG_R->InnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance;
EOFA_mxCG* DerivMCG_L = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 50000, 1000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
DerivMCG_L->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance;
EOFA_mxCG* DerivMCG_R = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 50000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
DerivMCG_R->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance;
std::cout << GridLogMessage << "Set EOFA action solver action tolerance outer=" << ActionMCG_L->Tolerance << " inner=" << ActionMCG_L->InnerTolerance << std::endl;
std::cout << GridLogMessage << "Set EOFA MD solver tolerance outer=" << DerivMCG_L->Tolerance << " inner=" << DerivMCG_L->InnerTolerance << std::endl;
#endif
EOFAmixPrecPFaction* EOFA = new EOFAmixPrecPFaction(*LopF, *RopF,
*LopD, *RopD,
*ActionMCG_L, *ActionMCG_R,
*ActionMCG_L, *ActionMCG_R,
*DerivMCG_L, *DerivMCG_R,
user_params.eofa_l[i].rat_params, true);
EOFA_pfactions[i] = EOFA;
Level1.push_back(EOFA);
}
////////////////////////////////////
// Strange action
////////////////////////////////////
FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD,strange_mass,M5,mob_b,mob_c,Params);
FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD, pv_mass,M5,mob_b,mob_c,Params);
FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF,strange_mass,M5,mob_b,mob_c,Params);
FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF, pv_mass,M5,mob_b,mob_c,Params);
RationalActionParams rat_act_params_s;
rat_act_params_s.inv_pow = 4; // (M^dag M)^{1/4}
rat_act_params_s.precision= 60;
rat_act_params_s.MaxIter = 50000;
user_params.rat_quo_s.Export(rat_act_params_s);
std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_s.BoundsCheckFreq << " trajectories (avg)" << std::endl;
//MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq);
DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s);
Level1.push_back(&Quotient_s);
///////////////////////////////////
// DSDR action
///////////////////////////////////
RealD dsdr_mass=-1.8;
//Use same DSDR twists as https://arxiv.org/pdf/1208.4412.pdf
RealD dsdr_epsilon_f = 0.02; //numerator (in determinant)
RealD dsdr_epsilon_b = 0.5;
GparityWilsonTMFermionD Numerator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_f, Params);
GparityWilsonTMFermionF Numerator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_f, Params);
GparityWilsonTMFermionD Denominator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_b, Params);
GparityWilsonTMFermionF Denominator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_b, Params);
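//DSDR (dislocation-suppressing determinant ratio): a ratio of the two twisted-mass Wilson determinants
//constructed above (twists eps_f = 0.02 and eps_b = 0.5), handled by the rational pseudofermion action set up below.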
RationalActionParams rat_act_params_DSDR;
rat_act_params_DSDR.inv_pow = 2; // (M^dag M)^{1/2}
rat_act_params_DSDR.precision= 60;
rat_act_params_DSDR.MaxIter = 50000;
user_params.rat_quo_DSDR.Export(rat_act_params_DSDR);
std::cout << GridLogMessage << "DSDR quark bounds check every " << rat_act_params_DSDR.BoundsCheckFreq << " trajectories (avg)" << std::endl;
DoublePrecRHMC Quotient_DSDR(Denominator_DSDR_D, Numerator_DSDR_D, rat_act_params_DSDR);
Level2.push_back(&Quotient_DSDR);
/////////////////////////////////////////////////////////////
// Gauge action
/////////////////////////////////////////////////////////////
Level3.push_back(&GaugeAction);
TheHMC.TheAction.push_back(Level1);
TheHMC.TheAction.push_back(Level2);
TheHMC.TheAction.push_back(Level3);
std::cout << GridLogMessage << " Action complete "<< std::endl;
//Action tuning
bool
tune_rhmc_s=false, eigenrange_s=false,
tune_rhmc_DSDR=false, eigenrange_DSDR=false,
check_eofa=false,
upper_bound_eofa=false, lower_bound_eofa=false;
std::string lanc_params_s;
std::string lanc_params_DSDR;
int tune_rhmc_s_action_or_md;
int tune_rhmc_DSDR_action_or_md;
int eofa_which_hsb;
for(int i=1;i<argc;i++){
std::string sarg(argv[i]);
if(sarg == "--tune_rhmc_s"){
assert(i < argc-1);
tune_rhmc_s=true;
tune_rhmc_s_action_or_md = std::stoi(argv[i+1]);
}
else if(sarg == "--eigenrange_s"){
assert(i < argc-1);
eigenrange_s=true;
lanc_params_s = argv[i+1];
}
else if(sarg == "--tune_rhmc_DSDR"){
assert(i < argc-1);
tune_rhmc_DSDR=true;
tune_rhmc_DSDR_action_or_md = std::stoi(argv[i+1]);
}
else if(sarg == "--eigenrange_DSDR"){
assert(i < argc-1);
eigenrange_DSDR=true;
lanc_params_DSDR = argv[i+1];
}
else if(sarg == "--check_eofa"){
assert(i < argc-1);
check_eofa = true;
eofa_which_hsb = std::stoi(argv[i+1]); //-1 indicates all Hasenbusch terms
assert(eofa_which_hsb == -1 || (eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb) );
}
else if(sarg == "--upper_bound_eofa"){
assert(i < argc-1);
upper_bound_eofa = true;
eofa_which_hsb = std::stoi(argv[i+1]);
assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
}
else if(sarg == "--lower_bound_eofa"){
assert(i < argc-1);
lower_bound_eofa = true;
eofa_which_hsb = std::stoi(argv[i+1]);
assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
}
}
if(tune_rhmc_s || eigenrange_s || tune_rhmc_DSDR || eigenrange_DSDR ||check_eofa || upper_bound_eofa || lower_bound_eofa) {
std::cout << GridLogMessage << "Running checks" << std::endl;
TheHMC.initializeGaugeFieldAndRNGs(Ud);
//std::cout << GridLogMessage << "EOFA action solver action tolerance outer=" << ActionMCG_L.Tolerance << " inner=" << ActionMCG_L.InnerTolerance << std::endl;
//std::cout << GridLogMessage << "EOFA MD solver tolerance outer=" << DerivMCG_L.Tolerance << " inner=" << DerivMCG_L.InnerTolerance << std::endl;
if(check_eofa){
if(eofa_which_hsb >= 0){
std::cout << GridLogMessage << "Starting checking EOFA Hasenbusch " << eofa_which_hsb << std::endl;
checkEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
std::cout << GridLogMessage << "Finished checking EOFA Hasenbusch " << eofa_which_hsb << std::endl;
}else{
for(int i=0;i<n_light_hsb;i++){
std::cout << GridLogMessage << "Starting checking EOFA Hasenbusch " << i << std::endl;
checkEOFA(*EOFA_pfactions[i], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
std::cout << GridLogMessage << "Finished checking EOFA Hasenbusch " << i << std::endl;
}
}
}
if(upper_bound_eofa) upperBoundEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
if(lower_bound_eofa) lowerBoundEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
if(eigenrange_s) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG());
if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_s)>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange", tune_rhmc_s_action_or_md);
if(eigenrange_DSDR) computeEigenvalues<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField>(lanc_params_DSDR, UGridD, UrbGridD, Ud, Numerator_DSDR_D, TheHMC.Resources.GetParallelRNG());
if(tune_rhmc_DSDR) checkRHMC<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField, decltype(Quotient_DSDR)>(UGridD, UrbGridD, Ud, Numerator_DSDR_D, Denominator_DSDR_D, Quotient_DSDR, TheHMC.Resources.GetParallelRNG(), 2, "DSDR", tune_rhmc_DSDR_action_or_md);
std::cout << GridLogMessage << " Done" << std::endl;
Grid_finalize();
return 0;
}
//Run the HMC
std::cout << GridLogMessage << " Running the HMC "<< std::endl;
TheHMC.Run();
std::cout << GridLogMessage << " Done" << std::endl;
Grid_finalize();
return 0;
} // main
@@ -0,0 +1,873 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./HMC/Mobius2p1fIDSDRGparityEOFA.cc
Copyright (C) 2015-2016
Author: Christopher Kelly <ckelly@bnl.gov>
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace Grid;
//Production binary for the 48ID G-parity ensemble
struct RatQuoParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters,
double, bnd_lo,
double, bnd_hi,
Integer, action_degree,
double, action_tolerance,
Integer, md_degree,
double, md_tolerance,
Integer, reliable_update_freq,
Integer, bnd_check_freq);
RatQuoParameters() {
bnd_lo = 1e-2;
bnd_hi = 30;
action_degree = 10;
action_tolerance = 1e-10;
md_degree = 10;
md_tolerance = 1e-8;
bnd_check_freq = 20;
reliable_update_freq = 50;
}
void Export(RationalActionParams &into) const{
into.lo = bnd_lo;
into.hi = bnd_hi;
into.action_degree = action_degree;
into.action_tolerance = action_tolerance;
into.md_degree = md_degree;
into.md_tolerance = md_tolerance;
into.BoundsCheckFreq = bnd_check_freq;
}
};
struct EOFAparameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(EOFAparameters,
OneFlavourRationalParams, rat_params,
double, action_tolerance,
double, action_mixcg_inner_tolerance,
double, md_tolerance,
double, md_mixcg_inner_tolerance);
EOFAparameters() {
action_mixcg_inner_tolerance = 1e-8;
action_tolerance = 1e-10;
md_tolerance = 1e-8;
md_mixcg_inner_tolerance = 1e-8;
rat_params.lo = 1.0;
rat_params.hi = 25.0;
rat_params.MaxIter = 10000;
rat_params.tolerance= 1.0e-9;
rat_params.degree = 14;
rat_params.precision= 50;
}
};
struct EvolParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters,
Integer, StartTrajectory,
Integer, Trajectories,
Integer, SaveInterval,
Integer, Steps,
RealD, TrajectoryLength,
bool, MetropolisTest,
std::string, StartingType,
std::vector<Integer>, GparityDirs,
std::vector<EOFAparameters>, eofa_l,
RatQuoParameters, rat_quo_s,
RatQuoParameters, rat_quo_DSDR);
EvolParameters() {
//For initial thermalization; afterwards user should switch Metropolis on and use StartingType=CheckpointStart
MetropolisTest = false;
StartTrajectory = 0;
Trajectories = 50;
SaveInterval = 5;
StartingType = "ColdStart";
GparityDirs.resize(3, 1); //1 for G-parity, 0 for periodic
Steps = 5;
TrajectoryLength = 1.0;
}
};
bool fileExists(const std::string &fn){
std::ifstream f(fn);
return f.good();
}
struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
double, alpha,
double, beta,
double, mu,
int, ord,
int, n_stop,
int, n_want,
int, n_use,
double, tolerance);
LanczosParameters() {
alpha = 35;
beta = 5;
mu = 0;
ord = 100;
n_stop = 10;
n_want = 10;
n_use = 15;
tolerance = 1e-6;
}
};
template<typename FermionActionD, typename FermionFieldD>
void computeEigenvalues(std::string param_file,
GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt, //expect lattice to have been initialized to something
FermionActionD &action, GridParallelRNG &rng){
LanczosParameters params;
if(fileExists(param_file)){
std::cout << GridLogMessage << " Reading " << param_file << std::endl;
Grid::XmlReader rd(param_file);
read(rd, "LanczosParameters", params);
}else if(!GlobalSharedMemory::WorldRank){
std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
Grid::XmlWriter wr(param_file + ".templ");
write(wr, "LanczosParameters", params);
}
FermionFieldD gauss_o(rbGrid);
FermionFieldD gauss(Grid);
gaussian(rng, gauss);
pickCheckerboard(Odd, gauss_o, gauss);
action.ImportGauge(latt);
SchurDiagMooeeOperator<FermionActionD, FermionFieldD> hermop(action);
PlainHermOp<FermionFieldD> hermop_wrap(hermop);
//ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
assert(params.mu == 0.0);
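//Chebyshev-accelerated IRL: the polynomial window is [beta^2, alpha^2] (with the default alpha=35, beta=5
//this is [25, 1225]), which amplifies modes of MdagM below beta^2 so the Lanczos converges to the low end
//of the spectrum; alpha should be chosen above sqrt(lambda_max) of the operator.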
Chebyshev<FermionFieldD> Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1);
FunctionHermOp<FermionFieldD> Cheb_wrap(Cheb, hermop);
std::cout << "IRL: alpha=" << params.alpha << " beta=" << params.beta << " mu=" << params.mu << " ord=" << params.ord << std::endl;
ImplicitlyRestartedLanczos<FermionFieldD> IRL(Cheb_wrap, hermop_wrap, params.n_stop, params.n_want, params.n_use, params.tolerance, 10000);
std::vector<RealD> eval(params.n_use);
std::vector<FermionFieldD> evec(params.n_use, rbGrid);
int Nconv;
IRL.calc(eval, evec, gauss_o, Nconv);
std::cout << "Eigenvalues:" << std::endl;
for(int i=0;i<params.n_want;i++){
std::cout << i << " " << eval[i] << std::endl;
}
}
//Check the quality of the RHMC approx
//action_or_md toggles checking the action (0), MD (1) or both (2) setups
template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt, //expect lattice to have been initialized to something
FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
int inv_pow, const std::string &quark_descr, int action_or_md){
assert(action_or_md == 0 || action_or_md == 1 || action_or_md == 2);
FermionFieldD gauss_o(rbGrid);
FermionFieldD gauss(Grid);
gaussian(rng, gauss);
pickCheckerboard(Odd, gauss_o, gauss);
numOp.ImportGauge(latt);
denOp.ImportGauge(latt);
typedef typename FermionActionD::Impl_t FermionImplPolicyD;
SchurDifferentiableOperator<FermionImplPolicyD> MdagM(numOp);
SchurDifferentiableOperator<FermionImplPolicyD> VdagV(denOp);
PowerMethod<FermionFieldD> power_method;
RealD lambda_max;
std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " numerator" << std::endl;
lambda_max = power_method(MdagM,gauss_o);
std::cout << GridLogMessage << "Got lambda_max "<<lambda_max<<std::endl;
std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " denominator" << std::endl;
lambda_max = power_method(VdagV,gauss_o);
std::cout << GridLogMessage << "Got lambda_max "<<lambda_max<<std::endl;
if(action_or_md == 0 || action_or_md == 2){
std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegPowerAction); //use large tolerance to prevent exit on fail; we are trying to tune here!
std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegHalfPowerAction);
std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegPowerAction);
std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
std::cout << "Starting: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegHalfPowerAction);
std::cout << "Finished: Checking quality of RHMC action approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
}
std::cout << "-------------------------------------------------------------------------------" << std::endl;
if(action_or_md == 1 || action_or_md == 2){
std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
InversePowerBoundsCheck(inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegPowerMD);
std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << inv_pow << std::endl;
std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, MdagM,gauss_o, rhmc.ApproxNegHalfPowerMD);
std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark numerator and power -1/" << 2*inv_pow << std::endl;
std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
InversePowerBoundsCheck(inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegPowerMD);
std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << inv_pow << std::endl;
std::cout << "Starting: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
InversePowerBoundsCheck(2*inv_pow, 10000, 1e16, VdagV,gauss_o, rhmc.ApproxNegHalfPowerMD);
std::cout << "Finished: Checking quality of RHMC MD approx for " << quark_descr << " quark denominator and power -1/" << 2*inv_pow << std::endl;
}
}
template<typename FermionImplPolicy>
void checkEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
GridCartesian* FGrid, GridParallelRNG &rng, const LatticeGaugeFieldD &latt){
std::cout << GridLogMessage << "Starting EOFA action/bounds check" << std::endl;
typename FermionImplPolicy::FermionField eta(FGrid);
RealD scale = std::sqrt(0.5);
gaussian(rng,eta); eta = eta * scale;
//Use the inbuilt check
EOFA.refresh(latt, eta);
EOFA.S(latt);
std::cout << GridLogMessage << "Finished EOFA upper action/bounds check" << std::endl;
}
template<typename FermionImplPolicy>
class EOFAlinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA;
LatticeGaugeFieldD &U;
public:
EOFAlinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA, LatticeGaugeFieldD &U): EOFA(EOFA), U(U){}
typedef typename FermionImplPolicy::FermionField Field;
void OpDiag (const Field &in, Field &out){ assert(0); }
void OpDir (const Field &in, Field &out,int dir,int disp){ assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); }
void Op (const Field &in, Field &out){ assert(0); }
void AdjOp (const Field &in, Field &out){ assert(0); }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
void HermOp(const Field &in, Field &out){ EOFA.Meofa(U, in, out); }
};
template<typename FermionImplPolicy>
void upperBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
std::cout << GridLogMessage << "Starting EOFA upper bound compute" << std::endl;
EOFAlinop<FermionImplPolicy> linop(EOFA, latt);
typename FermionImplPolicy::FermionField eta(FGrid);
gaussian(rng,eta);
PowerMethod<typename FermionImplPolicy::FermionField> power_method;
auto lambda_max = power_method(linop,eta);
std::cout << GridLogMessage << "Upper bound of EOFA operator " << lambda_max << std::endl;
}
//Applications of M^{-1} cost the same as M for EOFA!
template<typename FermionImplPolicy>
class EOFAinvLinop: public LinearOperatorBase<typename FermionImplPolicy::FermionField>{
ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA;
LatticeGaugeFieldD &U;
public:
EOFAinvLinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA, LatticeGaugeFieldD &U): EOFA(EOFA), U(U){}
typedef typename FermionImplPolicy::FermionField Field;
void OpDiag (const Field &in, Field &out){ assert(0); }
void OpDir (const Field &in, Field &out,int dir,int disp){ assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); }
void Op (const Field &in, Field &out){ assert(0); }
void AdjOp (const Field &in, Field &out){ assert(0); }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
void HermOp(const Field &in, Field &out){ EOFA.MeofaInv(U, in, out); }
};
template<typename FermionImplPolicy>
void lowerBoundEOFA(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA,
GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){
std::cout << GridLogMessage << "Starting EOFA lower bound compute using power method on M^{-1}. Inverse of highest eigenvalue is the lowest eigenvalue of M" << std::endl;
EOFAinvLinop<FermionImplPolicy> linop(EOFA, latt);
typename FermionImplPolicy::FermionField eta(FGrid);
gaussian(rng,eta);
PowerMethod<typename FermionImplPolicy::FermionField> power_method;
auto lambda_max = power_method(linop,eta);
std::cout << GridLogMessage << "Lower bound of EOFA operator " << 1./lambda_max << std::endl;
}
NAMESPACE_BEGIN(Grid);
template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class SchurOperatorF>
class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
public:
typedef typename FermionOperatorD::FermionField FieldD;
typedef typename FermionOperatorF::FermionField FieldF;
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;
Integer MaxOuterIterations;
GridBase* SinglePrecGrid4; //Grid for single-precision fields
GridBase* SinglePrecGrid5; //Grid for single-precision fields
RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
FermionOperatorF &FermOpF;
FermionOperatorD &FermOpD;
SchurOperatorF &LinOpF;
SchurOperatorD &LinOpD;
Integer TotalInnerIterations; //Number of inner CG iterations
Integer TotalOuterIterations; //Number of restarts
Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
MixedPrecisionConjugateGradientOperatorFunction(RealD tol,
Integer maxinnerit,
Integer maxouterit,
GridBase* _sp_grid4,
GridBase* _sp_grid5,
FermionOperatorF &_FermOpF,
FermionOperatorD &_FermOpD,
SchurOperatorF &_LinOpF,
SchurOperatorD &_LinOpD):
LinOpF(_LinOpF),
LinOpD(_LinOpD),
FermOpF(_FermOpF),
FermOpD(_FermOpD),
Tolerance(tol),
InnerTolerance(tol),
MaxInnerIterations(maxinnerit),
MaxOuterIterations(maxouterit),
SinglePrecGrid4(_sp_grid4),
SinglePrecGrid5(_sp_grid5),
OuterLoopNormMult(100.)
{
};
void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
precisionChange(FermOpF.Umu, FermOpD.Umu);
pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
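//The single-precision gauge field is refreshed from the current double-precision links on every call,
//so the inner single-precision solves always act on the up-to-date configuration.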
////////////////////////////////////////////////////////////////////////////////////
// Make a mixed precision conjugate gradient
////////////////////////////////////////////////////////////////////////////////////
MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
MPCG.InnerTolerance = InnerTolerance;
std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
MPCG(src,psi);
}
};
template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class SchurOperatorF>
class MixedPrecisionReliableUpdateConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
public:
typedef typename FermionOperatorD::FermionField FieldD;
typedef typename FermionOperatorF::FermionField FieldF;
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
Integer MaxIterations;
RealD Delta; //reliable update parameter
GridBase* SinglePrecGrid4; //Grid for single-precision fields
GridBase* SinglePrecGrid5; //Grid for single-precision fields
FermionOperatorF &FermOpF;
FermionOperatorD &FermOpD;
SchurOperatorF &LinOpF;
SchurOperatorD &LinOpD;
MixedPrecisionReliableUpdateConjugateGradientOperatorFunction(RealD tol,
RealD delta,
Integer maxit,
GridBase* _sp_grid4,
GridBase* _sp_grid5,
FermionOperatorF &_FermOpF,
FermionOperatorD &_FermOpD,
SchurOperatorF &_LinOpF,
SchurOperatorD &_LinOpD):
LinOpF(_LinOpF),
LinOpD(_LinOpD),
FermOpF(_FermOpF),
FermOpD(_FermOpD),
Tolerance(tol),
Delta(delta),
MaxIterations(maxit),
SinglePrecGrid4(_sp_grid4),
SinglePrecGrid5(_sp_grid5)
{
};
void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
std::cout << GridLogMessage << " Mixed precision reliable CG update wrapper operator() "<<std::endl;
SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
precisionChange(FermOpF.Umu, FermOpD.Umu);
pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
////////////////////////////////////////////////////////////////////////////////////
// Make a mixed precision conjugate gradient
////////////////////////////////////////////////////////////////////////////////////
ConjugateGradientReliableUpdate<FieldD,FieldF> MPCG(Tolerance,MaxIterations,Delta,SinglePrecGrid5,LinOpF,LinOpD);
std::cout << GridLogMessage << "Calling mixed precision reliable update Conjugate Gradient" <<std::endl;
MPCG(src,psi);
}
};
NAMESPACE_END(Grid);
int main(int argc, char **argv) {
Grid_init(&argc, &argv);
int threads = GridThread::GetThreads();
// here make a routine to print all the relevant information on the run
std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
std::string param_file = "params.xml";
bool file_load_check = false;
for(int i=1;i<argc;i++){
std::string sarg(argv[i]);
if(sarg == "--param_file"){
assert(i!=argc-1);
param_file = argv[i+1];
}else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
file_load_check = true;
}
}
//Read the user parameters
EvolParameters user_params;
if(fileExists(param_file)){
std::cout << GridLogMessage << " Reading " << param_file << std::endl;
Grid::XmlReader rd(param_file);
read(rd, "Params", user_params);
}else if(!GlobalSharedMemory::WorldRank){
std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl;
std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl;
{
Grid::XmlWriter wr(param_file + ".templ");
write(wr, "Params", user_params);
}
std::cout << GridLogMessage << " Done" << std::endl;
Grid_finalize();
return 0;
}
//Check the parameters
if(user_params.GparityDirs.size() != Nd-1){
std::cerr << "Error in input parameters: expect GparityDirs to have size = " << Nd-1 << std::endl;
exit(1);
}
for(int i=0;i<Nd-1;i++)
if(user_params.GparityDirs[i] != 0 && user_params.GparityDirs[i] != 1){
std::cerr << "Error in input parameters: expect GparityDirs values to be 0 (periodic) or 1 (G-parity)" << std::endl;
exit(1);
}
typedef GparityMobiusEOFAFermionD EOFAactionD;
typedef GparityMobiusFermionD FermionActionD;
typedef typename FermionActionD::Impl_t FermionImplPolicyD;
typedef typename FermionActionD::FermionField FermionFieldD;
typedef GparityMobiusEOFAFermionF EOFAactionF;
typedef GparityMobiusFermionF FermionActionF;
typedef typename FermionActionF::Impl_t FermionImplPolicyF;
typedef typename FermionActionF::FermionField FermionFieldF;
typedef GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicyD,FermionImplPolicyF> MixedPrecRHMC;
typedef GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicyD> DoublePrecRHMC;
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
IntegratorParameters MD;
typedef ConjugateHMCRunnerD<MinimumNorm2> HMCWrapper; //NB: This is the "Omelyan integrator"
typedef HMCWrapper::ImplPolicy GaugeImplPolicy;
MD.name = std::string("MinimumNorm2");
MD.MDsteps = user_params.Steps;
MD.trajL = user_params.TrajectoryLength;
HMCparameters HMCparams;
HMCparams.StartTrajectory = user_params.StartTrajectory;
HMCparams.Trajectories = user_params.Trajectories;
HMCparams.NoMetropolisUntil= 0;
HMCparams.StartingType = user_params.StartingType;
HMCparams.MetropolisTest = user_params.MetropolisTest;
HMCparams.MD = MD;
HMCWrapper TheHMC(HMCparams);
// Grid from the command line arguments --grid and --mpi
TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
CheckpointerParameters CPparams;
CPparams.config_prefix = "ckpoint_lat";
CPparams.rng_prefix = "ckpoint_rng";
CPparams.saveInterval = user_params.SaveInterval;
CPparams.format = "IEEE64BIG";
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
//Note that checkpointing saves the RNG state so that this initialization is required only for the very first configuration
RNGModuleParameters RNGpar;
RNGpar.serial_seeds = "1 2 3 4 5";
RNGpar.parallel_seeds = "6 7 8 9 10";
TheHMC.Resources.SetRNGSeeds(RNGpar);
typedef PlaquetteMod<GaugeImplPolicy> PlaqObs;
TheHMC.Resources.AddObservable<PlaqObs>();
//////////////////////////////////////////////
//aiming for ainv=2.068 (estimate columns below: mine | Bob's)
//Estimated a(ml+mres) [48ID] = 0.001048 0.00104
// a(mh+mres) [48ID] = 0.028847 0.02805
//Estimate Ls=12, b+c=2 mres~0.0003
const int Ls = 12;
Real beta = 1.946;
Real light_mass = 0.00074; //0.00104 - mres_approx;
Real strange_mass = 0.02775; //0.02805 - mres_approx
Real pv_mass = 1.0;
RealD M5 = 1.8;
RealD mobius_scale = 2.; //b+c
RealD mob_bmc = 1.0;
RealD mob_b = (mobius_scale + mob_bmc)/2.;
RealD mob_c = (mobius_scale - mob_bmc)/2.;
//Setup the Grids
auto UGridD = TheHMC.Resources.GetCartesian();
auto UrbGridD = TheHMC.Resources.GetRBCartesian();
auto FGridD = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);
auto FrbGridD = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);
GridCartesian* UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
GridRedBlackCartesian* UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
auto FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
auto FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
ConjugateIwasakiGaugeActionD GaugeAction(beta);
// temporarily need a gauge field
LatticeGaugeFieldD Ud(UGridD);
LatticeGaugeFieldF Uf(UGridF);
//Setup the BCs
FermionActionD::ImplParams Params;
for(int i=0;i<Nd-1;i++) Params.twists[i] = user_params.GparityDirs[i]; //G-parity directions
Params.twists[Nd-1] = 1; //APBC in time direction
std::vector<int> dirs4(Nd);
for(int i=0;i<Nd-1;i++) dirs4[i] = user_params.GparityDirs[i];
dirs4[Nd-1] = 0; //periodic gauge BC in time
GaugeImplPolicy::setDirections(dirs4); //gauge BC
//Run optional gauge field checksum checker and exit
if(file_load_check){
TheHMC.initializeGaugeFieldAndRNGs(Ud);
std::cout << GridLogMessage << " Done" << std::endl;
Grid_finalize();
return 0;
}
////////////////////////////////////
// Collect actions
////////////////////////////////////
ActionLevel<HMCWrapper::Field> Level1(1); //light quark + strange quark
ActionLevel<HMCWrapper::Field> Level2(4); //DSDR
ActionLevel<HMCWrapper::Field> Level3(2); //gauge
/////////////////////////////////////////////////////////////
// Light EOFA action
// have to be careful with the parameters, cf. Test_dwf_gpforce_eofa.cc
/////////////////////////////////////////////////////////////
typedef SchurDiagMooeeOperator<EOFAactionD,FermionFieldD> EOFAschuropD;
typedef SchurDiagMooeeOperator<EOFAactionF,FermionFieldF> EOFAschuropF;
typedef ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction<FermionImplPolicyD, FermionImplPolicyF> EOFAmixPrecPFaction;
typedef MixedPrecisionConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_mxCG;
typedef MixedPrecisionReliableUpdateConjugateGradientOperatorFunction<EOFAactionD, EOFAactionF, EOFAschuropD, EOFAschuropF> EOFA_relupCG;
std::vector<RealD> eofa_light_masses = { light_mass , 0.004, 0.016, 0.064, 0.256 };
std::vector<RealD> eofa_pv_masses = { 0.004 , 0.016, 0.064, 0.256, 1.0 };
const int n_light_hsb = 5; //const: used as the array dimension below
assert(user_params.eofa_l.size() == n_light_hsb);
EOFAmixPrecPFaction* EOFA_pfactions[n_light_hsb];
for(int i=0;i<n_light_hsb;i++){
RealD iml = eofa_light_masses[i];
RealD ipv = eofa_pv_masses[i];
EOFAactionD* LopD = new EOFAactionD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, iml, iml, ipv, 0.0, -1, M5, mob_b, mob_c, Params);
EOFAactionF* LopF = new EOFAactionF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, iml, iml, ipv, 0.0, -1, M5, mob_b, mob_c, Params);
EOFAactionD* RopD = new EOFAactionD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, ipv, iml, ipv, -1.0, 1, M5, mob_b, mob_c, Params);
EOFAactionF* RopF = new EOFAactionF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, ipv, iml, ipv, -1.0, 1, M5, mob_b, mob_c, Params);
EOFAschuropD* linopL_D = new EOFAschuropD(*LopD);
EOFAschuropD* linopR_D = new EOFAschuropD(*RopD);
EOFAschuropF* linopL_F = new EOFAschuropF(*LopF);
EOFAschuropF* linopR_F = new EOFAschuropF(*RopF);
#if 1
//Note reusing user_params.eofa_l.action(|md)_mixcg_inner_tolerance as Delta for now
EOFA_relupCG* ActionMCG_L = new EOFA_relupCG(user_params.eofa_l[i].action_tolerance, user_params.eofa_l[i].action_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
EOFA_relupCG* ActionMCG_R = new EOFA_relupCG(user_params.eofa_l[i].action_tolerance, user_params.eofa_l[i].action_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
EOFA_relupCG* DerivMCG_L = new EOFA_relupCG(user_params.eofa_l[i].md_tolerance, user_params.eofa_l[i].md_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
EOFA_relupCG* DerivMCG_R = new EOFA_relupCG(user_params.eofa_l[i].md_tolerance, user_params.eofa_l[i].md_mixcg_inner_tolerance, 50000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
#else
EOFA_mxCG* ActionMCG_L = new EOFA_mxCG(user_params.eofa_l[i].action_tolerance, 10000, 1000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
ActionMCG_L->InnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance;
EOFA_mxCG* ActionMCG_R = new EOFA_mxCG(user_params.eofa_l[i].action_tolerance, 10000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
ActionMCG_R->InnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance;
EOFA_mxCG* DerivMCG_L = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 10000, 1000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D);
DerivMCG_L->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance;
EOFA_mxCG* DerivMCG_R = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 10000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D);
DerivMCG_R->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance;
std::cout << GridLogMessage << "Set EOFA action solver action tolerance outer=" << ActionMCG_L->Tolerance << " inner=" << ActionMCG_L->InnerTolerance << std::endl;
std::cout << GridLogMessage << "Set EOFA MD solver tolerance outer=" << DerivMCG_L->Tolerance << " inner=" << DerivMCG_L->InnerTolerance << std::endl;
#endif
EOFAmixPrecPFaction* EOFA = new EOFAmixPrecPFaction(*LopF, *RopF,
*LopD, *RopD,
*ActionMCG_L, *ActionMCG_R,
*ActionMCG_L, *ActionMCG_R,
*DerivMCG_L, *DerivMCG_R,
user_params.eofa_l[i].rat_params, true);
EOFA_pfactions[i] = EOFA;
Level1.push_back(EOFA);
}
////////////////////////////////////
// Strange action
////////////////////////////////////
FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD,strange_mass,M5,mob_b,mob_c,Params);
FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD, pv_mass,M5,mob_b,mob_c,Params);
FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF,strange_mass,M5,mob_b,mob_c,Params);
FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF, pv_mass,M5,mob_b,mob_c,Params);
RationalActionParams rat_act_params_s;
rat_act_params_s.inv_pow = 4; // (M^dag M)^{1/4}
rat_act_params_s.precision= 60;
rat_act_params_s.MaxIter = 10000;
user_params.rat_quo_s.Export(rat_act_params_s);
std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_s.BoundsCheckFreq << " trajectories (avg)" << std::endl;
//MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq);
DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s);
Level1.push_back(&Quotient_s);
///////////////////////////////////
// DSDR action
///////////////////////////////////
RealD dsdr_mass=-1.8;
//Use same DSDR twists as https://arxiv.org/pdf/1208.4412.pdf
RealD dsdr_epsilon_f = 0.02; //numerator (in determinant)
RealD dsdr_epsilon_b = 0.5;
GparityWilsonTMFermionD Numerator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_f, Params);
GparityWilsonTMFermionF Numerator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_f, Params);
GparityWilsonTMFermionD Denominator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_b, Params);
GparityWilsonTMFermionF Denominator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_b, Params);
RationalActionParams rat_act_params_DSDR;
rat_act_params_DSDR.inv_pow = 2; // (M^dag M)^{1/2}
rat_act_params_DSDR.precision= 60;
rat_act_params_DSDR.MaxIter = 10000;
user_params.rat_quo_DSDR.Export(rat_act_params_DSDR);
std::cout << GridLogMessage << "DSDR quark bounds check every " << rat_act_params_DSDR.BoundsCheckFreq << " trajectories (avg)" << std::endl;
DoublePrecRHMC Quotient_DSDR(Denominator_DSDR_D, Numerator_DSDR_D, rat_act_params_DSDR);
Level2.push_back(&Quotient_DSDR);
/////////////////////////////////////////////////////////////
// Gauge action
/////////////////////////////////////////////////////////////
Level3.push_back(&GaugeAction);
TheHMC.TheAction.push_back(Level1);
TheHMC.TheAction.push_back(Level2);
TheHMC.TheAction.push_back(Level3);
std::cout << GridLogMessage << " Action complete "<< std::endl;
//Action tuning
bool
tune_rhmc_s=false, eigenrange_s=false,
tune_rhmc_DSDR=false, eigenrange_DSDR=false,
check_eofa=false,
upper_bound_eofa=false, lower_bound_eofa=false;
std::string lanc_params_s;
std::string lanc_params_DSDR;
int tune_rhmc_s_action_or_md;
int tune_rhmc_DSDR_action_or_md;
int eofa_which_hsb;
for(int i=1;i<argc;i++){
std::string sarg(argv[i]);
if(sarg == "--tune_rhmc_s"){
assert(i < argc-1);
tune_rhmc_s=true;
tune_rhmc_s_action_or_md = std::stoi(argv[i+1]);
}
else if(sarg == "--eigenrange_s"){
assert(i < argc-1);
eigenrange_s=true;
lanc_params_s = argv[i+1];
}
else if(sarg == "--tune_rhmc_DSDR"){
assert(i < argc-1);
tune_rhmc_DSDR=true;
tune_rhmc_DSDR_action_or_md = std::stoi(argv[i+1]);
}
else if(sarg == "--eigenrange_DSDR"){
assert(i < argc-1);
eigenrange_DSDR=true;
lanc_params_DSDR = argv[i+1];
}
else if(sarg == "--check_eofa"){
assert(i < argc-1);
check_eofa = true;
eofa_which_hsb = std::stoi(argv[i+1]); //-1 indicates all Hasenbusch terms
assert(eofa_which_hsb == -1 || (eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb) );
}
else if(sarg == "--upper_bound_eofa"){
assert(i < argc-1);
upper_bound_eofa = true;
eofa_which_hsb = std::stoi(argv[i+1]);
assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
}
else if(sarg == "--lower_bound_eofa"){
assert(i < argc-1);
lower_bound_eofa = true;
eofa_which_hsb = std::stoi(argv[i+1]);
assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
}
}
if(tune_rhmc_s || eigenrange_s || tune_rhmc_DSDR || eigenrange_DSDR ||check_eofa || upper_bound_eofa || lower_bound_eofa) {
std::cout << GridLogMessage << "Running checks" << std::endl;
TheHMC.initializeGaugeFieldAndRNGs(Ud);
//std::cout << GridLogMessage << "EOFA action solver action tolerance outer=" << ActionMCG_L.Tolerance << " inner=" << ActionMCG_L.InnerTolerance << std::endl;
//std::cout << GridLogMessage << "EOFA MD solver tolerance outer=" << DerivMCG_L.Tolerance << " inner=" << DerivMCG_L.InnerTolerance << std::endl;
if(check_eofa){
if(eofa_which_hsb >= 0){
std::cout << GridLogMessage << "Starting checking EOFA Hasenbusch " << eofa_which_hsb << std::endl;
checkEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
std::cout << GridLogMessage << "Finished checking EOFA Hasenbusch " << eofa_which_hsb << std::endl;
}else{
for(int i=0;i<n_light_hsb;i++){
std::cout << GridLogMessage << "Starting checking EOFA Hasenbusch " << i << std::endl;
checkEOFA(*EOFA_pfactions[i], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
std::cout << GridLogMessage << "Finished checking EOFA Hasenbusch " << i << std::endl;
}
}
}
if(upper_bound_eofa) upperBoundEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
if(lower_bound_eofa) lowerBoundEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud);
if(eigenrange_s) computeEigenvalues<FermionActionD, FermionFieldD>(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG());
if(tune_rhmc_s) checkRHMC<FermionActionD, FermionFieldD, decltype(Quotient_s)>(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange", tune_rhmc_s_action_or_md);
if(eigenrange_DSDR) computeEigenvalues<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField>(lanc_params_DSDR, UGridD, UrbGridD, Ud, Numerator_DSDR_D, TheHMC.Resources.GetParallelRNG());
if(tune_rhmc_DSDR) checkRHMC<GparityWilsonTMFermionD, GparityWilsonTMFermionD::FermionField, decltype(Quotient_DSDR)>(UGridD, UrbGridD, Ud, Numerator_DSDR_D, Denominator_DSDR_D, Quotient_DSDR, TheHMC.Resources.GetParallelRNG(), 2, "DSDR", tune_rhmc_DSDR_action_or_md);
std::cout << GridLogMessage << " Done" << std::endl;
Grid_finalize();
return 0;
}
//Run the HMC
std::cout << GridLogMessage << " Running the HMC "<< std::endl;
TheHMC.Run();
std::cout << GridLogMessage << " Done" << std::endl;
Grid_finalize();
return 0;
} // main
@@ -102,7 +102,7 @@ int main(int argc, char **argv) {
SFRp.hi = 30.0;
SFRp.MaxIter = 10000;
SFRp.tolerance= 1.0e-8;
-SFRp.mdtolerance= 1.0e-6;
+SFRp.mdtolerance= 1.0e-5;
SFRp.degree = 16;
SFRp.precision= 50;
SFRp.BoundsCheckFreq=5;
@@ -112,7 +112,7 @@ int main(int argc, char **argv) {
OFRp.hi = 30.0;
OFRp.MaxIter = 10000;
OFRp.tolerance= 1.0e-8;
-OFRp.mdtolerance= 1.0e-6;
+OFRp.mdtolerance= 1.0e-5;
OFRp.degree = 16;
OFRp.precision= 50;
OFRp.BoundsCheckFreq=5;
@@ -162,15 +162,17 @@ int main(int argc, char **argv) {
FermionAction::ImplParams Params(boundary);
double StoppingCondition = 1e-8;
+double MDStoppingCondition = 1e-6;
double MaxCGIterations = 30000;
ConjugateGradient<FermionField> CG(StoppingCondition,MaxCGIterations);
+ConjugateGradient<FermionField> MDCG(MDStoppingCondition,MaxCGIterations);
////////////////////////////////////
// Collect actions
////////////////////////////////////
ActionLevel<HMCWrapper::Field> Level1(1);
ActionLevel<HMCWrapper::Field> Level2(4);
-ActionLevel<HMCWrapper::Field> Level3(6);
+ActionLevel<HMCWrapper::Field> Level3(8);
////////////////////////////////////
// Strange action
@@ -226,7 +228,7 @@ int main(int argc, char **argv) {
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
if(h!=0) {
-Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
+Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],MDCG,CG));
} else {
Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
@@ -241,7 +243,7 @@ int main(int argc, char **argv) {
for(int h=0;h<nquo-1;h++){
Level2.push_back(Quotients[h]);
}
-Level1.push_back(Quotients[nquo-1]); // PV dirichlet fix on coarse timestep
+Level2.push_back(Quotients[nquo-1]);
/////////////////////////////////////////////////////////////
// Gauge action
@@ -0,0 +1,419 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_hmc_EODWFRatio.cc
Copyright (C) 2015-2016
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
int main(int argc, char **argv) {
using namespace Grid;
Grid_init(&argc, &argv);
int threads = GridThread::GetThreads();
// Typedefs to simplify notation
typedef WilsonImplR FermionImplPolicy;
typedef MobiusFermionR FermionAction;
typedef typename FermionAction::FermionField FermionField;
typedef Grid::XmlReader Serialiser;
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
IntegratorParameters MD;
// typedef GenericHMCRunner<LeapFrog> HMCWrapper;
// MD.name = std::string("Leap Frog");
// typedef GenericHMCRunner<ForceGradient> HMCWrapper;
// MD.name = std::string("Force Gradient");
typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
MD.name = std::string("MinimumNorm2");
MD.MDsteps = 6;
MD.trajL = 1.0;
HMCparameters HMCparams;
HMCparams.StartTrajectory = 1077;
HMCparams.Trajectories = 1;
HMCparams.NoMetropolisUntil= 0;
// "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
// HMCparams.StartingType =std::string("ColdStart");
HMCparams.StartingType =std::string("CheckpointStart");
HMCparams.MD = MD;
HMCWrapper TheHMC(HMCparams);
// Grid from the command line arguments --grid and --mpi
TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
CheckpointerParameters CPparams;
CPparams.config_prefix = "ckpoint_DDHMC_lat";
CPparams.rng_prefix = "ckpoint_DDHMC_rng";
CPparams.saveInterval = 1;
CPparams.format = "IEEE64BIG";
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
RNGModuleParameters RNGpar;
RNGpar.serial_seeds = "1 2 3 4 5";
RNGpar.parallel_seeds = "6 7 8 9 10";
TheHMC.Resources.SetRNGSeeds(RNGpar);
// Construct observables
// here there is too much indirection
typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
TheHMC.Resources.AddObservable<PlaqObs>();
//////////////////////////////////////////////
const int Ls = 12;
RealD M5 = 1.8;
RealD b = 1.5;
RealD c = 0.5;
// Real beta = 2.31;
// Real light_mass = 5.4e-4;
Real beta = 2.13;
Real light_mass = 7.8e-4;
Real strange_mass = 0.02132;
Real pv_mass = 1.0;
// std::vector<Real> hasenbusch({ light_mass, 3.8e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass });
std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass });
// FIXME:
// Same in MC and MD
// Need to mix precision too
OneFlavourRationalParams SFRp; // Strange
SFRp.lo = 4.0e-3;
SFRp.hi = 90.0;
SFRp.MaxIter = 60000;
SFRp.tolerance= 1.0e-8;
SFRp.mdtolerance= 1.0e-4;
SFRp.degree = 12;
SFRp.precision= 50;
SFRp.BoundsCheckFreq=0;
OneFlavourRationalParams OFRp; // Up/down
OFRp.lo = 2.0e-5;
OFRp.hi = 90.0;
OFRp.MaxIter = 60000;
OFRp.tolerance= 1.0e-7;
OFRp.mdtolerance= 1.0e-4;
// OFRp.degree = 20; converges
// OFRp.degree = 16;
OFRp.degree = 12;
OFRp.precision= 80;
OFRp.BoundsCheckFreq=0;
auto GridPtr = TheHMC.Resources.GetCartesian();
auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
////////////////////////////////////////////////////////////////
// Domain decomposed
////////////////////////////////////////////////////////////////
Coordinate latt4 = GridPtr->GlobalDimensions();
Coordinate mpi = GridPtr->ProcessorGrid();
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi,shm);
Coordinate CommDim(Nd);
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
Coordinate NonDirichlet(Nd+1,0);
Coordinate Dirichlet(Nd+1,0);
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
Coordinate Block4(Nd);
// Block4[0] = Dirichlet[1];
// Block4[1] = Dirichlet[2];
// Block4[2] = Dirichlet[3];
Block4[0] = 0;
Block4[1] = 0;
Block4[2] = 0;
Block4[3] = Dirichlet[4];
int Width=3;
TheHMC.Resources.SetMomentumFilter(new DDHMCFilter<WilsonImplR::Field>(Block4,Width));
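// Reading of the block setup above: Dirichlet[mu+1] is the per-node sub-lattice extent in each 4d
// direction mu with off-node communication (zero otherwise), so the Dirichlet cut falls on node
// boundaries; Block4 keeps only the time-direction block, and the DDHMC momentum filter freezes
// the MD momenta in a layer of Width=3 around those block boundaries.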
//////////////////////////
// Fermion Grid
//////////////////////////
auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
IwasakiGaugeActionR GaugeAction(beta);
// temporarily need a gauge field
LatticeGaugeField U(GridPtr);
std::cout << GridLogMessage << " Running the HMC "<< std::endl;
TheHMC.ReadCommandLine(argc,argv); // parameters from the command line or a parameter file
TheHMC.initializeGaugeFieldAndRNGs(U);
// These lines are unnecessary if the boundary conditions are all periodic
std::vector<Complex> boundary = {1,1,1,-1};
FermionAction::ImplParams Params(boundary);
Params.dirichlet=NonDirichlet;
FermionAction::ImplParams ParamsDir(boundary);
ParamsDir.dirichlet=Dirichlet;
// double StoppingCondition = 1e-14;
// double MDStoppingCondition = 1e-9;
double StoppingCondition = 1e-8;
double MDStoppingCondition = 1e-6;
double MaxCGIterations = 300000;
ConjugateGradient<FermionField> CG(StoppingCondition,MaxCGIterations);
ConjugateGradient<FermionField> MDCG(MDStoppingCondition,MaxCGIterations);
////////////////////////////////////
// Collect actions
////////////////////////////////////
ActionLevel<HMCWrapper::Field> Level1(1);
ActionLevel<HMCWrapper::Field> Level2(4);
ActionLevel<HMCWrapper::Field> Level3(8);
////////////////////////////////////
// Strange action
////////////////////////////////////
FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params);
FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, Params);
FermionAction StrangeOpDir (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, ParamsDir);
FermionAction StrangePauliVillarsOpDir(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, ParamsDir);
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionBdy(StrangeOpDir,StrangeOp,SFRp);
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionLocal(StrangePauliVillarsOpDir,StrangeOpDir,SFRp);
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionPVBdy(StrangePauliVillarsOp,StrangePauliVillarsOpDir,SFRp);
Level1.push_back(&StrangePseudoFermionBdy);
Level2.push_back(&StrangePseudoFermionLocal);
Level1.push_back(&StrangePseudoFermionPVBdy);
////////////////////////////////////
// up down action
////////////////////////////////////
std::vector<Real> light_den;
std::vector<Real> light_num;
std::vector<int> dirichlet_den;
std::vector<int> dirichlet_num;
int n_hasenbusch = hasenbusch.size();
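// Telescoping Hasenbusch split: each factor pairs light_den[h] with light_num[h] so that
// neighbouring factors cancel. The first factor relates the light-mass operator with and without
// Dirichlet boundaries (taken as two one-flavour rational terms below), the last does the same at
// the Pauli-Villars mass, and the intermediate factors are two-flavour quotients between adjacent
// Dirichlet masses.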
light_den.push_back(light_mass); dirichlet_den.push_back(0);
for(int h=0;h<n_hasenbusch;h++){
light_den.push_back(hasenbusch[h]); dirichlet_den.push_back(1);
}
for(int h=0;h<n_hasenbusch;h++){
light_num.push_back(hasenbusch[h]); dirichlet_num.push_back(1);
}
light_num.push_back(pv_mass); dirichlet_num.push_back(0);
std::vector<FermionAction *> Numerators;
std::vector<FermionAction *> Denominators;
std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
std::vector<OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
for(int h=0;h<n_hasenbusch+1;h++){
std::cout << GridLogMessage
<< " 2f quotient Action ";
std::cout << "det D("<<light_den[h]<<")";
if ( dirichlet_den[h] ) std::cout << "^dirichlet ";
std::cout << "/ det D("<<light_num[h]<<")";
if ( dirichlet_num[h] ) std::cout << "^dirichlet ";
std::cout << std::endl;
FermionAction::ImplParams ParamsNum(boundary);
FermionAction::ImplParams ParamsDen(boundary);
if ( dirichlet_num[h]==1) ParamsNum.dirichlet = Dirichlet;
else ParamsNum.dirichlet = NonDirichlet;
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
else ParamsDen.dirichlet = NonDirichlet;
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
if(h!=0) {
Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],MDCG,CG));
} else {
Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
}
}
int nquo=Quotients.size();
Level1.push_back(Bdys[0]);
Level1.push_back(Bdys[1]);
for(int h=0;h<nquo-1;h++){
Level2.push_back(Quotients[h]);
}
Level2.push_back(Quotients[nquo-1]);
/////////////////////////////////////////////////////////////
// Gauge action
/////////////////////////////////////////////////////////////
Level3.push_back(&GaugeAction);
TheHMC.TheAction.push_back(Level1);
TheHMC.TheAction.push_back(Level2);
TheHMC.TheAction.push_back(Level3);
std::cout << GridLogMessage << " Action complete "<< std::endl;
/////////////////////////////////////////////////////////////
if(1){
// TODO:
// i) Break high bound, how rapidly does it break? Tune this test.
// ii) Break low bound, how rapidly?
// iii) Run lanczos
// iv) Have CG return spectral range estimate
FermionField vec(StrangeOp.FermionRedBlackGrid());
FermionField res(StrangeOp.FermionRedBlackGrid());
vec = 1; // Fill with any old junk
std::cout << "Bounds check on strange operator mass "<< StrangeOp.Mass()<<std::endl;
SchurDifferentiableOperator<FermionImplPolicy> SdagS(StrangeOp);
HighBoundCheck(SdagS,vec,SFRp.hi);
ChebyBoundsCheck(SdagS,vec,SFRp.lo,SFRp.hi);
std::cout << "Strange inversion"<<std::endl;
res=Zero();
// MDCG(SdagS,vec,res);
std::cout << "Bounds check on light quark operator mass "<< Denominators[0]->Mass() <<std::endl;
SchurDifferentiableOperator<FermionImplPolicy> UdagU(*Denominators[0]);
HighBoundCheck(UdagU,vec,OFRp.hi);
ChebyBoundsCheck(UdagU,vec,OFRp.lo,OFRp.hi);
std::cout << "light inversion"<<std::endl;
res=Zero();
// MDCG(UdagU,vec,res);
std::cout << "Bounds check on strange dirichlet operator mass "<< StrangeOpDir.Mass()<<std::endl;
SchurDifferentiableOperator<FermionImplPolicy> SddagSd(StrangeOpDir);
HighBoundCheck(SddagSd,vec,OFRp.hi);
ChebyBoundsCheck(SddagSd,vec,OFRp.lo,OFRp.hi);
std::cout << "strange dirichlet inversion"<<std::endl;
res=Zero();
// MDCG(SddagSd,vec,res);
std::cout << "Bounds check on light dirichlet operator mass "<< Numerators[0]->Mass()<<std::endl;
SchurDifferentiableOperator<FermionImplPolicy> UddagUd(*Numerators[0]);
HighBoundCheck(UddagUd,vec,OFRp.hi);
ChebyBoundsCheck(UddagUd,vec,OFRp.lo,OFRp.hi);
std::cout << "light dirichlet inversion"<<std::endl;
res=Zero();
//MDCG(UddagUd,vec,res);
auto grid4= GridPtr;
auto rbgrid4= GridRBPtr;
auto rbgrid = StrangeOp.FermionRedBlackGrid();
auto grid = StrangeOp.FermionGrid();
if(1){
const int Nstop = 5;
const int Nk = 20;
const int Np = 20;
const int Nm = Nk+Np;
const int MaxIt= 10000;
int Nconv;
RealD resid = 1.0e-5;
if(0)
{
int order = 501;
RealD bound = 5.0e-4;
std::cout << GridLogMessage << " Lanczos for dirichlet bound " << bound<<" order "<< order<<std::endl;
Chebyshev<FermionField> Cheby(bound,90.,order);
FunctionHermOp<FermionField> OpCheby(Cheby,UddagUd);
PlainHermOp<FermionField> Op (UddagUd);
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby,Op,Nstop,Nk,Nm,resid,MaxIt);
std::vector<RealD> eval(Nm);
std::vector<FermionField> evec(Nm,rbgrid);
FermionField src(rbgrid);src = 1.0;
IRL.calc(eval,evec,src,Nconv);
FermionField tmp(rbgrid);
FermionField ftmp(grid);
FermionField ftmp4(grid4);
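// Inspect the Dirichlet eigenvectors: project with (1 +/- gamma_T) and dump slice norms to see how
// the support splits between the two gamma_T eigenspaces.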
for(int ev=0;ev<evec.size();ev++){
Gamma GT(Gamma::Algebra::GammaT);
std::cout << " evec " << ev << std::endl;
tmp = evec[ev] + GT*evec[ev];
DumpSliceNorm(" 1+gammaT ",tmp,Nd);
tmp = evec[ev] - GT*evec[ev];
DumpSliceNorm(" 1-gammaT ",tmp,Nd);
}
for(int e=0;e<10;e++){
std::cout << " Dirichlet evec "<<e<<std::endl;
tmp = evec[e];
for(int s=0;s<Ls;s++){
ftmp=Zero();
setCheckerboard(ftmp,tmp);
ExtractSlice(ftmp4,ftmp,s,0);
std::cout << "s-slice "<<s<< " evec[0] " << std::endl;
DumpSliceNorm(" s-slice ",ftmp4,Nd-1);
}
}
}
if(1)
{
int order = 2001;
RealD bound = 6.0e-5;
std::cout << GridLogMessage << " Lanczos for full operator bound " << bound<<" order "<< order<<std::endl;
Chebyshev<FermionField> Cheby(bound,90.,order);
FunctionHermOp<FermionField> OpCheby(Cheby,UdagU);
PlainHermOp<FermionField> Op (UdagU);
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby,Op,Nstop,Nk,Nm,resid,MaxIt);
std::vector<RealD> eval(Nm);
std::vector<FermionField> evec(Nm,rbgrid);
FermionField src(rbgrid); src = 1.0;
IRL.calc(eval,evec,src,Nconv);
FermionField tmp(rbgrid);
FermionField ftmp(grid);
FermionField ftmp4(grid4);
for(int e=0;e<evec.size();e++){
std::cout << " Full evec "<<e<<std::endl;
tmp = evec[e];
for(int s=0;s<Ls;s++){
ftmp=Zero();
setCheckerboard(ftmp,tmp);
ExtractSlice(ftmp4,ftmp,s,0);
std::cout << "s-slice "<<s<< " evec[0] " << std::endl;
DumpSliceNorm(" s-slice ",ftmp4,Nd-1);
}
}
}
Grid_finalize();
std::cout << " All done "<<std::endl;
exit(EXIT_SUCCESS);
}
}
TheHMC.Run(); // no smearing
Grid_finalize();
} // main
+444
@@ -0,0 +1,444 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_hmc_EODWFRatio.cc
Copyright (C) 2015-2016
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
NAMESPACE_BEGIN(Grid);
template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class SchurOperatorF>
class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
public:
typedef typename FermionOperatorD::FermionField FieldD;
typedef typename FermionOperatorF::FermionField FieldF;
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;
Integer MaxOuterIterations;
GridBase* SinglePrecGrid4; //4d grid for single-precision fields
GridBase* SinglePrecGrid5; //5d grid for single-precision fields
RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
FermionOperatorF &FermOpF;
FermionOperatorD &FermOpD;
SchurOperatorF &LinOpF;
SchurOperatorD &LinOpD;
Integer TotalInnerIterations; //Number of inner CG iterations
Integer TotalOuterIterations; //Number of restarts
Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
MixedPrecisionConjugateGradientOperatorFunction(RealD tol,
Integer maxinnerit,
Integer maxouterit,
GridBase* _sp_grid4,
GridBase* _sp_grid5,
FermionOperatorF &_FermOpF,
FermionOperatorD &_FermOpD,
SchurOperatorF &_LinOpF,
SchurOperatorD &_LinOpD):
LinOpF(_LinOpF),
LinOpD(_LinOpD),
FermOpF(_FermOpF),
FermOpD(_FermOpD),
Tolerance(tol),
InnerTolerance(tol),
MaxInnerIterations(maxinnerit),
MaxOuterIterations(maxouterit),
SinglePrecGrid4(_sp_grid4),
SinglePrecGrid5(_sp_grid5),
OuterLoopNormMult(100.)
{
/* Debugging instances of objects; references are stored
std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpF " <<std::hex<< &LinOpF<<std::dec <<std::endl;
std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpD " <<std::hex<< &LinOpD<<std::dec <<std::endl;
std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpF " <<std::hex<< &FermOpF<<std::dec <<std::endl;
std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpD " <<std::hex<< &FermOpD<<std::dec <<std::endl;
*/
};
void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
// std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpU " <<std::hex<< &(SchurOpU->_Mat)<<std::dec <<std::endl;
// std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpD " <<std::hex<< &(LinOpD._Mat) <<std::dec <<std::endl;
// Assumption made in code to extract gauge field
// We could avoid storing the LinOpD reference altogether ?
assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
////////////////////////////////////////////////////////////////////////////////////
// Must snarf a single precision copy of the gauge field in Linop_d argument
////////////////////////////////////////////////////////////////////////////////////
typedef typename FermionOperatorF::GaugeField GaugeFieldF;
typedef typename FermionOperatorF::GaugeLinkField GaugeLinkFieldF;
typedef typename FermionOperatorD::GaugeField GaugeFieldD;
typedef typename FermionOperatorD::GaugeLinkField GaugeLinkFieldD;
GridBase * GridPtrF = SinglePrecGrid4;
GridBase * GridPtrD = FermOpD.Umu.Grid();
GaugeFieldF U_f (GridPtrF);
GaugeLinkFieldF Umu_f(GridPtrF);
// std::cout << " Dim gauge field "<<GridPtrF->Nd()<<std::endl; // 4d
// std::cout << " Dim gauge field "<<GridPtrD->Nd()<<std::endl; // 4d
////////////////////////////////////////////////////////////////////////////////////
// Moving this to a Clone method of the fermion operator would allow the physics parameters
// to be duplicated and reduce the number of gauge field copies
////////////////////////////////////////////////////////////////////////////////////
GaugeLinkFieldD Umu_d(GridPtrD);
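// FermOpD.Umu is the doubled gauge field (forward and backward links), hence 2*Nd components to
// precision-change and repack.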
for(int mu=0;mu<Nd*2;mu++){
Umu_d = PeekIndex<LorentzIndex>(FermOpD.Umu, mu);
precisionChange(Umu_f,Umu_d);
PokeIndex<LorentzIndex>(FermOpF.Umu, Umu_f, mu);
}
pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
////////////////////////////////////////////////////////////////////////////////////
// Make a mixed precision conjugate gradient
////////////////////////////////////////////////////////////////////////////////////
MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
MPCG(src,psi);
}
};
NAMESPACE_END(Grid);
int main(int argc, char **argv) {
using namespace Grid;
Grid_init(&argc, &argv);
int threads = GridThread::GetThreads();
// Typedefs to simplify notation
typedef WilsonImplR FermionImplPolicy;
typedef WilsonImplF FermionImplPolicyF;
typedef MobiusFermionR FermionAction;
typedef MobiusFermionF FermionActionF;
typedef typename FermionAction::FermionField FermionField;
typedef typename FermionActionF::FermionField FermionFieldF;
typedef Grid::XmlReader Serialiser;
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
IntegratorParameters MD;
// typedef GenericHMCRunner<LeapFrog> HMCWrapper;
// MD.name = std::string("Leap Frog");
// typedef GenericHMCRunner<ForceGradient> HMCWrapper;
// MD.name = std::string("Force Gradient");
typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
MD.name = std::string("MinimumNorm2");
MD.MDsteps = 4;
MD.trajL = 1.0;
HMCparameters HMCparams;
HMCparams.StartTrajectory = 1077;
HMCparams.Trajectories = 1;
HMCparams.NoMetropolisUntil= 0;
// "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
// HMCparams.StartingType =std::string("ColdStart");
HMCparams.StartingType =std::string("CheckpointStart");
HMCparams.MD = MD;
HMCWrapper TheHMC(HMCparams);
// Grid from the command line arguments --grid and --mpi
TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
CheckpointerParameters CPparams;
CPparams.config_prefix = "ckpoint_DDHMC_lat";
CPparams.rng_prefix = "ckpoint_DDHMC_rng";
CPparams.saveInterval = 1;
CPparams.format = "IEEE64BIG";
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
RNGModuleParameters RNGpar;
RNGpar.serial_seeds = "1 2 3 4 5";
RNGpar.parallel_seeds = "6 7 8 9 10";
TheHMC.Resources.SetRNGSeeds(RNGpar);
// Construct observables
// here there is too much indirection
typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
TheHMC.Resources.AddObservable<PlaqObs>();
//////////////////////////////////////////////
const int Ls = 12;
RealD M5 = 1.8;
RealD b = 1.5;
RealD c = 0.5;
Real beta = 2.31;
// Real light_mass = 5.4e-4;
Real light_mass = 7.8e-4;
Real strange_mass = 0.02132;
Real pv_mass = 1.0;
std::vector<Real> hasenbusch({ light_mass, 3.8e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass });
// FIXME:
// Same in MC and MD
// Need to mix precision too
OneFlavourRationalParams SFRp; // Strange
SFRp.lo = 4.0e-3;
SFRp.hi = 90.0;
SFRp.MaxIter = 60000;
SFRp.tolerance= 1.0e-8;
SFRp.mdtolerance= 1.0e-6;
SFRp.degree = 12;
SFRp.precision= 50;
SFRp.BoundsCheckFreq=0;
OneFlavourRationalParams OFRp; // Up/down
OFRp.lo = 2.0e-5;
OFRp.hi = 90.0;
OFRp.MaxIter = 60000;
OFRp.tolerance= 1.0e-8;
OFRp.mdtolerance= 1.0e-6;
// OFRp.degree = 20; converges
// OFRp.degree = 16;
OFRp.degree = 12;
OFRp.precision= 80;
OFRp.BoundsCheckFreq=0;
auto GridPtr = TheHMC.Resources.GetCartesian();
auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
typedef SchurDiagMooeeOperator<FermionActionF,FermionFieldF> LinearOperatorF;
typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusFermionD,MobiusFermionF,LinearOperatorD,LinearOperatorF> MxPCG;
////////////////////////////////////////////////////////////////
// Domain decomposed
////////////////////////////////////////////////////////////////
Coordinate latt4 = GridPtr->GlobalDimensions();
Coordinate mpi = GridPtr->ProcessorGrid();
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi,shm);
Coordinate CommDim(Nd);
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
Coordinate NonDirichlet(Nd+1,0);
Coordinate Dirichlet(Nd+1,0);
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
Coordinate Block4(Nd);
Block4[0] = Dirichlet[1];
Block4[1] = Dirichlet[2];
Block4[2] = Dirichlet[3];
Block4[3] = Dirichlet[4];
int Width=3;
TheHMC.Resources.SetMomentumFilter(new DDHMCFilter<WilsonImplR::Field>(Block4,Width));
//////////////////////////
// Fermion Grids
//////////////////////////
auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
Coordinate simdF = GridDefaultSimd(Nd,vComplexF::Nsimd());
auto GridPtrF = SpaceTimeGrid::makeFourDimGrid(latt4,simdF,mpi);
auto GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF);
auto FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF);
auto FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF);
IwasakiGaugeActionR GaugeAction(beta);
// temporarily need a gauge field
LatticeGaugeField U(GridPtr);
LatticeGaugeFieldF UF(GridPtrF);
std::cout << GridLogMessage << " Running the HMC "<< std::endl;
TheHMC.ReadCommandLine(argc,argv); // parameters from the command line or a parameter file
TheHMC.initializeGaugeFieldAndRNGs(U);
// These lines are unnecessary if the boundary conditions are all periodic
std::vector<Complex> boundary = {1,1,1,-1};
FermionAction::ImplParams Params(boundary);
Params.dirichlet=NonDirichlet;
FermionAction::ImplParams ParamsDir(boundary);
ParamsDir.dirichlet=Dirichlet;
// double StoppingCondition = 1e-14;
// double MDStoppingCondition = 1e-9;
double StoppingCondition = 1e-10;
double MDStoppingCondition = 1e-7;
double MDStoppingConditionLoose = 1e-6;
double MaxCGIterations = 300000;
ConjugateGradient<FermionField> CG(StoppingCondition,MaxCGIterations);
ConjugateGradient<FermionField> MDCG(MDStoppingCondition,MaxCGIterations);
////////////////////////////////////
// Collect actions
////////////////////////////////////
ActionLevel<HMCWrapper::Field> Level1(1);
ActionLevel<HMCWrapper::Field> Level2(4);
ActionLevel<HMCWrapper::Field> Level3(8);
////////////////////////////////////
// Strange action
////////////////////////////////////
FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params);
FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, Params);
FermionAction StrangeOpDir (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, ParamsDir);
FermionAction StrangePauliVillarsOpDir(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, ParamsDir);
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionBdy(StrangeOpDir,StrangeOp,SFRp);
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionLocal(StrangePauliVillarsOpDir,StrangeOpDir,SFRp);
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionPVBdy(StrangePauliVillarsOp,StrangePauliVillarsOpDir,SFRp);
Level1.push_back(&StrangePseudoFermionBdy);
Level2.push_back(&StrangePseudoFermionLocal);
Level1.push_back(&StrangePseudoFermionPVBdy);
////////////////////////////////////
// up down action
////////////////////////////////////
std::vector<Real> light_den;
std::vector<Real> light_num;
std::vector<int> dirichlet_den;
std::vector<int> dirichlet_num;
int n_hasenbusch = hasenbusch.size();
light_den.push_back(light_mass); dirichlet_den.push_back(0);
for(int h=0;h<n_hasenbusch;h++){
light_den.push_back(hasenbusch[h]); dirichlet_den.push_back(1);
}
for(int h=0;h<n_hasenbusch;h++){
light_num.push_back(hasenbusch[h]); dirichlet_num.push_back(1);
}
light_num.push_back(pv_mass); dirichlet_num.push_back(0);
std::vector<FermionAction *> Numerators;
std::vector<FermionAction *> Denominators;
std::vector<FermionActionF *> DenominatorsF;
std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
std::vector<OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
std::vector<MxPCG *> ActionMPCG;
std::vector<MxPCG *> MPCG;
typedef SchurDiagMooeeOperator<FermionActionF,FermionFieldF> LinearOperatorF;
typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
std::vector<LinearOperatorD *> LinOpD;
std::vector<LinearOperatorF *> LinOpF;
for(int h=0;h<n_hasenbusch+1;h++){
std::cout << GridLogMessage
<< " 2f quotient Action ";
std::cout << "det D("<<light_den[h]<<")";
if ( dirichlet_den[h] ) std::cout << "^dirichlet ";
std::cout << "/ det D("<<light_num[h]<<")";
if ( dirichlet_num[h] ) std::cout << "^dirichlet ";
std::cout << std::endl;
FermionAction::ImplParams ParamsNum(boundary);
FermionAction::ImplParams ParamsDen(boundary);
FermionActionF::ImplParams ParamsDenF(boundary);
if ( dirichlet_num[h]==1) ParamsNum.dirichlet = Dirichlet;
else ParamsNum.dirichlet = NonDirichlet;
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
else ParamsDen.dirichlet = NonDirichlet;
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
ParamsDenF.dirichlet = ParamsDen.dirichlet;
DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF));
LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
LinOpF.push_back(new LinearOperatorF(*DenominatorsF[h]));
double conv = MDStoppingCondition;
if (h<3) conv= MDStoppingConditionLoose; // Relax on first two hasenbusch factors
const int MX_inner = 5000;
MPCG.push_back(new MxPCG(conv,
MX_inner,
MaxCGIterations,
GridPtrF,
FrbGridF,
*DenominatorsF[h],*Denominators[h],
*LinOpF[h], *LinOpD[h]) );
ActionMPCG.push_back(new MxPCG(StoppingCondition,
MX_inner,
MaxCGIterations,
GridPtrF,
FrbGridF,
*DenominatorsF[h],*Denominators[h],
*LinOpF[h], *LinOpD[h]) );
if(h!=0) {
// Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],MDCG,CG));
Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],CG));
} else {
Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
}
}
int nquo=Quotients.size();
Level1.push_back(Bdys[0]);
Level1.push_back(Bdys[1]);
for(int h=0;h<nquo-1;h++){
Level2.push_back(Quotients[h]);
}
Level2.push_back(Quotients[nquo-1]);
/////////////////////////////////////////////////////////////
// Gauge action
/////////////////////////////////////////////////////////////
Level3.push_back(&GaugeAction);
TheHMC.TheAction.push_back(Level1);
TheHMC.TheAction.push_back(Level2);
TheHMC.TheAction.push_back(Level3);
std::cout << GridLogMessage << " Action complete "<< std::endl;
/////////////////////////////////////////////////////////////
TheHMC.Run(); // no smearing
Grid_finalize();
} // main
+53
@@ -0,0 +1,53 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file:
Copyright (C) 2015-2016
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
int main(int argc, char **argv)
{
using namespace Grid;
Grid_init(&argc, &argv);
Coordinate latt4 = GridDefaultLatt();
Coordinate mpi = GridDefaultMpi();
Coordinate simd = GridDefaultSimd(Nd,vComplexD::Nsimd());
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4,simd,mpi);
GridSerialRNG sRNG; sRNG.SeedUniqueString(std::string("The Serial RNG"));
GridParallelRNG pRNG(UGrid); pRNG.SeedUniqueString(std::string("The 4D RNG"));
std::string rngfile("ckpoint_rng.0");
NerscIO::writeRNGState(sRNG, pRNG, rngfile);
Grid_finalize();
}
-2
@@ -191,9 +191,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
Dw.Dhop(src,result,0);
__SSC_STOP;
}
double t1=usecond();
FGrid->Barrier();
+3 -4
@@ -249,8 +249,9 @@ void Benchmark(int Ls, Coordinate Dirichlet)
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
Dw.DirichletBlock(Dirichlet);
DomainWallFermionF::ImplParams p;
p.dirichlet=Dirichlet;
DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
Dw.ImportGauge(Umu);
int ncall =300;
@@ -261,9 +262,7 @@ void Benchmark(int Ls, Coordinate Dirichlet)
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
Dw.Dhop(src,result,0);
__SSC_STOP;
}
double t1=usecond();
FGrid->Barrier();
+2 -2
@@ -81,8 +81,8 @@ int main (int argc, char ** argv)
Vector<Coeff_t> diag = Dw.bs;
Vector<Coeff_t> upper= Dw.cs;
Vector<Coeff_t> lower= Dw.cs;
upper[Ls-1]=-Dw.mass*upper[Ls-1];
lower[0] =-Dw.mass*lower[0];
upper[Ls-1]=-Dw.mass_minus*upper[Ls-1];
lower[0] =-Dw.mass_plus*lower[0];
LatticeFermion r_eo(FGrid);
LatticeFermion src_e (FrbGrid);
+9 -2
@@ -44,6 +44,13 @@ void bench_wilson (
double const volume,
int const dag );
void bench_wilson_eo (
LatticeFermion & src,
LatticeFermion & result,
WilsonFermionR & Dw,
double const volume,
int const dag );
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
@@ -110,8 +117,8 @@ int main (int argc, char ** argv)
bench_wilson(src,result,Dw,volume,DaggerYes);
std::cout << "\t";
// EO
bench_wilson(src,result,Dw,volume,DaggerNo);
bench_wilson(src,result,Dw,volume,DaggerYes);
bench_wilson_eo(src_o,result_e,Dw,volume,DaggerNo);
bench_wilson_eo(src_o,result_e,Dw,volume,DaggerYes);
std::cout << std::endl;
}
}
+9 -7
@@ -159,7 +159,7 @@ case ${ac_ZMOBIUS} in
esac
############### Nc
AC_ARG_ENABLE([Nc],
[AC_HELP_STRING([--enable-Nc=2|3|4], [enable number of colours])],
[AC_HELP_STRING([--enable-Nc=2|3|4|5], [enable number of colours])],
[ac_Nc=${enable_Nc}], [ac_Nc=3])
case ${ac_Nc} in
@@ -394,11 +394,10 @@ case ${CXXTEST} in
fi
;;
hipcc)
# CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
CXXFLAGS="$CXXFLAGS -fno-strict-aliasing"
CXXLD=${CXX}
if test $ac_openmp = yes; then
CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp"
CXXFLAGS="$CXXFLAGS -fopenmp"
fi
;;
dpcpp)
@@ -557,16 +556,19 @@ esac
AC_ARG_ENABLE([setdevice],[AC_HELP_STRING([--enable-setdevice | --disable-setdevice],
[Set GPU to rank in node with cudaSetDevice or similar])],[ac_SETDEVICE=${enable_SETDEVICE}],[ac_SETDEVICE=no])
case ${ac_SETDEVICE} in
yes);;
no)
yes)
echo ENABLE SET DEVICE
;;
*)
AC_DEFINE([GRID_DEFAULT_GPU],[1],[GRID_DEFAULT_GPU] )
echo DISABLE SET DEVICE
;;
esac
#########################################################
###################### Shared memory intranode #########
#########################################################
AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|shmget|hugetlbfs|shmnone|nvlink|no],
AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|shmget|hugetlbfs|shmnone|nvlink|no|none],
[Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=no])
case ${ac_SHM} in
@@ -586,7 +588,7 @@ case ${ac_SHM} in
AC_DEFINE([GRID_MPI3_SHMGET],[1],[GRID_MPI3_SHMGET] )
;;
shmnone | no)
shmnone | no | none)
AC_DEFINE([GRID_MPI3_SHM_NONE],[1],[GRID_MPI3_SHM_NONE] )
;;
+5 -4
@@ -93,14 +93,14 @@ template<class Field> class FreeLaplacianStencil : public SparseMatrixBase<Field
{
public:
typedef typename Field::vector_object siteObject;
typedef CartesianStencil<siteObject, siteObject, int> StencilImpl;
typedef CartesianStencil<siteObject, siteObject, SimpleStencilParams> StencilImpl;
GridBase *grid;
StencilImpl Stencil;
SimpleCompressor<siteObject> Compressor;
FreeLaplacianStencil(GridBase *_grid)
: Stencil (_grid,6,Even,directions,displacements,0), grid(_grid)
: Stencil (_grid,6,Even,directions,displacements,SimpleStencilParams()), grid(_grid)
{ };
virtual GridBase *Grid(void) { return grid; };
@@ -168,7 +168,8 @@ public:
typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
typedef CartesianStencil<siteObject, siteObject, int> StencilImpl;
typedef CartesianStencil<siteObject, siteObject,SimpleStencilParams> StencilImpl;
SimpleStencilParams p;
GridBase *grid;
StencilImpl Stencil;
@@ -177,7 +178,7 @@ public:
CovariantLaplacianStencil(GaugeField &Umu)
:
grid(Umu.Grid()),
Stencil (grid,6,Even,directions,displacements,0),
Stencil (grid,6,Even,directions,displacements,p),
Uds(grid)
{
for (int mu = 0; mu < Nd; mu++) {
+4 -3
@@ -9,6 +9,7 @@
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-gparity \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I/opt/rocm-4.5.0/include/ -std=c++14 -I${MPICH_DIR}/include " \
LDFLAGS=" -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa "
HIPFLAGS = --amdgpu-target=gfx90a
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -std=c++14 -I${MPICH_DIR}/include " \
LDFLAGS=" -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 "
+9 -7
@@ -12,19 +12,21 @@
#SBATCH --gpu-bind=map_gpu:0,1,2,3,7,6,5,4
DIR=.
module list
source sourceme.sh
export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
export MPICH_GPU_SUPPORT_ENABLED=1
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
export MPICH_SMP_SINGLE_COPY_MODE=NONE
#export MPICH_SMP_SINGLE_COPY_MODE=CMA
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
export OMP_NUM_THREADS=1
echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
PARAMS=" --accelerator-threads 16 --grid 32.32.32.256 --mpi 1.1.1.8 --comms-overlap --shm 2048 --shm-mpi 0"
echo $PARAMS
echo working directory
pwd
PARAMS=" --accelerator-threads 8 --grid 32.32.32.32 --mpi 1.1.1.1 --comms-sequential --shm 2048 --shm-mpi 0"
srun --gpus-per-task 1 -n1 ./benchmarks/Benchmark_dwf_fp32 $PARAMS
PARAMS=" --accelerator-threads 8 --grid 64.64.64.32 --mpi 2.2.2.1 --comms-sequential --shm 2048 --shm-mpi 0"
srun --gpus-per-task 1 -n8 ./benchmarks/Benchmark_dwf_fp32 $PARAMS
+7 -9
@@ -7,21 +7,19 @@
#SBATCH -o DWF.%J
#SBATCH -e DWF.%J
#SBATCH -N 1
#SBATCH -n 4
#SBATCH --exclusive
#SBATCH -n 2
#SBATCH --gpu-bind=map_gpu:0,1
DIR=.
module list
source setup.sh
export MPICH_OFI_NIC_POLICY=GPU
export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
export MPICH_GPU_SUPPORT_ENABLED=1
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
export MPICH_SMP_SINGLE_COPY_MODE=NONE
#export MPICH_SMP_SINGLE_COPY_MODE=CMA
export OMP_NUM_THREADS=4
export OMP_NUM_THREADS=16
echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
PARAMS=" --accelerator-threads 8 --grid 32.32.64.64 --mpi 1.1.2.2 --comms-overlap --shm 2048 --shm-mpi 0"
srun --gpus-per-task 1 -n4 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS
srun --gpus-per-task 1 -N1 -n2 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 16.16.32.64 --shm-mpi 1 --shm 2048 --comms-sequential --accelerator-threads 8
+9 -29
@@ -6,43 +6,23 @@
#SBATCH -J DWF
#SBATCH -o DWF.%J
#SBATCH -e DWF.%J
#SBATCH -N 8
#SBATCH -n 64
#SBATCH --exclusive
#SBATCH --gpu-bind=map_gpu:0,1,2,3,7,6,5,4
#SBATCH -N 1
#SBATCH -n 8
##SBATCH --gpu-bind=map_gpu:0,1,2,3,7,6,5,4
#SBATCH --gpu-bind=map_gpu:0,1,2,3,6,7,4,5
DIR=.
module list
source setup.sh
export MPICH_OFI_NIC_POLICY=GPU
export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
export MPICH_GPU_SUPPORT_ENABLED=1
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
#export MPICH_SMP_SINGLE_COPY_MODE=CMA
export MPICH_SMP_SINGLE_COPY_MODE=NONE
export OMP_NUM_THREADS=1
#export MPICH_SMP_SINGLE_COPY_MODE=NONE
export OMP_NUM_THREADS=16
echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
PARAMS=" --accelerator-threads 16 --grid 64.64.64.256 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 0"
echo $PARAMS
#srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_dwf_fp32 $PARAMS > dwf.64.64.64.256.8node
PARAMS=" --accelerator-threads 16 --grid 64.64.64.32 --mpi 4.4.4.1 --comms-overlap --shm 2048 --shm-mpi 1"
echo $PARAMS
srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_dwf_fp32 $PARAMS > dwf.64.64.64.32.8node
PARAMS=" --accelerator-threads 16 --grid 64.64.64.32 --mpi 4.4.4.1 --comms-overlap --shm 2048 --shm-mpi 0"
echo $PARAMS
#srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_dwf_fp32 $PARAMS > dwf.64.64.64.32.8node.shm0
PARAMS=" --accelerator-threads 16 --grid 64.64.64.32 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 1"
echo $PARAMS
#srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_ITT $PARAMS > itt.8node
PARAMS=" --accelerator-threads 16 --grid 64.64.64.32 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 0"
echo $PARAMS
#srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_ITT $PARAMS > itt.8node_shm0
srun --gpus-per-task 1 -N1 -n8 ./benchmarks/Benchmark_comms_host_device --mpi 2.2.2.1 --shm-mpi 1 --shm 2048 --comms-sequential --accelerator-threads 8
+5 -2
@@ -1,6 +1,9 @@
module load PrgEnv-gnu
module load rocm/4.5.0
module load rocm/5.1.0
module load cray-mpich/8.1.16
module load gmp
module load cray-fftw
#module load cray-fftw
module load craype-accel-amd-gfx90a
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
#Hack for lib
export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH
+5
@@ -1,9 +1,14 @@
DIR=`pwd`
PREFIX=$DIR/../Prequisites/install/
../../configure \
--enable-comms=mpi \
--enable-simd=GPU \
--enable-shm=nvlink \
--enable-gen-simd-width=64 \
--enable-accelerator=cuda \
--enable-setdevice \
--disable-accelerator-cshift \
--with-gmp=$PREFIX \
--disable-fermion-reps \
--disable-unified \
--disable-gparity \
+17 -14
@@ -1,24 +1,27 @@
#!/bin/bash
#SBATCH -A mp13
#SBATCH -A m3886_g
#SBATCH -C gpu
#SBATCH -q regular
#SBATCH -q debug
#SBATCH -t 0:20:00
#SBATCH -n 16
#SBATCH --ntasks-per-node=4
#SBATCH -c 32
#SBATCH --exclusive
#SBATCH -N 1
#SBATCH -n 4
#SBATCH --ntasks-per-node=4
#SBATCH --gpus-per-task=1
#SBATCH --gpu-bind=map_gpu:0,1,2,3
#SBATCH --exclusive
#SBATCH --gpu-bind=none
export SLURM_CPU_BIND="cores"
export MPICH_RDMA_ENABLED_CUDA=1
export MPICH_GPU_SUPPORT_ENABLED=1
srun ./benchmarks/Benchmark_comms_host_device --mpi 2.2.2.2 --accelerator-threads 8 > comms.4node
export MPICH_RDMA_ENABLED_CUDA=1
export MPICH_GPU_IPC_ENABLED=1
export MPICH_GPU_EAGER_REGISTER_HOST_MEM=0
export MPICH_GPU_NO_ASYNC_MEMCPY=0
#export MPICH_SMP_SINGLE_COPY_MODE=CMA
OPT="--comms-overlap --comms-concurrent --shm-mpi 0"
srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 64.64.64.64 --accelerator-threads 8 --shm 2048 $OPT > dwf.64.64.64.64.4node.opt0
srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 48.48.48.48 --accelerator-threads 8 --shm 2048 $OPT > dwf.48.48.48.48.4node.opt0
OPT="--comms-sequential --shm-mpi 1"
VOL=64.64.64.64
srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.1.1 --grid $VOL --accelerator-threads 8 --shm 2048 $OPT
#srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.1.1.4 --grid $VOL --accelerator-threads 8 --shm 2048 $OPT
#srun ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.8 --grid $VOL --accelerator-threads 8 --shm 2048 $OPT
OPT="--comms-overlap --comms-concurrent --shm-mpi 1"
srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 64.64.64.64 --accelerator-threads 8 --shm 2048 $OPT > dwf.64.64.64.64.4node.opt1
srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 48.48.48.48 --accelerator-threads 8 --shm 2048 $OPT > dwf.48.48.48.48.4node.opt1
+1 -1
@@ -1,4 +1,4 @@
export CRAY_ACCEL_TARGET=nvidia80
module load PrgEnv-gnu cpe-cuda cuda
module load PrgEnv-gnu cpe-cuda cudatoolkit/11.4
+3 -2
@@ -2,11 +2,12 @@
--enable-simd=GPU \
--enable-gen-simd-width=32 \
--enable-unified=no \
--enable-shm=nvlink \
--enable-shm=no \
--disable-gparity \
--enable-setdevice \
--disable-setdevice \
--disable-fermion-reps \
--enable-accelerator=cuda \
--enable-accelerator-cshift \
--prefix /ccs/home/paboyle/prefix \
CXX=nvcc \
LDFLAGS=-L/ccs/home/paboyle/prefix/lib/ \
+22 -8
@@ -1,25 +1,39 @@
#!/bin/bash
#BSUB -P LGT104
#BSUB -W 2:00
#BSUB -W 0:20
#BSUB -nnodes 16
#BSUB -J DWF
export OMP_NUM_THREADS=6
export PAMI_IBV_ADAPTER_AFFINITY=1
export PAMI_ENABLE_STRIPING=1
export OPT="--comms-concurrent --comms-overlap "
APP="./benchmarks/Benchmark_comms_host_device --mpi 4.4.4.3 "
jsrun --nrs 16 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > comms.16node.log
DIR=.
source sourceme.sh
APP="./benchmarks/Benchmark_dwf_fp32 --grid 96.96.96.72 --mpi 4.4.4.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
jsrun --nrs 16 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > dwf.16node.24.log
echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
APP="./benchmarks/Benchmark_dwf_fp32 --grid 128.128.128.96 --mpi 4.4.4.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
jsrun --nrs 16 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > dwf.16node.32.log
VOLS=( 32.32.32.16 32.32.32.64 64.32.32.64 64.32.64.64 64.64.64.64 64.64.64.128 64.64.64.256 64.64.64.512 128.64.64.64.512)
MPI=( 1.1.1.1 1.1.1.4 2.1.1.4 2.1.2.4 2.2.2.4 2.2.2.8 2.2.2.16 2.2.2.32 4.4.2.32 )
RANKS=( 1 4 8 16 32 64 128 256 1024)
NODES=( 1 1 2 4 8 16 32 64 128)
INTS=( 0 1 2 3 4 5 6 7 8)
for i in 5
do
vol=${VOLS[$i]}
nodes=${NODES[$i]}
mpi=${MPI[$i]}
ranks=${RANKS[$i]}
JSRUN="jsrun --nrs $nodes -a4 -g4 -c42 -dpacked -b packed:10 --latency_priority gpu-cpu --smpiargs=-gpu"
PARAMS=" --accelerator-threads 8 --grid $vol --mpi $mpi --comms-sequential --shm 2048 --shm-mpi 0"
$JSRUN ./benchmarks/Benchmark_dwf_fp32 $PARAMS > run.v${vol}.n${nodes}.m${mpi}.seq.ker
PARAMS=" --accelerator-threads 8 --grid $vol --mpi $mpi --comms-overlap --shm 2048 --shm-mpi 0"
$JSRUN ./benchmarks/Benchmark_dwf_fp32 $PARAMS > run.v${vol}.n${nodes}.m${mpi}.over.ker
done
+184
@@ -0,0 +1,184 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/IO/Test_field_array_io.cc
Copyright (C) 2015
Author: Christopher Kelly <ckelly@bnl.gov>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
//This test demonstrates and checks a single-file write of an arbitrary array of fields
uint64_t writeHeader(const uint32_t size, const uint32_t checksum, const std::string &format, const std::string &file){
std::ofstream fout(file,std::ios::out|std::ios::in);
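// Open without truncation (out|in) so the header can be rewritten in place on the second pass,
// once the checksum over all fields is known.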
fout.seekp(0,std::ios::beg);
fout << std::setw(10) << size << std::endl;
fout << std::hex << std::setw(10) << checksum << std::endl;
fout << format << std::endl;
return fout.tellp();
}
uint64_t readHeader(uint32_t &size, uint32_t &checksum, std::string &format, const std::string &file){
std::ifstream fin(file);
std::string line;
getline(fin,line);
{
std::stringstream ss; ss <<line ; ss >> size;
}
getline(fin,line);
{
std::stringstream ss; ss <<line ; ss >> std::hex >> checksum;
}
getline(fin,format);
removeWhitespace(format);
return fin.tellg();
}
template<typename FieldType>
void writeFieldArray(const std::string &file, const std::vector<FieldType> &data){
typedef typename FieldType::vector_object vobj;
typedef typename FieldType::scalar_object sobj;
GridBase* grid = data[0].Grid(); //assume all fields have the same Grid
BinarySimpleMunger<sobj, sobj> munge; //straight copy
//We need a 2-pass header write, first to establish the size, the second pass writes the checksum
std::string format = getFormatString<typename FieldType::vector_object>();
uint64_t offset; // data offset set by the header write on the boss rank (broadcast below)
if ( grid->IsBoss() ) {
NerscIO::truncate(file);
offset = writeHeader(data.size(), 0, format, file);
}
grid->Broadcast(0,(void *)&offset,sizeof(offset)); //use as a barrier
std::cout << "Data offset write " << offset << std::endl;
std::cout << "Data size write " << data.size() << std::endl;
uint64_t field_size = uint64_t(grid->gSites()) * sizeof(sobj);
std::cout << "Field size = " << field_size << " B" << std::endl;
uint32_t checksum = 0;
for(int i=0;i<data.size();i++){
std::cout << "Data field write " << i << " offset " << offset << std::endl;
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeLatticeObject<vobj,sobj>(const_cast<FieldType &>(data[i]),file,munge,offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
offset += field_size;
checksum ^= nersc_csum + 0x9e3779b9 + (checksum<<6) + (checksum>>2);
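// Fold each field's NERSC checksum into a running value (hash_combine-style mixing with 0x9e3779b9).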
}
std::cout << "Write checksum " << checksum << std::endl;
if ( grid->IsBoss() ) {
writeHeader(data.size(), checksum, format, file);
}
}
template<typename FieldType>
void readFieldArray(std::vector<FieldType> &data, const std::string &file){
typedef typename FieldType::vector_object vobj;
typedef typename FieldType::scalar_object sobj;
assert(data.size() > 0);
GridBase* grid = data[0].Grid(); //assume all fields have the same Grid
BinarySimpleUnmunger<sobj, sobj> munge; //straight copy
uint32_t hdr_checksum, hdr_size;
std::string format;
uint64_t offset = readHeader(hdr_size, hdr_checksum, format, file);
std::cout << "Data offset read " << offset << std::endl;
std::cout << "Data size read " << hdr_size << std::endl;
assert(data.size() == hdr_size);
uint64_t field_size = uint64_t(grid->gSites()) * sizeof(sobj);
uint32_t checksum = 0;
for(int i=0;i<data.size();i++){
std::cout << "Data field read " << i << " offset " << offset << std::endl;
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::readLatticeObject<vobj,sobj>(data[i],file,munge,offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
offset += field_size;
checksum ^= nersc_csum + 0x9e3779b9 + (checksum<<6) + (checksum>>2);
}
std::cout << "Header checksum " << hdr_checksum << std::endl;
std::cout << "Read checksum " << checksum << std::endl;
assert( hdr_checksum == checksum );
}
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
Coordinate latt = GridDefaultLatt();
Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
const int Ls=8;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt, simd_layout, mpi_layout);
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
typedef DomainWallFermionD::FermionField FermionField;
int nfield = 20;
std::vector<FermionField> data(nfield, FGrid);
for(int i=0;i<data.size();i++)
gaussian(RNG5, data[i]);
std::string file = "test_field_array_io.0";
writeFieldArray(file, data);
std::vector<FermionField> data_r(nfield, FGrid);
readFieldArray(data_r, file);
for(int i=0;i<nfield;i++){
FermionField diff = data_r[i] - data[i];
RealD norm_diff = norm2(diff);
std::cout << "Norm2 of difference between stored and loaded data index " << i << " : " << norm_diff << std::endl;
}
std::cout << "Done" << std::endl;
Grid_finalize();
}
+5 -3
@@ -147,7 +147,7 @@ int main (int argc, char ** argv)
Complex p = TensorRemove(Tp);
std::cout<<GridLogMessage << "calculated plaquettes " <<p*PlaqScale<<std::endl;
Complex LinkTraceScale(1.0/vol/4.0/3.0);
Complex LinkTraceScale(1.0/vol/4.0/(Real)Nc);
TComplex Tl = sum(LinkTrace);
Complex l = TensorRemove(Tl);
std::cout<<GridLogMessage << "calculated link trace " <<l*LinkTraceScale<<std::endl;
@@ -157,8 +157,10 @@ int main (int argc, char ** argv)
Complex ll= TensorRemove(TcP);
std::cout<<GridLogMessage << "coarsened plaquettes sum to " <<ll*PlaqScale<<std::endl;
std::string clone2x3("./ckpoint_clone2x3.4000");
std::string clone3x3("./ckpoint_clone3x3.4000");
const string stNc = to_string( Nc ) ;
const string stNcM1 = to_string( Nc-1 ) ;
std::string clone2x3("./ckpoint_clone"+stNcM1+"x"+stNc+".4000");
std::string clone3x3("./ckpoint_clone"+stNc+"x"+stNc+".4000");
NerscIO::writeConfiguration(Umu,clone3x3,0,precision32);
NerscIO::writeConfiguration(Umu,clone2x3,1,precision32);
+25 -6
View File
@@ -46,7 +46,7 @@ int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
const int Ls=8;
const int Ls=12;
std::cout << GridLogMessage << "::::: NB: to enable a quick bit reproducibility check use the --checksums flag. " << std::endl;
@@ -94,13 +94,32 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl;
MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
mCG(src_o,result_o);
double t1,t2,flops;
int iters;
for(int i=0;i<100;i++){
result_o = Zero();
t1=usecond();
mCG(src_o,result_o);
t2=usecond();
iters = mCG.TotalInnerIterations; //Number of inner CG iterations
flops = 1320.0*2*FGrid->gSites()*iters;
std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
std::cout << " SinglePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
}
std::cout << GridLogMessage << "::::::::::::: Starting regular CG" << std::endl;
ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
CG(HermOpEO,src_o,result_o_2);
MemoryManager::Print();
for(int i=0;i<100;i++){
result_o_2 = Zero();
t1=usecond();
CG(HermOpEO,src_o,result_o_2);
t2=usecond();
iters = CG.IterationsToComplete;
flops = 1320.0*2*FGrid->gSites()*iters;
std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
std::cout << " DoublePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
}
// MemoryManager::Print();
LatticeFermionD diff_o(FrbGrid);
RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);
+5 -5
@@ -31,7 +31,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
using namespace std;
using namespace Grid;
;
int main(int argc, char ** argv) {
Grid_init(&argc, &argv);
@@ -80,7 +79,8 @@ int main(int argc, char ** argv) {
Foo=lex;
}
typedef CartesianStencil<vobj,vobj,int> Stencil;
typedef CartesianStencil<vobj,vobj,SimpleStencilParams> Stencil;
SimpleStencilParams p;
for(int dir=0;dir<4;dir++){
for(int disp=0;disp<Fine._fdimensions[dir];disp++){
@@ -90,7 +90,7 @@ int main(int argc, char ** argv) {
std::vector<int> directions(npoint,dir);
std::vector<int> displacements(npoint,disp);
Stencil myStencil(&Fine,npoint,0,directions,displacements,0);
Stencil myStencil(&Fine,npoint,0,directions,displacements,p);
Coordinate ocoor(4);
for(int o=0;o<Fine.oSites();o++){
Fine.oCoorFromOindex(ocoor,o);
@@ -183,8 +183,8 @@ int main(int argc, char ** argv) {
std::vector<int> directions(npoint,dir);
std::vector<int> displacements(npoint,disp);
Stencil EStencil(&rbFine,npoint,Even,directions,displacements,0);
Stencil OStencil(&rbFine,npoint,Odd,directions,displacements,0);
Stencil EStencil(&rbFine,npoint,Even,directions,displacements,p);
Stencil OStencil(&rbFine,npoint,Odd,directions,displacements,p);
Coordinate ocoor(4);
for(int o=0;o<Fine.oSites();o++){
@@ -117,8 +117,8 @@ void runBenchmark(int* argc, char*** argv) {
// type definitions
typedef WilsonImpl<vCoeff_t, FundamentalRepresentation, CoeffReal> WImpl;
typedef WilsonCloverFermion<WImpl> WilsonCloverOperator;
typedef CompactWilsonCloverFermion<WImpl> CompactWilsonCloverOperator;
typedef WilsonCloverFermion<WImpl, CloverHelpers<WImpl>> WilsonCloverOperator;
typedef CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>> CompactWilsonCloverOperator;
typedef typename WilsonCloverOperator::FermionField Fermion;
typedef typename WilsonCloverOperator::GaugeField Gauge;
+179 -197
@@ -9,6 +9,7 @@ Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -42,14 +43,14 @@ directory
using namespace std;
using namespace Grid;
;
;
int main(int argc, char** argv) {
Grid_init(&argc, &argv);
std::vector<int> latt({4, 4, 4, 8});
GridCartesian* grid = SpaceTimeGrid::makeFourDimGrid(
latt, GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
latt, GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
GridRedBlackCartesian* rbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(grid);
@@ -60,15 +61,19 @@ int main(int argc, char** argv) {
<< std::endl;
SU2::printGenerators();
std::cout << "Dimension of adjoint representation: "<< SU2Adjoint::Dimension << std::endl;
// guard as this code fails to compile for Nc != 3
#if (Nc == 3)
SU2Adjoint::printGenerators();
SU2::testGenerators();
SU2Adjoint::testGenerators();
std::cout << GridLogMessage << "*********************************************"
<< std::endl;
<< std::endl;
std::cout << GridLogMessage << "* Generators for SU(Nc" << std::endl;
std::cout << GridLogMessage << "*********************************************"
<< std::endl;
<< std::endl;
SU3::printGenerators();
std::cout << "Dimension of adjoint representation: "<< SU3Adjoint::Dimension << std::endl;
SU3Adjoint::printGenerators();
@@ -111,12 +116,10 @@ int main(int argc, char** argv) {
// AdjointRepresentation has the predefined number of colours Nc
// Representations<FundamentalRepresentation, AdjointRepresentation, TwoIndexSymmetricRepresentation> RepresentationTypes(grid);
LatticeGaugeField U(grid), V(grid);
SU3::HotConfiguration<LatticeGaugeField>(gridRNG, U);
SU3::HotConfiguration<LatticeGaugeField>(gridRNG, V);
// Adjoint representation
// Test group structure
// (U_f * V_f)_r = U_r * V_r
@@ -127,17 +130,17 @@ int main(int argc, char** argv) {
SU3::LatticeMatrix Vmu = peekLorentz(V,mu);
pokeLorentz(UV,Umu*Vmu, mu);
}
AdjRep.update_representation(UV);
typename AdjointRep<Nc>::LatticeField UVr = AdjRep.U; // (U_f * V_f)_r
AdjRep.update_representation(U);
typename AdjointRep<Nc>::LatticeField Ur = AdjRep.U; // U_r
AdjRep.update_representation(V);
typename AdjointRep<Nc>::LatticeField Vr = AdjRep.U; // V_r
typename AdjointRep<Nc>::LatticeField UrVr(grid);
UrVr = Zero();
for (int mu = 0; mu < Nd; mu++) {
@@ -145,10 +148,10 @@ int main(int argc, char** argv) {
typename AdjointRep<Nc>::LatticeMatrix Vrmu = peekLorentz(Vr,mu);
pokeLorentz(UrVr,Urmu*Vrmu, mu);
}
typename AdjointRep<Nc>::LatticeField Diff_check = UVr - UrVr;
std::cout << GridLogMessage << "Group structure SU("<<Nc<<") check difference (Adjoint representation) : " << norm2(Diff_check) << std::endl;
// Check correspondence of algebra and group transformations
// Create a random vector
SU3::LatticeAlgebraVector h_adj(grid);
@@ -156,32 +159,31 @@ int main(int argc, char** argv) {
random(gridRNG,h_adj);
h_adj = real(h_adj);
SU_Adjoint<Nc>::AdjointLieAlgebraMatrix(h_adj,Ar);
// Re-extract h_adj
SU3::LatticeAlgebraVector h_adj2(grid);
SU_Adjoint<Nc>::projectOnAlgebra(h_adj2, Ar);
SU3::LatticeAlgebraVector h_diff = h_adj - h_adj2;
std::cout << GridLogMessage << "Projections structure check vector difference (Adjoint representation) : " << norm2(h_diff) << std::endl;
// Exponentiate
typename AdjointRep<Nc>::LatticeMatrix Uadj(grid);
Uadj = expMat(Ar, 1.0, 16);
typename AdjointRep<Nc>::LatticeMatrix uno(grid);
uno = 1.0;
// Check matrix Uadj, must be real orthogonal
typename AdjointRep<Nc>::LatticeMatrix Ucheck = Uadj - conjugate(Uadj);
std::cout << GridLogMessage << "Reality check: " << norm2(Ucheck)
<< std::endl;
<< std::endl;
Ucheck = Uadj * adj(Uadj) - uno;
std::cout << GridLogMessage << "orthogonality check 1: " << norm2(Ucheck)
<< std::endl;
<< std::endl;
Ucheck = adj(Uadj) * Uadj - uno;
std::cout << GridLogMessage << "orthogonality check 2: " << norm2(Ucheck)
<< std::endl;
<< std::endl;
// Construct the fundamental matrix in the group
SU3::LatticeMatrix Af(grid);
SU3::FundamentalLieAlgebraMatrix(h_adj,Af);
@@ -193,72 +195,65 @@ int main(int argc, char** argv) {
SU3::LatticeMatrix UnitCheck(grid);
UnitCheck = Ufund * adj(Ufund) - uno_f;
std::cout << GridLogMessage << "unitarity check 1: " << norm2(UnitCheck)
<< std::endl;
<< std::endl;
UnitCheck = adj(Ufund) * Ufund - uno_f;
std::cout << GridLogMessage << "unitarity check 2: " << norm2(UnitCheck)
<< std::endl;
<< std::endl;
// Transform to the adjoint representation
U = Zero(); // fill this with only one direction
pokeLorentz(U,Ufund,0); // the representation transf acts on full gauge fields
AdjRep.update_representation(U);
Ur = AdjRep.U; // U_r
typename AdjointRep<Nc>::LatticeMatrix Ur0 = peekLorentz(Ur,0); // this should be the same as Uadj
typename AdjointRep<Nc>::LatticeMatrix Diff_check_mat = Ur0 - Uadj;
std::cout << GridLogMessage << "Projections structure check group difference : " << norm2(Diff_check_mat) << std::endl;
// TwoIndexRep tests
std::cout << GridLogMessage << "*********************************************"
<< std::endl;
<< std::endl;
std::cout << GridLogMessage << "*********************************************"
<< std::endl;
std::cout << GridLogMessage << "* eS^{ij} base for SU(2)" << std::endl;
std::cout << GridLogMessage << "*********************************************"
<< std::endl;
std::cout << GridLogMessage << "Dimension of Two Index Symmetric representation: "<< SU2TwoIndexSymm::Dimension << std::endl;
SU2TwoIndexSymm::printBase();
std::cout << GridLogMessage << "*********************************************"
<< std::endl;
std::cout << GridLogMessage << "Generators of Two Index Symmetric representation: "<< SU2TwoIndexSymm::Dimension << std::endl;
std::cout << GridLogMessage << "*********************************************"
<< std::endl;
std::cout << GridLogMessage << "Generators of Two Index Symmetric representation: "<< SU2TwoIndexSymm::Dimension << std::endl;
SU2TwoIndexSymm::printGenerators();
std::cout << GridLogMessage << "Test of Two Index Symmetric Generators: "<< SU2TwoIndexSymm::Dimension << std::endl;
std::cout << GridLogMessage << "Test of Two Index Symmetric Generators: "<< SU2TwoIndexSymm::Dimension << std::endl;
SU2TwoIndexSymm::testGenerators();
std::cout << GridLogMessage << "*********************************************"
<< std::endl;
std::cout << GridLogMessage << "*********************************************"
<< std::endl;
std::cout << GridLogMessage << "* eAS^{ij} base for SU(2)" << std::endl;
std::cout << GridLogMessage << "*********************************************"
<< std::endl;
std::cout << GridLogMessage << "Dimension of Two Index anti-Symmetric representation: "<< SU2TwoIndexAntiSymm::Dimension << std::endl;
SU2TwoIndexAntiSymm::printBase();
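// For SU(N) the two-index symmetric representation has dimension N(N+1)/2 and
// the anti-symmetric one N(N-1)/2; for SU(2) the anti-symmetric case is a
// one-dimensional singlet, which is why the corresponding group-structure
// tests further below are guarded by a Dimension != 1 check.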
std::cout << GridLogMessage << "*********************************************"
<< std::endl;
std::cout << GridLogMessage << "Dimension of Two Index anti-Symmetric representation: "<< SU2TwoIndexAntiSymm::Dimension << std::endl;
std::cout << GridLogMessage << "*********************************************"
<< std::endl;
std::cout << GridLogMessage << "Dimension of Two Index anti-Symmetric representation: "<< SU2TwoIndexAntiSymm::Dimension << std::endl;
SU2TwoIndexAntiSymm::printGenerators();
std::cout << GridLogMessage << "Test of Two Index anti-Symmetric Generators: "<< SU2TwoIndexAntiSymm::Dimension << std::endl;
SU2TwoIndexAntiSymm::testGenerators();
std::cout << GridLogMessage << "*********************************************"
<< std::endl;
std::cout << GridLogMessage << "Test for the Two Index Symmetric projectors"
<< std::endl;
// Projectors
SU3TwoIndexSymm::LatticeTwoIndexMatrix Gauss2(grid);
random(gridRNG,Gauss2);
@@ -276,13 +271,13 @@ int main(int argc, char** argv) {
SU3::LatticeAlgebraVector diff2 = ha - hb;
std::cout << GridLogMessage << "Difference: " << norm2(diff) << std::endl;
std::cout << GridLogMessage << "*********************************************"
<< std::endl;
std::cout << GridLogMessage << "*********************************************"
<< std::endl;
std::cout << GridLogMessage << "*********************************************"
<< std::endl;
std::cout << GridLogMessage << "Test for the Two index anti-Symmetric projectors"
<< std::endl;
// Projectors
SU3TwoIndexAntiSymm::LatticeTwoIndexMatrix Gauss2a(grid);
random(gridRNG,Gauss2a);
@@ -300,11 +295,11 @@ int main(int argc, char** argv) {
SU3::LatticeAlgebraVector diff2a = ha - hb;
std::cout << GridLogMessage << "Difference: " << norm2(diff2a) << std::endl;
std::cout << GridLogMessage << "*********************************************"
<< std::endl;
std::cout << GridLogMessage << "Two index Symmetric: Checking Group Structure"
<< std::endl;
// Testing HMC representation classes
TwoIndexRep< Nc, Symmetric > TIndexRep(grid);
@@ -313,7 +308,7 @@ int main(int argc, char** argv) {
LatticeGaugeField U2(grid), V2(grid);
SU3::HotConfiguration<LatticeGaugeField>(gridRNG, U2);
SU3::HotConfiguration<LatticeGaugeField>(gridRNG, V2);
LatticeGaugeField UV2(grid);
UV2 = Zero();
for (int mu = 0; mu < Nd; mu++) {
@@ -321,16 +316,16 @@ int main(int argc, char** argv) {
SU3::LatticeMatrix Vmu2 = peekLorentz(V2,mu);
pokeLorentz(UV2,Umu2*Vmu2, mu);
}
TIndexRep.update_representation(UV2);
typename TwoIndexRep< Nc, Symmetric >::LatticeField UVr2 = TIndexRep.U; // (U_f * V_f)_r
TIndexRep.update_representation(U2);
typename TwoIndexRep< Nc, Symmetric >::LatticeField Ur2 = TIndexRep.U; // U_r
TIndexRep.update_representation(V2);
typename TwoIndexRep< Nc, Symmetric >::LatticeField Vr2 = TIndexRep.U; // V_r
typename TwoIndexRep< Nc, Symmetric >::LatticeField Ur2Vr2(grid);
Ur2Vr2 = Zero();
for (int mu = 0; mu < Nd; mu++) {
@@ -338,11 +333,11 @@ int main(int argc, char** argv) {
typename TwoIndexRep< Nc, Symmetric >::LatticeMatrix Vrmu2 = peekLorentz(Vr2,mu);
pokeLorentz(Ur2Vr2,Urmu2*Vrmu2, mu);
}
typename TwoIndexRep< Nc, Symmetric >::LatticeField Diff_check2 = UVr2 - Ur2Vr2;
std::cout << GridLogMessage << "Group structure SU("<<Nc<<") check difference (Two Index Symmetric): " << norm2(Diff_check2) << std::endl;
// Check correspondence of algebra and group transformations
// Create a random vector
SU3::LatticeAlgebraVector h_sym(grid);
@@ -350,34 +345,31 @@ int main(int argc, char** argv) {
random(gridRNG,h_sym);
h_sym = real(h_sym);
SU_TwoIndex<Nc,Symmetric>::TwoIndexLieAlgebraMatrix(h_sym,Ar_sym);
// Re-extract h_sym
SU3::LatticeAlgebraVector h_sym2(grid);
SU_TwoIndex< Nc, Symmetric>::projectOnAlgebra(h_sym2, Ar_sym);
SU3::LatticeAlgebraVector h_diff_sym = h_sym - h_sym2;
std::cout << GridLogMessage << "Projections structure check vector difference (Two Index Symmetric): " << norm2(h_diff_sym) << std::endl;
// Exponentiate
typename TwoIndexRep< Nc, Symmetric>::LatticeMatrix U2iS(grid);
U2iS = expMat(Ar_sym, 1.0, 16);
typename TwoIndexRep< Nc, Symmetric>::LatticeMatrix uno2iS(grid);
uno2iS = 1.0;
// Check matrix U2iS, must be real orthogonal
typename TwoIndexRep< Nc, Symmetric>::LatticeMatrix Ucheck2iS = U2iS - conjugate(U2iS);
std::cout << GridLogMessage << "Reality check: " << norm2(Ucheck2iS)
<< std::endl;
Ucheck2iS = U2iS * adj(U2iS) - uno2iS;
std::cout << GridLogMessage << "orthogonality check 1: " << norm2(Ucheck2iS)
<< std::endl;
Ucheck2iS = adj(U2iS) * U2iS - uno2iS;
std::cout << GridLogMessage << "orthogonality check 2: " << norm2(Ucheck2iS)
<< std::endl;
// Construct the fundamental matrix in the group
SU3::LatticeMatrix Af_sym(grid);
SU3::FundamentalLieAlgebraMatrix(h_sym,Af_sym);
@@ -386,147 +378,137 @@ int main(int argc, char** argv) {
SU3::LatticeMatrix UnitCheck2(grid);
UnitCheck2 = Ufund2 * adj(Ufund2) - uno_f;
std::cout << GridLogMessage << "unitarity check 1: " << norm2(UnitCheck2)
<< std::endl;
UnitCheck2 = adj(Ufund2) * Ufund2 - uno_f;
std::cout << GridLogMessage << "unitarity check 2: " << norm2(UnitCheck2)
<< std::endl;
// Transform to the 2Index Sym representation
U = Zero(); // fill this with only one direction
pokeLorentz(U,Ufund2,0); // the representation transf acts on full gauge fields
TIndexRep.update_representation(U);
Ur2 = TIndexRep.U; // U_r
typename TwoIndexRep< Nc, Symmetric>::LatticeMatrix Ur02 = peekLorentz(Ur2,0); // this should be the same as U2iS
typename TwoIndexRep< Nc, Symmetric>::LatticeMatrix Diff_check_mat2 = Ur02 - U2iS;
std::cout << GridLogMessage << "Projections structure check group difference (Two Index Symmetric): " << norm2(Diff_check_mat2) << std::endl;
if (TwoIndexRep<Nc, AntiSymmetric >::Dimension != 1){
std::cout << GridLogMessage << "*********************************************"
<< std::endl;
std::cout << GridLogMessage << "*********************************************"
<< std::endl;
std::cout << GridLogMessage << "Two Index anti-Symmetric: Check Group Structure"
<< std::endl;
// Testing HMC representation classes
TwoIndexRep< Nc, AntiSymmetric > TIndexRepA(grid);
std::cout << GridLogMessage << "Two Index anti-Symmetric: Check Group Structure"
<< std::endl;
// Testing HMC representation classes
TwoIndexRep< Nc, AntiSymmetric > TIndexRepA(grid);
// Test group structure
// (U_f * V_f)_r = U_r * V_r
LatticeGaugeField U2A(grid), V2A(grid);
SU3::HotConfiguration<LatticeGaugeField>(gridRNG, U2A);
SU3::HotConfiguration<LatticeGaugeField>(gridRNG, V2A);
LatticeGaugeField UV2A(grid);
UV2A = Zero();
for (int mu = 0; mu < Nd; mu++) {
SU3::LatticeMatrix Umu2A = peekLorentz(U2A,mu);
SU3::LatticeMatrix Vmu2A = peekLorentz(V2A,mu);
pokeLorentz(UV2A,Umu2A*Vmu2A, mu);
}
TIndexRepA.update_representation(UV2A);
typename TwoIndexRep< Nc, AntiSymmetric >::LatticeField UVr2A = TIndexRepA.U; // (U_f * V_f)_r
TIndexRepA.update_representation(U2A);
typename TwoIndexRep< Nc, AntiSymmetric >::LatticeField Ur2A = TIndexRepA.U; // U_r
TIndexRepA.update_representation(V2A);
typename TwoIndexRep< Nc, AntiSymmetric >::LatticeField Vr2A = TIndexRepA.U; // V_r
typename TwoIndexRep< Nc, AntiSymmetric >::LatticeField Ur2Vr2A(grid);
Ur2Vr2A = Zero();
for (int mu = 0; mu < Nd; mu++) {
typename TwoIndexRep< Nc, AntiSymmetric >::LatticeMatrix Urmu2A = peekLorentz(Ur2A,mu);
typename TwoIndexRep< Nc, AntiSymmetric >::LatticeMatrix Vrmu2A = peekLorentz(Vr2A,mu);
pokeLorentz(Ur2Vr2A,Urmu2A*Vrmu2A, mu);
}
typename TwoIndexRep< Nc, AntiSymmetric >::LatticeField Diff_check2A = UVr2A - Ur2Vr2A;
std::cout << GridLogMessage << "Group structure SU("<<Nc<<") check difference (Two Index anti-Symmetric): " << norm2(Diff_check2A) << std::endl;
// Check correspondence of algebra and group transformations
// Create a random vector
SU3::LatticeAlgebraVector h_Asym(grid);
typename TwoIndexRep< Nc, AntiSymmetric>::LatticeMatrix Ar_Asym(grid);
random(gridRNG,h_Asym);
h_Asym = real(h_Asym);
SU_TwoIndex< Nc, AntiSymmetric>::TwoIndexLieAlgebraMatrix(h_Asym,Ar_Asym);
// Re-extract h_Asym
SU3::LatticeAlgebraVector h_Asym2(grid);
SU_TwoIndex< Nc, AntiSymmetric>::projectOnAlgebra(h_Asym2, Ar_Asym);
SU3::LatticeAlgebraVector h_diff_Asym = h_Asym - h_Asym2;
std::cout << GridLogMessage << "Projections structure check vector difference (Two Index anti-Symmetric): " << norm2(h_diff_Asym) << std::endl;
// Exponentiate
typename TwoIndexRep< Nc, AntiSymmetric>::LatticeMatrix U2iAS(grid);
U2iAS = expMat(Ar_Asym, 1.0, 16);
typename TwoIndexRep< Nc, AntiSymmetric>::LatticeMatrix uno2iAS(grid);
uno2iAS = 1.0;
// Check matrix U2iAS, must be real orthogonal
typename TwoIndexRep< Nc, AntiSymmetric>::LatticeMatrix Ucheck2iAS = U2iAS - conjugate(U2iAS);
std::cout << GridLogMessage << "Reality check: " << norm2(Ucheck2iAS)
<< std::endl;
Ucheck2iAS = U2iAS * adj(U2iAS) - uno2iAS;
std::cout << GridLogMessage << "orthogonality check 1: " << norm2(Ucheck2iAS)
<< std::endl;
Ucheck2iAS = adj(U2iAS) * U2iAS - uno2iAS;
std::cout << GridLogMessage << "orthogonality check 2: " << norm2(Ucheck2iAS)
<< std::endl;
// Construct the fundamental matrix in the group
SU3::LatticeMatrix Af_Asym(grid);
SU3::FundamentalLieAlgebraMatrix(h_Asym,Af_Asym);
SU3::LatticeMatrix Ufund2A(grid);
Ufund2A = expMat(Af_Asym, 1.0, 16);
SU3::LatticeMatrix UnitCheck2A(grid);
UnitCheck2A = Ufund2A * adj(Ufund2A) - uno_f;
std::cout << GridLogMessage << "unitarity check 1: " << norm2(UnitCheck2A)
<< std::endl;
UnitCheck2A = adj(Ufund2A) * Ufund2A - uno_f;
std::cout << GridLogMessage << "unitarity check 2: " << norm2(UnitCheck2A)
<< std::endl;
// Transform to the 2Index anti-Sym representation
U = Zero(); // fill this with only one direction
pokeLorentz(U,Ufund2A,0); // the representation transf acts on full gauge fields
TIndexRepA.update_representation(U);
Ur2A = TIndexRepA.U; // U_r
typename TwoIndexRep< Nc, AntiSymmetric>::LatticeMatrix Ur02A = peekLorentz(Ur2A,0); // this should be the same as U2iAS
typename TwoIndexRep< Nc, AntiSymmetric>::LatticeMatrix Diff_check_mat2A = Ur02A - U2iAS;
std::cout << GridLogMessage << "Projections structure check group difference (Two Index anti-Symmetric): " << norm2(Diff_check_mat2A) << std::endl;
} else {
std::cout << GridLogMessage << "Skipping Two Index anti-Symmetric tests "
"because representation is trivial (dim = 1)"
<< std::endl;
}
#endif
Grid_finalize();
}
+3 -2
@@ -122,14 +122,15 @@ int main (int argc, char ** argv)
std::cout << "Determinant defect before projection " <<norm2(detU)<<std::endl;
tmp = U*adj(U) - ident;
std::cout << "Unitarity check before projection " << norm2(tmp)<<std::endl;
#if (Nc == 3)
ProjectSU3(U);
detU= Determinant(U) ;
detU= detU -1.0;
std::cout << "Determinant ProjectSU3 defect " <<norm2(detU)<<std::endl;
tmp = U*adj(U) - ident;
std::cout << "Unitarity check after projection " << norm2(tmp)<<std::endl;
#endif
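// ProjectSU3 is specific to Nc = 3, hence the preprocessor guard above; the
// generic ProjectSUn call below covers arbitrary Nc.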
ProjectSUn(UU);
detUU= Determinant(UU);
detUU= detUU -1.0;

Some files were not shown because too many files have changed in this diff.